From 7cd468a3d7dee7d6c92f69a0bb7061ae208ec727 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 19 Dec 2016 23:05:39 +0100 Subject: Reorganize source tree to use single autotools instance Change-Id: I7b51f88292e057c6443b12224486f2d0c9f8ae23 Signed-off-by: Damjan Marion --- src/vlib/buffer.c | 1987 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1987 insertions(+) create mode 100644 src/vlib/buffer.c (limited to 'src/vlib/buffer.c') diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c new file mode 100644 index 00000000..4bf6d125 --- /dev/null +++ b/src/vlib/buffer.c @@ -0,0 +1,1987 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer.c: allocate/free network buffers. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @cond (!DPDK) + * @file + * + * Allocate/free network buffers. + */ + +#if DPDK > 0 +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include + +#if DPDK > 0 +#pragma weak rte_mem_virt2phy +#pragma weak rte_eal_has_hugepages +#pragma weak rte_socket_id +#pragma weak rte_pktmbuf_pool_create +#endif + +uword +vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, + vlib_buffer_t * b_first) +{ + vlib_buffer_t *b = b_first; + uword l_first = b_first->current_length; + uword l = 0; + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, b->next_buffer); + l += b->current_length; + } + b_first->total_length_not_including_first_buffer = l; + b_first->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + return l + l_first; +} + +u8 * +format_vlib_buffer (u8 * s, va_list * args) +{ + vlib_buffer_t *b = va_arg (*args, vlib_buffer_t *); +#if DPDK > 0 + uword indent = format_get_indent (s); + + s = format (s, "current data %d, length %d, free-list %d", + b->current_data, b->current_length, b->free_list_index); + + if (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID) + s = format (s, ", totlen-nifb %d", + b->total_length_not_including_first_buffer); + + if (b->flags & VLIB_BUFFER_IS_TRACED) + s = format (s, ", trace 0x%x", b->trace_index); + + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + vlib_main_t *vm = vlib_get_main (); + u32 next_buffer = b->next_buffer; + b = vlib_get_buffer (vm, next_buffer); + + s = format (s, "\n%Unext-buffer 0x%x, segment length %d", + format_white_space, indent, next_buffer, b->current_length); + } + +#else + + s = format (s, "current data %d, length %d, free-list %d", + b->current_data, b->current_length, b->free_list_index); + + if (b->flags & VLIB_BUFFER_IS_TRACED) + s = format (s, ", trace 0x%x", b->trace_index); + + if (b->flags & VLIB_BUFFER_NEXT_PRESENT) + s = format (s, ", next-buffer 0x%x", b->next_buffer); +#endif + + return s; +} + +u8 * +format_vlib_buffer_and_data (u8 * s, va_list * args) +{ + vlib_buffer_t *b = va_arg (*args, vlib_buffer_t *); + + s = format (s, "%U, %U", + format_vlib_buffer, b, + format_hex_bytes, vlib_buffer_get_current (b), 64); + + return s; +} + +#if DPDK == 0 +static u8 * +format_vlib_buffer_known_state (u8 * s, va_list * args) +{ + vlib_buffer_known_state_t state = va_arg (*args, vlib_buffer_known_state_t); + char *t; + + switch (state) + { + case VLIB_BUFFER_UNKNOWN: + t = "unknown"; + break; + + case VLIB_BUFFER_KNOWN_ALLOCATED: + t = "known-allocated"; + break; + + case VLIB_BUFFER_KNOWN_FREE: + t = "known-free"; + break; + + default: + t = "invalid"; + break; + } + + return format (s, "%s", t); +} +#endif + +u8 * +format_vlib_buffer_contents (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + vlib_buffer_t *b = va_arg (*va, vlib_buffer_t *); + + while (1) + { + vec_add (s, vlib_buffer_get_current (b), b->current_length); + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + break; + b = vlib_get_buffer (vm, b->next_buffer); + } + + return s; +} + +#if DPDK == 0 +static u8 * +vlib_validate_buffer_helper (vlib_main_t * vm, + u32 bi, + uword follow_buffer_next, uword ** unique_hash) +{ + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *fl; + + if (pool_is_free_index (bm->buffer_free_list_pool, b->free_list_index)) + return format (0, "unknown free list 0x%x", b->free_list_index); + + fl = pool_elt_at_index (bm->buffer_free_list_pool, b->free_list_index); + + if ((signed) b->current_data < (signed) -VLIB_BUFFER_PRE_DATA_SIZE) + return format (0, "current data %d before pre-data", b->current_data); +#if DPDK == 0 + if (b->current_data + b->current_length > fl->n_data_bytes) + return format (0, "%d-%d beyond end of buffer %d", + b->current_data, b->current_length, fl->n_data_bytes); +#endif + + if (follow_buffer_next && (b->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + vlib_buffer_known_state_t k; + u8 *msg, *result; + + k = vlib_buffer_is_known (vm, b->next_buffer); + if (k != VLIB_BUFFER_KNOWN_ALLOCATED) + return format (0, "next 0x%x: %U", + b->next_buffer, format_vlib_buffer_known_state, k); + + if (unique_hash) + { + if (hash_get (*unique_hash, b->next_buffer)) + return format (0, "duplicate buffer 0x%x", b->next_buffer); + + hash_set1 (*unique_hash, b->next_buffer); + } + + msg = vlib_validate_buffer (vm, b->next_buffer, follow_buffer_next); + if (msg) + { + result = format (0, "next 0x%x: %v", b->next_buffer, msg); + vec_free (msg); + return result; + } + } + + return 0; +} + +u8 * +vlib_validate_buffer (vlib_main_t * vm, u32 bi, uword follow_buffer_next) +{ + return vlib_validate_buffer_helper (vm, bi, follow_buffer_next, + /* unique_hash */ 0); +} + +u8 * +vlib_validate_buffers (vlib_main_t * vm, + u32 * buffers, + uword next_buffer_stride, + uword n_buffers, + vlib_buffer_known_state_t known_state, + uword follow_buffer_next) +{ + uword i, *hash; + u32 bi, *b = buffers; + vlib_buffer_known_state_t k; + u8 *msg = 0, *result = 0; + + hash = hash_create (0, 0); + for (i = 0; i < n_buffers; i++) + { + bi = b[0]; + b += next_buffer_stride; + + /* Buffer is not unique. */ + if (hash_get (hash, bi)) + { + msg = format (0, "not unique"); + goto done; + } + + k = vlib_buffer_is_known (vm, bi); + if (k != known_state) + { + msg = format (0, "is %U; expected %U", + format_vlib_buffer_known_state, k, + format_vlib_buffer_known_state, known_state); + goto done; + } + + msg = vlib_validate_buffer_helper (vm, bi, follow_buffer_next, &hash); + if (msg) + goto done; + + hash_set1 (hash, bi); + } + +done: + if (msg) + { + result = format (0, "0x%x: %v", bi, msg); + vec_free (msg); + } + hash_free (hash); + return result; +} +#endif + +vlib_main_t **vlib_mains; + +#if DPDK == 0 +/* When dubugging validate that given buffers are either known allocated + or known free. */ +static void +vlib_buffer_validate_alloc_free (vlib_main_t * vm, + u32 * buffers, + uword n_buffers, + vlib_buffer_known_state_t expected_state) +{ + u32 *b; + uword i, bi, is_free; + + if (CLIB_DEBUG == 0) + return; + + ASSERT (os_get_cpu_number () == 0); + + /* smp disaster check */ + if (vlib_mains) + ASSERT (vm == vlib_mains[0]); + + is_free = expected_state == VLIB_BUFFER_KNOWN_ALLOCATED; + b = buffers; + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_known_state_t known; + + bi = b[0]; + b += 1; + known = vlib_buffer_is_known (vm, bi); + if (known != expected_state) + { + ASSERT (0); + vlib_panic_with_msg + (vm, "%s %U buffer 0x%x", + is_free ? "freeing" : "allocating", + format_vlib_buffer_known_state, known, bi); + } + + vlib_buffer_set_known_state + (vm, bi, + is_free ? VLIB_BUFFER_KNOWN_FREE : VLIB_BUFFER_KNOWN_ALLOCATED); + } +} +#endif + +#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) + +/* Make sure we have at least given number of unaligned buffers. */ +static void +fill_unaligned (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + uword n_unaligned_buffers) +{ + word la = vec_len (free_list->aligned_buffers); + word lu = vec_len (free_list->unaligned_buffers); + + /* Aligned come in aligned copy-sized chunks. */ + ASSERT (la % BUFFERS_PER_COPY == 0); + + ASSERT (la >= n_unaligned_buffers); + + while (lu < n_unaligned_buffers) + { + /* Copy 4 buffers from end of aligned vector to unaligned vector. */ + vec_add (free_list->unaligned_buffers, + free_list->aligned_buffers + la - BUFFERS_PER_COPY, + BUFFERS_PER_COPY); + la -= BUFFERS_PER_COPY; + lu += BUFFERS_PER_COPY; + } + _vec_len (free_list->aligned_buffers) = la; +} + +/* After free aligned buffers may not contain even sized chunks. */ +static void +trim_aligned (vlib_buffer_free_list_t * f) +{ + uword l, n_trim; + + /* Add unaligned to aligned before trim. */ + l = vec_len (f->unaligned_buffers); + if (l > 0) + { + vec_add_aligned (f->aligned_buffers, f->unaligned_buffers, l, + /* align */ sizeof (vlib_copy_unit_t)); + + _vec_len (f->unaligned_buffers) = 0; + } + + /* Remove unaligned buffers from end of aligned vector and save for next trim. */ + l = vec_len (f->aligned_buffers); + n_trim = l % BUFFERS_PER_COPY; + if (n_trim) + { + /* Trim aligned -> unaligned. */ + vec_add (f->unaligned_buffers, f->aligned_buffers + l - n_trim, n_trim); + + /* Remove from aligned. */ + _vec_len (f->aligned_buffers) = l - n_trim; + } +} + +static void +merge_free_lists (vlib_buffer_free_list_t * dst, + vlib_buffer_free_list_t * src) +{ + uword l; + u32 *d; + + trim_aligned (src); + trim_aligned (dst); + + l = vec_len (src->aligned_buffers); + if (l > 0) + { + vec_add2_aligned (dst->aligned_buffers, d, l, + /* align */ sizeof (vlib_copy_unit_t)); + clib_memcpy (d, src->aligned_buffers, l * sizeof (d[0])); + vec_free (src->aligned_buffers); + } + + l = vec_len (src->unaligned_buffers); + if (l > 0) + { + vec_add (dst->unaligned_buffers, src->unaligned_buffers, l); + vec_free (src->unaligned_buffers); + } +} + +always_inline u32 +vlib_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + size = vlib_buffer_round_size (size); + uword *p = hash_get (bm->free_list_by_size, size); + return p ? p[0] : ~0; +} + +/* Add buffer free list. */ +static u32 +vlib_buffer_create_free_list_helper (vlib_main_t * vm, + u32 n_data_bytes, + u32 is_public, u32 is_default, u8 * name) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; +#if DPDK > 0 + int i; + + ASSERT (os_get_cpu_number () == 0); + + if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0) + { + u32 default_free_free_list_index; + + /* *INDENT-OFF* */ + default_free_free_list_index = + vlib_buffer_create_free_list_helper + (vm, + /* default buffer size */ VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, + /* is_public */ 1, + /* is_default */ 1, + (u8 *) "default"); + /* *INDENT-ON* */ + ASSERT (default_free_free_list_index == + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + if (n_data_bytes == VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES && is_public) + return default_free_free_list_index; + } + + pool_get_aligned (bm->buffer_free_list_pool, f, CLIB_CACHE_LINE_BYTES); + + memset (f, 0, sizeof (f[0])); + f->index = f - bm->buffer_free_list_pool; + f->n_data_bytes = vlib_buffer_round_size (n_data_bytes); + f->min_n_buffers_each_physmem_alloc = 16; + f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name); + + /* Setup free buffer template. */ + f->buffer_init_template.free_list_index = f->index; + + if (is_public) + { + uword *p = hash_get (bm->free_list_by_size, f->n_data_bytes); + if (!p) + hash_set (bm->free_list_by_size, f->n_data_bytes, f->index); + } + + for (i = 1; i < vec_len (vlib_mains); i++) + { + vlib_buffer_main_t *wbm = vlib_mains[i]->buffer_main; + vlib_buffer_free_list_t *wf; + pool_get_aligned (wbm->buffer_free_list_pool, + wf, CLIB_CACHE_LINE_BYTES); + ASSERT (f - bm->buffer_free_list_pool == + wf - wbm->buffer_free_list_pool); + wf[0] = f[0]; + wf->aligned_buffers = 0; + wf->unaligned_buffers = 0; + wf->n_alloc = 0; + } +#else + + if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0) + { + u32 default_free_free_list_index; + + default_free_free_list_index = vlib_buffer_create_free_list_helper (vm, + /* default buffer size */ + VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, + /* is_public */ + 1, + /* is_default */ + 1, + (u8 + *) + "default"); + ASSERT (default_free_free_list_index == + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + if (n_data_bytes == VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES && is_public) + return default_free_free_list_index; + } + + pool_get_aligned (bm->buffer_free_list_pool, f, CLIB_CACHE_LINE_BYTES); + + memset (f, 0, sizeof (f[0])); + f->index = f - bm->buffer_free_list_pool; + f->n_data_bytes = vlib_buffer_round_size (n_data_bytes); + f->min_n_buffers_each_physmem_alloc = 256; + f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name); + + /* Setup free buffer template. */ + f->buffer_init_template.free_list_index = f->index; + + if (is_public) + { + uword *p = hash_get (bm->free_list_by_size, f->n_data_bytes); + if (!p) + hash_set (bm->free_list_by_size, f->n_data_bytes, f->index); + } +#endif + + return f->index; +} + +u32 +vlib_buffer_create_free_list (vlib_main_t * vm, u32 n_data_bytes, + char *fmt, ...) +{ + va_list va; + u8 *name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + return vlib_buffer_create_free_list_helper (vm, n_data_bytes, + /* is_public */ 0, + /* is_default */ 0, + name); +} + +u32 +vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes, + char *fmt, ...) +{ + u32 i = vlib_buffer_get_free_list_with_size (vm, n_data_bytes); + + if (i == ~0) + { + va_list va; + u8 *name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + i = vlib_buffer_create_free_list_helper (vm, n_data_bytes, + /* is_public */ 1, + /* is_default */ 0, + name); + } + + return i; +} + +static void +del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) +{ + u32 i; +#if DPDK > 0 + struct rte_mbuf *mb; + vlib_buffer_t *b; + + for (i = 0; i < vec_len (f->unaligned_buffers); i++) + { + b = vlib_get_buffer (vm, f->unaligned_buffers[i]); + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + for (i = 0; i < vec_len (f->aligned_buffers); i++) + { + b = vlib_get_buffer (vm, f->aligned_buffers[i]); + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + vec_free (f->name); +#else + + for (i = 0; i < vec_len (f->buffer_memory_allocated); i++) + vm->os_physmem_free (f->buffer_memory_allocated[i]); + vec_free (f->name); + vec_free (f->buffer_memory_allocated); +#endif + vec_free (f->unaligned_buffers); + vec_free (f->aligned_buffers); +} + +/* Add buffer free list. */ +void +vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + u32 merge_index; +#if DPDK > 0 + int i; + + ASSERT (os_get_cpu_number () == 0); + + f = vlib_buffer_get_free_list (vm, free_list_index); + + merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); + if (merge_index != ~0 && merge_index != free_list_index) + { + merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, + merge_index), f); + } + + del_free_list (vm, f); + + /* Poison it. */ + memset (f, 0xab, sizeof (f[0])); + + pool_put (bm->buffer_free_list_pool, f); + + for (i = 1; i < vec_len (vlib_mains); i++) + { + bm = vlib_mains[i]->buffer_main; + f = vlib_buffer_get_free_list (vlib_mains[i], free_list_index);; + memset (f, 0xab, sizeof (f[0])); + pool_put (bm->buffer_free_list_pool, f); + } +#else + + f = vlib_buffer_get_free_list (vm, free_list_index); + + ASSERT (vec_len (f->unaligned_buffers) + vec_len (f->aligned_buffers) == + f->n_alloc); + merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); + if (merge_index != ~0 && merge_index != free_list_index) + { + merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, + merge_index), f); + } + + del_free_list (vm, f); + + /* Poison it. */ + memset (f, 0xab, sizeof (f[0])); + + pool_put (bm->buffer_free_list_pool, f); +#endif +} + +/* Make sure free list has at least given number of free buffers. */ +static uword +fill_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * fl, uword min_free_buffers) +{ +#if DPDK > 0 + vlib_buffer_t *b; + int n, i; + u32 bi; + u32 n_remaining = 0, n_alloc = 0; + unsigned socket_id = rte_socket_id ? rte_socket_id () : 0; + struct rte_mempool *rmp = vm->buffer_main->pktmbuf_pools[socket_id]; + struct rte_mbuf *mb; + + /* Too early? */ + if (PREDICT_FALSE (rmp == 0)) + return 0; + + trim_aligned (fl); + + /* Already have enough free buffers on free list? */ + n = min_free_buffers - vec_len (fl->aligned_buffers); + if (n <= 0) + return min_free_buffers; + + /* Always allocate round number of buffers. */ + n = round_pow2 (n, BUFFERS_PER_COPY); + + /* Always allocate new buffers in reasonably large sized chunks. */ + n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); + + vec_validate (vm->mbuf_alloc_list, n - 1); + + if (rte_mempool_get_bulk (rmp, vm->mbuf_alloc_list, n) < 0) + return 0; + + _vec_len (vm->mbuf_alloc_list) = n; + + for (i = 0; i < n; i++) + { + mb = vm->mbuf_alloc_list[i]; + + ASSERT (rte_mbuf_refcnt_read (mb) == 0); + rte_mbuf_refcnt_set (mb, 1); + + b = vlib_buffer_from_rte_mbuf (mb); + bi = vlib_get_buffer_index (vm, b); + + vec_add1_aligned (fl->aligned_buffers, bi, sizeof (vlib_copy_unit_t)); + n_alloc++; + n_remaining--; + + vlib_buffer_init_for_free_list (b, fl); + + if (fl->buffer_init_function) + fl->buffer_init_function (vm, fl, &bi, 1); + } + + fl->n_alloc += n; + + return n; +#else + vlib_buffer_t *buffers, *b; + int n, n_bytes, i; + u32 *bi; + u32 n_remaining, n_alloc, n_this_chunk; + + trim_aligned (fl); + + /* Already have enough free buffers on free list? */ + n = min_free_buffers - vec_len (fl->aligned_buffers); + if (n <= 0) + return min_free_buffers; + + /* Always allocate round number of buffers. */ + n = round_pow2 (n, BUFFERS_PER_COPY); + + /* Always allocate new buffers in reasonably large sized chunks. */ + n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); + + n_remaining = n; + n_alloc = 0; + while (n_remaining > 0) + { + n_this_chunk = clib_min (n_remaining, 16); + + n_bytes = n_this_chunk * (sizeof (b[0]) + fl->n_data_bytes); + + /* drb: removed power-of-2 ASSERT */ + buffers = vm->os_physmem_alloc_aligned (&vm->physmem_main, + n_bytes, + sizeof (vlib_buffer_t)); + if (!buffers) + return n_alloc; + + /* Record chunk as being allocated so we can free it later. */ + vec_add1 (fl->buffer_memory_allocated, buffers); + + fl->n_alloc += n_this_chunk; + n_alloc += n_this_chunk; + n_remaining -= n_this_chunk; + + b = buffers; + vec_add2_aligned (fl->aligned_buffers, bi, n_this_chunk, + sizeof (vlib_copy_unit_t)); + for (i = 0; i < n_this_chunk; i++) + { + bi[i] = vlib_get_buffer_index (vm, b); + + if (CLIB_DEBUG > 0) + vlib_buffer_set_known_state (vm, bi[i], VLIB_BUFFER_KNOWN_FREE); + b = vlib_buffer_next_contiguous (b, fl->n_data_bytes); + } + + memset (buffers, 0, n_bytes); + + /* Initialize all new buffers. */ + b = buffers; + for (i = 0; i < n_this_chunk; i++) + { + vlib_buffer_init_for_free_list (b, fl); + b = vlib_buffer_next_contiguous (b, fl->n_data_bytes); + } + + if (fl->buffer_init_function) + fl->buffer_init_function (vm, fl, bi, n_this_chunk); + } + return n_alloc; +#endif +} + +always_inline uword +copy_alignment (u32 * x) +{ + return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY; +} + +static u32 +alloc_from_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + u32 * alloc_buffers, u32 n_alloc_buffers) +{ + u32 *dst, *u_src; + uword u_len, n_left; + uword n_unaligned_start, n_unaligned_end, n_filled; + +#if DPDK == 0 + ASSERT (os_get_cpu_number () == 0); + +#endif + n_left = n_alloc_buffers; + dst = alloc_buffers; + n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst)) + & (BUFFERS_PER_COPY - 1)); + + n_filled = fill_free_list (vm, free_list, n_alloc_buffers); + if (n_filled == 0) + return 0; + + n_left = n_filled < n_left ? n_filled : n_left; + n_alloc_buffers = n_left; + + if (n_unaligned_start >= n_left) + { + n_unaligned_start = n_left; + n_unaligned_end = 0; + } + else + n_unaligned_end = copy_alignment (dst + n_alloc_buffers); + + fill_unaligned (vm, free_list, n_unaligned_start + n_unaligned_end); + + u_len = vec_len (free_list->unaligned_buffers); + u_src = free_list->unaligned_buffers + u_len - 1; + + if (n_unaligned_start) + { + uword n_copy = n_unaligned_start; + if (n_copy > n_left) + n_copy = n_left; + n_left -= n_copy; + + while (n_copy > 0) + { + *dst++ = *u_src--; + n_copy--; + u_len--; + } + + /* Now dst should be aligned. */ + if (n_left > 0) + ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0); + } + + /* Aligned copy. */ + { + vlib_copy_unit_t *d, *s; + uword n_copy; + + if (vec_len (free_list->aligned_buffers) < + ((n_left / BUFFERS_PER_COPY) * BUFFERS_PER_COPY)) + abort (); + + n_copy = n_left / BUFFERS_PER_COPY; + n_left = n_left % BUFFERS_PER_COPY; + + /* Remove buffers from aligned free list. */ + _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY; + + s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers); + d = (vlib_copy_unit_t *) dst; + + /* Fast path loop. */ + while (n_copy >= 4) + { + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + n_copy -= 4; + s += 4; + d += 4; + } + + while (n_copy >= 1) + { + d[0] = s[0]; + n_copy -= 1; + s += 1; + d += 1; + } + + dst = (void *) d; + } + + /* Unaligned copy. */ + ASSERT (n_unaligned_end == n_left); + while (n_left > 0) + { + *dst++ = *u_src--; + n_left--; + u_len--; + } + + if (!free_list->unaligned_buffers) + ASSERT (u_len == 0); + else + _vec_len (free_list->unaligned_buffers) = u_len; + +#if DPDK == 0 + /* Verify that buffers are known free. */ + vlib_buffer_validate_alloc_free (vm, alloc_buffers, + n_alloc_buffers, VLIB_BUFFER_KNOWN_FREE); +#endif + + return n_alloc_buffers; +} + +/* Allocate a given number of buffers into given array. + Returns number actually allocated which will be either zero or + number requested. */ +u32 +vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_main_t *bm = vm->buffer_main; +#if DPDK == 0 + ASSERT (os_get_cpu_number () == 0); +#endif + + return alloc_from_free_list + (vm, + pool_elt_at_index (bm->buffer_free_list_pool, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX), + buffers, n_buffers); +} + +u32 +vlib_buffer_alloc_from_free_list (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + f = pool_elt_at_index (bm->buffer_free_list_pool, free_list_index); + return alloc_from_free_list (vm, f, buffers, n_buffers); +} + +always_inline void +add_buffer_to_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * f, + u32 buffer_index, u8 do_init) +{ + vlib_buffer_t *b; + b = vlib_get_buffer (vm, buffer_index); + if (PREDICT_TRUE (do_init)) + vlib_buffer_init_for_free_list (b, f); + vec_add1_aligned (f->aligned_buffers, buffer_index, + sizeof (vlib_copy_unit_t)); +} + +always_inline vlib_buffer_free_list_t * +buffer_get_free_list (vlib_main_t * vm, vlib_buffer_t * b, u32 * index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + u32 i; + + *index = i = b->free_list_index; + return pool_elt_at_index (bm->buffer_free_list_pool, i); +} + +void * +vlib_set_buffer_free_callback (vlib_main_t * vm, void *fp) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + void *rv = bm->buffer_free_callback; + + bm->buffer_free_callback = fp; + return rv; +} + +#if DPDK == 0 +void vnet_buffer_free_dpdk_mb (vlib_buffer_t * b) __attribute__ ((weak)); +void +vnet_buffer_free_dpdk_mb (vlib_buffer_t * b) +{ +} + +#endif +static_always_inline void +vlib_buffer_free_inline (vlib_main_t * vm, + u32 * buffers, u32 n_buffers, u32 follow_buffer_next) +{ +#if DPDK > 0 + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *fl; + u32 fi; + int i; + u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers, + u32 follow_buffer_next); + + cb = bm->buffer_free_callback; + + if (PREDICT_FALSE (cb != 0)) + n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next); + + if (!n_buffers) + return; + + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_t *b; + struct rte_mbuf *mb; + + b = vlib_get_buffer (vm, buffers[i]); + + fl = buffer_get_free_list (vm, b, &fi); + + /* The only current use of this callback: multicast recycle */ + if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0)) + { + int j; + + add_buffer_to_free_list + (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0); + + for (j = 0; j < vec_len (bm->announce_list); j++) + { + if (fl == bm->announce_list[j]) + goto already_announced; + } + vec_add1 (bm->announce_list, fl); + already_announced: + ; + } + else + { + if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0)) + { + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + } + } + if (vec_len (bm->announce_list)) + { + vlib_buffer_free_list_t *fl; + for (i = 0; i < vec_len (bm->announce_list); i++) + { + fl = bm->announce_list[i]; + fl->buffers_added_to_freelist_function (vm, fl); + } + _vec_len (bm->announce_list) = 0; + } +#else + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *fl; + static u32 *next_to_free[2]; /* smp bad */ + u32 i_next_to_free, *b, *n, *f, fi; + uword n_left; + int i; + static vlib_buffer_free_list_t **announce_list; + vlib_buffer_free_list_t *fl0 = 0, *fl1 = 0; + u32 bi0 = (u32) ~ 0, bi1 = (u32) ~ 0, fi0, fi1 = (u32) ~ 0; + u8 free0, free1 = 0, free_next0, free_next1; + u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers, + u32 follow_buffer_next); + + ASSERT (os_get_cpu_number () == 0); + + cb = bm->buffer_free_callback; + + if (PREDICT_FALSE (cb != 0)) + n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next); + + if (!n_buffers) + return; + + /* Use first buffer to get default free list. */ + { + u32 bi0 = buffers[0]; + vlib_buffer_t *b0; + + b0 = vlib_get_buffer (vm, bi0); + fl = buffer_get_free_list (vm, b0, &fi); + if (fl->buffers_added_to_freelist_function) + vec_add1 (announce_list, fl); + } + + vec_validate (next_to_free[0], n_buffers - 1); + vec_validate (next_to_free[1], n_buffers - 1); + + i_next_to_free = 0; + n_left = n_buffers; + b = buffers; + +again: + /* Verify that buffers are known allocated. */ + vlib_buffer_validate_alloc_free (vm, b, + n_left, VLIB_BUFFER_KNOWN_ALLOCATED); + + vec_add2_aligned (fl->aligned_buffers, f, n_left, + /* align */ sizeof (vlib_copy_unit_t)); + + n = next_to_free[i_next_to_free]; + while (n_left >= 4) + { + vlib_buffer_t *b0, *b1, *binit0, *binit1, dummy_buffers[2]; + + bi0 = b[0]; + bi1 = b[1]; + + f[0] = bi0; + f[1] = bi1; + f += 2; + b += 2; + n_left -= 2; + + /* Prefetch buffers for next iteration. */ + vlib_prefetch_buffer_with_index (vm, b[0], WRITE); + vlib_prefetch_buffer_with_index (vm, b[1], WRITE); + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + free0 = (b0->flags & VLIB_BUFFER_RECYCLE) == 0; + free1 = (b1->flags & VLIB_BUFFER_RECYCLE) == 0; + + /* Must be before init which will over-write buffer flags. */ + if (follow_buffer_next) + { + n[0] = b0->next_buffer; + free_next0 = free0 && (b0->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; + n += free_next0; + + n[0] = b1->next_buffer; + free_next1 = free1 && (b1->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; + n += free_next1; + } + else + free_next0 = free_next1 = 0; + + /* Must be before init which will over-write buffer free list. */ + fi0 = b0->free_list_index; + fi1 = b1->free_list_index; + + if (PREDICT_FALSE (fi0 != fi || fi1 != fi)) + goto slow_path_x2; + + binit0 = free0 ? b0 : &dummy_buffers[0]; + binit1 = free1 ? b1 : &dummy_buffers[1]; + + vlib_buffer_init_two_for_free_list (binit0, binit1, fl); + continue; + + slow_path_x2: + /* Backup speculation. */ + f -= 2; + n -= free_next0 + free_next1; + + _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + + fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); + fl1 = pool_elt_at_index (bm->buffer_free_list_pool, fi1); + + add_buffer_to_free_list (vm, fl0, bi0, free0); + if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) + { + int i; + for (i = 0; i < vec_len (announce_list); i++) + if (fl0 == announce_list[i]) + goto no_fl0; + vec_add1 (announce_list, fl0); + } + no_fl0: + if (PREDICT_FALSE (fl1->buffers_added_to_freelist_function != 0)) + { + int i; + for (i = 0; i < vec_len (announce_list); i++) + if (fl1 == announce_list[i]) + goto no_fl1; + vec_add1 (announce_list, fl1); + } + + no_fl1: + add_buffer_to_free_list (vm, fl1, bi1, free1); + + /* Possibly change current free list. */ + if (fi0 != fi && fi1 != fi) + { + fi = fi1; + fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); + } + + vec_add2_aligned (fl->aligned_buffers, f, n_left, + /* align */ sizeof (vlib_copy_unit_t)); + } + + while (n_left >= 1) + { + vlib_buffer_t *b0, *binit0, dummy_buffers[1]; + + bi0 = b[0]; + f[0] = bi0; + f += 1; + b += 1; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + free0 = (b0->flags & VLIB_BUFFER_RECYCLE) == 0; + + /* Must be before init which will over-write buffer flags. */ + if (follow_buffer_next) + { + n[0] = b0->next_buffer; + free_next0 = free0 && (b0->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; + n += free_next0; + } + else + free_next0 = 0; + + /* Must be before init which will over-write buffer free list. */ + fi0 = b0->free_list_index; + + if (PREDICT_FALSE (fi0 != fi)) + goto slow_path_x1; + + binit0 = free0 ? b0 : &dummy_buffers[0]; + + vlib_buffer_init_for_free_list (binit0, fl); + continue; + + slow_path_x1: + /* Backup speculation. */ + f -= 1; + n -= free_next0; + + _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + + fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); + + add_buffer_to_free_list (vm, fl0, bi0, free0); + if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) + { + int i; + for (i = 0; i < vec_len (announce_list); i++) + if (fl0 == announce_list[i]) + goto no_fl00; + vec_add1 (announce_list, fl0); + } + + no_fl00: + fi = fi0; + fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); + + vec_add2_aligned (fl->aligned_buffers, f, n_left, + /* align */ sizeof (vlib_copy_unit_t)); + } + + if (follow_buffer_next && ((n_left = n - next_to_free[i_next_to_free]) > 0)) + { + b = next_to_free[i_next_to_free]; + i_next_to_free ^= 1; + goto again; + } + + _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + + if (vec_len (announce_list)) + { + vlib_buffer_free_list_t *fl; + for (i = 0; i < vec_len (announce_list); i++) + { + fl = announce_list[i]; + fl->buffers_added_to_freelist_function (vm, fl); + } + _vec_len (announce_list) = 0; + } +#endif +} + +void +vlib_buffer_free (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ + 1); +} + +void +vlib_buffer_free_no_next (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ + 0); +} + +#if DPDK == 0 +/* Copy template packet data into buffers as they are allocated. */ +static void +vlib_packet_template_buffer_init (vlib_main_t * vm, + vlib_buffer_free_list_t * fl, + u32 * buffers, u32 n_buffers) +{ + vlib_packet_template_t *t = + uword_to_pointer (fl->buffer_init_function_opaque, + vlib_packet_template_t *); + uword i; + + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_t *b = vlib_get_buffer (vm, buffers[i]); + ASSERT (b->current_length == vec_len (t->packet_data)); + clib_memcpy (vlib_buffer_get_current (b), t->packet_data, + b->current_length); + } +} +#endif + +void +vlib_packet_template_init (vlib_main_t * vm, + vlib_packet_template_t * t, + void *packet_data, + uword n_packet_data_bytes, + uword min_n_buffers_each_physmem_alloc, + char *fmt, ...) +{ +#if DPDK > 0 + va_list va; + __attribute__ ((unused)) u8 *name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + vlib_worker_thread_barrier_sync (vm); + memset (t, 0, sizeof (t[0])); + + vec_add (t->packet_data, packet_data, n_packet_data_bytes); + + vlib_worker_thread_barrier_release (vm); +#else + vlib_buffer_free_list_t *fl; + va_list va; + u8 *name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + memset (t, 0, sizeof (t[0])); + + vec_add (t->packet_data, packet_data, n_packet_data_bytes); + t->min_n_buffers_each_physmem_alloc = min_n_buffers_each_physmem_alloc; + + t->free_list_index = vlib_buffer_create_free_list_helper + (vm, n_packet_data_bytes, + /* is_public */ 1, + /* is_default */ 0, + name); + + ASSERT (t->free_list_index != 0); + fl = vlib_buffer_get_free_list (vm, t->free_list_index); + fl->min_n_buffers_each_physmem_alloc = t->min_n_buffers_each_physmem_alloc; + + fl->buffer_init_function = vlib_packet_template_buffer_init; + fl->buffer_init_function_opaque = pointer_to_uword (t); + + fl->buffer_init_template.current_data = 0; + fl->buffer_init_template.current_length = n_packet_data_bytes; + fl->buffer_init_template.flags = 0; +#endif +} + +void * +vlib_packet_template_get_packet (vlib_main_t * vm, + vlib_packet_template_t * t, u32 * bi_result) +{ + u32 bi; + vlib_buffer_t *b; + + if (vlib_buffer_alloc (vm, &bi, 1) != 1) + return 0; + + *bi_result = bi; + + b = vlib_get_buffer (vm, bi); + clib_memcpy (vlib_buffer_get_current (b), + t->packet_data, vec_len (t->packet_data)); + b->current_length = vec_len (t->packet_data); + + return b->data; +} + +#if DPDK == 0 +void +vlib_packet_template_get_packet_helper (vlib_main_t * vm, + vlib_packet_template_t * t) +{ + word n = t->min_n_buffers_each_physmem_alloc; + word l = vec_len (t->packet_data); + word n_alloc; + + ASSERT (l > 0); + ASSERT (vec_len (t->free_buffers) == 0); + + vec_validate (t->free_buffers, n - 1); + n_alloc = vlib_buffer_alloc_from_free_list (vm, t->free_buffers, + n, t->free_list_index); + _vec_len (t->free_buffers) = n_alloc; +} + +#endif +/* Append given data to end of buffer, possibly allocating new buffers. */ +u32 +vlib_buffer_add_data (vlib_main_t * vm, + u32 free_list_index, + u32 buffer_index, void *data, u32 n_data_bytes) +{ + u32 n_buffer_bytes, n_left, n_left_this_buffer, bi; + vlib_buffer_t *b; + void *d; + + bi = buffer_index; + if (bi == 0 + && 1 != vlib_buffer_alloc_from_free_list (vm, &bi, 1, free_list_index)) + goto out_of_buffers; + + d = data; + n_left = n_data_bytes; + n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, free_list_index); + + b = vlib_get_buffer (vm, bi); + b->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID; + + /* Get to the end of the chain before we try to append data... */ + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + b = vlib_get_buffer (vm, b->next_buffer); + + while (1) + { + u32 n; + + ASSERT (n_buffer_bytes >= b->current_length); + n_left_this_buffer = + n_buffer_bytes - (b->current_data + b->current_length); + n = clib_min (n_left_this_buffer, n_left); + clib_memcpy (vlib_buffer_get_current (b) + b->current_length, d, n); + b->current_length += n; + n_left -= n; + if (n_left == 0) + break; + + d += n; + if (1 != + vlib_buffer_alloc_from_free_list (vm, &b->next_buffer, 1, + free_list_index)) + goto out_of_buffers; + + b->flags |= VLIB_BUFFER_NEXT_PRESENT; + + b = vlib_get_buffer (vm, b->next_buffer); + } + + return bi; + +out_of_buffers: + clib_error ("out of buffers"); + return bi; +} + +u16 +vlib_buffer_chain_append_data_with_alloc (vlib_main_t * vm, + u32 free_list_index, + vlib_buffer_t * first, + vlib_buffer_t ** last, + void *data, u16 data_len) +{ + vlib_buffer_t *l = *last; + u32 n_buffer_bytes = + vlib_buffer_free_list_buffer_size (vm, free_list_index); + u16 copied = 0; + ASSERT (n_buffer_bytes >= l->current_length + l->current_data); + while (data_len) + { + u16 max = n_buffer_bytes - l->current_length - l->current_data; + if (max == 0) + { + if (1 != + vlib_buffer_alloc_from_free_list (vm, &l->next_buffer, 1, + free_list_index)) + return copied; + *last = l = vlib_buffer_chain_buffer (vm, first, l, l->next_buffer); + max = n_buffer_bytes - l->current_length - l->current_data; + } + + u16 len = (data_len > max) ? max : data_len; + clib_memcpy (vlib_buffer_get_current (l) + l->current_length, + data + copied, len); + vlib_buffer_chain_increase_length (first, l, len); + data_len -= len; + copied += len; + } + return copied; +} + +#if DPDK > 0 +clib_error_t * +vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, + unsigned socket_id) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_physmem_main_t *vpm = &vm->physmem_main; + struct rte_mempool *rmp; + int i; + + if (!rte_pktmbuf_pool_create) + return clib_error_return (0, "not linked with DPDK"); + + vec_validate_aligned (bm->pktmbuf_pools, socket_id, CLIB_CACHE_LINE_BYTES); + + /* pool already exists, nothing to do */ + if (bm->pktmbuf_pools[socket_id]) + return 0; + + u8 *pool_name = format (0, "mbuf_pool_socket%u%c", socket_id, 0); + + rmp = rte_pktmbuf_pool_create ((char *) pool_name, /* pool name */ + num_mbufs, /* number of mbufs */ + 512, /* cache size */ + VLIB_BUFFER_HDR_SIZE, /* priv size */ + VLIB_BUFFER_PRE_DATA_SIZE + VLIB_BUFFER_DATA_SIZE, /* dataroom size */ + socket_id); /* cpu socket */ + + if (rmp) + { + { + uword this_pool_end; + uword this_pool_start; + uword this_pool_size; + uword save_vpm_start, save_vpm_end, save_vpm_size; + struct rte_mempool_memhdr *memhdr; + + this_pool_start = ~0ULL; + this_pool_end = 0LL; + + STAILQ_FOREACH (memhdr, &rmp->mem_list, next) + { + if (((uword) (memhdr->addr + memhdr->len)) > this_pool_end) + this_pool_end = (uword) (memhdr->addr + memhdr->len); + if (((uword) memhdr->addr) < this_pool_start) + this_pool_start = (uword) (memhdr->addr); + } + ASSERT (this_pool_start < ~0ULL && this_pool_end > 0); + this_pool_size = this_pool_end - this_pool_start; + + if (CLIB_DEBUG > 1) + { + clib_warning ("%s: pool start %llx pool end %llx pool size %lld", + pool_name, this_pool_start, this_pool_end, + this_pool_size); + clib_warning + ("before: virtual.start %llx virtual.end %llx virtual.size %lld", + vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); + } + + save_vpm_start = vpm->virtual.start; + save_vpm_end = vpm->virtual.end; + save_vpm_size = vpm->virtual.size; + + if ((this_pool_start < vpm->virtual.start) || vpm->virtual.start == 0) + vpm->virtual.start = this_pool_start; + if (this_pool_end > vpm->virtual.end) + vpm->virtual.end = this_pool_end; + + vpm->virtual.size = vpm->virtual.end - vpm->virtual.start; + + if (CLIB_DEBUG > 1) + { + clib_warning + ("after: virtual.start %llx virtual.end %llx virtual.size %lld", + vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); + } + + /* check if fits into buffer index range */ + if ((u64) vpm->virtual.size > + ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES))) + { + clib_warning ("physmem: virtual size out of range!"); + vpm->virtual.start = save_vpm_start; + vpm->virtual.end = save_vpm_end; + vpm->virtual.size = save_vpm_size; + rmp = 0; + } + } + if (rmp) + { + bm->pktmbuf_pools[socket_id] = rmp; + vec_free (pool_name); + return 0; + } + } + + vec_free (pool_name); + + /* no usable pool for this socket, try to use pool from another one */ + for (i = 0; i < vec_len (bm->pktmbuf_pools); i++) + { + if (bm->pktmbuf_pools[i]) + { + clib_warning + ("WARNING: Failed to allocate mempool for CPU socket %u. " + "Threads running on socket %u will use socket %u mempool.", + socket_id, socket_id, i); + bm->pktmbuf_pools[socket_id] = bm->pktmbuf_pools[i]; + return 0; + } + } + + return clib_error_return (0, "failed to allocate mempool on socket %u", + socket_id); +} +#endif + +static void +vlib_serialize_tx (serialize_main_header_t * m, serialize_stream_t * s) +{ + vlib_main_t *vm; + vlib_serialize_buffer_main_t *sm; + uword n, n_bytes_to_write; + vlib_buffer_t *last; + + n_bytes_to_write = s->current_buffer_index; + sm = + uword_to_pointer (s->data_function_opaque, + vlib_serialize_buffer_main_t *); + vm = sm->vlib_main; + + ASSERT (sm->tx.max_n_data_bytes_per_chain > 0); + if (serialize_stream_is_end_of_stream (s) + || sm->tx.n_total_data_bytes + n_bytes_to_write > + sm->tx.max_n_data_bytes_per_chain) + { + vlib_process_t *p = vlib_get_current_process (vm); + + last = vlib_get_buffer (vm, sm->last_buffer); + last->current_length = n_bytes_to_write; + + vlib_set_next_frame_buffer (vm, &p->node_runtime, sm->tx.next_index, + sm->first_buffer); + + sm->first_buffer = sm->last_buffer = ~0; + sm->tx.n_total_data_bytes = 0; + } + + else if (n_bytes_to_write == 0 && s->n_buffer_bytes == 0) + { + ASSERT (sm->first_buffer == ~0); + ASSERT (sm->last_buffer == ~0); + n = + vlib_buffer_alloc_from_free_list (vm, &sm->first_buffer, 1, + sm->tx.free_list_index); + if (n != 1) + serialize_error (m, + clib_error_create + ("vlib_buffer_alloc_from_free_list fails")); + sm->last_buffer = sm->first_buffer; + s->n_buffer_bytes = + vlib_buffer_free_list_buffer_size (vm, sm->tx.free_list_index); + } + + if (n_bytes_to_write > 0) + { + vlib_buffer_t *prev = vlib_get_buffer (vm, sm->last_buffer); + n = + vlib_buffer_alloc_from_free_list (vm, &sm->last_buffer, 1, + sm->tx.free_list_index); + if (n != 1) + serialize_error (m, + clib_error_create + ("vlib_buffer_alloc_from_free_list fails")); + sm->tx.n_total_data_bytes += n_bytes_to_write; + prev->current_length = n_bytes_to_write; + prev->next_buffer = sm->last_buffer; + prev->flags |= VLIB_BUFFER_NEXT_PRESENT; + } + + if (sm->last_buffer != ~0) + { + last = vlib_get_buffer (vm, sm->last_buffer); + s->buffer = vlib_buffer_get_current (last); + s->current_buffer_index = 0; + ASSERT (last->current_data == s->current_buffer_index); + } +} + +static void +vlib_serialize_rx (serialize_main_header_t * m, serialize_stream_t * s) +{ + vlib_main_t *vm; + vlib_serialize_buffer_main_t *sm; + vlib_buffer_t *last; + + sm = + uword_to_pointer (s->data_function_opaque, + vlib_serialize_buffer_main_t *); + vm = sm->vlib_main; + + if (serialize_stream_is_end_of_stream (s)) + return; + + if (sm->last_buffer != ~0) + { + last = vlib_get_buffer (vm, sm->last_buffer); + + if (last->flags & VLIB_BUFFER_NEXT_PRESENT) + sm->last_buffer = last->next_buffer; + else + { + vlib_buffer_free (vm, &sm->first_buffer, /* count */ 1); + sm->first_buffer = sm->last_buffer = ~0; + } + } + + if (sm->last_buffer == ~0) + { + while (clib_fifo_elts (sm->rx.buffer_fifo) == 0) + { + sm->rx.ready_one_time_event = + vlib_process_create_one_time_event (vm, vlib_current_process (vm), + ~0); + vlib_process_wait_for_one_time_event (vm, /* no event data */ 0, + sm->rx.ready_one_time_event); + } + + clib_fifo_sub1 (sm->rx.buffer_fifo, sm->first_buffer); + sm->last_buffer = sm->first_buffer; + } + + ASSERT (sm->last_buffer != ~0); + + last = vlib_get_buffer (vm, sm->last_buffer); + s->current_buffer_index = 0; + s->buffer = vlib_buffer_get_current (last); + s->n_buffer_bytes = last->current_length; +} + +static void +serialize_open_vlib_helper (serialize_main_t * m, + vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm, uword is_read) +{ + /* Initialize serialize main but save overflow buffer for re-use between calls. */ + { + u8 *save = m->stream.overflow_buffer; + memset (m, 0, sizeof (m[0])); + m->stream.overflow_buffer = save; + if (save) + _vec_len (save) = 0; + } + + sm->first_buffer = sm->last_buffer = ~0; + if (is_read) + clib_fifo_reset (sm->rx.buffer_fifo); + else + sm->tx.n_total_data_bytes = 0; + sm->vlib_main = vm; + m->header.data_function = is_read ? vlib_serialize_rx : vlib_serialize_tx; + m->stream.data_function_opaque = pointer_to_uword (sm); +} + +void +serialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm) +{ + serialize_open_vlib_helper (m, vm, sm, /* is_read */ 0); +} + +void +unserialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm) +{ + serialize_open_vlib_helper (m, vm, sm, /* is_read */ 1); +} + +u32 +serialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + vlib_buffer_t *last; + serialize_stream_t *s = &m->stream; + + last = vlib_get_buffer (sm->vlib_main, sm->last_buffer); + last->current_length = s->current_buffer_index; + + if (vec_len (s->overflow_buffer) > 0) + { + sm->last_buffer + = vlib_buffer_add_data (sm->vlib_main, sm->tx.free_list_index, + sm->last_buffer == ~0 ? 0 : sm->last_buffer, + s->overflow_buffer, + vec_len (s->overflow_buffer)); + _vec_len (s->overflow_buffer) = 0; + } + + return sm->first_buffer; +} + +void +unserialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + if (sm->first_buffer != ~0) + vlib_buffer_free_one (sm->vlib_main, sm->first_buffer); + clib_fifo_reset (sm->rx.buffer_fifo); + if (m->stream.overflow_buffer) + _vec_len (m->stream.overflow_buffer) = 0; +} + +static u8 * +format_vlib_buffer_free_list (u8 * s, va_list * va) +{ + vlib_buffer_free_list_t *f = va_arg (*va, vlib_buffer_free_list_t *); +#if DPDK > 0 + u32 threadnum = va_arg (*va, u32); + uword bytes_alloc, bytes_free, n_free, size; + + if (!f) + return format (s, "%=7s%=30s%=12s%=12s%=12s%=12s%=12s%=12s", + "Thread", "Name", "Index", "Size", "Alloc", "Free", + "#Alloc", "#Free"); + + size = sizeof (vlib_buffer_t) + f->n_data_bytes; + n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers); + bytes_alloc = size * f->n_alloc; + bytes_free = size * n_free; + + s = format (s, "%7d%30s%12d%12d%=12U%=12U%=12d%=12d", threadnum, +#else + uword bytes_alloc, bytes_free, n_free, size; + + if (!f) + return format (s, "%=30s%=12s%=12s%=12s%=12s%=12s%=12s", + "Name", "Index", "Size", "Alloc", "Free", "#Alloc", + "#Free"); + + size = sizeof (vlib_buffer_t) + f->n_data_bytes; + n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers); + bytes_alloc = size * f->n_alloc; + bytes_free = size * n_free; + + s = format (s, "%30s%12d%12d%=12U%=12U%=12d%=12d", +#endif + f->name, f->index, f->n_data_bytes, + format_memory_size, bytes_alloc, + format_memory_size, bytes_free, f->n_alloc, n_free); + + return s; +} + +static clib_error_t * +show_buffers (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ +#if DPDK > 0 + vlib_buffer_main_t *bm; + vlib_buffer_free_list_t *f; + vlib_main_t *curr_vm; + u32 vm_index = 0; + + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, 0, 0); + + do + { + curr_vm = vec_len (vlib_mains) ? vlib_mains[vm_index] : vm; + bm = curr_vm->buffer_main; + + /* *INDENT-OFF* */ + pool_foreach (f, bm->buffer_free_list_pool, ({ + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, f, vm_index); + })); + /* *INDENT-ON* */ + + vm_index++; + } + while (vm_index < vec_len (vlib_mains)); + +#else + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, 0); + /* *INDENT-OFF* */ + pool_foreach (f, bm->buffer_free_list_pool, ({ + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, f); + })); +/* *INDENT-ON* */ + +#endif + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_buffers_command, static) = { + .path = "show buffers", + .short_help = "Show packet buffer allocation", + .function = show_buffers, +}; +/* *INDENT-ON* */ + +#if DPDK > 0 +#if CLIB_DEBUG > 0 + +u32 *vlib_buffer_state_validation_lock; +uword *vlib_buffer_state_validation_hash; +void *vlib_buffer_state_heap; + +static clib_error_t * +buffer_state_validation_init (vlib_main_t * vm) +{ + void *oldheap; + + vlib_buffer_state_heap = mheap_alloc (0, 10 << 20); + + oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + + vlib_buffer_state_validation_hash = hash_create (0, sizeof (uword)); + vec_validate_aligned (vlib_buffer_state_validation_lock, 0, + CLIB_CACHE_LINE_BYTES); + clib_mem_set_heap (oldheap); + return 0; +} + +VLIB_INIT_FUNCTION (buffer_state_validation_init); +#endif +#endif + + +/** @endcond */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ -- cgit 1.2.3-korg From 878c609889dcdc58538d40d8b3f662320f88573d Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 4 Jan 2017 13:19:27 +0100 Subject: vlib: add buffer and thread callbacks Change-Id: I8e2e8f94a884ab2f9909d0c83ba00edd38cdab77 Signed-off-by: Damjan Marion --- src/plugins/flowperpkt/flowperpkt.c | 2 +- src/vlib.am | 1 + src/vlib/buffer.c | 736 +++--------------------------------- src/vlib/buffer.h | 45 ++- src/vlib/buffer_funcs.h | 82 ++-- src/vlib/buffer_serialize.c | 248 ++++++++++++ src/vlib/main.c | 7 +- src/vlib/threads.c | 112 +++--- src/vlib/threads.h | 17 +- src/vlib/threads_cli.c | 25 -- src/vlib/unix/physmem.c | 15 +- src/vnet.am | 2 + src/vnet/devices/dpdk/buffer.c | 729 +++++++++++++++++++++++++++++++++++ src/vnet/devices/dpdk/cli.c | 4 +- src/vnet/devices/dpdk/device.c | 7 +- src/vnet/devices/dpdk/dpdk.h | 3 + src/vnet/devices/dpdk/dpdk_priv.h | 3 + src/vnet/devices/dpdk/init.c | 6 +- src/vnet/devices/dpdk/thread.c | 85 +++++ src/vnet/sr/sr_replicate.c | 7 +- 20 files changed, 1304 insertions(+), 832 deletions(-) create mode 100644 src/vlib/buffer_serialize.c create mode 100644 src/vnet/devices/dpdk/buffer.c create mode 100644 src/vnet/devices/dpdk/thread.c (limited to 'src/vlib/buffer.c') diff --git a/src/plugins/flowperpkt/flowperpkt.c b/src/plugins/flowperpkt/flowperpkt.c index fb71d5b0..cc351599 100644 --- a/src/plugins/flowperpkt/flowperpkt.c +++ b/src/plugins/flowperpkt/flowperpkt.c @@ -643,7 +643,7 @@ flowperpkt_init (vlib_main_t * vm) vec_free (name); /* Decide how many worker threads we have */ - num_threads = 1 /* main thread */ + tm->n_eal_threads; + num_threads = 1 /* main thread */ + tm->n_threads; /* Allocate per worker thread vectors */ vec_validate (fm->ipv4_buffers_per_worker, num_threads - 1); diff --git a/src/vlib.am b/src/vlib.am index 0154d841..c21f88c4 100644 --- a/src/vlib.am +++ b/src/vlib.am @@ -23,6 +23,7 @@ vlib/config.h: libvlib_la_SOURCES = \ vlib/buffer.c \ + vlib/buffer_serialize.c \ vlib/cli.c \ vlib/cli.h \ vlib/config.h \ diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 4bf6d125..0b0e6054 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -38,50 +38,13 @@ */ /** - * @cond (!DPDK) * @file * * Allocate/free network buffers. */ -#if DPDK > 0 -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#endif - #include -#if DPDK > 0 -#pragma weak rte_mem_virt2phy -#pragma weak rte_eal_has_hugepages -#pragma weak rte_socket_id -#pragma weak rte_pktmbuf_pool_create -#endif - uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, vlib_buffer_t * b_first) @@ -103,7 +66,6 @@ u8 * format_vlib_buffer (u8 * s, va_list * args) { vlib_buffer_t *b = va_arg (*args, vlib_buffer_t *); -#if DPDK > 0 uword indent = format_get_indent (s); s = format (s, "current data %d, length %d, free-list %d", @@ -126,18 +88,6 @@ format_vlib_buffer (u8 * s, va_list * args) format_white_space, indent, next_buffer, b->current_length); } -#else - - s = format (s, "current data %d, length %d, free-list %d", - b->current_data, b->current_length, b->free_list_index); - - if (b->flags & VLIB_BUFFER_IS_TRACED) - s = format (s, ", trace 0x%x", b->trace_index); - - if (b->flags & VLIB_BUFFER_NEXT_PRESENT) - s = format (s, ", next-buffer 0x%x", b->next_buffer); -#endif - return s; } @@ -153,7 +103,6 @@ format_vlib_buffer_and_data (u8 * s, va_list * args) return s; } -#if DPDK == 0 static u8 * format_vlib_buffer_known_state (u8 * s, va_list * args) { @@ -181,7 +130,6 @@ format_vlib_buffer_known_state (u8 * s, va_list * args) return format (s, "%s", t); } -#endif u8 * format_vlib_buffer_contents (u8 * s, va_list * va) @@ -200,7 +148,6 @@ format_vlib_buffer_contents (u8 * s, va_list * va) return s; } -#if DPDK == 0 static u8 * vlib_validate_buffer_helper (vlib_main_t * vm, u32 bi, @@ -217,11 +164,10 @@ vlib_validate_buffer_helper (vlib_main_t * vm, if ((signed) b->current_data < (signed) -VLIB_BUFFER_PRE_DATA_SIZE) return format (0, "current data %d before pre-data", b->current_data); -#if DPDK == 0 + if (b->current_data + b->current_length > fl->n_data_bytes) return format (0, "%d-%d beyond end of buffer %d", b->current_data, b->current_length, fl->n_data_bytes); -#endif if (follow_buffer_next && (b->flags & VLIB_BUFFER_NEXT_PRESENT)) { @@ -311,14 +257,12 @@ done: hash_free (hash); return result; } -#endif vlib_main_t **vlib_mains; -#if DPDK == 0 /* When dubugging validate that given buffers are either known allocated or known free. */ -static void +static void __attribute__ ((unused)) vlib_buffer_validate_alloc_free (vlib_main_t * vm, u32 * buffers, uword n_buffers, @@ -359,7 +303,6 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, is_free ? VLIB_BUFFER_KNOWN_FREE : VLIB_BUFFER_KNOWN_ALLOCATED); } } -#endif #define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) @@ -463,7 +406,6 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, { vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *f; -#if DPDK > 0 int i; ASSERT (os_get_cpu_number () == 0); @@ -519,47 +461,6 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, wf->unaligned_buffers = 0; wf->n_alloc = 0; } -#else - - if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0) - { - u32 default_free_free_list_index; - - default_free_free_list_index = vlib_buffer_create_free_list_helper (vm, - /* default buffer size */ - VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, - /* is_public */ - 1, - /* is_default */ - 1, - (u8 - *) - "default"); - ASSERT (default_free_free_list_index == - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); - - if (n_data_bytes == VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES && is_public) - return default_free_free_list_index; - } - - pool_get_aligned (bm->buffer_free_list_pool, f, CLIB_CACHE_LINE_BYTES); - - memset (f, 0, sizeof (f[0])); - f->index = f - bm->buffer_free_list_pool; - f->n_data_bytes = vlib_buffer_round_size (n_data_bytes); - f->min_n_buffers_each_physmem_alloc = 256; - f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name); - - /* Setup free buffer template. */ - f->buffer_init_template.free_list_index = f->index; - - if (is_public) - { - uword *p = hash_get (bm->free_list_by_size, f->n_data_bytes); - if (!p) - hash_set (bm->free_list_by_size, f->n_data_bytes, f->index); - } -#endif return f->index; } @@ -609,50 +510,30 @@ static void del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) { u32 i; -#if DPDK > 0 - struct rte_mbuf *mb; - vlib_buffer_t *b; - - for (i = 0; i < vec_len (f->unaligned_buffers); i++) - { - b = vlib_get_buffer (vm, f->unaligned_buffers[i]); - mb = rte_mbuf_from_vlib_buffer (b); - ASSERT (rte_mbuf_refcnt_read (mb) == 1); - rte_pktmbuf_free (mb); - } - for (i = 0; i < vec_len (f->aligned_buffers); i++) - { - b = vlib_get_buffer (vm, f->aligned_buffers[i]); - mb = rte_mbuf_from_vlib_buffer (b); - ASSERT (rte_mbuf_refcnt_read (mb) == 1); - rte_pktmbuf_free (mb); - } - vec_free (f->name); -#else for (i = 0; i < vec_len (f->buffer_memory_allocated); i++) vm->os_physmem_free (f->buffer_memory_allocated[i]); vec_free (f->name); vec_free (f->buffer_memory_allocated); -#endif vec_free (f->unaligned_buffers); vec_free (f->aligned_buffers); } /* Add buffer free list. */ void -vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) +vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index) { vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *f; u32 merge_index; -#if DPDK > 0 int i; ASSERT (os_get_cpu_number () == 0); f = vlib_buffer_get_free_list (vm, free_list_index); + ASSERT (vec_len (f->unaligned_buffers) + vec_len (f->aligned_buffers) == + f->n_alloc); merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); if (merge_index != ~0 && merge_index != free_list_index) { @@ -674,26 +555,6 @@ vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) memset (f, 0xab, sizeof (f[0])); pool_put (bm->buffer_free_list_pool, f); } -#else - - f = vlib_buffer_get_free_list (vm, free_list_index); - - ASSERT (vec_len (f->unaligned_buffers) + vec_len (f->aligned_buffers) == - f->n_alloc); - merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); - if (merge_index != ~0 && merge_index != free_list_index) - { - merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, - merge_index), f); - } - - del_free_list (vm, f); - - /* Poison it. */ - memset (f, 0xab, sizeof (f[0])); - - pool_put (bm->buffer_free_list_pool, f); -#endif } /* Make sure free list has at least given number of free buffers. */ @@ -701,63 +562,6 @@ static uword fill_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * fl, uword min_free_buffers) { -#if DPDK > 0 - vlib_buffer_t *b; - int n, i; - u32 bi; - u32 n_remaining = 0, n_alloc = 0; - unsigned socket_id = rte_socket_id ? rte_socket_id () : 0; - struct rte_mempool *rmp = vm->buffer_main->pktmbuf_pools[socket_id]; - struct rte_mbuf *mb; - - /* Too early? */ - if (PREDICT_FALSE (rmp == 0)) - return 0; - - trim_aligned (fl); - - /* Already have enough free buffers on free list? */ - n = min_free_buffers - vec_len (fl->aligned_buffers); - if (n <= 0) - return min_free_buffers; - - /* Always allocate round number of buffers. */ - n = round_pow2 (n, BUFFERS_PER_COPY); - - /* Always allocate new buffers in reasonably large sized chunks. */ - n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); - - vec_validate (vm->mbuf_alloc_list, n - 1); - - if (rte_mempool_get_bulk (rmp, vm->mbuf_alloc_list, n) < 0) - return 0; - - _vec_len (vm->mbuf_alloc_list) = n; - - for (i = 0; i < n; i++) - { - mb = vm->mbuf_alloc_list[i]; - - ASSERT (rte_mbuf_refcnt_read (mb) == 0); - rte_mbuf_refcnt_set (mb, 1); - - b = vlib_buffer_from_rte_mbuf (mb); - bi = vlib_get_buffer_index (vm, b); - - vec_add1_aligned (fl->aligned_buffers, bi, sizeof (vlib_copy_unit_t)); - n_alloc++; - n_remaining--; - - vlib_buffer_init_for_free_list (b, fl); - - if (fl->buffer_init_function) - fl->buffer_init_function (vm, fl, &bi, 1); - } - - fl->n_alloc += n; - - return n; -#else vlib_buffer_t *buffers, *b; int n, n_bytes, i; u32 *bi; @@ -824,7 +628,6 @@ fill_free_list (vlib_main_t * vm, fl->buffer_init_function (vm, fl, bi, n_this_chunk); } return n_alloc; -#endif } always_inline uword @@ -833,6 +636,7 @@ copy_alignment (u32 * x) return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY; } + static u32 alloc_from_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * free_list, @@ -842,10 +646,6 @@ alloc_from_free_list (vlib_main_t * vm, uword u_len, n_left; uword n_unaligned_start, n_unaligned_end, n_filled; -#if DPDK == 0 - ASSERT (os_get_cpu_number () == 0); - -#endif n_left = n_alloc_buffers; dst = alloc_buffers; n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst)) @@ -945,25 +745,21 @@ alloc_from_free_list (vlib_main_t * vm, else _vec_len (free_list->unaligned_buffers) = u_len; -#if DPDK == 0 /* Verify that buffers are known free. */ vlib_buffer_validate_alloc_free (vm, alloc_buffers, n_alloc_buffers, VLIB_BUFFER_KNOWN_FREE); -#endif return n_alloc_buffers; } + /* Allocate a given number of buffers into given array. Returns number actually allocated which will be either zero or number requested. */ -u32 -vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +static u32 +vlib_buffer_alloc_internal (vlib_main_t * vm, u32 * buffers, u32 n_buffers) { vlib_buffer_main_t *bm = vm->buffer_main; -#if DPDK == 0 - ASSERT (os_get_cpu_number () == 0); -#endif return alloc_from_free_list (vm, @@ -972,10 +768,10 @@ vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) buffers, n_buffers); } -u32 -vlib_buffer_alloc_from_free_list (vlib_main_t * vm, - u32 * buffers, - u32 n_buffers, u32 free_list_index) +static u32 +vlib_buffer_alloc_from_free_list_internal (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 free_list_index) { vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *f; @@ -1016,81 +812,10 @@ vlib_set_buffer_free_callback (vlib_main_t * vm, void *fp) return rv; } -#if DPDK == 0 -void vnet_buffer_free_dpdk_mb (vlib_buffer_t * b) __attribute__ ((weak)); -void -vnet_buffer_free_dpdk_mb (vlib_buffer_t * b) -{ -} - -#endif static_always_inline void vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers, u32 follow_buffer_next) { -#if DPDK > 0 - vlib_buffer_main_t *bm = vm->buffer_main; - vlib_buffer_free_list_t *fl; - u32 fi; - int i; - u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers, - u32 follow_buffer_next); - - cb = bm->buffer_free_callback; - - if (PREDICT_FALSE (cb != 0)) - n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next); - - if (!n_buffers) - return; - - for (i = 0; i < n_buffers; i++) - { - vlib_buffer_t *b; - struct rte_mbuf *mb; - - b = vlib_get_buffer (vm, buffers[i]); - - fl = buffer_get_free_list (vm, b, &fi); - - /* The only current use of this callback: multicast recycle */ - if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0)) - { - int j; - - add_buffer_to_free_list - (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0); - - for (j = 0; j < vec_len (bm->announce_list); j++) - { - if (fl == bm->announce_list[j]) - goto already_announced; - } - vec_add1 (bm->announce_list, fl); - already_announced: - ; - } - else - { - if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0)) - { - mb = rte_mbuf_from_vlib_buffer (b); - ASSERT (rte_mbuf_refcnt_read (mb) == 1); - rte_pktmbuf_free (mb); - } - } - } - if (vec_len (bm->announce_list)) - { - vlib_buffer_free_list_t *fl; - for (i = 0; i < vec_len (bm->announce_list); i++) - { - fl = bm->announce_list[i]; - fl->buffers_added_to_freelist_function (vm, fl); - } - _vec_len (bm->announce_list) = 0; - } -#else vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *fl; static u32 *next_to_free[2]; /* smp bad */ @@ -1315,26 +1040,25 @@ again: } _vec_len (announce_list) = 0; } -#endif } -void -vlib_buffer_free (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +static void +vlib_buffer_free_internal (vlib_main_t * vm, u32 * buffers, u32 n_buffers) { vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ 1); } -void -vlib_buffer_free_no_next (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +static void +vlib_buffer_free_no_next_internal (vlib_main_t * vm, u32 * buffers, + u32 n_buffers) { vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ 0); } -#if DPDK == 0 /* Copy template packet data into buffers as they are allocated. */ -static void +static void __attribute__ ((unused)) vlib_packet_template_buffer_init (vlib_main_t * vm, vlib_buffer_free_list_t * fl, u32 * buffers, u32 n_buffers) @@ -1352,7 +1076,6 @@ vlib_packet_template_buffer_init (vlib_main_t * vm, b->current_length); } } -#endif void vlib_packet_template_init (vlib_main_t * vm, @@ -1362,28 +1085,22 @@ vlib_packet_template_init (vlib_main_t * vm, uword min_n_buffers_each_physmem_alloc, char *fmt, ...) { -#if DPDK > 0 + vlib_buffer_main_t *bm = vm->buffer_main; va_list va; __attribute__ ((unused)) u8 *name; + vlib_buffer_free_list_t *fl; va_start (va, fmt); name = va_format (0, fmt, &va); va_end (va); - vlib_worker_thread_barrier_sync (vm); - memset (t, 0, sizeof (t[0])); - - vec_add (t->packet_data, packet_data, n_packet_data_bytes); + if (bm->cb.vlib_packet_template_init_cb) + bm->cb.vlib_packet_template_init_cb (vm, (void *) t, packet_data, + n_packet_data_bytes, + min_n_buffers_each_physmem_alloc, + name); - vlib_worker_thread_barrier_release (vm); -#else - vlib_buffer_free_list_t *fl; - va_list va; - u8 *name; - - va_start (va, fmt); - name = va_format (0, fmt, &va); - va_end (va); + vlib_worker_thread_barrier_sync (vm); memset (t, 0, sizeof (t[0])); @@ -1406,7 +1123,7 @@ vlib_packet_template_init (vlib_main_t * vm, fl->buffer_init_template.current_data = 0; fl->buffer_init_template.current_length = n_packet_data_bytes; fl->buffer_init_template.flags = 0; -#endif + vlib_worker_thread_barrier_release (vm); } void * @@ -1429,7 +1146,6 @@ vlib_packet_template_get_packet (vlib_main_t * vm, return b->data; } -#if DPDK == 0 void vlib_packet_template_get_packet_helper (vlib_main_t * vm, vlib_packet_template_t * t) @@ -1447,7 +1163,6 @@ vlib_packet_template_get_packet_helper (vlib_main_t * vm, _vec_len (t->free_buffers) = n_alloc; } -#endif /* Append given data to end of buffer, possibly allocating new buffers. */ u32 vlib_buffer_add_data (vlib_main_t * vm, @@ -1541,328 +1256,11 @@ vlib_buffer_chain_append_data_with_alloc (vlib_main_t * vm, return copied; } -#if DPDK > 0 -clib_error_t * -vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, - unsigned socket_id) -{ - vlib_buffer_main_t *bm = vm->buffer_main; - vlib_physmem_main_t *vpm = &vm->physmem_main; - struct rte_mempool *rmp; - int i; - - if (!rte_pktmbuf_pool_create) - return clib_error_return (0, "not linked with DPDK"); - - vec_validate_aligned (bm->pktmbuf_pools, socket_id, CLIB_CACHE_LINE_BYTES); - - /* pool already exists, nothing to do */ - if (bm->pktmbuf_pools[socket_id]) - return 0; - - u8 *pool_name = format (0, "mbuf_pool_socket%u%c", socket_id, 0); - - rmp = rte_pktmbuf_pool_create ((char *) pool_name, /* pool name */ - num_mbufs, /* number of mbufs */ - 512, /* cache size */ - VLIB_BUFFER_HDR_SIZE, /* priv size */ - VLIB_BUFFER_PRE_DATA_SIZE + VLIB_BUFFER_DATA_SIZE, /* dataroom size */ - socket_id); /* cpu socket */ - - if (rmp) - { - { - uword this_pool_end; - uword this_pool_start; - uword this_pool_size; - uword save_vpm_start, save_vpm_end, save_vpm_size; - struct rte_mempool_memhdr *memhdr; - - this_pool_start = ~0ULL; - this_pool_end = 0LL; - - STAILQ_FOREACH (memhdr, &rmp->mem_list, next) - { - if (((uword) (memhdr->addr + memhdr->len)) > this_pool_end) - this_pool_end = (uword) (memhdr->addr + memhdr->len); - if (((uword) memhdr->addr) < this_pool_start) - this_pool_start = (uword) (memhdr->addr); - } - ASSERT (this_pool_start < ~0ULL && this_pool_end > 0); - this_pool_size = this_pool_end - this_pool_start; - - if (CLIB_DEBUG > 1) - { - clib_warning ("%s: pool start %llx pool end %llx pool size %lld", - pool_name, this_pool_start, this_pool_end, - this_pool_size); - clib_warning - ("before: virtual.start %llx virtual.end %llx virtual.size %lld", - vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); - } - - save_vpm_start = vpm->virtual.start; - save_vpm_end = vpm->virtual.end; - save_vpm_size = vpm->virtual.size; - - if ((this_pool_start < vpm->virtual.start) || vpm->virtual.start == 0) - vpm->virtual.start = this_pool_start; - if (this_pool_end > vpm->virtual.end) - vpm->virtual.end = this_pool_end; - - vpm->virtual.size = vpm->virtual.end - vpm->virtual.start; - - if (CLIB_DEBUG > 1) - { - clib_warning - ("after: virtual.start %llx virtual.end %llx virtual.size %lld", - vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); - } - - /* check if fits into buffer index range */ - if ((u64) vpm->virtual.size > - ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES))) - { - clib_warning ("physmem: virtual size out of range!"); - vpm->virtual.start = save_vpm_start; - vpm->virtual.end = save_vpm_end; - vpm->virtual.size = save_vpm_size; - rmp = 0; - } - } - if (rmp) - { - bm->pktmbuf_pools[socket_id] = rmp; - vec_free (pool_name); - return 0; - } - } - - vec_free (pool_name); - - /* no usable pool for this socket, try to use pool from another one */ - for (i = 0; i < vec_len (bm->pktmbuf_pools); i++) - { - if (bm->pktmbuf_pools[i]) - { - clib_warning - ("WARNING: Failed to allocate mempool for CPU socket %u. " - "Threads running on socket %u will use socket %u mempool.", - socket_id, socket_id, i); - bm->pktmbuf_pools[socket_id] = bm->pktmbuf_pools[i]; - return 0; - } - } - - return clib_error_return (0, "failed to allocate mempool on socket %u", - socket_id); -} -#endif - -static void -vlib_serialize_tx (serialize_main_header_t * m, serialize_stream_t * s) -{ - vlib_main_t *vm; - vlib_serialize_buffer_main_t *sm; - uword n, n_bytes_to_write; - vlib_buffer_t *last; - - n_bytes_to_write = s->current_buffer_index; - sm = - uword_to_pointer (s->data_function_opaque, - vlib_serialize_buffer_main_t *); - vm = sm->vlib_main; - - ASSERT (sm->tx.max_n_data_bytes_per_chain > 0); - if (serialize_stream_is_end_of_stream (s) - || sm->tx.n_total_data_bytes + n_bytes_to_write > - sm->tx.max_n_data_bytes_per_chain) - { - vlib_process_t *p = vlib_get_current_process (vm); - - last = vlib_get_buffer (vm, sm->last_buffer); - last->current_length = n_bytes_to_write; - - vlib_set_next_frame_buffer (vm, &p->node_runtime, sm->tx.next_index, - sm->first_buffer); - - sm->first_buffer = sm->last_buffer = ~0; - sm->tx.n_total_data_bytes = 0; - } - - else if (n_bytes_to_write == 0 && s->n_buffer_bytes == 0) - { - ASSERT (sm->first_buffer == ~0); - ASSERT (sm->last_buffer == ~0); - n = - vlib_buffer_alloc_from_free_list (vm, &sm->first_buffer, 1, - sm->tx.free_list_index); - if (n != 1) - serialize_error (m, - clib_error_create - ("vlib_buffer_alloc_from_free_list fails")); - sm->last_buffer = sm->first_buffer; - s->n_buffer_bytes = - vlib_buffer_free_list_buffer_size (vm, sm->tx.free_list_index); - } - - if (n_bytes_to_write > 0) - { - vlib_buffer_t *prev = vlib_get_buffer (vm, sm->last_buffer); - n = - vlib_buffer_alloc_from_free_list (vm, &sm->last_buffer, 1, - sm->tx.free_list_index); - if (n != 1) - serialize_error (m, - clib_error_create - ("vlib_buffer_alloc_from_free_list fails")); - sm->tx.n_total_data_bytes += n_bytes_to_write; - prev->current_length = n_bytes_to_write; - prev->next_buffer = sm->last_buffer; - prev->flags |= VLIB_BUFFER_NEXT_PRESENT; - } - - if (sm->last_buffer != ~0) - { - last = vlib_get_buffer (vm, sm->last_buffer); - s->buffer = vlib_buffer_get_current (last); - s->current_buffer_index = 0; - ASSERT (last->current_data == s->current_buffer_index); - } -} - -static void -vlib_serialize_rx (serialize_main_header_t * m, serialize_stream_t * s) -{ - vlib_main_t *vm; - vlib_serialize_buffer_main_t *sm; - vlib_buffer_t *last; - - sm = - uword_to_pointer (s->data_function_opaque, - vlib_serialize_buffer_main_t *); - vm = sm->vlib_main; - - if (serialize_stream_is_end_of_stream (s)) - return; - - if (sm->last_buffer != ~0) - { - last = vlib_get_buffer (vm, sm->last_buffer); - - if (last->flags & VLIB_BUFFER_NEXT_PRESENT) - sm->last_buffer = last->next_buffer; - else - { - vlib_buffer_free (vm, &sm->first_buffer, /* count */ 1); - sm->first_buffer = sm->last_buffer = ~0; - } - } - - if (sm->last_buffer == ~0) - { - while (clib_fifo_elts (sm->rx.buffer_fifo) == 0) - { - sm->rx.ready_one_time_event = - vlib_process_create_one_time_event (vm, vlib_current_process (vm), - ~0); - vlib_process_wait_for_one_time_event (vm, /* no event data */ 0, - sm->rx.ready_one_time_event); - } - - clib_fifo_sub1 (sm->rx.buffer_fifo, sm->first_buffer); - sm->last_buffer = sm->first_buffer; - } - - ASSERT (sm->last_buffer != ~0); - - last = vlib_get_buffer (vm, sm->last_buffer); - s->current_buffer_index = 0; - s->buffer = vlib_buffer_get_current (last); - s->n_buffer_bytes = last->current_length; -} - -static void -serialize_open_vlib_helper (serialize_main_t * m, - vlib_main_t * vm, - vlib_serialize_buffer_main_t * sm, uword is_read) -{ - /* Initialize serialize main but save overflow buffer for re-use between calls. */ - { - u8 *save = m->stream.overflow_buffer; - memset (m, 0, sizeof (m[0])); - m->stream.overflow_buffer = save; - if (save) - _vec_len (save) = 0; - } - - sm->first_buffer = sm->last_buffer = ~0; - if (is_read) - clib_fifo_reset (sm->rx.buffer_fifo); - else - sm->tx.n_total_data_bytes = 0; - sm->vlib_main = vm; - m->header.data_function = is_read ? vlib_serialize_rx : vlib_serialize_tx; - m->stream.data_function_opaque = pointer_to_uword (sm); -} - -void -serialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, - vlib_serialize_buffer_main_t * sm) -{ - serialize_open_vlib_helper (m, vm, sm, /* is_read */ 0); -} - -void -unserialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, - vlib_serialize_buffer_main_t * sm) -{ - serialize_open_vlib_helper (m, vm, sm, /* is_read */ 1); -} - -u32 -serialize_close_vlib_buffer (serialize_main_t * m) -{ - vlib_serialize_buffer_main_t *sm - = uword_to_pointer (m->stream.data_function_opaque, - vlib_serialize_buffer_main_t *); - vlib_buffer_t *last; - serialize_stream_t *s = &m->stream; - - last = vlib_get_buffer (sm->vlib_main, sm->last_buffer); - last->current_length = s->current_buffer_index; - - if (vec_len (s->overflow_buffer) > 0) - { - sm->last_buffer - = vlib_buffer_add_data (sm->vlib_main, sm->tx.free_list_index, - sm->last_buffer == ~0 ? 0 : sm->last_buffer, - s->overflow_buffer, - vec_len (s->overflow_buffer)); - _vec_len (s->overflow_buffer) = 0; - } - - return sm->first_buffer; -} - -void -unserialize_close_vlib_buffer (serialize_main_t * m) -{ - vlib_serialize_buffer_main_t *sm - = uword_to_pointer (m->stream.data_function_opaque, - vlib_serialize_buffer_main_t *); - if (sm->first_buffer != ~0) - vlib_buffer_free_one (sm->vlib_main, sm->first_buffer); - clib_fifo_reset (sm->rx.buffer_fifo); - if (m->stream.overflow_buffer) - _vec_len (m->stream.overflow_buffer) = 0; -} static u8 * format_vlib_buffer_free_list (u8 * s, va_list * va) { vlib_buffer_free_list_t *f = va_arg (*va, vlib_buffer_free_list_t *); -#if DPDK > 0 u32 threadnum = va_arg (*va, u32); uword bytes_alloc, bytes_free, n_free, size; @@ -1877,21 +1275,6 @@ format_vlib_buffer_free_list (u8 * s, va_list * va) bytes_free = size * n_free; s = format (s, "%7d%30s%12d%12d%=12U%=12U%=12d%=12d", threadnum, -#else - uword bytes_alloc, bytes_free, n_free, size; - - if (!f) - return format (s, "%=30s%=12s%=12s%=12s%=12s%=12s%=12s", - "Name", "Index", "Size", "Alloc", "Free", "#Alloc", - "#Free"); - - size = sizeof (vlib_buffer_t) + f->n_data_bytes; - n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers); - bytes_alloc = size * f->n_alloc; - bytes_free = size * n_free; - - s = format (s, "%30s%12d%12d%=12U%=12U%=12d%=12d", -#endif f->name, f->index, f->n_data_bytes, format_memory_size, bytes_alloc, format_memory_size, bytes_free, f->n_alloc, n_free); @@ -1903,7 +1286,6 @@ static clib_error_t * show_buffers (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { -#if DPDK > 0 vlib_buffer_main_t *bm; vlib_buffer_free_list_t *f; vlib_main_t *curr_vm; @@ -1926,18 +1308,6 @@ show_buffers (vlib_main_t * vm, } while (vm_index < vec_len (vlib_mains)); -#else - vlib_buffer_main_t *bm = vm->buffer_main; - vlib_buffer_free_list_t *f; - - vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, 0); - /* *INDENT-OFF* */ - pool_foreach (f, bm->buffer_free_list_pool, ({ - vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, f); - })); -/* *INDENT-ON* */ - -#endif return 0; } @@ -1949,34 +1319,38 @@ VLIB_CLI_COMMAND (show_buffers_command, static) = { }; /* *INDENT-ON* */ -#if DPDK > 0 -#if CLIB_DEBUG > 0 - -u32 *vlib_buffer_state_validation_lock; -uword *vlib_buffer_state_validation_hash; -void *vlib_buffer_state_heap; - -static clib_error_t * -buffer_state_validation_init (vlib_main_t * vm) +void +vlib_buffer_cb_init (struct vlib_main_t *vm) { - void *oldheap; - - vlib_buffer_state_heap = mheap_alloc (0, 10 << 20); - - oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + vlib_buffer_main_t *bm = vm->buffer_main; + bm->cb.vlib_buffer_alloc_cb = &vlib_buffer_alloc_internal; + bm->cb.vlib_buffer_alloc_from_free_list_cb = + &vlib_buffer_alloc_from_free_list_internal; + bm->cb.vlib_buffer_free_cb = &vlib_buffer_free_internal; + bm->cb.vlib_buffer_free_no_next_cb = &vlib_buffer_free_no_next_internal; + bm->cb.vlib_buffer_delete_free_list_cb = + &vlib_buffer_delete_free_list_internal; + bm->extern_buffer_mgmt = 0; +} - vlib_buffer_state_validation_hash = hash_create (0, sizeof (uword)); - vec_validate_aligned (vlib_buffer_state_validation_lock, 0, - CLIB_CACHE_LINE_BYTES); - clib_mem_set_heap (oldheap); +int +vlib_buffer_cb_register (struct vlib_main_t *vm, vlib_buffer_callbacks_t * cb) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + if (bm->extern_buffer_mgmt) + return -1; + +#define _(x) bm->cb.x = cb->x + _(vlib_buffer_alloc_cb); + _(vlib_buffer_alloc_from_free_list_cb); + _(vlib_buffer_free_cb); + _(vlib_buffer_free_no_next_cb); + _(vlib_buffer_delete_free_list_cb); +#undef _ + bm->extern_buffer_mgmt = 1; return 0; } -VLIB_INIT_FUNCTION (buffer_state_validation_init); -#endif -#endif - - /** @endcond */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 5f1e62f0..d270c08a 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -46,15 +46,9 @@ #include #include /* for vlib_error_t */ -#if DPDK > 0 -#include -#define VLIB_BUFFER_DATA_SIZE (2048) -#define VLIB_BUFFER_PRE_DATA_SIZE RTE_PKTMBUF_HEADROOM -#else #include /* for __PRE_DATA_SIZE */ -#define VLIB_BUFFER_DATA_SIZE (512) +#define VLIB_BUFFER_DATA_SIZE (2048) #define VLIB_BUFFER_PRE_DATA_SIZE __PRE_DATA_SIZE -#endif #if defined (CLIB_HAVE_VEC128) || defined (__aarch64__) typedef u8x16 vlib_copy_unit_t; @@ -296,6 +290,27 @@ typedef struct vlib_buffer_free_list_t uword buffer_init_function_opaque; } __attribute__ ((aligned (16))) vlib_buffer_free_list_t; +typedef struct +{ + u32 (*vlib_buffer_alloc_cb) (struct vlib_main_t * vm, u32 * buffers, + u32 n_buffers); + u32 (*vlib_buffer_alloc_from_free_list_cb) (struct vlib_main_t * vm, + u32 * buffers, u32 n_buffers, + u32 free_list_index); + void (*vlib_buffer_free_cb) (struct vlib_main_t * vm, u32 * buffers, + u32 n_buffers); + void (*vlib_buffer_free_no_next_cb) (struct vlib_main_t * vm, u32 * buffers, + u32 n_buffers); + void (*vlib_packet_template_init_cb) (struct vlib_main_t * vm, void *t, + void *packet_data, + uword n_packet_data_bytes, + uword + min_n_buffers_each_physmem_alloc, + u8 * name); + void (*vlib_buffer_delete_free_list_cb) (struct vlib_main_t * vm, + u32 free_list_index); +} vlib_buffer_callbacks_t; + typedef struct { /* Buffer free callback, for subversive activities */ @@ -323,12 +338,15 @@ typedef struct /* List of free-lists needing Blue Light Special announcements */ vlib_buffer_free_list_t **announce_list; - /* Vector of rte_mempools per socket */ -#if DPDK == 1 - struct rte_mempool **pktmbuf_pools; -#endif + /* Callbacks */ + vlib_buffer_callbacks_t cb; + int extern_buffer_mgmt; } vlib_buffer_main_t; +void vlib_buffer_cb_init (struct vlib_main_t *vm); +int vlib_buffer_cb_register (struct vlib_main_t *vm, + vlib_buffer_callbacks_t * cb); + typedef struct { struct vlib_main_t *vlib_main; @@ -385,11 +403,6 @@ serialize_vlib_buffer_n_bytes (serialize_main_t * m) vec_len (s->overflow_buffer); } -#if DPDK > 0 -#define rte_mbuf_from_vlib_buffer(x) (((struct rte_mbuf *)x) - 1) -#define vlib_buffer_from_rte_mbuf(x) ((vlib_buffer_t *)(x+1)) -#endif - /* */ diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 75716eca..15d93c16 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -195,8 +195,6 @@ do { \ } while (0) #endif -#if DPDK == 0 - typedef enum { /* Index is unknown. */ @@ -232,8 +230,6 @@ vlib_buffer_set_known_state (vlib_main_t * vm, u8 *vlib_validate_buffer (vlib_main_t * vm, u32 buffer_index, uword follow_chain); -#endif /* DPDK == 0 */ - clib_error_t *vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, unsigned socket_id); @@ -245,7 +241,15 @@ clib_error_t *vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, @return - (u32) number of buffers actually allocated, may be less than the number requested or zero */ -u32 vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers); +always_inline u32 +vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + ASSERT (bm->cb.vlib_buffer_alloc_cb); + + return bm->cb.vlib_buffer_alloc_cb (vm, buffers, n_buffers); +} always_inline u32 vlib_buffer_round_size (u32 size) @@ -261,9 +265,18 @@ vlib_buffer_round_size (u32 size) @return - (u32) number of buffers actually allocated, may be less than the number requested or zero */ -u32 vlib_buffer_alloc_from_free_list (vlib_main_t * vm, - u32 * buffers, - u32 n_buffers, u32 free_list_index); +always_inline u32 +vlib_buffer_alloc_from_free_list (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + ASSERT (bm->cb.vlib_buffer_alloc_from_free_list_cb); + + return bm->cb.vlib_buffer_alloc_from_free_list_cb (vm, buffers, n_buffers, + free_list_index); +} /** \brief Free buffers Frees the entire buffer chain for each buffer @@ -273,11 +286,19 @@ u32 vlib_buffer_alloc_from_free_list (vlib_main_t * vm, @param n_buffers - (u32) number of buffers to free */ -void vlib_buffer_free (vlib_main_t * vm, - /* pointer to first buffer */ - u32 * buffers, - /* number of buffers to free */ - u32 n_buffers); +always_inline void +vlib_buffer_free (vlib_main_t * vm, + /* pointer to first buffer */ + u32 * buffers, + /* number of buffers to free */ + u32 n_buffers) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + ASSERT (bm->cb.vlib_buffer_free_cb); + + return bm->cb.vlib_buffer_free_cb (vm, buffers, n_buffers); +} /** \brief Free buffers, does not free the buffer chain for each buffer @@ -286,11 +307,19 @@ void vlib_buffer_free (vlib_main_t * vm, @param n_buffers - (u32) number of buffers to free */ -void vlib_buffer_free_no_next (vlib_main_t * vm, - /* pointer to first buffer */ - u32 * buffers, - /* number of buffers to free */ - u32 n_buffers); +always_inline void +vlib_buffer_free_no_next (vlib_main_t * vm, + /* pointer to first buffer */ + u32 * buffers, + /* number of buffers to free */ + u32 n_buffers) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + ASSERT (bm->cb.vlib_buffer_free_no_next_cb); + + return bm->cb.vlib_buffer_free_no_next_cb (vm, buffers, n_buffers); +} /** \brief Free one buffer Shorthand to free a single buffer chain. @@ -307,7 +336,15 @@ vlib_buffer_free_one (vlib_main_t * vm, u32 buffer_index) /* Add/delete buffer free lists. */ u32 vlib_buffer_create_free_list (vlib_main_t * vm, u32 n_data_bytes, char *fmt, ...); -void vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index); +always_inline void +vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + ASSERT (bm->cb.vlib_buffer_delete_free_list_cb); + + bm->cb.vlib_buffer_delete_free_list_cb (vm, free_list_index); +} /* Find already existing public free list with given size or create one. */ u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes, @@ -453,11 +490,6 @@ vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b) return fd; } -/* - * vlib_buffer_chain_* functions provide a way to create long buffers. - * When DPDK is enabled, the 'hidden' DPDK header is taken care of transparently. - */ - /* Initializes the buffer as an empty packet with no chained buffers. */ always_inline void vlib_buffer_chain_init (vlib_buffer_t * first) @@ -537,8 +569,6 @@ typedef struct /* Vector of packet data. */ u8 *packet_data; - /* Note: the next three fields are unused if DPDK == 1 */ - /* Number of buffers to allocate in each call to physmem allocator. */ u32 min_n_buffers_each_physmem_alloc; diff --git a/src/vlib/buffer_serialize.c b/src/vlib/buffer_serialize.c new file mode 100644 index 00000000..96a5f0a0 --- /dev/null +++ b/src/vlib/buffer_serialize.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer.c: allocate/free network buffers. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +static void +vlib_serialize_tx (serialize_main_header_t * m, serialize_stream_t * s) +{ + vlib_main_t *vm; + vlib_serialize_buffer_main_t *sm; + uword n, n_bytes_to_write; + vlib_buffer_t *last; + + n_bytes_to_write = s->current_buffer_index; + sm = + uword_to_pointer (s->data_function_opaque, + vlib_serialize_buffer_main_t *); + vm = sm->vlib_main; + + ASSERT (sm->tx.max_n_data_bytes_per_chain > 0); + if (serialize_stream_is_end_of_stream (s) + || sm->tx.n_total_data_bytes + n_bytes_to_write > + sm->tx.max_n_data_bytes_per_chain) + { + vlib_process_t *p = vlib_get_current_process (vm); + + last = vlib_get_buffer (vm, sm->last_buffer); + last->current_length = n_bytes_to_write; + + vlib_set_next_frame_buffer (vm, &p->node_runtime, sm->tx.next_index, + sm->first_buffer); + + sm->first_buffer = sm->last_buffer = ~0; + sm->tx.n_total_data_bytes = 0; + } + + else if (n_bytes_to_write == 0 && s->n_buffer_bytes == 0) + { + ASSERT (sm->first_buffer == ~0); + ASSERT (sm->last_buffer == ~0); + n = + vlib_buffer_alloc_from_free_list (vm, &sm->first_buffer, 1, + sm->tx.free_list_index); + if (n != 1) + serialize_error (m, + clib_error_create + ("vlib_buffer_alloc_from_free_list fails")); + sm->last_buffer = sm->first_buffer; + s->n_buffer_bytes = + vlib_buffer_free_list_buffer_size (vm, sm->tx.free_list_index); + } + + if (n_bytes_to_write > 0) + { + vlib_buffer_t *prev = vlib_get_buffer (vm, sm->last_buffer); + n = + vlib_buffer_alloc_from_free_list (vm, &sm->last_buffer, 1, + sm->tx.free_list_index); + if (n != 1) + serialize_error (m, + clib_error_create + ("vlib_buffer_alloc_from_free_list fails")); + sm->tx.n_total_data_bytes += n_bytes_to_write; + prev->current_length = n_bytes_to_write; + prev->next_buffer = sm->last_buffer; + prev->flags |= VLIB_BUFFER_NEXT_PRESENT; + } + + if (sm->last_buffer != ~0) + { + last = vlib_get_buffer (vm, sm->last_buffer); + s->buffer = vlib_buffer_get_current (last); + s->current_buffer_index = 0; + ASSERT (last->current_data == s->current_buffer_index); + } +} + +static void +vlib_serialize_rx (serialize_main_header_t * m, serialize_stream_t * s) +{ + vlib_main_t *vm; + vlib_serialize_buffer_main_t *sm; + vlib_buffer_t *last; + + sm = + uword_to_pointer (s->data_function_opaque, + vlib_serialize_buffer_main_t *); + vm = sm->vlib_main; + + if (serialize_stream_is_end_of_stream (s)) + return; + + if (sm->last_buffer != ~0) + { + last = vlib_get_buffer (vm, sm->last_buffer); + + if (last->flags & VLIB_BUFFER_NEXT_PRESENT) + sm->last_buffer = last->next_buffer; + else + { + vlib_buffer_free (vm, &sm->first_buffer, /* count */ 1); + sm->first_buffer = sm->last_buffer = ~0; + } + } + + if (sm->last_buffer == ~0) + { + while (clib_fifo_elts (sm->rx.buffer_fifo) == 0) + { + sm->rx.ready_one_time_event = + vlib_process_create_one_time_event (vm, vlib_current_process (vm), + ~0); + vlib_process_wait_for_one_time_event (vm, /* no event data */ 0, + sm->rx.ready_one_time_event); + } + + clib_fifo_sub1 (sm->rx.buffer_fifo, sm->first_buffer); + sm->last_buffer = sm->first_buffer; + } + + ASSERT (sm->last_buffer != ~0); + + last = vlib_get_buffer (vm, sm->last_buffer); + s->current_buffer_index = 0; + s->buffer = vlib_buffer_get_current (last); + s->n_buffer_bytes = last->current_length; +} + +static void +serialize_open_vlib_helper (serialize_main_t * m, + vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm, uword is_read) +{ + /* Initialize serialize main but save overflow buffer for re-use between calls. */ + { + u8 *save = m->stream.overflow_buffer; + memset (m, 0, sizeof (m[0])); + m->stream.overflow_buffer = save; + if (save) + _vec_len (save) = 0; + } + + sm->first_buffer = sm->last_buffer = ~0; + if (is_read) + clib_fifo_reset (sm->rx.buffer_fifo); + else + sm->tx.n_total_data_bytes = 0; + sm->vlib_main = vm; + m->header.data_function = is_read ? vlib_serialize_rx : vlib_serialize_tx; + m->stream.data_function_opaque = pointer_to_uword (sm); +} + +void +serialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm) +{ + serialize_open_vlib_helper (m, vm, sm, /* is_read */ 0); +} + +void +unserialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm) +{ + serialize_open_vlib_helper (m, vm, sm, /* is_read */ 1); +} + +u32 +serialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + vlib_buffer_t *last; + serialize_stream_t *s = &m->stream; + + last = vlib_get_buffer (sm->vlib_main, sm->last_buffer); + last->current_length = s->current_buffer_index; + + if (vec_len (s->overflow_buffer) > 0) + { + sm->last_buffer + = vlib_buffer_add_data (sm->vlib_main, sm->tx.free_list_index, + sm->last_buffer == ~0 ? 0 : sm->last_buffer, + s->overflow_buffer, + vec_len (s->overflow_buffer)); + _vec_len (s->overflow_buffer) = 0; + } + + return sm->first_buffer; +} + +void +unserialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + if (sm->first_buffer != ~0) + vlib_buffer_free_one (sm->vlib_main, sm->first_buffer); + clib_fifo_reset (sm->rx.buffer_fifo); + if (m->stream.overflow_buffer) + _vec_len (m->stream.overflow_buffer) = 0; +} + +/** @endcond */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/main.c b/src/vlib/main.c index 6c6cad98..09f34bbd 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -465,7 +465,7 @@ vlib_put_next_frame (vlib_main_t * vm, vlib_frame_t *f; u32 n_vectors_in_frame; - if (DPDK == 0 && CLIB_DEBUG > 0) + if (vm->buffer_main->extern_buffer_mgmt == 0 && CLIB_DEBUG > 0) vlib_put_next_frame_validate (vm, r, next_index, n_vectors_left); nf = vlib_node_runtime_get_next_frame (vm, r, next_index); @@ -1012,8 +1012,8 @@ dispatch_node (vlib_main_t * vm, /* When in interrupt mode and vector rate crosses threshold switch to polling mode. */ - if ((DPDK == 0 && dispatch_state == VLIB_NODE_STATE_INTERRUPT) - || (DPDK == 0 && dispatch_state == VLIB_NODE_STATE_POLLING + if ((dispatch_state == VLIB_NODE_STATE_INTERRUPT) + || (dispatch_state == VLIB_NODE_STATE_POLLING && (node->flags & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE))) { @@ -1615,6 +1615,7 @@ vlib_main (vlib_main_t * volatile vm, unformat_input_t * input) vm->name = "VLIB"; vec_validate (vm->buffer_main, 0); + vlib_buffer_cb_init (vm); if ((error = vlib_thread_init (vm))) { diff --git a/src/vlib/threads.c b/src/vlib/threads.c index c5e58bc0..b3bbd30e 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -22,29 +22,10 @@ #include #include - -#if DPDK==1 -#include -#include -#include -#include -#include -#endif DECLARE_CJ_GLOBAL_LOG; #define FRAME_QUEUE_NELTS 32 - -#if DPDK==1 -/* - * Weak definitions of DPDK symbols used in this file. - * Needed for linking test programs without DPDK libs. - */ -unsigned __thread __attribute__ ((weak)) RTE_PER_LCORE (_lcore_id); -struct lcore_config __attribute__ ((weak)) lcore_config[]; -unsigned __attribute__ ((weak)) rte_socket_id (); -int __attribute__ ((weak)) rte_eal_remote_launch (); -#endif u32 vl (void *p) { @@ -194,14 +175,17 @@ vlib_thread_init (vlib_main_t * vm) tm->cpu_socket_bitmap = clib_bitmap_set (0, 0, 1); /* pin main thread to main_lcore */ -#if DPDK==0 - { - cpu_set_t cpuset; - CPU_ZERO (&cpuset); - CPU_SET (tm->main_lcore, &cpuset); - pthread_setaffinity_np (pthread_self (), sizeof (cpu_set_t), &cpuset); - } -#endif + if (tm->cb.vlib_thread_set_lcore_cb) + { + tm->cb.vlib_thread_set_lcore_cb (0, tm->main_lcore); + } + else + { + cpu_set_t cpuset; + CPU_ZERO (&cpuset); + CPU_SET (tm->main_lcore, &cpuset); + pthread_setaffinity_np (pthread_self (), sizeof (cpu_set_t), &cpuset); + } /* as many threads as stacks... */ vec_validate_aligned (vlib_worker_threads, vec_len (vlib_thread_stacks) - 1, @@ -520,32 +504,29 @@ vlib_worker_thread_bootstrap_fn (void *arg) return rv; } -static int -vlib_launch_thread (void *fp, vlib_worker_thread_t * w, unsigned lcore_id) +static clib_error_t * +vlib_launch_thread_int (void *fp, vlib_worker_thread_t * w, unsigned lcore_id) { + vlib_thread_main_t *tm = &vlib_thread_main; void *(*fp_arg) (void *) = fp; w->lcore_id = lcore_id; -#if DPDK==1 - if (!w->registration->use_pthreads) - if (rte_eal_remote_launch) /* do we have dpdk linked */ - return rte_eal_remote_launch (fp, (void *) w, lcore_id); - else - return -1; + if (tm->cb.vlib_launch_thread_cb && !w->registration->use_pthreads) + return tm->cb.vlib_launch_thread_cb (fp, (void *) w, lcore_id); else -#endif { - int ret; pthread_t worker; cpu_set_t cpuset; CPU_ZERO (&cpuset); CPU_SET (lcore_id, &cpuset); - ret = pthread_create (&worker, NULL /* attr */ , fp_arg, (void *) w); - if (ret == 0) - return pthread_setaffinity_np (worker, sizeof (cpu_set_t), &cpuset); - else - return ret; + if (pthread_create (&worker, NULL /* attr */ , fp_arg, (void *) w)) + return clib_error_return_unix (0, "pthread_create"); + + if (pthread_setaffinity_np (worker, sizeof (cpu_set_t), &cpuset)) + return clib_error_return_unix (0, "pthread_setaffinity_np"); + + return 0; } } @@ -769,6 +750,7 @@ start_workers (vlib_main_t * vm) for (i = 0; i < vec_len (tm->registrations); i++) { + clib_error_t *err; int j; tr = tm->registrations[i]; @@ -778,22 +760,24 @@ start_workers (vlib_main_t * vm) for (j = 0; j < tr->count; j++) { w = vlib_worker_threads + worker_thread_index++; - if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, 0) < - 0) - clib_warning ("Couldn't start '%s' pthread ", tr->name); + err = vlib_launch_thread_int (vlib_worker_thread_bootstrap_fn, + w, 0); + if (err) + clib_error_report (err); } } else { uword c; - /* *INDENT-OFF* */ - clib_bitmap_foreach (c, tr->coremask, ({ - w = vlib_worker_threads + worker_thread_index++; - if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, c) < 0) - clib_warning ("Couldn't start DPDK lcore %d", c); - - })); -/* *INDENT-ON* */ + /* *INDENT-OFF* */ + clib_bitmap_foreach (c, tr->coremask, ({ + w = vlib_worker_threads + worker_thread_index++; + err = vlib_launch_thread_int (vlib_worker_thread_bootstrap_fn, + w, c); + if (err) + clib_error_report (err); + })); + /* *INDENT-ON* */ } } vlib_worker_thread_barrier_sync (vm); @@ -1105,7 +1089,7 @@ cpu_config (vlib_main_t * vm, unformat_input_t * input) { tm->n_thread_stacks += tr->count; tm->n_pthreads += tr->count * tr->use_pthreads; - tm->n_eal_threads += tr->count * (tr->use_pthreads == 0); + tm->n_threads += tr->count * (tr->use_pthreads == 0); tr = tr->next; } @@ -1423,6 +1407,7 @@ void vlib_worker_thread_fn (void *arg) { vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg; + vlib_thread_main_t *tm = vlib_get_thread_main (); vlib_main_t *vm = vlib_get_main (); ASSERT (vm->cpu_index == os_get_cpu_number ()); @@ -1431,12 +1416,9 @@ vlib_worker_thread_fn (void *arg) clib_time_init (&vm->clib_time); clib_mem_set_heap (w->thread_mheap); -#if DPDK > 0 /* Wait until the dpdk init sequence is complete */ - vlib_thread_main_t *tm = vlib_get_thread_main (); - while (tm->worker_thread_release == 0) + while (tm->extern_thread_mgmt && tm->worker_thread_release == 0) vlib_worker_thread_barrier_check (); -#endif vlib_worker_thread_internal (vm); } @@ -1475,6 +1457,20 @@ vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts) return (fqm - tm->frame_queue_mains); } + +int +vlib_thread_cb_register (struct vlib_main_t *vm, vlib_thread_callbacks_t * cb) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + + if (tm->extern_thread_mgmt) + return -1; + + tm->cb.vlib_launch_thread_cb = cb->vlib_launch_thread_cb; + tm->extern_thread_mgmt = 1; + return 0; +} + clib_error_t * threads_init (vlib_main_t * vm) { diff --git a/src/vlib/threads.h b/src/vlib/threads.h index 34ab5be8..75a5a281 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -263,6 +263,13 @@ typedef enum SCHED_POLICY_N, } sched_policy_t; +typedef struct +{ + clib_error_t *(*vlib_launch_thread_cb) (void *fp, vlib_worker_thread_t * w, + unsigned lcore_id); + clib_error_t *(*vlib_thread_set_lcore_cb) (u32 thread, u16 lcore); +} vlib_thread_callbacks_t; + typedef struct { /* Link list of registrations, built by constructors */ @@ -290,8 +297,8 @@ typedef struct /* Number of pthreads */ u32 n_pthreads; - /* Number of DPDK eal threads */ - u32 n_eal_threads; + /* Number of threads */ + u32 n_threads; /* Number of cores to skip, must match the core mask */ u32 skip_cores; @@ -320,6 +327,9 @@ typedef struct /* scheduling policy priority */ u32 sched_priority; + /* callbacks */ + vlib_thread_callbacks_t cb; + int extern_thread_mgmt; } vlib_thread_main_t; extern vlib_thread_main_t vlib_thread_main; @@ -459,6 +469,9 @@ vlib_get_worker_handoff_queue_elt (u32 frame_queue_index, return elt; } +int vlib_thread_cb_register (struct vlib_main_t *vm, + vlib_thread_callbacks_t * cb); + #endif /* included_vlib_threads_h */ /* diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c index ee632279..b64028c4 100644 --- a/src/vlib/threads_cli.c +++ b/src/vlib/threads_cli.c @@ -20,14 +20,6 @@ #include #include -#if DPDK==1 -#include -#include -#include -#include -#include -#endif - static u8 * format_sched_policy_and_priority (u8 * s, va_list * args) { @@ -116,23 +108,6 @@ show_threads_fn (vlib_main_t * vm, vec_free (p); line = format (line, "%-7u%-7u%-7u%", lcore, core_id, socket_id); -#if DPDK==1 - ASSERT (lcore <= RTE_MAX_LCORE); - switch (lcore_config[lcore].state) - { - case WAIT: - line = format (line, "wait"); - break; - case RUNNING: - line = format (line, "running"); - break; - case FINISHED: - line = format (line, "finished"); - break; - default: - line = format (line, "unknown"); - } -#endif } else { diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c index 80ab7b9d..8d10ad2e 100644 --- a/src/vlib/unix/physmem.c +++ b/src/vlib/unix/physmem.c @@ -45,13 +45,13 @@ static void * unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, uword alignment) { + vlib_main_t *vm = vlib_get_main (); physmem_main_t *pm = &physmem_main; uword lo_offset, hi_offset; uword *to_free = 0; -#if DPDK > 0 - clib_warning ("unsafe alloc!"); -#endif + if (vm->buffer_main->extern_buffer_mgmt) + clib_warning ("unsafe alloc!"); /* IO memory is always at least cache aligned. */ alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); @@ -269,16 +269,17 @@ static clib_error_t * show_physmem (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { -#if DPDK > 0 - vlib_cli_output (vm, "Not supported with DPDK drivers."); -#else physmem_main_t *pm = &physmem_main; + if (vm->buffer_main->extern_buffer_mgmt) + { + vlib_cli_output (vm, "Not supported with external buffer management."); + return 0; + } if (pm->heap) vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1); else vlib_cli_output (vm, "No physmem allocated."); -#endif return 0; } diff --git a/src/vnet.am b/src/vnet.am index 665a16ea..47c5eda7 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -761,11 +761,13 @@ nobase_include_HEADERS += \ ######################################## if WITH_DPDK libvnet_la_SOURCES += \ + vnet/devices/dpdk/buffer.c \ vnet/devices/dpdk/dpdk_priv.h \ vnet/devices/dpdk/device.c \ vnet/devices/dpdk/format.c \ vnet/devices/dpdk/init.c \ vnet/devices/dpdk/node.c \ + vnet/devices/dpdk/thread.c \ vnet/devices/dpdk/hqos.c \ vnet/devices/dpdk/cli.c \ vnet/devices/dpdk/dpdk_api.c diff --git a/src/vnet/devices/dpdk/buffer.c b/src/vnet/devices/dpdk/buffer.c new file mode 100644 index 00000000..214a9162 --- /dev/null +++ b/src/vnet/devices/dpdk/buffer.c @@ -0,0 +1,729 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer.c: allocate/free network buffers. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @file + * + * Allocate/free network buffers. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM, + "VLIB_BUFFER_PRE_DATA_SIZE must be equal to RTE_PKTMBUF_HEADROOM"); + +#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) + +/* Make sure we have at least given number of unaligned buffers. */ +static void +fill_unaligned (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + uword n_unaligned_buffers) +{ + word la = vec_len (free_list->aligned_buffers); + word lu = vec_len (free_list->unaligned_buffers); + + /* Aligned come in aligned copy-sized chunks. */ + ASSERT (la % BUFFERS_PER_COPY == 0); + + ASSERT (la >= n_unaligned_buffers); + + while (lu < n_unaligned_buffers) + { + /* Copy 4 buffers from end of aligned vector to unaligned vector. */ + vec_add (free_list->unaligned_buffers, + free_list->aligned_buffers + la - BUFFERS_PER_COPY, + BUFFERS_PER_COPY); + la -= BUFFERS_PER_COPY; + lu += BUFFERS_PER_COPY; + } + _vec_len (free_list->aligned_buffers) = la; +} + +/* After free aligned buffers may not contain even sized chunks. */ +static void +trim_aligned (vlib_buffer_free_list_t * f) +{ + uword l, n_trim; + + /* Add unaligned to aligned before trim. */ + l = vec_len (f->unaligned_buffers); + if (l > 0) + { + vec_add_aligned (f->aligned_buffers, f->unaligned_buffers, l, + /* align */ sizeof (vlib_copy_unit_t)); + + _vec_len (f->unaligned_buffers) = 0; + } + + /* Remove unaligned buffers from end of aligned vector and save for next trim. */ + l = vec_len (f->aligned_buffers); + n_trim = l % BUFFERS_PER_COPY; + if (n_trim) + { + /* Trim aligned -> unaligned. */ + vec_add (f->unaligned_buffers, f->aligned_buffers + l - n_trim, n_trim); + + /* Remove from aligned. */ + _vec_len (f->aligned_buffers) = l - n_trim; + } +} + +static void +merge_free_lists (vlib_buffer_free_list_t * dst, + vlib_buffer_free_list_t * src) +{ + uword l; + u32 *d; + + trim_aligned (src); + trim_aligned (dst); + + l = vec_len (src->aligned_buffers); + if (l > 0) + { + vec_add2_aligned (dst->aligned_buffers, d, l, + /* align */ sizeof (vlib_copy_unit_t)); + clib_memcpy (d, src->aligned_buffers, l * sizeof (d[0])); + vec_free (src->aligned_buffers); + } + + l = vec_len (src->unaligned_buffers); + if (l > 0) + { + vec_add (dst->unaligned_buffers, src->unaligned_buffers, l); + vec_free (src->unaligned_buffers); + } +} + +always_inline u32 +dpdk_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + size = vlib_buffer_round_size (size); + uword *p = hash_get (bm->free_list_by_size, size); + return p ? p[0] : ~0; +} + +static void +del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) +{ + u32 i; + struct rte_mbuf *mb; + vlib_buffer_t *b; + + for (i = 0; i < vec_len (f->unaligned_buffers); i++) + { + b = vlib_get_buffer (vm, f->unaligned_buffers[i]); + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + for (i = 0; i < vec_len (f->aligned_buffers); i++) + { + b = vlib_get_buffer (vm, f->aligned_buffers[i]); + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + vec_free (f->name); + vec_free (f->unaligned_buffers); + vec_free (f->aligned_buffers); +} + +/* Add buffer free list. */ +static void +dpdk_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + u32 merge_index; + int i; + + ASSERT (os_get_cpu_number () == 0); + + f = vlib_buffer_get_free_list (vm, free_list_index); + + merge_index = dpdk_buffer_get_free_list_with_size (vm, f->n_data_bytes); + if (merge_index != ~0 && merge_index != free_list_index) + { + merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, + merge_index), f); + } + + del_free_list (vm, f); + + /* Poison it. */ + memset (f, 0xab, sizeof (f[0])); + + pool_put (bm->buffer_free_list_pool, f); + + for (i = 1; i < vec_len (vlib_mains); i++) + { + bm = vlib_mains[i]->buffer_main; + f = vlib_buffer_get_free_list (vlib_mains[i], free_list_index);; + memset (f, 0xab, sizeof (f[0])); + pool_put (bm->buffer_free_list_pool, f); + } +} + +/* Make sure free list has at least given number of free buffers. */ +static uword +fill_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * fl, uword min_free_buffers) +{ + dpdk_main_t *dm = &dpdk_main; + vlib_buffer_t *b; + int n, i; + u32 bi; + u32 n_remaining = 0, n_alloc = 0; + unsigned socket_id = rte_socket_id (); + struct rte_mempool *rmp = dm->pktmbuf_pools[socket_id]; + struct rte_mbuf *mb; + + /* Too early? */ + if (PREDICT_FALSE (rmp == 0)) + return 0; + + trim_aligned (fl); + + /* Already have enough free buffers on free list? */ + n = min_free_buffers - vec_len (fl->aligned_buffers); + if (n <= 0) + return min_free_buffers; + + /* Always allocate round number of buffers. */ + n = round_pow2 (n, BUFFERS_PER_COPY); + + /* Always allocate new buffers in reasonably large sized chunks. */ + n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); + + vec_validate (vm->mbuf_alloc_list, n - 1); + + if (rte_mempool_get_bulk (rmp, vm->mbuf_alloc_list, n) < 0) + return 0; + + _vec_len (vm->mbuf_alloc_list) = n; + + for (i = 0; i < n; i++) + { + mb = vm->mbuf_alloc_list[i]; + + ASSERT (rte_mbuf_refcnt_read (mb) == 0); + rte_mbuf_refcnt_set (mb, 1); + + b = vlib_buffer_from_rte_mbuf (mb); + bi = vlib_get_buffer_index (vm, b); + + vec_add1_aligned (fl->aligned_buffers, bi, sizeof (vlib_copy_unit_t)); + n_alloc++; + n_remaining--; + + vlib_buffer_init_for_free_list (b, fl); + + if (fl->buffer_init_function) + fl->buffer_init_function (vm, fl, &bi, 1); + } + + fl->n_alloc += n; + + return n; +} + +always_inline uword +copy_alignment (u32 * x) +{ + return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY; +} + +static u32 +alloc_from_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + u32 * alloc_buffers, u32 n_alloc_buffers) +{ + u32 *dst, *u_src; + uword u_len, n_left; + uword n_unaligned_start, n_unaligned_end, n_filled; + + n_left = n_alloc_buffers; + dst = alloc_buffers; + n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst)) + & (BUFFERS_PER_COPY - 1)); + + n_filled = fill_free_list (vm, free_list, n_alloc_buffers); + if (n_filled == 0) + return 0; + + n_left = n_filled < n_left ? n_filled : n_left; + n_alloc_buffers = n_left; + + if (n_unaligned_start >= n_left) + { + n_unaligned_start = n_left; + n_unaligned_end = 0; + } + else + n_unaligned_end = copy_alignment (dst + n_alloc_buffers); + + fill_unaligned (vm, free_list, n_unaligned_start + n_unaligned_end); + + u_len = vec_len (free_list->unaligned_buffers); + u_src = free_list->unaligned_buffers + u_len - 1; + + if (n_unaligned_start) + { + uword n_copy = n_unaligned_start; + if (n_copy > n_left) + n_copy = n_left; + n_left -= n_copy; + + while (n_copy > 0) + { + *dst++ = *u_src--; + n_copy--; + u_len--; + } + + /* Now dst should be aligned. */ + if (n_left > 0) + ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0); + } + + /* Aligned copy. */ + { + vlib_copy_unit_t *d, *s; + uword n_copy; + + if (vec_len (free_list->aligned_buffers) < + ((n_left / BUFFERS_PER_COPY) * BUFFERS_PER_COPY)) + abort (); + + n_copy = n_left / BUFFERS_PER_COPY; + n_left = n_left % BUFFERS_PER_COPY; + + /* Remove buffers from aligned free list. */ + _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY; + + s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers); + d = (vlib_copy_unit_t *) dst; + + /* Fast path loop. */ + while (n_copy >= 4) + { + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + n_copy -= 4; + s += 4; + d += 4; + } + + while (n_copy >= 1) + { + d[0] = s[0]; + n_copy -= 1; + s += 1; + d += 1; + } + + dst = (void *) d; + } + + /* Unaligned copy. */ + ASSERT (n_unaligned_end == n_left); + while (n_left > 0) + { + *dst++ = *u_src--; + n_left--; + u_len--; + } + + if (!free_list->unaligned_buffers) + ASSERT (u_len == 0); + else + _vec_len (free_list->unaligned_buffers) = u_len; + + return n_alloc_buffers; +} + +/* Allocate a given number of buffers into given array. + Returns number actually allocated which will be either zero or + number requested. */ +u32 +dpdk_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + return alloc_from_free_list + (vm, + pool_elt_at_index (bm->buffer_free_list_pool, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX), + buffers, n_buffers); +} + + +u32 +dpdk_buffer_alloc_from_free_list (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + f = pool_elt_at_index (bm->buffer_free_list_pool, free_list_index); + return alloc_from_free_list (vm, f, buffers, n_buffers); +} + +always_inline void +add_buffer_to_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * f, + u32 buffer_index, u8 do_init) +{ + vlib_buffer_t *b; + b = vlib_get_buffer (vm, buffer_index); + if (PREDICT_TRUE (do_init)) + vlib_buffer_init_for_free_list (b, f); + vec_add1_aligned (f->aligned_buffers, buffer_index, + sizeof (vlib_copy_unit_t)); +} + +always_inline vlib_buffer_free_list_t * +buffer_get_free_list (vlib_main_t * vm, vlib_buffer_t * b, u32 * index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + u32 i; + + *index = i = b->free_list_index; + return pool_elt_at_index (bm->buffer_free_list_pool, i); +} + +static_always_inline void +vlib_buffer_free_inline (vlib_main_t * vm, + u32 * buffers, u32 n_buffers, u32 follow_buffer_next) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *fl; + u32 fi; + int i; + u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers, + u32 follow_buffer_next); + + cb = bm->buffer_free_callback; + + if (PREDICT_FALSE (cb != 0)) + n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next); + + if (!n_buffers) + return; + + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_t *b; + struct rte_mbuf *mb; + + b = vlib_get_buffer (vm, buffers[i]); + + fl = buffer_get_free_list (vm, b, &fi); + + /* The only current use of this callback: multicast recycle */ + if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0)) + { + int j; + + add_buffer_to_free_list + (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0); + + for (j = 0; j < vec_len (bm->announce_list); j++) + { + if (fl == bm->announce_list[j]) + goto already_announced; + } + vec_add1 (bm->announce_list, fl); + already_announced: + ; + } + else + { + if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0)) + { + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + } + } + if (vec_len (bm->announce_list)) + { + vlib_buffer_free_list_t *fl; + for (i = 0; i < vec_len (bm->announce_list); i++) + { + fl = bm->announce_list[i]; + fl->buffers_added_to_freelist_function (vm, fl); + } + _vec_len (bm->announce_list) = 0; + } +} + +static void +dpdk_buffer_free (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ + 1); +} + +static void +dpdk_buffer_free_no_next (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ + 0); +} + +static void +dpdk_packet_template_init (vlib_main_t * vm, + void *vt, + void *packet_data, + uword n_packet_data_bytes, + uword min_n_buffers_each_physmem_alloc, u8 * name) +{ + vlib_packet_template_t *t = (vlib_packet_template_t *) vt; + + vlib_worker_thread_barrier_sync (vm); + memset (t, 0, sizeof (t[0])); + + vec_add (t->packet_data, packet_data, n_packet_data_bytes); + + vlib_worker_thread_barrier_release (vm); +} + +clib_error_t * +vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, + unsigned socket_id) +{ + dpdk_main_t *dm = &dpdk_main; + vlib_physmem_main_t *vpm = &vm->physmem_main; + struct rte_mempool *rmp; + int i; + + vec_validate_aligned (dm->pktmbuf_pools, socket_id, CLIB_CACHE_LINE_BYTES); + + /* pool already exists, nothing to do */ + if (dm->pktmbuf_pools[socket_id]) + return 0; + + u8 *pool_name = format (0, "mbuf_pool_socket%u%c", socket_id, 0); + + rmp = rte_pktmbuf_pool_create ((char *) pool_name, /* pool name */ + num_mbufs, /* number of mbufs */ + 512, /* cache size */ + VLIB_BUFFER_HDR_SIZE, /* priv size */ + VLIB_BUFFER_PRE_DATA_SIZE + VLIB_BUFFER_DATA_SIZE, /* dataroom size */ + socket_id); /* cpu socket */ + + if (rmp) + { + { + uword this_pool_end; + uword this_pool_start; + uword this_pool_size; + uword save_vpm_start, save_vpm_end, save_vpm_size; + struct rte_mempool_memhdr *memhdr; + + this_pool_start = ~0ULL; + this_pool_end = 0LL; + + STAILQ_FOREACH (memhdr, &rmp->mem_list, next) + { + if (((uword) (memhdr->addr + memhdr->len)) > this_pool_end) + this_pool_end = (uword) (memhdr->addr + memhdr->len); + if (((uword) memhdr->addr) < this_pool_start) + this_pool_start = (uword) (memhdr->addr); + } + ASSERT (this_pool_start < ~0ULL && this_pool_end > 0); + this_pool_size = this_pool_end - this_pool_start; + + if (CLIB_DEBUG > 1) + { + clib_warning ("%s: pool start %llx pool end %llx pool size %lld", + pool_name, this_pool_start, this_pool_end, + this_pool_size); + clib_warning + ("before: virtual.start %llx virtual.end %llx virtual.size %lld", + vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); + } + + save_vpm_start = vpm->virtual.start; + save_vpm_end = vpm->virtual.end; + save_vpm_size = vpm->virtual.size; + + if ((this_pool_start < vpm->virtual.start) || vpm->virtual.start == 0) + vpm->virtual.start = this_pool_start; + if (this_pool_end > vpm->virtual.end) + vpm->virtual.end = this_pool_end; + + vpm->virtual.size = vpm->virtual.end - vpm->virtual.start; + + if (CLIB_DEBUG > 1) + { + clib_warning + ("after: virtual.start %llx virtual.end %llx virtual.size %lld", + vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); + } + + /* check if fits into buffer index range */ + if ((u64) vpm->virtual.size > + ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES))) + { + clib_warning ("physmem: virtual size out of range!"); + vpm->virtual.start = save_vpm_start; + vpm->virtual.end = save_vpm_end; + vpm->virtual.size = save_vpm_size; + rmp = 0; + } + } + if (rmp) + { + dm->pktmbuf_pools[socket_id] = rmp; + vec_free (pool_name); + return 0; + } + } + + vec_free (pool_name); + + /* no usable pool for this socket, try to use pool from another one */ + for (i = 0; i < vec_len (dm->pktmbuf_pools); i++) + { + if (dm->pktmbuf_pools[i]) + { + clib_warning + ("WARNING: Failed to allocate mempool for CPU socket %u. " + "Threads running on socket %u will use socket %u mempool.", + socket_id, socket_id, i); + dm->pktmbuf_pools[socket_id] = dm->pktmbuf_pools[i]; + return 0; + } + } + + return clib_error_return (0, "failed to allocate mempool on socket %u", + socket_id); +} + +#if CLIB_DEBUG > 0 + +u32 *vlib_buffer_state_validation_lock; +uword *vlib_buffer_state_validation_hash; +void *vlib_buffer_state_heap; + +static clib_error_t * +buffer_state_validation_init (vlib_main_t * vm) +{ + void *oldheap; + + vlib_buffer_state_heap = mheap_alloc (0, 10 << 20); + + oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + + vlib_buffer_state_validation_hash = hash_create (0, sizeof (uword)); + vec_validate_aligned (vlib_buffer_state_validation_lock, 0, + CLIB_CACHE_LINE_BYTES); + clib_mem_set_heap (oldheap); + return 0; +} + +VLIB_INIT_FUNCTION (buffer_state_validation_init); +#endif + +static vlib_buffer_callbacks_t callbacks = { + .vlib_buffer_alloc_cb = &dpdk_buffer_alloc, + .vlib_buffer_alloc_from_free_list_cb = &dpdk_buffer_alloc_from_free_list, + .vlib_buffer_free_cb = &dpdk_buffer_free, + .vlib_buffer_free_no_next_cb = &dpdk_buffer_free_no_next, + .vlib_packet_template_init_cb = &dpdk_packet_template_init, + .vlib_buffer_delete_free_list_cb = &dpdk_buffer_delete_free_list, +}; + +static clib_error_t * +dpdk_buffer_init (vlib_main_t * vm) +{ + vlib_buffer_cb_register (vm, &callbacks); + return 0; +} + +VLIB_INIT_FUNCTION (dpdk_buffer_init); + +/** @endcond */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/devices/dpdk/cli.c b/src/vnet/devices/dpdk/cli.c index 538a00fd..22bd4b4f 100644 --- a/src/vnet/devices/dpdk/cli.c +++ b/src/vnet/devices/dpdk/cli.c @@ -164,9 +164,9 @@ show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input, struct rte_mempool *rmp; int i; - for (i = 0; i < vec_len (vm->buffer_main->pktmbuf_pools); i++) + for (i = 0; i < vec_len (dpdk_main.pktmbuf_pools); i++) { - rmp = vm->buffer_main->pktmbuf_pools[i]; + rmp = dpdk_main.pktmbuf_pools[i]; if (rmp) { unsigned count = rte_mempool_avail_count (rmp); diff --git a/src/vnet/devices/dpdk/device.c b/src/vnet/devices/dpdk/device.c index b22fbf2e..0deab6aa 100644 --- a/src/vnet/devices/dpdk/device.c +++ b/src/vnet/devices/dpdk/device.c @@ -87,19 +87,18 @@ dpdk_set_mc_filter (vnet_hw_interface_t * hi, struct rte_mbuf * dpdk_replicate_packet_mb (vlib_buffer_t * b) { - vlib_main_t *vm = vlib_get_main (); - vlib_buffer_main_t *bm = vm->buffer_main; + dpdk_main_t *dm = &dpdk_main; struct rte_mbuf **mbufs = 0, *s, *d; u8 nb_segs; unsigned socket_id = rte_socket_id (); int i; - ASSERT (bm->pktmbuf_pools[socket_id]); + ASSERT (dm->pktmbuf_pools[socket_id]); s = rte_mbuf_from_vlib_buffer (b); nb_segs = s->nb_segs; vec_validate (mbufs, nb_segs - 1); - if (rte_pktmbuf_alloc_bulk (bm->pktmbuf_pools[socket_id], mbufs, nb_segs)) + if (rte_pktmbuf_alloc_bulk (dm->pktmbuf_pools[socket_id], mbufs, nb_segs)) { vec_free (mbufs); return 0; diff --git a/src/vnet/devices/dpdk/dpdk.h b/src/vnet/devices/dpdk/dpdk.h index e0436031..066ec6fa 100644 --- a/src/vnet/devices/dpdk/dpdk.h +++ b/src/vnet/devices/dpdk/dpdk.h @@ -425,6 +425,9 @@ typedef struct vlib_main_t *vlib_main; vnet_main_t *vnet_main; dpdk_config_main_t *conf; + + /* mempool */ + struct rte_mempool **pktmbuf_pools; } dpdk_main_t; dpdk_main_t dpdk_main; diff --git a/src/vnet/devices/dpdk/dpdk_priv.h b/src/vnet/devices/dpdk/dpdk_priv.h index 0c81dbc3..dd40ff48 100644 --- a/src/vnet/devices/dpdk/dpdk_priv.h +++ b/src/vnet/devices/dpdk/dpdk_priv.h @@ -13,6 +13,9 @@ * limitations under the License. */ +#define rte_mbuf_from_vlib_buffer(x) (((struct rte_mbuf *)x) - 1) +#define vlib_buffer_from_rte_mbuf(x) ((vlib_buffer_t *)(x+1)) + #define DPDK_NB_RX_DESC_DEFAULT 1024 #define DPDK_NB_TX_DESC_DEFAULT 1024 #define DPDK_NB_RX_DESC_VIRTIO 256 diff --git a/src/vnet/devices/dpdk/init.c b/src/vnet/devices/dpdk/init.c index 60689463..4c040d20 100755 --- a/src/vnet/devices/dpdk/init.c +++ b/src/vnet/devices/dpdk/init.c @@ -64,8 +64,6 @@ static struct rte_eth_conf port_conf_template = { clib_error_t * dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) { - vlib_main_t *vm = vlib_get_main (); - vlib_buffer_main_t *bm = vm->buffer_main; int rv; int j; @@ -107,7 +105,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, xd->cpu_socket, 0, - bm-> + dm-> pktmbuf_pools[xd->cpu_socket_id_by_queue [j]]); @@ -115,7 +113,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) if (rv < 0) rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, SOCKET_ID_ANY, 0, - bm-> + dm-> pktmbuf_pools[xd->cpu_socket_id_by_queue [j]]); if (rv < 0) diff --git a/src/vnet/devices/dpdk/thread.c b/src/vnet/devices/dpdk/thread.c new file mode 100644 index 00000000..475dd142 --- /dev/null +++ b/src/vnet/devices/dpdk/thread.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static clib_error_t * +dpdk_launch_thread (void *fp, vlib_worker_thread_t * w, unsigned lcore_id) +{ + int r; + r = rte_eal_remote_launch (fp, (void *) w, lcore_id); + if (r) + return clib_error_return (0, "Failed to launch thread %u", lcore_id); + return 0; +} + +static clib_error_t * +dpdk_thread_set_lcore (u32 thread, u16 lcore) +{ + return 0; +} + +static vlib_thread_callbacks_t callbacks = { + .vlib_launch_thread_cb = &dpdk_launch_thread, + .vlib_thread_set_lcore_cb = &dpdk_thread_set_lcore, +}; + +static clib_error_t * +dpdk_thread_init (vlib_main_t * vm) +{ + vlib_thread_cb_register (vm, &callbacks); + return 0; +} + +VLIB_INIT_FUNCTION (dpdk_thread_init); + +/** @endcond */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/sr/sr_replicate.c b/src/vnet/sr/sr_replicate.c index 5f9de504..fa5a68c3 100644 --- a/src/vnet/sr/sr_replicate.c +++ b/src/vnet/sr/sr_replicate.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -142,6 +143,7 @@ static uword sr_replicate_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { + dpdk_main_t *dm = &dpdk_main; u32 n_left_from, *from, *to_next; sr_replicate_next_t next_index; int pkts_replicated = 0; @@ -149,7 +151,6 @@ sr_replicate_node_fn (vlib_main_t * vm, int no_buffer_drops = 0; vlib_buffer_free_list_t *fl; unsigned socket_id = rte_socket_id (); - vlib_buffer_main_t *bm = vm->buffer_main; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -246,13 +247,13 @@ sr_replicate_node_fn (vlib_main_t * vm, vlib_buffer_t *clone0_c, *clone_b0; t0 = vec_elt_at_index (sm->tunnels, pol0->tunnel_indices[i]); - hdr_mb0 = rte_pktmbuf_alloc (bm->pktmbuf_pools[socket_id]); + hdr_mb0 = rte_pktmbuf_alloc (dm->pktmbuf_pools[socket_id]); if (i < (num_replicas - 1)) { /* Not the last tunnel to process */ clone0 = rte_pktmbuf_clone - (orig_mb0, bm->pktmbuf_pools[socket_id]); + (orig_mb0, dm->pktmbuf_pools[socket_id]); if (clone0 == 0) goto clone_fail; nb_seg = 0; -- cgit 1.2.3-korg From 8a6a3b2ddb2e2929c4697b31746b3f617886f157 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 17 Jan 2017 14:12:42 +0100 Subject: dpdk: remove duplicate code in buffers.c Change-Id: Idc17b4a32d40012556d5d8550942db0372ebf23d Signed-off-by: Damjan Marion --- src/vlib/buffer.c | 73 +++++++---------------- src/vlib/buffer_funcs.h | 48 +++++++++++++++ src/vnet/devices/dpdk/buffer.c | 131 +++-------------------------------------- 3 files changed, 78 insertions(+), 174 deletions(-) (limited to 'src/vlib/buffer.c') diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 0b0e6054..ea4960e2 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -307,10 +307,10 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, #define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) /* Make sure we have at least given number of unaligned buffers. */ -static void -fill_unaligned (vlib_main_t * vm, - vlib_buffer_free_list_t * free_list, - uword n_unaligned_buffers) +void +vlib_buffer_free_list_fill_unaligned (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + uword n_unaligned_buffers) { word la = vec_len (free_list->aligned_buffers); word lu = vec_len (free_list->unaligned_buffers); @@ -333,8 +333,8 @@ fill_unaligned (vlib_main_t * vm, } /* After free aligned buffers may not contain even sized chunks. */ -static void -trim_aligned (vlib_buffer_free_list_t * f) +void +vlib_buffer_free_list_trim_aligned (vlib_buffer_free_list_t * f) { uword l, n_trim; @@ -361,15 +361,15 @@ trim_aligned (vlib_buffer_free_list_t * f) } } -static void -merge_free_lists (vlib_buffer_free_list_t * dst, - vlib_buffer_free_list_t * src) +void +vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst, + vlib_buffer_free_list_t * src) { uword l; u32 *d; - trim_aligned (src); - trim_aligned (dst); + vlib_buffer_free_list_trim_aligned (src); + vlib_buffer_free_list_trim_aligned (dst); l = vec_len (src->aligned_buffers); if (l > 0) @@ -388,16 +388,6 @@ merge_free_lists (vlib_buffer_free_list_t * dst, } } -always_inline u32 -vlib_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size) -{ - vlib_buffer_main_t *bm = vm->buffer_main; - - size = vlib_buffer_round_size (size); - uword *p = hash_get (bm->free_list_by_size, size); - return p ? p[0] : ~0; -} - /* Add buffer free list. */ static u32 vlib_buffer_create_free_list_helper (vlib_main_t * vm, @@ -537,8 +527,9 @@ vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index) merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); if (merge_index != ~0 && merge_index != free_list_index) { - merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, - merge_index), f); + vlib_buffer_merge_free_lists (pool_elt_at_index + (bm->buffer_free_list_pool, merge_index), + f); } del_free_list (vm, f); @@ -567,7 +558,7 @@ fill_free_list (vlib_main_t * vm, u32 *bi; u32 n_remaining, n_alloc, n_this_chunk; - trim_aligned (fl); + vlib_buffer_free_list_trim_aligned (fl); /* Already have enough free buffers on free list? */ n = min_free_buffers - vec_len (fl->aligned_buffers); @@ -666,7 +657,8 @@ alloc_from_free_list (vlib_main_t * vm, else n_unaligned_end = copy_alignment (dst + n_alloc_buffers); - fill_unaligned (vm, free_list, n_unaligned_start + n_unaligned_end); + vlib_buffer_free_list_fill_unaligned (vm, free_list, + n_unaligned_start + n_unaligned_end); u_len = vec_len (free_list->unaligned_buffers); u_src = free_list->unaligned_buffers + u_len - 1; @@ -779,29 +771,6 @@ vlib_buffer_alloc_from_free_list_internal (vlib_main_t * vm, return alloc_from_free_list (vm, f, buffers, n_buffers); } -always_inline void -add_buffer_to_free_list (vlib_main_t * vm, - vlib_buffer_free_list_t * f, - u32 buffer_index, u8 do_init) -{ - vlib_buffer_t *b; - b = vlib_get_buffer (vm, buffer_index); - if (PREDICT_TRUE (do_init)) - vlib_buffer_init_for_free_list (b, f); - vec_add1_aligned (f->aligned_buffers, buffer_index, - sizeof (vlib_copy_unit_t)); -} - -always_inline vlib_buffer_free_list_t * -buffer_get_free_list (vlib_main_t * vm, vlib_buffer_t * b, u32 * index) -{ - vlib_buffer_main_t *bm = vm->buffer_main; - u32 i; - - *index = i = b->free_list_index; - return pool_elt_at_index (bm->buffer_free_list_pool, i); -} - void * vlib_set_buffer_free_callback (vlib_main_t * vm, void *fp) { @@ -845,7 +814,7 @@ vlib_buffer_free_inline (vlib_main_t * vm, vlib_buffer_t *b0; b0 = vlib_get_buffer (vm, bi0); - fl = buffer_get_free_list (vm, b0, &fi); + fl = vlib_buffer_get_buffer_free_list (vm, b0, &fi); if (fl->buffers_added_to_freelist_function) vec_add1 (announce_list, fl); } @@ -926,7 +895,7 @@ again: fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); fl1 = pool_elt_at_index (bm->buffer_free_list_pool, fi1); - add_buffer_to_free_list (vm, fl0, bi0, free0); + vlib_buffer_add_to_free_list (vm, fl0, bi0, free0); if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) { int i; @@ -946,7 +915,7 @@ again: } no_fl1: - add_buffer_to_free_list (vm, fl1, bi1, free1); + vlib_buffer_add_to_free_list (vm, fl1, bi1, free1); /* Possibly change current free list. */ if (fi0 != fi && fi1 != fi) @@ -1003,7 +972,7 @@ again: fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); - add_buffer_to_free_list (vm, fl0, bi0, free0); + vlib_buffer_add_to_free_list (vm, fl0, bi0, free0); if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) { int i; diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 15d93c16..543a903c 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -350,6 +350,41 @@ vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes, char *fmt, ...); + +/* After free aligned buffers may not contain even sized chunks. */ +void vlib_buffer_free_list_trim_aligned (vlib_buffer_free_list_t * f); + +/* Merge two free lists */ +void vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst, + vlib_buffer_free_list_t * src); + +/* Make sure we have at least given number of unaligned buffers. */ +void vlib_buffer_free_list_fill_unaligned (vlib_main_t * vm, + vlib_buffer_free_list_t * + free_list, + uword n_unaligned_buffers); + +always_inline u32 +vlib_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + size = vlib_buffer_round_size (size); + uword *p = hash_get (bm->free_list_by_size, size); + return p ? p[0] : ~0; +} + +always_inline vlib_buffer_free_list_t * +vlib_buffer_get_buffer_free_list (vlib_main_t * vm, vlib_buffer_t * b, + u32 * index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + u32 i; + + *index = i = b->free_list_index; + return pool_elt_at_index (bm->buffer_free_list_pool, i); +} + always_inline vlib_buffer_free_list_t * vlib_buffer_get_free_list (vlib_main_t * vm, u32 free_list_index) { @@ -674,6 +709,19 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * _dst, ASSERT (dst->b.total_length_not_including_first_buffer == 0); } +always_inline void +vlib_buffer_add_to_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * f, + u32 buffer_index, u8 do_init) +{ + vlib_buffer_t *b; + b = vlib_get_buffer (vm, buffer_index); + if (PREDICT_TRUE (do_init)) + vlib_buffer_init_for_free_list (b, f); + vec_add1_aligned (f->aligned_buffers, buffer_index, + sizeof (vlib_copy_unit_t)); +} + always_inline void vlib_buffer_init_two_for_free_list (vlib_buffer_t * _dst0, vlib_buffer_t * _dst1, diff --git a/src/vnet/devices/dpdk/buffer.c b/src/vnet/devices/dpdk/buffer.c index 214a9162..038f46d9 100644 --- a/src/vnet/devices/dpdk/buffer.c +++ b/src/vnet/devices/dpdk/buffer.c @@ -81,98 +81,6 @@ STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM, #define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) -/* Make sure we have at least given number of unaligned buffers. */ -static void -fill_unaligned (vlib_main_t * vm, - vlib_buffer_free_list_t * free_list, - uword n_unaligned_buffers) -{ - word la = vec_len (free_list->aligned_buffers); - word lu = vec_len (free_list->unaligned_buffers); - - /* Aligned come in aligned copy-sized chunks. */ - ASSERT (la % BUFFERS_PER_COPY == 0); - - ASSERT (la >= n_unaligned_buffers); - - while (lu < n_unaligned_buffers) - { - /* Copy 4 buffers from end of aligned vector to unaligned vector. */ - vec_add (free_list->unaligned_buffers, - free_list->aligned_buffers + la - BUFFERS_PER_COPY, - BUFFERS_PER_COPY); - la -= BUFFERS_PER_COPY; - lu += BUFFERS_PER_COPY; - } - _vec_len (free_list->aligned_buffers) = la; -} - -/* After free aligned buffers may not contain even sized chunks. */ -static void -trim_aligned (vlib_buffer_free_list_t * f) -{ - uword l, n_trim; - - /* Add unaligned to aligned before trim. */ - l = vec_len (f->unaligned_buffers); - if (l > 0) - { - vec_add_aligned (f->aligned_buffers, f->unaligned_buffers, l, - /* align */ sizeof (vlib_copy_unit_t)); - - _vec_len (f->unaligned_buffers) = 0; - } - - /* Remove unaligned buffers from end of aligned vector and save for next trim. */ - l = vec_len (f->aligned_buffers); - n_trim = l % BUFFERS_PER_COPY; - if (n_trim) - { - /* Trim aligned -> unaligned. */ - vec_add (f->unaligned_buffers, f->aligned_buffers + l - n_trim, n_trim); - - /* Remove from aligned. */ - _vec_len (f->aligned_buffers) = l - n_trim; - } -} - -static void -merge_free_lists (vlib_buffer_free_list_t * dst, - vlib_buffer_free_list_t * src) -{ - uword l; - u32 *d; - - trim_aligned (src); - trim_aligned (dst); - - l = vec_len (src->aligned_buffers); - if (l > 0) - { - vec_add2_aligned (dst->aligned_buffers, d, l, - /* align */ sizeof (vlib_copy_unit_t)); - clib_memcpy (d, src->aligned_buffers, l * sizeof (d[0])); - vec_free (src->aligned_buffers); - } - - l = vec_len (src->unaligned_buffers); - if (l > 0) - { - vec_add (dst->unaligned_buffers, src->unaligned_buffers, l); - vec_free (src->unaligned_buffers); - } -} - -always_inline u32 -dpdk_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size) -{ - vlib_buffer_main_t *bm = vm->buffer_main; - - size = vlib_buffer_round_size (size); - uword *p = hash_get (bm->free_list_by_size, size); - return p ? p[0] : ~0; -} - static void del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) { @@ -212,11 +120,12 @@ dpdk_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) f = vlib_buffer_get_free_list (vm, free_list_index); - merge_index = dpdk_buffer_get_free_list_with_size (vm, f->n_data_bytes); + merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); if (merge_index != ~0 && merge_index != free_list_index) { - merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, - merge_index), f); + vlib_buffer_merge_free_lists (pool_elt_at_index + (bm->buffer_free_list_pool, merge_index), + f); } del_free_list (vm, f); @@ -253,7 +162,7 @@ fill_free_list (vlib_main_t * vm, if (PREDICT_FALSE (rmp == 0)) return 0; - trim_aligned (fl); + vlib_buffer_free_list_trim_aligned (fl); /* Already have enough free buffers on free list? */ n = min_free_buffers - vec_len (fl->aligned_buffers); @@ -333,7 +242,8 @@ alloc_from_free_list (vlib_main_t * vm, else n_unaligned_end = copy_alignment (dst + n_alloc_buffers); - fill_unaligned (vm, free_list, n_unaligned_start + n_unaligned_end); + vlib_buffer_free_list_fill_unaligned (vm, free_list, + n_unaligned_start + n_unaligned_end); u_len = vec_len (free_list->unaligned_buffers); u_src = free_list->unaligned_buffers + u_len - 1; @@ -442,29 +352,6 @@ dpdk_buffer_alloc_from_free_list (vlib_main_t * vm, return alloc_from_free_list (vm, f, buffers, n_buffers); } -always_inline void -add_buffer_to_free_list (vlib_main_t * vm, - vlib_buffer_free_list_t * f, - u32 buffer_index, u8 do_init) -{ - vlib_buffer_t *b; - b = vlib_get_buffer (vm, buffer_index); - if (PREDICT_TRUE (do_init)) - vlib_buffer_init_for_free_list (b, f); - vec_add1_aligned (f->aligned_buffers, buffer_index, - sizeof (vlib_copy_unit_t)); -} - -always_inline vlib_buffer_free_list_t * -buffer_get_free_list (vlib_main_t * vm, vlib_buffer_t * b, u32 * index) -{ - vlib_buffer_main_t *bm = vm->buffer_main; - u32 i; - - *index = i = b->free_list_index; - return pool_elt_at_index (bm->buffer_free_list_pool, i); -} - static_always_inline void vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers, u32 follow_buffer_next) @@ -491,14 +378,14 @@ vlib_buffer_free_inline (vlib_main_t * vm, b = vlib_get_buffer (vm, buffers[i]); - fl = buffer_get_free_list (vm, b, &fi); + fl = vlib_buffer_get_buffer_free_list (vm, b, &fi); /* The only current use of this callback: multicast recycle */ if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0)) { int j; - add_buffer_to_free_list + vlib_buffer_add_to_free_list (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0); for (j = 0; j < vec_len (bm->announce_list); j++) -- cgit 1.2.3-korg From bd69a5f24c6e83e9101f203dd124864fb2877a17 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Sun, 5 Feb 2017 23:44:42 +0100 Subject: vlib: remove algned/unaligned buffers scheme Change-Id: I4433eaed3f4e201edc329c4842cbbf74beb19a9a Signed-off-by: Damjan Marion --- src/vlib/buffer.c | 220 +++++------------------------------------ src/vlib/buffer.h | 13 +-- src/vlib/buffer_funcs.h | 53 +++------- src/vlib/threads.c | 3 +- src/vnet/devices/dpdk/buffer.c | 131 +++--------------------- src/vnet/replication.c | 23 +---- 6 files changed, 57 insertions(+), 386 deletions(-) (limited to 'src/vlib/buffer.c') diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index ea4960e2..95b4344f 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -304,63 +304,6 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, } } -#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) - -/* Make sure we have at least given number of unaligned buffers. */ -void -vlib_buffer_free_list_fill_unaligned (vlib_main_t * vm, - vlib_buffer_free_list_t * free_list, - uword n_unaligned_buffers) -{ - word la = vec_len (free_list->aligned_buffers); - word lu = vec_len (free_list->unaligned_buffers); - - /* Aligned come in aligned copy-sized chunks. */ - ASSERT (la % BUFFERS_PER_COPY == 0); - - ASSERT (la >= n_unaligned_buffers); - - while (lu < n_unaligned_buffers) - { - /* Copy 4 buffers from end of aligned vector to unaligned vector. */ - vec_add (free_list->unaligned_buffers, - free_list->aligned_buffers + la - BUFFERS_PER_COPY, - BUFFERS_PER_COPY); - la -= BUFFERS_PER_COPY; - lu += BUFFERS_PER_COPY; - } - _vec_len (free_list->aligned_buffers) = la; -} - -/* After free aligned buffers may not contain even sized chunks. */ -void -vlib_buffer_free_list_trim_aligned (vlib_buffer_free_list_t * f) -{ - uword l, n_trim; - - /* Add unaligned to aligned before trim. */ - l = vec_len (f->unaligned_buffers); - if (l > 0) - { - vec_add_aligned (f->aligned_buffers, f->unaligned_buffers, l, - /* align */ sizeof (vlib_copy_unit_t)); - - _vec_len (f->unaligned_buffers) = 0; - } - - /* Remove unaligned buffers from end of aligned vector and save for next trim. */ - l = vec_len (f->aligned_buffers); - n_trim = l % BUFFERS_PER_COPY; - if (n_trim) - { - /* Trim aligned -> unaligned. */ - vec_add (f->unaligned_buffers, f->aligned_buffers + l - n_trim, n_trim); - - /* Remove from aligned. */ - _vec_len (f->aligned_buffers) = l - n_trim; - } -} - void vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst, vlib_buffer_free_list_t * src) @@ -368,23 +311,12 @@ vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst, uword l; u32 *d; - vlib_buffer_free_list_trim_aligned (src); - vlib_buffer_free_list_trim_aligned (dst); - - l = vec_len (src->aligned_buffers); - if (l > 0) - { - vec_add2_aligned (dst->aligned_buffers, d, l, - /* align */ sizeof (vlib_copy_unit_t)); - clib_memcpy (d, src->aligned_buffers, l * sizeof (d[0])); - vec_free (src->aligned_buffers); - } - - l = vec_len (src->unaligned_buffers); + l = vec_len (src->buffers); if (l > 0) { - vec_add (dst->unaligned_buffers, src->unaligned_buffers, l); - vec_free (src->unaligned_buffers); + vec_add2_aligned (dst->buffers, d, l, CLIB_CACHE_LINE_BYTES); + clib_memcpy (d, src->buffers, l * sizeof (d[0])); + vec_free (src->buffers); } } @@ -447,8 +379,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, ASSERT (f - bm->buffer_free_list_pool == wf - wbm->buffer_free_list_pool); wf[0] = f[0]; - wf->aligned_buffers = 0; - wf->unaligned_buffers = 0; + wf->buffers = 0; wf->n_alloc = 0; } @@ -505,8 +436,7 @@ del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) vm->os_physmem_free (f->buffer_memory_allocated[i]); vec_free (f->name); vec_free (f->buffer_memory_allocated); - vec_free (f->unaligned_buffers); - vec_free (f->aligned_buffers); + vec_free (f->buffers); } /* Add buffer free list. */ @@ -522,8 +452,7 @@ vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index) f = vlib_buffer_get_free_list (vm, free_list_index); - ASSERT (vec_len (f->unaligned_buffers) + vec_len (f->aligned_buffers) == - f->n_alloc); + ASSERT (vec_len (f->buffers) == f->n_alloc); merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); if (merge_index != ~0 && merge_index != free_list_index) { @@ -558,15 +487,13 @@ fill_free_list (vlib_main_t * vm, u32 *bi; u32 n_remaining, n_alloc, n_this_chunk; - vlib_buffer_free_list_trim_aligned (fl); - /* Already have enough free buffers on free list? */ - n = min_free_buffers - vec_len (fl->aligned_buffers); + n = min_free_buffers - vec_len (fl->buffers); if (n <= 0) return min_free_buffers; /* Always allocate round number of buffers. */ - n = round_pow2 (n, BUFFERS_PER_COPY); + n = round_pow2 (n, CLIB_CACHE_LINE_BYTES / sizeof (u32)); /* Always allocate new buffers in reasonably large sized chunks. */ n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); @@ -594,8 +521,7 @@ fill_free_list (vlib_main_t * vm, n_remaining -= n_this_chunk; b = buffers; - vec_add2_aligned (fl->aligned_buffers, bi, n_this_chunk, - sizeof (vlib_copy_unit_t)); + vec_add2_aligned (fl->buffers, bi, n_this_chunk, CLIB_CACHE_LINE_BYTES); for (i = 0; i < n_this_chunk; i++) { bi[i] = vlib_get_buffer_index (vm, b); @@ -621,121 +547,28 @@ fill_free_list (vlib_main_t * vm, return n_alloc; } -always_inline uword -copy_alignment (u32 * x) -{ - return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY; -} - - static u32 alloc_from_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * free_list, u32 * alloc_buffers, u32 n_alloc_buffers) { - u32 *dst, *u_src; - uword u_len, n_left; - uword n_unaligned_start, n_unaligned_end, n_filled; + u32 *dst, *src; + uword len; + uword n_filled; - n_left = n_alloc_buffers; dst = alloc_buffers; - n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst)) - & (BUFFERS_PER_COPY - 1)); n_filled = fill_free_list (vm, free_list, n_alloc_buffers); if (n_filled == 0) return 0; - n_left = n_filled < n_left ? n_filled : n_left; - n_alloc_buffers = n_left; - - if (n_unaligned_start >= n_left) - { - n_unaligned_start = n_left; - n_unaligned_end = 0; - } - else - n_unaligned_end = copy_alignment (dst + n_alloc_buffers); - - vlib_buffer_free_list_fill_unaligned (vm, free_list, - n_unaligned_start + n_unaligned_end); - - u_len = vec_len (free_list->unaligned_buffers); - u_src = free_list->unaligned_buffers + u_len - 1; + len = vec_len (free_list->buffers); + ASSERT (len >= n_alloc_buffers); - if (n_unaligned_start) - { - uword n_copy = n_unaligned_start; - if (n_copy > n_left) - n_copy = n_left; - n_left -= n_copy; - - while (n_copy > 0) - { - *dst++ = *u_src--; - n_copy--; - u_len--; - } - - /* Now dst should be aligned. */ - if (n_left > 0) - ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0); - } - - /* Aligned copy. */ - { - vlib_copy_unit_t *d, *s; - uword n_copy; - - if (vec_len (free_list->aligned_buffers) < - ((n_left / BUFFERS_PER_COPY) * BUFFERS_PER_COPY)) - abort (); - - n_copy = n_left / BUFFERS_PER_COPY; - n_left = n_left % BUFFERS_PER_COPY; - - /* Remove buffers from aligned free list. */ - _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY; - - s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers); - d = (vlib_copy_unit_t *) dst; - - /* Fast path loop. */ - while (n_copy >= 4) - { - d[0] = s[0]; - d[1] = s[1]; - d[2] = s[2]; - d[3] = s[3]; - n_copy -= 4; - s += 4; - d += 4; - } - - while (n_copy >= 1) - { - d[0] = s[0]; - n_copy -= 1; - s += 1; - d += 1; - } - - dst = (void *) d; - } - - /* Unaligned copy. */ - ASSERT (n_unaligned_end == n_left); - while (n_left > 0) - { - *dst++ = *u_src--; - n_left--; - u_len--; - } + src = free_list->buffers + len - n_alloc_buffers; + clib_memcpy (dst, src, n_alloc_buffers * sizeof (u32)); - if (!free_list->unaligned_buffers) - ASSERT (u_len == 0); - else - _vec_len (free_list->unaligned_buffers) = u_len; + _vec_len (free_list->buffers) -= n_alloc_buffers; /* Verify that buffers are known free. */ vlib_buffer_validate_alloc_free (vm, alloc_buffers, @@ -831,8 +664,7 @@ again: vlib_buffer_validate_alloc_free (vm, b, n_left, VLIB_BUFFER_KNOWN_ALLOCATED); - vec_add2_aligned (fl->aligned_buffers, f, n_left, - /* align */ sizeof (vlib_copy_unit_t)); + vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES); n = next_to_free[i_next_to_free]; while (n_left >= 4) @@ -890,7 +722,7 @@ again: f -= 2; n -= free_next0 + free_next1; - _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + _vec_len (fl->buffers) = f - fl->buffers; fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); fl1 = pool_elt_at_index (bm->buffer_free_list_pool, fi1); @@ -924,8 +756,7 @@ again: fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); } - vec_add2_aligned (fl->aligned_buffers, f, n_left, - /* align */ sizeof (vlib_copy_unit_t)); + vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES); } while (n_left >= 1) @@ -968,7 +799,7 @@ again: f -= 1; n -= free_next0; - _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + _vec_len (fl->buffers) = f - fl->buffers; fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); @@ -986,8 +817,7 @@ again: fi = fi0; fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); - vec_add2_aligned (fl->aligned_buffers, f, n_left, - /* align */ sizeof (vlib_copy_unit_t)); + vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES); } if (follow_buffer_next && ((n_left = n - next_to_free[i_next_to_free]) > 0)) @@ -997,7 +827,7 @@ again: goto again; } - _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + _vec_len (fl->buffers) = f - fl->buffers; if (vec_len (announce_list)) { @@ -1239,7 +1069,7 @@ format_vlib_buffer_free_list (u8 * s, va_list * va) "#Alloc", "#Free"); size = sizeof (vlib_buffer_t) + f->n_data_bytes; - n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers); + n_free = vec_len (f->buffers); bytes_alloc = size * f->n_alloc; bytes_free = size * n_free; diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index d270c08a..fffb50c8 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -50,12 +50,6 @@ #define VLIB_BUFFER_DATA_SIZE (2048) #define VLIB_BUFFER_PRE_DATA_SIZE __PRE_DATA_SIZE -#if defined (CLIB_HAVE_VEC128) || defined (__aarch64__) -typedef u8x16 vlib_copy_unit_t; -#else -typedef u64 vlib_copy_unit_t; -#endif - /** \file vlib buffer structure definition and a few select access methods. This structure and the buffer allocation @@ -262,11 +256,8 @@ typedef struct vlib_buffer_free_list_t /* Total number of buffers allocated from this free list. */ u32 n_alloc; - /* Vector of free buffers. Each element is a byte offset into I/O heap. - Aligned vectors always has naturally aligned vlib_copy_unit_t sized chunks - of buffer indices. Unaligned vector has any left over. This is meant to - speed up copy routines. */ - u32 *aligned_buffers, *unaligned_buffers; + /* Vector of free buffers. Each element is a byte offset into I/O heap. */ + u32 *buffers; /* Memory chunks allocated for this free list recorded here so they can be freed when free list diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 543a903c..fd051de5 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -350,10 +350,6 @@ vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes, char *fmt, ...); - -/* After free aligned buffers may not contain even sized chunks. */ -void vlib_buffer_free_list_trim_aligned (vlib_buffer_free_list_t * f); - /* Merge two free lists */ void vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst, vlib_buffer_free_list_t * src); @@ -664,23 +660,14 @@ unserialize_vlib_buffer_n_bytes (serialize_main_t * m) return n; } -typedef union -{ - vlib_buffer_t b; - vlib_copy_unit_t i[sizeof (vlib_buffer_t) / sizeof (vlib_copy_unit_t)]; -} -vlib_buffer_union_t; - /* Set a buffer quickly into "uninitialized" state. We want this to be extremely cheap and arrange for all fields that need to be initialized to be in the first 128 bits of the buffer. */ always_inline void -vlib_buffer_init_for_free_list (vlib_buffer_t * _dst, +vlib_buffer_init_for_free_list (vlib_buffer_t * dst, vlib_buffer_free_list_t * fl) { - vlib_buffer_union_t *dst = (vlib_buffer_union_t *) _dst; - vlib_buffer_union_t *src = - (vlib_buffer_union_t *) & fl->buffer_init_template; + vlib_buffer_t *src = &fl->buffer_init_template; /* Make sure vlib_buffer_t is cacheline aligned and sized */ ASSERT (STRUCT_OFFSET_OF (vlib_buffer_t, cacheline0) == 0); @@ -692,21 +679,14 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * _dst, /* Make sure buffer template is sane. */ ASSERT (fl->index == fl->buffer_init_template.free_list_index); - /* Copy template from src->current_data thru src->free_list_index */ - dst->i[0] = src->i[0]; - if (1 * sizeof (dst->i[0]) < 16) - dst->i[1] = src->i[1]; - if (2 * sizeof (dst->i[0]) < 16) - dst->i[2] = src->i[2]; - /* Make sure it really worked. */ -#define _(f) ASSERT (dst->b.f == src->b.f) +#define _(f) dst->f = src->f _(current_data); _(current_length); _(flags); _(free_list_index); #undef _ - ASSERT (dst->b.total_length_not_including_first_buffer == 0); + ASSERT (dst->total_length_not_including_first_buffer == 0); } always_inline void @@ -718,39 +698,28 @@ vlib_buffer_add_to_free_list (vlib_main_t * vm, b = vlib_get_buffer (vm, buffer_index); if (PREDICT_TRUE (do_init)) vlib_buffer_init_for_free_list (b, f); - vec_add1_aligned (f->aligned_buffers, buffer_index, - sizeof (vlib_copy_unit_t)); + vec_add1_aligned (f->buffers, buffer_index, CLIB_CACHE_LINE_BYTES); } always_inline void -vlib_buffer_init_two_for_free_list (vlib_buffer_t * _dst0, - vlib_buffer_t * _dst1, +vlib_buffer_init_two_for_free_list (vlib_buffer_t * dst0, + vlib_buffer_t * dst1, vlib_buffer_free_list_t * fl) { - vlib_buffer_union_t *dst0 = (vlib_buffer_union_t *) _dst0; - vlib_buffer_union_t *dst1 = (vlib_buffer_union_t *) _dst1; - vlib_buffer_union_t *src = - (vlib_buffer_union_t *) & fl->buffer_init_template; + vlib_buffer_t *src = &fl->buffer_init_template; /* Make sure buffer template is sane. */ ASSERT (fl->index == fl->buffer_init_template.free_list_index); - /* Copy template from src->current_data thru src->free_list_index */ - dst0->i[0] = dst1->i[0] = src->i[0]; - if (1 * sizeof (dst0->i[0]) < 16) - dst0->i[1] = dst1->i[1] = src->i[1]; - if (2 * sizeof (dst0->i[0]) < 16) - dst0->i[2] = dst1->i[2] = src->i[2]; - /* Make sure it really worked. */ -#define _(f) ASSERT (dst0->b.f == src->b.f && dst1->b.f == src->b.f) +#define _(f) dst0->f = src->f; dst1->f = src->f _(current_data); _(current_length); _(flags); _(free_list_index); #undef _ - ASSERT (dst0->b.total_length_not_including_first_buffer == 0); - ASSERT (dst1->b.total_length_not_including_first_buffer == 0); + ASSERT (dst0->total_length_not_including_first_buffer == 0); + ASSERT (dst1->total_length_not_including_first_buffer == 0); } #if CLIB_DEBUG > 0 diff --git a/src/vlib/threads.c b/src/vlib/threads.c index b3bbd30e..e3ea3c9c 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -708,8 +708,7 @@ start_workers (vlib_main_t * vm) == fl_clone - bm_clone->buffer_free_list_pool); fl_clone[0] = fl_orig[0]; - fl_clone->aligned_buffers = 0; - fl_clone->unaligned_buffers = 0; + fl_clone->buffers = 0; fl_clone->n_alloc = 0; })); /* *INDENT-ON* */ diff --git a/src/vnet/devices/dpdk/buffer.c b/src/vnet/devices/dpdk/buffer.c index 038f46d9..43ceb91e 100644 --- a/src/vnet/devices/dpdk/buffer.c +++ b/src/vnet/devices/dpdk/buffer.c @@ -79,8 +79,6 @@ STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM, "VLIB_BUFFER_PRE_DATA_SIZE must be equal to RTE_PKTMBUF_HEADROOM"); -#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) - static void del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) { @@ -88,23 +86,15 @@ del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) struct rte_mbuf *mb; vlib_buffer_t *b; - for (i = 0; i < vec_len (f->unaligned_buffers); i++) + for (i = 0; i < vec_len (f->buffers); i++) { - b = vlib_get_buffer (vm, f->unaligned_buffers[i]); - mb = rte_mbuf_from_vlib_buffer (b); - ASSERT (rte_mbuf_refcnt_read (mb) == 1); - rte_pktmbuf_free (mb); - } - for (i = 0; i < vec_len (f->aligned_buffers); i++) - { - b = vlib_get_buffer (vm, f->aligned_buffers[i]); + b = vlib_get_buffer (vm, f->buffers[i]); mb = rte_mbuf_from_vlib_buffer (b); ASSERT (rte_mbuf_refcnt_read (mb) == 1); rte_pktmbuf_free (mb); } vec_free (f->name); - vec_free (f->unaligned_buffers); - vec_free (f->aligned_buffers); + vec_free (f->buffers); } /* Add buffer free list. */ @@ -162,15 +152,13 @@ fill_free_list (vlib_main_t * vm, if (PREDICT_FALSE (rmp == 0)) return 0; - vlib_buffer_free_list_trim_aligned (fl); - /* Already have enough free buffers on free list? */ - n = min_free_buffers - vec_len (fl->aligned_buffers); + n = min_free_buffers - vec_len (fl->buffers); if (n <= 0) return min_free_buffers; /* Always allocate round number of buffers. */ - n = round_pow2 (n, BUFFERS_PER_COPY); + n = round_pow2 (n, CLIB_CACHE_LINE_BYTES / sizeof (u32)); /* Always allocate new buffers in reasonably large sized chunks. */ n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); @@ -192,7 +180,7 @@ fill_free_list (vlib_main_t * vm, b = vlib_buffer_from_rte_mbuf (mb); bi = vlib_get_buffer_index (vm, b); - vec_add1_aligned (fl->aligned_buffers, bi, sizeof (vlib_copy_unit_t)); + vec_add1_aligned (fl->buffers, bi, CLIB_CACHE_LINE_BYTES); n_alloc++; n_remaining--; @@ -207,120 +195,27 @@ fill_free_list (vlib_main_t * vm, return n; } -always_inline uword -copy_alignment (u32 * x) -{ - return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY; -} - static u32 alloc_from_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * free_list, u32 * alloc_buffers, u32 n_alloc_buffers) { - u32 *dst, *u_src; - uword u_len, n_left; - uword n_unaligned_start, n_unaligned_end, n_filled; + u32 *dst, *src; + uword len, n_filled; - n_left = n_alloc_buffers; dst = alloc_buffers; - n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst)) - & (BUFFERS_PER_COPY - 1)); n_filled = fill_free_list (vm, free_list, n_alloc_buffers); if (n_filled == 0) return 0; - n_left = n_filled < n_left ? n_filled : n_left; - n_alloc_buffers = n_left; - - if (n_unaligned_start >= n_left) - { - n_unaligned_start = n_left; - n_unaligned_end = 0; - } - else - n_unaligned_end = copy_alignment (dst + n_alloc_buffers); - - vlib_buffer_free_list_fill_unaligned (vm, free_list, - n_unaligned_start + n_unaligned_end); - - u_len = vec_len (free_list->unaligned_buffers); - u_src = free_list->unaligned_buffers + u_len - 1; - - if (n_unaligned_start) - { - uword n_copy = n_unaligned_start; - if (n_copy > n_left) - n_copy = n_left; - n_left -= n_copy; - - while (n_copy > 0) - { - *dst++ = *u_src--; - n_copy--; - u_len--; - } - - /* Now dst should be aligned. */ - if (n_left > 0) - ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0); - } - - /* Aligned copy. */ - { - vlib_copy_unit_t *d, *s; - uword n_copy; - - if (vec_len (free_list->aligned_buffers) < - ((n_left / BUFFERS_PER_COPY) * BUFFERS_PER_COPY)) - abort (); - - n_copy = n_left / BUFFERS_PER_COPY; - n_left = n_left % BUFFERS_PER_COPY; - - /* Remove buffers from aligned free list. */ - _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY; - - s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers); - d = (vlib_copy_unit_t *) dst; + len = vec_len (free_list->buffers); + ASSERT (len >= n_alloc_buffers); - /* Fast path loop. */ - while (n_copy >= 4) - { - d[0] = s[0]; - d[1] = s[1]; - d[2] = s[2]; - d[3] = s[3]; - n_copy -= 4; - s += 4; - d += 4; - } - - while (n_copy >= 1) - { - d[0] = s[0]; - n_copy -= 1; - s += 1; - d += 1; - } - - dst = (void *) d; - } - - /* Unaligned copy. */ - ASSERT (n_unaligned_end == n_left); - while (n_left > 0) - { - *dst++ = *u_src--; - n_left--; - u_len--; - } + src = free_list->buffers + len - n_alloc_buffers; + clib_memcpy (dst, src, n_alloc_buffers * sizeof (u32)); - if (!free_list->unaligned_buffers) - ASSERT (u_len == 0); - else - _vec_len (free_list->unaligned_buffers) = u_len; + _vec_len (free_list->buffers) -= n_alloc_buffers; return n_alloc_buffers; } diff --git a/src/vnet/replication.c b/src/vnet/replication.c index 561c86cd..02755195 100644 --- a/src/vnet/replication.c +++ b/src/vnet/replication.c @@ -168,32 +168,20 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) * Note: this could be sped up if the node index were stuffed into * the freelist itself. */ - if (vec_len (fl->aligned_buffers) > 0) + if (vec_len (fl->buffers) > 0) { - bi0 = fl->aligned_buffers[0]; - b0 = vlib_get_buffer (vm, bi0); - ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count); - feature_node_index = ctx->recycle_node_index; - } - else if (vec_len (fl->unaligned_buffers) > 0) - { - bi0 = fl->unaligned_buffers[0]; + bi0 = fl->buffers[0]; b0 = vlib_get_buffer (vm, bi0); ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count); feature_node_index = ctx->recycle_node_index; } - /* aligned, unaligned buffers */ + /* buffers */ for (i = 0; i < 2; i++) { if (i == 0) { - from = fl->aligned_buffers; - n_left_from = vec_len (from); - } - else - { - from = fl->unaligned_buffers; + from = fl->buffers; n_left_from = vec_len (from); } @@ -245,8 +233,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) } } - vec_reset_length (fl->aligned_buffers); - vec_reset_length (fl->unaligned_buffers); + vec_reset_length (fl->buffers); if (f) { -- cgit 1.2.3-korg From c47ed032c6d036a9f942fc9ced48874fad55b48c Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 25 Jan 2017 14:18:03 +0100 Subject: vlib: add buffer cloning support Change-Id: I50070611af15b2b4cc29664a8bee4f821ac3c835 Signed-off-by: Damjan Marion --- src/scripts/vnet/mcast/ip4 | 19 +-- src/vlib/buffer.c | 254 ++++++++++------------------------------- src/vlib/buffer.h | 4 +- src/vlib/buffer_funcs.h | 113 +++++++++++++++++- src/vnet/devices/dpdk/buffer.c | 41 +++++-- src/vnet/devices/dpdk/device.c | 11 +- src/vnet/dpo/replicate_dpo.c | 76 ++++++------ src/vnet/dpo/replicate_dpo.h | 3 + 8 files changed, 256 insertions(+), 265 deletions(-) (limited to 'src/vlib/buffer.c') diff --git a/src/scripts/vnet/mcast/ip4 b/src/scripts/vnet/mcast/ip4 index 69f1ee00..eb6bab27 100644 --- a/src/scripts/vnet/mcast/ip4 +++ b/src/scripts/vnet/mcast/ip4 @@ -2,7 +2,7 @@ packet-generator new { name x limit 1 node ip4-input - size 64-64 + size 512-512 no-recycle data { ICMP: 1.0.0.2 -> 232.1.1.1 @@ -11,12 +11,15 @@ packet-generator new { } } -trace add pg-input 100 -loop create -loop create -set int state loop0 up -set int state loop1 up +create packet-generator interface pg1 +create packet-generator interface pg2 +create packet-generator interface pg3 + +set int state pg1 up +set int state pg2 up +set int state pg3 up ip mroute add 232.1.1.1 via pg0 Accept -ip mroute add 232.1.1.1 via loop0 Forward -ip mroute add 232.1.1.1 via loop1 Forward +ip mroute add 232.1.1.1 via pg1 Forward +ip mroute add 232.1.1.1 via pg2 Forward +ip mroute add 232.1.1.1 via pg3 Forward diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 95b4344f..4f5eb09d 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -68,8 +68,9 @@ format_vlib_buffer (u8 * s, va_list * args) vlib_buffer_t *b = va_arg (*args, vlib_buffer_t *); uword indent = format_get_indent (s); - s = format (s, "current data %d, length %d, free-list %d", - b->current_data, b->current_length, b->free_list_index); + s = format (s, "current data %d, length %d, free-list %d, clone-count %u", + b->current_data, b->current_length, b->free_list_index, + b->n_add_refs); if (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID) s = format (s, ", totlen-nifb %d", @@ -84,8 +85,10 @@ format_vlib_buffer (u8 * s, va_list * args) u32 next_buffer = b->next_buffer; b = vlib_get_buffer (vm, next_buffer); - s = format (s, "\n%Unext-buffer 0x%x, segment length %d", - format_white_space, indent, next_buffer, b->current_length); + s = + format (s, "\n%Unext-buffer 0x%x, segment length %d, clone-count %u", + format_white_space, indent, next_buffer, b->current_length, + b->n_add_refs); } return s; @@ -262,7 +265,7 @@ vlib_main_t **vlib_mains; /* When dubugging validate that given buffers are either known allocated or known free. */ -static void __attribute__ ((unused)) +static void vlib_buffer_validate_alloc_free (vlib_main_t * vm, u32 * buffers, uword n_buffers, @@ -362,6 +365,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, /* Setup free buffer template. */ f->buffer_init_template.free_list_index = f->index; + f->buffer_init_template.n_add_refs = 0; if (is_public) { @@ -620,19 +624,11 @@ vlib_buffer_free_inline (vlib_main_t * vm, { vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *fl; - static u32 *next_to_free[2]; /* smp bad */ - u32 i_next_to_free, *b, *n, *f, fi; - uword n_left; + u32 fi; int i; - static vlib_buffer_free_list_t **announce_list; - vlib_buffer_free_list_t *fl0 = 0, *fl1 = 0; - u32 bi0 = (u32) ~ 0, bi1 = (u32) ~ 0, fi0, fi1 = (u32) ~ 0; - u8 free0, free1 = 0, free_next0, free_next1; u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers, u32 follow_buffer_next); - ASSERT (os_get_cpu_number () == 0); - cb = bm->buffer_free_callback; if (PREDICT_FALSE (cb != 0)) @@ -641,203 +637,68 @@ vlib_buffer_free_inline (vlib_main_t * vm, if (!n_buffers) return; - /* Use first buffer to get default free list. */ - { - u32 bi0 = buffers[0]; - vlib_buffer_t *b0; - - b0 = vlib_get_buffer (vm, bi0); - fl = vlib_buffer_get_buffer_free_list (vm, b0, &fi); - if (fl->buffers_added_to_freelist_function) - vec_add1 (announce_list, fl); - } - - vec_validate (next_to_free[0], n_buffers - 1); - vec_validate (next_to_free[1], n_buffers - 1); - - i_next_to_free = 0; - n_left = n_buffers; - b = buffers; - -again: - /* Verify that buffers are known allocated. */ - vlib_buffer_validate_alloc_free (vm, b, - n_left, VLIB_BUFFER_KNOWN_ALLOCATED); - - vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES); - - n = next_to_free[i_next_to_free]; - while (n_left >= 4) - { - vlib_buffer_t *b0, *b1, *binit0, *binit1, dummy_buffers[2]; - - bi0 = b[0]; - bi1 = b[1]; - - f[0] = bi0; - f[1] = bi1; - f += 2; - b += 2; - n_left -= 2; - - /* Prefetch buffers for next iteration. */ - vlib_prefetch_buffer_with_index (vm, b[0], WRITE); - vlib_prefetch_buffer_with_index (vm, b[1], WRITE); - - b0 = vlib_get_buffer (vm, bi0); - b1 = vlib_get_buffer (vm, bi1); - - free0 = (b0->flags & VLIB_BUFFER_RECYCLE) == 0; - free1 = (b1->flags & VLIB_BUFFER_RECYCLE) == 0; - - /* Must be before init which will over-write buffer flags. */ - if (follow_buffer_next) - { - n[0] = b0->next_buffer; - free_next0 = free0 && (b0->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; - n += free_next0; - - n[0] = b1->next_buffer; - free_next1 = free1 && (b1->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; - n += free_next1; - } - else - free_next0 = free_next1 = 0; - - /* Must be before init which will over-write buffer free list. */ - fi0 = b0->free_list_index; - fi1 = b1->free_list_index; - - if (PREDICT_FALSE (fi0 != fi || fi1 != fi)) - goto slow_path_x2; - - binit0 = free0 ? b0 : &dummy_buffers[0]; - binit1 = free1 ? b1 : &dummy_buffers[1]; - - vlib_buffer_init_two_for_free_list (binit0, binit1, fl); - continue; - - slow_path_x2: - /* Backup speculation. */ - f -= 2; - n -= free_next0 + free_next1; - - _vec_len (fl->buffers) = f - fl->buffers; - - fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); - fl1 = pool_elt_at_index (bm->buffer_free_list_pool, fi1); - - vlib_buffer_add_to_free_list (vm, fl0, bi0, free0); - if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) - { - int i; - for (i = 0; i < vec_len (announce_list); i++) - if (fl0 == announce_list[i]) - goto no_fl0; - vec_add1 (announce_list, fl0); - } - no_fl0: - if (PREDICT_FALSE (fl1->buffers_added_to_freelist_function != 0)) - { - int i; - for (i = 0; i < vec_len (announce_list); i++) - if (fl1 == announce_list[i]) - goto no_fl1; - vec_add1 (announce_list, fl1); - } - - no_fl1: - vlib_buffer_add_to_free_list (vm, fl1, bi1, free1); - - /* Possibly change current free list. */ - if (fi0 != fi && fi1 != fi) - { - fi = fi1; - fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); - } - - vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES); - } - - while (n_left >= 1) + for (i = 0; i < n_buffers; i++) { - vlib_buffer_t *b0, *binit0, dummy_buffers[1]; + vlib_buffer_t *b; + u32 bi = buffers[i]; - bi0 = b[0]; - f[0] = bi0; - f += 1; - b += 1; - n_left -= 1; - - b0 = vlib_get_buffer (vm, bi0); + b = vlib_get_buffer (vm, bi); - free0 = (b0->flags & VLIB_BUFFER_RECYCLE) == 0; + fl = vlib_buffer_get_buffer_free_list (vm, b, &fi); - /* Must be before init which will over-write buffer flags. */ - if (follow_buffer_next) + /* The only current use of this callback: multicast recycle */ + if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0)) { - n[0] = b0->next_buffer; - free_next0 = free0 && (b0->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; - n += free_next0; + int j; + + vlib_buffer_add_to_free_list + (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0); + + for (j = 0; j < vec_len (bm->announce_list); j++) + { + if (fl == bm->announce_list[j]) + goto already_announced; + } + vec_add1 (bm->announce_list, fl); + already_announced: + ; } else - free_next0 = 0; - - /* Must be before init which will over-write buffer free list. */ - fi0 = b0->free_list_index; - - if (PREDICT_FALSE (fi0 != fi)) - goto slow_path_x1; - - binit0 = free0 ? b0 : &dummy_buffers[0]; - - vlib_buffer_init_for_free_list (binit0, fl); - continue; - - slow_path_x1: - /* Backup speculation. */ - f -= 1; - n -= free_next0; - - _vec_len (fl->buffers) = f - fl->buffers; - - fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); - - vlib_buffer_add_to_free_list (vm, fl0, bi0, free0); - if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) { - int i; - for (i = 0; i < vec_len (announce_list); i++) - if (fl0 == announce_list[i]) - goto no_fl00; - vec_add1 (announce_list, fl0); + if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0)) + { + u32 flags, next; + + do + { + vlib_buffer_t *nb = vlib_get_buffer (vm, bi); + flags = nb->flags; + next = nb->next_buffer; + if (nb->n_add_refs) + nb->n_add_refs--; + else + { + vlib_buffer_validate_alloc_free (vm, &bi, 1, + VLIB_BUFFER_KNOWN_ALLOCATED); + vlib_buffer_add_to_free_list (vm, fl, bi, 1); + } + bi = next; + } + while (follow_buffer_next + && (flags & VLIB_BUFFER_NEXT_PRESENT)); + + } } - - no_fl00: - fi = fi0; - fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); - - vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES); } - - if (follow_buffer_next && ((n_left = n - next_to_free[i_next_to_free]) > 0)) - { - b = next_to_free[i_next_to_free]; - i_next_to_free ^= 1; - goto again; - } - - _vec_len (fl->buffers) = f - fl->buffers; - - if (vec_len (announce_list)) + if (vec_len (bm->announce_list)) { vlib_buffer_free_list_t *fl; - for (i = 0; i < vec_len (announce_list); i++) + for (i = 0; i < vec_len (bm->announce_list); i++) { - fl = announce_list[i]; + fl = bm->announce_list[i]; fl->buffers_added_to_freelist_function (vm, fl); } - _vec_len (announce_list) = 0; + _vec_len (bm->announce_list) = 0; } } @@ -922,6 +783,7 @@ vlib_packet_template_init (vlib_main_t * vm, fl->buffer_init_template.current_data = 0; fl->buffer_init_template.current_length = n_packet_data_bytes; fl->buffer_init_template.flags = 0; + fl->buffer_init_template.n_add_refs = 0; vlib_worker_thread_barrier_release (vm); } diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 8ea79502..b4015b30 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -119,7 +119,9 @@ typedef struct feature node */ - u8 dont_waste_me[3]; /**< Available space in the (precious) + u8 n_add_refs; /**< Number of additional references to this buffer. */ + + u8 dont_waste_me[2]; /**< Available space in the (precious) first 32 octets of buffer metadata Before allocating any of it, discussion required! */ diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 0b583a61..e0fde5f2 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -530,6 +530,110 @@ vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b) return fd; } +/** \brief Create multiple clones of buffer and store them in the supplied array + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param src_buffer - (u32) source buffer index + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u8) number of buffer clones requested + @param head_end_offset - (u16) offset relative to current position + where packet head ends + @return - (u8) number of buffers actually cloned, may be + less than the number requested or zero +*/ + +always_inline u8 +vlib_buffer_clone (vlib_main_t * vm, u32 src_buffer, u32 * buffers, + u8 n_buffers, u16 head_end_offset) +{ + u8 i; + vlib_buffer_t *s = vlib_get_buffer (vm, src_buffer); + + ASSERT (s->n_add_refs == 0); + ASSERT (n_buffers); + + if (s->current_length <= head_end_offset + CLIB_CACHE_LINE_BYTES * 2) + { + buffers[0] = src_buffer; + for (i = 1; i < n_buffers; i++) + { + vlib_buffer_t *d; + d = vlib_buffer_copy (vm, s); + if (d == 0) + return i; + buffers[i] = vlib_get_buffer_index (vm, d); + + } + return n_buffers; + } + + n_buffers = vlib_buffer_alloc_from_free_list (vm, buffers, n_buffers, + s->free_list_index); + if (PREDICT_FALSE (n_buffers == 0)) + { + buffers[0] = src_buffer; + return 1; + } + + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_t *d = vlib_get_buffer (vm, buffers[i]); + d->current_data = s->current_data; + d->current_length = head_end_offset; + d->free_list_index = s->free_list_index; + d->total_length_not_including_first_buffer = + s->total_length_not_including_first_buffer + s->current_length - + head_end_offset; + d->flags = s->flags | VLIB_BUFFER_NEXT_PRESENT; + d->flags &= ~VLIB_BUFFER_EXT_HDR_VALID; + clib_memcpy (d->opaque, s->opaque, sizeof (s->opaque)); + clib_memcpy (vlib_buffer_get_current (d), vlib_buffer_get_current (s), + head_end_offset); + d->next_buffer = src_buffer; + } + vlib_buffer_advance (s, head_end_offset); + s->n_add_refs = n_buffers - 1; + while (s->flags & VLIB_BUFFER_NEXT_PRESENT) + { + s = vlib_get_buffer (vm, s->next_buffer); + s->n_add_refs = n_buffers - 1; + } + + return n_buffers; +} + +/** \brief Attach cloned tail to the buffer + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param head - (vlib_buffer_t *) head buffer + @param tail - (Vlib buffer_t *) tail buffer to clone and attach to head +*/ + +always_inline void +vlib_buffer_attach_clone (vlib_main_t * vm, vlib_buffer_t * head, + vlib_buffer_t * tail) +{ + ASSERT ((head->flags & VLIB_BUFFER_NEXT_PRESENT) == 0); + ASSERT (head->free_list_index == tail->free_list_index); + + head->flags |= VLIB_BUFFER_NEXT_PRESENT; + head->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID; + head->flags &= ~VLIB_BUFFER_EXT_HDR_VALID; + head->flags |= (tail->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID); + head->next_buffer = vlib_get_buffer_index (vm, tail); + head->total_length_not_including_first_buffer = tail->current_length + + tail->total_length_not_including_first_buffer; + +next_segment: + __sync_add_and_fetch (&tail->n_add_refs, 1); + + if (tail->flags & VLIB_BUFFER_NEXT_PRESENT) + { + tail = vlib_get_buffer (vm, tail->next_buffer); + goto next_segment; + } +} + /* Initializes the buffer as an empty packet with no chained buffers. */ always_inline void vlib_buffer_chain_init (vlib_buffer_t * first) @@ -695,7 +799,8 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst, _(flags); _(free_list_index); #undef _ - ASSERT (dst->total_length_not_including_first_buffer == 0); + dst->total_length_not_including_first_buffer = 0; + ASSERT (dst->n_add_refs == 0); } always_inline void @@ -727,8 +832,10 @@ vlib_buffer_init_two_for_free_list (vlib_buffer_t * dst0, _(flags); _(free_list_index); #undef _ - ASSERT (dst0->total_length_not_including_first_buffer == 0); - ASSERT (dst1->total_length_not_including_first_buffer == 0); + dst0->total_length_not_including_first_buffer = 0; + dst1->total_length_not_including_first_buffer = 0; + ASSERT (dst0->n_add_refs == 0); + ASSERT (dst1->n_add_refs == 0); } #if CLIB_DEBUG > 0 diff --git a/src/vnet/devices/dpdk/buffer.c b/src/vnet/devices/dpdk/buffer.c index 007093e4..f95d4cb5 100644 --- a/src/vnet/devices/dpdk/buffer.c +++ b/src/vnet/devices/dpdk/buffer.c @@ -79,20 +79,46 @@ STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM, "VLIB_BUFFER_PRE_DATA_SIZE must be equal to RTE_PKTMBUF_HEADROOM"); +static_always_inline void +dpdk_rte_pktmbuf_free (vlib_main_t * vm, vlib_buffer_t * b) +{ + vlib_buffer_t *hb = b; + struct rte_mbuf *mb; + u32 next, flags; + mb = rte_mbuf_from_vlib_buffer (hb); + +next: + flags = b->flags; + next = b->next_buffer; + mb = rte_mbuf_from_vlib_buffer (b); + + if (PREDICT_FALSE (b->n_add_refs)) + { + rte_mbuf_refcnt_update (mb, b->n_add_refs); + b->n_add_refs = 0; + } + + rte_pktmbuf_free_seg (mb); + + if (flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, next); + goto next; + } +} + static void del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) { u32 i; - struct rte_mbuf *mb; vlib_buffer_t *b; for (i = 0; i < vec_len (f->buffers); i++) { b = vlib_get_buffer (vm, f->buffers[i]); - mb = rte_mbuf_from_vlib_buffer (b); - ASSERT (rte_mbuf_refcnt_read (mb) == 1); - rte_pktmbuf_free (mb); + dpdk_rte_pktmbuf_free (vm, b); } + vec_free (f->name); vec_free (f->buffers); } @@ -325,7 +351,6 @@ vlib_buffer_free_inline (vlib_main_t * vm, for (i = 0; i < n_buffers; i++) { vlib_buffer_t *b; - struct rte_mbuf *mb; b = vlib_get_buffer (vm, buffers[i]); @@ -351,11 +376,7 @@ vlib_buffer_free_inline (vlib_main_t * vm, else { if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0)) - { - mb = rte_mbuf_from_vlib_buffer (b); - ASSERT (rte_mbuf_refcnt_read (mb) == 1); - rte_pktmbuf_free (mb); - } + dpdk_rte_pktmbuf_free (vm, b); } } if (vec_len (bm->announce_list)) diff --git a/src/vnet/devices/dpdk/device.c b/src/vnet/devices/dpdk/device.c index c9d9a567..17397900 100644 --- a/src/vnet/devices/dpdk/device.c +++ b/src/vnet/devices/dpdk/device.c @@ -168,13 +168,11 @@ dpdk_validate_rte_mbuf (vlib_main_t * vm, vlib_buffer_t * b, { b2 = vlib_get_buffer (vm, b2->next_buffer); mb = rte_mbuf_from_vlib_buffer (b2); - last_mb->next = mb; - last_mb = mb; rte_pktmbuf_reset (mb); } } - first_mb = mb = rte_mbuf_from_vlib_buffer (b); + last_mb = first_mb = mb = rte_mbuf_from_vlib_buffer (b); first_mb->nb_segs = 1; mb->data_len = b->current_length; mb->pkt_len = maybe_multiseg ? vlib_buffer_length_in_chain (vm, b) : @@ -185,10 +183,17 @@ dpdk_validate_rte_mbuf (vlib_main_t * vm, vlib_buffer_t * b, { b = vlib_get_buffer (vm, b->next_buffer); mb = rte_mbuf_from_vlib_buffer (b); + last_mb->next = mb; + last_mb = mb; mb->data_len = b->current_length; mb->pkt_len = b->current_length; mb->data_off = VLIB_BUFFER_PRE_DATA_SIZE + b->current_data; first_mb->nb_segs++; + if (PREDICT_FALSE (b->n_add_refs)) + { + rte_mbuf_refcnt_update (mb, b->n_add_refs); + b->n_add_refs = 0; + } } } diff --git a/src/vnet/dpo/replicate_dpo.c b/src/vnet/dpo/replicate_dpo.c index a67b19c8..a9f334be 100644 --- a/src/vnet/dpo/replicate_dpo.c +++ b/src/vnet/dpo/replicate_dpo.c @@ -625,6 +625,7 @@ replicate_inline (vlib_main_t * vm, vlib_frame_t * frame) { vlib_combined_counter_main_t * cm = &replicate_main.repm_counters; + replicate_main_t * rm = &replicate_main; u32 n_left_from, * from, * to_next, next_index; u32 cpu_index = os_get_cpu_number(); @@ -645,13 +646,11 @@ replicate_inline (vlib_main_t * vm, const replicate_t *rep0; vlib_buffer_t * b0, *c0; const dpo_id_t *dpo0; + u8 num_cloned; bi0 = from[0]; - to_next[0] = bi0; from += 1; - to_next += 1; n_left_from -= 1; - n_left_to_next -= 1; b0 = vlib_get_buffer (vm, bi0); repi0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX]; @@ -661,50 +660,21 @@ replicate_inline (vlib_main_t * vm, cm, cpu_index, repi0, 1, vlib_buffer_length_in_chain(vm, b0)); - /* ship the original to the first bucket */ - dpo0 = replicate_get_bucket_i(rep0, 0); - next0 = dpo0->dpoi_next_node; - vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + vec_validate (rm->clones[cpu_index], rep0->rep_n_buckets - 1); - if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) - { - replicate_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); - t->rep_index = repi0; - t->dpo = *dpo0; - } - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - bi0, next0); + num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[cpu_index], rep0->rep_n_buckets, 128); - /* ship copies to the rest of the buckets */ - for (bucket = 1; bucket < rep0->rep_n_buckets; bucket++) - { - /* - * After the enqueue of the first buffer, and of all subsequent - * buffers in this loop, it is possible that we over-flow the - * frame of the to-next node. When this happens we need to 'put' - * that full frame to the node and get a fresh empty one. - * Note that these are macros with side effects that change - * to_next & n_left_to_next - */ - if (PREDICT_FALSE(0 == n_left_to_next)) - { - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - } + if (num_cloned != rep0->rep_n_buckets) + { + vlib_node_increment_counter + (vm, node->node_index, + REPLICATE_DPO_ERROR_BUFFER_ALLOCATION_FAILURE, 1); + } - /* Make a copy. This can fail, so deal with it. */ - c0 = vlib_buffer_copy(vm, b0); - if (PREDICT_FALSE (c0 == 0)) - { - vlib_node_increment_counter - (vm, node->node_index, - REPLICATE_DPO_ERROR_BUFFER_ALLOCATION_FAILURE, - 1); - continue; - } - - ci0 = vlib_get_buffer_index(vm, c0); + for (bucket = 0; bucket < num_cloned; bucket++) + { + ci0 = rm->clones[cpu_index][bucket]; + c0 = vlib_get_buffer(vm, ci0); to_next[0] = ci0; to_next += 1; @@ -724,7 +694,13 @@ replicate_inline (vlib_main_t * vm, vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, ci0, next0); + if (PREDICT_FALSE (n_left_to_next == 0)) + { + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + } } + vec_reset_length (rm->clones[cpu_index]); } vlib_put_next_frame (vm, node, next_index, n_left_to_next); @@ -797,3 +773,15 @@ VLIB_REGISTER_NODE (ip6_replicate_node) = { [0] = "error-drop", }, }; + +clib_error_t * +replicate_dpo_init (vlib_main_t * vm) +{ + replicate_main_t * rm = &replicate_main; + + vec_validate (rm->clones, vlib_num_workers()); + + return 0; +} + +VLIB_INIT_FUNCTION (replicate_dpo_init); diff --git a/src/vnet/dpo/replicate_dpo.h b/src/vnet/dpo/replicate_dpo.h index a564739c..77273015 100644 --- a/src/vnet/dpo/replicate_dpo.h +++ b/src/vnet/dpo/replicate_dpo.h @@ -32,6 +32,9 @@ typedef struct replicate_main_t_ { vlib_combined_counter_main_t repm_counters; + + /* per-cpu vector of cloned packets */ + u32 **clones; } replicate_main_t; extern replicate_main_t replicate_main; -- cgit 1.2.3-korg From 68b0fb0c620c7451ef1a6380c43c39de6614db51 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Tue, 28 Feb 2017 15:15:56 -0500 Subject: VPP-598: tcp stack initial commit Change-Id: I49e5ce0aae6e4ff634024387ceaf7dbc432a0351 Signed-off-by: Dave Barach Signed-off-by: Florin Coras --- src/Makefile.am | 1 + src/plugins/ioam/export-common/ioam_export.h | 2 +- src/plugins/ioam/ipfixcollector/ipfixcollector.c | 2 +- src/plugins/ioam/lib-vxlan-gpe/ioam_transit.c | 2 +- src/plugins/snat/in2out.c | 26 +- src/plugins/snat/out2in.c | 24 +- src/scripts/vnet/tcp | 18 +- src/scripts/vnet/udp | 19 + src/scripts/vnet/uri/tcp-setup.sh | 39 + src/scripts/vnet/uri/tcp_server | 4 + src/scripts/vnet/uri/udp | 19 + src/svm.am | 10 +- src/svm/ssvm.c | 16 + src/svm/ssvm.h | 18 +- src/svm/svm_fifo.c | 568 ++++++ src/svm/svm_fifo.h | 157 ++ src/svm/svm_fifo_segment.c | 193 ++ src/svm/svm_fifo_segment.h | 89 + src/svm/test_svm_fifo1.c | 361 ++++ src/uri.am | 22 + src/uri/uri_tcp_test.c | 916 +++++++++ src/uri/uri_udp_test.c | 553 ++++++ src/uri/uri_udp_test2.c | 954 +++++++++ src/uri/uritest.c | 484 +++++ src/vlib/buffer.c | 2 +- src/vlib/buffer.h | 68 + src/vlibmemory/unix_shared_memory_queue.c | 12 +- src/vlibmemory/unix_shared_memory_queue.h | 2 +- src/vnet.am | 66 +- src/vnet/api_errno.h | 21 +- src/vnet/bfd/bfd_udp.c | 4 +- src/vnet/buffer.h | 10 + src/vnet/classify/vnet_classify.c | 4 +- src/vnet/dhcp/dhcp_proxy.h | 2 +- src/vnet/flow/flow_report.h | 2 +- src/vnet/ip/ip.h | 4 +- src/vnet/ip/ip4.h | 42 +- src/vnet/ip/ip4_forward.c | 173 +- src/vnet/ip/ip4_packet.h | 26 +- src/vnet/ip/ip6.h | 44 +- src/vnet/ip/ip6_packet.h | 26 +- src/vnet/ip/punt.c | 2 +- src/vnet/ip/tcp_packet.h | 141 -- src/vnet/ip/udp.h | 315 --- src/vnet/ip/udp_error.def | 21 - src/vnet/ip/udp_format.c | 91 - src/vnet/ip/udp_init.c | 71 - src/vnet/ip/udp_local.c | 645 ------ src/vnet/ip/udp_packet.h | 65 - src/vnet/ip/udp_pg.c | 237 --- src/vnet/ipsec/ikev2.c | 2 +- src/vnet/ipsec/ikev2_cli.c | 2 +- src/vnet/ipsec/ikev2_crypto.c | 2 +- src/vnet/lisp-cp/packets.c | 65 +- src/vnet/lisp-cp/packets.h | 45 - src/vnet/lisp-gpe/interface.c | 2 +- src/vnet/lisp-gpe/lisp_gpe.h | 4 +- src/vnet/lisp-gpe/lisp_gpe_adjacency.c | 2 + src/vnet/session/application.c | 343 ++++ src/vnet/session/application.h | 120 ++ src/vnet/session/application_interface.c | 459 +++++ src/vnet/session/application_interface.h | 136 ++ src/vnet/session/hashes.c | 28 + src/vnet/session/node.c | 435 ++++ src/vnet/session/session.api | 429 ++++ src/vnet/session/session.c | 1286 ++++++++++++ src/vnet/session/session.h | 380 ++++ src/vnet/session/session_api.c | 821 ++++++++ src/vnet/session/session_cli.c | 189 ++ src/vnet/session/transport.c | 64 + src/vnet/session/transport.h | 250 +++ src/vnet/tcp/tcp.c | 708 +++++++ src/vnet/tcp/tcp.h | 624 ++++++ src/vnet/tcp/tcp_error.def | 35 + src/vnet/tcp/tcp_format.c | 136 ++ src/vnet/tcp/tcp_input.c | 2316 ++++++++++++++++++++++ src/vnet/tcp/tcp_newreno.c | 93 + src/vnet/tcp/tcp_output.c | 1412 +++++++++++++ src/vnet/tcp/tcp_packet.h | 184 ++ src/vnet/tcp/tcp_pg.c | 236 +++ src/vnet/tcp/tcp_syn_filter4.c | 542 +++++ src/vnet/tcp/tcp_timer.h | 29 + src/vnet/udp/builtin_server.c | 239 +++ src/vnet/udp/udp.c | 342 ++++ src/vnet/udp/udp.h | 362 ++++ src/vnet/udp/udp_error.def | 21 + src/vnet/udp/udp_format.c | 91 + src/vnet/udp/udp_input.c | 314 +++ src/vnet/udp/udp_local.c | 666 +++++++ src/vnet/udp/udp_packet.h | 65 + src/vnet/udp/udp_pg.c | 237 +++ src/vnet/vnet_all_api_h.h | 1 + src/vnet/vxlan-gpe/vxlan_gpe.h | 2 +- src/vnet/vxlan/vxlan.h | 2 +- src/vpp/api/vpe.api | 1 + src/vppinfra.am | 5 + src/vppinfra/bihash_16_8.h | 103 + src/vppinfra/bihash_48_8.h | 116 ++ src/vppinfra/tw_timer_16t_1w_2048sl.c | 26 + src/vppinfra/tw_timer_16t_1w_2048sl.h | 46 + 100 files changed, 18737 insertions(+), 1874 deletions(-) create mode 100644 src/scripts/vnet/udp create mode 100755 src/scripts/vnet/uri/tcp-setup.sh create mode 100644 src/scripts/vnet/uri/tcp_server create mode 100644 src/scripts/vnet/uri/udp create mode 100644 src/svm/svm_fifo.c create mode 100644 src/svm/svm_fifo.h create mode 100644 src/svm/svm_fifo_segment.c create mode 100644 src/svm/svm_fifo_segment.h create mode 100644 src/svm/test_svm_fifo1.c create mode 100644 src/uri.am create mode 100644 src/uri/uri_tcp_test.c create mode 100644 src/uri/uri_udp_test.c create mode 100644 src/uri/uri_udp_test2.c create mode 100644 src/uri/uritest.c delete mode 100644 src/vnet/ip/tcp_packet.h delete mode 100644 src/vnet/ip/udp.h delete mode 100644 src/vnet/ip/udp_error.def delete mode 100644 src/vnet/ip/udp_format.c delete mode 100644 src/vnet/ip/udp_init.c delete mode 100644 src/vnet/ip/udp_local.c delete mode 100644 src/vnet/ip/udp_packet.h delete mode 100644 src/vnet/ip/udp_pg.c create mode 100644 src/vnet/session/application.c create mode 100644 src/vnet/session/application.h create mode 100644 src/vnet/session/application_interface.c create mode 100644 src/vnet/session/application_interface.h create mode 100644 src/vnet/session/hashes.c create mode 100644 src/vnet/session/node.c create mode 100644 src/vnet/session/session.api create mode 100644 src/vnet/session/session.c create mode 100644 src/vnet/session/session.h create mode 100644 src/vnet/session/session_api.c create mode 100644 src/vnet/session/session_cli.c create mode 100644 src/vnet/session/transport.c create mode 100644 src/vnet/session/transport.h create mode 100644 src/vnet/tcp/tcp.c create mode 100644 src/vnet/tcp/tcp.h create mode 100644 src/vnet/tcp/tcp_error.def create mode 100644 src/vnet/tcp/tcp_format.c create mode 100644 src/vnet/tcp/tcp_input.c create mode 100644 src/vnet/tcp/tcp_newreno.c create mode 100644 src/vnet/tcp/tcp_output.c create mode 100644 src/vnet/tcp/tcp_packet.h create mode 100644 src/vnet/tcp/tcp_pg.c create mode 100644 src/vnet/tcp/tcp_syn_filter4.c create mode 100644 src/vnet/tcp/tcp_timer.h create mode 100644 src/vnet/udp/builtin_server.c create mode 100644 src/vnet/udp/udp.c create mode 100644 src/vnet/udp/udp.h create mode 100644 src/vnet/udp/udp_error.def create mode 100644 src/vnet/udp/udp_format.c create mode 100644 src/vnet/udp/udp_input.c create mode 100644 src/vnet/udp/udp_local.c create mode 100644 src/vnet/udp/udp_packet.h create mode 100644 src/vnet/udp/udp_pg.c create mode 100644 src/vppinfra/bihash_16_8.h create mode 100644 src/vppinfra/bihash_48_8.h create mode 100644 src/vppinfra/tw_timer_16t_1w_2048sl.c create mode 100644 src/vppinfra/tw_timer_16t_1w_2048sl.h (limited to 'src/vlib/buffer.c') diff --git a/src/Makefile.am b/src/Makefile.am index 08feb29a..641707ed 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -88,6 +88,7 @@ include vlib-api.am include vnet.am include vpp.am include vpp-api-test.am +include uri.am SUBDIRS += plugins diff --git a/src/plugins/ioam/export-common/ioam_export.h b/src/plugins/ioam/export-common/ioam_export.h index e84dab0b..dd48a93b 100644 --- a/src/plugins/ioam/export-common/ioam_export.h +++ b/src/plugins/ioam/export-common/ioam_export.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/plugins/ioam/ipfixcollector/ipfixcollector.c b/src/plugins/ioam/ipfixcollector/ipfixcollector.c index 4ae47edc..71b934ec 100644 --- a/src/plugins/ioam/ipfixcollector/ipfixcollector.c +++ b/src/plugins/ioam/ipfixcollector/ipfixcollector.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include ipfix_collector_main_t ipfix_collector_main; diff --git a/src/plugins/ioam/lib-vxlan-gpe/ioam_transit.c b/src/plugins/ioam/lib-vxlan-gpe/ioam_transit.c index b42c357c..f334c983 100644 --- a/src/plugins/ioam/lib-vxlan-gpe/ioam_transit.c +++ b/src/plugins/ioam/lib-vxlan-gpe/ioam_transit.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/plugins/snat/in2out.c b/src/plugins/snat/in2out.c index e30c913c..b4b7793d 100644 --- a/src/plugins/snat/in2out.c +++ b/src/plugins/snat/in2out.c @@ -689,12 +689,12 @@ snat_hairpinning (snat_main_t *sm, ip4_header_t, dst_address); ip0->checksum = ip_csum_fold (sum0); - old_dst_port0 = tcp0->ports.dst; + old_dst_port0 = tcp0->dst; if (PREDICT_TRUE(new_dst_port0 != old_dst_port0)) { if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - tcp0->ports.dst = new_dst_port0; + tcp0->dst = new_dst_port0; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_dst_addr0, new_dst_addr0, ip4_header_t, dst_address); @@ -872,9 +872,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - old_port0 = tcp0->ports.src; - tcp0->ports.src = s0->out2in.port; - new_port0 = tcp0->ports.src; + old_port0 = tcp0->src_port; + tcp0->src_port = s0->out2in.port; + new_port0 = tcp0->src_port; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_addr0, new_addr0, @@ -1012,9 +1012,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, if (PREDICT_TRUE(proto1 == SNAT_PROTOCOL_TCP)) { - old_port1 = tcp1->ports.src; - tcp1->ports.src = s1->out2in.port; - new_port1 = tcp1->ports.src; + old_port1 = tcp1->src_port; + tcp1->src_port = s1->out2in.port; + new_port1 = tcp1->src_port; sum1 = tcp1->checksum; sum1 = ip_csum_update (sum1, old_addr1, new_addr1, @@ -1188,9 +1188,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - old_port0 = tcp0->ports.src; - tcp0->ports.src = s0->out2in.port; - new_port0 = tcp0->ports.src; + old_port0 = tcp0->src_port; + tcp0->src_port = s0->out2in.port; + new_port0 = tcp0->src_port; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_addr0, new_addr0, @@ -1667,8 +1667,8 @@ snat_in2out_fast_static_map_fn (vlib_main_t * vm, { if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - old_port0 = tcp0->ports.src; - tcp0->ports.src = new_port0; + old_port0 = tcp0->src_port; + tcp0->src_port = new_port0; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_addr0, new_addr0, diff --git a/src/plugins/snat/out2in.c b/src/plugins/snat/out2in.c index 328f5ba4..3bfc0aa3 100644 --- a/src/plugins/snat/out2in.c +++ b/src/plugins/snat/out2in.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include @@ -602,9 +602,9 @@ snat_out2in_node_fn (vlib_main_t * vm, if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - old_port0 = tcp0->ports.dst; - tcp0->ports.dst = s0->in2out.port; - new_port0 = tcp0->ports.dst; + old_port0 = tcp0->dst_port; + tcp0->dst_port = s0->in2out.port; + new_port0 = tcp0->dst_port; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_addr0, new_addr0, @@ -737,9 +737,9 @@ snat_out2in_node_fn (vlib_main_t * vm, if (PREDICT_TRUE(proto1 == SNAT_PROTOCOL_TCP)) { - old_port1 = tcp1->ports.dst; - tcp1->ports.dst = s1->in2out.port; - new_port1 = tcp1->ports.dst; + old_port1 = tcp1->dst_port; + tcp1->dst_port = s1->in2out.port; + new_port1 = tcp1->dst_port; sum1 = tcp1->checksum; sum1 = ip_csum_update (sum1, old_addr1, new_addr1, @@ -907,9 +907,9 @@ snat_out2in_node_fn (vlib_main_t * vm, if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - old_port0 = tcp0->ports.dst; - tcp0->ports.dst = s0->in2out.port; - new_port0 = tcp0->ports.dst; + old_port0 = tcp0->dst_port; + tcp0->dst_port = s0->in2out.port; + new_port0 = tcp0->dst_port; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_addr0, new_addr0, @@ -1369,8 +1369,8 @@ snat_out2in_fast_node_fn (vlib_main_t * vm, { if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - old_port0 = tcp0->ports.dst; - tcp0->ports.dst = new_port0; + old_port0 = tcp0->dst_port; + tcp0->dst_port = new_port0; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_addr0, new_addr0, diff --git a/src/scripts/vnet/tcp b/src/scripts/vnet/tcp index a2ee8b2d..b9c23c3a 100644 --- a/src/scripts/vnet/tcp +++ b/src/scripts/vnet/tcp @@ -1,16 +1,18 @@ +loop create +set int ip address loop0 192.168.1.1/8 +set int state loop0 up + packet-generator new { name x - limit 1 + limit 2048 node ip4-input - size 64-64 + size 100-100 + interface loop0 no-recycle data { - TCP: 1.2.3.4 -> 5.6.7.8 - TCP: 1234 -> 5678 + TCP: 192.168.1.2 -> 192.168.1.1 + TCP: 32415 -> 80 + SYN incrementing 100 } } - -tr add pg-input 100 -ip route 5.6.7.8/32 via local -ip route 1.2.3.4/32 via local diff --git a/src/scripts/vnet/udp b/src/scripts/vnet/udp new file mode 100644 index 00000000..7dda1eec --- /dev/null +++ b/src/scripts/vnet/udp @@ -0,0 +1,19 @@ +loop create +set int ip address loop0 192.168.1.1/8 +set int state loop0 up + +packet-generator new { + name udp + limit 512 + rate 1e4 + node ip4-input + size 100-100 + interface loop0 + no-recycle + data { + UDP: 192.168.1.2 - 192.168.2.255 -> 192.168.1.1 + UDP: 4321 -> 1234 + length 72 + incrementing 100 + } +} diff --git a/src/scripts/vnet/uri/tcp-setup.sh b/src/scripts/vnet/uri/tcp-setup.sh new file mode 100755 index 00000000..e0b01588 --- /dev/null +++ b/src/scripts/vnet/uri/tcp-setup.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +function topo_setup +{ + ip netns add vppns1 + ip link add veth_vpp1 type veth peer name vpp1 + ip link set dev vpp1 up + ip link set dev veth_vpp1 up netns vppns1 + + ip netns exec vppns1 \ + bash -c " + ip link set dev lo up + ip addr add 6.0.1.2/24 dev veth_vpp1 + " + + ethtool --offload vpp1 rx off tx off + ip netns exec vppns1 ethtool --offload veth_vpp1 rx off tx off + +} + +function topo_clean +{ + ip link del dev veth_vpp1 &> /dev/null + ip netns del vppns1 &> /dev/null +} + +if [ "$1" == "clean" ] ; then + topo_clean + exit 0 +else + topo_setup +fi + +# to test connectivity do: +# sudo ip netns exec vppns1 telnet 6.0.1.1 1234 +# to push traffic to the server +# dd if=/dev/zero bs=1024K count=512 | nc 6.0.1.1 +# to listen for incoming connection from vpp +# nc -l 1234 diff --git a/src/scripts/vnet/uri/tcp_server b/src/scripts/vnet/uri/tcp_server new file mode 100644 index 00000000..7f5a86de --- /dev/null +++ b/src/scripts/vnet/uri/tcp_server @@ -0,0 +1,4 @@ +create host-interface name vpp1 +set int state host-vpp1 up +set int ip address host-vpp1 6.0.1.1/24 +trace add af-packet-input 10 diff --git a/src/scripts/vnet/uri/udp b/src/scripts/vnet/uri/udp new file mode 100644 index 00000000..ca13b83c --- /dev/null +++ b/src/scripts/vnet/uri/udp @@ -0,0 +1,19 @@ +loop create +set int ip address loop0 10.0.0.1/32 +set int state loop0 up + +packet-generator new { + name udp + limit 512 + rate 1e4 + node ip4-input + size 100-100 + interface loop0 + no-recycle + data { + UDP: 192.168.1.2 - 192.168.2.255 -> 192.168.1.1 + UDP: 4321 -> 1234 + length 72 + incrementing 100 + } +} diff --git a/src/svm.am b/src/svm.am index 2cd385bd..442eba8e 100644 --- a/src/svm.am +++ b/src/svm.am @@ -13,13 +13,14 @@ bin_PROGRAMS += svmtool svmdbtool -nobase_include_HEADERS += svm/svm.h svm/ssvm.h svm/svmdb.h +nobase_include_HEADERS += svm/svm.h svm/ssvm.h svm/svmdb.h \ + svm/svm_fifo.h svm/svm_fifo_segment.h lib_LTLIBRARIES += libsvm.la libsvmdb.la +libsvm_la_SOURCES = svm/svm.c svm/ssvm.c svm/svm_fifo.c svm/svm_fifo_segment.c libsvm_la_LIBADD = libvppinfra.la -lrt -lpthread libsvm_la_DEPENDENCIES = libvppinfra.la -libsvm_la_SOURCES = svm/svm.c svm/ssvm.c svmtool_SOURCES = svm/svmtool.c svmtool_LDADD = libsvm.la libvppinfra.la -lpthread -lrt @@ -31,4 +32,9 @@ libsvmdb_la_SOURCES = svm/svmdb.c svmdbtool_SOURCES = svm/svmdbtool.c svmdbtool_LDADD = libsvmdb.la libsvm.la libvppinfra.la -lpthread -lrt +noinst_PROGRAMS += test_svm_fifo1 +test_svm_fifo1_SOURCES = svm/test_svm_fifo1.c +test_svm_fifo1_LDADD = libsvm.la libvppinfra.la -lpthread -lrt +test_svm_fifo1_LDFLAGS = -static + # vi:syntax=automake diff --git a/src/svm/ssvm.c b/src/svm/ssvm.c index 6f409eb6..6cda1f27 100644 --- a/src/svm/ssvm.c +++ b/src/svm/ssvm.c @@ -169,6 +169,22 @@ re_map_it: return 0; } +void +ssvm_delete (ssvm_private_t * ssvm) +{ + u8 *fn; + + fn = format (0, "/dev/shm/%s%c", ssvm->name, 0); + + /* Throw away the backing file */ + if (unlink ((char *) fn) < 0) + clib_unix_warning ("unlink segment '%s'", ssvm->name); + + munmap ((void *) ssvm->requested_va, ssvm->ssvm_size); + vec_free (fn); +} + + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/svm/ssvm.h b/src/svm/ssvm.h index 9e61b9a0..bccfc164 100644 --- a/src/svm/ssvm.h +++ b/src/svm/ssvm.h @@ -38,7 +38,10 @@ #include #include -#define MMAP_PAGESIZE (4<<10) +#ifndef MMAP_PAGESIZE +#define MMAP_PAGESIZE (clib_mem_get_page_size()) +#endif + #define SSVM_N_OPAQUE 7 typedef struct @@ -125,12 +128,12 @@ ssvm_pop_heap (void *oldheap) } #define foreach_ssvm_api_error \ -_(NO_NAME, "No shared segment name", -10) \ -_(NO_SIZE, "Size not set (master)", -11) \ -_(CREATE_FAILURE, "Create failed", -12) \ -_(SET_SIZE, "Set size failed", -13) \ -_(MMAP, "mmap failed", -14) \ -_(SLAVE_TIMEOUT, "Slave map timeout", -15) +_(NO_NAME, "No shared segment name", -100) \ +_(NO_SIZE, "Size not set (master)", -101) \ +_(CREATE_FAILURE, "Create failed", -102) \ +_(SET_SIZE, "Set size failed", -103) \ +_(MMAP, "mmap failed", -104) \ +_(SLAVE_TIMEOUT, "Slave map timeout", -105) typedef enum { @@ -143,6 +146,7 @@ typedef enum int ssvm_master_init (ssvm_private_t * ssvm, u32 master_index); int ssvm_slave_init (ssvm_private_t * ssvm, int timeout_in_seconds); +void ssvm_delete (ssvm_private_t * ssvm); #endif /* __included_ssvm_h__ */ diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c new file mode 100644 index 00000000..11f90193 --- /dev/null +++ b/src/svm/svm_fifo.c @@ -0,0 +1,568 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "svm_fifo.h" + +/** create an svm fifo, in the current heap. Fails vs blow up the process */ +svm_fifo_t * +svm_fifo_create (u32 data_size_in_bytes) +{ + svm_fifo_t *f; + pthread_mutexattr_t attr; + pthread_condattr_t cattr; + + f = clib_mem_alloc_aligned_or_null (sizeof (*f) + data_size_in_bytes, + CLIB_CACHE_LINE_BYTES); + if (f == 0) + return 0; + + memset (f, 0, sizeof (*f) + data_size_in_bytes); + f->nitems = data_size_in_bytes; + f->ooos_list_head = OOO_SEGMENT_INVALID_INDEX; + + memset (&attr, 0, sizeof (attr)); + memset (&cattr, 0, sizeof (cattr)); + + if (pthread_mutexattr_init (&attr)) + clib_unix_warning ("mutexattr_init"); + if (pthread_mutexattr_setpshared (&attr, PTHREAD_PROCESS_SHARED)) + clib_unix_warning ("pthread_mutexattr_setpshared"); + if (pthread_mutex_init (&f->mutex, &attr)) + clib_unix_warning ("mutex_init"); + if (pthread_mutexattr_destroy (&attr)) + clib_unix_warning ("mutexattr_destroy"); + if (pthread_condattr_init (&cattr)) + clib_unix_warning ("condattr_init"); + if (pthread_condattr_setpshared (&cattr, PTHREAD_PROCESS_SHARED)) + clib_unix_warning ("condattr_setpshared"); + if (pthread_cond_init (&f->condvar, &cattr)) + clib_unix_warning ("cond_init1"); + if (pthread_condattr_destroy (&cattr)) + clib_unix_warning ("cond_init2"); + + return (f); +} + +always_inline ooo_segment_t * +ooo_segment_new (svm_fifo_t * f, u32 start, u32 length) +{ + ooo_segment_t *s; + + pool_get (f->ooo_segments, s); + + s->fifo_position = start; + s->length = length; + + s->prev = s->next = OOO_SEGMENT_INVALID_INDEX; + + return s; +} + +always_inline void +ooo_segment_del (svm_fifo_t * f, u32 index) +{ + ooo_segment_t *cur, *prev = 0, *next = 0; + cur = pool_elt_at_index (f->ooo_segments, index); + + if (cur->next != OOO_SEGMENT_INVALID_INDEX) + { + next = pool_elt_at_index (f->ooo_segments, cur->next); + next->prev = cur->prev; + } + + if (cur->prev != OOO_SEGMENT_INVALID_INDEX) + { + prev = pool_elt_at_index (f->ooo_segments, cur->prev); + prev->next = cur->next; + } + else + { + f->ooos_list_head = cur->next; + } + + pool_put (f->ooo_segments, cur); +} + +/** + * Add segment to fifo's out-of-order segment list. Takes care of merging + * adjacent segments and removing overlapping ones. + */ +static void +ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) +{ + ooo_segment_t *s, *new_s, *prev, *next, *it; + u32 new_index, position, end_offset, s_sof, s_eof, s_index; + + position = (f->tail + offset) % f->nitems; + end_offset = offset + length; + + if (f->ooos_list_head == OOO_SEGMENT_INVALID_INDEX) + { + s = ooo_segment_new (f, position, length); + f->ooos_list_head = s - f->ooo_segments; + f->ooos_newest = f->ooos_list_head; + return; + } + + /* Find first segment that starts after new segment */ + s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); + while (s->next != OOO_SEGMENT_INVALID_INDEX + && ooo_segment_offset (f, s) <= offset) + s = pool_elt_at_index (f->ooo_segments, s->next); + + s_index = s - f->ooo_segments; + s_sof = ooo_segment_offset (f, s); + s_eof = ooo_segment_end_offset (f, s); + + /* No overlap, add before current segment */ + if (end_offset < s_sof) + { + new_s = ooo_segment_new (f, position, length); + new_index = new_s - f->ooo_segments; + + /* Pool might've moved, get segment again */ + s = pool_elt_at_index (f->ooo_segments, s_index); + + if (s->prev != OOO_SEGMENT_INVALID_INDEX) + { + new_s->prev = s->prev; + + prev = pool_elt_at_index (f->ooo_segments, new_s->prev); + prev->next = new_index; + } + else + { + /* New head */ + f->ooos_list_head = new_index; + } + + new_s->next = s - f->ooo_segments; + s->prev = new_index; + f->ooos_newest = new_index; + return; + } + /* No overlap, add after current segment */ + else if (s_eof < offset) + { + new_s = ooo_segment_new (f, position, length); + new_index = new_s - f->ooo_segments; + + /* Pool might've moved, get segment again */ + s = pool_elt_at_index (f->ooo_segments, s_index); + + if (s->next != OOO_SEGMENT_INVALID_INDEX) + { + new_s->next = s->next; + + next = pool_elt_at_index (f->ooo_segments, new_s->next); + next->prev = new_index; + } + + new_s->prev = s - f->ooo_segments; + s->next = new_index; + f->ooos_newest = new_index; + + return; + } + + /* + * Merge needed + */ + + /* Merge at head */ + if (offset <= s_sof) + { + /* If we have a previous, check if we overlap */ + if (s->prev != OOO_SEGMENT_INVALID_INDEX) + { + prev = pool_elt_at_index (f->ooo_segments, s->prev); + + /* New segment merges prev and current. Remove previous and + * update position of current. */ + if (ooo_segment_end_offset (f, prev) >= offset) + { + s->fifo_position = prev->fifo_position; + s->length = s_eof - ooo_segment_offset (f, prev); + ooo_segment_del (f, s->prev); + } + } + else + { + s->fifo_position = position; + s->length = s_eof - ooo_segment_offset (f, s); + } + + /* The new segment's tail may cover multiple smaller ones */ + if (s_eof < end_offset) + { + /* Remove segments completely covered */ + it = (s->next != OOO_SEGMENT_INVALID_INDEX) ? + pool_elt_at_index (f->ooo_segments, s->next) : 0; + while (it && ooo_segment_end_offset (f, it) < end_offset) + { + next = (it->next != OOO_SEGMENT_INVALID_INDEX) ? + pool_elt_at_index (f->ooo_segments, it->next) : 0; + ooo_segment_del (f, it - f->ooo_segments); + it = next; + } + + /* Update length. Segment's start might have changed. */ + s->length = end_offset - ooo_segment_offset (f, s); + + /* If partial overlap with last, merge */ + if (it && ooo_segment_offset (f, it) < end_offset) + { + s->length += + it->length - (ooo_segment_offset (f, it) - end_offset); + ooo_segment_del (f, it - f->ooo_segments); + } + } + } + /* Last but overlapping previous */ + else if (s_eof <= end_offset) + { + s->length = end_offset - ooo_segment_offset (f, s); + } + /* New segment completely covered by current one */ + else + { + /* Do Nothing */ + } + + /* Most recently updated segment */ + f->ooos_newest = s - f->ooo_segments; +} + +/** + * Removes segments that can now be enqueued because the fifo's tail has + * advanced. Returns the number of bytes added to tail. + */ +static int +ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) +{ + ooo_segment_t *s; + u32 index, bytes = 0, diff; + + s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); + + /* If last tail update overlaps one/multiple ooo segments, remove them */ + diff = (f->nitems + f->tail - s->fifo_position) % f->nitems; + while (0 < diff && diff < n_bytes_enqueued) + { + /* Segment end is beyond the tail. Advance tail and be done */ + if (diff < s->length) + { + f->tail += s->length - diff; + f->tail %= f->nitems; + break; + } + /* If we have next go on */ + else if (s->next != OOO_SEGMENT_INVALID_INDEX) + { + index = s - f->ooo_segments; + s = pool_elt_at_index (f->ooo_segments, s->next); + diff = (f->nitems + f->tail - s->fifo_position) % f->nitems; + ooo_segment_del (f, index); + } + /* End of search */ + else + { + break; + } + } + + /* If tail is adjacent to an ooo segment, 'consume' it */ + if (diff == 0) + { + bytes = ((f->nitems - f->cursize) >= s->length) ? s->length : + f->nitems - f->cursize; + + f->tail += bytes; + f->tail %= f->nitems; + + ooo_segment_del (f, s - f->ooo_segments); + } + + return bytes; +} + +static int +svm_fifo_enqueue_internal (svm_fifo_t * f, + int pid, u32 max_bytes, u8 * copy_from_here) +{ + u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; + u32 cursize, nitems; + + if (PREDICT_FALSE (f->cursize == f->nitems)) + return -2; /* fifo stuffed */ + + /* read cursize, which can only decrease while we're working */ + cursize = f->cursize; + nitems = f->nitems; + + /* Number of bytes we're going to copy */ + total_copy_bytes = (nitems - cursize) < max_bytes ? + (nitems - cursize) : max_bytes; + + if (PREDICT_TRUE (copy_from_here != 0)) + { + /* Number of bytes in first copy segment */ + first_copy_bytes = ((nitems - f->tail) < total_copy_bytes) + ? (nitems - f->tail) : total_copy_bytes; + + clib_memcpy (&f->data[f->tail], copy_from_here, first_copy_bytes); + f->tail += first_copy_bytes; + f->tail = (f->tail == nitems) ? 0 : f->tail; + + /* Number of bytes in second copy segment, if any */ + second_copy_bytes = total_copy_bytes - first_copy_bytes; + if (second_copy_bytes) + { + clib_memcpy (&f->data[f->tail], copy_from_here + first_copy_bytes, + second_copy_bytes); + f->tail += second_copy_bytes; + f->tail = (f->tail == nitems) ? 0 : f->tail; + } + } + else + { + /* Account for a zero-copy enqueue done elsewhere */ + ASSERT (max_bytes <= (nitems - cursize)); + f->tail += max_bytes; + f->tail = f->tail % nitems; + total_copy_bytes = max_bytes; + } + + /* Any out-of-order segments to collect? */ + if (PREDICT_FALSE (f->ooos_list_head != OOO_SEGMENT_INVALID_INDEX)) + total_copy_bytes += ooo_segment_try_collect (f, total_copy_bytes); + + /* Atomically increase the queue length */ + __sync_fetch_and_add (&f->cursize, total_copy_bytes); + + return (total_copy_bytes); +} + +int +svm_fifo_enqueue_nowait (svm_fifo_t * f, + int pid, u32 max_bytes, u8 * copy_from_here) +{ + return svm_fifo_enqueue_internal (f, pid, max_bytes, copy_from_here); +} + +/** Enqueue a future segment. + * Two choices: either copies the entire segment, or copies nothing + * Returns 0 of the entire segment was copied + * Returns -1 if none of the segment was copied due to lack of space + */ + +static int +svm_fifo_enqueue_with_offset_internal2 (svm_fifo_t * f, + int pid, + u32 offset, + u32 required_bytes, + u8 * copy_from_here) +{ + u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; + u32 cursize, nitems; + u32 tail_plus_offset; + + ASSERT (offset > 0); + + /* read cursize, which can only decrease while we're working */ + cursize = f->cursize; + nitems = f->nitems; + + /* Will this request fit? */ + if ((required_bytes + offset) > (nitems - cursize)) + return -1; + + ooo_segment_add (f, offset, required_bytes); + + /* Number of bytes we're going to copy */ + total_copy_bytes = required_bytes; + tail_plus_offset = (f->tail + offset) % nitems; + + /* Number of bytes in first copy segment */ + first_copy_bytes = ((nitems - tail_plus_offset) < total_copy_bytes) + ? (nitems - tail_plus_offset) : total_copy_bytes; + + clib_memcpy (&f->data[tail_plus_offset], copy_from_here, first_copy_bytes); + + /* Number of bytes in second copy segment, if any */ + second_copy_bytes = total_copy_bytes - first_copy_bytes; + if (second_copy_bytes) + { + tail_plus_offset += first_copy_bytes; + tail_plus_offset %= nitems; + + ASSERT (tail_plus_offset == 0); + + clib_memcpy (&f->data[tail_plus_offset], + copy_from_here + first_copy_bytes, second_copy_bytes); + } + + return (0); +} + + +int +svm_fifo_enqueue_with_offset (svm_fifo_t * f, + int pid, + u32 offset, + u32 required_bytes, u8 * copy_from_here) +{ + return svm_fifo_enqueue_with_offset_internal2 + (f, pid, offset, required_bytes, copy_from_here); +} + + +static int +svm_fifo_dequeue_internal2 (svm_fifo_t * f, + int pid, u32 max_bytes, u8 * copy_here) +{ + u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; + u32 cursize, nitems; + + if (PREDICT_FALSE (f->cursize == 0)) + return -2; /* nothing in the fifo */ + + /* read cursize, which can only increase while we're working */ + cursize = f->cursize; + nitems = f->nitems; + + /* Number of bytes we're going to copy */ + total_copy_bytes = (cursize < max_bytes) ? cursize : max_bytes; + + if (PREDICT_TRUE (copy_here != 0)) + { + /* Number of bytes in first copy segment */ + first_copy_bytes = ((nitems - f->head) < total_copy_bytes) + ? (nitems - f->head) : total_copy_bytes; + clib_memcpy (copy_here, &f->data[f->head], first_copy_bytes); + f->head += first_copy_bytes; + f->head = (f->head == nitems) ? 0 : f->head; + + /* Number of bytes in second copy segment, if any */ + second_copy_bytes = total_copy_bytes - first_copy_bytes; + if (second_copy_bytes) + { + clib_memcpy (copy_here + first_copy_bytes, + &f->data[f->head], second_copy_bytes); + f->head += second_copy_bytes; + f->head = (f->head == nitems) ? 0 : f->head; + } + } + else + { + /* Account for a zero-copy dequeue done elsewhere */ + ASSERT (max_bytes <= cursize); + f->head += max_bytes; + f->head = f->head % nitems; + cursize -= max_bytes; + total_copy_bytes = max_bytes; + } + + __sync_fetch_and_sub (&f->cursize, total_copy_bytes); + + return (total_copy_bytes); +} + +int +svm_fifo_dequeue_nowait (svm_fifo_t * f, + int pid, u32 max_bytes, u8 * copy_here) +{ + return svm_fifo_dequeue_internal2 (f, pid, max_bytes, copy_here); +} + +int +svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, + u8 * copy_here) +{ + u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; + u32 cursize, nitems; + + if (PREDICT_FALSE (f->cursize == 0)) + return -2; /* nothing in the fifo */ + + /* read cursize, which can only increase while we're working */ + cursize = f->cursize; + nitems = f->nitems; + + /* Number of bytes we're going to copy */ + total_copy_bytes = (cursize < max_bytes) ? cursize : max_bytes; + + if (PREDICT_TRUE (copy_here != 0)) + { + /* Number of bytes in first copy segment */ + first_copy_bytes = + ((nitems - f->head) < total_copy_bytes) ? + (nitems - f->head) : total_copy_bytes; + clib_memcpy (copy_here, &f->data[f->head], first_copy_bytes); + + /* Number of bytes in second copy segment, if any */ + second_copy_bytes = total_copy_bytes - first_copy_bytes; + if (second_copy_bytes) + { + clib_memcpy (copy_here + first_copy_bytes, &f->data[0], + second_copy_bytes); + } + } + return total_copy_bytes; +} + +int +svm_fifo_dequeue_drop (svm_fifo_t * f, int pid, u32 max_bytes) +{ + u32 total_drop_bytes, first_drop_bytes, second_drop_bytes; + u32 cursize, nitems; + + if (PREDICT_FALSE (f->cursize == 0)) + return -2; /* nothing in the fifo */ + + /* read cursize, which can only increase while we're working */ + cursize = f->cursize; + nitems = f->nitems; + + /* Number of bytes we're going to drop */ + total_drop_bytes = (cursize < max_bytes) ? cursize : max_bytes; + + /* Number of bytes in first copy segment */ + first_drop_bytes = + ((nitems - f->head) < total_drop_bytes) ? + (nitems - f->head) : total_drop_bytes; + f->head += first_drop_bytes; + f->head = (f->head == nitems) ? 0 : f->head; + + /* Number of bytes in second drop segment, if any */ + second_drop_bytes = total_drop_bytes - first_drop_bytes; + if (second_drop_bytes) + { + f->head += second_drop_bytes; + f->head = (f->head == nitems) ? 0 : f->head; + } + + __sync_fetch_and_sub (&f->cursize, total_drop_bytes); + + return total_drop_bytes; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h new file mode 100644 index 00000000..70624b74 --- /dev/null +++ b/src/svm/svm_fifo.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_ssvm_fifo_h__ +#define __included_ssvm_fifo_h__ + +#include +#include +#include +#include +#include +#include +#include + +typedef enum +{ + SVM_FIFO_TAG_NOT_HELD = 0, + SVM_FIFO_TAG_DEQUEUE, + SVM_FIFO_TAG_ENQUEUE, +} svm_lock_tag_t; + +/** Out-of-order segment */ +typedef struct +{ + u32 next; /**< Next linked-list element pool index */ + u32 prev; /**< Previous linked-list element pool index */ + + u32 fifo_position; /**< Start of segment, normalized*/ + u32 length; /**< Length of segment */ +} ooo_segment_t; + +#define OOO_SEGMENT_INVALID_INDEX ((u32)~0) + +typedef struct +{ + pthread_mutex_t mutex; /* 8 bytes */ + pthread_cond_t condvar; /* 8 bytes */ + u32 owner_pid; + svm_lock_tag_t tag; + volatile u32 cursize; + u32 nitems; + + /* Backpointers */ + u32 server_session_index; + u32 client_session_index; + u8 server_thread_index; + u8 client_thread_index; + CLIB_CACHE_LINE_ALIGN_MARK (end_shared); + u32 head; + CLIB_CACHE_LINE_ALIGN_MARK (end_consumer); + + /* producer */ + u32 tail; + + ooo_segment_t *ooo_segments; /**< Pool of ooo segments */ + u32 ooos_list_head; /**< Head of out-of-order linked-list */ + u32 ooos_newest; /**< Last segment to have been updated */ + + CLIB_CACHE_LINE_ALIGN_MARK (data); +} svm_fifo_t; + +static inline int +svm_fifo_lock (svm_fifo_t * f, u32 pid, u32 tag, int nowait) +{ + if (PREDICT_TRUE (nowait == 0)) + pthread_mutex_lock (&f->mutex); + else + { + if (pthread_mutex_trylock (&f->mutex)) + return -1; + } + f->owner_pid = pid; + f->tag = tag; + return 0; +} + +static inline void +svm_fifo_unlock (svm_fifo_t * f) +{ + f->owner_pid = 0; + f->tag = 0; + CLIB_MEMORY_BARRIER (); + pthread_mutex_unlock (&f->mutex); +} + +static inline u32 +svm_fifo_max_dequeue (svm_fifo_t * f) +{ + return f->cursize; +} + +static inline u32 +svm_fifo_max_enqueue (svm_fifo_t * f) +{ + return f->nitems - f->cursize; +} + +static inline u8 +svm_fifo_has_ooo_data (svm_fifo_t * f) +{ + return f->ooos_list_head != OOO_SEGMENT_INVALID_INDEX; +} + +svm_fifo_t *svm_fifo_create (u32 data_size_in_bytes); + +int svm_fifo_enqueue_nowait (svm_fifo_t * f, int pid, u32 max_bytes, + u8 * copy_from_here); + +int svm_fifo_enqueue_with_offset (svm_fifo_t * f, int pid, + u32 offset, u32 required_bytes, + u8 * copy_from_here); + +int svm_fifo_dequeue_nowait (svm_fifo_t * f, int pid, u32 max_bytes, + u8 * copy_here); + +int svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, + u8 * copy_here); +int svm_fifo_dequeue_drop (svm_fifo_t * f, int pid, u32 max_bytes); + +always_inline ooo_segment_t * +svm_fifo_newest_ooo_segment (svm_fifo_t * f) +{ + return f->ooo_segments + f->ooos_newest; +} + +always_inline u32 +ooo_segment_offset (svm_fifo_t * f, ooo_segment_t * s) +{ + return ((f->nitems + s->fifo_position - f->tail) % f->nitems); +} + +always_inline u32 +ooo_segment_end_offset (svm_fifo_t * f, ooo_segment_t * s) +{ + return ((f->nitems + s->fifo_position + s->length - f->tail) % f->nitems); +} + +#endif /* __included_ssvm_fifo_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/svm/svm_fifo_segment.c b/src/svm/svm_fifo_segment.c new file mode 100644 index 00000000..acabb3bd --- /dev/null +++ b/src/svm/svm_fifo_segment.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +svm_fifo_segment_main_t svm_fifo_segment_main; + +/** (master) create an svm fifo segment */ +int +svm_fifo_segment_create (svm_fifo_segment_create_args_t * a) +{ + int rv; + svm_fifo_segment_private_t *s; + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + ssvm_shared_header_t *sh; + svm_fifo_segment_header_t *fsh; + void *oldheap; + + /* Allocate a fresh segment */ + pool_get (sm->segments, s); + memset (s, 0, sizeof (*s)); + + s->ssvm.ssvm_size = a->segment_size; + s->ssvm.i_am_master = 1; + s->ssvm.my_pid = getpid (); + s->ssvm.name = (u8 *) a->segment_name; + s->ssvm.requested_va = sm->next_baseva; + + rv = ssvm_master_init (&s->ssvm, s - sm->segments); + + if (rv) + { + _vec_len (s) = vec_len (s) - 1; + return (rv); + } + + /* Note; requested_va updated due to seg base addr randomization */ + sm->next_baseva = s->ssvm.requested_va + a->segment_size; + + sh = s->ssvm.sh; + oldheap = ssvm_push_heap (sh); + + /* Set up svm_fifo_segment shared header */ + fsh = clib_mem_alloc (sizeof (*fsh)); + memset (fsh, 0, sizeof (*fsh)); + sh->opaque[0] = fsh; + s->h = fsh; + fsh->segment_name = format (0, "%s%c", a->segment_name, 0); + + /* Avoid vec_add1(...) failure when adding a fifo, etc. */ + vec_validate (fsh->fifos, 64); + _vec_len (fsh->fifos) = 0; + + ssvm_pop_heap (oldheap); + + sh->ready = 1; + a->new_segment_index = s - sm->segments; + return (0); +} + +/** (slave) attach to an svm fifo segment */ +int +svm_fifo_segment_attach (svm_fifo_segment_create_args_t * a) +{ + int rv; + svm_fifo_segment_private_t *s; + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + ssvm_shared_header_t *sh; + svm_fifo_segment_header_t *fsh; + + /* Allocate a fresh segment */ + pool_get (sm->segments, s); + + memset (s, 0, sizeof (*s)); + + s->ssvm.ssvm_size = a->segment_size; + s->ssvm.my_pid = getpid (); + s->ssvm.name = (u8 *) a->segment_name; + s->ssvm.requested_va = sm->next_baseva; + + rv = ssvm_slave_init (&s->ssvm, sm->timeout_in_seconds); + + if (rv) + { + _vec_len (s) = vec_len (s) - 1; + return (rv); + } + + /* Fish the segment header */ + sh = s->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + s->h = fsh; + + a->new_segment_index = s - sm->segments; + return (0); +} + +void +svm_fifo_segment_delete (svm_fifo_segment_private_t * s) +{ + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + ssvm_delete (&s->ssvm); + pool_put (sm->segments, s); +} + +svm_fifo_t * +svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s, + u32 data_size_in_bytes) +{ + ssvm_shared_header_t *sh; + svm_fifo_segment_header_t *fsh; + svm_fifo_t *f; + void *oldheap; + + sh = s->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + oldheap = ssvm_push_heap (sh); + + /* Note: this can fail, in which case: create another segment */ + f = svm_fifo_create (data_size_in_bytes); + if (f == 0) + { + ssvm_pop_heap (oldheap); + return (0); + } + + vec_add1 (fsh->fifos, f); + + ssvm_pop_heap (oldheap); + return (f); +} + +void +svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f) +{ + ssvm_shared_header_t *sh; + svm_fifo_segment_header_t *fsh; + void *oldheap; + int i; + + sh = s->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + oldheap = ssvm_push_heap (sh); + + for (i = 0; i < vec_len (fsh->fifos); i++) + { + if (fsh->fifos[i] == f) + { + vec_delete (fsh->fifos, 1, i); + goto found; + } + } + clib_warning ("fifo 0x%llx not found in fifo table...", f); + +found: + clib_mem_free (f); + ssvm_pop_heap (oldheap); +} + +void +svm_fifo_segment_init (u64 baseva, u32 timeout_in_seconds) +{ + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + + sm->next_baseva = baseva; + sm->timeout_in_seconds = timeout_in_seconds; +} + +u32 +svm_fifo_segment_index (svm_fifo_segment_private_t * s) +{ + return s - svm_fifo_segment_main.segments; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/svm/svm_fifo_segment.h b/src/svm/svm_fifo_segment.h new file mode 100644 index 00000000..793fa7c8 --- /dev/null +++ b/src/svm/svm_fifo_segment.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_ssvm_fifo_segment_h__ +#define __included_ssvm_fifo_segment_h__ + +#include "svm_fifo.h" +#include "ssvm.h" + +typedef struct +{ + volatile svm_fifo_t **fifos; + u8 *segment_name; +} svm_fifo_segment_header_t; + +typedef struct +{ + ssvm_private_t ssvm; + svm_fifo_segment_header_t *h; +} svm_fifo_segment_private_t; + +typedef struct +{ + /** pool of segments */ + svm_fifo_segment_private_t *segments; + /* Where to put the next one */ + u64 next_baseva; + u32 timeout_in_seconds; +} svm_fifo_segment_main_t; + +extern svm_fifo_segment_main_t svm_fifo_segment_main; + +typedef struct +{ + char *segment_name; + u32 segment_size; + u32 new_segment_index; +} svm_fifo_segment_create_args_t; + +static inline svm_fifo_segment_private_t * +svm_fifo_get_segment (u32 segment_index) +{ + svm_fifo_segment_main_t *ssm = &svm_fifo_segment_main; + return vec_elt_at_index (ssm->segments, segment_index); +} + +#define foreach_ssvm_fifo_segment_api_error \ +_(OUT_OF_SPACE, "Out of space in segment", -200) + +typedef enum +{ +#define _(n,s,c) SSVM_FIFO_SEGMENT_API_ERROR_##n = c, + foreach_ssvm_fifo_segment_api_error +#undef _ +} ssvm_fifo_segment_api_error_enum_t; + +int svm_fifo_segment_create (svm_fifo_segment_create_args_t * a); +int svm_fifo_segment_attach (svm_fifo_segment_create_args_t * a); +void svm_fifo_segment_delete (svm_fifo_segment_private_t * s); + +svm_fifo_t *svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s, + u32 data_size_in_bytes); +void svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, + svm_fifo_t * f); + +void svm_fifo_segment_init (u64 baseva, u32 timeout_in_seconds); + +u32 svm_fifo_segment_index (svm_fifo_segment_private_t * s); + +#endif /* __included_ssvm_fifo_segment_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/svm/test_svm_fifo1.c b/src/svm/test_svm_fifo1.c new file mode 100644 index 00000000..355653df --- /dev/null +++ b/src/svm/test_svm_fifo1.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "svm_fifo_segment.h" + +clib_error_t * +hello_world (int verbose) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + svm_fifo_segment_private_t *sp; + svm_fifo_t *f; + int rv; + u8 *test_data; + u8 *retrieved_data = 0; + clib_error_t *error = 0; + int pid = getpid (); + + memset (a, 0, sizeof (*a)); + + a->segment_name = "fifo-test1"; + a->segment_size = 256 << 10; + + rv = svm_fifo_segment_create (a); + + if (rv) + return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); + + sp = svm_fifo_get_segment (a->new_segment_index); + + f = svm_fifo_segment_alloc_fifo (sp, 4096); + + if (f == 0) + return clib_error_return (0, "svm_fifo_segment_alloc_fifo failed"); + + test_data = format (0, "Hello world%c", 0); + vec_validate (retrieved_data, vec_len (test_data) - 1); + + while (svm_fifo_max_enqueue (f) >= vec_len (test_data)) + svm_fifo_enqueue_nowait (f, pid, vec_len (test_data), test_data); + + while (svm_fifo_max_dequeue (f) >= vec_len (test_data)) + svm_fifo_dequeue_nowait (f, pid, vec_len (retrieved_data), + retrieved_data); + + while (svm_fifo_max_enqueue (f) >= vec_len (test_data)) + svm_fifo_enqueue_nowait (f, pid, vec_len (test_data), test_data); + + while (svm_fifo_max_dequeue (f) >= vec_len (test_data)) + svm_fifo_dequeue_nowait (f, pid, vec_len (retrieved_data), + retrieved_data); + + if (!memcmp (retrieved_data, test_data, vec_len (test_data))) + error = clib_error_return (0, "data test OK, got '%s'", retrieved_data); + else + error = clib_error_return (0, "data test FAIL!"); + + svm_fifo_segment_free_fifo (sp, f); + + return error; +} + +clib_error_t * +master (int verbose) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + svm_fifo_segment_private_t *sp; + svm_fifo_t *f; + int rv; + u8 *test_data; + u8 *retrieved_data = 0; + int i; + int pid = getpid (); + + memset (a, 0, sizeof (*a)); + + a->segment_name = "fifo-test1"; + a->segment_size = 256 << 10; + + rv = svm_fifo_segment_create (a); + + if (rv) + return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); + + sp = svm_fifo_get_segment (a->new_segment_index); + + f = svm_fifo_segment_alloc_fifo (sp, 4096); + + if (f == 0) + return clib_error_return (0, "svm_fifo_segment_alloc_fifo failed"); + + test_data = format (0, "Hello world%c", 0); + vec_validate (retrieved_data, vec_len (test_data) - 1); + + for (i = 0; i < 1000; i++) + svm_fifo_enqueue_nowait (f, pid, vec_len (test_data), test_data); + + return clib_error_return (0, "master (enqueue) done"); +} + +clib_error_t * +mempig (int verbose) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + svm_fifo_segment_private_t *sp; + svm_fifo_t *f; + svm_fifo_t **flist = 0; + int rv; + int i; + + memset (a, 0, sizeof (*a)); + + a->segment_name = "fifo-test1"; + a->segment_size = 256 << 10; + + rv = svm_fifo_segment_create (a); + + if (rv) + return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); + + sp = svm_fifo_get_segment (a->new_segment_index); + + for (i = 0; i < 1000; i++) + { + f = svm_fifo_segment_alloc_fifo (sp, 4096); + if (f == 0) + break; + vec_add1 (flist, f); + } + + fformat (stdout, "Try #1: created %d fifos...\n", vec_len (flist)); + for (i = 0; i < vec_len (flist); i++) + { + f = flist[i]; + svm_fifo_segment_free_fifo (sp, f); + } + + _vec_len (flist) = 0; + + for (i = 0; i < 1000; i++) + { + f = svm_fifo_segment_alloc_fifo (sp, 4096); + if (f == 0) + break; + vec_add1 (flist, f); + } + + fformat (stdout, "Try #2: created %d fifos...\n", vec_len (flist)); + for (i = 0; i < vec_len (flist); i++) + { + f = flist[i]; + svm_fifo_segment_free_fifo (sp, f); + } + + return 0; +} + +clib_error_t * +offset (int verbose) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + svm_fifo_segment_private_t *sp; + svm_fifo_t *f; + int rv; + u32 *test_data = 0; + u32 *recovered_data = 0; + int i; + int pid = getpid (); + + memset (a, 0, sizeof (*a)); + + a->segment_name = "fifo-test1"; + a->segment_size = 256 << 10; + + rv = svm_fifo_segment_create (a); + + if (rv) + return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); + + sp = svm_fifo_get_segment (a->new_segment_index); + + f = svm_fifo_segment_alloc_fifo (sp, 200 << 10); + + if (f == 0) + return clib_error_return (0, "svm_fifo_segment_alloc_fifo failed"); + + for (i = 0; i < (3 * 1024); i++) + vec_add1 (test_data, i); + + /* Enqueue the first 1024 u32's */ + svm_fifo_enqueue_nowait (f, pid, 4096 /* bytes to enqueue */ , + (u8 *) test_data); + + /* Enqueue the third 1024 u32's 2048 ahead of the current tail */ + svm_fifo_enqueue_with_offset (f, pid, 4096, 4096, (u8 *) & test_data[2048]); + + /* Enqueue the second 1024 u32's at the current tail */ + svm_fifo_enqueue_nowait (f, pid, 4096 /* bytes to enqueue */ , + (u8 *) & test_data[1024]); + + vec_validate (recovered_data, (3 * 1024) - 1); + + svm_fifo_dequeue_nowait (f, pid, 3 * 4096, (u8 *) recovered_data); + + for (i = 0; i < (3 * 1024); i++) + { + if (recovered_data[i] != test_data[i]) + { + clib_warning ("[%d] expected %d recovered %d", i, + test_data[i], recovered_data[i]); + return clib_error_return (0, "offset test FAILED"); + } + } + + return clib_error_return (0, "offset test OK"); +} + +clib_error_t * +slave (int verbose) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + svm_fifo_segment_private_t *sp; + svm_fifo_segment_header_t *fsh; + svm_fifo_t *f; + ssvm_shared_header_t *sh; + int rv; + u8 *test_data; + u8 *retrieved_data = 0; + int pid = getpid (); + int i; + + memset (a, 0, sizeof (*a)); + + a->segment_name = "fifo-test1"; + + rv = svm_fifo_segment_attach (a); + + if (rv) + return clib_error_return (0, "svm_fifo_segment_attach returned %d", rv); + + sp = svm_fifo_get_segment (a->new_segment_index); + sh = sp->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + + /* might wanna wait.. */ + f = (svm_fifo_t *) fsh->fifos[0]; + + /* Lazy bastards united */ + test_data = format (0, "Hello world%c", 0); + vec_validate (retrieved_data, vec_len (test_data) - 1); + + for (i = 0; i < 1000; i++) + { + svm_fifo_dequeue_nowait (f, pid, vec_len (retrieved_data), + retrieved_data); + if (memcmp (retrieved_data, test_data, vec_len (retrieved_data))) + return clib_error_return (0, "retrieved data incorrect, '%s'", + retrieved_data); + } + + return clib_error_return (0, "slave (dequeue) done"); +} + + +int +test_ssvm_fifo1 (unformat_input_t * input) +{ + clib_error_t *error = 0; + int verbose = 0; + int test_id = 0; + + svm_fifo_segment_init (0x200000000ULL, 20); + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose %d", &verbose)) + ; + else if (unformat (input, "verbose")) + verbose = 1; + else if (unformat (input, "master")) + test_id = 1; + else if (unformat (input, "slave")) + test_id = 2; + else if (unformat (input, "mempig")) + test_id = 3; + else if (unformat (input, "offset")) + test_id = 4; + else + { + error = clib_error_create ("unknown input `%U'\n", + format_unformat_error, input); + goto out; + } + } + + switch (test_id) + { + case 0: + error = hello_world (verbose); + break; + + case 1: + error = master (verbose); + break; + + case 2: + error = slave (verbose); + break; + + case 3: + error = mempig (verbose); + break; + + case 4: + error = offset (verbose); + break; + + default: + error = clib_error_return (0, "test id %d unknown", test_id); + break; + } + +out: + if (error) + clib_error_report (error); + + return 0; +} + + + +int +main (int argc, char *argv[]) +{ + unformat_input_t i; + int r; + + unformat_init_command_line (&i, argv); + r = test_ssvm_fifo1 (&i); + unformat_free (&i); + return r; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/uri.am b/src/uri.am new file mode 100644 index 00000000..8cdd77c6 --- /dev/null +++ b/src/uri.am @@ -0,0 +1,22 @@ +# Copyright (c) 2016 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +noinst_PROGRAMS += uri_udp_test2 uri_tcp_test + +uri_udp_test2_SOURCES = uri/uri_udp_test2.c +uri_udp_test2_LDADD = libvlibmemoryclient.la libvlibapi.la libsvm.la \ + libvppinfra.la -lpthread -lm -lrt + +uri_tcp_test_SOURCES = uri/uri_tcp_test.c +uri_tcp_test_LDADD = libvlibmemoryclient.la libvlibapi.la libsvm.la \ + libvppinfra.la -lpthread -lm -lrt diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c new file mode 100644 index 00000000..ed5a37d8 --- /dev/null +++ b/src/uri/uri_tcp_test.c @@ -0,0 +1,916 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../vnet/session/application_interface.h" + +#define vl_typedefs /* define message structures */ +#include +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include +#undef vl_printfun + +/* Satisfy external references when not linking with -lvlib */ +vlib_main_t vlib_global_main; +vlib_main_t **vlib_mains; + +typedef struct +{ + svm_fifo_t * server_rx_fifo; + svm_fifo_t * server_tx_fifo; + + u32 vpp_session_index; + u32 vpp_session_thread; +} session_t; + +typedef enum +{ + STATE_START, + STATE_READY, + STATE_DISCONNECTING, + STATE_FAILED +} connection_state_t; + +typedef struct +{ + /* vpe input queue */ + unix_shared_memory_queue_t *vl_input_queue; + + /* API client handle */ + u32 my_client_index; + + /* The URI we're playing with */ + u8 * uri; + + /* Session pool */ + session_t * sessions; + + /* Hash table for disconnect processing */ + uword * session_index_by_vpp_handles; + + /* intermediate rx buffer */ + u8 * rx_buf; + + /* URI for slave's connect */ + u8 * connect_uri; + + u32 connected_session_index; + + int i_am_master; + + /* drop all packets */ + int drop_packets; + + /* Our event queue */ + unix_shared_memory_queue_t * our_event_queue; + + /* $$$ single thread only for the moment */ + unix_shared_memory_queue_t * vpp_event_queue; + + pid_t my_pid; + + /* For deadman timers */ + clib_time_t clib_time; + + /* State of the connection, shared between msg RX thread and main thread */ + volatile connection_state_t state; + + /* Signal variables */ + volatile int time_to_stop; + volatile int time_to_print_stats; + + u32 configured_segment_size; + + /* VNET_API_ERROR_FOO -> "Foo" hash table */ + uword * error_string_by_error_number; + + /* convenience */ + svm_fifo_segment_main_t * segment_main; + + u8 *connect_test_data; +} uri_tcp_test_main_t; + +uri_tcp_test_main_t uri_tcp_test_main; + +#if CLIB_DEBUG > 0 +#define NITER 10000 +#else +#define NITER 4000000 +#endif + +int +wait_for_state_change (uri_tcp_test_main_t * utm, connection_state_t state) +{ +#if CLIB_DEBUG > 0 +#define TIMEOUT 600.0 +#else +#define TIMEOUT 600.0 +#endif + + f64 timeout = clib_time_now (&utm->clib_time) + TIMEOUT; + + while (clib_time_now (&utm->clib_time) < timeout) + { + if (utm->state == state) + return 0; + if (utm->state == STATE_FAILED) + return -1; + } + clib_warning ("timeout waiting for STATE_READY"); + return -1; +} + +static void +init_error_string_table (uri_tcp_test_main_t * utm) +{ + utm->error_string_by_error_number = hash_create (0, sizeof (uword)); + +#define _(n,v,s) hash_set (utm->error_string_by_error_number, -v, s); + foreach_vnet_api_error; +#undef _ + + hash_set (utm->error_string_by_error_number, 99, "Misc"); +} + +static void +stop_signal (int signum) +{ + uri_tcp_test_main_t *um = &uri_tcp_test_main; + + um->time_to_stop = 1; +} + +static void +stats_signal (int signum) +{ + uri_tcp_test_main_t *um = &uri_tcp_test_main; + + um->time_to_print_stats = 1; +} + +static clib_error_t * +setup_signal_handlers (void) +{ + signal (SIGINT, stats_signal); + signal (SIGQUIT, stop_signal); + signal (SIGTERM, stop_signal); + + return 0; +} + +void +vlib_cli_output (struct vlib_main_t *vm, char *fmt, ...) +{ + clib_warning ("BUG"); +} + +int +connect_to_vpp (char *name) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + api_main_t *am = &api_main; + + if (vl_client_connect_to_vlib ("/vpe-api", name, 32) < 0) + return -1; + + utm->vl_input_queue = am->shmem_hdr->vl_input_queue; + utm->my_client_index = am->my_client_index; + + return 0; +} + +static void +vl_api_map_another_segment_t_handler (vl_api_map_another_segment_t *mp) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("svm_fifo_segment_attach ('%s') failed", + mp->segment_name); + return; + } + clib_warning ("Mapped new segment '%s' size %d", mp->segment_name, + mp->segment_size); +} + +static void +vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + session_t * session; + vl_api_disconnect_session_reply_t * rmp; + uword * p; + int rv = 0; + u64 key; + + key = (((u64)mp->session_thread_index) << 32) | (u64)mp->session_index; + + p = hash_get (utm->session_index_by_vpp_handles, key); + + if (p) + { + session = pool_elt_at_index (utm->sessions, p[0]); + hash_unset (utm->session_index_by_vpp_handles, key); + pool_put (utm->sessions, session); + } + else + { + clib_warning ("couldn't find session key %llx", key); + rv = -11; + } + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + + rmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION_REPLY); + rmp->retval = rv; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&rmp); +} + +static void +vl_api_reset_session_t_handler (vl_api_reset_session_t * mp) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + session_t * session; + vl_api_reset_session_reply_t * rmp; + uword * p; + int rv = 0; + u64 key; + + key = (((u64)mp->session_thread_index) << 32) | (u64)mp->session_index; + + p = hash_get(utm->session_index_by_vpp_handles, key); + + if (p) + { + session = pool_elt_at_index(utm->sessions, p[0]); + hash_unset(utm->session_index_by_vpp_handles, key); + pool_put(utm->sessions, session); + } + else + { + clib_warning("couldn't find session key %llx", key); + rv = -11; + } + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION_REPLY); + rmp->retval = rv; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&rmp); +} + +void +handle_fifo_event_connect_rx (uri_tcp_test_main_t *utm, session_fifo_event_t * e) +{ + svm_fifo_t * rx_fifo; + int n_read, bytes; + + rx_fifo = e->fifo; + + bytes = e->enqueue_length; + do + { + n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len(utm->rx_buf), + utm->rx_buf); + if (n_read > 0) + bytes -= n_read; + } + while (n_read < 0 || bytes > 0); + + // bytes_to_read = svm_fifo_max_dequeue (rx_fifo); + // + // bytes_to_read = vec_len(utm->rx_buf) > bytes_to_read ? + // bytes_to_read : vec_len(utm->rx_buf); + // + // buffer_offset = 0; + // while (bytes_to_read > 0) + // { + // rv = svm_fifo_dequeue_nowait2 (rx_fifo, mypid, + // bytes_to_read, + // utm->rx_buf + buffer_offset); + // if (rv > 0) + // { + // bytes_to_read -= rv; + // buffer_offset += rv; + // bytes_received += rv; + // } + // } + + + // while (bytes_received < bytes_sent) + // { + // rv = svm_fifo_dequeue_nowait2 (rx_fifo, mypid, + // vec_len (utm->rx_buf), + // utm->rx_buf); + // if (rv > 0) + // { + //#if CLIB_DEBUG > 0 + // int j; + // for (j = 0; j < rv; j++) + // { + // if (utm->rx_buf[j] != ((bytes_received + j) & 0xff)) + // { + // clib_warning ("error at byte %lld, 0x%x not 0x%x", + // bytes_received + j, + // utm->rx_buf[j], + // ((bytes_received + j )&0xff)); + // } + // } + //#endif + // bytes_received += (u64) rv; + // } + // } +} + +void +handle_connect_event_queue (uri_tcp_test_main_t * utm) +{ + session_fifo_event_t _e, *e = &_e;; + + unix_shared_memory_queue_sub (utm->our_event_queue, (u8 *) e, 0 /* nowait */); + switch (e->event_type) + { + case FIFO_EVENT_SERVER_RX: + handle_fifo_event_connect_rx (utm, e); + break; + + case FIFO_EVENT_SERVER_EXIT: + return; + + default: + clib_warning("unknown event type %d", e->event_type); + break; + } +} + +void +uri_tcp_connect_send (uri_tcp_test_main_t *utm) +{ + u8 *test_data = utm->connect_test_data; + u64 bytes_sent = 0; + int rv; + int mypid = getpid(); + session_t * session; + svm_fifo_t *tx_fifo; + int buffer_offset, bytes_to_send = 0; + session_fifo_event_t evt; + static int serial_number = 0; + int i; + u32 max_chunk = 64 << 10, write; + + session = pool_elt_at_index (utm->sessions, utm->connected_session_index); + tx_fifo = session->server_tx_fifo; + + vec_validate (utm->rx_buf, vec_len (test_data) - 1); + + for (i = 0; i < 10; i++) + { + bytes_to_send = vec_len (test_data); + buffer_offset = 0; + while (bytes_to_send > 0) + { + write = bytes_to_send > max_chunk ? max_chunk : bytes_to_send; + rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, write, + test_data + buffer_offset); + + if (rv > 0) + { + bytes_to_send -= rv; + buffer_offset += rv; + bytes_sent += rv; + + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + /* $$$$ for event logging */ + evt.enqueue_length = rv; + evt.event_id = serial_number++; + + unix_shared_memory_queue_add (utm->vpp_event_queue, (u8 *) &evt, + 0 /* do wait for mutex */); + } + } + } +} + +static void +uri_tcp_client_test (uri_tcp_test_main_t * utm) +{ + vl_api_connect_uri_t * cmp; + vl_api_disconnect_session_t *dmp; + session_t *connected_session; + int i; + + cmp = vl_msg_api_alloc (sizeof (*cmp)); + memset (cmp, 0, sizeof (*cmp)); + + cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); + cmp->client_index = utm->my_client_index; + cmp->context = ntohl(0xfeedface); + memcpy (cmp->uri, utm->connect_uri, vec_len (utm->connect_uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&cmp); + + if (wait_for_state_change (utm, STATE_READY)) + { + return; + } + + /* Init test data */ + vec_validate (utm->connect_test_data, 64 * 1024 - 1); + for (i = 0; i < vec_len (utm->connect_test_data); i++) + utm->connect_test_data[i] = i & 0xff; + + /* Start reader thread */ + /* handle_connect_event_queue (utm); */ + + /* Start send */ + uri_tcp_connect_send (utm); + + /* Disconnect */ + connected_session = pool_elt_at_index(utm->sessions, + utm->connected_session_index); + dmp = vl_msg_api_alloc (sizeof (*dmp)); + memset (dmp, 0, sizeof (*dmp)); + dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); + dmp->client_index = utm->my_client_index; + dmp->session_index = connected_session->vpp_session_index; + dmp->session_thread_index = connected_session->vpp_session_thread; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&dmp); +} + +void +handle_fifo_event_server_rx (uri_tcp_test_main_t *utm, session_fifo_event_t * e) +{ + svm_fifo_t * rx_fifo, * tx_fifo; + int n_read; + + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + int rv, bytes; + + rx_fifo = e->fifo; + tx_fifo = utm->sessions[rx_fifo->client_session_index].server_tx_fifo; + + bytes = e->enqueue_length; + do + { + n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len(utm->rx_buf), + utm->rx_buf); + + /* Reflect if a non-drop session */ + if (!utm->drop_packets && n_read > 0) + { + do + { + rv = svm_fifo_enqueue_nowait (tx_fifo, 0, n_read, utm->rx_buf); + } + while (rv == -2); + + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + /* $$$$ for event logging */ + evt.enqueue_length = n_read; + evt.event_id = e->event_id; + q = utm->vpp_event_queue; + unix_shared_memory_queue_add (q, (u8 *) &evt, 0 /* do wait for mutex */); + } + + if (n_read > 0) + bytes -= n_read; + } + while (n_read < 0 || bytes > 0); +} + +void +handle_event_queue (uri_tcp_test_main_t * utm) +{ + session_fifo_event_t _e, *e = &_e;; + + while (1) + { + unix_shared_memory_queue_sub (utm->our_event_queue, (u8 *)e, + 0 /* nowait */); + switch (e->event_type) + { + case FIFO_EVENT_SERVER_RX: + handle_fifo_event_server_rx (utm, e); + break; + + case FIFO_EVENT_SERVER_EXIT: + return; + + default: + clib_warning ("unknown event type %d", e->event_type); + break; + } + if (PREDICT_FALSE(utm->time_to_stop == 1)) + break; + if (PREDICT_FALSE(utm->time_to_print_stats == 1)) + { + utm->time_to_print_stats = 0; + fformat(stdout, "%d connections\n", pool_elts (utm->sessions)); + } + } +} + +static void +vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + if (mp->retval) + { + clib_warning("bind failed: %d", mp->retval); + return; + } + + if (mp->segment_name_length == 0) + { + clib_warning("segment_name_length zero"); + return; + } + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + + ASSERT(mp->server_event_queue_address); + + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning("svm_fifo_segment_attach ('%s') failed", mp->segment_name); + return; + } + + utm->our_event_queue = + (unix_shared_memory_queue_t *) mp->server_event_queue_address; + + utm->state = STATE_READY; +} + +static void +vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + session_t *session; + u32 session_index; + svm_fifo_t *rx_fifo, *tx_fifo; + int rv; + + if (mp->retval) + { + clib_warning ("connection failed with code: %d", mp->retval); + utm->state = STATE_FAILED; + return; + } + /* + * Attatch to segment + */ + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + utm->state = STATE_FAILED; + return; + } + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + + ASSERT(mp->client_event_queue_address); + + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("svm_fifo_segment_attach ('%s') failed", + mp->segment_name); + return; + } + + /* + * Save the queues + */ + + utm->our_event_queue = (unix_shared_memory_queue_t *) + mp->client_event_queue_address; + + utm->vpp_event_queue = (unix_shared_memory_queue_t *) + mp->vpp_event_queue_address; + + /* + * Setup session + */ + + pool_get (utm->sessions, session); + session_index = session - utm->sessions; + + rx_fifo = (svm_fifo_t *)mp->server_rx_fifo; + rx_fifo->client_session_index = session_index; + tx_fifo = (svm_fifo_t *)mp->server_tx_fifo; + tx_fifo->client_session_index = session_index; + + session->server_rx_fifo = rx_fifo; + session->server_tx_fifo = tx_fifo; + session->vpp_session_index = mp->session_index; + session->vpp_session_thread = mp->session_thread_index; + + /* Save handle */ + utm->connected_session_index = session_index; + + utm->state = STATE_READY; +} + +void +uri_tcp_bind (uri_tcp_test_main_t *utm) +{ + vl_api_bind_uri_t * bmp; + u32 fifo_size = 3 << 20; + bmp = vl_msg_api_alloc (sizeof (*bmp)); + memset (bmp, 0, sizeof (*bmp)); + + bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); + bmp->client_index = utm->my_client_index; + bmp->context = ntohl(0xfeedface); + bmp->initial_segment_size = 256<<20; /* size of initial segment */ + bmp->options[SESSION_OPTIONS_FLAGS] = + SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; + bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = fifo_size; + bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = fifo_size; + bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128<<20; + memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&bmp); +} + +static void +vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t *mp) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + + if (mp->retval != 0) + clib_warning ("returned %d", ntohl(mp->retval)); + + utm->state = STATE_START; +} + +void +uri_tcp_unbind (uri_tcp_test_main_t *utm) +{ + vl_api_unbind_uri_t * ump; + + ump = vl_msg_api_alloc (sizeof (*ump)); + memset (ump, 0, sizeof (*ump)); + + ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); + ump->client_index = utm->my_client_index; + memcpy (ump->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&ump); +} + +static void +vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + vl_api_accept_session_reply_t *rmp; + svm_fifo_t * rx_fifo, * tx_fifo; + session_t * session; + static f64 start_time; + u64 key; + u32 session_index; + + if (start_time == 0.0) + start_time = clib_time_now (&utm->clib_time); + + utm->vpp_event_queue = (unix_shared_memory_queue_t *) + mp->vpp_event_queue_address; + + /* Allocate local session and set it up */ + pool_get (utm->sessions, session); + session_index = session - utm->sessions; + + rx_fifo = (svm_fifo_t *)mp->server_rx_fifo; + rx_fifo->client_session_index = session_index; + tx_fifo = (svm_fifo_t *)mp->server_tx_fifo; + tx_fifo->client_session_index = session_index; + + session->server_rx_fifo = rx_fifo; + session->server_tx_fifo = tx_fifo; + + /* Add it to lookup table */ + key = (((u64)mp->session_thread_index) << 32) | (u64)mp->session_index; + hash_set (utm->session_index_by_vpp_handles, key, session_index); + + utm->state = STATE_READY; + + /* Stats printing */ + if (pool_elts (utm->sessions) && (pool_elts(utm->sessions) % 20000) == 0) + { + f64 now = clib_time_now (&utm->clib_time); + fformat (stdout, "%d active sessions in %.2f seconds, %.2f/sec...\n", + pool_elts(utm->sessions), now - start_time, + (f64)pool_elts(utm->sessions) / (now - start_time)); + } + + /* Send accept reply to vpp */ + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_ACCEPT_SESSION_REPLY); + rmp->session_type = mp->session_type; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&rmp); +} + +void +uri_tcp_server_test (uri_tcp_test_main_t * utm) +{ + + /* Bind to uri */ + uri_tcp_bind (utm); + + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + /* Enter handle event loop */ + handle_event_queue (utm); + + /* Cleanup */ + uri_tcp_unbind (utm); + + if (wait_for_state_change (utm, STATE_START)) + { + clib_warning ("timeout waiting for STATE_START"); + return; + } + + fformat (stdout, "Test complete...\n"); +} + +#define foreach_uri_msg \ +_(BIND_URI_REPLY, bind_uri_reply) \ +_(UNBIND_URI_REPLY, unbind_uri_reply) \ +_(ACCEPT_SESSION, accept_session) \ +_(CONNECT_URI_REPLY, connect_uri_reply) \ +_(DISCONNECT_SESSION, disconnect_session) \ +_(RESET_SESSION, reset_session) \ +_(MAP_ANOTHER_SEGMENT, map_another_segment) + +void +uri_api_hookup (uri_tcp_test_main_t * utm) +{ +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_uri_msg; +#undef _ +} + +int +main (int argc, char **argv) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + unformat_input_t _argv, *a = &_argv; + u8 *chroot_prefix; + u8 *heap; + u8 * bind_name = (u8 *) "tcp://0.0.0.0/1234"; + u32 tmp; + mheap_t *h; + session_t * session; + int i; + int i_am_master = 1, drop_packets = 0; + + clib_mem_init (0, 256 << 20); + + heap = clib_mem_get_per_cpu_heap (); + h = mheap_header (heap); + + /* make the main heap thread-safe */ + h->flags |= MHEAP_FLAG_THREAD_SAFE; + + vec_validate (utm->rx_buf, 65536); + + utm->session_index_by_vpp_handles = + hash_create (0, sizeof(uword)); + + utm->my_pid = getpid(); + utm->configured_segment_size = 1<<20; + + clib_time_init (&utm->clib_time); + init_error_string_table (utm); + svm_fifo_segment_init(0x200000000ULL, 20); + unformat_init_command_line (a, argv); + + while (unformat_check_input (a) != UNFORMAT_END_OF_INPUT) + { + if (unformat (a, "chroot prefix %s", &chroot_prefix)) + { + vl_set_memory_root_path ((char *) chroot_prefix); + } + else if (unformat (a, "uri %s", &bind_name)) + ; + else if (unformat (a, "segment-size %dM", &tmp)) + utm->configured_segment_size = tmp<<20; + else if (unformat (a, "segment-size %dG", &tmp)) + utm->configured_segment_size = tmp<<30; + else if (unformat (a, "master")) + i_am_master = 1; + else if (unformat (a, "slave")) + i_am_master = 0; + else if (unformat (a, "drop")) + drop_packets = 1; + else + { + fformat (stderr, "%s: usage [master|slave]\n"); + exit (1); + } + } + + utm->uri = format (0, "%s%c", bind_name, 0); + utm->i_am_master = i_am_master; + utm->segment_main = &svm_fifo_segment_main; + utm->drop_packets = drop_packets; + + utm->connect_uri = format (0, "tcp://6.0.1.2/1234%c", 0); + + setup_signal_handlers(); + uri_api_hookup (utm); + + if (connect_to_vpp (i_am_master? "uri_tcp_server":"uri_tcp_client") < 0) + { + svm_region_exit (); + fformat (stderr, "Couldn't connect to vpe, exiting...\n"); + exit (1); + } + + if (i_am_master == 0) + { + uri_tcp_client_test (utm); + exit (0); + } + + /* $$$$ hack preallocation */ + for (i = 0; i < 200000; i++) + { + pool_get (utm->sessions, session); + memset (session, 0, sizeof (*session)); + } + for (i = 0; i < 200000; i++) + pool_put_index (utm->sessions, i); + + uri_tcp_server_test (utm); + + vl_client_disconnect_from_vlib (); + exit (0); +} diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c new file mode 100644 index 00000000..6f5284c9 --- /dev/null +++ b/src/uri/uri_udp_test.c @@ -0,0 +1,553 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define vl_typedefs /* define message structures */ +#include +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include +#undef vl_printfun + +/* Satisfy external references when not linking with -lvlib */ +vlib_main_t vlib_global_main; +vlib_main_t **vlib_mains; + +typedef enum +{ + STATE_START, + STATE_READY, + STATE_DISCONNECTING, +} connection_state_t; + +typedef struct +{ + svm_fifo_t *server_rx_fifo; + svm_fifo_t *server_tx_fifo; +} session_t; + +typedef struct +{ + /* vpe input queue */ + unix_shared_memory_queue_t *vl_input_queue; + + /* API client handle */ + u32 my_client_index; + + /* The URI we're playing with */ + u8 *uri; + + /* Session pool */ + session_t *sessions; + + /* Hash table for disconnect processing */ + uword *session_index_by_vpp_handles; + + /* fifo segment */ + svm_fifo_segment_private_t *seg; + + /* intermediate rx buffer */ + u8 *rx_buf; + + /* Our event queue */ + unix_shared_memory_queue_t *our_event_queue; + + /* $$$ single thread only for the moment */ + unix_shared_memory_queue_t *vpp_event_queue; + + /* For deadman timers */ + clib_time_t clib_time; + + /* State of the connection, shared between msg RX thread and main thread */ + volatile connection_state_t state; + + volatile int time_to_stop; + volatile int time_to_print_stats; + + /* VNET_API_ERROR_FOO -> "Foo" hash table */ + uword *error_string_by_error_number; +} uri_udp_test_main_t; + +#if CLIB_DEBUG > 0 +#define NITER 1000 +#else +#define NITER 1000000 +#endif + +uri_udp_test_main_t uri_udp_test_main; + +static void +stop_signal (int signum) +{ + uri_udp_test_main_t *um = &uri_udp_test_main; + + um->time_to_stop = 1; +} + +static void +stats_signal (int signum) +{ + uri_udp_test_main_t *um = &uri_udp_test_main; + + um->time_to_print_stats = 1; +} + +static clib_error_t * +setup_signal_handlers (void) +{ + signal (SIGINT, stats_signal); + signal (SIGQUIT, stop_signal); + signal (SIGTERM, stop_signal); + + return 0; +} + +u8 * +format_api_error (u8 * s, va_list * args) +{ + uri_udp_test_main_t *utm = va_arg (*args, uri_udp_test_main_t *); + i32 error = va_arg (*args, u32); + uword *p; + + p = hash_get (utm->error_string_by_error_number, -error); + + if (p) + s = format (s, "%s", p[0]); + else + s = format (s, "%d", error); + return s; +} + +int +wait_for_state_change (uri_udp_test_main_t * utm, connection_state_t state) +{ + f64 timeout = clib_time_now (&utm->clib_time) + 5.0; + + while (clib_time_now (&utm->clib_time) < timeout) + { + if (utm->state == state) + return 0; + } + return -1; +} + +static void +vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + a->segment_name = (char *) mp->segment_name; + + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("sm_fifo_segment_create ('%s') failed", mp->segment_name); + return; + } + + utm->our_event_queue = (unix_shared_memory_queue_t *) + mp->server_event_queue_address; + + utm->state = STATE_READY; +} + +static void +vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + + if (mp->retval != 0) + clib_warning ("returned %d", ntohl (mp->retval)); + + utm->state = STATE_START; +} + +static void +vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + vl_api_accept_session_reply_t *rmp; + svm_fifo_t *rx_fifo, *tx_fifo; + session_t *session; + static f64 start_time; + u64 key; + + if (start_time == 0.0) + start_time = clib_time_now (&utm->clib_time); + + utm->vpp_event_queue = (unix_shared_memory_queue_t *) + mp->vpp_event_queue_address; + + pool_get (utm->sessions, session); + + rx_fifo = (svm_fifo_t *) mp->server_rx_fifo; + rx_fifo->client_session_index = session - utm->sessions; + tx_fifo = (svm_fifo_t *) mp->server_tx_fifo; + tx_fifo->client_session_index = session - utm->sessions; + + session->server_rx_fifo = rx_fifo; + session->server_tx_fifo = tx_fifo; + + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; + + hash_set (utm->session_index_by_vpp_handles, key, session - utm->sessions); + + utm->state = STATE_READY; + + if (pool_elts (utm->sessions) && (pool_elts (utm->sessions) % 20000) == 0) + { + f64 now = clib_time_now (&utm->clib_time); + fformat (stdout, "%d active sessions in %.2f seconds, %.2f/sec...\n", + pool_elts (utm->sessions), now - start_time, + (f64) pool_elts (utm->sessions) / (now - start_time)); + } + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_ACCEPT_SESSION_REPLY); + rmp->session_type = mp->session_type; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); +} + +static void +vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + session_t *session; + vl_api_disconnect_session_reply_t *rmp; + uword *p; + int rv = 0; + u64 key; + + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; + + p = hash_get (utm->session_index_by_vpp_handles, key); + + if (p) + { + session = pool_elt_at_index (utm->sessions, p[0]); + hash_unset (utm->session_index_by_vpp_handles, key); + pool_put (utm->sessions, session); + } + else + { + clib_warning ("couldn't find session key %llx", key); + rv = -11; + } + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION_REPLY); + rmp->retval = rv; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); +} + +#define foreach_uri_msg \ +_(BIND_URI_REPLY, bind_uri_reply) \ +_(UNBIND_URI_REPLY, unbind_uri_reply) \ +_(ACCEPT_SESSION, accept_session) \ +_(DISCONNECT_SESSION, disconnect_session) + +void +uri_api_hookup (uri_udp_test_main_t * utm) +{ +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_uri_msg; +#undef _ + +} + + +int +connect_to_vpp (char *name) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + api_main_t *am = &api_main; + + if (vl_client_connect_to_vlib ("/vpe-api", name, 32) < 0) + return -1; + + utm->vl_input_queue = am->shmem_hdr->vl_input_queue; + utm->my_client_index = am->my_client_index; + + return 0; +} + +void +vlib_cli_output (struct vlib_main_t *vm, char *fmt, ...) +{ + clib_warning ("BUG"); +} + +static void +init_error_string_table (uri_udp_test_main_t * utm) +{ + utm->error_string_by_error_number = hash_create (0, sizeof (uword)); + +#define _(n,v,s) hash_set (utm->error_string_by_error_number, -v, s); + foreach_vnet_api_error; +#undef _ + + hash_set (utm->error_string_by_error_number, 99, "Misc"); +} + +void +handle_fifo_event_server_rx (uri_udp_test_main_t * utm, + session_fifo_event_t * e) +{ + svm_fifo_t *rx_fifo, *tx_fifo; + int nbytes; + + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + int rv; + + rx_fifo = e->fifo; + tx_fifo = utm->sessions[rx_fifo->client_session_index].server_tx_fifo; + + do + { + nbytes = svm_fifo_dequeue_nowait (rx_fifo, 0, + vec_len (utm->rx_buf), utm->rx_buf); + } + while (nbytes <= 0); + do + { + rv = svm_fifo_enqueue_nowait (tx_fifo, 0, nbytes, utm->rx_buf); + } + while (rv == -2); + + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + /* $$$$ for event logging */ + evt.enqueue_length = nbytes; + evt.event_id = e->event_id; + q = utm->vpp_event_queue; + unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* do wait for mutex */ ); +} + +void +handle_event_queue (uri_udp_test_main_t * utm) +{ + session_fifo_event_t _e, *e = &_e;; + + while (1) + { + unix_shared_memory_queue_sub (utm->our_event_queue, (u8 *) e, + 0 /* nowait */ ); + switch (e->event_type) + { + case FIFO_EVENT_SERVER_RX: + handle_fifo_event_server_rx (utm, e); + break; + + case FIFO_EVENT_SERVER_EXIT: + return; + + default: + clib_warning ("unknown event type %d", e->event_type); + break; + } + if (PREDICT_FALSE (utm->time_to_stop == 1)) + break; + if (PREDICT_FALSE (utm->time_to_print_stats == 1)) + { + utm->time_to_print_stats = 0; + fformat (stdout, "%d connections\n", pool_elts (utm->sessions)); + } + } +} + +void +uri_udp_test (uri_udp_test_main_t * utm) +{ + vl_api_bind_uri_t *bmp; + vl_api_unbind_uri_t *ump; + + bmp = vl_msg_api_alloc (sizeof (*bmp)); + memset (bmp, 0, sizeof (*bmp)); + + bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); + bmp->client_index = utm->my_client_index; + bmp->context = ntohl (0xfeedface); + bmp->segment_size = 2 << 30; + memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); + + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + handle_event_queue (utm); + + ump = vl_msg_api_alloc (sizeof (*ump)); + memset (ump, 0, sizeof (*ump)); + + ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); + ump->client_index = utm->my_client_index; + memcpy (ump->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & ump); + + if (wait_for_state_change (utm, STATE_START)) + { + clib_warning ("timeout waiting for STATE_START"); + return; + } + + fformat (stdout, "Test complete...\n"); +} + +int +main (int argc, char **argv) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + unformat_input_t _argv, *a = &_argv; + u8 *chroot_prefix; + u8 *heap; + u8 *bind_name = (u8 *) "udp4:1234"; + mheap_t *h; + session_t *session; + int i; + + clib_mem_init (0, 256 << 20); + + heap = clib_mem_get_per_cpu_heap (); + h = mheap_header (heap); + + /* make the main heap thread-safe */ + h->flags |= MHEAP_FLAG_THREAD_SAFE; + + vec_validate (utm->rx_buf, 8192); + + utm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); + + clib_time_init (&utm->clib_time); + init_error_string_table (utm); + svm_fifo_segment_init (0x200000000ULL, 20); + unformat_init_command_line (a, argv); + + while (unformat_check_input (a) != UNFORMAT_END_OF_INPUT) + { + if (unformat (a, "chroot prefix %s", &chroot_prefix)) + { + vl_set_memory_root_path ((char *) chroot_prefix); + } + else if (unformat (a, "uri %s", &bind_name)) + ; + else + { + fformat (stderr, "%s: usage [master|slave]\n"); + exit (1); + } + } + + utm->uri = format (0, "%s%c", bind_name, 0); + + setup_signal_handlers (); + + uri_api_hookup (utm); + + if (connect_to_vpp ("uri_udp_test") < 0) + { + svm_region_exit (); + fformat (stderr, "Couldn't connect to vpe, exiting...\n"); + exit (1); + } + + /* $$$$ hack preallocation */ + for (i = 0; i < 200000; i++) + { + pool_get (utm->sessions, session); + memset (session, 0, sizeof (*session)); + } + for (i = 0; i < 200000; i++) + pool_put_index (utm->sessions, i); + + uri_udp_test (utm); + + vl_client_disconnect_from_vlib (); + exit (0); +} + +#undef vl_api_version +#define vl_api_version(n,v) static u32 vpe_api_version = v; +#include +#undef vl_api_version + +void +vl_client_add_api_signatures (vl_api_memclnt_create_t * mp) +{ + /* + * Send the main API signature in slot 0. This bit of code must + * match the checks in ../vpe/api/api.c: vl_msg_api_version_check(). + */ + mp->api_versions[0] = clib_host_to_net_u32 (vpe_api_version); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/uri/uri_udp_test2.c b/src/uri/uri_udp_test2.c new file mode 100644 index 00000000..ddfffaa6 --- /dev/null +++ b/src/uri/uri_udp_test2.c @@ -0,0 +1,954 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../vnet/session/application_interface.h" + +#define vl_typedefs /* define message structures */ +#include +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include +#undef vl_printfun + +/* Satisfy external references when not linking with -lvlib */ +vlib_main_t vlib_global_main; +vlib_main_t **vlib_mains; + +typedef enum +{ + STATE_START, + STATE_READY, + STATE_DISCONNECTING, +} connection_state_t; + +typedef struct +{ + svm_fifo_t *server_rx_fifo; + svm_fifo_t *server_tx_fifo; +} session_t; + +typedef struct +{ + /* vpe input queue */ + unix_shared_memory_queue_t *vl_input_queue; + + /* API client handle */ + u32 my_client_index; + + /* The URI we're playing with */ + u8 *uri; + + /* Session pool */ + session_t *sessions; + + /* Hash table for disconnect processing */ + uword *session_index_by_vpp_handles; + + /* fifo segment */ + svm_fifo_segment_private_t *seg; + + /* intermediate rx buffer */ + u8 *rx_buf; + + /* URI for connect */ + u8 *connect_uri; + + int i_am_master; + + /* Our event queue */ + unix_shared_memory_queue_t *our_event_queue; + + /* $$$ single thread only for the moment */ + unix_shared_memory_queue_t *vpp_event_queue; + + /* $$$$ hack: cut-through session index */ + volatile u32 cut_through_session_index; + + /* unique segment name counter */ + u32 unique_segment_index; + + pid_t my_pid; + + /* pthread handle */ + pthread_t cut_through_thread_handle; + + /* For deadman timers */ + clib_time_t clib_time; + + /* State of the connection, shared between msg RX thread and main thread */ + volatile connection_state_t state; + + volatile int time_to_stop; + volatile int time_to_print_stats; + + u32 configured_segment_size; + + /* VNET_API_ERROR_FOO -> "Foo" hash table */ + uword *error_string_by_error_number; + + /* convenience */ + svm_fifo_segment_main_t *segment_main; + +} uri_udp_test_main_t; + +#if CLIB_DEBUG > 0 +#define NITER 10000 +#else +#define NITER 4000000 +#endif + +uri_udp_test_main_t uri_udp_test_main; + +static void +stop_signal (int signum) +{ + uri_udp_test_main_t *um = &uri_udp_test_main; + + um->time_to_stop = 1; +} + +static void +stats_signal (int signum) +{ + uri_udp_test_main_t *um = &uri_udp_test_main; + + um->time_to_print_stats = 1; +} + +static clib_error_t * +setup_signal_handlers (void) +{ + signal (SIGINT, stats_signal); + signal (SIGQUIT, stop_signal); + signal (SIGTERM, stop_signal); + + return 0; +} + +u8 * +format_api_error (u8 * s, va_list * args) +{ + uri_udp_test_main_t *utm = va_arg (*args, uri_udp_test_main_t *); + i32 error = va_arg (*args, u32); + uword *p; + + p = hash_get (utm->error_string_by_error_number, -error); + + if (p) + s = format (s, "%s", p[0]); + else + s = format (s, "%d", error); + return s; +} + +int +wait_for_state_change (uri_udp_test_main_t * utm, connection_state_t state) +{ +#if CLIB_DEBUG > 0 +#define TIMEOUT 600.0 +#else +#define TIMEOUT 600.0 +#endif + + f64 timeout = clib_time_now (&utm->clib_time) + TIMEOUT; + + while (clib_time_now (&utm->clib_time) < timeout) + { + if (utm->state == state) + return 0; + } + return -1; +} + +u64 server_bytes_received, server_bytes_sent; + +static void * +cut_through_thread_fn (void *arg) +{ + session_t *s; + svm_fifo_t *rx_fifo; + svm_fifo_t *tx_fifo; + u8 *my_copy_buffer = 0; + uri_udp_test_main_t *utm = &uri_udp_test_main; + i32 actual_transfer; + int rv; + u32 buffer_offset; + + while (utm->cut_through_session_index == ~0) + ; + + s = pool_elt_at_index (utm->sessions, utm->cut_through_session_index); + + rx_fifo = s->server_rx_fifo; + tx_fifo = s->server_tx_fifo; + + vec_validate (my_copy_buffer, 64 * 1024 - 1); + + while (true) + { + /* We read from the tx fifo and write to the rx fifo */ + do + { + actual_transfer = svm_fifo_dequeue_nowait (tx_fifo, 0, + vec_len (my_copy_buffer), + my_copy_buffer); + } + while (actual_transfer <= 0); + + server_bytes_received += actual_transfer; + + buffer_offset = 0; + while (actual_transfer > 0) + { + rv = svm_fifo_enqueue_nowait (rx_fifo, 0, actual_transfer, + my_copy_buffer + buffer_offset); + if (rv > 0) + { + actual_transfer -= rv; + buffer_offset += rv; + server_bytes_sent += rv; + } + + } + if (PREDICT_FALSE (utm->time_to_stop)) + break; + } + + pthread_exit (0); +} + +static void +uri_udp_slave_test (uri_udp_test_main_t * utm) +{ + vl_api_connect_uri_t *cmp; + int i; + u8 *test_data = 0; + u64 bytes_received = 0, bytes_sent = 0; + i32 bytes_to_read; + int rv; + int mypid = getpid (); + f64 before, after, delta, bytes_per_second; + session_t *session; + svm_fifo_t *rx_fifo, *tx_fifo; + int buffer_offset, bytes_to_send = 0; + + vec_validate (test_data, 64 * 1024 - 1); + for (i = 0; i < vec_len (test_data); i++) + test_data[i] = i & 0xff; + + cmp = vl_msg_api_alloc (sizeof (*cmp)); + memset (cmp, 0, sizeof (*cmp)); + + cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); + cmp->client_index = utm->my_client_index; + cmp->context = ntohl (0xfeedface); + memcpy (cmp->uri, utm->connect_uri, vec_len (utm->connect_uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & cmp); + + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + session = pool_elt_at_index (utm->sessions, utm->cut_through_session_index); + rx_fifo = session->server_rx_fifo; + tx_fifo = session->server_tx_fifo; + + before = clib_time_now (&utm->clib_time); + + vec_validate (utm->rx_buf, vec_len (test_data) - 1); + + for (i = 0; i < NITER; i++) + { + bytes_to_send = vec_len (test_data); + buffer_offset = 0; + while (bytes_to_send > 0) + { + rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, + bytes_to_send, + test_data + buffer_offset); + + if (rv > 0) + { + bytes_to_send -= rv; + buffer_offset += rv; + bytes_sent += rv; + } + } + + bytes_to_read = svm_fifo_max_dequeue (rx_fifo); + + bytes_to_read = vec_len (utm->rx_buf) > bytes_to_read ? + bytes_to_read : vec_len (utm->rx_buf); + + buffer_offset = 0; + while (bytes_to_read > 0) + { + rv = svm_fifo_dequeue_nowait (rx_fifo, mypid, + bytes_to_read, + utm->rx_buf + buffer_offset); + if (rv > 0) + { + bytes_to_read -= rv; + buffer_offset += rv; + bytes_received += rv; + } + } + } + while (bytes_received < bytes_sent) + { + rv = svm_fifo_dequeue_nowait (rx_fifo, mypid, + vec_len (utm->rx_buf), utm->rx_buf); + if (rv > 0) + { +#if CLIB_DEBUG > 0 + int j; + for (j = 0; j < rv; j++) + { + if (utm->rx_buf[j] != ((bytes_received + j) & 0xff)) + { + clib_warning ("error at byte %lld, 0x%x not 0x%x", + bytes_received + j, + utm->rx_buf[j], + ((bytes_received + j) & 0xff)); + } + } +#endif + bytes_received += (u64) rv; + } + } + + after = clib_time_now (&utm->clib_time); + delta = after - before; + bytes_per_second = 0.0; + + if (delta > 0.0) + bytes_per_second = (f64) bytes_received / delta; + + fformat (stdout, + "Done: %lld recv bytes in %.2f seconds, %.2f bytes/sec...\n\n", + bytes_received, delta, bytes_per_second); + fformat (stdout, + "Done: %lld sent bytes in %.2f seconds, %.2f bytes/sec...\n\n", + bytes_sent, delta, bytes_per_second); + fformat (stdout, + "client -> server -> client round trip: %.2f Gbit/sec \n\n", + (bytes_per_second * 8.0) / 1e9); +} + +static void +vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + + ASSERT (mp->server_event_queue_address); + + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("svm_fifo_segment_attach ('%s') failed", + mp->segment_name); + return; + } + + utm->our_event_queue = (unix_shared_memory_queue_t *) + mp->server_event_queue_address; + + utm->state = STATE_READY; +} + +static void +vl_api_map_another_segment_t_handler (vl_api_map_another_segment_t * mp) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("svm_fifo_segment_attach ('%s') failed", + mp->segment_name); + return; + } + clib_warning ("Mapped new segment '%s' size %d", mp->segment_name, + mp->segment_size); +} + +static void +vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) +{ + u32 segment_index; + uri_udp_test_main_t *utm = &uri_udp_test_main; + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + svm_fifo_segment_private_t *seg; + unix_shared_memory_queue_t *client_q; + vl_api_connect_uri_reply_t *rmp; + session_t *session; + int rv = 0; + + /* Create the segment */ + a->segment_name = (char *) format (0, "%d:segment%d%c", utm->my_pid, + utm->unique_segment_index++, 0); + a->segment_size = utm->configured_segment_size; + + rv = svm_fifo_segment_create (a); + if (rv) + { + clib_warning ("sm_fifo_segment_create ('%s') failed", a->segment_name); + rv = VNET_API_ERROR_URI_FIFO_CREATE_FAILED; + goto send_reply; + } + + vec_add2 (utm->seg, seg, 1); + + segment_index = vec_len (sm->segments) - 1; + + memcpy (seg, sm->segments + segment_index, sizeof (utm->seg[0])); + + pool_get (utm->sessions, session); + + /* + * By construction the master's idea of the rx fifo ends up in + * fsh->fifos[0], and the master's idea of the tx fifo ends up in + * fsh->fifos[1]. + */ + session->server_rx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, + 128 * 1024); + ASSERT (session->server_rx_fifo); + + session->server_tx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, + 128 * 1024); + ASSERT (session->server_tx_fifo); + + session->server_rx_fifo->server_session_index = session - utm->sessions; + session->server_tx_fifo->server_session_index = session - utm->sessions; + utm->cut_through_session_index = session - utm->sessions; + + rv = pthread_create (&utm->cut_through_thread_handle, + NULL /*attr */ , cut_through_thread_fn, 0); + if (rv) + { + clib_warning ("pthread_create returned %d", rv); + rv = VNET_API_ERROR_SYSCALL_ERROR_1; + } + +send_reply: + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + + rmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI_REPLY); + rmp->context = mp->context; + rmp->retval = ntohl (rv); + rmp->segment_name_length = vec_len (a->segment_name); + memcpy (rmp->segment_name, a->segment_name, vec_len (a->segment_name)); + + vec_free (a->segment_name); + + client_q = (unix_shared_memory_queue_t *) mp->client_queue_address; + vl_msg_api_send_shmem (client_q, (u8 *) & rmp); +} + +static void +vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + + if (mp->retval != 0) + clib_warning ("returned %d", ntohl (mp->retval)); + + utm->state = STATE_START; +} + +static void +vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + vl_api_accept_session_reply_t *rmp; + svm_fifo_t *rx_fifo, *tx_fifo; + session_t *session; + static f64 start_time; + u64 key; + + if (start_time == 0.0) + start_time = clib_time_now (&utm->clib_time); + + utm->vpp_event_queue = (unix_shared_memory_queue_t *) + mp->vpp_event_queue_address; + + pool_get (utm->sessions, session); + + rx_fifo = (svm_fifo_t *) mp->server_rx_fifo; + rx_fifo->client_session_index = session - utm->sessions; + tx_fifo = (svm_fifo_t *) mp->server_tx_fifo; + tx_fifo->client_session_index = session - utm->sessions; + + session->server_rx_fifo = rx_fifo; + session->server_tx_fifo = tx_fifo; + + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; + + hash_set (utm->session_index_by_vpp_handles, key, session - utm->sessions); + + utm->state = STATE_READY; + + if (pool_elts (utm->sessions) && (pool_elts (utm->sessions) % 20000) == 0) + { + f64 now = clib_time_now (&utm->clib_time); + fformat (stdout, "%d active sessions in %.2f seconds, %.2f/sec...\n", + pool_elts (utm->sessions), now - start_time, + (f64) pool_elts (utm->sessions) / (now - start_time)); + } + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_ACCEPT_SESSION_REPLY); + rmp->session_type = mp->session_type; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); +} + +static void +vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + session_t *session; + vl_api_disconnect_session_reply_t *rmp; + uword *p; + int rv = 0; + u64 key; + + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; + + p = hash_get (utm->session_index_by_vpp_handles, key); + + if (p) + { + session = pool_elt_at_index (utm->sessions, p[0]); + hash_unset (utm->session_index_by_vpp_handles, key); + pool_put (utm->sessions, session); + } + else + { + clib_warning ("couldn't find session key %llx", key); + rv = -11; + } + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION_REPLY); + rmp->retval = rv; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); +} + +static void +vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) +{ + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + uri_udp_test_main_t *utm = &uri_udp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + ssvm_shared_header_t *sh; + svm_fifo_segment_private_t *seg; + svm_fifo_segment_header_t *fsh; + session_t *session; + u32 segment_index; + int rv; + + ASSERT (utm->i_am_master == 0); + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + memset (a, 0, sizeof (*a)); + + a->segment_name = (char *) mp->segment_name; + + sleep (1); + + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("sm_fifo_segment_create ('%v') failed", mp->segment_name); + return; + } + + segment_index = vec_len (sm->segments) - 1; + + vec_add2 (utm->seg, seg, 1); + + memcpy (seg, sm->segments + segment_index, sizeof (*seg)); + sh = seg->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + + while (vec_len (fsh->fifos) < 2) + sleep (1); + + pool_get (utm->sessions, session); + utm->cut_through_session_index = session - utm->sessions; + + session->server_rx_fifo = (svm_fifo_t *) fsh->fifos[0]; + ASSERT (session->server_rx_fifo); + session->server_tx_fifo = (svm_fifo_t *) fsh->fifos[1]; + ASSERT (session->server_tx_fifo); + + /* security: could unlink /dev/shm/segment_name> here, maybe */ + + utm->state = STATE_READY; +} + +#define foreach_uri_msg \ +_(BIND_URI_REPLY, bind_uri_reply) \ +_(CONNECT_URI, connect_uri) \ +_(CONNECT_URI_REPLY, connect_uri_reply) \ +_(UNBIND_URI_REPLY, unbind_uri_reply) \ +_(ACCEPT_SESSION, accept_session) \ +_(DISCONNECT_SESSION, disconnect_session) \ +_(MAP_ANOTHER_SEGMENT, map_another_segment) + +void +uri_api_hookup (uri_udp_test_main_t * utm) +{ +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_uri_msg; +#undef _ + +} + + +int +connect_to_vpp (char *name) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + api_main_t *am = &api_main; + + if (vl_client_connect_to_vlib ("/vpe-api", name, 32) < 0) + return -1; + + utm->vl_input_queue = am->shmem_hdr->vl_input_queue; + utm->my_client_index = am->my_client_index; + + return 0; +} + +void +vlib_cli_output (struct vlib_main_t *vm, char *fmt, ...) +{ + clib_warning ("BUG"); +} + +static void +init_error_string_table (uri_udp_test_main_t * utm) +{ + utm->error_string_by_error_number = hash_create (0, sizeof (uword)); + +#define _(n,v,s) hash_set (utm->error_string_by_error_number, -v, s); + foreach_vnet_api_error; +#undef _ + + hash_set (utm->error_string_by_error_number, 99, "Misc"); +} + +void +handle_fifo_event_server_rx (uri_udp_test_main_t * utm, + session_fifo_event_t * e) +{ + svm_fifo_t *rx_fifo, *tx_fifo; + int nbytes; + + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + int rv; + + rx_fifo = e->fifo; + tx_fifo = utm->sessions[rx_fifo->client_session_index].server_tx_fifo; + + do + { + nbytes = svm_fifo_dequeue_nowait (rx_fifo, 0, + vec_len (utm->rx_buf), utm->rx_buf); + } + while (nbytes <= 0); + do + { + rv = svm_fifo_enqueue_nowait (tx_fifo, 0, nbytes, utm->rx_buf); + } + while (rv == -2); + + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + /* $$$$ for event logging */ + evt.enqueue_length = nbytes; + evt.event_id = e->event_id; + q = utm->vpp_event_queue; + unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* do wait for mutex */ ); +} + +void +handle_event_queue (uri_udp_test_main_t * utm) +{ + session_fifo_event_t _e, *e = &_e;; + + while (1) + { + unix_shared_memory_queue_sub (utm->our_event_queue, (u8 *) e, + 0 /* nowait */ ); + switch (e->event_type) + { + case FIFO_EVENT_SERVER_RX: + handle_fifo_event_server_rx (utm, e); + break; + + case FIFO_EVENT_SERVER_EXIT: + return; + + default: + clib_warning ("unknown event type %d", e->event_type); + break; + } + if (PREDICT_FALSE (utm->time_to_stop == 1)) + break; + if (PREDICT_FALSE (utm->time_to_print_stats == 1)) + { + utm->time_to_print_stats = 0; + fformat (stdout, "%d connections\n", pool_elts (utm->sessions)); + } + } +} + +void +uri_udp_test (uri_udp_test_main_t * utm) +{ + vl_api_bind_uri_t *bmp; + vl_api_unbind_uri_t *ump; + + bmp = vl_msg_api_alloc (sizeof (*bmp)); + memset (bmp, 0, sizeof (*bmp)); + + bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); + bmp->client_index = utm->my_client_index; + bmp->context = ntohl (0xfeedface); + bmp->initial_segment_size = 256 << 20; /* size of initial segment */ + bmp->options[SESSION_OPTIONS_FLAGS] = + SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; + bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 16 << 10; + bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 16 << 10; + bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128 << 20; + memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); + + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + handle_event_queue (utm); + + ump = vl_msg_api_alloc (sizeof (*ump)); + memset (ump, 0, sizeof (*ump)); + + ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); + ump->client_index = utm->my_client_index; + memcpy (ump->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & ump); + + if (wait_for_state_change (utm, STATE_START)) + { + clib_warning ("timeout waiting for STATE_START"); + return; + } + + fformat (stdout, "Test complete...\n"); +} + +int +main (int argc, char **argv) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + unformat_input_t _argv, *a = &_argv; + u8 *chroot_prefix; + u8 *heap; + u8 *bind_name = (u8 *) "udp://0.0.0.0/1234"; + u32 tmp; + mheap_t *h; + session_t *session; + int i; + int i_am_master = 1; + + clib_mem_init (0, 256 << 20); + + heap = clib_mem_get_per_cpu_heap (); + h = mheap_header (heap); + + /* make the main heap thread-safe */ + h->flags |= MHEAP_FLAG_THREAD_SAFE; + + vec_validate (utm->rx_buf, 8192); + + utm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); + + utm->my_pid = getpid (); + utm->configured_segment_size = 1 << 20; + + clib_time_init (&utm->clib_time); + init_error_string_table (utm); + svm_fifo_segment_init (0x200000000ULL, 20); + unformat_init_command_line (a, argv); + + while (unformat_check_input (a) != UNFORMAT_END_OF_INPUT) + { + if (unformat (a, "chroot prefix %s", &chroot_prefix)) + { + vl_set_memory_root_path ((char *) chroot_prefix); + } + else if (unformat (a, "uri %s", &bind_name)) + ; + else if (unformat (a, "segment-size %dM", &tmp)) + utm->configured_segment_size = tmp << 20; + else if (unformat (a, "segment-size %dG", &tmp)) + utm->configured_segment_size = tmp << 30; + else if (unformat (a, "master")) + i_am_master = 1; + else if (unformat (a, "slave")) + i_am_master = 0; + else + { + fformat (stderr, "%s: usage [master|slave]\n"); + exit (1); + } + } + + utm->cut_through_session_index = ~0; + utm->uri = format (0, "%s%c", bind_name, 0); + utm->i_am_master = i_am_master; + utm->segment_main = &svm_fifo_segment_main; + + utm->connect_uri = format (0, "udp://10.0.0.1/1234%c", 0); + + setup_signal_handlers (); + + uri_api_hookup (utm); + + if (connect_to_vpp (i_am_master ? "uri_udp_master" : "uri_udp_slave") < 0) + { + svm_region_exit (); + fformat (stderr, "Couldn't connect to vpe, exiting...\n"); + exit (1); + } + + if (i_am_master == 0) + { + uri_udp_slave_test (utm); + exit (0); + } + + /* $$$$ hack preallocation */ + for (i = 0; i < 200000; i++) + { + pool_get (utm->sessions, session); + memset (session, 0, sizeof (*session)); + } + for (i = 0; i < 200000; i++) + pool_put_index (utm->sessions, i); + + uri_udp_test (utm); + + vl_client_disconnect_from_vlib (); + exit (0); +} + +#undef vl_api_version +#define vl_api_version(n,v) static u32 vpe_api_version = v; +#include +#undef vl_api_version + +void +vl_client_add_api_signatures (vl_api_memclnt_create_t * mp) +{ + /* + * Send the main API signature in slot 0. This bit of code must + * match the checks in ../vpe/api/api.c: vl_msg_api_version_check(). + */ + mp->api_versions[0] = clib_host_to_net_u32 (vpe_api_version); +} + +u32 +vl (void *p) +{ + return vec_len (p); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/uri/uritest.c b/src/uri/uritest.c new file mode 100644 index 00000000..edcdb3ad --- /dev/null +++ b/src/uri/uritest.c @@ -0,0 +1,484 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define vl_typedefs /* define message structures */ +#include +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include +#undef vl_printfun + +typedef enum +{ + STATE_START, + STATE_READY, + STATE_DISCONNECTING, +} connection_state_t; + +typedef struct +{ + /* vpe input queue */ + unix_shared_memory_queue_t *vl_input_queue; + + /* API client handle */ + u32 my_client_index; + + /* role */ + int i_am_master; + + /* The URI we're playing with */ + u8 *uri; + + /* fifo segment */ + svm_fifo_segment_private_t *seg; + + svm_fifo_t *rx_fifo; + svm_fifo_t *tx_fifo; + + /* For deadman timers */ + clib_time_t clib_time; + + /* State of the connection, shared between msg RX thread and main thread */ + volatile connection_state_t state; + + /* VNET_API_ERROR_FOO -> "Foo" hash table */ + uword *error_string_by_error_number; +} uritest_main_t; + +#if CLIB_DEBUG > 0 +#define NITER 1000 +#else +#define NITER 1000000 +#endif + +uritest_main_t uritest_main; + +u8 * +format_api_error (u8 * s, va_list * args) +{ + uritest_main_t *utm = va_arg (*args, uritest_main_t *); + i32 error = va_arg (*args, u32); + uword *p; + + p = hash_get (utm->error_string_by_error_number, -error); + + if (p) + s = format (s, "%s", p[0]); + else + s = format (s, "%d", error); + return s; +} + +int +wait_for_state_change (uritest_main_t * utm, connection_state_t state) +{ + f64 timeout = clib_time_now (&utm->clib_time) + 1.0; + + while (clib_time_now (&utm->clib_time) < timeout) + { + if (utm->state == state) + return 0; + } + return -1; +} + +static void +vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) +{ + uritest_main_t *utm = &uritest_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + ASSERT (utm->i_am_master); + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + + /* Create the segment */ + rv = svm_fifo_segment_create (a); + if (rv) + { + clib_warning ("sm_fifo_segment_create ('%s') failed", mp->segment_name); + return; + } + + vec_validate (utm->seg, 0); + + memcpy (utm->seg, a->rv, sizeof (*utm->seg)); + + /* + * By construction the master's idea of the rx fifo ends up in + * fsh->fifos[0], and the master's idea of the tx fifo ends up in + * fsh->fifos[1]. + */ + utm->rx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, 10240); + ASSERT (utm->rx_fifo); + + utm->tx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, 10240); + ASSERT (utm->tx_fifo); + + utm->state = STATE_READY; +} + +static void +vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) +{ + uritest_main_t *utm = &uritest_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + ssvm_shared_header_t *sh; + svm_fifo_segment_header_t *fsh; + int rv; + + ASSERT (utm->i_am_master == 0); + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + memset (a, 0, sizeof (*a)); + + a->segment_name = (char *) mp->segment_name; + + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("sm_fifo_segment_create ('%s') failed", mp->segment_name); + return; + } + + vec_validate (utm->seg, 0); + + memcpy (utm->seg, a->rv, sizeof (*utm->seg)); + sh = utm->seg->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + + while (vec_len (fsh->fifos) < 2) + sleep (1); + + utm->rx_fifo = (svm_fifo_t *) fsh->fifos[1]; + ASSERT (utm->rx_fifo); + utm->tx_fifo = (svm_fifo_t *) fsh->fifos[0]; + ASSERT (utm->tx_fifo); + + /* security: could unlink /dev/shm/segment_name> here, maybe */ + + utm->state = STATE_READY; +} + +static void +vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t * mp) +{ + uritest_main_t *utm = &uritest_main; + + if (mp->retval != 0) + clib_warning ("returned %d", ntohl (mp->retval)); + + utm->state = STATE_START; +} + +#define foreach_uri_msg \ +_(BIND_URI_REPLY, bind_uri_reply) \ +_(CONNECT_URI_REPLY, connect_uri_reply) \ +_(UNBIND_URI_REPLY, unbind_uri_reply) + +void +uri_api_hookup (uritest_main_t * utm) +{ +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_uri_msg; +#undef _ + +} + + +int +connect_to_vpp (char *name) +{ + uritest_main_t *utm = &uritest_main; + api_main_t *am = &api_main; + + if (vl_client_connect_to_vlib ("/vpe-api", name, 32) < 0) + return -1; + + utm->vl_input_queue = am->shmem_hdr->vl_input_queue; + utm->my_client_index = am->my_client_index; + + return 0; +} + +void +vlib_cli_output (struct vlib_main_t *vm, char *fmt, ...) +{ + clib_warning ("BUG"); +} + +static void +init_error_string_table (uritest_main_t * utm) +{ + utm->error_string_by_error_number = hash_create (0, sizeof (uword)); + +#define _(n,v,s) hash_set (utm->error_string_by_error_number, -v, s); + foreach_vnet_api_error; +#undef _ + + hash_set (utm->error_string_by_error_number, 99, "Misc"); +} + +void +uritest_master (uritest_main_t * utm) +{ + vl_api_bind_uri_t *bmp; + vl_api_unbind_uri_t *ump; + int i; + u8 *test_data = 0; + u8 *reply = 0; + u32 reply_len; + int mypid = getpid (); + + for (i = 0; i < 2048; i++) + vec_add1 (test_data, 'a' + (i % 32)); + + bmp = vl_msg_api_alloc (sizeof (*bmp)); + memset (bmp, 0, sizeof (*bmp)); + + bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); + bmp->client_index = utm->my_client_index; + bmp->context = ntohl (0xfeedface); + bmp->segment_size = 256 << 10; + memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); + + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + for (i = 0; i < NITER; i++) + svm_fifo_enqueue (utm->tx_fifo, mypid, vec_len (test_data), test_data); + + vec_validate (reply, 0); + + reply_len = svm_fifo_dequeue (utm->rx_fifo, mypid, vec_len (reply), reply); + + if (reply_len != 1) + clib_warning ("reply length %d", reply_len); + + if (reply[0] == 1) + fformat (stdout, "Test OK..."); + + ump = vl_msg_api_alloc (sizeof (*ump)); + memset (ump, 0, sizeof (*ump)); + + ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); + ump->client_index = utm->my_client_index; + memcpy (ump->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & ump); + + if (wait_for_state_change (utm, STATE_START)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + fformat (stdout, "Master done...\n"); +} + +void +uritest_slave (uritest_main_t * utm) +{ + vl_api_connect_uri_t *cmp; + int i, j; + u8 *test_data = 0; + u8 *reply = 0; + u32 bytes_received = 0; + u32 actual_bytes; + int mypid = getpid (); + u8 ok; + f64 before, after, delta, bytes_per_second; + + vec_validate (test_data, 4095); + + cmp = vl_msg_api_alloc (sizeof (*cmp)); + memset (cmp, 0, sizeof (*cmp)); + + cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); + cmp->client_index = utm->my_client_index; + cmp->context = ntohl (0xfeedface); + memcpy (cmp->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & cmp); + + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + ok = 1; + before = clib_time_now (&utm->clib_time); + for (i = 0; i < NITER; i++) + { + actual_bytes = svm_fifo_dequeue (utm->rx_fifo, mypid, + vec_len (test_data), test_data); + j = 0; + while (j < actual_bytes) + { + if (test_data[j] != ('a' + (bytes_received % 32))) + ok = 0; + bytes_received++; + j++; + } + if (bytes_received == NITER * 2048) + break; + } + + vec_add1 (reply, ok); + + svm_fifo_enqueue (utm->tx_fifo, mypid, vec_len (reply), reply); + after = clib_time_now (&utm->clib_time); + delta = after - before; + bytes_per_second = 0.0; + + if (delta > 0.0) + bytes_per_second = (f64) bytes_received / delta; + + fformat (stdout, + "Slave done, %d bytes in %.2f seconds, %.2f bytes/sec...\n", + bytes_received, delta, bytes_per_second); +} + +int +main (int argc, char **argv) +{ + uritest_main_t *utm = &uritest_main; + unformat_input_t _argv, *a = &_argv; + u8 *chroot_prefix; + u8 *heap; + char *bind_name = "fifo:uritest"; + mheap_t *h; + int i_am_master = 0; + + clib_mem_init (0, 128 << 20); + + heap = clib_mem_get_per_cpu_heap (); + h = mheap_header (heap); + + /* make the main heap thread-safe */ + h->flags |= MHEAP_FLAG_THREAD_SAFE; + + clib_time_init (&utm->clib_time); + init_error_string_table (utm); + svm_fifo_segment_init (0x200000000ULL, 20); + unformat_init_command_line (a, argv); + + utm->uri = format (0, "%s%c", bind_name, 0); + + while (unformat_check_input (a) != UNFORMAT_END_OF_INPUT) + { + if (unformat (a, "master")) + i_am_master = 1; + else if (unformat (a, "slave")) + i_am_master = 0; + else if (unformat (a, "chroot prefix %s", &chroot_prefix)) + { + vl_set_memory_root_path ((char *) chroot_prefix); + } + else + { + fformat (stderr, "%s: usage [master|slave]\n"); + exit (1); + } + } + + uri_api_hookup (utm); + + if (connect_to_vpp (i_am_master ? "uritest_master" : "uritest_slave") < 0) + { + svm_region_exit (); + fformat (stderr, "Couldn't connect to vpe, exiting...\n"); + exit (1); + } + + utm->i_am_master = i_am_master; + + if (i_am_master) + uritest_master (utm); + else + uritest_slave (utm); + + vl_client_disconnect_from_vlib (); + exit (0); +} + +#undef vl_api_version +#define vl_api_version(n,v) static u32 vpe_api_version = v; +#include +#undef vl_api_version + +void +vl_client_add_api_signatures (vl_api_memclnt_create_t * mp) +{ + /* + * Send the main API signature in slot 0. This bit of code must + * match the checks in ../vpe/api/api.c: vl_msg_api_version_check(). + */ + mp->api_versions[0] = clib_host_to_net_u32 (vpe_api_version); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 4f5eb09d..9f26bec7 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -360,7 +360,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, memset (f, 0, sizeof (f[0])); f->index = f - bm->buffer_free_list_pool; f->n_data_bytes = vlib_buffer_round_size (n_data_bytes); - f->min_n_buffers_each_physmem_alloc = 16; + f->min_n_buffers_each_physmem_alloc = VLIB_FRAME_SIZE; f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name); /* Setup free buffer template. */ diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 1f723f3b..69c8c7cc 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -240,6 +240,74 @@ vlib_get_buffer_opaque2 (vlib_buffer_t * b) return (void *) b->opaque2; } +/** \brief Get pointer to the end of buffer's data + * @param b pointer to the buffer + * @return pointer to tail of packet's data + */ +always_inline u8 * +vlib_buffer_get_tail (vlib_buffer_t * b) +{ + return b->data + b->current_data + b->current_length; +} + +/** \brief Append uninitialized data to buffer + * @param b pointer to the buffer + * @param size number of uninitialized bytes + * @return pointer to beginning of uninitialized data + */ +always_inline void * +vlib_buffer_put_uninit (vlib_buffer_t * b, u8 size) +{ + void *p = vlib_buffer_get_tail (b); + /* XXX make sure there's enough space */ + b->current_length += size; + return p; +} + +/** \brief Prepend uninitialized data to buffer + * @param b pointer to the buffer + * @param size number of uninitialized bytes + * @return pointer to beginning of uninitialized data + */ +always_inline void * +vlib_buffer_push_uninit (vlib_buffer_t * b, u8 size) +{ + ASSERT (b->current_data + VLIB_BUFFER_PRE_DATA_SIZE >= size); + b->current_data -= size; + b->current_length += size; + + return vlib_buffer_get_current (b); +} + +/** \brief Make head room, typically for packet headers + * @param b pointer to the buffer + * @param size number of head room bytes + * @return pointer to start of buffer (current data) + */ +always_inline void * +vlib_buffer_make_headroom (vlib_buffer_t * b, u8 size) +{ + ASSERT (b->current_data + VLIB_BUFFER_PRE_DATA_SIZE >= size); + b->current_data += size; + return vlib_buffer_get_current (b); +} + +/** \brief Retrieve bytes from buffer head + * @param b pointer to the buffer + * @param size number of bytes to pull + * @return pointer to start of buffer (current data) + */ +always_inline void * +vlib_buffer_pull (vlib_buffer_t * b, u8 size) +{ + if (b->current_length + VLIB_BUFFER_PRE_DATA_SIZE < size) + return 0; + + void *data = vlib_buffer_get_current (b); + vlib_buffer_advance (b, size); + return data; +} + /* Forward declaration. */ struct vlib_main_t; diff --git a/src/vlibmemory/unix_shared_memory_queue.c b/src/vlibmemory/unix_shared_memory_queue.c index 25d28910..e86edec3 100644 --- a/src/vlibmemory/unix_shared_memory_queue.c +++ b/src/vlibmemory/unix_shared_memory_queue.c @@ -33,18 +33,13 @@ * nels = number of elements on the queue * elsize = element size, presumably 4 and cacheline-size will * be popular choices. - * coid = consumer coid, from ChannelCreate * pid = consumer pid - * pulse_code = pulse code consumer expects - * pulse_value = pulse value consumer expects - * consumer_prio = consumer's priority, so pulses won't change - * the consumer's priority. * * The idea is to call this function in the queue consumer, * and e-mail the queue pointer to the producer(s). * - * The spp process / main thread allocates one of these - * at startup; its main input queue. The spp main input queue + * The vpp process / main thread allocates one of these + * at startup; its main input queue. The vpp main input queue * has a pointer to it in the shared memory segment header. * * You probably want to be on an svm data heap before calling this @@ -70,7 +65,7 @@ unix_shared_memory_queue_init (int nels, q->signal_when_queue_non_empty = signal_when_queue_non_empty; memset (&attr, 0, sizeof (attr)); - memset (&cattr, 0, sizeof (attr)); + memset (&cattr, 0, sizeof (cattr)); if (pthread_mutexattr_init (&attr)) clib_unix_warning ("mutexattr_init"); @@ -277,6 +272,7 @@ unix_shared_memory_queue_sub (unix_shared_memory_queue_t * q, clib_memcpy (elem, headp, q->elsize); q->head++; + /* $$$$ JFC shouldn't this be == 0? */ if (q->cursize == q->maxsize) need_broadcast = 1; diff --git a/src/vlibmemory/unix_shared_memory_queue.h b/src/vlibmemory/unix_shared_memory_queue.h index f758f17c..13800065 100644 --- a/src/vlibmemory/unix_shared_memory_queue.h +++ b/src/vlibmemory/unix_shared_memory_queue.h @@ -29,7 +29,7 @@ typedef struct _unix_shared_memory_queue pthread_cond_t condvar; /* 8 bytes */ int head; int tail; - int cursize; + volatile int cursize; int maxsize; int elsize; int consumer_pid; diff --git a/src/vnet.am b/src/vnet.am index 64484e18..923f61d8 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -324,11 +324,7 @@ libvnet_la_SOURCES += \ vnet/ip/ip_input_acl.c \ vnet/ip/lookup.c \ vnet/ip/ping.c \ - vnet/ip/punt.c \ - vnet/ip/udp_format.c \ - vnet/ip/udp_init.c \ - vnet/ip/udp_local.c \ - vnet/ip/udp_pg.c + vnet/ip/punt.c nobase_include_HEADERS += \ vnet/ip/format.h \ @@ -354,11 +350,7 @@ nobase_include_HEADERS += \ vnet/ip/ports.def \ vnet/ip/protocols.def \ vnet/ip/punt_error.def \ - vnet/ip/punt.h \ - vnet/ip/tcp_packet.h \ - vnet/ip/udp_error.def \ - vnet/ip/udp.h \ - vnet/ip/udp_packet.h + vnet/ip/punt.h API_FILES += vnet/ip/ip.api @@ -473,6 +465,38 @@ test_map_LDADD = libvnet.la libvppinfra.la libvlib.la \ test_map_LDFLAGS = -static endif +######################################## +# Layer 4 protocol: tcp +######################################## +libvnet_la_SOURCES += \ + vnet/tcp/tcp_format.c \ + vnet/tcp/tcp_pg.c \ + vnet/tcp/tcp_syn_filter4.c \ + vnet/tcp/tcp_output.c \ + vnet/tcp/tcp_input.c \ + vnet/tcp/tcp_newreno.c \ + vnet/tcp/tcp.c + +nobase_include_HEADERS += \ + vnet/tcp/tcp_packet.h \ + vnet/tcp/tcp_timer.h \ + vnet/tcp/tcp.h + +######################################## +# Layer 4 protocol: udp +######################################## +libvnet_la_SOURCES += \ + vnet/udp/udp.c \ + vnet/udp/udp_input.c \ + vnet/udp/builtin_server.c \ + vnet/udp/udp_format.c \ + vnet/udp/udp_local.c \ + vnet/udp/udp_pg.c + +nobase_include_HEADERS += \ + vnet/udp/udp_error.def \ + vnet/udp/udp.h \ + vnet/udp/udp_packet.h ######################################## # Tunnel protocol: gre @@ -833,6 +857,28 @@ libvnet_la_SOURCES += \ nobase_include_HEADERS += \ vnet/devices/ssvm/ssvm_eth.h +######################################## +# session managmeent +######################################## + +libvnet_la_SOURCES += \ + vnet/session/session.c \ + vnet/session/node.c \ + vnet/session/transport.c \ + vnet/session/application.c \ + vnet/session/session_cli.c \ + vnet/session/hashes.c \ + vnet/session/application_interface.c \ + vnet/session/session_api.c + +nobase_include_HEADERS += \ + vnet/session/session.h \ + vnet/session/application.h \ + vnet/session/transport.h \ + vnet/session/application_interface.h + +API_FILES += vnet/session/session.api + ######################################## # Linux packet interface ######################################## diff --git a/src/vnet/api_errno.h b/src/vnet/api_errno.h index 8680ef7c..861a5767 100644 --- a/src/vnet/api_errno.h +++ b/src/vnet/api_errno.h @@ -91,14 +91,19 @@ _(INVALID_ADDRESS_FAMILY, -97, "Invalid address family") \ _(INVALID_SUB_SW_IF_INDEX, -98, "Invalid sub-interface sw_if_index") \ _(TABLE_TOO_BIG, -99, "Table too big") \ _(CANNOT_ENABLE_DISABLE_FEATURE, -100, "Cannot enable/disable feature") \ -_(BFD_EEXIST, -101, "Duplicate BFD object") \ -_(BFD_ENOENT, -102, "No such BFD object") \ -_(BFD_EINUSE, -103, "BFD object in use") \ -_(BFD_NOTSUPP, -104, "BFD feature not supported") \ -_(LISP_RLOC_LOCAL, -105, "RLOC address is local") \ -_(BFD_EAGAIN, -106, "BFD object cannot be manipulated at this time") \ -_(INVALID_GPE_MODE, -107, "Invalid GPE mode") \ -_(LISP_GPE_ENTRIES_PRESENT, -108, "LISP GPE entries are present") +_(BFD_EEXIST, -101, "Duplicate BFD object") \ +_(BFD_ENOENT, -102, "No such BFD object") \ +_(BFD_EINUSE, -103, "BFD object in use") \ +_(BFD_NOTSUPP, -104, "BFD feature not supported") \ +_(ADDRESS_IN_USE, -105, "Address in use") \ +_(ADDRESS_NOT_IN_USE, -106, "Address not in use") \ +_(QUEUE_FULL, -107, "Queue full") \ +_(UNKNOWN_URI_TYPE, -108, "Unknown URI type") \ +_(URI_FIFO_CREATE_FAILED, -109, "URI FIFO segment create failed") \ +_(LISP_RLOC_LOCAL, -110, "RLOC address is local") \ +_(BFD_EAGAIN, -111, "BFD object cannot be manipulated at this time") \ +_(INVALID_GPE_MODE, -112, "Invalid GPE mode") \ +_(LISP_GPE_ENTRIES_PRESENT, -113, "LISP GPE entries are present") typedef enum { diff --git a/src/vnet/bfd/bfd_udp.c b/src/vnet/bfd/bfd_udp.c index 146faad6..cf05089b 100644 --- a/src/vnet/bfd/bfd_udp.c +++ b/src/vnet/bfd/bfd_udp.c @@ -18,12 +18,12 @@ #include #include #include -#include +#include +#include #include #include #include #include -#include #include #include #include diff --git a/src/vnet/buffer.h b/src/vnet/buffer.h index f1cc6371..3de01f2a 100644 --- a/src/vnet/buffer.h +++ b/src/vnet/buffer.h @@ -277,6 +277,16 @@ typedef struct u16 buffer_advance; } device_input_feat; + /* TCP */ + struct + { + u32 connection_index; + u32 seq_number; + u32 seq_end; + u32 ack_number; + u8 flags; + } tcp; + u32 unused[6]; }; } vnet_buffer_opaque_t; diff --git a/src/vnet/classify/vnet_classify.c b/src/vnet/classify/vnet_classify.c index 6093e2ac..b651a1f1 100644 --- a/src/vnet/classify/vnet_classify.c +++ b/src/vnet/classify/vnet_classify.c @@ -695,8 +695,8 @@ int vnet_classify_add_del_table (vnet_classify_main_t * cm, } #define foreach_tcp_proto_field \ -_(src_port) \ -_(dst_port) +_(src) \ +_(dst) #define foreach_udp_proto_field \ _(src_port) \ diff --git a/src/vnet/dhcp/dhcp_proxy.h b/src/vnet/dhcp/dhcp_proxy.h index c0d79c41..4586d883 100644 --- a/src/vnet/dhcp/dhcp_proxy.h +++ b/src/vnet/dhcp/dhcp_proxy.h @@ -26,7 +26,7 @@ #include #include #include -#include +#include typedef enum { #define dhcp_proxy_error(n,s) DHCP_PROXY_ERROR_##n, diff --git a/src/vnet/flow/flow_report.h b/src/vnet/flow/flow_report.h index 4e764377..e8ed3818 100644 --- a/src/vnet/flow/flow_report.h +++ b/src/vnet/flow/flow_report.h @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/vnet/ip/ip.h b/src/vnet/ip/ip.h index 02a1a963..70b4ccd8 100644 --- a/src/vnet/ip/ip.h +++ b/src/vnet/ip/ip.h @@ -50,8 +50,8 @@ #include #include -#include -#include +#include +#include #include #include diff --git a/src/vnet/ip/ip4.h b/src/vnet/ip/ip4.h index b184fbae..4e075d0f 100644 --- a/src/vnet/ip/ip4.h +++ b/src/vnet/ip/ip4.h @@ -309,8 +309,8 @@ ip4_compute_flow_hash (const ip4_header_t * ip, b = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t1 : t2; b ^= (flow_hash_config & IP_FLOW_HASH_PROTO) ? ip->protocol : 0; - t1 = is_tcp_udp ? tcp->ports.src : 0; - t2 = is_tcp_udp ? tcp->ports.dst : 0; + t1 = is_tcp_udp ? tcp->src : 0; + t2 = is_tcp_udp ? tcp->dst : 0; t1 = (flow_hash_config & IP_FLOW_HASH_SRC_PORT) ? t1 : 0; t2 = (flow_hash_config & IP_FLOW_HASH_DST_PORT) ? t2 : 0; @@ -334,6 +334,44 @@ u8 *format_ip4_forward_next_trace (u8 * s, va_list * args); u32 ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0); +#define IP_DF 0x4000 /* don't fragment */ + +/** + * Push IPv4 header to buffer + * + * This does not support fragmentation. + * + * @param vm - vlib_main + * @param b - buffer to write the header to + * @param src - source IP + * @param dst - destination IP + * @param prot - payload proto + * + * @return - pointer to start of IP header + */ +always_inline void * +vlib_buffer_push_ip4 (vlib_main_t * vm, vlib_buffer_t * b, + ip4_address_t * src, ip4_address_t * dst, int proto) +{ + ip4_header_t *ih; + + /* make some room */ + ih = vlib_buffer_push_uninit (b, sizeof (ip4_header_t)); + + ih->ip_version_and_header_length = 0x45; + ih->tos = 0; + ih->length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b)); + + /* No fragments */ + ih->flags_and_fragment_offset = clib_host_to_net_u16 (IP_DF); + ih->ttl = 255; + ih->protocol = proto; + ih->src_address.as_u32 = src->as_u32; + ih->dst_address.as_u32 = dst->as_u32; + + ih->checksum = ip4_header_checksum (ih); + return ih; +} #endif /* included_ip_ip4_h */ /* diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 8081b34b..66d91ab6 100644 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -1478,8 +1478,18 @@ ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0) return p0->flags; } -static uword -ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) +/* *INDENT-OFF* */ +VNET_FEATURE_ARC_INIT (ip4_local) = +{ + .arc_name = "ip4-local", + .start_nodes = VNET_FEATURES ("ip4-local"), +}; +/* *INDENT-ON* */ + +static inline uword +ip4_local_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, int head_of_feature_arc) { ip4_main_t *im = &ip4_main; ip_lookup_main_t *lm = &im->lookup_main; @@ -1487,6 +1497,7 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) u32 *from, *to_next, n_left_from, n_left_to_next; vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip4_input_node.index); + u8 arc_index = vnet_feat_arc_ip4_local.feature_arc_index; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -1513,7 +1524,7 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) i32 len_diff0, len_diff1; u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0; u8 error1, is_udp1, is_tcp_udp1, good_tcp_udp1, proto1; - u8 enqueue_code; + u32 sw_if_index0, sw_if_index1; pi0 = to_next[0] = from[0]; pi1 = to_next[1] = from[1]; @@ -1522,6 +1533,8 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) to_next += 2; n_left_to_next -= 2; + next0 = next1 = IP_LOCAL_NEXT_DROP; + p0 = vlib_get_buffer (vm, pi0); p1 = vlib_get_buffer (vm, pi1); @@ -1531,14 +1544,18 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; vnet_buffer (p1)->ip.start_of_ip_header = p1->current_data; - fib_index0 = vec_elt (im->fib_index_by_sw_if_index, - vnet_buffer (p0)->sw_if_index[VLIB_RX]); + sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; + sw_if_index1 = vnet_buffer (p1)->sw_if_index[VLIB_RX]; + + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index0); + fib_index1 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index1); + + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index0); fib_index0 = (vnet_buffer (p0)->sw_if_index[VLIB_TX] == (u32) ~ 0) ? fib_index0 : vnet_buffer (p0)->sw_if_index[VLIB_TX]; - fib_index1 = vec_elt (im->fib_index_by_sw_if_index, - vnet_buffer (p1)->sw_if_index[VLIB_RX]); + fib_index1 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index1); fib_index1 = (vnet_buffer (p1)->sw_if_index[VLIB_TX] == (u32) ~ 0) ? fib_index1 : vnet_buffer (p1)->sw_if_index[VLIB_TX]; @@ -1557,6 +1574,13 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) until support of IP frag reassembly is implemented */ proto0 = ip4_is_fragment (ip0) ? 0xfe : ip0->protocol; proto1 = ip4_is_fragment (ip1) ? 0xfe : ip1->protocol; + + if (head_of_feature_arc == 0) + { + error0 = error1 = IP4_ERROR_UNKNOWN_PROTOCOL; + goto skip_checks; + } + is_udp0 = proto0 == IP_PROTOCOL_UDP; is_udp1 = proto1 == IP_PROTOCOL_UDP; is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP; @@ -1686,6 +1710,7 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) next0 = lm->local_next_by_ip_protocol[proto0]; next1 = lm->local_next_by_ip_protocol[proto1]; + skip_checks: next0 = error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0; next1 = @@ -1694,44 +1719,17 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) p0->error = error0 ? error_node->errors[error0] : 0; p1->error = error1 ? error_node->errors[error1] : 0; - enqueue_code = (next0 != next_index) + 2 * (next1 != next_index); - - if (PREDICT_FALSE (enqueue_code != 0)) + if (head_of_feature_arc) { - switch (enqueue_code) - { - case 1: - /* A B A */ - to_next[-2] = pi1; - to_next -= 1; - n_left_to_next += 1; - vlib_set_next_frame_buffer (vm, node, next0, pi0); - break; - - case 2: - /* A A B */ - to_next -= 1; - n_left_to_next += 1; - vlib_set_next_frame_buffer (vm, node, next1, pi1); - break; - - case 3: - /* A B B or A B C */ - to_next -= 2; - n_left_to_next += 2; - vlib_set_next_frame_buffer (vm, node, next0, pi0); - vlib_set_next_frame_buffer (vm, node, next1, pi1); - if (next0 == next1) - { - vlib_put_next_frame (vm, node, next_index, - n_left_to_next); - next_index = next1; - vlib_get_next_frame (vm, node, next_index, to_next, - n_left_to_next); - } - break; - } + if (PREDICT_TRUE (error0 == (u8) IP4_ERROR_UNKNOWN_PROTOCOL)) + vnet_feature_arc_start (arc_index, sw_if_index0, &next0, p0); + if (PREDICT_TRUE (error1 == (u8) IP4_ERROR_UNKNOWN_PROTOCOL)) + vnet_feature_arc_start (arc_index, sw_if_index1, &next1, p1); } + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next, + n_left_to_next, pi0, pi1, + next0, next1); } while (n_left_from > 0 && n_left_to_next > 0) @@ -1746,6 +1744,7 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0; load_balance_t *lb0; const dpo_id_t *dpo0; + u32 sw_if_index0; pi0 = to_next[0] = from[0]; from += 1; @@ -1753,14 +1752,18 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) to_next += 1; n_left_to_next -= 1; + next0 = IP_LOCAL_NEXT_DROP; + p0 = vlib_get_buffer (vm, pi0); ip0 = vlib_buffer_get_current (p0); vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; - fib_index0 = vec_elt (im->fib_index_by_sw_if_index, - vnet_buffer (p0)->sw_if_index[VLIB_RX]); + sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; + + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index0); + fib_index0 = (vnet_buffer (p0)->sw_if_index[VLIB_TX] == (u32) ~ 0) ? fib_index0 : vnet_buffer (p0)->sw_if_index[VLIB_TX]; @@ -1775,6 +1778,13 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) /* Treat IP frag packets as "experimental" protocol for now until support of IP frag reassembly is implemented */ proto0 = ip4_is_fragment (ip0) ? 0xfe : ip0->protocol; + + if (head_of_feature_arc == 0) + { + error0 = IP4_ERROR_UNKNOWN_PROTOCOL; + goto skip_check; + } + is_udp0 = proto0 == IP_PROTOCOL_UDP; is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP; @@ -1847,6 +1857,8 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip0->dst_address.as_u32 != 0xFFFFFFFF) ? IP4_ERROR_SRC_LOOKUP_MISS : error0); + skip_check: + next0 = lm->local_next_by_ip_protocol[proto0]; next0 = @@ -1854,18 +1866,15 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) p0->error = error0 ? error_node->errors[error0] : 0; - if (PREDICT_FALSE (next0 != next_index)) + if (head_of_feature_arc) { - n_left_to_next += 1; - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - - next_index = next0; - vlib_get_next_frame (vm, node, next_index, to_next, - n_left_to_next); - to_next[0] = pi0; - to_next += 1; - n_left_to_next -= 1; + if (PREDICT_TRUE (error0 == (u8) IP4_ERROR_UNKNOWN_PROTOCOL)) + vnet_feature_arc_start (arc_index, sw_if_index0, &next0, p0); } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, pi0, next0); + } vlib_put_next_frame (vm, node, next_index, n_left_to_next); @@ -1874,21 +1883,57 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) return frame->n_vectors; } +static uword +ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return ip4_local_inline (vm, node, frame, 1 /* head of feature arc */ ); +} + +/* *INDENT-OFF* */ VLIB_REGISTER_NODE (ip4_local_node) = { - .function = ip4_local,.name = "ip4-local",.vector_size = - sizeof (u32),.format_trace = - format_ip4_forward_next_trace,.n_next_nodes = - IP_LOCAL_N_NEXT,.next_nodes = + .function = ip4_local, + .name = "ip4-local", + .vector_size = sizeof (u32), + .format_trace = format_ip4_forward_next_trace, + .n_next_nodes = IP_LOCAL_N_NEXT, + .next_nodes = { - [IP_LOCAL_NEXT_DROP] = "error-drop", - [IP_LOCAL_NEXT_PUNT] = "error-punt", - [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup", - [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",} -,}; + [IP_LOCAL_NEXT_DROP] = "error-drop", + [IP_LOCAL_NEXT_PUNT] = "error-punt", + [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup", + [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",}, +}; +/* *INDENT-ON* */ VLIB_NODE_FUNCTION_MULTIARCH (ip4_local_node, ip4_local); +static uword +ip4_local_end_of_arc (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return ip4_local_inline (vm, node, frame, 0 /* head of feature arc */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ip4_local_end_of_arc_node,static) = { + .function = ip4_local_end_of_arc, + .name = "ip4-local-end-of-arc", + .vector_size = sizeof (u32), + + .format_trace = format_ip4_forward_next_trace, + .sibling_of = "ip4-local", +}; + +VLIB_NODE_FUNCTION_MULTIARCH (ip4_local_end_of_arc_node, ip4_local_end_of_arc) + +VNET_FEATURE_INIT (ip4_local_end_of_arc, static) = { + .arc_name = "ip4-local", + .node_name = "ip4-local-end-of-arc", + .runs_before = 0, /* not before any other features */ +}; +/* *INDENT-ON* */ + void ip4_register_protocol (u32 protocol, u32 node_index) { diff --git a/src/vnet/ip/ip4_packet.h b/src/vnet/ip/ip4_packet.h index 8da788b4..b2c1fcd4 100644 --- a/src/vnet/ip/ip4_packet.h +++ b/src/vnet/ip/ip4_packet.h @@ -41,7 +41,7 @@ #define included_ip4_packet_h #include /* for ip_csum_t */ -#include /* for tcp_header_t */ +#include /* for tcp_header_t */ #include /* for clib_net_to_host_u16 */ /* IP4 address which can be accessed either as 4 bytes @@ -342,10 +342,10 @@ ip4_tcp_reply_x1 (ip4_header_t * ip0, tcp_header_t * tcp0) ip0->src_address.data_u32 = dst0; ip0->dst_address.data_u32 = src0; - src0 = tcp0->ports.src; - dst0 = tcp0->ports.dst; - tcp0->ports.src = dst0; - tcp0->ports.dst = src0; + src0 = tcp0->src; + dst0 = tcp0->dst; + tcp0->src = dst0; + tcp0->dst = src0; } always_inline void @@ -363,14 +363,14 @@ ip4_tcp_reply_x2 (ip4_header_t * ip0, ip4_header_t * ip1, ip0->dst_address.data_u32 = src0; ip1->dst_address.data_u32 = src1; - src0 = tcp0->ports.src; - src1 = tcp1->ports.src; - dst0 = tcp0->ports.dst; - dst1 = tcp1->ports.dst; - tcp0->ports.src = dst0; - tcp1->ports.src = dst1; - tcp0->ports.dst = src0; - tcp1->ports.dst = src1; + src0 = tcp0->src; + src1 = tcp1->src; + dst0 = tcp0->dst; + dst1 = tcp1->dst; + tcp0->src = dst0; + tcp1->src = dst1; + tcp0->dst = src0; + tcp1->dst = src1; } #endif /* included_ip4_packet_h */ diff --git a/src/vnet/ip/ip6.h b/src/vnet/ip/ip6.h index 5456f0f2..2615fbfa 100644 --- a/src/vnet/ip/ip6.h +++ b/src/vnet/ip/ip6.h @@ -461,8 +461,8 @@ ip6_compute_flow_hash (const ip6_header_t * ip, b = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t1 : t2; b ^= (flow_hash_config & IP_FLOW_HASH_PROTO) ? ip->protocol : 0; - t1 = is_tcp_udp ? tcp->ports.src : 0; - t2 = is_tcp_udp ? tcp->ports.dst : 0; + t1 = is_tcp_udp ? tcp->src : 0; + t2 = is_tcp_udp ? tcp->dst : 0; t1 = (flow_hash_config & IP_FLOW_HASH_SRC_PORT) ? t1 : 0; t2 = (flow_hash_config & IP_FLOW_HASH_DST_PORT) ? t2 : 0; @@ -497,6 +497,46 @@ int ip6_hbh_register_option (u8 option, int ip6_hbh_unregister_option (u8 option); void ip6_hbh_set_next_override (uword next); +/** + * Push IPv6 header to buffer + * + * @param vm - vlib_main + * @param b - buffer to write the header to + * @param src - source IP + * @param dst - destination IP + * @param prot - payload proto + * + * @return - pointer to start of IP header + */ +always_inline void * +vlib_buffer_push_ip6 (vlib_main_t * vm, vlib_buffer_t * b, + ip6_address_t * src, ip6_address_t * dst, int proto) +{ + ip6_header_t *ip6h; + u16 payload_length; + + /* make some room */ + ip6h = vlib_buffer_push_uninit (b, sizeof (ip6_header_t)); + + ip6h->ip_version_traffic_class_and_flow_label = + clib_host_to_net_u32 (0x6 << 28); + + /* calculate ip6 payload length */ + payload_length = vlib_buffer_length_in_chain (vm, b); + payload_length -= sizeof (*ip6h); + + ip6h->payload_length = clib_host_to_net_u16 (payload_length); + + ip6h->hop_limit = 0xff; + ip6h->protocol = proto; + clib_memcpy (ip6h->src_address.as_u8, src->as_u8, + sizeof (ip6h->src_address)); + clib_memcpy (ip6h->dst_address.as_u8, dst->as_u8, + sizeof (ip6h->src_address)); + + return ip6h; +} + #endif /* included_ip_ip6_h */ /* diff --git a/src/vnet/ip/ip6_packet.h b/src/vnet/ip/ip6_packet.h index 1e551c8b..4fd14b96 100644 --- a/src/vnet/ip/ip6_packet.h +++ b/src/vnet/ip/ip6_packet.h @@ -40,7 +40,7 @@ #ifndef included_ip6_packet_h #define included_ip6_packet_h -#include +#include #include typedef union @@ -373,10 +373,10 @@ ip6_tcp_reply_x1 (ip6_header_t * ip0, tcp_header_t * tcp0) { u16 src0, dst0; - src0 = tcp0->ports.src; - dst0 = tcp0->ports.dst; - tcp0->ports.src = dst0; - tcp0->ports.dst = src0; + src0 = tcp0->src; + dst0 = tcp0->dst; + tcp0->src = dst0; + tcp0->dst = src0; } } @@ -400,14 +400,14 @@ ip6_tcp_reply_x2 (ip6_header_t * ip0, ip6_header_t * ip1, { u16 src0, dst0, src1, dst1; - src0 = tcp0->ports.src; - src1 = tcp1->ports.src; - dst0 = tcp0->ports.dst; - dst1 = tcp1->ports.dst; - tcp0->ports.src = dst0; - tcp1->ports.src = dst1; - tcp0->ports.dst = src0; - tcp1->ports.dst = src1; + src0 = tcp0->src; + src1 = tcp1->src; + dst0 = tcp0->dst; + dst1 = tcp1->dst; + tcp0->src = dst0; + tcp1->src = dst1; + tcp0->dst = src0; + tcp1->dst = src1; } } diff --git a/src/vnet/ip/punt.c b/src/vnet/ip/punt.c index 9c735128..48558401 100644 --- a/src/vnet/ip/punt.c +++ b/src/vnet/ip/punt.c @@ -23,7 +23,7 @@ */ #include #include -#include +#include #include #define foreach_punt_next \ diff --git a/src/vnet/ip/tcp_packet.h b/src/vnet/ip/tcp_packet.h deleted file mode 100644 index 93f73e01..00000000 --- a/src/vnet/ip/tcp_packet.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * ip4/tcp_packet.h: TCP packet format (see RFC 793) - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef included_tcp_packet_h -#define included_tcp_packet_h - -/* TCP flags bit 0 first. */ -#define foreach_tcp_flag \ - _ (FIN) \ - _ (SYN) \ - _ (RST) \ - _ (PSH) \ - _ (ACK) \ - _ (URG) \ - _ (ECE) \ - _ (CWR) - -enum -{ -#define _(f) TCP_FLAG_BIT_##f, - foreach_tcp_flag -#undef _ - TCP_N_FLAG_BITS, - -#define _(f) TCP_FLAG_##f = 1 << TCP_FLAG_BIT_##f, - foreach_tcp_flag -#undef _ -}; - -typedef struct -{ - /* Source and destination port. */ - union - { - union - { - struct - { - u16 src, dst; - }; - u32 src_and_dst; - } ports; - struct - { - u16 src_port, dst_port; - }; - }; - - /* Sequence and acknowledgment number. */ - u32 seq_number, ack_number; - - /* Size of TCP header in 32-bit units plus 4 reserved bits. */ - u8 tcp_header_u32s_and_reserved; - - /* see foreach_tcp_flag for enumation of tcp flags. */ - u8 flags; - - /* Current window advertised by sender. - This is the number of bytes sender is willing to receive - right now. */ - u16 window; - - /* Checksum of TCP pseudo header and data. */ - u16 checksum; - - u16 urgent_pointer; -} tcp_header_t; - -always_inline int -tcp_header_bytes (tcp_header_t * t) -{ - return (t->tcp_header_u32s_and_reserved >> 4) * sizeof (u32); -} - -/* TCP options. */ -typedef enum tcp_option_type -{ - TCP_OPTION_END = 0, - TCP_OPTION_NOP = 1, - TCP_OPTION_MSS = 2, - TCP_OPTION_WINDOW_SCALE = 3, - TCP_OPTION_SACK_PERMITTED = 4, - TCP_OPTION_SACK_BLOCK = 5, - TCP_OPTION_TIME_STAMP = 8, -} tcp_option_type_t; - -/* All except NOP and END have 1 byte length field. */ -typedef struct -{ - tcp_option_type_t type:8; - - /* Length of this option in bytes. */ - u8 length; -} tcp_option_with_length_t; - -#endif /* included_tcp_packet_h */ - - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ip/udp.h b/src/vnet/ip/udp.h deleted file mode 100644 index bad58b5d..00000000 --- a/src/vnet/ip/udp.h +++ /dev/null @@ -1,315 +0,0 @@ -/* - * ip/udp.h: udp protocol - * - * Copyright (c) 2013 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef included_udp_h -#define included_udp_h - -#include -#include -#include -#include -#include -#include -#include - -typedef enum -{ -#define udp_error(n,s) UDP_ERROR_##n, -#include -#undef udp_error - UDP_N_ERROR, -} udp_error_t; - -#define foreach_udp4_dst_port \ -_ (67, dhcp_to_server) \ -_ (68, dhcp_to_client) \ -_ (500, ikev2) \ -_ (3784, bfd4) \ -_ (3785, bfd_echo4) \ -_ (4341, lisp_gpe) \ -_ (4342, lisp_cp) \ -_ (4739, ipfix) \ -_ (4789, vxlan) \ -_ (4789, vxlan6) \ -_ (4790, vxlan_gpe) \ -_ (6633, vpath_3) - - -#define foreach_udp6_dst_port \ -_ (547, dhcpv6_to_server) \ -_ (546, dhcpv6_to_client) \ -_ (3784, bfd6) \ -_ (3785, bfd_echo6) \ -_ (4341, lisp_gpe6) \ -_ (4342, lisp_cp6) \ -_ (4790, vxlan6_gpe) \ -_ (6633, vpath6_3) - -typedef enum -{ -#define _(n,f) UDP_DST_PORT_##f = n, - foreach_udp4_dst_port foreach_udp6_dst_port -#undef _ -} udp_dst_port_t; - -typedef enum -{ -#define _(n,f) UDP6_DST_PORT_##f = n, - foreach_udp6_dst_port -#undef _ -} udp6_dst_port_t; - -typedef struct -{ - /* Name (a c string). */ - char *name; - - /* GRE protocol type in host byte order. */ - udp_dst_port_t dst_port; - - /* Node which handles this type. */ - u32 node_index; - - /* Next index for this type. */ - u32 next_index; -} udp_dst_port_info_t; - -typedef enum -{ - UDP_IP6 = 0, - UDP_IP4, /* the code is full of is_ip4... */ - N_UDP_AF, -} udp_af_t; - -typedef struct -{ - udp_dst_port_info_t *dst_port_infos[N_UDP_AF]; - - /* Hash tables mapping name/protocol to protocol info index. */ - uword *dst_port_info_by_name[N_UDP_AF]; - uword *dst_port_info_by_dst_port[N_UDP_AF]; - - /* convenience */ - vlib_main_t *vlib_main; -} udp_main_t; - -always_inline udp_dst_port_info_t * -udp_get_dst_port_info (udp_main_t * um, udp_dst_port_t dst_port, u8 is_ip4) -{ - uword *p = hash_get (um->dst_port_info_by_dst_port[is_ip4], dst_port); - return p ? vec_elt_at_index (um->dst_port_infos[is_ip4], p[0]) : 0; -} - -format_function_t format_udp_header; -format_function_t format_udp_rx_trace; - -unformat_function_t unformat_udp_header; - -void udp_register_dst_port (vlib_main_t * vm, - udp_dst_port_t dst_port, - u32 node_index, u8 is_ip4); - -void udp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add); - -always_inline void -ip_udp_fixup_one (vlib_main_t * vm, vlib_buffer_t * b0, u8 is_ip4) -{ - u16 new_l0; - udp_header_t *udp0; - - if (is_ip4) - { - ip4_header_t *ip0; - ip_csum_t sum0; - u16 old_l0 = 0; - - ip0 = vlib_buffer_get_current (b0); - - /* fix the ing outer-IP checksum */ - sum0 = ip0->checksum; - /* old_l0 always 0, see the rewrite setup */ - new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)); - - sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t, - length /* changed member */ ); - ip0->checksum = ip_csum_fold (sum0); - ip0->length = new_l0; - - /* Fix UDP length */ - udp0 = (udp_header_t *) (ip0 + 1); - new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) - - sizeof (*ip0)); - udp0->length = new_l0; - } - else - { - ip6_header_t *ip0; - int bogus0; - - ip0 = vlib_buffer_get_current (b0); - - new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) - - sizeof (*ip0)); - ip0->payload_length = new_l0; - - /* Fix UDP length */ - udp0 = (udp_header_t *) (ip0 + 1); - udp0->length = new_l0; - - udp0->checksum = - ip6_tcp_udp_icmp_compute_checksum (vm, b0, ip0, &bogus0); - ASSERT (bogus0 == 0); - - if (udp0->checksum == 0) - udp0->checksum = 0xffff; - } -} - -always_inline void -ip_udp_encap_one (vlib_main_t * vm, vlib_buffer_t * b0, u8 * ec0, word ec_len, - u8 is_ip4) -{ - vlib_buffer_advance (b0, -ec_len); - - if (is_ip4) - { - ip4_header_t *ip0; - - ip0 = vlib_buffer_get_current (b0); - - /* Apply the encap string. */ - clib_memcpy (ip0, ec0, ec_len); - ip_udp_fixup_one (vm, b0, 1); - } - else - { - ip6_header_t *ip0; - - ip0 = vlib_buffer_get_current (b0); - - /* Apply the encap string. */ - clib_memcpy (ip0, ec0, ec_len); - ip_udp_fixup_one (vm, b0, 0); - } -} - -always_inline void -ip_udp_encap_two (vlib_main_t * vm, vlib_buffer_t * b0, vlib_buffer_t * b1, - u8 * ec0, u8 * ec1, word ec_len, u8 is_v4) -{ - u16 new_l0, new_l1; - udp_header_t *udp0, *udp1; - - ASSERT (_vec_len (ec0) == _vec_len (ec1)); - - vlib_buffer_advance (b0, -ec_len); - vlib_buffer_advance (b1, -ec_len); - - if (is_v4) - { - ip4_header_t *ip0, *ip1; - ip_csum_t sum0, sum1; - u16 old_l0 = 0, old_l1 = 0; - - ip0 = vlib_buffer_get_current (b0); - ip1 = vlib_buffer_get_current (b1); - - /* Apply the encap string */ - clib_memcpy (ip0, ec0, ec_len); - clib_memcpy (ip1, ec1, ec_len); - - /* fix the ing outer-IP checksum */ - sum0 = ip0->checksum; - sum1 = ip1->checksum; - - /* old_l0 always 0, see the rewrite setup */ - new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)); - new_l1 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1)); - - sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t, - length /* changed member */ ); - sum1 = ip_csum_update (sum1, old_l1, new_l1, ip4_header_t, - length /* changed member */ ); - - ip0->checksum = ip_csum_fold (sum0); - ip1->checksum = ip_csum_fold (sum1); - - ip0->length = new_l0; - ip1->length = new_l1; - - /* Fix UDP length */ - udp0 = (udp_header_t *) (ip0 + 1); - udp1 = (udp_header_t *) (ip1 + 1); - - new_l0 = - clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) - - sizeof (*ip0)); - new_l1 = - clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1) - - sizeof (*ip1)); - udp0->length = new_l0; - udp1->length = new_l1; - } - else - { - ip6_header_t *ip0, *ip1; - int bogus0, bogus1; - - ip0 = vlib_buffer_get_current (b0); - ip1 = vlib_buffer_get_current (b1); - - /* Apply the encap string. */ - clib_memcpy (ip0, ec0, ec_len); - clib_memcpy (ip1, ec1, ec_len); - - new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) - - sizeof (*ip0)); - new_l1 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1) - - sizeof (*ip1)); - ip0->payload_length = new_l0; - ip1->payload_length = new_l1; - - /* Fix UDP length */ - udp0 = (udp_header_t *) (ip0 + 1); - udp1 = (udp_header_t *) (ip1 + 1); - - udp0->length = new_l0; - udp1->length = new_l1; - - udp0->checksum = - ip6_tcp_udp_icmp_compute_checksum (vm, b0, ip0, &bogus0); - udp1->checksum = - ip6_tcp_udp_icmp_compute_checksum (vm, b1, ip1, &bogus1); - ASSERT (bogus0 == 0); - ASSERT (bogus1 == 0); - - if (udp0->checksum == 0) - udp0->checksum = 0xffff; - if (udp1->checksum == 0) - udp1->checksum = 0xffff; - } -} - -#endif /* included_udp_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ip/udp_error.def b/src/vnet/ip/udp_error.def deleted file mode 100644 index bfdae0ac..00000000 --- a/src/vnet/ip/udp_error.def +++ /dev/null @@ -1,21 +0,0 @@ -/* - * udp_error.def: udp errors - * - * Copyright (c) 2013-2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -udp_error (NONE, "no error") -udp_error (NO_LISTENER, "no listener for dst port") -udp_error (LENGTH_ERROR, "UDP packets with length errors") -udp_error (PUNT, "no listener punt") diff --git a/src/vnet/ip/udp_format.c b/src/vnet/ip/udp_format.c deleted file mode 100644 index abdf561e..00000000 --- a/src/vnet/ip/udp_format.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * ip/udp_format.c: udp formatting - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include - -/* Format UDP header. */ -u8 * -format_udp_header (u8 * s, va_list * args) -{ - udp_header_t *udp = va_arg (*args, udp_header_t *); - u32 max_header_bytes = va_arg (*args, u32); - uword indent; - u32 header_bytes = sizeof (udp[0]); - - /* Nothing to do. */ - if (max_header_bytes < sizeof (udp[0])) - return format (s, "UDP header truncated"); - - indent = format_get_indent (s); - indent += 2; - - s = format (s, "UDP: %d -> %d", - clib_net_to_host_u16 (udp->src_port), - clib_net_to_host_u16 (udp->dst_port)); - - s = format (s, "\n%Ulength %d, checksum 0x%04x", - format_white_space, indent, - clib_net_to_host_u16 (udp->length), - clib_net_to_host_u16 (udp->checksum)); - - /* Recurse into next protocol layer. */ - if (max_header_bytes != 0 && header_bytes < max_header_bytes) - { - ip_main_t *im = &ip_main; - tcp_udp_port_info_t *pi; - - pi = ip_get_tcp_udp_port_info (im, udp->dst_port); - - if (pi && pi->format_header) - s = format (s, "\n%U%U", - format_white_space, indent - 2, pi->format_header, - /* next protocol header */ (udp + 1), - max_header_bytes - sizeof (udp[0])); - } - - return s; -} - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ip/udp_init.c b/src/vnet/ip/udp_init.c deleted file mode 100644 index 1241ca4a..00000000 --- a/src/vnet/ip/udp_init.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * ip/udp_init.c: udp initialization - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include - -clib_error_t * -udp_init (vlib_main_t * vm) -{ - ip_main_t *im = &ip_main; - ip_protocol_info_t *pi; - clib_error_t *error; - - error = vlib_call_init_function (vm, ip_main_init); - - if (!error) - { - pi = ip_get_protocol_info (im, IP_PROTOCOL_UDP); - if (pi == 0) - return clib_error_return (0, "UDP protocol info AWOL"); - pi->format_header = format_udp_header; - pi->unformat_pg_edit = unformat_pg_udp_header; - } - - return 0; -} - -VLIB_INIT_FUNCTION (udp_init); - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ip/udp_local.c b/src/vnet/ip/udp_local.c deleted file mode 100644 index 13ab6e4f..00000000 --- a/src/vnet/ip/udp_local.c +++ /dev/null @@ -1,645 +0,0 @@ -/* - * node.c: udp packet processing - * - * Copyright (c) 2013 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -udp_main_t udp_main; - -#define foreach_udp_input_next \ - _ (PUNT, "error-punt") \ - _ (DROP, "error-drop") \ - _ (ICMP4_ERROR, "ip4-icmp-error") \ - _ (ICMP6_ERROR, "ip6-icmp-error") - -typedef enum -{ -#define _(s,n) UDP_INPUT_NEXT_##s, - foreach_udp_input_next -#undef _ - UDP_INPUT_N_NEXT, -} udp_input_next_t; - -typedef struct -{ - u16 src_port; - u16 dst_port; - u8 bound; -} udp_rx_trace_t; - -u8 * -format_udp_rx_trace (u8 * s, va_list * args) -{ - CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); - CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); - udp_rx_trace_t *t = va_arg (*args, udp_rx_trace_t *); - - s = format (s, "UDP: src-port %d dst-port %d%s", - clib_net_to_host_u16 (t->src_port), - clib_net_to_host_u16 (t->dst_port), - t->bound ? "" : " (no listener)"); - return s; -} - -typedef struct -{ - /* Sparse vector mapping udp dst_port in network byte order - to next index. */ - u16 *next_by_dst_port; - u8 punt_unknown; -} udp_input_runtime_t; - -vlib_node_registration_t udp4_input_node; -vlib_node_registration_t udp6_input_node; - -always_inline uword -udp46_input_inline (vlib_main_t * vm, - vlib_node_runtime_t * node, - vlib_frame_t * from_frame, int is_ip4) -{ - udp_input_runtime_t *rt = is_ip4 ? - (void *) vlib_node_get_runtime_data (vm, udp4_input_node.index) - : (void *) vlib_node_get_runtime_data (vm, udp6_input_node.index); - __attribute__ ((unused)) u32 n_left_from, next_index, *from, *to_next; - word n_no_listener = 0; - u8 punt_unknown = rt->punt_unknown; - - from = vlib_frame_vector_args (from_frame); - n_left_from = from_frame->n_vectors; - - next_index = node->cached_next_index; - - while (n_left_from > 0) - { - u32 n_left_to_next; - - vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - - while (n_left_from >= 4 && n_left_to_next >= 2) - { - u32 bi0, bi1; - vlib_buffer_t *b0, *b1; - udp_header_t *h0 = 0, *h1 = 0; - u32 i0, i1, dst_port0, dst_port1; - u32 advance0, advance1; - u32 error0, next0, error1, next1; - - /* Prefetch next iteration. */ - { - vlib_buffer_t *p2, *p3; - - p2 = vlib_get_buffer (vm, from[2]); - p3 = vlib_get_buffer (vm, from[3]); - - vlib_prefetch_buffer_header (p2, LOAD); - vlib_prefetch_buffer_header (p3, LOAD); - - CLIB_PREFETCH (p2->data, sizeof (h0[0]), LOAD); - CLIB_PREFETCH (p3->data, sizeof (h1[0]), LOAD); - } - - bi0 = from[0]; - bi1 = from[1]; - to_next[0] = bi0; - to_next[1] = bi1; - from += 2; - to_next += 2; - n_left_to_next -= 2; - n_left_from -= 2; - - b0 = vlib_get_buffer (vm, bi0); - b1 = vlib_get_buffer (vm, bi1); - - /* ip4/6_local hands us the ip header, not the udp header */ - if (is_ip4) - { - advance0 = sizeof (ip4_header_t); - advance1 = sizeof (ip4_header_t); - } - else - { - advance0 = sizeof (ip6_header_t); - advance1 = sizeof (ip6_header_t); - } - - if (PREDICT_FALSE (b0->current_length < advance0 + sizeof (*h0))) - { - error0 = UDP_ERROR_LENGTH_ERROR; - next0 = UDP_INPUT_NEXT_DROP; - } - else - { - vlib_buffer_advance (b0, advance0); - h0 = vlib_buffer_get_current (b0); - error0 = next0 = 0; - if (PREDICT_FALSE (clib_net_to_host_u16 (h0->length) > - vlib_buffer_length_in_chain (vm, b0))) - { - error0 = UDP_ERROR_LENGTH_ERROR; - next0 = UDP_INPUT_NEXT_DROP; - } - } - - if (PREDICT_FALSE (b1->current_length < advance1 + sizeof (*h1))) - { - error1 = UDP_ERROR_LENGTH_ERROR; - next1 = UDP_INPUT_NEXT_DROP; - } - else - { - vlib_buffer_advance (b1, advance1); - h1 = vlib_buffer_get_current (b1); - error1 = next1 = 0; - if (PREDICT_FALSE (clib_net_to_host_u16 (h1->length) > - vlib_buffer_length_in_chain (vm, b1))) - { - error1 = UDP_ERROR_LENGTH_ERROR; - next1 = UDP_INPUT_NEXT_DROP; - } - } - - /* Index sparse array with network byte order. */ - dst_port0 = (error0 == 0) ? h0->dst_port : 0; - dst_port1 = (error1 == 0) ? h1->dst_port : 0; - sparse_vec_index2 (rt->next_by_dst_port, dst_port0, dst_port1, - &i0, &i1); - next0 = (error0 == 0) ? vec_elt (rt->next_by_dst_port, i0) : next0; - next1 = (error1 == 0) ? vec_elt (rt->next_by_dst_port, i1) : next1; - - if (PREDICT_FALSE (i0 == SPARSE_VEC_INVALID_INDEX)) - { - // move the pointer back so icmp-error can find the - // ip packet header - vlib_buffer_advance (b0, -(word) advance0); - - if (PREDICT_FALSE (punt_unknown)) - { - b0->error = node->errors[UDP_ERROR_PUNT]; - next0 = UDP_INPUT_NEXT_PUNT; - } - else if (is_ip4) - { - icmp4_error_set_vnet_buffer (b0, - ICMP4_destination_unreachable, - ICMP4_destination_unreachable_port_unreachable, - 0); - next0 = UDP_INPUT_NEXT_ICMP4_ERROR; - n_no_listener++; - } - else - { - icmp6_error_set_vnet_buffer (b0, - ICMP6_destination_unreachable, - ICMP6_destination_unreachable_port_unreachable, - 0); - next0 = UDP_INPUT_NEXT_ICMP6_ERROR; - n_no_listener++; - } - } - else - { - b0->error = node->errors[UDP_ERROR_NONE]; - // advance to the payload - vlib_buffer_advance (b0, sizeof (*h0)); - } - - if (PREDICT_FALSE (i1 == SPARSE_VEC_INVALID_INDEX)) - { - // move the pointer back so icmp-error can find the - // ip packet header - vlib_buffer_advance (b1, -(word) advance1); - - if (PREDICT_FALSE (punt_unknown)) - { - b1->error = node->errors[UDP_ERROR_PUNT]; - next1 = UDP_INPUT_NEXT_PUNT; - } - else if (is_ip4) - { - icmp4_error_set_vnet_buffer (b1, - ICMP4_destination_unreachable, - ICMP4_destination_unreachable_port_unreachable, - 0); - next1 = UDP_INPUT_NEXT_ICMP4_ERROR; - n_no_listener++; - } - else - { - icmp6_error_set_vnet_buffer (b1, - ICMP6_destination_unreachable, - ICMP6_destination_unreachable_port_unreachable, - 0); - next1 = UDP_INPUT_NEXT_ICMP6_ERROR; - n_no_listener++; - } - } - else - { - b1->error = node->errors[UDP_ERROR_NONE]; - // advance to the payload - vlib_buffer_advance (b1, sizeof (*h1)); - } - - if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) - { - udp_rx_trace_t *tr = vlib_add_trace (vm, node, - b0, sizeof (*tr)); - if (b0->error != node->errors[UDP_ERROR_LENGTH_ERROR]) - { - tr->src_port = h0 ? h0->src_port : 0; - tr->dst_port = h0 ? h0->dst_port : 0; - tr->bound = (next0 != UDP_INPUT_NEXT_ICMP4_ERROR && - next0 != UDP_INPUT_NEXT_ICMP6_ERROR); - } - } - if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED)) - { - udp_rx_trace_t *tr = vlib_add_trace (vm, node, - b1, sizeof (*tr)); - if (b1->error != node->errors[UDP_ERROR_LENGTH_ERROR]) - { - tr->src_port = h1 ? h1->src_port : 0; - tr->dst_port = h1 ? h1->dst_port : 0; - tr->bound = (next1 != UDP_INPUT_NEXT_ICMP4_ERROR && - next1 != UDP_INPUT_NEXT_ICMP6_ERROR); - } - } - - vlib_validate_buffer_enqueue_x2 (vm, node, next_index, - to_next, n_left_to_next, - bi0, bi1, next0, next1); - } - - while (n_left_from > 0 && n_left_to_next > 0) - { - u32 bi0; - vlib_buffer_t *b0; - udp_header_t *h0 = 0; - u32 i0, next0; - u32 advance0; - - bi0 = from[0]; - to_next[0] = bi0; - from += 1; - to_next += 1; - n_left_from -= 1; - n_left_to_next -= 1; - - b0 = vlib_get_buffer (vm, bi0); - - /* ip4/6_local hands us the ip header, not the udp header */ - if (is_ip4) - advance0 = sizeof (ip4_header_t); - else - advance0 = sizeof (ip6_header_t); - - if (PREDICT_FALSE (b0->current_length < advance0 + sizeof (*h0))) - { - b0->error = node->errors[UDP_ERROR_LENGTH_ERROR]; - next0 = UDP_INPUT_NEXT_DROP; - goto trace_x1; - } - - vlib_buffer_advance (b0, advance0); - - h0 = vlib_buffer_get_current (b0); - - if (PREDICT_TRUE (clib_net_to_host_u16 (h0->length) <= - vlib_buffer_length_in_chain (vm, b0))) - { - i0 = sparse_vec_index (rt->next_by_dst_port, h0->dst_port); - next0 = vec_elt (rt->next_by_dst_port, i0); - - if (PREDICT_FALSE (i0 == SPARSE_VEC_INVALID_INDEX)) - { - // move the pointer back so icmp-error can find the - // ip packet header - vlib_buffer_advance (b0, -(word) advance0); - - if (PREDICT_FALSE (punt_unknown)) - { - b0->error = node->errors[UDP_ERROR_PUNT]; - next0 = UDP_INPUT_NEXT_PUNT; - } - else if (is_ip4) - { - icmp4_error_set_vnet_buffer (b0, - ICMP4_destination_unreachable, - ICMP4_destination_unreachable_port_unreachable, - 0); - next0 = UDP_INPUT_NEXT_ICMP4_ERROR; - n_no_listener++; - } - else - { - icmp6_error_set_vnet_buffer (b0, - ICMP6_destination_unreachable, - ICMP6_destination_unreachable_port_unreachable, - 0); - next0 = UDP_INPUT_NEXT_ICMP6_ERROR; - n_no_listener++; - } - } - else - { - b0->error = node->errors[UDP_ERROR_NONE]; - // advance to the payload - vlib_buffer_advance (b0, sizeof (*h0)); - } - } - else - { - b0->error = node->errors[UDP_ERROR_LENGTH_ERROR]; - next0 = UDP_INPUT_NEXT_DROP; - } - - trace_x1: - if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) - { - udp_rx_trace_t *tr = vlib_add_trace (vm, node, - b0, sizeof (*tr)); - if (b0->error != node->errors[UDP_ERROR_LENGTH_ERROR]) - { - tr->src_port = h0->src_port; - tr->dst_port = h0->dst_port; - tr->bound = (next0 != UDP_INPUT_NEXT_ICMP4_ERROR && - next0 != UDP_INPUT_NEXT_ICMP6_ERROR); - } - } - - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - bi0, next0); - } - - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - } - vlib_error_count (vm, node->node_index, UDP_ERROR_NO_LISTENER, - n_no_listener); - return from_frame->n_vectors; -} - -static char *udp_error_strings[] = { -#define udp_error(n,s) s, -#include "udp_error.def" -#undef udp_error -}; - -static uword -udp4_input (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * from_frame) -{ - return udp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ); -} - -static uword -udp6_input (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * from_frame) -{ - return udp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ); -} - - -/* *INDENT-OFF* */ -VLIB_REGISTER_NODE (udp4_input_node) = { - .function = udp4_input, - .name = "ip4-udp-lookup", - /* Takes a vector of packets. */ - .vector_size = sizeof (u32), - - .runtime_data_bytes = sizeof (udp_input_runtime_t), - - .n_errors = UDP_N_ERROR, - .error_strings = udp_error_strings, - - .n_next_nodes = UDP_INPUT_N_NEXT, - .next_nodes = { -#define _(s,n) [UDP_INPUT_NEXT_##s] = n, - foreach_udp_input_next -#undef _ - }, - - .format_buffer = format_udp_header, - .format_trace = format_udp_rx_trace, - .unformat_buffer = unformat_udp_header, -}; -/* *INDENT-ON* */ - -VLIB_NODE_FUNCTION_MULTIARCH (udp4_input_node, udp4_input); - -/* *INDENT-OFF* */ -VLIB_REGISTER_NODE (udp6_input_node) = { - .function = udp6_input, - .name = "ip6-udp-lookup", - /* Takes a vector of packets. */ - .vector_size = sizeof (u32), - - .runtime_data_bytes = sizeof (udp_input_runtime_t), - - .n_errors = UDP_N_ERROR, - .error_strings = udp_error_strings, - - .n_next_nodes = UDP_INPUT_N_NEXT, - .next_nodes = { -#define _(s,n) [UDP_INPUT_NEXT_##s] = n, - foreach_udp_input_next -#undef _ - }, - - .format_buffer = format_udp_header, - .format_trace = format_udp_rx_trace, - .unformat_buffer = unformat_udp_header, -}; -/* *INDENT-ON* */ - -VLIB_NODE_FUNCTION_MULTIARCH (udp6_input_node, udp6_input); - -static void -add_dst_port (udp_main_t * um, - udp_dst_port_t dst_port, char *dst_port_name, u8 is_ip4) -{ - udp_dst_port_info_t *pi; - u32 i; - - vec_add2 (um->dst_port_infos[is_ip4], pi, 1); - i = pi - um->dst_port_infos[is_ip4]; - - pi->name = dst_port_name; - pi->dst_port = dst_port; - pi->next_index = pi->node_index = ~0; - - hash_set (um->dst_port_info_by_dst_port[is_ip4], dst_port, i); - - if (pi->name) - hash_set_mem (um->dst_port_info_by_name[is_ip4], pi->name, i); -} - -void -udp_register_dst_port (vlib_main_t * vm, - udp_dst_port_t dst_port, u32 node_index, u8 is_ip4) -{ - udp_main_t *um = &udp_main; - udp_dst_port_info_t *pi; - udp_input_runtime_t *rt; - u16 *n; - - { - clib_error_t *error = vlib_call_init_function (vm, udp_local_init); - if (error) - clib_error_report (error); - } - - pi = udp_get_dst_port_info (um, dst_port, is_ip4); - if (!pi) - { - add_dst_port (um, dst_port, 0, is_ip4); - pi = udp_get_dst_port_info (um, dst_port, is_ip4); - ASSERT (pi); - } - - pi->node_index = node_index; - pi->next_index = vlib_node_add_next (vm, - is_ip4 ? udp4_input_node.index - : udp6_input_node.index, node_index); - - /* Setup udp protocol -> next index sparse vector mapping. */ - rt = vlib_node_get_runtime_data - (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index); - n = sparse_vec_validate (rt->next_by_dst_port, - clib_host_to_net_u16 (dst_port)); - n[0] = pi->next_index; -} - -void -udp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add) -{ - udp_input_runtime_t *rt; - - { - clib_error_t *error = vlib_call_init_function (vm, udp_local_init); - if (error) - clib_error_report (error); - } - - rt = vlib_node_get_runtime_data - (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index); - - rt->punt_unknown = is_add; -} - -/* Parse a UDP header. */ -uword -unformat_udp_header (unformat_input_t * input, va_list * args) -{ - u8 **result = va_arg (*args, u8 **); - udp_header_t *udp; - __attribute__ ((unused)) int old_length; - u16 src_port, dst_port; - - /* Allocate space for IP header. */ - { - void *p; - - old_length = vec_len (*result); - vec_add2 (*result, p, sizeof (ip4_header_t)); - udp = p; - } - - memset (udp, 0, sizeof (udp[0])); - if (unformat (input, "src-port %d dst-port %d", &src_port, &dst_port)) - { - udp->src_port = clib_host_to_net_u16 (src_port); - udp->dst_port = clib_host_to_net_u16 (dst_port); - return 1; - } - return 0; -} - -static void -udp_setup_node (vlib_main_t * vm, u32 node_index) -{ - vlib_node_t *n = vlib_get_node (vm, node_index); - pg_node_t *pn = pg_get_node (node_index); - - n->format_buffer = format_udp_header; - n->unformat_buffer = unformat_udp_header; - pn->unformat_edit = unformat_pg_udp_header; -} - -clib_error_t * -udp_local_init (vlib_main_t * vm) -{ - udp_input_runtime_t *rt; - udp_main_t *um = &udp_main; - int i; - - { - clib_error_t *error; - error = vlib_call_init_function (vm, udp_init); - if (error) - clib_error_report (error); - } - - - for (i = 0; i < 2; i++) - { - um->dst_port_info_by_name[i] = hash_create_string (0, sizeof (uword)); - um->dst_port_info_by_dst_port[i] = hash_create (0, sizeof (uword)); - } - - udp_setup_node (vm, udp4_input_node.index); - udp_setup_node (vm, udp6_input_node.index); - - rt = vlib_node_get_runtime_data (vm, udp4_input_node.index); - - rt->next_by_dst_port = sparse_vec_new - ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]), - /* bits in index */ BITS (((udp_header_t *) 0)->dst_port)); - - rt->punt_unknown = 0; - -#define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 1 /* is_ip4 */); - foreach_udp4_dst_port -#undef _ - rt = vlib_node_get_runtime_data (vm, udp6_input_node.index); - - rt->next_by_dst_port = sparse_vec_new - ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]), - /* bits in index */ BITS (((udp_header_t *) 0)->dst_port)); - - rt->punt_unknown = 0; - -#define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 0 /* is_ip4 */); - foreach_udp6_dst_port -#undef _ - ip4_register_protocol (IP_PROTOCOL_UDP, udp4_input_node.index); - /* Note: ip6 differs from ip4, UDP is hotwired to ip6-udp-lookup */ - return 0; -} - -VLIB_INIT_FUNCTION (udp_local_init); - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ip/udp_packet.h b/src/vnet/ip/udp_packet.h deleted file mode 100644 index beea3059..00000000 --- a/src/vnet/ip/udp_packet.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * ip4/udp_packet.h: UDP packet format - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef included_udp_packet_h -#define included_udp_packet_h - -typedef struct -{ - /* Source and destination port. */ - u16 src_port, dst_port; - - /* Length of UDP header plus payload. */ - u16 length; - - /* Checksum of UDP pseudo-header and data or - zero if checksum is disabled. */ - u16 checksum; -} udp_header_t; - -#endif /* included_udp_packet_h */ - - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ip/udp_pg.c b/src/vnet/ip/udp_pg.c deleted file mode 100644 index c9d8d38c..00000000 --- a/src/vnet/ip/udp_pg.c +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * ip/udp_pg: UDP packet-generator interface - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include /* for unformat_udp_udp_port */ - -#define UDP_PG_EDIT_LENGTH (1 << 0) -#define UDP_PG_EDIT_CHECKSUM (1 << 1) - -always_inline void -udp_pg_edit_function_inline (pg_main_t * pg, - pg_stream_t * s, - pg_edit_group_t * g, - u32 * packets, u32 n_packets, u32 flags) -{ - vlib_main_t *vm = vlib_get_main (); - u32 ip_offset, udp_offset; - - udp_offset = g->start_byte_offset; - ip_offset = (g - 1)->start_byte_offset; - - while (n_packets >= 1) - { - vlib_buffer_t *p0; - ip4_header_t *ip0; - udp_header_t *udp0; - u32 udp_len0; - - p0 = vlib_get_buffer (vm, packets[0]); - n_packets -= 1; - packets += 1; - - ip0 = (void *) (p0->data + ip_offset); - udp0 = (void *) (p0->data + udp_offset); - udp_len0 = clib_net_to_host_u16 (ip0->length) - sizeof (ip0[0]); - - if (flags & UDP_PG_EDIT_LENGTH) - udp0->length = - clib_net_to_host_u16 (vlib_buffer_length_in_chain (vm, p0) - - ip_offset); - - /* Initialize checksum with header. */ - if (flags & UDP_PG_EDIT_CHECKSUM) - { - ip_csum_t sum0; - - sum0 = clib_mem_unaligned (&ip0->src_address, u64); - - sum0 = ip_csum_with_carry - (sum0, clib_host_to_net_u32 (udp_len0 + (ip0->protocol << 16))); - - /* Invalidate possibly old checksum. */ - udp0->checksum = 0; - - sum0 = - ip_incremental_checksum_buffer (vm, p0, udp_offset, udp_len0, - sum0); - - sum0 = ~ip_csum_fold (sum0); - - /* Zero checksum means checksumming disabled. */ - sum0 = sum0 != 0 ? sum0 : 0xffff; - - udp0->checksum = sum0; - } - } -} - -static void -udp_pg_edit_function (pg_main_t * pg, - pg_stream_t * s, - pg_edit_group_t * g, u32 * packets, u32 n_packets) -{ - switch (g->edit_function_opaque) - { - case UDP_PG_EDIT_LENGTH: - udp_pg_edit_function_inline (pg, s, g, packets, n_packets, - UDP_PG_EDIT_LENGTH); - break; - - case UDP_PG_EDIT_CHECKSUM: - udp_pg_edit_function_inline (pg, s, g, packets, n_packets, - UDP_PG_EDIT_CHECKSUM); - break; - - case UDP_PG_EDIT_CHECKSUM | UDP_PG_EDIT_LENGTH: - udp_pg_edit_function_inline (pg, s, g, packets, n_packets, - UDP_PG_EDIT_CHECKSUM | UDP_PG_EDIT_LENGTH); - break; - - default: - ASSERT (0); - break; - } -} - -typedef struct -{ - pg_edit_t src_port, dst_port; - pg_edit_t length; - pg_edit_t checksum; -} pg_udp_header_t; - -static inline void -pg_udp_header_init (pg_udp_header_t * p) -{ - /* Initialize fields that are not bit fields in the IP header. */ -#define _(f) pg_edit_init (&p->f, udp_header_t, f); - _(src_port); - _(dst_port); - _(length); - _(checksum); -#undef _ -} - -uword -unformat_pg_udp_header (unformat_input_t * input, va_list * args) -{ - pg_stream_t *s = va_arg (*args, pg_stream_t *); - pg_udp_header_t *p; - u32 group_index; - - p = pg_create_edit_group (s, sizeof (p[0]), sizeof (udp_header_t), - &group_index); - pg_udp_header_init (p); - - /* Defaults. */ - p->checksum.type = PG_EDIT_UNSPECIFIED; - p->length.type = PG_EDIT_UNSPECIFIED; - - if (!unformat (input, "UDP: %U -> %U", - unformat_pg_edit, - unformat_tcp_udp_port, &p->src_port, - unformat_pg_edit, unformat_tcp_udp_port, &p->dst_port)) - goto error; - - /* Parse options. */ - while (1) - { - if (unformat (input, "length %U", - unformat_pg_edit, unformat_pg_number, &p->length)) - ; - - else if (unformat (input, "checksum %U", - unformat_pg_edit, unformat_pg_number, &p->checksum)) - ; - - /* Can't parse input: try next protocol level. */ - else - break; - } - - { - ip_main_t *im = &ip_main; - u16 dst_port; - tcp_udp_port_info_t *pi; - - pi = 0; - if (p->dst_port.type == PG_EDIT_FIXED) - { - dst_port = pg_edit_get_value (&p->dst_port, PG_EDIT_LO); - pi = ip_get_tcp_udp_port_info (im, dst_port); - } - - if (pi && pi->unformat_pg_edit - && unformat_user (input, pi->unformat_pg_edit, s)) - ; - - else if (!unformat_user (input, unformat_pg_payload, s)) - goto error; - - p = pg_get_edit_group (s, group_index); - if (p->checksum.type == PG_EDIT_UNSPECIFIED - || p->length.type == PG_EDIT_UNSPECIFIED) - { - pg_edit_group_t *g = pg_stream_get_group (s, group_index); - g->edit_function = udp_pg_edit_function; - g->edit_function_opaque = 0; - if (p->checksum.type == PG_EDIT_UNSPECIFIED) - g->edit_function_opaque |= UDP_PG_EDIT_CHECKSUM; - if (p->length.type == PG_EDIT_UNSPECIFIED) - g->edit_function_opaque |= UDP_PG_EDIT_LENGTH; - } - - return 1; - } - -error: - /* Free up any edits we may have added. */ - pg_free_edit_group (s); - return 0; -} - - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ipsec/ikev2.c b/src/vnet/ipsec/ikev2.c index 09209334..2c1074d8 100644 --- a/src/vnet/ipsec/ikev2.c +++ b/src/vnet/ipsec/ikev2.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/vnet/ipsec/ikev2_cli.c b/src/vnet/ipsec/ikev2_cli.c index 5c88d8d4..05ed4e60 100644 --- a/src/vnet/ipsec/ikev2_cli.c +++ b/src/vnet/ipsec/ikev2_cli.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/vnet/ipsec/ikev2_crypto.c b/src/vnet/ipsec/ikev2_crypto.c index c201d3eb..ca56158f 100644 --- a/src/vnet/ipsec/ikev2_crypto.c +++ b/src/vnet/ipsec/ikev2_crypto.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/vnet/lisp-cp/packets.c b/src/vnet/lisp-cp/packets.c index 3a4f421b..f24024f1 100644 --- a/src/vnet/lisp-cp/packets.c +++ b/src/vnet/lisp-cp/packets.c @@ -15,7 +15,7 @@ #include #include -#include +#include /* Returns IP ID for the packet */ /* static u16 ip_id = 0; @@ -141,61 +141,6 @@ pkt_push_udp (vlib_main_t * vm, vlib_buffer_t * b, u16 sp, u16 dp) return uh; } -void * -pkt_push_ipv4 (vlib_main_t * vm, vlib_buffer_t * b, ip4_address_t * src, - ip4_address_t * dst, int proto) -{ - ip4_header_t *ih; - - /* make some room */ - ih = vlib_buffer_push_uninit (b, sizeof (ip4_header_t)); - - ih->ip_version_and_header_length = 0x45; - ih->tos = 0; - ih->length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b)); - - /* iph->fragment_id = clib_host_to_net_u16(get_IP_ID ()); */ - - /* TODO: decide if we allow fragments in case of control */ - ih->flags_and_fragment_offset = clib_host_to_net_u16 (IP_DF); - ih->ttl = 255; - ih->protocol = proto; - ih->src_address.as_u32 = src->as_u32; - ih->dst_address.as_u32 = dst->as_u32; - - ih->checksum = ip4_header_checksum (ih); - return ih; -} - -void * -pkt_push_ipv6 (vlib_main_t * vm, vlib_buffer_t * b, ip6_address_t * src, - ip6_address_t * dst, int proto) -{ - ip6_header_t *ip6h; - u16 payload_length; - - /* make some room */ - ip6h = vlib_buffer_push_uninit (b, sizeof (ip6_header_t)); - - ip6h->ip_version_traffic_class_and_flow_label = - clib_host_to_net_u32 (0x6 << 28); - - /* calculate ip6 payload length */ - payload_length = vlib_buffer_length_in_chain (vm, b); - payload_length -= sizeof (*ip6h); - - ip6h->payload_length = clib_host_to_net_u16 (payload_length); - - ip6h->hop_limit = 0xff; - ip6h->protocol = proto; - clib_memcpy (ip6h->src_address.as_u8, src->as_u8, - sizeof (ip6h->src_address)); - clib_memcpy (ip6h->dst_address.as_u8, dst->as_u8, - sizeof (ip6h->src_address)); - - return ip6h; -} - void * pkt_push_ip (vlib_main_t * vm, vlib_buffer_t * b, ip_address_t * src, ip_address_t * dst, u32 proto) @@ -210,12 +155,12 @@ pkt_push_ip (vlib_main_t * vm, vlib_buffer_t * b, ip_address_t * src, switch (ip_addr_version (src)) { case IP4: - return pkt_push_ipv4 (vm, b, &ip_addr_v4 (src), &ip_addr_v4 (dst), - proto); + return vlib_buffer_push_ip4 (vm, b, &ip_addr_v4 (src), + &ip_addr_v4 (dst), proto); break; case IP6: - return pkt_push_ipv6 (vm, b, &ip_addr_v6 (src), &ip_addr_v6 (dst), - proto); + return vlib_buffer_push_ip6 (vm, b, &ip_addr_v6 (src), + &ip_addr_v6 (dst), proto); break; } diff --git a/src/vnet/lisp-cp/packets.h b/src/vnet/lisp-cp/packets.h index 212a1d78..f6da3bf4 100644 --- a/src/vnet/lisp-cp/packets.h +++ b/src/vnet/lisp-cp/packets.h @@ -26,51 +26,6 @@ void *pkt_push_udp_and_ip (vlib_main_t * vm, vlib_buffer_t * b, u16 sp, void *pkt_push_ecm_hdr (vlib_buffer_t * b); -always_inline u8 * -vlib_buffer_get_tail (vlib_buffer_t * b) -{ - return b->data + b->current_data + b->current_length; -} - -always_inline void * -vlib_buffer_put_uninit (vlib_buffer_t * b, u8 size) -{ - /* XXX should make sure there's enough space! */ - void *p = vlib_buffer_get_tail (b); - b->current_length += size; - return p; -} - -always_inline void * -vlib_buffer_push_uninit (vlib_buffer_t * b, u8 size) -{ - /* XXX should make sure there's enough space! */ - ASSERT (b->current_data >= size); - b->current_data -= size; - b->current_length += size; - - return vlib_buffer_get_current (b); -} - -always_inline void * -vlib_buffer_make_headroom (vlib_buffer_t * b, u8 size) -{ - /* XXX should make sure there's enough space! */ - b->current_data += size; - return vlib_buffer_get_current (b); -} - -always_inline void * -vlib_buffer_pull (vlib_buffer_t * b, u8 size) -{ - if (b->current_length < size) - return 0; - - void *data = vlib_buffer_get_current (b); - vlib_buffer_advance (b, size); - return data; -} - /* *INDENT-ON* */ /* diff --git a/src/vnet/lisp-gpe/interface.c b/src/vnet/lisp-gpe/interface.c index 13359277..292c7e6a 100644 --- a/src/vnet/lisp-gpe/interface.c +++ b/src/vnet/lisp-gpe/interface.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/vnet/lisp-gpe/lisp_gpe.h b/src/vnet/lisp-gpe/lisp_gpe.h index c898a7da..b5a50ec6 100644 --- a/src/vnet/lisp-gpe/lisp_gpe.h +++ b/src/vnet/lisp-gpe/lisp_gpe.h @@ -27,10 +27,12 @@ #include #include #include -#include +#include #include #include #include +#include +#include /** IP4-UDP-LISP encap header */ /* *INDENT-OFF* */ diff --git a/src/vnet/lisp-gpe/lisp_gpe_adjacency.c b/src/vnet/lisp-gpe/lisp_gpe_adjacency.c index 65006b81..dbcf7134 100644 --- a/src/vnet/lisp-gpe/lisp_gpe_adjacency.c +++ b/src/vnet/lisp-gpe/lisp_gpe_adjacency.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include /** * Memory pool of all adjacencies diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c new file mode 100644 index 00000000..a561e7d1 --- /dev/null +++ b/src/vnet/session/application.c @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +/* + * Pool from which we allocate all applications + */ +static application_t *app_pool; + +/* + * Hash table of apps by api client index + */ +static uword *app_by_api_client_index; + +int +application_api_queue_is_full (application_t * app) +{ + unix_shared_memory_queue_t *q; + + /* builtin servers are always OK */ + if (app->api_client_index == ~0) + return 0; + + q = vl_api_client_index_to_input_queue (app->api_client_index); + if (!q) + return 1; + + if (q->cursize == q->maxsize) + return 1; + return 0; +} + +static void +application_table_add (application_t * app) +{ + hash_set (app_by_api_client_index, app->api_client_index, app->index); +} + +static void +application_table_del (application_t * app) +{ + hash_unset (app_by_api_client_index, app->api_client_index); +} + +application_t * +application_lookup (u32 api_client_index) +{ + uword *p; + p = hash_get (app_by_api_client_index, api_client_index); + if (p) + return application_get (p[0]); + + return 0; +} + +void +application_del (application_t * app) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + api_main_t *am = &api_main; + void *oldheap; + session_manager_t *sm; + + if (app->mode == APP_SERVER) + { + sm = session_manager_get (app->session_manager_index); + session_manager_del (smm, sm); + } + + /* Free the event fifo in the /vpe-api shared-memory segment */ + oldheap = svm_push_data_heap (am->vlib_rp); + if (app->event_queue) + unix_shared_memory_queue_free (app->event_queue); + svm_pop_heap (oldheap); + + application_table_del (app); + + pool_put (app_pool, app); +} + +application_t * +application_new (application_type_t type, session_type_t sst, + u32 api_client_index, u32 flags, session_cb_vft_t * cb_fns) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + api_main_t *am = &api_main; + application_t *app; + void *oldheap; + session_manager_t *sm; + + pool_get (app_pool, app); + memset (app, 0, sizeof (*app)); + + /* Allocate event fifo in the /vpe-api shared-memory segment */ + oldheap = svm_push_data_heap (am->vlib_rp); + + /* Allocate server event queue */ + app->event_queue = + unix_shared_memory_queue_init (128 /* nels $$$$ config */ , + sizeof (session_fifo_event_t), + 0 /* consumer pid */ , + 0 + /* (do not) signal when queue non-empty */ + ); + + svm_pop_heap (oldheap); + + /* If a server, allocate session manager */ + if (type == APP_SERVER) + { + pool_get (smm->session_managers, sm); + memset (sm, 0, sizeof (*sm)); + + app->session_manager_index = sm - smm->session_managers; + } + else if (type == APP_CLIENT) + { + /* Allocate connect session manager if needed */ + if (smm->connect_manager_index[sst] == INVALID_INDEX) + connects_session_manager_init (smm, sst); + app->session_manager_index = smm->connect_manager_index[sst]; + } + + app->mode = type; + app->index = application_get_index (app); + app->session_type = sst; + app->api_client_index = api_client_index; + app->flags = flags; + app->cb_fns = *cb_fns; + + /* Add app to lookup by api_client_index table */ + application_table_add (app); + + return app; +} + +application_t * +application_get (u32 index) +{ + return pool_elt_at_index (app_pool, index); +} + +u32 +application_get_index (application_t * app) +{ + return app - app_pool; +} + +int +application_server_init (application_t * server, u32 segment_size, + u32 add_segment_size, u32 rx_fifo_size, + u32 tx_fifo_size, u8 ** segment_name) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + session_manager_t *sm; + int rv; + + sm = session_manager_get (server->session_manager_index); + + /* Add first segment */ + if ((rv = session_manager_add_first_segment (smm, sm, segment_size, + segment_name))) + { + return rv; + } + + /* Setup session manager */ + sm->add_segment_size = add_segment_size; + sm->rx_fifo_size = rx_fifo_size; + sm->tx_fifo_size = tx_fifo_size; + sm->add_segment = sm->add_segment_size != 0; + return 0; +} + +u8 * +format_application_server (u8 * s, va_list * args) +{ + application_t *srv = va_arg (*args, application_t *); + int verbose = va_arg (*args, int); + vl_api_registration_t *regp; + stream_session_t *listener; + u8 *server_name, *str, *seg_name; + u32 segment_size; + + if (srv == 0) + { + if (verbose) + s = format (s, "%-40s%-20s%-15s%-15s%-10s", "Connection", "Server", + "Segment", "API Client", "Cookie"); + else + s = format (s, "%-40s%-20s", "Connection", "Server"); + + return s; + } + + regp = vl_api_client_index_to_registration (srv->api_client_index); + if (!regp) + server_name = format (0, "%s%c", regp->name, 0); + else + server_name = regp->name; + + listener = stream_session_listener_get (srv->session_type, + srv->session_index); + str = format (0, "%U", format_stream_session, listener, verbose); + + session_manager_get_segment_info (listener->server_segment_index, &seg_name, + &segment_size); + if (verbose) + { + s = format (s, "%-40s%-20s%-20s%-10d%-10d", str, server_name, + seg_name, srv->api_client_index, srv->accept_cookie); + } + else + s = format (s, "%-40s%-20s", str, server_name); + return s; +} + +u8 * +format_application_client (u8 * s, va_list * args) +{ + application_t *client = va_arg (*args, application_t *); + int verbose = va_arg (*args, int); + stream_session_t *session; + u8 *str, *seg_name; + u32 segment_size; + + if (client == 0) + { + if (verbose) + s = + format (s, "%-40s%-20s%-10s", "Connection", "Segment", + "API Client"); + else + s = format (s, "%-40s", "Connection"); + + return s; + } + + session = stream_session_get (client->session_index, client->thread_index); + str = format (0, "%U", format_stream_session, session, verbose); + + session_manager_get_segment_info (session->server_segment_index, &seg_name, + &segment_size); + if (verbose) + { + s = format (s, "%-40s%-20s%-10d%", str, seg_name, + client->api_client_index); + } + else + s = format (s, "%-40s", str); + return s; +} + +static clib_error_t * +show_app_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + application_t *app; + int do_server = 0; + int do_client = 0; + int verbose = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "server")) + do_server = 1; + else if (unformat (input, "client")) + do_client = 1; + else if (unformat (input, "verbose")) + verbose = 1; + else + break; + } + + if (do_server) + { + if (pool_elts (app_pool)) + { + vlib_cli_output (vm, "%U", format_application_server, + 0 /* header */ , + verbose); + /* *INDENT-OFF* */ + pool_foreach (app, app_pool, + ({ + if (app->mode == APP_SERVER) + vlib_cli_output (vm, "%U", format_application_server, app, + verbose); + })); + /* *INDENT-ON* */ + } + else + vlib_cli_output (vm, "No active server bindings"); + } + + if (do_client) + { + if (pool_elts (app_pool)) + { + vlib_cli_output (vm, "%U", format_application_client, + 0 /* header */ , + verbose); + /* *INDENT-OFF* */ + pool_foreach (app, app_pool, + ({ + if (app->mode == APP_CLIENT) + vlib_cli_output (vm, "%U", format_application_client, app, + verbose); + })); + /* *INDENT-ON* */ + } + else + vlib_cli_output (vm, "No active server bindings"); + } + + return 0; +} + +VLIB_CLI_COMMAND (show_app_command, static) = +{ +.path = "show app",.short_help = + "show app [server|client] [verbose]",.function = show_app_command_fn,}; + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/application.h b/src/vnet/session/application.h new file mode 100644 index 00000000..027d6967 --- /dev/null +++ b/src/vnet/session/application.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_VNET_SESSION_APPLICATION_H_ +#define SRC_VNET_SESSION_APPLICATION_H_ + +#include +#include + +typedef enum +{ + APP_SERVER, + APP_CLIENT +} application_type_t; + +typedef struct _stream_session_cb_vft +{ + /** Notify server of new segment */ + int (*add_segment_callback) (u32 api_client_index, const u8 * seg_name, + u32 seg_size); + + /** Notify server of newly accepted session */ + int (*session_accept_callback) (stream_session_t * new_session); + + /* Connection request callback */ + int (*session_connected_callback) (u32 api_client_index, + stream_session_t * s, u8 code); + + /** Notify app that session is closing */ + void (*session_disconnect_callback) (stream_session_t * s); + + /** Notify app that session was reset */ + void (*session_reset_callback) (stream_session_t * s); + + /* Direct RX callback, for built-in servers */ + int (*builtin_server_rx_callback) (stream_session_t * session); + + /* Redirect connection to local server */ + int (*redirect_connect_callback) (u32 api_client_index, void *mp); +} session_cb_vft_t; + +typedef struct _application +{ + /** Index in server pool */ + u32 index; + + /** Flags */ + u32 flags; + + /** Binary API connection index, ~0 if internal */ + u32 api_client_index; + + /* */ + u32 api_context; + + /** Application listens for events on this svm queue */ + unix_shared_memory_queue_t *event_queue; + + /** Stream session type */ + u8 session_type; + + /* Stream server mode: accept or connect */ + u8 mode; + + u32 session_manager_index; + + /* + * Bind/Listen specific + */ + + /** Accept cookie, for multiple session flavors ($$$ maybe) */ + u32 accept_cookie; + + /** Index of the listen session or connect session */ + u32 session_index; + + /** Session thread index for client connect sessions */ + u32 thread_index; + + /* + * Callbacks: shoulder-taps for the server/client + */ + session_cb_vft_t cb_fns; +} application_t; + +application_t *application_new (application_type_t type, session_type_t sst, + u32 api_client_index, u32 flags, + session_cb_vft_t * cb_fns); +void application_del (application_t * app); +application_t *application_get (u32 index); +application_t *application_lookup (u32 api_client_index); +u32 application_get_index (application_t * app); + +int +application_server_init (application_t * server, u32 segment_size, + u32 add_segment_size, u32 rx_fifo_size, + u32 tx_fifo_size, u8 ** segment_name); +int application_api_queue_is_full (application_t * app); + +#endif /* SRC_VNET_SESSION_APPLICATION_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/application_interface.c b/src/vnet/session/application_interface.c new file mode 100644 index 00000000..0ea77fd8 --- /dev/null +++ b/src/vnet/session/application_interface.c @@ -0,0 +1,459 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include + +/** @file + VPP's application/session API bind/unbind/connect/disconnect calls +*/ + +static u8 +ip_is_zero (ip46_address_t * ip46_address, u8 is_ip4) +{ + if (is_ip4) + return (ip46_address->ip4.as_u32 == 0); + else + return (ip46_address->as_u64[0] == 0 && ip46_address->as_u64[1] == 0); +} + +static u8 +ip_is_local (ip46_address_t * ip46_address, u8 is_ip4) +{ + fib_node_index_t fei; + fib_entry_flag_t flags; + fib_prefix_t prefix; + + /* Check if requester is local */ + if (is_ip4) + { + prefix.fp_len = 32; + prefix.fp_proto = FIB_PROTOCOL_IP4; + } + else + { + prefix.fp_len = 128; + prefix.fp_proto = FIB_PROTOCOL_IP6; + } + + clib_memcpy (&prefix.fp_addr, ip46_address, sizeof (ip46_address)); + fei = fib_table_lookup (0, &prefix); + flags = fib_entry_get_flags (fei); + + return (flags & FIB_ENTRY_FLAG_LOCAL); +} + +int +api_parse_session_handle (u64 handle, u32 * session_index, u32 * thread_index) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + stream_session_t *pool; + + *thread_index = handle & 0xFFFFFFFF; + *session_index = handle >> 32; + + if (*thread_index >= vec_len (smm->sessions)) + return VNET_API_ERROR_INVALID_VALUE; + + pool = smm->sessions[*thread_index]; + + if (pool_is_free_index (pool, *session_index)) + return VNET_API_ERROR_INVALID_VALUE_2; + + return 0; +} + +int +vnet_bind_i (u32 api_client_index, ip46_address_t * ip46, u16 port_host_order, + session_type_t sst, u64 * options, session_cb_vft_t * cb_fns, + application_t ** app, u32 * len_seg_name, char *seg_name) +{ + u8 *segment_name = 0; + application_t *server = 0; + stream_session_t *listener; + u8 is_ip4; + + listener = + stream_session_lookup_listener (ip46, + clib_host_to_net_u16 (port_host_order), + sst); + + if (listener) + return VNET_API_ERROR_ADDRESS_IN_USE; + + if (application_lookup (api_client_index)) + { + clib_warning ("Only one bind supported for now"); + return VNET_API_ERROR_ADDRESS_IN_USE; + } + + is_ip4 = SESSION_TYPE_IP4_UDP == sst || SESSION_TYPE_IP4_TCP == sst; + if (!ip_is_zero (ip46, is_ip4) && !ip_is_local (ip46, is_ip4)) + return VNET_API_ERROR_INVALID_VALUE; + + /* Allocate and initialize stream server */ + server = application_new (APP_SERVER, sst, api_client_index, + options[SESSION_OPTIONS_FLAGS], cb_fns); + + application_server_init (server, options[SESSION_OPTIONS_SEGMENT_SIZE], + options[SESSION_OPTIONS_ADD_SEGMENT_SIZE], + options[SESSION_OPTIONS_RX_FIFO_SIZE], + options[SESSION_OPTIONS_TX_FIFO_SIZE], + &segment_name); + + /* Setup listen path down to transport */ + stream_session_start_listen (server->index, ip46, port_host_order); + + /* + * Return values + */ + + ASSERT (vec_len (segment_name) <= 128); + *len_seg_name = vec_len (segment_name); + memcpy (seg_name, segment_name, *len_seg_name); + *app = server; + + return 0; +} + +int +vnet_unbind_i (u32 api_client_index) +{ + application_t *server; + + /* + * Find the stream_server_t corresponding to the api client + */ + server = application_lookup (api_client_index); + if (!server) + return VNET_API_ERROR_INVALID_VALUE_2; + + /* Clear the listener */ + stream_session_stop_listen (server->index); + application_del (server); + + return 0; +} + +int +vnet_connect_i (u32 api_client_index, u32 api_context, session_type_t sst, + ip46_address_t * ip46, u16 port, u64 * options, void *mp, + session_cb_vft_t * cb_fns) +{ + stream_session_t *listener; + application_t *server, *app; + + /* + * Figure out if connecting to a local server + */ + listener = stream_session_lookup_listener (ip46, + clib_host_to_net_u16 (port), + sst); + if (listener) + { + server = application_get (listener->app_index); + + /* + * Server is willing to have a direct fifo connection created + * instead of going through the state machine, etc. + */ + if (server->flags & SESSION_OPTIONS_FLAGS_USE_FIFO) + return server->cb_fns. + redirect_connect_callback (server->api_client_index, mp); + } + + /* Create client app */ + app = application_new (APP_CLIENT, sst, api_client_index, + options[SESSION_OPTIONS_FLAGS], cb_fns); + + app->api_context = api_context; + + /* + * Not connecting to a local server. Create regular session + */ + stream_session_open (sst, ip46, port, app->index); + + return 0; +} + +/** + * unformat a vnet URI + * + * fifo://name + * tcp://ip46-addr:port + * udp://ip46-addr:port + * + * u8 ip46_address[16]; + * u16 port_in_host_byte_order; + * stream_session_type_t sst; + * u8 *fifo_name; + * + * if (unformat (input, "%U", unformat_vnet_uri, &ip46_address, + * &sst, &port, &fifo_name)) + * etc... + * + */ +uword +unformat_vnet_uri (unformat_input_t * input, va_list * args) +{ + ip46_address_t *address = va_arg (*args, ip46_address_t *); + session_type_t *sst = va_arg (*args, session_type_t *); + u16 *port = va_arg (*args, u16 *); + + if (unformat (input, "tcp://%U/%d", unformat_ip4_address, &address->ip4, + port)) + { + *sst = SESSION_TYPE_IP4_TCP; + return 1; + } + if (unformat (input, "udp://%U/%d", unformat_ip4_address, &address->ip4, + port)) + { + *sst = SESSION_TYPE_IP4_UDP; + return 1; + } + if (unformat (input, "udp://%U/%d", unformat_ip6_address, &address->ip6, + port)) + { + *sst = SESSION_TYPE_IP6_UDP; + return 1; + } + if (unformat (input, "tcp://%U/%d", unformat_ip6_address, &address->ip6, + port)) + { + *sst = SESSION_TYPE_IP6_TCP; + return 1; + } + + return 0; +} + +int +parse_uri (char *uri, session_type_t * sst, ip46_address_t * addr, + u16 * port_number_host_byte_order) +{ + unformat_input_t _input, *input = &_input; + + /* Make sure */ + uri = (char *) format (0, "%s%c", uri, 0); + + /* Parse uri */ + unformat_init_string (input, uri, strlen (uri)); + if (!unformat (input, "%U", unformat_vnet_uri, addr, sst, + port_number_host_byte_order)) + { + unformat_free (input); + return VNET_API_ERROR_INVALID_VALUE; + } + unformat_free (input); + + return 0; +} + +int +vnet_bind_uri (vnet_bind_args_t * a) +{ + application_t *server = 0; + u16 port_host_order; + session_type_t sst = SESSION_N_TYPES; + ip46_address_t ip46; + int rv; + + memset (&ip46, 0, sizeof (ip46)); + rv = parse_uri (a->uri, &sst, &ip46, &port_host_order); + if (rv) + return rv; + + if ((rv = vnet_bind_i (a->api_client_index, &ip46, port_host_order, sst, + a->options, a->session_cb_vft, &server, + &a->segment_name_length, a->segment_name))) + return rv; + + a->server_event_queue_address = (u64) server->event_queue; + return 0; +} + +session_type_t +session_type_from_proto_and_ip (session_api_proto_t proto, u8 is_ip4) +{ + if (proto == SESSION_PROTO_TCP) + { + if (is_ip4) + return SESSION_TYPE_IP4_TCP; + else + return SESSION_TYPE_IP6_TCP; + } + else + { + if (is_ip4) + return SESSION_TYPE_IP4_UDP; + else + return SESSION_TYPE_IP6_UDP; + } + + return SESSION_N_TYPES; +} + +int +vnet_unbind_uri (char *uri, u32 api_client_index) +{ + u16 port_number_host_byte_order; + session_type_t sst = SESSION_N_TYPES; + ip46_address_t ip46_address; + stream_session_t *listener; + int rv; + + rv = parse_uri (uri, &sst, &ip46_address, &port_number_host_byte_order); + if (rv) + return rv; + + listener = + stream_session_lookup_listener (&ip46_address, + clib_host_to_net_u16 + (port_number_host_byte_order), sst); + + if (!listener) + return VNET_API_ERROR_ADDRESS_NOT_IN_USE; + + /* External client? */ + if (api_client_index != ~0) + { + ASSERT (vl_api_client_index_to_registration (api_client_index)); + } + + return vnet_unbind_i (api_client_index); +} + +int +vnet_connect_uri (vnet_connect_args_t * a) +{ + ip46_address_t ip46_address; + u16 port; + session_type_t sst; + application_t *app; + int rv; + + app = application_lookup (a->api_client_index); + if (app) + { + clib_warning ("Already have a connect from this app"); + return VNET_API_ERROR_INVALID_VALUE_2; + } + + /* Parse uri */ + rv = parse_uri (a->uri, &sst, &ip46_address, &port); + if (rv) + return rv; + + return vnet_connect_i (a->api_client_index, a->api_context, sst, + &ip46_address, port, a->options, a->mp, + a->session_cb_vft); +} + +int +vnet_disconnect_session (u32 client_index, u32 session_index, + u32 thread_index) +{ + stream_session_t *session; + + session = stream_session_get (session_index, thread_index); + stream_session_disconnect (session); + + return 0; +} + + +int +vnet_bind (vnet_bind_args_t * a) +{ + application_t *server = 0; + session_type_t sst = SESSION_N_TYPES; + int rv; + + sst = session_type_from_proto_and_ip (a->proto, a->tep.is_ip4); + if ((rv = vnet_bind_i (a->api_client_index, &a->tep.ip, a->tep.port, sst, + a->options, a->session_cb_vft, &server, + &a->segment_name_length, a->segment_name))) + return rv; + + a->server_event_queue_address = (u64) server->event_queue; + a->handle = (u64) a->tep.vrf << 32 | (u64) server->session_index; + return 0; +} + +int +vnet_unbind (vnet_unbind_args_t * a) +{ + application_t *server; + + if (a->api_client_index != ~0) + { + ASSERT (vl_api_client_index_to_registration (a->api_client_index)); + } + + /* Make sure this is the right one */ + server = application_lookup (a->api_client_index); + ASSERT (server->session_index == (0xFFFFFFFF & a->handle)); + + /* TODO use handle to disambiguate namespaces/vrfs */ + return vnet_unbind_i (a->api_client_index); +} + +int +vnet_connect (vnet_connect_args_t * a) +{ + session_type_t sst; + application_t *app; + + app = application_lookup (a->api_client_index); + if (app) + { + clib_warning ("Already have a connect from this app"); + return VNET_API_ERROR_INVALID_VALUE_2; + } + + sst = session_type_from_proto_and_ip (a->proto, a->tep.is_ip4); + return vnet_connect_i (a->api_client_index, a->api_context, sst, &a->tep.ip, + a->tep.port, a->options, a->mp, a->session_cb_vft); +} + +int +vnet_disconnect (vnet_disconnect_args_t * a) +{ + stream_session_t *session; + u32 session_index, thread_index; + + if (api_parse_session_handle (a->handle, &session_index, &thread_index)) + { + clib_warning ("Invalid handle"); + return -1; + } + + session = stream_session_get (session_index, thread_index); + stream_session_disconnect (session); + + return 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/application_interface.h b/src/vnet/session/application_interface.h new file mode 100644 index 00000000..8d87c067 --- /dev/null +++ b/src/vnet/session/application_interface.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_uri_h__ +#define __included_uri_h__ + +#include +#include +#include +#include +#include +#include + +typedef enum _session_api_proto +{ + SESSION_PROTO_TCP, + SESSION_PROTO_UDP +} session_api_proto_t; + +typedef struct _vnet_bind_args_t +{ + union + { + char *uri; + struct + { + transport_endpoint_t tep; + session_api_proto_t proto; + }; + }; + + u32 api_client_index; + u64 *options; + session_cb_vft_t *session_cb_vft; + + /* + * Results + */ + char *segment_name; + u32 segment_name_length; + u64 server_event_queue_address; + u64 handle; +} vnet_bind_args_t; + +typedef struct _vnet_unbind_args_t +{ + union + { + char *uri; + u64 handle; + }; + u32 api_client_index; +} vnet_unbind_args_t; + +typedef struct _vnet_connect_args +{ + union + { + char *uri; + struct + { + transport_endpoint_t tep; + session_api_proto_t proto; + }; + }; + u32 api_client_index; + u32 api_context; + u64 *options; + session_cb_vft_t *session_cb_vft; + + /* Used for redirects */ + void *mp; +} vnet_connect_args_t; + +typedef struct _vnet_disconnect_args_t +{ + u64 handle; + u32 api_client_index; +} vnet_disconnect_args_t; + +/* Bind / connect options */ +typedef enum +{ + SESSION_OPTIONS_FLAGS, + SESSION_OPTIONS_SEGMENT_SIZE, + SESSION_OPTIONS_ADD_SEGMENT_SIZE, + SESSION_OPTIONS_RX_FIFO_SIZE, + SESSION_OPTIONS_TX_FIFO_SIZE, + SESSION_OPTIONS_ACCEPT_COOKIE, + SESSION_OPTIONS_N_OPTIONS +} session_options_index_t; + +/** Server can handle delegated connect requests from local clients */ +#define SESSION_OPTIONS_FLAGS_USE_FIFO (1<<0) + +/** Server wants vpp to add segments when out of memory for fifos */ +#define SESSION_OPTIONS_FLAGS_ADD_SEGMENT (1<<1) + +#define VNET_CONNECT_REDIRECTED 123 + +int vnet_bind_uri (vnet_bind_args_t *); +int vnet_unbind_uri (char *uri, u32 api_client_index); +int vnet_connect_uri (vnet_connect_args_t * a); +int +vnet_disconnect_session (u32 client_index, u32 session_index, + u32 thread_index); + +int vnet_bind (vnet_bind_args_t * a); +int vnet_connect (vnet_connect_args_t * a); +int vnet_unbind (vnet_unbind_args_t * a); +int vnet_disconnect (vnet_disconnect_args_t * a); + +int +api_parse_session_handle (u64 handle, u32 * session_index, + u32 * thread_index); + +#endif /* __included_uri_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/hashes.c b/src/vnet/session/hashes.c new file mode 100644 index 00000000..1808dd73 --- /dev/null +++ b/src/vnet/session/hashes.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Generate typed init functions for multiple hash table styles... */ + +#include +#include + +#include + +#undef __included_bihash_template_h__ + +#include +#include + +#include diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c new file mode 100644 index 00000000..e467f4e9 --- /dev/null +++ b/src/vnet/session/node.c @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include + +vlib_node_registration_t session_queue_node; + +typedef struct +{ + u32 session_index; + u32 server_thread_index; +} session_queue_trace_t; + +/* packet trace format function */ +static u8 * +format_session_queue_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + session_queue_trace_t *t = va_arg (*args, session_queue_trace_t *); + + s = format (s, "SESSION_QUEUE: session index %d, server thread index %d", + t->session_index, t->server_thread_index); + return s; +} + +vlib_node_registration_t session_queue_node; + +#define foreach_session_queue_error \ +_(TX, "Packets transmitted") \ +_(TIMER, "Timer events") + +typedef enum +{ +#define _(sym,str) SESSION_QUEUE_ERROR_##sym, + foreach_session_queue_error +#undef _ + SESSION_QUEUE_N_ERROR, +} session_queue_error_t; + +static char *session_queue_error_strings[] = { +#define _(sym,string) string, + foreach_session_queue_error +#undef _ +}; + +static u32 session_type_to_next[] = { + SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT, + SESSION_QUEUE_NEXT_IP4_LOOKUP, + SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT, + SESSION_QUEUE_NEXT_IP6_LOOKUP, +}; + +always_inline int +session_fifo_rx_i (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, session_fifo_event_t * e0, + stream_session_t * s0, u32 thread_index, int *n_tx_packets, + u8 peek_data) +{ + u32 n_trace = vlib_get_trace_count (vm, node); + u32 left_to_snd0, max_len_to_snd0, len_to_deq0, n_bufs, snd_space0; + u32 n_frame_bytes, n_frames_per_evt; + transport_connection_t *tc0; + transport_proto_vft_t *transport_vft; + u32 next_index, next0, *to_next, n_left_to_next, bi0; + vlib_buffer_t *b0; + u32 rx_offset; + u16 snd_mss0; + u8 *data0; + int i; + + next_index = next0 = session_type_to_next[s0->session_type]; + + transport_vft = session_get_transport_vft (s0->session_type); + tc0 = transport_vft->get_connection (s0->connection_index, thread_index); + + /* Make sure we have space to send and there's something to dequeue */ + snd_space0 = transport_vft->send_space (tc0); + snd_mss0 = transport_vft->send_mss (tc0); + + if (snd_space0 == 0 || svm_fifo_max_dequeue (s0->server_tx_fifo) == 0 + || snd_mss0 == 0) + return 0; + + ASSERT (e0->enqueue_length > 0); + + /* Ensure we're not writing more than transport window allows */ + max_len_to_snd0 = clib_min (e0->enqueue_length, snd_space0); + + if (peek_data) + { + /* Offset in rx fifo from where to peek data */ + rx_offset = transport_vft->rx_fifo_offset (tc0); + } + + /* TODO check if transport is willing to send len_to_snd0 + * bytes (Nagle) */ + + n_frame_bytes = snd_mss0 * VLIB_FRAME_SIZE; + n_frames_per_evt = ceil ((double) max_len_to_snd0 / n_frame_bytes); + + n_bufs = vec_len (smm->tx_buffers[thread_index]); + left_to_snd0 = max_len_to_snd0; + for (i = 0; i < n_frames_per_evt; i++) + { + /* Make sure we have at least one full frame of buffers ready */ + if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) + { + vec_validate (smm->tx_buffers[thread_index], + n_bufs + VLIB_FRAME_SIZE - 1); + n_bufs += + vlib_buffer_alloc (vm, &smm->tx_buffers[thread_index][n_bufs], + VLIB_FRAME_SIZE); + + /* buffer shortage + * XXX 0.9 because when debugging we might not get a full frame */ + if (PREDICT_FALSE (n_bufs < 0.9 * VLIB_FRAME_SIZE)) + { + /* Keep track of how much we've dequeued and exit */ + e0->enqueue_length -= max_len_to_snd0 - left_to_snd0; + return -1; + } + + _vec_len (smm->tx_buffers[thread_index]) = n_bufs; + } + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + while (left_to_snd0 && n_left_to_next) + { + /* Get free buffer */ + n_bufs--; + bi0 = smm->tx_buffers[thread_index][n_bufs]; + _vec_len (smm->tx_buffers[thread_index]) = n_bufs; + + b0 = vlib_get_buffer (vm, bi0); + b0->error = 0; + b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID + | VNET_BUFFER_LOCALLY_ORIGINATED; + b0->current_data = 0; + + /* RX on the local interface. tx in default fib */ + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + + /* usual speculation, or the enqueue_x1 macro will barf */ + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + if (PREDICT_FALSE (n_trace > 0)) + { + session_queue_trace_t *t0; + vlib_trace_buffer (vm, node, next_index, b0, + 1 /* follow_chain */ ); + vlib_set_trace_count (vm, node, --n_trace); + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + t0->session_index = s0->session_index; + t0->server_thread_index = s0->thread_index; + } + + if (1) + { + ELOG_TYPE_DECLARE (e) = + { + .format = "evt-dequeue: id %d length %d",.format_args = + "i4i4",}; + struct + { + u32 data[2]; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->data[0] = e0->event_id; + ed->data[1] = e0->enqueue_length; + } + + len_to_deq0 = (left_to_snd0 < snd_mss0) ? left_to_snd0 : snd_mss0; + + /* Make room for headers */ + data0 = vlib_buffer_make_headroom (b0, MAX_HDRS_LEN); + + /* Dequeue the data + * TODO 1) peek instead of dequeue + * 2) buffer chains */ + if (peek_data) + { + int n_bytes_read; + n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, s0->pid, + rx_offset, len_to_deq0, data0); + if (n_bytes_read < 0) + goto dequeue_fail; + + /* Keep track of progress locally, transport is also supposed to + * increment it independently when pushing header */ + rx_offset += n_bytes_read; + } + else + { + if (svm_fifo_dequeue_nowait (s0->server_tx_fifo, s0->pid, + len_to_deq0, data0) < 0) + goto dequeue_fail; + } + + b0->current_length = len_to_deq0; + + /* Ask transport to push header */ + transport_vft->push_header (tc0, b0); + + left_to_snd0 -= len_to_deq0; + *n_tx_packets = *n_tx_packets + 1; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + /* If we couldn't dequeue all bytes store progress */ + if (max_len_to_snd0 < e0->enqueue_length) + { + e0->enqueue_length -= max_len_to_snd0; + vec_add1 (smm->evts_partially_read[thread_index], *e0); + } + return 0; + +dequeue_fail: + /* Can't read from fifo. Store event rx progress, save as partially read, + * return buff to free list and return */ + e0->enqueue_length -= max_len_to_snd0 - left_to_snd0; + vec_add1 (smm->evts_partially_read[thread_index], *e0); + + to_next -= 1; + n_left_to_next += 1; + _vec_len (smm->tx_buffers[thread_index]) += 1; + + clib_warning ("dequeue fail"); + return 0; +} + +int +session_fifo_rx_peek (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, session_fifo_event_t * e0, + stream_session_t * s0, u32 thread_index, int *n_tx_pkts) +{ + return session_fifo_rx_i (vm, node, smm, e0, s0, thread_index, n_tx_pkts, + 1); +} + +int +session_fifo_rx_dequeue (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, + session_fifo_event_t * e0, stream_session_t * s0, + u32 thread_index, int *n_tx_pkts) +{ + return session_fifo_rx_i (vm, node, smm, e0, s0, thread_index, n_tx_pkts, + 0); +} + +static uword +session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + session_fifo_event_t *my_fifo_events, *e; + u32 n_to_dequeue; + unix_shared_memory_queue_t *q; + int n_tx_packets = 0; + u32 my_thread_index = vm->cpu_index; + int i, rv; + + /* + * Update TCP time + */ + tcp_update_time (vlib_time_now (vm), my_thread_index); + + /* + * Get vpp queue events + */ + q = smm->vpp_event_queues[my_thread_index]; + if (PREDICT_FALSE (q == 0)) + return 0; + + /* min number of events we can dequeue without blocking */ + n_to_dequeue = q->cursize; + if (n_to_dequeue == 0) + return 0; + + my_fifo_events = smm->fifo_events[my_thread_index]; + + /* If we didn't manage to process previous events try going + * over them again without dequeuing new ones. + * XXX: Block senders to sessions that can't keep up */ + if (vec_len (my_fifo_events) >= 100) + goto skip_dequeue; + + /* See you in the next life, don't be late */ + if (pthread_mutex_trylock (&q->mutex)) + return 0; + + for (i = 0; i < n_to_dequeue; i++) + { + vec_add2 (my_fifo_events, e, 1); + unix_shared_memory_queue_sub_raw (q, (u8 *) e); + } + + /* The other side of the connection is not polling */ + if (q->cursize < (q->maxsize / 8)) + (void) pthread_cond_broadcast (&q->condvar); + pthread_mutex_unlock (&q->mutex); + + smm->fifo_events[my_thread_index] = my_fifo_events; + +skip_dequeue: + + for (i = 0; i < n_to_dequeue; i++) + { + svm_fifo_t *f0; /* $$$ prefetch 1 ahead maybe */ + stream_session_t *s0; + u32 server_session_index0, server_thread_index0; + session_fifo_event_t *e0; + + e0 = &my_fifo_events[i]; + f0 = e0->fifo; + server_session_index0 = f0->server_session_index; + server_thread_index0 = f0->server_thread_index; + + /* $$$ add multiple event queues, per vpp worker thread */ + ASSERT (server_thread_index0 == my_thread_index); + + s0 = pool_elt_at_index (smm->sessions[my_thread_index], + server_session_index0); + + ASSERT (s0->thread_index == my_thread_index); + + switch (e0->event_type) + { + case FIFO_EVENT_SERVER_TX: + /* Spray packets in per session type frames, since they go to + * different nodes */ + rv = (smm->session_rx_fns[s0->session_type]) (vm, node, smm, e0, s0, + my_thread_index, + &n_tx_packets); + if (rv < 0) + goto done; + + break; + + default: + clib_warning ("unhandled event type %d", e0->event_type); + } + } + +done: + + /* Couldn't process all events. Probably out of buffers */ + if (PREDICT_FALSE (i < n_to_dequeue)) + { + session_fifo_event_t *partially_read = + smm->evts_partially_read[my_thread_index]; + vec_add (partially_read, &my_fifo_events[i], n_to_dequeue - i); + vec_free (my_fifo_events); + smm->fifo_events[my_thread_index] = partially_read; + smm->evts_partially_read[my_thread_index] = 0; + } + else + { + vec_free (smm->fifo_events[my_thread_index]); + smm->fifo_events[my_thread_index] = + smm->evts_partially_read[my_thread_index]; + smm->evts_partially_read[my_thread_index] = 0; + } + + vlib_node_increment_counter (vm, session_queue_node.index, + SESSION_QUEUE_ERROR_TX, n_tx_packets); + + return n_tx_packets; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (session_queue_node) = +{ + .function = session_queue_node_fn, + .name = "session-queue", + .format_trace = format_session_queue_trace, + .type = VLIB_NODE_TYPE_INPUT, + .n_errors = ARRAY_LEN (session_queue_error_strings), + .error_strings = session_queue_error_strings, + .n_next_nodes = SESSION_QUEUE_N_NEXT, + /* .state = VLIB_NODE_STATE_DISABLED, enable on-demand? */ + /* edit / add dispositions here */ + .next_nodes = + { + [SESSION_QUEUE_NEXT_DROP] = "error-drop", + [SESSION_QUEUE_NEXT_IP4_LOOKUP] = "ip4-lookup", + [SESSION_QUEUE_NEXT_IP6_LOOKUP] = "ip6-lookup", + [SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT] = "tcp4-output", + [SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT] = "tcp6-output", + }, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/session.api b/src/vnet/session/session.api new file mode 100644 index 00000000..a7b28c1d --- /dev/null +++ b/src/vnet/session/session.api @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2015-2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /** \brief Bind to a given URI + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param accept_cookie - sender accept cookie, to identify this bind flavor + @param uri - a URI, e.g. "tcp://0.0.0.0/0/80" [ipv4] + "tcp://::/0/80" [ipv6] etc. + @param options - socket options, fifo sizes, etc. +*/ +define bind_uri { + u32 client_index; + u32 context; + u32 accept_cookie; + u32 initial_segment_size; + u8 uri[128]; + u64 options[16]; +}; + +/** \brief Unbind a given URI + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param uri - a URI, e.g. "tcp://0.0.0.0/0/80" [ipv4] + "tcp://::/0/80" [ipv6], etc. + @param options - socket options, fifo sizes, etc. +*/ +define unbind_uri { + u32 client_index; + u32 context; + u8 uri[128]; +}; + +/** \brief Connect to a given URI + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param accept_cookie - sender accept cookie, to identify this bind flavor + @param uri - a URI, e.g. "tcp4://0.0.0.0/0/80" + "tcp6://::/0/80" [ipv6], etc. + @param options - socket options, fifo sizes, etc. +*/ +define connect_uri { + u32 client_index; + u32 context; + u8 uri[128]; + u64 client_queue_address; + u64 options[16]; +}; + +/** \brief Bind reply + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param event_queue_address - vpp event queue address or 0 if this + connection shouldn't send events + @param segment_name_length - length of segment name + @param segment_name - name of segment client needs to attach to +*/ +define bind_uri_reply { + u32 context; + i32 retval; + u64 server_event_queue_address; + u8 segment_name_length; + u32 segment_size; + u8 segment_name[128]; +}; + +/** \brief unbind reply + @param context - sender context, to match reply w/ request + @param retval - return code for the request +*/ +define unbind_uri_reply { + u32 context; + i32 retval; +}; + +/** \brief vpp->client, connect reply + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param server_rx_fifo - rx (vpp -> vpp-client) fifo address + @param server_tx_fifo - tx (vpp-client -> vpp) fifo address + @param session_index - session index; + @param session_thread_index - session thread index + @param session_type - session thread type + @param vpp_event_queue_address - vpp's event queue address + @param client_event_queue_address - client's event queue address + @param segment_name_length - non-zero if the client needs to attach to + the fifo segment + @param segment_name - set if the client needs to attach to the segment +*/ +define connect_uri_reply { + u32 context; + i32 retval; + u64 server_rx_fifo; + u64 server_tx_fifo; + u32 session_index; + u32 session_thread_index; + u8 session_type; + u64 client_event_queue_address; + u64 vpp_event_queue_address; + u32 segment_size; + u8 segment_name_length; + u8 segment_name[128]; +}; + +/** \brief vpp->client, please map an additional shared memory segment + @param context - sender context, to match reply w/ request + @param segment_name - +*/ +define map_another_segment { + u32 client_index; + u32 context; + u32 segment_size; + u8 segment_name[128]; +}; + +/** \brief client->vpp + @param context - sender context, to match reply w/ request + @param retval - return code for the request +*/ +define map_another_segment_reply { + u32 context; + i32 retval; +}; + +/** \brief vpp->client, accept this session + @param context - sender context, to match reply w/ request + @param accept_cookie - tells client which bind flavor just occurred + @param rx_fifo_address - rx (vpp -> vpp-client) fifo address + @param tx_fifo_address - tx (vpp-client -> vpp) fifo address + @param session_index - index of new session + @param session_thread_index - thread index of new session + @param vpp_event_queue_address - vpp's event queue address + @param session_type - type of session + +*/ +define accept_session { + u32 client_index; + u32 context; + u32 accept_cookie; + u64 server_rx_fifo; + u64 server_tx_fifo; + u32 session_index; + u32 session_thread_index; + u64 vpp_event_queue_address; + u8 session_type; +}; + +/** \brief client->vpp, reply to an accept message + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param session_index - session index from accept_session / connect_reply + @param session_thread_index - thread index from accept_session / + connect_reply +*/ +define accept_session_reply { + u32 context; + i32 retval; + u8 session_type; + u8 session_thread_index; + u32 session_index; +}; + +/** \brief bidirectional disconnect API + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param session_index - cookie #1 from accept_session / connect_reply + @param session_thread_index - cookie #2 +*/ +define disconnect_session { + u32 client_index; + u32 context; + u32 session_index; + u32 session_thread_index; +}; + +/** \brief bidirectional disconnect reply API + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param session_index - session index from accept_session / connect_reply + @param session_thread_index - thread index from accept_session / + connect_reply +*/ +define disconnect_session_reply { + u32 client_index; + u32 context; + i32 retval; + u32 session_index; + u32 session_thread_index; +}; + +/** \brief vpp->client reset session API + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param session_index - session index from accept_session / connect_reply + @param session_thread_index - thread index from accept_session / + connect_reply +*/ +define reset_session { + u32 client_index; + u32 context; + u32 session_index; + u32 session_thread_index; +}; + +/** \brief client->vpp reset session reply + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param session_index - session index from accept_session / connect_reply + @param session_thread_index - thread index from accept_session / + connect_reply +*/ +define reset_session_reply { + u32 client_index; + u32 context; + i32 retval; + u32 session_index; + u32 session_thread_index; +}; + +/** \brief Bind to an ip:port pair for a given transport protocol + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param vrf - bind namespace + @param is_ip4 - flag that is 1 if ip address family is IPv4 + @param ip - ip address + @param port - port + @param proto - protocol 0 - TCP 1 - UDP + @param options - socket options, fifo sizes, etc. +*/ +define bind_sock { + u32 client_index; + u32 context; + u32 vrf; + u8 is_ip4; + u8 ip[16]; + u16 port; + u8 proto; + u64 options[16]; +}; + +/** \brief Unbind + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param handle - bind handle obtained from bind reply +*/ +define unbind_sock { + u32 client_index; + u32 context; + u64 handle; +}; + +/** \brief Connect to a remote peer + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param vrf - connection namespace + @param is_ip4 - flag that is 1 if ip address family is IPv4 + @param ip - ip address + @param port - port + @param proto - protocol 0 - TCP 1 - UDP + @param client_queue_address - client's API queue address. Non-zero when + used to perform redirects + @param options - socket options, fifo sizes, etc. +*/ +define connect_sock { + u32 client_index; + u32 context; + u32 vrf; + u8 is_ip4; + u8 ip[16]; + u16 port; + u8 proto; + u64 client_queue_address; + u64 options[16]; +}; + +/** \brief Bind reply + @param context - sender context, to match reply w/ request + @param handle - bind handle + @param retval - return code for the request + @param event_queue_address - vpp event queue address or 0 if this + connection shouldn't send events + @param segment_name_length - length of segment name + @param segment_name - name of segment client needs to attach to +*/ +define bind_sock_reply { + u32 context; + u64 handle; + i32 retval; + u64 server_event_queue_address; + u32 segment_size; + u8 segment_name_length; + u8 segment_name[128]; +}; + +/** \brief unbind reply + @param context - sender context, to match reply w/ request + @param retval - return code for the request +*/ +define unbind_sock_reply { + u32 context; + i32 retval; +}; + +/** \brief vpp/server->client, connect reply + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param handle - connection handle + @param server_rx_fifo - rx (vpp -> vpp-client) fifo address + @param server_tx_fifo - tx (vpp-client -> vpp) fifo address + @param vpp_event_queue_address - vpp's event queue address + @param client_event_queue_address - client's event queue address + @param segment_name_length - non-zero if the client needs to attach to + the fifo segment + @param segment_name - set if the client needs to attach to the segment +*/ +define connect_sock_reply { + u32 context; + i32 retval; + u64 handle; + u64 server_rx_fifo; + u64 server_tx_fifo; + u64 client_event_queue_address; + u64 vpp_event_queue_address; + u32 segment_size; + u8 segment_name_length; + u8 segment_name[128]; +}; + +/** \brief bidirectional disconnect API + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param handle - session handle obtained through accept/connect +*/ +define disconnect_sock { + u32 client_index; + u32 context; + u64 handle; +}; + +/** \brief bidirectional disconnect reply API + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param client_context - sender context, to match reply w/ request + @param handle - session handle obtained through accept/connect +*/ +define disconnect_sock_reply { + u32 client_index; + u32 context; + i32 retval; + u64 handle; +}; + +/** \brief vpp->client, accept this session + @param context - sender context, to match reply w/ request + @param accept_cookie - tells client which bind flavor just occurred + @param handle - session handle obtained through accept/connect + @param rx_fifo_address - rx (vpp -> vpp-client) fifo address + @param tx_fifo_address - tx (vpp-client -> vpp) fifo address + @param vpp_event_queue_address - vpp's event queue address +*/ +define accept_sock { + u32 client_index; + u32 context; + u32 accept_cookie; + u64 handle; + u64 server_rx_fifo; + u64 server_tx_fifo; + u64 vpp_event_queue_address; +}; + +/** \brief client->vpp, reply to an accept message + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param handle - session handle obtained through accept/connect +*/ +define accept_sock_reply { + u32 context; + i32 retval; + u64 handle; +}; + +/** \brief vpp->client reset session API + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param handle - session handle obtained through accept/connect +*/ +define reset_sock { + u32 client_index; + u32 context; + u64 handle; +}; + +/** \brief client->vpp reset session reply + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param handle - session handle obtained through accept/connect +*/ +define reset_sock_reply { + u32 client_index; + u32 context; + i32 retval; + u64 handle; +}; +/* + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ \ No newline at end of file diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c new file mode 100644 index 00000000..539da613 --- /dev/null +++ b/src/vnet/session/session.c @@ -0,0 +1,1286 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file + * @brief Session and session manager + */ + +#include +#include +#include +#include +#include + +/** + * Per-type vector of transport protocol virtual function tables + */ +static transport_proto_vft_t *tp_vfts; + +session_manager_main_t session_manager_main; + +/* + * Session lookup key; (src-ip, dst-ip, src-port, dst-port, session-type) + * Value: (owner thread index << 32 | session_index); + */ +static void +stream_session_table_add_for_tc (u8 sst, transport_connection_t * tc, + u64 value) +{ + session_manager_main_t *smm = &session_manager_main; + session_kv4_t kv4; + session_kv6_t kv6; + + switch (sst) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv_from_tc (&kv4, tc); + kv4.value = value; + clib_bihash_add_del_16_8 (&smm->v4_session_hash, &kv4, 1 /* is_add */ ); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv_from_tc (&kv6, tc); + kv6.value = value; + clib_bihash_add_del_48_8 (&smm->v6_session_hash, &kv6, 1 /* is_add */ ); + break; + default: + clib_warning ("Session type not supported"); + ASSERT (0); + } +} + +void +stream_session_table_add (session_manager_main_t * smm, stream_session_t * s, + u64 value) +{ + transport_connection_t *tc; + + tc = tp_vfts[s->session_type].get_connection (s->connection_index, + s->thread_index); + stream_session_table_add_for_tc (s->session_type, tc, value); +} + +static void +stream_session_half_open_table_add (u8 sst, transport_connection_t * tc, + u64 value) +{ + session_manager_main_t *smm = &session_manager_main; + session_kv4_t kv4; + session_kv6_t kv6; + + switch (sst) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv_from_tc (&kv4, tc); + kv4.value = value; + clib_bihash_add_del_16_8 (&smm->v4_half_open_hash, &kv4, + 1 /* is_add */ ); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv_from_tc (&kv6, tc); + kv6.value = value; + clib_bihash_add_del_48_8 (&smm->v6_half_open_hash, &kv6, + 1 /* is_add */ ); + break; + default: + clib_warning ("Session type not supported"); + ASSERT (0); + } +} + +static int +stream_session_table_del_for_tc (session_manager_main_t * smm, u8 sst, + transport_connection_t * tc) +{ + session_kv4_t kv4; + session_kv6_t kv6; + + switch (sst) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv_from_tc (&kv4, tc); + return clib_bihash_add_del_16_8 (&smm->v4_session_hash, &kv4, + 0 /* is_add */ ); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv_from_tc (&kv6, tc); + return clib_bihash_add_del_48_8 (&smm->v6_session_hash, &kv6, + 0 /* is_add */ ); + break; + default: + clib_warning ("Session type not supported"); + ASSERT (0); + } + + return 0; +} + +static int +stream_session_table_del (session_manager_main_t * smm, stream_session_t * s) +{ + transport_connection_t *ts; + + ts = tp_vfts[s->session_type].get_connection (s->connection_index, + s->thread_index); + return stream_session_table_del_for_tc (smm, s->session_type, ts); +} + +static void +stream_session_half_open_table_del (session_manager_main_t * smm, u8 sst, + transport_connection_t * tc) +{ + session_kv4_t kv4; + session_kv6_t kv6; + + switch (sst) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv_from_tc (&kv4, tc); + clib_bihash_add_del_16_8 (&smm->v4_half_open_hash, &kv4, + 0 /* is_add */ ); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv_from_tc (&kv6, tc); + clib_bihash_add_del_48_8 (&smm->v6_half_open_hash, &kv6, + 0 /* is_add */ ); + break; + default: + clib_warning ("Session type not supported"); + ASSERT (0); + } +} + +stream_session_t * +stream_session_lookup_listener4 (ip4_address_t * lcl, u16 lcl_port, u8 proto) +{ + session_manager_main_t *smm = &session_manager_main; + session_kv4_t kv4; + int rv; + + make_v4_listener_kv (&kv4, lcl, lcl_port, proto); + rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); + if (rv == 0) + return pool_elt_at_index (smm->listen_sessions[proto], (u32) kv4.value); + + /* Zero out the lcl ip */ + kv4.key[0] = 0; + rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); + if (rv == 0) + return pool_elt_at_index (smm->listen_sessions[proto], kv4.value); + + return 0; +} + +/** Looks up a session based on the 5-tuple passed as argument. + * + * First it tries to find an established session, if this fails, it tries + * finding a listener session if this fails, it tries a lookup with a + * wildcarded local source (listener bound to all interfaces) + */ +stream_session_t * +stream_session_lookup4 (ip4_address_t * lcl, ip4_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto, + u32 my_thread_index) +{ + session_manager_main_t *smm = &session_manager_main; + session_kv4_t kv4; + int rv; + + /* Lookup session amongst established ones */ + make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); + if (rv == 0) + return stream_session_get_tsi (kv4.value, my_thread_index); + + /* If nothing is found, check if any listener is available */ + return stream_session_lookup_listener4 (lcl, lcl_port, proto); +} + +stream_session_t * +stream_session_lookup_listener6 (ip6_address_t * lcl, u16 lcl_port, u8 proto) +{ + session_manager_main_t *smm = &session_manager_main; + session_kv6_t kv6; + int rv; + + make_v6_listener_kv (&kv6, lcl, lcl_port, proto); + rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); + if (rv == 0) + return pool_elt_at_index (smm->listen_sessions[proto], kv6.value); + + /* Zero out the lcl ip */ + kv6.key[0] = kv6.key[1] = 0; + rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); + if (rv == 0) + return pool_elt_at_index (smm->listen_sessions[proto], kv6.value); + + return 0; +} + +/* Looks up a session based on the 5-tuple passed as argument. + * First it tries to find an established session, if this fails, it tries + * finding a listener session if this fails, it tries a lookup with a + * wildcarded local source (listener bound to all interfaces) */ +stream_session_t * +stream_session_lookup6 (ip6_address_t * lcl, ip6_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto, + u32 my_thread_index) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + session_kv6_t kv6; + int rv; + + make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); + if (rv == 0) + return stream_session_get_tsi (kv6.value, my_thread_index); + + /* If nothing is found, check if any listener is available */ + return stream_session_lookup_listener6 (lcl, lcl_port, proto); +} + +stream_session_t * +stream_session_lookup_listener (ip46_address_t * lcl, u16 lcl_port, u8 proto) +{ + switch (proto) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + return stream_session_lookup_listener4 (&lcl->ip4, lcl_port, proto); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + return stream_session_lookup_listener6 (&lcl->ip6, lcl_port, proto); + break; + } + return 0; +} + +static u64 +stream_session_half_open_lookup (session_manager_main_t * smm, + ip46_address_t * lcl, ip46_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + session_kv4_t kv4; + session_kv6_t kv6; + int rv; + + switch (proto) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv (&kv4, &lcl->ip4, &rmt->ip4, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); + + if (rv == 0) + return kv4.value; + + return (u64) ~ 0; + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv (&kv6, &lcl->ip6, &rmt->ip6, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_48_8 (&smm->v6_half_open_hash, &kv6); + + if (rv == 0) + return kv6.value; + + return (u64) ~ 0; + break; + } + return 0; +} + +transport_connection_t * +stream_session_lookup_transport4 (session_manager_main_t * smm, + ip4_address_t * lcl, ip4_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto, + u32 my_thread_index) +{ + session_kv4_t kv4; + stream_session_t *s; + int rv; + + /* Lookup session amongst established ones */ + make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); + if (rv == 0) + { + s = stream_session_get_tsi (kv4.value, my_thread_index); + + return tp_vfts[s->session_type].get_connection (s->connection_index, + my_thread_index); + } + + /* If nothing is found, check if any listener is available */ + s = stream_session_lookup_listener4 (lcl, lcl_port, proto); + if (s) + return tp_vfts[s->session_type].get_listener (s->connection_index); + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); + if (rv == 0) + return tp_vfts[proto].get_half_open (kv4.value & 0xFFFFFFFF); + + return 0; +} + +transport_connection_t * +stream_session_lookup_transport6 (session_manager_main_t * smm, + ip6_address_t * lcl, ip6_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto, + u32 my_thread_index) +{ + stream_session_t *s; + session_kv6_t kv6; + int rv; + + make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); + if (rv == 0) + { + s = stream_session_get_tsi (kv6.value, my_thread_index); + + return tp_vfts[s->session_type].get_connection (s->connection_index, + my_thread_index); + } + + /* If nothing is found, check if any listener is available */ + s = stream_session_lookup_listener6 (lcl, lcl_port, proto); + if (s) + return tp_vfts[s->session_type].get_listener (s->connection_index); + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_48_8 (&smm->v6_half_open_hash, &kv6); + if (rv == 0) + return tp_vfts[s->session_type].get_half_open (kv6.value & 0xFFFFFFFF); + + return 0; +} + +/** + * Allocate vpp event queue (once) per worker thread + */ +void +vpp_session_event_queue_allocate (session_manager_main_t * smm, + u32 thread_index) +{ + api_main_t *am = &api_main; + void *oldheap; + + if (smm->vpp_event_queues[thread_index] == 0) + { + /* Allocate event fifo in the /vpe-api shared-memory segment */ + oldheap = svm_push_data_heap (am->vlib_rp); + + smm->vpp_event_queues[thread_index] = + unix_shared_memory_queue_init (2048 /* nels $$$$ config */ , + sizeof (session_fifo_event_t), + 0 /* consumer pid */ , + 0 + /* (do not) send signal when queue non-empty */ + ); + + svm_pop_heap (oldheap); + } +} + +void +session_manager_get_segment_info (u32 index, u8 ** name, u32 * size) +{ + svm_fifo_segment_private_t *s; + s = svm_fifo_get_segment (index); + *name = s->h->segment_name; + *size = s->ssvm.ssvm_size; +} + +always_inline int +session_manager_add_segment_i (session_manager_main_t * smm, + session_manager_t * sm, + u32 segment_size, u8 * segment_name) +{ + svm_fifo_segment_create_args_t _ca, *ca = &_ca; + int rv; + + memset (ca, 0, sizeof (*ca)); + + ca->segment_name = (char *) segment_name; + ca->segment_size = segment_size; + + rv = svm_fifo_segment_create (ca); + if (rv) + { + clib_warning ("svm_fifo_segment_create ('%s', %d) failed", + ca->segment_name, ca->segment_size); + vec_free (segment_name); + return -1; + } + + vec_add1 (sm->segment_indices, ca->new_segment_index); + + return 0; +} + +static int +session_manager_add_segment (session_manager_main_t * smm, + session_manager_t * sm) +{ + u8 *segment_name; + svm_fifo_segment_create_args_t _ca, *ca = &_ca; + u32 add_segment_size; + u32 default_segment_size = 128 << 10; + + memset (ca, 0, sizeof (*ca)); + segment_name = format (0, "%d-%d%c", getpid (), + smm->unique_segment_name_counter++, 0); + add_segment_size = + sm->add_segment_size ? sm->add_segment_size : default_segment_size; + + return session_manager_add_segment_i (smm, sm, add_segment_size, + segment_name); +} + +int +session_manager_add_first_segment (session_manager_main_t * smm, + session_manager_t * sm, u32 segment_size, + u8 ** segment_name) +{ + svm_fifo_segment_create_args_t _ca, *ca = &_ca; + memset (ca, 0, sizeof (*ca)); + *segment_name = format (0, "%d-%d%c", getpid (), + smm->unique_segment_name_counter++, 0); + return session_manager_add_segment_i (smm, sm, segment_size, *segment_name); +} + +void +session_manager_del (session_manager_main_t * smm, session_manager_t * sm) +{ + u32 *deleted_sessions = 0; + u32 *deleted_thread_indices = 0; + int i, j; + + /* Across all fifo segments used by the server */ + for (j = 0; j < vec_len (sm->segment_indices); j++) + { + svm_fifo_segment_private_t *fifo_segment; + svm_fifo_t **fifos; + /* Vector of fifos allocated in the segment */ + fifo_segment = svm_fifo_get_segment (sm->segment_indices[j]); + fifos = (svm_fifo_t **) fifo_segment->h->fifos; + + /* + * Remove any residual sessions from the session lookup table + * Don't bother deleting the individual fifos, we're going to + * throw away the fifo segment in a minute. + */ + for (i = 0; i < vec_len (fifos); i++) + { + svm_fifo_t *fifo; + u32 session_index, thread_index; + stream_session_t *session; + + fifo = fifos[i]; + session_index = fifo->server_session_index; + thread_index = fifo->server_thread_index; + + session = pool_elt_at_index (smm->sessions[thread_index], + session_index); + + /* Add to the deleted_sessions vector (once!) */ + if (!session->is_deleted) + { + session->is_deleted = 1; + vec_add1 (deleted_sessions, + session - smm->sessions[thread_index]); + vec_add1 (deleted_thread_indices, thread_index); + } + } + + for (i = 0; i < vec_len (deleted_sessions); i++) + { + stream_session_t *session; + + session = + pool_elt_at_index (smm->sessions[deleted_thread_indices[i]], + deleted_sessions[i]); + + /* Instead of directly removing the session call disconnect */ + stream_session_disconnect (session); + + /* + stream_session_table_del (smm, session); + pool_put(smm->sessions[deleted_thread_indices[i]], session); + */ + } + + vec_reset_length (deleted_sessions); + vec_reset_length (deleted_thread_indices); + + /* Instead of removing the segment, test when removing the session if + * the segment can be removed + */ + /* svm_fifo_segment_delete (fifo_segment); */ + } + + vec_free (deleted_sessions); + vec_free (deleted_thread_indices); +} + +int +session_manager_allocate_session_fifos (session_manager_main_t * smm, + session_manager_t * sm, + svm_fifo_t ** server_rx_fifo, + svm_fifo_t ** server_tx_fifo, + u32 * fifo_segment_index, + u8 * added_a_segment) +{ + svm_fifo_segment_private_t *fifo_segment; + u32 fifo_size, default_fifo_size = 8192 /* TODO config */ ; + int i; + + *added_a_segment = 0; + + /* Allocate svm fifos */ + ASSERT (vec_len (sm->segment_indices)); + +again: + for (i = 0; i < vec_len (sm->segment_indices); i++) + { + *fifo_segment_index = sm->segment_indices[i]; + fifo_segment = svm_fifo_get_segment (*fifo_segment_index); + + fifo_size = sm->rx_fifo_size; + fifo_size = (fifo_size == 0) ? default_fifo_size : fifo_size; + *server_rx_fifo = svm_fifo_segment_alloc_fifo (fifo_segment, fifo_size); + + fifo_size = sm->tx_fifo_size; + fifo_size = (fifo_size == 0) ? default_fifo_size : fifo_size; + *server_tx_fifo = svm_fifo_segment_alloc_fifo (fifo_segment, fifo_size); + + if (*server_rx_fifo == 0) + { + /* This would be very odd, but handle it... */ + if (*server_tx_fifo != 0) + { + svm_fifo_segment_free_fifo (fifo_segment, *server_tx_fifo); + *server_tx_fifo = 0; + } + continue; + } + if (*server_tx_fifo == 0) + { + if (*server_rx_fifo != 0) + { + svm_fifo_segment_free_fifo (fifo_segment, *server_rx_fifo); + *server_rx_fifo = 0; + } + continue; + } + break; + } + + /* See if we're supposed to create another segment */ + if (*server_rx_fifo == 0) + { + if (sm->add_segment) + { + if (*added_a_segment) + { + clib_warning ("added a segment, still cant allocate a fifo"); + return SESSION_ERROR_NEW_SEG_NO_SPACE; + } + + if (session_manager_add_segment (smm, sm)) + return VNET_API_ERROR_URI_FIFO_CREATE_FAILED; + + *added_a_segment = 1; + goto again; + } + else + return SESSION_ERROR_NO_SPACE; + } + return 0; +} + +int +stream_session_create_i (session_manager_main_t * smm, application_t * app, + transport_connection_t * tc, + stream_session_t ** ret_s) +{ + int rv; + svm_fifo_t *server_rx_fifo = 0, *server_tx_fifo = 0; + u32 fifo_segment_index; + u32 pool_index, seg_size; + stream_session_t *s; + u64 value; + u32 thread_index = tc->thread_index; + session_manager_t *sm; + u8 segment_added; + u8 *seg_name; + + sm = session_manager_get (app->session_manager_index); + + /* Check the API queue */ + if (app->mode == APP_SERVER && application_api_queue_is_full (app)) + return SESSION_ERROR_API_QUEUE_FULL; + + if ((rv = session_manager_allocate_session_fifos (smm, sm, &server_rx_fifo, + &server_tx_fifo, + &fifo_segment_index, + &segment_added))) + return rv; + + if (segment_added && app->mode == APP_SERVER) + { + /* Send an API message to the external server, to map new segment */ + ASSERT (app->cb_fns.add_segment_callback); + + session_manager_get_segment_info (fifo_segment_index, &seg_name, + &seg_size); + if (app->cb_fns.add_segment_callback (app->api_client_index, seg_name, + seg_size)) + return VNET_API_ERROR_URI_FIFO_CREATE_FAILED; + } + + /* Create the session */ + pool_get (smm->sessions[thread_index], s); + memset (s, 0, sizeof (*s)); + + /* Initialize backpointers */ + pool_index = s - smm->sessions[thread_index]; + server_rx_fifo->server_session_index = pool_index; + server_rx_fifo->server_thread_index = thread_index; + + server_tx_fifo->server_session_index = pool_index; + server_tx_fifo->server_thread_index = thread_index; + + s->server_rx_fifo = server_rx_fifo; + s->server_tx_fifo = server_tx_fifo; + + /* Initialize state machine, such as it is... */ + s->session_type = app->session_type; + s->session_state = SESSION_STATE_CONNECTING; + s->app_index = application_get_index (app); + s->server_segment_index = fifo_segment_index; + s->thread_index = thread_index; + s->session_index = pool_index; + + /* Attach transport to session */ + s->connection_index = tc->c_index; + + /* Attach session to transport */ + tc->s_index = s->session_index; + + /* Add to the main lookup table */ + value = (((u64) thread_index) << 32) | (u64) s->session_index; + stream_session_table_add_for_tc (app->session_type, tc, value); + + *ret_s = s; + + return 0; +} + +/* + * Enqueue data for delivery to session peer. Does not notify peer of enqueue + * event but on request can queue notification events for later delivery by + * calling stream_server_flush_enqueue_events(). + * + * @param tc Transport connection which is to be enqueued data + * @param data Data to be enqueued + * @param len Length of data to be enqueued + * @param queue_event Flag to indicate if peer is to be notified or if event + * is to be queued. The former is useful when more data is + * enqueued and only one event is to be generated. + * @return Number of bytes enqueued or a negative value if enqueueing failed. + */ +int +stream_session_enqueue_data (transport_connection_t * tc, u8 * data, u16 len, + u8 queue_event) +{ + stream_session_t *s; + int enqueued; + + s = stream_session_get (tc->s_index, tc->thread_index); + + /* Make sure there's enough space left. We might've filled the pipes */ + if (PREDICT_FALSE (len > svm_fifo_max_enqueue (s->server_rx_fifo))) + return -1; + + enqueued = svm_fifo_enqueue_nowait (s->server_rx_fifo, s->pid, len, data); + + if (queue_event) + { + /* Queue RX event on this fifo. Eventually these will need to be flushed + * by calling stream_server_flush_enqueue_events () */ + session_manager_main_t *smm = vnet_get_session_manager_main (); + u32 thread_index = s->thread_index; + u32 my_enqueue_epoch = smm->current_enqueue_epoch[thread_index]; + + if (s->enqueue_epoch != my_enqueue_epoch) + { + s->enqueue_epoch = my_enqueue_epoch; + vec_add1 (smm->session_indices_to_enqueue_by_thread[thread_index], + s - smm->sessions[thread_index]); + } + } + + return enqueued; +} + +/** Check if we have space in rx fifo to push more bytes */ +u8 +stream_session_no_space (transport_connection_t * tc, u32 thread_index, + u16 data_len) +{ + stream_session_t *s = stream_session_get (tc->c_index, thread_index); + + if (PREDICT_FALSE (s->session_state != SESSION_STATE_READY)) + return 1; + + if (data_len > svm_fifo_max_enqueue (s->server_rx_fifo)) + return 1; + + return 0; +} + +u32 +stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, + u32 offset, u32 max_bytes) +{ + stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); + return svm_fifo_peek (s->server_tx_fifo, s->pid, offset, max_bytes, buffer); +} + +u32 +stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes) +{ + stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); + return svm_fifo_dequeue_drop (s->server_tx_fifo, s->pid, max_bytes); +} + +/** + * Notify session peer that new data has been enqueued. + * + * @param s Stream session for which the event is to be generated. + * @param block Flag to indicate if call should block if event queue is full. + * + * @return 0 on succes or negative number if failed to send notification. + */ +static int +stream_session_enqueue_notify (stream_session_t * s, u8 block) +{ + application_t *app; + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + static u32 serial_number; + + if (PREDICT_FALSE (s->session_state == SESSION_STATE_CLOSED)) + return 0; + + /* Get session's server */ + app = application_get (s->app_index); + + /* Fabricate event */ + evt.fifo = s->server_rx_fifo; + evt.event_type = FIFO_EVENT_SERVER_RX; + evt.event_id = serial_number++; + evt.enqueue_length = svm_fifo_max_dequeue (s->server_rx_fifo); + + /* Add event to server's event queue */ + q = app->event_queue; + + /* Based on request block (or not) for lack of space */ + if (block || PREDICT_TRUE (q->cursize < q->maxsize)) + unix_shared_memory_queue_add (app->event_queue, (u8 *) & evt, + 0 /* do wait for mutex */ ); + else + return -1; + + if (1) + { + ELOG_TYPE_DECLARE (e) = + { + .format = "evt-enqueue: id %d length %d",.format_args = "i4i4",}; + struct + { + u32 data[2]; + } *ed; + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->data[0] = evt.event_id; + ed->data[1] = evt.enqueue_length; + } + + return 0; +} + +/** + * Flushes queue of sessions that are to be notified of new data + * enqueued events. + * + * @param thread_index Thread index for which the flush is to be performed. + * @return 0 on success or a positive number indicating the number of + * failures due to API queue being full. + */ +int +session_manager_flush_enqueue_events (u32 thread_index) +{ + session_manager_main_t *smm = &session_manager_main; + u32 *session_indices_to_enqueue; + int i, errors = 0; + + session_indices_to_enqueue = + smm->session_indices_to_enqueue_by_thread[thread_index]; + + for (i = 0; i < vec_len (session_indices_to_enqueue); i++) + { + stream_session_t *s0; + + /* Get session */ + s0 = stream_session_get (session_indices_to_enqueue[i], thread_index); + if (stream_session_enqueue_notify (s0, 0 /* don't block */ )) + { + errors++; + } + } + + vec_reset_length (session_indices_to_enqueue); + + smm->session_indices_to_enqueue_by_thread[thread_index] = + session_indices_to_enqueue; + + /* Increment enqueue epoch for next round */ + smm->current_enqueue_epoch[thread_index]++; + + return errors; +} + +/* + * Start listening on server's ip/port pair for requested transport. + * + * Creates a 'dummy' stream session with state LISTENING to be used in session + * lookups, prior to establishing connection. Requests transport to build + * it's own specific listening connection. + */ +int +stream_session_start_listen (u32 server_index, ip46_address_t * ip, u16 port) +{ + session_manager_main_t *smm = &session_manager_main; + stream_session_t *s; + transport_connection_t *tc; + application_t *srv; + u32 tci; + + srv = application_get (server_index); + + pool_get (smm->listen_sessions[srv->session_type], s); + memset (s, 0, sizeof (*s)); + + s->session_type = srv->session_type; + s->session_state = SESSION_STATE_LISTENING; + s->session_index = s - smm->listen_sessions[srv->session_type]; + s->app_index = srv->index; + + /* Transport bind/listen */ + tci = tp_vfts[srv->session_type].bind (smm->vlib_main, s->session_index, ip, + port); + + /* Attach transport to session */ + s->connection_index = tci; + tc = tp_vfts[srv->session_type].get_listener (tci); + + srv->session_index = s->session_index; + + /* Add to the main lookup table */ + stream_session_table_add_for_tc (s->session_type, tc, s->session_index); + + return 0; +} + +void +stream_session_stop_listen (u32 server_index) +{ + session_manager_main_t *smm = &session_manager_main; + stream_session_t *listener; + transport_connection_t *tc; + application_t *srv; + + srv = application_get (server_index); + listener = pool_elt_at_index (smm->listen_sessions[srv->session_type], + srv->session_index); + + tc = tp_vfts[srv->session_type].get_listener (listener->connection_index); + stream_session_table_del_for_tc (smm, listener->session_type, tc); + + tp_vfts[srv->session_type].unbind (smm->vlib_main, + listener->connection_index); + pool_put (smm->listen_sessions[srv->session_type], listener); +} + +int +connect_server_add_segment_cb (application_t * ss, char *segment_name, + u32 segment_size) +{ + /* Does exactly nothing, but die */ + ASSERT (0); + return 0; +} + +void +connects_session_manager_init (session_manager_main_t * smm, u8 session_type) +{ + session_manager_t *sm; + u32 connect_fifo_size = 8 << 10; /* Config? */ + u32 default_segment_size = 1 << 20; + + pool_get (smm->session_managers, sm); + memset (sm, 0, sizeof (*sm)); + + sm->add_segment_size = default_segment_size; + sm->rx_fifo_size = connect_fifo_size; + sm->tx_fifo_size = connect_fifo_size; + sm->add_segment = 1; + + session_manager_add_segment (smm, sm); + smm->connect_manager_index[session_type] = sm - smm->session_managers; +} + +void +stream_session_connect_notify (transport_connection_t * tc, u8 sst, + u8 is_fail) +{ + session_manager_main_t *smm = &session_manager_main; + application_t *app; + stream_session_t *new_s = 0; + u64 value; + + value = stream_session_half_open_lookup (smm, &tc->lcl_ip, &tc->rmt_ip, + tc->lcl_port, tc->rmt_port, + tc->proto); + if (value == HALF_OPEN_LOOKUP_INVALID_VALUE) + { + clib_warning ("This can't be good!"); + return; + } + + app = application_get (value >> 32); + + if (!is_fail) + { + /* Create new session (server segments are allocated if needed) */ + if (stream_session_create_i (smm, app, tc, &new_s)) + return; + + app->session_index = stream_session_get_index (new_s); + app->thread_index = new_s->thread_index; + + /* Allocate vpp event queue for this thread if needed */ + vpp_session_event_queue_allocate (smm, tc->thread_index); + } + + /* Notify client */ + app->cb_fns.session_connected_callback (app->api_client_index, new_s, + is_fail); + + /* Cleanup session lookup */ + stream_session_half_open_table_del (smm, sst, tc); +} + +void +stream_session_accept_notify (transport_connection_t * tc) +{ + application_t *server; + stream_session_t *s; + + s = stream_session_get (tc->s_index, tc->thread_index); + server = application_get (s->app_index); + server->cb_fns.session_accept_callback (s); +} + +/** + * Notification from transport that connection is being closed. + * + * A disconnect is sent to application but state is not removed. Once + * disconnect is acknowledged by application, session disconnect is called. + * Ultimately this leads to close being called on transport (passive close). + */ +void +stream_session_disconnect_notify (transport_connection_t * tc) +{ + application_t *server; + stream_session_t *s; + + s = stream_session_get (tc->s_index, tc->thread_index); + server = application_get (s->app_index); + server->cb_fns.session_disconnect_callback (s); +} + +/** + * Cleans up session and associated app if needed. + */ +void +stream_session_delete (stream_session_t * s) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + svm_fifo_segment_private_t *fifo_segment; + application_t *app; + int rv; + + /* delete from the main lookup table */ + rv = stream_session_table_del (smm, s); + + if (rv) + clib_warning ("hash delete error, rv %d", rv); + + /* Cleanup fifo segments */ + fifo_segment = svm_fifo_get_segment (s->server_segment_index); + svm_fifo_segment_free_fifo (fifo_segment, s->server_rx_fifo); + svm_fifo_segment_free_fifo (fifo_segment, s->server_tx_fifo); + + /* Cleanup app if client */ + app = application_get (s->app_index); + if (app->mode == APP_CLIENT) + { + application_del (app); + } + else if (app->mode == APP_SERVER) + { + session_manager_t *sm; + svm_fifo_segment_private_t *fifo_segment; + svm_fifo_t **fifos; + u32 fifo_index; + + sm = session_manager_get (app->session_manager_index); + + /* Delete fifo */ + fifo_segment = svm_fifo_get_segment (s->server_segment_index); + fifos = (svm_fifo_t **) fifo_segment->h->fifos; + + fifo_index = svm_fifo_segment_index (fifo_segment); + + /* Remove segment only if it holds no fifos and not the first */ + if (sm->segment_indices[0] != fifo_index && vec_len (fifos) == 0) + svm_fifo_segment_delete (fifo_segment); + } + + pool_put (smm->sessions[s->thread_index], s); +} + +/** + * Notification from transport that connection is being deleted + * + * This should be called only on previously fully established sessions. For + * instance failed connects should call stream_session_connect_notify and + * indicate that the connect has failed. + */ +void +stream_session_delete_notify (transport_connection_t * tc) +{ + stream_session_t *s; + + s = stream_session_get_if_valid (tc->s_index, tc->thread_index); + if (!s) + { + clib_warning ("Surprised!"); + return; + } + stream_session_delete (s); +} + +/** + * Notify application that connection has been reset. + */ +void +stream_session_reset_notify (transport_connection_t * tc) +{ + stream_session_t *s; + application_t *app; + s = stream_session_get (tc->s_index, tc->thread_index); + + app = application_get (s->app_index); + app->cb_fns.session_reset_callback (s); +} + +/** + * Accept a stream session. Optionally ping the server by callback. + */ +int +stream_session_accept (transport_connection_t * tc, u32 listener_index, + u8 sst, u8 notify) +{ + session_manager_main_t *smm = &session_manager_main; + application_t *server; + stream_session_t *s, *listener; + + int rv; + + /* Find the server */ + listener = pool_elt_at_index (smm->listen_sessions[sst], listener_index); + server = application_get (listener->app_index); + + if ((rv = stream_session_create_i (smm, server, tc, &s))) + return rv; + + /* Allocate vpp event queue for this thread if needed */ + vpp_session_event_queue_allocate (smm, tc->thread_index); + + /* Shoulder-tap the server */ + if (notify) + { + server->cb_fns.session_accept_callback (s); + } + + return 0; +} + +void +stream_session_open (u8 sst, ip46_address_t * addr, u16 port_host_byte_order, + u32 app_index) +{ + transport_connection_t *tc; + u32 tci; + u64 value; + + /* Ask transport to open connection */ + tci = tp_vfts[sst].open (addr, port_host_byte_order); + + /* Get transport connection */ + tc = tp_vfts[sst].get_half_open (tci); + + /* Store api_client_index and transport connection index */ + value = (((u64) app_index) << 32) | (u64) tc->c_index; + + /* Add to the half-open lookup table */ + stream_session_half_open_table_add (sst, tc, value); +} + +/** + * Disconnect session and propagate to transport. This should eventually + * result in a delete notification that allows us to cleanup session state. + * Called for both active/passive disconnects. + */ +void +stream_session_disconnect (stream_session_t * s) +{ + tp_vfts[s->session_type].close (s->connection_index, s->thread_index); + s->session_state = SESSION_STATE_CLOSED; +} + +/** + * Cleanup transport and session state. + */ +void +stream_session_cleanup (stream_session_t * s) +{ + tp_vfts[s->session_type].cleanup (s->connection_index, s->thread_index); + stream_session_delete (s); +} + +void +session_register_transport (u8 type, const transport_proto_vft_t * vft) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + + vec_validate (tp_vfts, type); + tp_vfts[type] = *vft; + + /* If an offset function is provided, then peek instead of dequeue */ + smm->session_rx_fns[type] = + (vft->rx_fifo_offset) ? session_fifo_rx_peek : session_fifo_rx_dequeue; +} + +transport_proto_vft_t * +session_get_transport_vft (u8 type) +{ + if (type >= vec_len (tp_vfts)) + return 0; + return &tp_vfts[type]; +} + +static clib_error_t * +session_manager_main_init (vlib_main_t * vm) +{ + u32 num_threads; + vlib_thread_main_t *vtm = vlib_get_thread_main (); + session_manager_main_t *smm = &session_manager_main; + int i; + + smm->vlib_main = vm; + smm->vnet_main = vnet_get_main (); + + num_threads = 1 /* main thread */ + vtm->n_threads; + + if (num_threads < 1) + return clib_error_return (0, "n_thread_stacks not set"); + + /* $$$ config parameters */ + svm_fifo_segment_init (0x200000000ULL /* first segment base VA */ , + 20 /* timeout in seconds */ ); + + /* configure per-thread ** vectors */ + vec_validate (smm->sessions, num_threads - 1); + vec_validate (smm->session_indices_to_enqueue_by_thread, num_threads - 1); + vec_validate (smm->tx_buffers, num_threads - 1); + vec_validate (smm->fifo_events, num_threads - 1); + vec_validate (smm->evts_partially_read, num_threads - 1); + vec_validate (smm->current_enqueue_epoch, num_threads - 1); + vec_validate (smm->vpp_event_queues, num_threads - 1); + + /* $$$$ preallocate hack config parameter */ + for (i = 0; i < 200000; i++) + { + stream_session_t *ss; + pool_get (smm->sessions[0], ss); + memset (ss, 0, sizeof (*ss)); + } + + for (i = 0; i < 200000; i++) + pool_put_index (smm->sessions[0], i); + + clib_bihash_init_16_8 (&smm->v4_session_hash, "v4 session table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); + clib_bihash_init_48_8 (&smm->v6_session_hash, "v6 session table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); + + clib_bihash_init_16_8 (&smm->v4_half_open_hash, "v4 half-open table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); + clib_bihash_init_48_8 (&smm->v6_half_open_hash, "v6 half-open table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); + + for (i = 0; i < SESSION_N_TYPES; i++) + smm->connect_manager_index[i] = INVALID_INDEX; + + return 0; +} + +VLIB_INIT_FUNCTION (session_manager_main_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h new file mode 100644 index 00000000..cf14cca9 --- /dev/null +++ b/src/vnet/session/session.h @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_session_h__ +#define __included_session_h__ + +#include +#include +#include +#include +#include + +#define HALF_OPEN_LOOKUP_INVALID_VALUE ((u64)~0) +#define INVALID_INDEX ((u32)~0) + +/* TODO decide how much since we have pre-data as well */ +#define MAX_HDRS_LEN 100 /* Max number of bytes for headers */ + +typedef enum +{ + FIFO_EVENT_SERVER_RX, + FIFO_EVENT_SERVER_TX, + FIFO_EVENT_TIMEOUT, + FIFO_EVENT_SERVER_EXIT, +} fifo_event_type_t; + +#define foreach_session_input_error \ +_(NO_SESSION, "No session drops") \ +_(NO_LISTENER, "No listener for dst port drops") \ +_(ENQUEUED, "Packets pushed into rx fifo") \ +_(NOT_READY, "Session not ready packets") \ +_(FIFO_FULL, "Packets dropped for lack of rx fifo space") \ +_(EVENT_FIFO_FULL, "Events not sent for lack of event fifo space") \ +_(API_QUEUE_FULL, "Sessions not created for lack of API queue space") \ +_(NEW_SEG_NO_SPACE, "Created segment, couldn't allocate a fifo pair") \ +_(NO_SPACE, "Couldn't allocate a fifo pair") + +typedef enum +{ +#define _(sym,str) SESSION_ERROR_##sym, + foreach_session_input_error +#undef _ + SESSION_N_ERROR, +} session_error_t; + +/* Event queue input node static next indices */ +typedef enum +{ + SESSION_QUEUE_NEXT_DROP, + SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT, + SESSION_QUEUE_NEXT_IP4_LOOKUP, + SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT, + SESSION_QUEUE_NEXT_IP6_LOOKUP, + SESSION_QUEUE_N_NEXT, +} session_queue_next_t; + +#define foreach_session_type \ + _(IP4_TCP, ip4_tcp) \ + _(IP4_UDP, ip4_udp) \ + _(IP6_TCP, ip6_tcp) \ + _(IP6_UDP, ip6_udp) + +typedef enum +{ +#define _(A, a) SESSION_TYPE_##A, + foreach_session_type +#undef _ + SESSION_N_TYPES, +} session_type_t; + +/* + * Application session state + */ +typedef enum +{ + SESSION_STATE_LISTENING, + SESSION_STATE_CONNECTING, + SESSION_STATE_READY, + SESSION_STATE_CLOSED, + SESSION_STATE_N_STATES, +} stream_session_state_t; + +typedef CLIB_PACKED (struct + { + svm_fifo_t * fifo; + u8 event_type; + /* $$$$ for event logging */ + u16 event_id; + u32 enqueue_length; + }) session_fifo_event_t; + +typedef struct _stream_session_t +{ + /** Type */ + u8 session_type; + + /** State */ + u8 session_state; + + /** Session index in per_thread pool */ + u32 session_index; + + /** Transport specific */ + u32 connection_index; + + u8 thread_index; + + /** Application specific */ + u32 pid; + + /** fifo pointers. Once allocated, these do not move */ + svm_fifo_t *server_rx_fifo; + svm_fifo_t *server_tx_fifo; + + /** To avoid n**2 "one event per frame" check */ + u8 enqueue_epoch; + + /** used during unbind processing */ + u8 is_deleted; + + /** stream server pool index */ + u32 app_index; + + /** svm segment index */ + u32 server_segment_index; +} stream_session_t; + +typedef struct _session_manager +{ + /** segments mapped by this server */ + u32 *segment_indices; + + /** Session fifo sizes. They are provided for binds and take default + * values for connects */ + u32 rx_fifo_size; + u32 tx_fifo_size; + + /** Configured additional segment size */ + u32 add_segment_size; + + /** Flag that indicates if additional segments should be created */ + u8 add_segment; +} session_manager_t; + +/* Forward definition */ +typedef struct _session_manager_main session_manager_main_t; + +typedef int + (session_fifo_rx_fn) (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, + session_fifo_event_t * e0, stream_session_t * s0, + u32 thread_index, int *n_tx_pkts); + +extern session_fifo_rx_fn session_fifo_rx_peek; +extern session_fifo_rx_fn session_fifo_rx_dequeue; + +struct _session_manager_main +{ + /** Lookup tables for established sessions and listeners */ + clib_bihash_16_8_t v4_session_hash; + clib_bihash_48_8_t v6_session_hash; + + /** Lookup tables for half-open sessions */ + clib_bihash_16_8_t v4_half_open_hash; + clib_bihash_48_8_t v6_half_open_hash; + + /** Per worker thread session pools */ + stream_session_t **sessions; + + /** Pool of listen sessions. Same type as stream sessions to ease lookups */ + stream_session_t *listen_sessions[SESSION_N_TYPES]; + + /** Sparse vector to map dst port to stream server */ + u16 *stream_server_by_dst_port[SESSION_N_TYPES]; + + /** per-worker enqueue epoch counters */ + u8 *current_enqueue_epoch; + + /** Per-worker thread vector of sessions to enqueue */ + u32 **session_indices_to_enqueue_by_thread; + + /** per-worker tx buffer free lists */ + u32 **tx_buffers; + + /** Per worker-thread vector of partially read events */ + session_fifo_event_t **evts_partially_read; + + /** per-worker active event vectors */ + session_fifo_event_t **fifo_events; + + /** vpp fifo event queue */ + unix_shared_memory_queue_t **vpp_event_queues; + + /** Unique segment name counter */ + u32 unique_segment_name_counter; + + /* Connection manager used by incoming connects */ + u32 connect_manager_index[SESSION_N_TYPES]; + + session_manager_t *session_managers; + + /** Per transport rx function that can either dequeue or peek */ + session_fifo_rx_fn *session_rx_fns[SESSION_N_TYPES]; + + /* Convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; +}; + +extern session_manager_main_t session_manager_main; + +/* + * Session manager function + */ +always_inline session_manager_main_t * +vnet_get_session_manager_main () +{ + return &session_manager_main; +} + +always_inline session_manager_t * +session_manager_get (u32 index) +{ + return pool_elt_at_index (session_manager_main.session_managers, index); +} + +always_inline unix_shared_memory_queue_t * +session_manager_get_vpp_event_queue (u32 thread_index) +{ + return session_manager_main.vpp_event_queues[thread_index]; +} + +always_inline session_manager_t * +connects_session_manager_get (session_manager_main_t * smm, + session_type_t session_type) +{ + return pool_elt_at_index (smm->session_managers, + smm->connect_manager_index[session_type]); +} + +void session_manager_get_segment_info (u32 index, u8 ** name, u32 * size); +int session_manager_flush_enqueue_events (u32 thread_index); +int +session_manager_add_first_segment (session_manager_main_t * smm, + session_manager_t * sm, u32 segment_size, + u8 ** segment_name); +void +session_manager_del (session_manager_main_t * smm, session_manager_t * sm); +void +connects_session_manager_init (session_manager_main_t * smm, u8 session_type); + +/* + * Stream session functions + */ + +stream_session_t *stream_session_lookup_listener4 (ip4_address_t * lcl, + u16 lcl_port, u8 proto); +stream_session_t *stream_session_lookup4 (ip4_address_t * lcl, + ip4_address_t * rmt, u16 lcl_port, + u16 rmt_port, u8 proto, + u32 thread_index); +stream_session_t *stream_session_lookup_listener6 (ip6_address_t * lcl, + u16 lcl_port, u8 proto); +stream_session_t *stream_session_lookup6 (ip6_address_t * lcl, + ip6_address_t * rmt, u16 lcl_port, + u16 rmt_port, u8, u32 thread_index); +transport_connection_t + * stream_session_lookup_transport4 (session_manager_main_t * smm, + ip4_address_t * lcl, + ip4_address_t * rmt, u16 lcl_port, + u16 rmt_port, u8 proto, + u32 thread_index); +transport_connection_t + * stream_session_lookup_transport6 (session_manager_main_t * smm, + ip6_address_t * lcl, + ip6_address_t * rmt, u16 lcl_port, + u16 rmt_port, u8 proto, + u32 thread_index); +stream_session_t *stream_session_lookup_listener (ip46_address_t * lcl, + u16 lcl_port, u8 proto); + +always_inline stream_session_t * +stream_session_get_tsi (u64 ti_and_si, u32 thread_index) +{ + ASSERT ((u32) (ti_and_si >> 32) == thread_index); + return pool_elt_at_index (session_manager_main.sessions[thread_index], + ti_and_si & 0xFFFFFFFFULL); +} + +always_inline stream_session_t * +stream_session_get (u64 si, u32 thread_index) +{ + return pool_elt_at_index (session_manager_main.sessions[thread_index], si); +} + +always_inline stream_session_t * +stream_session_get_if_valid (u64 si, u32 thread_index) +{ + if (thread_index >= vec_len (session_manager_main.sessions)) + return 0; + + if (pool_is_free_index (session_manager_main.sessions[thread_index], si)) + return 0; + + return pool_elt_at_index (session_manager_main.sessions[thread_index], si); +} + +always_inline stream_session_t * +stream_session_listener_get (u8 sst, u64 si) +{ + return pool_elt_at_index (session_manager_main.listen_sessions[sst], si); +} + +always_inline u32 +stream_session_get_index (stream_session_t * s) +{ + if (s->session_state == SESSION_STATE_LISTENING) + return s - session_manager_main.listen_sessions[s->session_type]; + + return s - session_manager_main.sessions[s->thread_index]; +} + +always_inline u32 +stream_session_max_enqueue (transport_connection_t * tc) +{ + stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); + return svm_fifo_max_enqueue (s->server_rx_fifo); +} + +int +stream_session_enqueue_data (transport_connection_t * tc, u8 * data, u16 len, + u8 queue_event); +u32 +stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, + u32 offset, u32 max_bytes); +u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes); + +void +stream_session_connect_notify (transport_connection_t * tc, u8 sst, + u8 is_fail); +void stream_session_accept_notify (transport_connection_t * tc); +void stream_session_disconnect_notify (transport_connection_t * tc); +void stream_session_delete_notify (transport_connection_t * tc); +void stream_session_reset_notify (transport_connection_t * tc); +int +stream_session_accept (transport_connection_t * tc, u32 listener_index, + u8 sst, u8 notify); +void stream_session_open (u8 sst, ip46_address_t * addr, + u16 port_host_byte_order, u32 api_client_index); +void stream_session_disconnect (stream_session_t * s); +void stream_session_cleanup (stream_session_t * s); +int +stream_session_start_listen (u32 server_index, ip46_address_t * ip, u16 port); +void stream_session_stop_listen (u32 server_index); + +u8 *format_stream_session (u8 * s, va_list * args); + +void session_register_transport (u8 type, const transport_proto_vft_t * vft); +transport_proto_vft_t *session_get_transport_vft (u8 type); + +#endif /* __included_session_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c new file mode 100644 index 00000000..9d068684 --- /dev/null +++ b/src/vnet/session/session_api.c @@ -0,0 +1,821 @@ +/* + * Copyright (c) 2015-2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include "application_interface.h" + +#define vl_typedefs /* define message structures */ +#include +#undef vl_typedefs + +#define vl_endianfun /* define message structures */ +#include +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include +#undef vl_printfun + +#include + +#define foreach_session_api_msg \ +_(MAP_ANOTHER_SEGMENT_REPLY, map_another_segment_reply) \ +_(BIND_URI, bind_uri) \ +_(UNBIND_URI, unbind_uri) \ +_(CONNECT_URI, connect_uri) \ +_(DISCONNECT_SESSION, disconnect_session) \ +_(DISCONNECT_SESSION_REPLY, disconnect_session_reply) \ +_(ACCEPT_SESSION_REPLY, accept_session_reply) \ +_(RESET_SESSION_REPLY, reset_session_reply) \ +_(BIND_SOCK, bind_sock) \ +_(UNBIND_SOCK, unbind_sock) \ +_(CONNECT_SOCK, connect_sock) \ +_(DISCONNECT_SOCK, disconnect_sock) \ +_(DISCONNECT_SOCK_REPLY, disconnect_sock_reply) \ +_(ACCEPT_SOCK_REPLY, accept_sock_reply) \ +_(RESET_SOCK_REPLY, reset_sock_reply) \ + +static int +send_add_segment_callback (u32 api_client_index, const u8 * segment_name, + u32 segment_size) +{ + vl_api_map_another_segment_t *mp; + unix_shared_memory_queue_t *q; + + q = vl_api_client_index_to_input_queue (api_client_index); + + if (!q) + return -1; + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_MAP_ANOTHER_SEGMENT); + mp->segment_size = segment_size; + strncpy ((char *) mp->segment_name, (char *) segment_name, + sizeof (mp->segment_name) - 1); + + vl_msg_api_send_shmem (q, (u8 *) & mp); + + return 0; +} + +static int +send_session_accept_uri_callback (stream_session_t * s) +{ + vl_api_accept_session_t *mp; + unix_shared_memory_queue_t *q, *vpp_queue; + application_t *server = application_get (s->app_index); + + q = vl_api_client_index_to_input_queue (server->api_client_index); + vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); + + if (!q) + return -1; + + mp = vl_msg_api_alloc (sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_ACCEPT_SESSION); + + /* Note: session_type is the first octet in all types of sessions */ + + mp->accept_cookie = server->accept_cookie; + mp->server_rx_fifo = (u64) s->server_rx_fifo; + mp->server_tx_fifo = (u64) s->server_tx_fifo; + mp->session_thread_index = s->thread_index; + mp->session_index = s->session_index; + mp->session_type = s->session_type; + mp->vpp_event_queue_address = (u64) vpp_queue; + vl_msg_api_send_shmem (q, (u8 *) & mp); + + return 0; +} + +static void +send_session_disconnect_uri_callback (stream_session_t * s) +{ + vl_api_disconnect_session_t *mp; + unix_shared_memory_queue_t *q; + application_t *app = application_get (s->app_index); + + q = vl_api_client_index_to_input_queue (app->api_client_index); + + if (!q) + return; + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_DISCONNECT_SESSION); + + mp->session_thread_index = s->thread_index; + mp->session_index = s->session_index; + vl_msg_api_send_shmem (q, (u8 *) & mp); +} + +static int +send_session_connected_uri_callback (u32 api_client_index, + stream_session_t * s, u8 is_fail) +{ + vl_api_connect_uri_reply_t *mp; + unix_shared_memory_queue_t *q; + application_t *app = application_lookup (api_client_index); + u8 *seg_name; + unix_shared_memory_queue_t *vpp_queue; + + q = vl_api_client_index_to_input_queue (app->api_client_index); + + if (!q) + return -1; + + mp = vl_msg_api_alloc (sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_CONNECT_URI_REPLY); + mp->context = app->api_context; + mp->retval = is_fail; + if (!is_fail) + { + vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); + mp->server_rx_fifo = (u64) s->server_rx_fifo; + mp->server_tx_fifo = (u64) s->server_tx_fifo; + mp->session_thread_index = s->thread_index; + mp->session_index = s->session_index; + mp->session_type = s->session_type; + mp->vpp_event_queue_address = (u64) vpp_queue; + mp->client_event_queue_address = (u64) app->event_queue; + + session_manager_get_segment_info (s->server_segment_index, &seg_name, + &mp->segment_size); + mp->segment_name_length = vec_len (seg_name); + if (mp->segment_name_length) + clib_memcpy (mp->segment_name, seg_name, mp->segment_name_length); + } + + vl_msg_api_send_shmem (q, (u8 *) & mp); + + /* Remove client if connect failed */ + if (is_fail) + application_del (app); + + return 0; +} + +/** + * Redirect a connect_uri message to the indicated server. + * Only sent if the server has bound the related port with + * URI_OPTIONS_FLAGS_USE_FIFO + */ +static int +redirect_connect_uri_callback (u32 server_api_client_index, void *mp_arg) +{ + vl_api_connect_uri_t *mp = mp_arg; + unix_shared_memory_queue_t *server_q, *client_q; + vlib_main_t *vm = vlib_get_main (); + f64 timeout = vlib_time_now (vm) + 0.5; + int rv = 0; + + server_q = vl_api_client_index_to_input_queue (server_api_client_index); + + if (!server_q) + { + rv = VNET_API_ERROR_INVALID_VALUE; + goto out; + } + + client_q = vl_api_client_index_to_input_queue (mp->client_index); + if (!client_q) + { + rv = VNET_API_ERROR_INVALID_VALUE_2; + goto out; + } + + /* Tell the server the client's API queue address, so it can reply */ + mp->client_queue_address = (u64) client_q; + + /* + * Bounce message handlers MUST NOT block the data-plane. + * Spin waiting for the queue lock, but + */ + + while (vlib_time_now (vm) < timeout) + { + rv = + unix_shared_memory_queue_add (server_q, (u8 *) & mp, 1 /*nowait */ ); + switch (rv) + { + /* correctly enqueued */ + case 0: + return VNET_CONNECT_REDIRECTED; + + /* continue spinning, wait for pthread_mutex_trylock to work */ + case -1: + continue; + + /* queue stuffed, drop the msg */ + case -2: + rv = VNET_API_ERROR_QUEUE_FULL; + goto out; + } + } +out: + /* Dispose of the message */ + vl_msg_api_free (mp); + return rv; +} + +static u64 +make_session_handle (stream_session_t * s) +{ + return (u64) s->session_index << 32 | (u64) s->thread_index; +} + +static int +send_session_accept_callback (stream_session_t * s) +{ + vl_api_accept_sock_t *mp; + unix_shared_memory_queue_t *q, *vpp_queue; + application_t *server = application_get (s->app_index); + + q = vl_api_client_index_to_input_queue (server->api_client_index); + vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); + + if (!q) + return -1; + + mp = vl_msg_api_alloc (sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_ACCEPT_SOCK); + + /* Note: session_type is the first octet in all types of sessions */ + + mp->accept_cookie = server->accept_cookie; + mp->server_rx_fifo = (u64) s->server_rx_fifo; + mp->server_tx_fifo = (u64) s->server_tx_fifo; + mp->handle = make_session_handle (s); + mp->vpp_event_queue_address = (u64) vpp_queue; + vl_msg_api_send_shmem (q, (u8 *) & mp); + + return 0; +} + +static int +send_session_connected_callback (u32 api_client_index, stream_session_t * s, + u8 is_fail) +{ + vl_api_connect_sock_reply_t *mp; + unix_shared_memory_queue_t *q; + application_t *app = application_lookup (api_client_index); + u8 *seg_name; + unix_shared_memory_queue_t *vpp_queue; + + q = vl_api_client_index_to_input_queue (app->api_client_index); + + if (!q) + return -1; + + mp = vl_msg_api_alloc (sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_CONNECT_SOCK_REPLY); + mp->context = app->api_context; + mp->retval = is_fail; + if (!is_fail) + { + vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); + mp->server_rx_fifo = (u64) s->server_rx_fifo; + mp->server_tx_fifo = (u64) s->server_tx_fifo; + mp->handle = make_session_handle (s); + mp->vpp_event_queue_address = (u64) vpp_queue; + mp->client_event_queue_address = (u64) app->event_queue; + + session_manager_get_segment_info (s->server_segment_index, &seg_name, + &mp->segment_size); + mp->segment_name_length = vec_len (seg_name); + if (mp->segment_name_length) + clib_memcpy (mp->segment_name, seg_name, mp->segment_name_length); + } + + vl_msg_api_send_shmem (q, (u8 *) & mp); + + /* Remove client if connect failed */ + if (is_fail) + application_del (app); + + return 0; +} + +static void +send_session_disconnect_callback (stream_session_t * s) +{ + vl_api_disconnect_sock_t *mp; + unix_shared_memory_queue_t *q; + application_t *app = application_get (s->app_index); + + q = vl_api_client_index_to_input_queue (app->api_client_index); + + if (!q) + return; + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_DISCONNECT_SOCK); + + mp->handle = make_session_handle (s); + vl_msg_api_send_shmem (q, (u8 *) & mp); +} + +/** + * Redirect a connect_uri message to the indicated server. + * Only sent if the server has bound the related port with + * URI_OPTIONS_FLAGS_USE_FIFO + */ +static int +redirect_connect_callback (u32 server_api_client_index, void *mp_arg) +{ + vl_api_connect_sock_t *mp = mp_arg; + unix_shared_memory_queue_t *server_q, *client_q; + vlib_main_t *vm = vlib_get_main (); + f64 timeout = vlib_time_now (vm) + 0.5; + int rv = 0; + + server_q = vl_api_client_index_to_input_queue (server_api_client_index); + + if (!server_q) + { + rv = VNET_API_ERROR_INVALID_VALUE; + goto out; + } + + client_q = vl_api_client_index_to_input_queue (mp->client_index); + if (!client_q) + { + rv = VNET_API_ERROR_INVALID_VALUE_2; + goto out; + } + + /* Tell the server the client's API queue address, so it can reply */ + mp->client_queue_address = (u64) client_q; + + /* + * Bounce message handlers MUST NOT block the data-plane. + * Spin waiting for the queue lock, but + */ + + while (vlib_time_now (vm) < timeout) + { + rv = + unix_shared_memory_queue_add (server_q, (u8 *) & mp, 1 /*nowait */ ); + switch (rv) + { + /* correctly enqueued */ + case 0: + return VNET_CONNECT_REDIRECTED; + + /* continue spinning, wait for pthread_mutex_trylock to work */ + case -1: + continue; + + /* queue stuffed, drop the msg */ + case -2: + rv = VNET_API_ERROR_QUEUE_FULL; + goto out; + } + } +out: + /* Dispose of the message */ + vl_msg_api_free (mp); + return rv; +} + +static session_cb_vft_t uri_session_cb_vft = { + .session_accept_callback = send_session_accept_uri_callback, + .session_disconnect_callback = send_session_disconnect_uri_callback, + .session_connected_callback = send_session_connected_uri_callback, + .add_segment_callback = send_add_segment_callback, + .redirect_connect_callback = redirect_connect_uri_callback +}; + +static session_cb_vft_t session_cb_vft = { + .session_accept_callback = send_session_accept_callback, + .session_disconnect_callback = send_session_disconnect_callback, + .session_connected_callback = send_session_connected_callback, + .add_segment_callback = send_add_segment_callback, + .redirect_connect_callback = redirect_connect_callback +}; + +static int +api_session_not_valid (u32 session_index, u32 thread_index) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + stream_session_t *pool; + + if (thread_index >= vec_len (smm->sessions)) + return VNET_API_ERROR_INVALID_VALUE; + + pool = smm->sessions[thread_index]; + + if (pool_is_free_index (pool, session_index)) + return VNET_API_ERROR_INVALID_VALUE_2; + + return 0; +} + +static void +vl_api_bind_uri_t_handler (vl_api_bind_uri_t * mp) +{ + vl_api_bind_uri_reply_t *rmp; + vnet_bind_args_t _a, *a = &_a; + char segment_name[128]; + u32 segment_name_length; + int rv; + + _Static_assert (sizeof (u64) * SESSION_OPTIONS_N_OPTIONS <= + sizeof (mp->options), + "Out of options, fix api message definition"); + + segment_name_length = ARRAY_LEN (segment_name); + + memset (a, 0, sizeof (*a)); + + a->uri = (char *) mp->uri; + a->api_client_index = mp->client_index; + a->options = mp->options; + a->segment_name = segment_name; + a->segment_name_length = segment_name_length; + a->session_cb_vft = &uri_session_cb_vft; + + a->options[SESSION_OPTIONS_SEGMENT_SIZE] = mp->initial_segment_size; + a->options[SESSION_OPTIONS_ACCEPT_COOKIE] = mp->accept_cookie; + rv = vnet_bind_uri (a); + + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_BIND_URI_REPLY, ({ + rmp->retval = rv; + if (!rv) + { + rmp->segment_name_length = 0; + /* $$$$ policy? */ + rmp->segment_size = mp->initial_segment_size; + if (segment_name_length) + { + memcpy (rmp->segment_name, segment_name, segment_name_length); + rmp->segment_name_length = segment_name_length; + } + rmp->server_event_queue_address = a->server_event_queue_address; + } + })); + /* *INDENT-ON* */ + +} + +static void +vl_api_unbind_uri_t_handler (vl_api_unbind_uri_t * mp) +{ + vl_api_unbind_uri_reply_t *rmp; + int rv; + + rv = vnet_unbind_uri ((char *) mp->uri, mp->client_index); + + REPLY_MACRO (VL_API_UNBIND_URI_REPLY); +} + +static void +vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) +{ + vnet_connect_args_t _a, *a = &_a; + + a->uri = (char *) mp->uri; + a->api_client_index = mp->client_index; + a->api_context = mp->context; + a->options = mp->options; + a->session_cb_vft = &uri_session_cb_vft; + a->mp = mp; + vnet_connect_uri (a); +} + +static void +vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) +{ + vl_api_disconnect_session_reply_t *rmp; + int rv; + + rv = api_session_not_valid (mp->session_index, mp->session_thread_index); + if (!rv) + rv = vnet_disconnect_session (mp->client_index, mp->session_index, + mp->session_thread_index); + + REPLY_MACRO (VL_API_DISCONNECT_SESSION_REPLY); +} + +static void +vl_api_disconnect_session_reply_t_handler (vl_api_disconnect_session_reply_t * + mp) +{ + if (api_session_not_valid (mp->session_index, mp->session_thread_index)) + { + clib_warning ("Invalid session!"); + return; + } + + /* Client objected to disconnecting the session, log and continue */ + if (mp->retval) + { + clib_warning ("client retval %d", mp->retval); + return; + } + + /* Disconnect has been confirmed. Confirm close to transport */ + vnet_disconnect_session (mp->client_index, mp->session_index, + mp->session_thread_index); +} + +static void +vl_api_reset_session_reply_t_handler (vl_api_reset_session_reply_t * mp) +{ + stream_session_t *s; + + if (api_session_not_valid (mp->session_index, mp->session_thread_index)) + { + clib_warning ("Invalid session!"); + return; + } + + /* Client objected to resetting the session, log and continue */ + if (mp->retval) + { + clib_warning ("client retval %d", mp->retval); + return; + } + + s = stream_session_get (mp->session_index, mp->session_thread_index); + + /* This comes as a response to a reset, transport only waiting for + * confirmation to remove connection state, no need to disconnect */ + stream_session_cleanup (s); +} + +static void +vl_api_accept_session_reply_t_handler (vl_api_accept_session_reply_t * mp) +{ + stream_session_t *s; + int rv; + + if (api_session_not_valid (mp->session_index, mp->session_thread_index)) + return; + + s = stream_session_get (mp->session_index, mp->session_thread_index); + rv = mp->retval; + + if (rv) + { + /* Server isn't interested, kill the session */ + stream_session_disconnect (s); + return; + } + + s->session_state = SESSION_STATE_READY; +} + +static void +vl_api_map_another_segment_reply_t_handler (vl_api_map_another_segment_reply_t + * mp) +{ + clib_warning ("not implemented"); +} + +static void +vl_api_bind_sock_t_handler (vl_api_bind_sock_t * mp) +{ + vl_api_bind_sock_reply_t *rmp; + vnet_bind_args_t _a, *a = &_a; + char segment_name[128]; + u32 segment_name_length; + int rv; + + STATIC_ASSERT (sizeof (u64) * SESSION_OPTIONS_N_OPTIONS <= + sizeof (mp->options), + "Out of options, fix api message definition"); + + segment_name_length = ARRAY_LEN (segment_name); + + memset (a, 0, sizeof (*a)); + + clib_memcpy (&a->tep.ip, mp->ip, + (mp->is_ip4 ? sizeof (ip4_address_t) : + sizeof (ip6_address_t))); + a->tep.is_ip4 = mp->is_ip4; + a->tep.port = mp->port; + a->tep.vrf = mp->vrf; + + a->api_client_index = mp->client_index; + a->options = mp->options; + a->segment_name = segment_name; + a->segment_name_length = segment_name_length; + a->session_cb_vft = &session_cb_vft; + + rv = vnet_bind_uri (a); + + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_BIND_SOCK_REPLY, ({ + rmp->retval = rv; + if (!rv) + { + rmp->segment_name_length = 0; + rmp->segment_size = mp->options[SESSION_OPTIONS_SEGMENT_SIZE]; + if (segment_name_length) + { + memcpy(rmp->segment_name, segment_name, segment_name_length); + rmp->segment_name_length = segment_name_length; + } + rmp->server_event_queue_address = a->server_event_queue_address; + } + })); + /* *INDENT-ON* */ +} + +static void +vl_api_unbind_sock_t_handler (vl_api_unbind_sock_t * mp) +{ + vl_api_unbind_sock_reply_t *rmp; + vnet_unbind_args_t _a, *a = &_a; + int rv; + + a->api_client_index = mp->client_index; + a->handle = mp->handle; + + rv = vnet_unbind (a); + + REPLY_MACRO (VL_API_UNBIND_SOCK_REPLY); +} + +static void +vl_api_connect_sock_t_handler (vl_api_connect_sock_t * mp) +{ + vnet_connect_args_t _a, *a = &_a; + + clib_memcpy (&a->tep.ip, mp->ip, + (mp->is_ip4 ? sizeof (ip4_address_t) : + sizeof (ip6_address_t))); + a->tep.is_ip4 = mp->is_ip4; + a->tep.port = mp->port; + a->tep.vrf = mp->vrf; + a->options = mp->options; + a->session_cb_vft = &session_cb_vft; + a->api_context = mp->context; + a->mp = mp; + + vnet_connect (a); +} + +static void +vl_api_disconnect_sock_t_handler (vl_api_disconnect_sock_t * mp) +{ + vnet_disconnect_args_t _a, *a = &_a; + vl_api_disconnect_sock_reply_t *rmp; + int rv; + + a->api_client_index = mp->client_index; + a->handle = mp->handle; + rv = vnet_disconnect (a); + + REPLY_MACRO (VL_API_DISCONNECT_SOCK_REPLY); +} + +static void +vl_api_disconnect_sock_reply_t_handler (vl_api_disconnect_sock_reply_t * mp) +{ + vnet_disconnect_args_t _a, *a = &_a; + + /* Client objected to disconnecting the session, log and continue */ + if (mp->retval) + { + clib_warning ("client retval %d", mp->retval); + return; + } + + a->api_client_index = mp->client_index; + a->handle = mp->handle; + + vnet_disconnect (a); +} + +static void +vl_api_reset_sock_reply_t_handler (vl_api_reset_sock_reply_t * mp) +{ + stream_session_t *s; + u32 session_index, thread_index; + + /* Client objected to resetting the session, log and continue */ + if (mp->retval) + { + clib_warning ("client retval %d", mp->retval); + return; + } + + if (api_parse_session_handle (mp->handle, &session_index, &thread_index)) + { + clib_warning ("Invalid handle"); + return; + } + + s = stream_session_get (session_index, thread_index); + + /* This comes as a response to a reset, transport only waiting for + * confirmation to remove connection state, no need to disconnect */ + stream_session_cleanup (s); +} + +static void +vl_api_accept_sock_reply_t_handler (vl_api_accept_sock_reply_t * mp) +{ + stream_session_t *s; + u32 session_index, thread_index; + + if (api_parse_session_handle (mp->handle, &session_index, &thread_index)) + { + clib_warning ("Invalid handle"); + return; + } + s = stream_session_get (session_index, thread_index); + + if (mp->retval) + { + /* Server isn't interested, kill the session */ + stream_session_disconnect (s); + return; + } + + s->session_state = SESSION_STATE_READY; +} + +#define vl_msg_name_crc_list +#include +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (api_main_t * am) +{ +#define _(id,n,crc) vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id); + foreach_vl_msg_name_crc_session; +#undef _ +} + +/* + * session_api_hookup + * Add uri's API message handlers to the table. + * vlib has alread mapped shared memory and + * added the client registration handlers. + * See .../open-repo/vlib/memclnt_vlib.c:memclnt_process() + */ +static clib_error_t * +session_api_hookup (vlib_main_t * vm) +{ + api_main_t *am = &api_main; + +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_session_api_msg; +#undef _ + + /* + * Messages which bounce off the data-plane to + * an API client. Simply tells the message handling infra not + * to free the message. + * + * Bounced message handlers MUST NOT block the data plane + */ + am->message_bounce[VL_API_CONNECT_URI] = 1; + am->message_bounce[VL_API_CONNECT_SOCK] = 1; + + /* + * Set up the (msg_name, crc, message-id) table + */ + setup_message_id_table (am); + + return 0; +} + +VLIB_API_INIT_FUNCTION (session_api_hookup); +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c new file mode 100644 index 00000000..b2943a1c --- /dev/null +++ b/src/vnet/session/session_cli.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +/** + * Format stream session as per the following format + * + * verbose: + * "Connection", "Rx fifo", "Tx fifo", "Session Index" + * non-verbose: + * "Connection" + */ +u8 * +format_stream_session (u8 * s, va_list * args) +{ + stream_session_t *ss = va_arg (*args, stream_session_t *); + int verbose = va_arg (*args, int); + transport_proto_vft_t *tp_vft; + u8 *str = 0; + + tp_vft = session_get_transport_vft (ss->session_type); + + if (verbose) + str = format (0, "%-20llp%-20llp%-15lld", ss->server_rx_fifo, + ss->server_tx_fifo, stream_session_get_index (ss)); + + if (ss->session_state == SESSION_STATE_READY) + { + s = format (s, "%-40U%v", tp_vft->format_connection, + ss->connection_index, ss->thread_index, str); + } + else if (ss->session_state == SESSION_STATE_LISTENING) + { + s = format (s, "%-40U%v", tp_vft->format_listener, ss->connection_index, + str); + } + else if (ss->session_state == SESSION_STATE_READY) + { + s = + format (s, "%-40U%v", tp_vft->format_half_open, ss->connection_index, + str); + } + else if (ss->session_state == SESSION_STATE_CLOSED) + { + s = format (s, "[CL] %-40U%v", tp_vft->format_connection, + ss->connection_index, ss->thread_index, str); + } + else + { + clib_warning ("Session in unknown state!"); + } + + vec_free (str); + + return s; +} + +static clib_error_t * +show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + session_manager_main_t *smm = &session_manager_main; + int verbose = 0, i; + stream_session_t *pool; + stream_session_t *s; + u8 *str = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else + break; + } + + for (i = 0; i < vec_len (smm->sessions); i++) + { + u32 once_per_pool; + pool = smm->sessions[i]; + + once_per_pool = 1; + + if (pool_elts (pool)) + { + + vlib_cli_output (vm, "Thread %d: %d active sessions", + i, pool_elts (pool)); + if (verbose) + { + if (once_per_pool) + { + str = format (str, "%-40s%-20s%-20s%-15s", + "Connection", "Rx fifo", "Tx fifo", + "Session Index"); + vlib_cli_output (vm, "%v", str); + vec_reset_length (str); + once_per_pool = 0; + } + + /* *INDENT-OFF* */ + pool_foreach (s, pool, + ({ + vlib_cli_output (vm, "%U", format_stream_session, s, verbose); + })); + /* *INDENT-ON* */ + } + } + else + vlib_cli_output (vm, "Thread %d: no active sessions", i); + } + vec_free (str); + + return 0; +} + +VLIB_CLI_COMMAND (show_uri_command, static) = +{ +.path = "show session",.short_help = "show session [verbose]",.function = + show_session_command_fn,}; + + +static clib_error_t * +clear_session_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + session_manager_main_t *smm = &session_manager_main; + u32 thread_index = 0; + u32 session_index = ~0; + stream_session_t *pool, *session; + application_t *server; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "thread %d", &thread_index)) + ; + else if (unformat (input, "session %d", &session_index)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (session_index == ~0) + return clib_error_return (0, "session required, but not set."); + + if (thread_index > vec_len (smm->sessions)) + return clib_error_return (0, "thread %d out of range [0-%d]", + thread_index, vec_len (smm->sessions)); + + pool = smm->sessions[thread_index]; + + if (pool_is_free_index (pool, session_index)) + return clib_error_return (0, "session %d not active", session_index); + + session = pool_elt_at_index (pool, session_index); + server = application_get (session->app_index); + + /* Disconnect both app and transport */ + server->cb_fns.session_disconnect_callback (session); + + return 0; +} + +VLIB_CLI_COMMAND (clear_uri_session_command, static) = +{ +.path = "clear session",.short_help = + "clear session thread session ",.function = + clear_session_command_fn,}; + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/transport.c b/src/vnet/session/transport.c new file mode 100644 index 00000000..abd94ba4 --- /dev/null +++ b/src/vnet/session/transport.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +u32 +transport_endpoint_lookup (transport_endpoint_table_t *ht, ip46_address_t *ip, + u16 port) +{ + clib_bihash_kv_24_8_t kv; + int rv; + + kv.key[0] = ip->as_u64[0]; + kv.key[1] = ip->as_u64[1]; + kv.key[2] = port; + + rv = clib_bihash_search_inline_24_8 (ht, &kv); + if (rv == 0) + return kv.value; + + return TRANSPORT_ENDPOINT_INVALID_INDEX; +} + +void +transport_endpoint_table_add (transport_endpoint_table_t *ht, + transport_endpoint_t *te, u32 value) +{ + clib_bihash_kv_24_8_t kv; + + kv.key[0] = te->ip.as_u64[0]; + kv.key[1] = te->ip.as_u64[1]; + kv.key[2] = te->port; + kv.value = value; + + clib_bihash_add_del_24_8 (ht, &kv, 1); +} + +void +transport_endpoint_table_del (transport_endpoint_table_t *ht, + transport_endpoint_t *te) +{ + clib_bihash_kv_24_8_t kv; + + kv.key[0] = te->ip.as_u64[0]; + kv.key[1] = te->ip.as_u64[1]; + kv.key[2] = te->port; + + clib_bihash_add_del_24_8 (ht, &kv, 0); +} + + + diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h new file mode 100644 index 00000000..2d4415ba --- /dev/null +++ b/src/vnet/session/transport.h @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef VNET_VNET_URI_TRANSPORT_H_ +#define VNET_VNET_URI_TRANSPORT_H_ + +#include +#include +#include +#include + +/* + * Protocol independent transport properties associated to a session + */ +typedef struct _transport_connection +{ + ip46_address_t rmt_ip; /**< Remote IP */ + ip46_address_t lcl_ip; /**< Local IP */ + u16 lcl_port; /**< Local port */ + u16 rmt_port; /**< Remote port */ + u8 proto; /**< Transport protocol id */ + + u32 s_index; /**< Parent session index */ + u32 c_index; /**< Connection index in transport pool */ + u8 is_ip4; /**< Flag if IP4 connection */ + u32 thread_index; /**< Worker-thread index */ + + /** Macros for 'derived classes' where base is named "connection" */ +#define c_lcl_ip connection.lcl_ip +#define c_rmt_ip connection.rmt_ip +#define c_lcl_ip4 connection.lcl_ip.ip4 +#define c_rmt_ip4 connection.rmt_ip.ip4 +#define c_lcl_ip6 connection.lcl_ip.ip6 +#define c_rmt_ip6 connection.rmt_ip.ip6 +#define c_lcl_port connection.lcl_port +#define c_rmt_port connection.rmt_port +#define c_proto connection.proto +#define c_state connection.state +#define c_s_index connection.s_index +#define c_c_index connection.c_index +#define c_is_ip4 connection.is_ip4 +#define c_thread_index connection.thread_index +} transport_connection_t; + +/* + * Transport protocol virtual function table + */ +typedef struct _transport_proto_vft +{ + /* + * Setup + */ + u32 (*bind) (vlib_main_t *, u32, ip46_address_t *, u16); + u32 (*unbind) (vlib_main_t *, u32); + int (*open) (ip46_address_t * addr, u16 port_host_byte_order); + void (*close) (u32 conn_index, u32 thread_index); + void (*cleanup) (u32 conn_index, u32 thread_index); + + /* + * Transmission + */ + u32 (*push_header) (transport_connection_t * tconn, vlib_buffer_t * b); + u16 (*send_mss) (transport_connection_t * tc); + u32 (*send_space) (transport_connection_t * tc); + u32 (*rx_fifo_offset) (transport_connection_t * tc); + + /* + * Connection retrieval + */ + transport_connection_t *(*get_connection) (u32 conn_idx, u32 thread_idx); + transport_connection_t *(*get_listener) (u32 conn_index); + transport_connection_t *(*get_half_open) (u32 conn_index); + + /* + * Format + */ + u8 *(*format_connection) (u8 * s, va_list * args); + u8 *(*format_listener) (u8 * s, va_list * args); + u8 *(*format_half_open) (u8 * s, va_list * args); + +} transport_proto_vft_t; + +/* 16 octets */ +typedef CLIB_PACKED (struct + { + union + { + struct + { + ip4_address_t src; ip4_address_t dst; + u16 src_port; + u16 dst_port; + /* align by making this 4 octets even though its a 1-bit field + * NOTE: avoid key overlap with other transports that use 5 tuples for + * session identification. + */ + u32 proto; + }; + u64 as_u64[2]; + }; + }) v4_connection_key_t; + +typedef CLIB_PACKED (struct + { + union + { + struct + { + /* 48 octets */ + ip6_address_t src; ip6_address_t dst; + u16 src_port; + u16 dst_port; u32 proto; u8 unused_for_now[8]; + }; u64 as_u64[6]; + }; + }) v6_connection_key_t; + +typedef clib_bihash_kv_16_8_t session_kv4_t; +typedef clib_bihash_kv_48_8_t session_kv6_t; + +always_inline void +make_v4_ss_kv (session_kv4_t * kv, ip4_address_t * lcl, ip4_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + v4_connection_key_t key; + memset (&key, 0, sizeof (v4_connection_key_t)); + + key.src.as_u32 = lcl->as_u32; + key.dst.as_u32 = rmt->as_u32; + key.src_port = lcl_port; + key.dst_port = rmt_port; + key.proto = proto; + + kv->key[0] = key.as_u64[0]; + kv->key[1] = key.as_u64[1]; + kv->value = ~0ULL; +} + +always_inline void +make_v4_listener_kv (session_kv4_t * kv, ip4_address_t * lcl, u16 lcl_port, + u8 proto) +{ + v4_connection_key_t key; + memset (&key, 0, sizeof (v4_connection_key_t)); + + key.src.as_u32 = lcl->as_u32; + key.dst.as_u32 = 0; + key.src_port = lcl_port; + key.dst_port = 0; + key.proto = proto; + + kv->key[0] = key.as_u64[0]; + kv->key[1] = key.as_u64[1]; + kv->value = ~0ULL; +} + +always_inline void +make_v4_ss_kv_from_tc (session_kv4_t * kv, transport_connection_t * t) +{ + return make_v4_ss_kv (kv, &t->lcl_ip.ip4, &t->rmt_ip.ip4, t->lcl_port, + t->rmt_port, t->proto); +} + +always_inline void +make_v6_ss_kv (session_kv6_t * kv, ip6_address_t * lcl, ip6_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + v6_connection_key_t key; + memset (&key, 0, sizeof (v6_connection_key_t)); + + key.src.as_u64[0] = lcl->as_u64[0]; + key.src.as_u64[1] = lcl->as_u64[1]; + key.dst.as_u64[0] = rmt->as_u64[0]; + key.dst.as_u64[1] = rmt->as_u64[1]; + key.src_port = lcl_port; + key.dst_port = rmt_port; + key.proto = proto; + + kv->key[0] = key.as_u64[0]; + kv->key[1] = key.as_u64[1]; + kv->value = ~0ULL; +} + +always_inline void +make_v6_listener_kv (session_kv6_t * kv, ip6_address_t * lcl, u16 lcl_port, + u8 proto) +{ + v6_connection_key_t key; + memset (&key, 0, sizeof (v6_connection_key_t)); + + key.src.as_u64[0] = lcl->as_u64[0]; + key.src.as_u64[1] = lcl->as_u64[1]; + key.dst.as_u64[0] = 0; + key.dst.as_u64[1] = 0; + key.src_port = lcl_port; + key.dst_port = 0; + key.proto = proto; + + kv->key[0] = key.as_u64[0]; + kv->key[1] = key.as_u64[1]; + kv->value = ~0ULL; +} + +always_inline void +make_v6_ss_kv_from_tc (session_kv6_t * kv, transport_connection_t * t) +{ + make_v6_ss_kv (kv, &t->lcl_ip.ip6, &t->rmt_ip.ip6, t->lcl_port, + t->rmt_port, t->proto); +} + +typedef struct _transport_endpoint +{ + ip46_address_t ip; + u16 port; + u8 is_ip4; + u32 vrf; +} transport_endpoint_t; + +typedef clib_bihash_24_8_t transport_endpoint_table_t; + +#define TRANSPORT_ENDPOINT_INVALID_INDEX ((u32)~0) + +u32 +transport_endpoint_lookup (transport_endpoint_table_t * ht, + ip46_address_t * ip, u16 port); +void transport_endpoint_table_add (transport_endpoint_table_t * ht, + transport_endpoint_t * te, u32 value); +void transport_endpoint_table_del (transport_endpoint_table_t * ht, + transport_endpoint_t * te); + +#endif /* VNET_VNET_URI_TRANSPORT_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c new file mode 100644 index 00000000..0f9b7097 --- /dev/null +++ b/src/vnet/tcp/tcp.c @@ -0,0 +1,708 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +tcp_main_t tcp_main; + +static u32 +tcp_connection_bind (vlib_main_t * vm, u32 session_index, ip46_address_t * ip, + u16 port_host_byte_order, u8 is_ip4) +{ + tcp_main_t *tm = &tcp_main; + tcp_connection_t *listener; + + pool_get (tm->listener_pool, listener); + memset (listener, 0, sizeof (*listener)); + + listener->c_c_index = listener - tm->listener_pool; + listener->c_lcl_port = clib_host_to_net_u16 (port_host_byte_order); + + if (is_ip4) + listener->c_lcl_ip4.as_u32 = ip->ip4.as_u32; + else + clib_memcpy (&listener->c_lcl_ip6, &ip->ip6, sizeof (ip6_address_t)); + + listener->c_s_index = session_index; + listener->c_proto = SESSION_TYPE_IP4_TCP; + listener->state = TCP_STATE_LISTEN; + listener->c_is_ip4 = 1; + + return listener->c_c_index; +} + +u32 +tcp_session_bind_ip4 (vlib_main_t * vm, u32 session_index, + ip46_address_t * ip, u16 port_host_byte_order) +{ + return tcp_connection_bind (vm, session_index, ip, port_host_byte_order, 1); +} + +u32 +tcp_session_bind_ip6 (vlib_main_t * vm, u32 session_index, + ip46_address_t * ip, u16 port_host_byte_order) +{ + return tcp_connection_bind (vm, session_index, ip, port_host_byte_order, 0); + +} + +static void +tcp_session_unbind (u32 listener_index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + pool_put_index (tm->listener_pool, listener_index); +} + +u32 +tcp_session_unbind_ip4 (vlib_main_t * vm, u32 listener_index) +{ + tcp_session_unbind (listener_index); + return 0; +} + +u32 +tcp_session_unbind_ip6 (vlib_main_t * vm, u32 listener_index) +{ + tcp_session_unbind (listener_index); + return 0; +} + +transport_connection_t * +tcp_session_get_listener (u32 listener_index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_connection_t *tc; + tc = pool_elt_at_index (tm->listener_pool, listener_index); + return &tc->connection; +} + +/** + * Cleans up connection state. + * + * No notifications. + */ +void +tcp_connection_cleanup (tcp_connection_t * tc) +{ + tcp_main_t *tm = &tcp_main; + u32 tepi; + transport_endpoint_t *tep; + + /* Cleanup local endpoint if this was an active connect */ + tepi = transport_endpoint_lookup (&tm->local_endpoints_table, &tc->c_lcl_ip, + tc->c_lcl_port); + + /*XXX lock */ + if (tepi != TRANSPORT_ENDPOINT_INVALID_INDEX) + { + tep = pool_elt_at_index (tm->local_endpoints, tepi); + transport_endpoint_table_del (&tm->local_endpoints_table, tep); + pool_put (tm->local_endpoints, tep); + } + + /* Make sure all timers are cleared */ + tcp_connection_timers_reset (tc); + + /* Check if half-open */ + if (tc->state == TCP_STATE_SYN_SENT) + pool_put (tm->half_open_connections, tc); + else + pool_put (tm->connections[tc->c_thread_index], tc); +} + +/** + * Connection removal. + * + * This should be called only once connection enters CLOSED state. Note + * that it notifies the session of the removal event, so if the goal is to + * just remove the connection, call tcp_connection_cleanup instead. + */ +void +tcp_connection_del (tcp_connection_t * tc) +{ + stream_session_delete_notify (&tc->connection); + tcp_connection_cleanup (tc); +} + +/** + * Begin connection closing procedure. + * + * If at the end the connection is not in CLOSED state, it is not removed. + * Instead, we rely on on TCP to advance through state machine to either + * 1) LAST_ACK (passive close) whereby when the last ACK is received + * tcp_connection_del is called. This notifies session of the delete and + * calls cleanup. + * 2) TIME_WAIT (active close) whereby after 2MSL the 2MSL timer triggers + * and cleanup is called. + */ +void +tcp_connection_close (tcp_connection_t * tc) +{ + /* Send FIN if needed */ + if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD + || tc->state == TCP_STATE_CLOSE_WAIT) + tcp_send_fin (tc); + + /* Switch state */ + if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD) + tc->state = TCP_STATE_FIN_WAIT_1; + else if (tc->state == TCP_STATE_SYN_SENT) + tc->state = TCP_STATE_CLOSED; + else if (tc->state == TCP_STATE_CLOSE_WAIT) + tc->state = TCP_STATE_LAST_ACK; + + /* Half-close connections are not supported XXX */ + + if (tc->state == TCP_STATE_CLOSED) + tcp_connection_del (tc); +} + +void +tcp_session_close (u32 conn_index, u32 thread_index) +{ + tcp_connection_t *tc; + tc = tcp_connection_get (conn_index, thread_index); + tcp_connection_close (tc); +} + +void +tcp_session_cleanup (u32 conn_index, u32 thread_index) +{ + tcp_connection_t *tc; + tc = tcp_connection_get (conn_index, thread_index); + tcp_connection_cleanup (tc); +} + +void * +ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4) +{ + ip_lookup_main_t *lm4 = &ip4_main.lookup_main; + ip_lookup_main_t *lm6 = &ip6_main.lookup_main; + ip_interface_address_t *ia = 0; + + if (is_ip4) + { + /* *INDENT-OFF* */ + foreach_ip_interface_address (lm4, ia, sw_if_index, 1 /* unnumbered */ , + ({ + return ip_interface_address_get_address (lm4, ia); + })); + /* *INDENT-ON* */ + } + else + { + /* *INDENT-OFF* */ + foreach_ip_interface_address (lm6, ia, sw_if_index, 1 /* unnumbered */ , + ({ + return ip_interface_address_get_address (lm6, ia); + })); + /* *INDENT-ON* */ + } + + return 0; +} + +/** + * Allocate local port and add if successful add entry to local endpoint + * table to mark the pair as used. + */ +u16 +tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) +{ + u8 unique = 0; + transport_endpoint_t *tep; + u32 time_now, tei; + u16 min = 1024, max = 65535, tries; /* XXX configurable ? */ + + tries = max - min; + time_now = tcp_time_now (); + + /* Start at random point or max */ + pool_get (tm->local_endpoints, tep); + clib_memcpy (&tep->ip, ip, sizeof (*ip)); + tep->port = random_u32 (&time_now) << 16; + tep->port = tep->port < min ? max : tep->port; + + /* Search for first free slot */ + while (tries) + { + tei = transport_endpoint_lookup (&tm->local_endpoints_table, &tep->ip, + tep->port); + if (tei == TRANSPORT_ENDPOINT_INVALID_INDEX) + { + unique = 1; + break; + } + + tep->port--; + + if (tep->port < min) + tep->port = max; + + tries--; + } + + if (unique) + { + transport_endpoint_table_add (&tm->local_endpoints_table, tep, + tep - tm->local_endpoints); + + return tep->port; + } + + /* Failed */ + pool_put (tm->local_endpoints, tep); + return -1; +} + +/** + * Initialize all connection timers as invalid + */ +void +tcp_connection_timers_init (tcp_connection_t * tc) +{ + int i; + + /* Set all to invalid */ + for (i = 0; i < TCP_N_TIMERS; i++) + { + tc->timers[i] = TCP_TIMER_HANDLE_INVALID; + } + + tc->rto = TCP_RTO_INIT; +} + +/** + * Stop all connection timers + */ +void +tcp_connection_timers_reset (tcp_connection_t * tc) +{ + int i; + for (i = 0; i < TCP_N_TIMERS; i++) + { + tcp_timer_reset (tc, i); + } +} + +/** Initialize tcp connection variables + * + * Should be called after having received a msg from the peer, i.e., a SYN or + * a SYNACK, such that connection options have already been exchanged. */ +void +tcp_connection_init_vars (tcp_connection_t * tc) +{ + tcp_connection_timers_init (tc); + tcp_set_snd_mss (tc); + tc->sack_sb.head = TCP_INVALID_SACK_HOLE_INDEX; + tcp_cc_init (tc); +} + +int +tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_connection_t *tc; + fib_prefix_t prefix; + u32 fei, sw_if_index; + ip46_address_t lcl_addr; + u16 lcl_port; + + /* + * Find the local address and allocate port + */ + memset (&lcl_addr, 0, sizeof (lcl_addr)); + + /* Find a FIB path to the destination */ + clib_memcpy (&prefix.fp_addr, rmt_addr, sizeof (*rmt_addr)); + prefix.fp_proto = is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6; + prefix.fp_len = is_ip4 ? 32 : 128; + + fei = fib_table_lookup (0, &prefix); + + /* Couldn't find route to destination. Bail out. */ + if (fei == FIB_NODE_INDEX_INVALID) + return -1; + + sw_if_index = fib_entry_get_resolving_interface (fei); + + if (sw_if_index == (u32) ~ 0) + return -1; + + if (is_ip4) + { + ip4_address_t *ip4; + ip4 = ip_interface_get_first_ip (sw_if_index, 1); + lcl_addr.ip4.as_u32 = ip4->as_u32; + } + else + { + ip6_address_t *ip6; + ip6 = ip_interface_get_first_ip (sw_if_index, 0); + clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6)); + } + + /* Allocate source port */ + lcl_port = tcp_allocate_local_port (tm, &lcl_addr); + if (lcl_port < 1) + return -1; + + /* + * Create connection and send SYN + */ + + pool_get (tm->half_open_connections, tc); + memset (tc, 0, sizeof (*tc)); + + clib_memcpy (&tc->c_rmt_ip, rmt_addr, sizeof (ip46_address_t)); + clib_memcpy (&tc->c_lcl_ip, &lcl_addr, sizeof (ip46_address_t)); + tc->c_rmt_port = clib_host_to_net_u16 (rmt_port); + tc->c_lcl_port = clib_host_to_net_u16 (lcl_port); + tc->c_c_index = tc - tm->half_open_connections; + tc->c_is_ip4 = is_ip4; + + /* The other connection vars will be initialized after SYN ACK */ + tcp_connection_timers_init (tc); + + tcp_send_syn (tc); + + tc->state = TCP_STATE_SYN_SENT; + + return tc->c_c_index; +} + +int +tcp_session_open_ip4 (ip46_address_t * addr, u16 port) +{ + return tcp_connection_open (addr, port, 1); +} + +int +tcp_session_open_ip6 (ip46_address_t * addr, u16 port) +{ + return tcp_connection_open (addr, port, 0); +} + +u8 * +format_tcp_session_ip4 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + u32 thread_index = va_arg (*args, u32); + tcp_connection_t *tc; + + tc = tcp_connection_get (tci, thread_index); + + s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip4_address, + &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip4_address, &tc->c_rmt_ip4, + clib_net_to_host_u16 (tc->c_rmt_port)); + + return s; +} + +u8 * +format_tcp_session_ip6 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + u32 thread_index = va_arg (*args, u32); + tcp_connection_t *tc = tcp_connection_get (tci, thread_index); + s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip6_address, + &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip6_address, &tc->c_rmt_ip6, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u8 * +format_tcp_listener_session_ip4 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + tcp_connection_t *tc = tcp_listener_get (tci); + s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip4_address, + &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip4_address, &tc->c_rmt_ip4, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u8 * +format_tcp_listener_session_ip6 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + tcp_connection_t *tc = tcp_listener_get (tci); + s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip6_address, + &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip6_address, &tc->c_rmt_ip6, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u8 * +format_tcp_half_open_session_ip4 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + tcp_connection_t *tc = tcp_half_open_connection_get (tci); + s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip4_address, + &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip4_address, &tc->c_rmt_ip4, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u8 * +format_tcp_half_open_session_ip6 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + tcp_connection_t *tc = tcp_half_open_connection_get (tci); + s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip6_address, + &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip6_address, &tc->c_rmt_ip6, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +transport_connection_t * +tcp_session_get_transport (u32 conn_index, u32 thread_index) +{ + tcp_connection_t *tc = tcp_connection_get (conn_index, thread_index); + return &tc->connection; +} + +transport_connection_t * +tcp_half_open_session_get_transport (u32 conn_index) +{ + tcp_connection_t *tc = tcp_half_open_connection_get (conn_index); + return &tc->connection; +} + +u16 +tcp_session_send_mss (transport_connection_t * trans_conn) +{ + tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + return tc->snd_mss; +} + +u32 +tcp_session_send_space (transport_connection_t * trans_conn) +{ + tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + return tcp_available_snd_space (tc); +} + +u32 +tcp_session_rx_fifo_offset (transport_connection_t * trans_conn) +{ + tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + return (tc->snd_una_max - tc->snd_una); +} + +/* *INDENT-OFF* */ +const static transport_proto_vft_t tcp4_proto = { + .bind = tcp_session_bind_ip4, + .unbind = tcp_session_unbind_ip4, + .push_header = tcp_push_header, + .get_connection = tcp_session_get_transport, + .get_listener = tcp_session_get_listener, + .get_half_open = tcp_half_open_session_get_transport, + .open = tcp_session_open_ip4, + .close = tcp_session_close, + .cleanup = tcp_session_cleanup, + .send_mss = tcp_session_send_mss, + .send_space = tcp_session_send_space, + .rx_fifo_offset = tcp_session_rx_fifo_offset, + .format_connection = format_tcp_session_ip4, + .format_listener = format_tcp_listener_session_ip4, + .format_half_open = format_tcp_half_open_session_ip4 +}; + +const static transport_proto_vft_t tcp6_proto = { + .bind = tcp_session_bind_ip6, + .unbind = tcp_session_unbind_ip6, + .push_header = tcp_push_header, + .get_connection = tcp_session_get_transport, + .get_listener = tcp_session_get_listener, + .get_half_open = tcp_half_open_session_get_transport, + .open = tcp_session_open_ip6, + .close = tcp_session_close, + .cleanup = tcp_session_cleanup, + .send_mss = tcp_session_send_mss, + .send_space = tcp_session_send_space, + .rx_fifo_offset = tcp_session_rx_fifo_offset, + .format_connection = format_tcp_session_ip6, + .format_listener = format_tcp_listener_session_ip6, + .format_half_open = format_tcp_half_open_session_ip6 +}; +/* *INDENT-ON* */ + +void +tcp_timer_keep_handler (u32 conn_index) +{ + u32 cpu_index = os_get_cpu_number (); + tcp_connection_t *tc; + + tc = tcp_connection_get (conn_index, cpu_index); + tc->timers[TCP_TIMER_KEEP] = TCP_TIMER_HANDLE_INVALID; + + tcp_connection_close (tc); +} + +void +tcp_timer_establish_handler (u32 conn_index) +{ + tcp_connection_t *tc; + u8 sst; + + tc = tcp_half_open_connection_get (conn_index); + tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; + + ASSERT (tc->state == TCP_STATE_SYN_SENT); + + sst = tc->c_is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; + stream_session_connect_notify (&tc->connection, sst, 1 /* fail */ ); + + tcp_connection_cleanup (tc); +} + +void +tcp_timer_2msl_handler (u32 conn_index) +{ + u32 cpu_index = os_get_cpu_number (); + tcp_connection_t *tc; + + tc = tcp_connection_get (conn_index, cpu_index); + tc->timers[TCP_TIMER_2MSL] = TCP_TIMER_HANDLE_INVALID; + + tcp_connection_del (tc); +} + +/* *INDENT-OFF* */ +static timer_expiration_handler *timer_expiration_handlers[TCP_N_TIMERS] = +{ + tcp_timer_retransmit_handler, + tcp_timer_delack_handler, + 0, + tcp_timer_keep_handler, + tcp_timer_2msl_handler, + tcp_timer_retransmit_syn_handler, + tcp_timer_establish_handler +}; +/* *INDENT-ON* */ + +static void +tcp_expired_timers_dispatch (u32 * expired_timers) +{ + int i; + u32 connection_index, timer_id; + + for (i = 0; i < vec_len (expired_timers); i++) + { + /* Get session index and timer id */ + connection_index = expired_timers[i] & 0x0FFFFFFF; + timer_id = expired_timers[i] >> 28; + + /* Handle expiration */ + (*timer_expiration_handlers[timer_id]) (connection_index); + } +} + +void +tcp_initialize_timer_wheels (tcp_main_t * tm) +{ + tw_timer_wheel_16t_2w_512sl_t *tw; + vec_foreach (tw, tm->timer_wheels) + { + tw_timer_wheel_init_16t_2w_512sl (tw, tcp_expired_timers_dispatch, + 100e-3 /* timer period 100ms */ , ~0); + tw->last_run_time = vlib_time_now (tm->vlib_main); + } +} + +clib_error_t * +tcp_init (vlib_main_t * vm) +{ + ip_main_t *im = &ip_main; + ip_protocol_info_t *pi; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_thread_main_t *vtm = vlib_get_thread_main (); + clib_error_t *error = 0; + u32 num_threads; + + tm->vlib_main = vm; + tm->vnet_main = vnet_get_main (); + + if ((error = vlib_call_init_function (vm, ip_main_init))) + return error; + if ((error = vlib_call_init_function (vm, ip4_lookup_init))) + return error; + if ((error = vlib_call_init_function (vm, ip6_lookup_init))) + return error; + + /* + * Registrations + */ + + /* Register with IP */ + pi = ip_get_protocol_info (im, IP_PROTOCOL_TCP); + if (pi == 0) + return clib_error_return (0, "TCP protocol info AWOL"); + pi->format_header = format_tcp_header; + pi->unformat_pg_edit = unformat_pg_tcp_header; + + ip4_register_protocol (IP_PROTOCOL_TCP, tcp4_input_node.index); + + /* Register as transport with URI */ + session_register_transport (SESSION_TYPE_IP4_TCP, &tcp4_proto); + session_register_transport (SESSION_TYPE_IP6_TCP, &tcp6_proto); + + /* + * Initialize data structures + */ + + num_threads = 1 /* main thread */ + vtm->n_threads; + vec_validate (tm->connections, num_threads - 1); + + /* Initialize per worker thread tx buffers (used for control messages) */ + vec_validate (tm->tx_buffers, num_threads - 1); + + /* Initialize timer wheels */ + vec_validate (tm->timer_wheels, num_threads - 1); + tcp_initialize_timer_wheels (tm); + + vec_validate (tm->delack_connections, num_threads - 1); + + /* Initialize clocks per tick for TCP timestamp. Used to compute + * monotonically increasing timestamps. */ + tm->tstamp_ticks_per_clock = vm->clib_time.seconds_per_clock + / TCP_TSTAMP_RESOLUTION; + + clib_bihash_init_24_8 (&tm->local_endpoints_table, "local endpoint table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); + + return error; +} + +VLIB_INIT_FUNCTION (tcp_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h new file mode 100644 index 00000000..22f00a63 --- /dev/null +++ b/src/vnet/tcp/tcp.h @@ -0,0 +1,624 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _vnet_tcp_h_ +#define _vnet_tcp_h_ + +#include +#include +#include +#include +#include +#include + +#define TCP_TICK 10e-3 /**< TCP tick period (s) */ +#define THZ 1/TCP_TICK /**< TCP tick frequency */ +#define TCP_TSTAMP_RESOLUTION TCP_TICK /**< Time stamp resolution */ +#define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days */ +#define TCP_MAX_OPTION_SPACE 40 + +#define TCP_DUPACK_THRESHOLD 3 +#define TCP_DEFAULT_RX_FIFO_SIZE 64 << 10 + +/** TCP FSM state definitions as per RFC793. */ +#define foreach_tcp_fsm_state \ + _(CLOSED, "CLOSED") \ + _(LISTEN, "LISTEN") \ + _(SYN_SENT, "SYN_SENT") \ + _(SYN_RCVD, "SYN_RCVD") \ + _(ESTABLISHED, "ESTABLISHED") \ + _(CLOSE_WAIT, "CLOSE_WAIT") \ + _(FIN_WAIT_1, "FIN_WAIT_1") \ + _(LAST_ACK, "LAST_ACK") \ + _(CLOSING, "CLOSING") \ + _(FIN_WAIT_2, "FIN_WAIT_2") \ + _(TIME_WAIT, "TIME_WAIT") + +typedef enum _tcp_state +{ +#define _(sym, str) TCP_STATE_##sym, + foreach_tcp_fsm_state +#undef _ + TCP_N_STATES +} tcp_state_t; + +format_function_t format_tcp_state; + +/** TCP timers */ +#define foreach_tcp_timer \ + _(RETRANSMIT, "RETRANSMIT") \ + _(DELACK, "DELAYED ACK") \ + _(PERSIST, "PERSIST") \ + _(KEEP, "KEEP") \ + _(2MSL, "2MSL") \ + _(RETRANSMIT_SYN, "RETRANSMIT_SYN") \ + _(ESTABLISH, "ESTABLISH") + +typedef enum _tcp_timers +{ +#define _(sym, str) TCP_TIMER_##sym, + foreach_tcp_timer +#undef _ + TCP_N_TIMERS +} tcp_timers_e; + +typedef void (timer_expiration_handler) (u32 index); + +extern timer_expiration_handler tcp_timer_delack_handler; +extern timer_expiration_handler tcp_timer_retransmit_handler; +extern timer_expiration_handler tcp_timer_retransmit_syn_handler; + +#define TCP_TIMER_HANDLE_INVALID ((u32) ~0) + +/* Timer delays as multiples of 100ms */ +#define TCP_TO_TIMER_TICK TCP_TICK*10 /* Period for converting from TCP + * ticks to timer units */ +#define TCP_DELACK_TIME 1 /* 0.1s */ +#define TCP_ESTABLISH_TIME 750 /* 75s */ +#define TCP_2MSL_TIME 300 /* 30s */ + +#define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */ +#define TCP_RTT_MAX 30 * THZ /* 30s (probably too much) */ +#define TCP_RTO_SYN_RETRIES 3 /* SYN retries without doubling RTO */ +#define TCP_RTO_INIT 1 * THZ /* Initial retransmit timer */ + +void tcp_update_time (f64 now, u32 thread_index); + +/** TCP connection flags */ +#define foreach_tcp_connection_flag \ + _(DELACK, "Delay ACK") \ + _(SNDACK, "Send ACK") \ + _(BURSTACK, "Burst ACK set") \ + _(SENT_RCV_WND0, "Sent 0 receive window") \ + _(RECOVERY, "Recovery on") \ + _(FAST_RECOVERY, "Fast Recovery on") + +typedef enum _tcp_connection_flag_bits +{ +#define _(sym, str) TCP_CONN_##sym##_BIT, + foreach_tcp_connection_flag +#undef _ + TCP_CONN_N_FLAG_BITS +} tcp_connection_flag_bits_e; + +typedef enum _tcp_connection_flag +{ +#define _(sym, str) TCP_CONN_##sym = 1 << TCP_CONN_##sym##_BIT, + foreach_tcp_connection_flag +#undef _ + TCP_CONN_N_FLAGS +} tcp_connection_flags_e; + +/** TCP buffer flags */ +#define foreach_tcp_buf_flag \ + _ (ACK) /**< Sending ACK. */ \ + _ (DUPACK) /**< Sending DUPACK. */ \ + +enum +{ +#define _(f) TCP_BUF_BIT_##f, + foreach_tcp_buf_flag +#undef _ + TCP_N_BUF_BITS, +}; + +enum +{ +#define _(f) TCP_BUF_FLAG_##f = 1 << TCP_BUF_BIT_##f, + foreach_tcp_buf_flag +#undef _ +}; + +#define TCP_MAX_SACK_BLOCKS 5 /**< Max number of SACK blocks stored */ +#define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0) + +typedef struct _sack_scoreboard_hole +{ + u32 next; /**< Index for next entry in linked list */ + u32 prev; /**< Index for previous entry in linked list */ + u32 start; /**< Start sequence number */ + u32 end; /**< End sequence number */ +} sack_scoreboard_hole_t; + +typedef struct _sack_scoreboard +{ + sack_scoreboard_hole_t *holes; /**< Pool of holes */ + u32 head; /**< Index to first entry */ + u32 sacked_bytes; /**< Number of bytes sacked in sb */ +} sack_scoreboard_t; + +typedef enum _tcp_cc_algorithm_type +{ + TCP_CC_NEWRENO, +} tcp_cc_algorithm_type_e; + +typedef struct _tcp_cc_algorithm tcp_cc_algorithm_t; + +typedef enum _tcp_cc_ack_t +{ + TCP_CC_ACK, + TCP_CC_DUPACK, + TCP_CC_PARTIALACK +} tcp_cc_ack_t; + +typedef struct _tcp_connection +{ + transport_connection_t connection; /**< Common transport data. First! */ + + u8 state; /**< TCP state as per tcp_state_t */ + u16 flags; /**< Connection flags (see tcp_conn_flags_e) */ + u32 timers[TCP_N_TIMERS]; /**< Timer handles into timer wheel */ + + /* TODO RFC4898 */ + + /** Send sequence variables RFC793 */ + u32 snd_una; /**< oldest unacknowledged sequence number */ + u32 snd_una_max; /**< newest unacknowledged sequence number + 1*/ + u32 snd_wnd; /**< send window */ + u32 snd_wl1; /**< seq number used for last snd.wnd update */ + u32 snd_wl2; /**< ack number used for last snd.wnd update */ + u32 snd_nxt; /**< next seq number to be sent */ + + /** Receive sequence variables RFC793 */ + u32 rcv_nxt; /**< next sequence number expected */ + u32 rcv_wnd; /**< receive window we expect */ + + u32 rcv_las; /**< rcv_nxt at last ack sent/rcv_wnd update */ + u32 iss; /**< initial sent sequence */ + u32 irs; /**< initial remote sequence */ + + /* Options */ + tcp_options_t opt; /**< TCP connection options parsed */ + u8 rcv_wscale; /**< Window scale to advertise to peer */ + u8 snd_wscale; /**< Window scale to use when sending */ + u32 tsval_recent; /**< Last timestamp received */ + u32 tsval_recent_age; /**< When last updated tstamp_recent*/ + + sack_block_t *snd_sacks; /**< Vector of SACKs to send. XXX Fixed size? */ + sack_scoreboard_t sack_sb; /**< SACK "scoreboard" that tracks holes */ + + u8 rcv_dupacks; /**< Number of DUPACKs received */ + u8 snt_dupacks; /**< Number of DUPACKs sent in a burst */ + + /* Congestion control */ + u32 cwnd; /**< Congestion window */ + u32 ssthresh; /**< Slow-start threshold */ + u32 prev_ssthresh; /**< ssthresh before congestion */ + u32 bytes_acked; /**< Bytes acknowledged by current segment */ + u32 rtx_bytes; /**< Retransmitted bytes */ + u32 tsecr_last_ack; /**< Timestamp echoed to us in last health ACK */ + tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */ + + /* RTT and RTO */ + u32 rto; /**< Retransmission timeout */ + u32 rto_boff; /**< Index for RTO backoff */ + u32 srtt; /**< Smoothed RTT */ + u32 rttvar; /**< Smoothed mean RTT difference. Approximates variance */ + u32 rtt_ts; /**< Timestamp for tracked ACK */ + u32 rtt_seq; /**< Sequence number for tracked ACK */ + + u16 snd_mss; /**< Send MSS */ +} tcp_connection_t; + +struct _tcp_cc_algorithm +{ + void (*rcv_ack) (tcp_connection_t * tc); + void (*rcv_cong_ack) (tcp_connection_t * tc, tcp_cc_ack_t ack); + void (*congestion) (tcp_connection_t * tc); + void (*recovered) (tcp_connection_t * tc); + void (*init) (tcp_connection_t * tc); +}; + +#define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY +#define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY +#define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY) +#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) +#define tcp_recovery_off(tc) ((tc)->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) +#define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh) + +typedef enum +{ + TCP_IP4, + TCP_IP6, + TCP_N_AF, +} tcp_af_t; + +typedef enum _tcp_error +{ +#define tcp_error(n,s) TCP_ERROR_##n, +#include +#undef tcp_error + TCP_N_ERROR, +} tcp_error_t; + +typedef struct _tcp_lookup_dispatch +{ + u8 next, error; +} tcp_lookup_dispatch_t; + +typedef struct _tcp_main +{ + /* Per-worker thread tcp connection pools */ + tcp_connection_t **connections; + + /* Pool of listeners. */ + tcp_connection_t *listener_pool; + + /** Dispatch table by state and flags */ + tcp_lookup_dispatch_t dispatch_table[TCP_N_STATES][64]; + + u8 log2_tstamp_clocks_per_tick; + f64 tstamp_ticks_per_clock; + + /** per-worker tx buffer free lists */ + u32 **tx_buffers; + + /* Per worker-thread timer wheel for connections timers */ + tw_timer_wheel_16t_2w_512sl_t *timer_wheels; + + /* Convenience per worker-thread vector of connections to DELACK */ + u32 **delack_connections; + + /* Pool of half-open connections on which we've sent a SYN */ + tcp_connection_t *half_open_connections; + + /* Pool of local TCP endpoints */ + transport_endpoint_t *local_endpoints; + + /* Local endpoints lookup table */ + transport_endpoint_table_t local_endpoints_table; + + /* Congestion control algorithms registered */ + tcp_cc_algorithm_t *cc_algos; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + ip4_main_t *ip4_main; + ip6_main_t *ip6_main; +} tcp_main_t; + +extern tcp_main_t tcp_main; +extern vlib_node_registration_t tcp4_input_node; +extern vlib_node_registration_t tcp6_input_node; +extern vlib_node_registration_t tcp4_output_node; +extern vlib_node_registration_t tcp6_output_node; + +always_inline tcp_main_t * +vnet_get_tcp_main () +{ + return &tcp_main; +} + +always_inline tcp_connection_t * +tcp_connection_get (u32 conn_index, u32 thread_index) +{ + return pool_elt_at_index (tcp_main.connections[thread_index], conn_index); +} + +always_inline tcp_connection_t * +tcp_connection_get_if_valid (u32 conn_index, u32 thread_index) +{ + if (tcp_main.connections[thread_index] == 0) + return 0; + if (pool_is_free_index (tcp_main.connections[thread_index], conn_index)) + return 0; + return pool_elt_at_index (tcp_main.connections[thread_index], conn_index); +} + +void tcp_connection_close (tcp_connection_t * tc); +void tcp_connection_cleanup (tcp_connection_t * tc); +void tcp_connection_del (tcp_connection_t * tc); + +always_inline tcp_connection_t * +tcp_listener_get (u32 tli) +{ + return pool_elt_at_index (tcp_main.listener_pool, tli); +} + +always_inline tcp_connection_t * +tcp_half_open_connection_get (u32 conn_index) +{ + return pool_elt_at_index (tcp_main.half_open_connections, conn_index); +} + +void tcp_make_ack (tcp_connection_t * ts, vlib_buffer_t * b); +void tcp_make_finack (tcp_connection_t * tc, vlib_buffer_t * b); +void tcp_make_synack (tcp_connection_t * ts, vlib_buffer_t * b); +void tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4); +void tcp_send_syn (tcp_connection_t * tc); +void tcp_send_fin (tcp_connection_t * tc); +void tcp_set_snd_mss (tcp_connection_t * tc); + +always_inline u32 +tcp_end_seq (tcp_header_t * th, u32 len) +{ + return th->seq_number + tcp_is_syn (th) + tcp_is_fin (th) + len; +} + +/* Modulo arithmetic for TCP sequence numbers */ +#define seq_lt(_s1, _s2) ((i32)((_s1)-(_s2)) < 0) +#define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0) +#define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0) +#define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0) + +/* Modulo arithmetic for timestamps */ +#define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0) +#define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0) + +always_inline u32 +tcp_flight_size (const tcp_connection_t * tc) +{ + return tc->snd_una_max - tc->snd_una - tc->sack_sb.sacked_bytes + + tc->rtx_bytes; +} + +/** + * Initial cwnd as per RFC5681 + */ +always_inline u32 +tcp_initial_cwnd (const tcp_connection_t * tc) +{ + if (tc->snd_mss > 2190) + return 2 * tc->snd_mss; + else if (tc->snd_mss > 1095) + return 3 * tc->snd_mss; + else + return 4 * tc->snd_mss; +} + +always_inline u32 +tcp_loss_wnd (const tcp_connection_t * tc) +{ + return tc->snd_mss; +} + +always_inline u32 +tcp_available_wnd (const tcp_connection_t * tc) +{ + return clib_min (tc->cwnd, tc->snd_wnd); +} + +always_inline u32 +tcp_available_snd_space (const tcp_connection_t * tc) +{ + u32 available_wnd = tcp_available_wnd (tc); + u32 flight_size = tcp_flight_size (tc); + + if (available_wnd <= flight_size) + return 0; + + return available_wnd - flight_size; +} + +void tcp_retransmit_first_unacked (tcp_connection_t * tc); + +void tcp_fast_retransmit (tcp_connection_t * tc); + +always_inline u32 +tcp_time_now (void) +{ + return clib_cpu_time_now () * tcp_main.tstamp_ticks_per_clock; +} + +u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b); + +u32 +tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, + u32 max_bytes); + +void tcp_connection_timers_init (tcp_connection_t * tc); +void tcp_connection_timers_reset (tcp_connection_t * tc); + +void tcp_connection_init_vars (tcp_connection_t * tc); + +always_inline void +tcp_connection_force_ack (tcp_connection_t * tc, vlib_buffer_t * b) +{ + /* Reset flags, make sure ack is sent */ + tc->flags = TCP_CONN_SNDACK; + vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK; +} + +always_inline void +tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) +{ + tc->timers[timer_id] + = tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], + tc->c_c_index, timer_id, interval); +} + +always_inline void +tcp_retransmit_timer_set (tcp_main_t * tm, tcp_connection_t * tc) +{ + /* XXX Switch to faster TW */ + tcp_timer_set (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + +always_inline void +tcp_timer_reset (tcp_connection_t * tc, u8 timer_id) +{ + if (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID) + return; + + tw_timer_stop_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], + tc->timers[timer_id]); + tc->timers[timer_id] = TCP_TIMER_HANDLE_INVALID; +} + +always_inline void +tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval) +{ + if (tc->timers[timer_id] != TCP_TIMER_HANDLE_INVALID) + tw_timer_stop_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], + tc->timers[timer_id]); + tc->timers[timer_id] = + tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], + tc->c_c_index, timer_id, interval); +} + +always_inline u8 +tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer) +{ + return tc->timers[timer] != TCP_TIMER_HANDLE_INVALID; +} + +void +scoreboard_remove_hole (sack_scoreboard_t * sb, + sack_scoreboard_hole_t * hole); + +always_inline sack_scoreboard_hole_t * +scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + if (hole->next != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, hole->next); + return 0; +} + +always_inline sack_scoreboard_hole_t * +scoreboard_first_hole (sack_scoreboard_t * sb) +{ + if (sb->head != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, sb->head); + return 0; +} + +always_inline void +scoreboard_clear (sack_scoreboard_t * sb) +{ + sack_scoreboard_hole_t *hole = scoreboard_first_hole (sb); + while ((hole = scoreboard_first_hole (sb))) + { + scoreboard_remove_hole (sb, hole); + } +} + +always_inline u32 +scoreboard_hole_bytes (sack_scoreboard_hole_t * hole) +{ + return hole->end - hole->start; +} + +always_inline void +tcp_cc_algo_register (tcp_cc_algorithm_type_e type, + const tcp_cc_algorithm_t * vft) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vec_validate (tm->cc_algos, type); + + tm->cc_algos[type] = *vft; +} + +always_inline tcp_cc_algorithm_t * +tcp_cc_algo_get (tcp_cc_algorithm_type_e type) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + return &tm->cc_algos[type]; +} + +void tcp_cc_init (tcp_connection_t * tc); + +/** + * Push TCP header to buffer + * + * @param vm - vlib_main + * @param b - buffer to write the header to + * @param sp_net - source port net order + * @param dp_net - destination port net order + * @param seq - sequence number net order + * @param ack - ack number net order + * @param tcp_hdr_opts_len - header and options length in bytes + * @param flags - header flags + * @param wnd - window size + * + * @return - pointer to start of TCP header + */ +always_inline void * +vlib_buffer_push_tcp_net_order (vlib_buffer_t * b, u16 sp, u16 dp, u32 seq, + u32 ack, u8 tcp_hdr_opts_len, u8 flags, + u16 wnd) +{ + tcp_header_t *th; + + th = vlib_buffer_push_uninit (b, tcp_hdr_opts_len); + + th->src_port = sp; + th->dst_port = dp; + th->seq_number = seq; + th->ack_number = ack; + th->data_offset_and_reserved = (tcp_hdr_opts_len >> 2) << 4; + th->flags = flags; + th->window = wnd; + th->checksum = 0; + th->urgent_pointer = 0; + return th; +} + +/** + * Push TCP header to buffer + * + * @param vm - vlib_main + * @param b - buffer to write the header to + * @param sp_net - source port net order + * @param dp_net - destination port net order + * @param seq - sequence number host order + * @param ack - ack number host order + * @param tcp_hdr_opts_len - header and options length in bytes + * @param flags - header flags + * @param wnd - window size + * + * @return - pointer to start of TCP header + */ +always_inline void * +vlib_buffer_push_tcp (vlib_buffer_t * b, u16 sp_net, u16 dp_net, u32 seq, + u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd) +{ + return vlib_buffer_push_tcp_net_order (b, sp_net, dp_net, + clib_host_to_net_u32 (seq), + clib_host_to_net_u32 (ack), + tcp_hdr_opts_len, flags, + clib_host_to_net_u16 (wnd)); +} + +#endif /* _vnet_tcp_h_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def new file mode 100644 index 00000000..cff5ec13 --- /dev/null +++ b/src/vnet/tcp/tcp_error.def @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +tcp_error (NONE, "no error") +tcp_error (NO_LISTENER, "no listener for dst port") +tcp_error (LOOKUP_DROPS, "lookup drops") +tcp_error (DISPATCH, "Dispatch error") +tcp_error (ENQUEUED, "Packets pushed into rx fifo") +tcp_error (PURE_ACK, "Pure acks") +tcp_error (SYNS_RCVD, "SYNs received") +tcp_error (SYN_ACKS_RCVD, "SYN-ACKs received") +tcp_error (NOT_READY, "Session not ready for packets") +tcp_error (FIFO_FULL, "Packets dropped for lack of rx fifo space") +tcp_error (EVENT_FIFO_FULL, "Events not sent for lack of event fifo space") +tcp_error (API_QUEUE_FULL, "Sessions not created for lack of API queue space") +tcp_error (CREATE_SESSION_FAIL, "Sessions couldn't be allocated") +tcp_error (SEGMENT_INVALID, "Invalid segment") +tcp_error (ACK_INVALID, "Invalid ACK") +tcp_error (ACK_DUP, "Duplicate ACK") +tcp_error (ACK_OLD, "Old ACK") +tcp_error (PKTS_SENT, "Packets sent") +tcp_error (FILTERED_DUPACKS, "Filtered duplicate ACKs") +tcp_error (RST_SENT, "Resets sent") \ No newline at end of file diff --git a/src/vnet/tcp/tcp_format.c b/src/vnet/tcp/tcp_format.c new file mode 100644 index 00000000..7136741d --- /dev/null +++ b/src/vnet/tcp/tcp_format.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * tcp/tcp_format.c: tcp formatting + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +static u8 * +format_tcp_flags (u8 * s, va_list * args) +{ + int flags = va_arg (*args, int); + +#define _(f) if (flags & TCP_FLAG_##f) s = format (s, "%s, ", #f); + foreach_tcp_flag +#undef _ + return s; +} + +/* Format TCP header. */ +u8 * +format_tcp_header (u8 * s, va_list * args) +{ + tcp_header_t *tcp = va_arg (*args, tcp_header_t *); + u32 max_header_bytes = va_arg (*args, u32); + u32 header_bytes; + uword indent; + + /* Nothing to do. */ + if (max_header_bytes < sizeof (tcp[0])) + return format (s, "TCP header truncated"); + + indent = format_get_indent (s); + indent += 2; + header_bytes = tcp_header_bytes (tcp); + + s = format (s, "TCP: %d -> %d", clib_net_to_host_u16 (tcp->src), + clib_net_to_host_u16 (tcp->dst)); + + s = format (s, "\n%Useq. 0x%08x ack 0x%08x", format_white_space, indent, + clib_net_to_host_u32 (tcp->seq_number), + clib_net_to_host_u32 (tcp->ack_number)); + + s = format (s, "\n%Uflags %U, tcp header: %d bytes", format_white_space, + indent, format_tcp_flags, tcp->flags, header_bytes); + + s = format (s, "\n%Uwindow %d, checksum 0x%04x", format_white_space, indent, + clib_net_to_host_u16 (tcp->window), + clib_net_to_host_u16 (tcp->checksum)); + + +#if 0 + /* Format TCP options. */ + { + u8 *o; + u8 *option_start = (void *) (tcp + 1); + u8 *option_end = (void *) tcp + header_bytes; + + for (o = option_start; o < option_end;) + { + u32 length = o[1]; + switch (o[0]) + { + case TCP_OPTION_END: + length = 1; + o = option_end; + break; + + case TCP_OPTION_NOOP: + length = 1; + break; + + } + } + } +#endif + + /* Recurse into next protocol layer. */ + if (max_header_bytes != 0 && header_bytes < max_header_bytes) + { + ip_main_t *im = &ip_main; + tcp_udp_port_info_t *pi; + + pi = ip_get_tcp_udp_port_info (im, tcp->dst); + + if (pi && pi->format_header) + s = format (s, "\n%U%U", format_white_space, indent - 2, + pi->format_header, + /* next protocol header */ (void *) tcp + header_bytes, + max_header_bytes - header_bytes); + } + + return s; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c new file mode 100644 index 00000000..daa0683b --- /dev/null +++ b/src/vnet/tcp/tcp_input.c @@ -0,0 +1,2316 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +static char *tcp_error_strings[] = { +#define tcp_error(n,s) s, +#include +#undef tcp_error +}; + +/* All TCP nodes have the same outgoing arcs */ +#define foreach_tcp_state_next \ + _ (DROP, "error-drop") \ + _ (TCP4_OUTPUT, "tcp4-output") \ + _ (TCP6_OUTPUT, "tcp6-output") + +typedef enum _tcp_established_next +{ +#define _(s,n) TCP_ESTABLISHED_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_ESTABLISHED_N_NEXT, +} tcp_established_next_t; + +typedef enum _tcp_rcv_process_next +{ +#define _(s,n) TCP_RCV_PROCESS_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_RCV_PROCESS_N_NEXT, +} tcp_rcv_process_next_t; + +typedef enum _tcp_syn_sent_next +{ +#define _(s,n) TCP_SYN_SENT_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_SYN_SENT_N_NEXT, +} tcp_syn_sent_next_t; + +typedef enum _tcp_listen_next +{ +#define _(s,n) TCP_LISTEN_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_LISTEN_N_NEXT, +} tcp_listen_next_t; + +/* Generic, state independent indices */ +typedef enum _tcp_state_next +{ +#define _(s,n) TCP_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_STATE_N_NEXT, +} tcp_state_next_t; + +#define tcp_next_output(is_ip4) (is_ip4 ? TCP_NEXT_TCP4_OUTPUT \ + : TCP_NEXT_TCP6_OUTPUT) + +vlib_node_registration_t tcp4_established_node; +vlib_node_registration_t tcp6_established_node; + +/** + * Validate segment sequence number. As per RFC793: + * + * Segment Receive Test + * Length Window + * ------- ------- ------------------------------------------- + * 0 0 SEG.SEQ = RCV.NXT + * 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + * >0 0 not acceptable + * >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + * or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND + * + * This ultimately consists in checking if segment falls within the window. + * The one important difference compared to RFC793 is that we use rcv_las, + * or the rcv_nxt at last ack sent instead of rcv_nxt since that's the + * peer's reference when computing our receive window. + * + * This accepts only segments within the window. + */ +always_inline u8 +tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq) +{ + return seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) + && seq_geq (seq, tc->rcv_nxt); +} + +void +tcp_options_parse (tcp_header_t * th, tcp_options_t * to) +{ + const u8 *data; + u8 opt_len, opts_len, kind; + int j; + sack_block_t b; + + opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t); + data = (const u8 *) (th + 1); + + /* Zero out all flags but those set in SYN */ + to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE); + + for (; opts_len > 0; opts_len -= opt_len, data += opt_len) + { + kind = data[0]; + + /* Get options length */ + if (kind == TCP_OPTION_EOL) + break; + else if (kind == TCP_OPTION_NOOP) + opt_len = 1; + else + { + /* broken options */ + if (opts_len < 2) + break; + opt_len = data[1]; + + /* weird option length */ + if (opt_len < 2 || opt_len > opts_len) + break; + } + + /* Parse options */ + switch (kind) + { + case TCP_OPTION_MSS: + if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th)) + { + to->flags |= TCP_OPTS_FLAG_MSS; + to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2)); + } + break; + case TCP_OPTION_WINDOW_SCALE: + if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th)) + { + to->flags |= TCP_OPTS_FLAG_WSCALE; + to->wscale = data[2]; + if (to->wscale > TCP_MAX_WND_SCALE) + { + clib_warning ("Illegal window scaling value: %d", + to->wscale); + to->wscale = TCP_MAX_WND_SCALE; + } + } + break; + case TCP_OPTION_TIMESTAMP: + if (opt_len == TCP_OPTION_LEN_TIMESTAMP) + { + to->flags |= TCP_OPTS_FLAG_TSTAMP; + to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2)); + to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6)); + } + break; + case TCP_OPTION_SACK_PERMITTED: + if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th)) + to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + break; + case TCP_OPTION_SACK_BLOCK: + /* If SACK permitted was not advertised or a SYN, break */ + if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th)) + break; + + /* If too short or not correctly formatted, break */ + if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK)) + break; + + to->flags |= TCP_OPTS_FLAG_SACK; + to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK; + vec_reset_length (to->sacks); + for (j = 0; j < to->n_sack_blocks; j++) + { + b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 4 * j)); + b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 4 * j)); + vec_add1 (to->sacks, b); + } + break; + default: + /* Nothing to see here */ + continue; + } + } +} + +always_inline int +tcp_segment_check_paws (tcp_connection_t * tc) +{ + /* XXX normally test for timestamp should be lt instead of leq, but for + * local testing this is not enough */ + return tcp_opts_tstamp (&tc->opt) && tc->tsval_recent + && timestamp_lt (tc->opt.tsval, tc->tsval_recent); +} + +/** + * Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19 + * + * It first verifies if segment has a wrapped sequence number (PAWS) and then + * does the processing associated to the first four steps (ignoring security + * and precedence): sequence number, rst bit and syn bit checks. + * + * @return 0 if segments passes validation. + */ +static int +tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, + vlib_buffer_t * b0, tcp_header_t * th0, u32 * next0) +{ + u8 paws_failed; + + if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0))) + return -1; + + tcp_options_parse (th0, &tc0->opt); + + /* RFC1323: Check against wrapped sequence numbers (PAWS). If we have + * timestamp to echo and it's less than tsval_recent, drop segment + * but still send an ACK in order to retain TCP's mechanism for detecting + * and recovering from half-open connections */ + paws_failed = tcp_segment_check_paws (tc0); + if (paws_failed) + { + clib_warning ("paws failed"); + + /* If it just so happens that a segment updates tsval_recent for a + * segment over 24 days old, invalidate tsval_recent. */ + if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE, + tcp_time_now ())) + { + /* Age isn't reset until we get a valid tsval (bsd inspired) */ + tc0->tsval_recent = 0; + } + else + { + /* Drop after ack if not rst */ + if (!tcp_rst (th0)) + { + tcp_make_ack (tc0, b0); + *next0 = tcp_next_output (tc0->c_is_ip4); + return -1; + } + } + } + + /* 1st: check sequence number */ + if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number, + vnet_buffer (b0)->tcp.seq_end)) + { + if (!tcp_rst (th0)) + { + /* Send dup ack */ + tcp_make_ack (tc0, b0); + *next0 = tcp_next_output (tc0->c_is_ip4); + } + return -1; + } + + /* 2nd: check the RST bit */ + if (tcp_rst (th0)) + { + /* Notify session that connection has been reset. Switch + * state to closed and await for session to do the cleanup. */ + stream_session_reset_notify (&tc0->connection); + tc0->state = TCP_STATE_CLOSED; + return -1; + } + + /* 3rd: check security and precedence (skip) */ + + /* 4th: check the SYN bit */ + if (tcp_syn (th0)) + { + tcp_send_reset (b0, tc0->c_is_ip4); + return -1; + } + + /* If PAWS passed and segment in window, save timestamp */ + if (!paws_failed) + { + tc0->tsval_recent = tc0->opt.tsval; + tc0->tsval_recent_age = tcp_time_now (); + } + + return 0; +} + +always_inline int +tcp_rcv_ack_is_acceptable (tcp_connection_t * tc0, vlib_buffer_t * tb0) +{ + /* SND.UNA =< SEG.ACK =< SND.NXT */ + return (seq_leq (tc0->snd_una, vnet_buffer (tb0)->tcp.ack_number) + && seq_leq (vnet_buffer (tb0)->tcp.ack_number, tc0->snd_nxt)); +} + +/** + * Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298 + * + * Note that although the original article, srtt and rttvar are scaled + * to minimize round-off errors, here we don't. Instead, we rely on + * better precision time measurements. + * + * TODO support us rtt resolution + */ +static void +tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) +{ + int err; + + if (tc->srtt != 0) + { + err = mrtt - tc->srtt; + tc->srtt += err >> 3; + + /* XXX Drop in RTT results in RTTVAR increase and bigger RTO. + * The increase should be bound */ + tc->rttvar += (clib_abs (err) - tc->rttvar) >> 2; + } + else + { + /* First measurement. */ + tc->srtt = mrtt; + tc->rttvar = mrtt << 1; + } +} + +/** Update RTT estimate and RTO timer + * + * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK + * timing. Middle boxes are known to fiddle with TCP options so we + * should give higher priority to ACK timing. + * + * return 1 if valid rtt 0 otherwise + */ +static int +tcp_update_rtt (tcp_connection_t * tc, u32 ack) +{ + u32 mrtt = 0; + + /* Karn's rule, part 1. Don't use retransmitted segments to estimate + * RTT because they're ambiguous. */ + if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff) + { + mrtt = tcp_time_now () - tc->rtt_ts; + tc->rtt_seq = 0; + } + + /* As per RFC7323 TSecr can be used for RTTM only if the segment advances + * snd_una, i.e., the left side of the send window: + * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't + * try to update rtt for dupacks */ + else if (tcp_opts_tstamp (&tc->opt) && tc->opt.tsecr && tc->bytes_acked) + { + mrtt = tcp_time_now () - tc->opt.tsecr; + } + + /* Ignore dubious measurements */ + if (mrtt == 0 || mrtt > TCP_RTT_MAX) + return 0; + + tcp_estimate_rtt (tc, mrtt); + + tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + + return 1; +} + +/** + * Dequeue bytes that have been acked and while at it update RTT estimates. + */ +static void +tcp_dequeue_acked (tcp_connection_t * tc, u32 ack) +{ + /* Dequeue the newly ACKed bytes */ + stream_session_dequeue_drop (&tc->connection, tc->bytes_acked); + + /* Update rtt and rto */ + if (tcp_update_rtt (tc, ack)) + { + /* Good ACK received and valid RTT, make sure retransmit backoff is 0 */ + tc->rto_boff = 0; + } +} + +/** Check if dupack as per RFC5681 Sec. 2 */ +always_inline u8 +tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 new_snd_wnd) +{ + return ((vnet_buffer (b)->tcp.ack_number == tc->snd_una) + && seq_gt (tc->snd_una_max, tc->snd_una) + && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number) + && (new_snd_wnd == tc->snd_wnd)); +} + +void +scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + sack_scoreboard_hole_t *next, *prev; + + if (hole->next != TCP_INVALID_SACK_HOLE_INDEX) + { + next = pool_elt_at_index (sb->holes, hole->next); + next->prev = hole->prev; + } + + if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) + { + prev = pool_elt_at_index (sb->holes, hole->prev); + prev->next = hole->next; + } + else + { + sb->head = hole->next; + } + + pool_put (sb->holes, hole); +} + +sack_scoreboard_hole_t * +scoreboard_insert_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * prev, + u32 start, u32 end) +{ + sack_scoreboard_hole_t *hole, *next; + u32 hole_index; + + pool_get (sb->holes, hole); + memset (hole, 0, sizeof (*hole)); + + hole->start = start; + hole->end = end; + hole_index = hole - sb->holes; + + if (prev) + { + hole->prev = prev - sb->holes; + hole->next = prev->next; + + if ((next = scoreboard_next_hole (sb, hole))) + next->prev = hole_index; + + prev->next = hole_index; + } + else + { + sb->head = hole_index; + hole->prev = TCP_INVALID_SACK_HOLE_INDEX; + hole->next = TCP_INVALID_SACK_HOLE_INDEX; + } + + return hole; +} + +static void +tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) +{ + sack_scoreboard_t *sb = &tc->sack_sb; + sack_block_t *blk, tmp; + sack_scoreboard_hole_t *hole, *next_hole; + u32 blk_index = 0; + int i, j; + + if (!tcp_opts_sack (tc) && sb->head == TCP_INVALID_SACK_HOLE_INDEX) + return; + + /* Remove invalid blocks */ + vec_foreach (blk, tc->opt.sacks) + { + if (seq_lt (blk->start, blk->end) + && seq_gt (blk->start, tc->snd_una) + && seq_gt (blk->start, ack) && seq_lt (blk->end, tc->snd_nxt)) + continue; + + vec_del1 (tc->opt.sacks, blk - tc->opt.sacks); + } + + /* Add block for cumulative ack */ + if (seq_gt (ack, tc->snd_una)) + { + tmp.start = tc->snd_una; + tmp.end = ack; + vec_add1 (tc->opt.sacks, tmp); + } + + if (vec_len (tc->opt.sacks) == 0) + return; + + /* Make sure blocks are ordered */ + for (i = 0; i < vec_len (tc->opt.sacks); i++) + for (j = i; j < vec_len (tc->opt.sacks); j++) + if (seq_lt (tc->opt.sacks[j].start, tc->opt.sacks[i].start)) + { + tmp = tc->opt.sacks[i]; + tc->opt.sacks[i] = tc->opt.sacks[j]; + tc->opt.sacks[j] = tmp; + } + + /* If no holes, insert the first that covers all outstanding bytes */ + if (sb->head == TCP_INVALID_SACK_HOLE_INDEX) + { + scoreboard_insert_hole (sb, 0, tc->snd_una, tc->snd_una_max); + } + + /* Walk the holes with the SACK blocks */ + hole = pool_elt_at_index (sb->holes, sb->head); + while (hole && blk_index < vec_len (tc->opt.sacks)) + { + blk = &tc->opt.sacks[blk_index]; + + if (seq_leq (blk->start, hole->start)) + { + /* Block covers hole. Remove hole */ + if (seq_geq (blk->end, hole->end)) + { + next_hole = scoreboard_next_hole (sb, hole); + + /* Byte accounting */ + if (seq_lt (hole->end, ack)) + { + /* Bytes lost because snd wnd left edge advances */ + if (seq_lt (next_hole->start, ack)) + sb->sacked_bytes -= next_hole->start - hole->end; + else + sb->sacked_bytes -= ack - hole->end; + } + else + { + sb->sacked_bytes += scoreboard_hole_bytes (hole); + } + + scoreboard_remove_hole (sb, hole); + hole = next_hole; + } + /* Partial overlap */ + else + { + sb->sacked_bytes += blk->end - hole->start; + hole->start = blk->end; + blk_index++; + } + } + else + { + /* Hole must be split */ + if (seq_leq (blk->end, hole->end)) + { + sb->sacked_bytes += blk->end - blk->start; + scoreboard_insert_hole (sb, hole, blk->end, hole->end); + hole->end = blk->start - 1; + blk_index++; + } + else + { + sb->sacked_bytes += hole->end - blk->start + 1; + hole->end = blk->start - 1; + hole = scoreboard_next_hole (sb, hole); + } + } + } +} + +/** Update snd_wnd + * + * If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set + * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */ +static void +tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) +{ + if (tc->snd_wl1 < seq || (tc->snd_wl1 == seq && tc->snd_wl2 <= ack)) + { + tc->snd_wnd = snd_wnd; + tc->snd_wl1 = seq; + tc->snd_wl2 = ack; + } +} + +static void +tcp_cc_congestion (tcp_connection_t * tc) +{ + tc->cc_algo->congestion (tc); +} + +static void +tcp_cc_recover (tcp_connection_t * tc) +{ + if (tcp_in_fastrecovery (tc)) + { + tc->cc_algo->recovered (tc); + tcp_recovery_off (tc); + } + else if (tcp_in_recovery (tc)) + { + tcp_recovery_off (tc); + tc->cwnd = tcp_loss_wnd (tc); + } +} + +static void +tcp_cc_rcv_ack (tcp_connection_t * tc) +{ + u8 partial_ack; + + if (tcp_in_recovery (tc)) + { + partial_ack = seq_lt (tc->snd_una, tc->snd_una_max); + if (!partial_ack) + { + /* Clear retransmitted bytes. */ + tc->rtx_bytes = 0; + tcp_cc_recover (tc); + } + else + { + /* Clear retransmitted bytes. XXX should we clear all? */ + tc->rtx_bytes = 0; + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); + + /* Retransmit first unacked segment */ + tcp_retransmit_first_unacked (tc); + } + } + else + { + tc->cc_algo->rcv_ack (tc); + } + + tc->rcv_dupacks = 0; + tc->tsecr_last_ack = tc->opt.tsecr; +} + +static void +tcp_cc_rcv_dupack (tcp_connection_t * tc, u32 ack) +{ + ASSERT (tc->snd_una == ack); + + tc->rcv_dupacks++; + if (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) + { + /* RFC6582 NewReno heuristic to avoid multiple fast retransmits */ + if (tc->opt.tsecr != tc->tsecr_last_ack) + { + tc->rcv_dupacks = 0; + return; + } + + tcp_fastrecovery_on (tc); + + /* Handle congestion and dupack */ + tcp_cc_congestion (tc); + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + + tcp_fast_retransmit (tc); + + /* Post retransmit update cwnd to ssthresh and account for the + * three segments that have left the network and should've been + * buffered at the receiver */ + tc->cwnd = tc->ssthresh + TCP_DUPACK_THRESHOLD * tc->snd_mss; + } + else if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD) + { + ASSERT (tcp_in_fastrecovery (tc)); + + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + } +} + +void +tcp_cc_init (tcp_connection_t * tc) +{ + tc->cc_algo = tcp_cc_algo_get (TCP_CC_NEWRENO); + tc->cc_algo->init (tc); +} + +static int +tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, + tcp_header_t * th, u32 * next, u32 * error) +{ + u32 new_snd_wnd; + + /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) then send an + * ACK, drop the segment, and return */ + if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)) + { + tcp_make_ack (tc, b); + *next = tcp_next_output (tc->c_is_ip4); + *error = TCP_ERROR_ACK_INVALID; + return -1; + } + + /* If old ACK, discard */ + if (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)) + { + *error = TCP_ERROR_ACK_OLD; + return -1; + } + + if (tcp_opts_sack_permitted (&tc->opt)) + tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); + + new_snd_wnd = clib_net_to_host_u32 (th->window) << tc->snd_wscale; + + if (tcp_ack_is_dupack (tc, b, new_snd_wnd)) + { + tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); + *error = TCP_ERROR_ACK_DUP; + return -1; + } + + /* Valid ACK */ + tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una; + tc->snd_una = vnet_buffer (b)->tcp.ack_number; + + /* Dequeue ACKed packet and update RTT */ + tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); + + tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number, + vnet_buffer (b)->tcp.ack_number, new_snd_wnd); + + /* Updates congestion control (slow start/congestion avoidance) */ + tcp_cc_rcv_ack (tc); + + /* If everything has been acked, stop retransmit timer + * otherwise update */ + if (tc->snd_una == tc->snd_una_max) + tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); + else + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, tc->rto); + + return 0; +} + +/** + * Build SACK list as per RFC2018. + * + * Makes sure the first block contains the segment that generated the current + * ACK and the following ones are the ones most recently reported in SACK + * blocks. + * + * @param tc TCP connection for which the SACK list is updated + * @param start Start sequence number of the newest SACK block + * @param end End sequence of the newest SACK block + */ +static void +tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) +{ + sack_block_t *new_list = 0, block; + u32 n_elts; + int i; + u8 new_head = 0; + + /* If the first segment is ooo add it to the list. Last write might've moved + * rcv_nxt over the first segment. */ + if (seq_lt (tc->rcv_nxt, start)) + { + block.start = start; + block.end = end; + vec_add1 (new_list, block); + new_head = 1; + } + + /* Find the blocks still worth keeping. */ + for (i = 0; i < vec_len (tc->snd_sacks); i++) + { + /* Discard if: + * 1) rcv_nxt advanced beyond current block OR + * 2) Segment overlapped by the first segment, i.e., it has been merged + * into it.*/ + if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt) + || seq_leq (tc->snd_sacks[i].start, end)) + continue; + + /* Save subsequent segments to new SACK list. */ + n_elts = clib_min (vec_len (tc->snd_sacks) - i, + TCP_MAX_SACK_BLOCKS - new_head); + vec_insert_elts (new_list, &tc->snd_sacks[i], n_elts, new_head); + break; + } + + /* Replace old vector with new one */ + vec_free (tc->snd_sacks); + tc->snd_sacks = new_list; +} + +/** Enqueue data for delivery to application */ +always_inline u32 +tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, + u16 data_len) +{ + int written; + + /* Pure ACK. Update rcv_nxt and be done. */ + if (PREDICT_FALSE (data_len == 0)) + { + tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end; + return TCP_ERROR_PURE_ACK; + } + + written = stream_session_enqueue_data (&tc->connection, + vlib_buffer_get_current (b), + data_len, 1 /* queue event */ ); + + /* Update rcv_nxt */ + if (PREDICT_TRUE (written == data_len)) + { + tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end; + } + /* If more data written than expected, account for out-of-order bytes. */ + else if (written > data_len) + { + tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end + written - data_len; + + /* Send ACK confirming the update */ + tc->flags |= TCP_CONN_SNDACK; + + /* Update SACK list if need be */ + if (tcp_opts_sack_permitted (&tc->opt)) + { + /* Remove SACK blocks that have been delivered */ + tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt); + } + } + else + { + ASSERT (0); + return TCP_ERROR_FIFO_FULL; + } + + return TCP_ERROR_ENQUEUED; +} + +/** Enqueue out-of-order data */ +always_inline u32 +tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, + u16 data_len) +{ + stream_session_t *s0; + u32 offset, seq; + + s0 = stream_session_get (tc->c_s_index, tc->c_thread_index); + seq = vnet_buffer (b)->tcp.seq_number; + offset = seq - tc->rcv_nxt; + + if (svm_fifo_enqueue_with_offset (s0->server_rx_fifo, s0->pid, offset, + data_len, vlib_buffer_get_current (b))) + return TCP_ERROR_FIFO_FULL; + + /* Update SACK list if in use */ + if (tcp_opts_sack_permitted (&tc->opt)) + { + ooo_segment_t *newest; + u32 start, end; + + /* Get the newest segment from the fifo */ + newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo); + start = tc->rcv_nxt + ooo_segment_offset (s0->server_rx_fifo, newest); + end = tc->rcv_nxt + ooo_segment_end_offset (s0->server_rx_fifo, newest); + + tcp_update_sack_list (tc, start, end); + } + + return TCP_ERROR_ENQUEUED; +} + +/** + * Check if ACK could be delayed. DELACK timer is set only after frame is + * processed so this can return true for a full bursts of packets. + */ +always_inline int +tcp_can_delack (tcp_connection_t * tc) +{ + /* If there's no DELACK timer set and the last window sent wasn't 0 we + * can safely delay. */ + if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK) + && (tc->flags & TCP_CONN_SENT_RCV_WND0) == 0 + && (tc->flags & TCP_CONN_SNDACK) == 0) + return 1; + + return 0; +} + +static int +tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, + u16 n_data_bytes, u32 * next0) +{ + u32 error = 0; + + /* Handle out-of-order data */ + if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt)) + { + error = tcp_session_enqueue_ooo (tc, b, n_data_bytes); + + /* Don't send more than 3 dupacks per burst + * XXX decide if this is good */ + if (tc->snt_dupacks < 3) + { + /* RFC2581: Send DUPACK for fast retransmit */ + tcp_make_ack (tc, b); + *next0 = tcp_next_output (tc->c_is_ip4); + + /* Mark as DUPACK. We may filter these in output if + * the burst fills the holes. */ + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK; + + tc->snt_dupacks++; + } + + goto done; + } + + /* In order data, enqueue. Fifo figures out by itself if any out-of-order + * segments can be enqueued after fifo tail offset changes. */ + error = tcp_session_enqueue_data (tc, b, n_data_bytes); + + /* Check if ACK can be delayed */ + if (tcp_can_delack (tc)) + { + /* Nothing to do for pure ACKs */ + if (n_data_bytes == 0) + goto done; + + /* If connection has not been previously marked for delay ack + * add it to the list and flag it */ + if (!tc->flags & TCP_CONN_DELACK) + { + vec_add1 (tm->delack_connections[tc->c_thread_index], + tc->c_c_index); + tc->flags |= TCP_CONN_DELACK; + } + } + else + { + /* Check if a packet has already been enqueued to output for burst. + * If yes, then drop this one, otherwise, let it pass through to + * output */ + if ((tc->flags & TCP_CONN_BURSTACK) == 0) + { + *next0 = tcp_next_output (tc->c_is_ip4); + tcp_make_ack (tc, b); + error = TCP_ERROR_ENQUEUED; + + /* TODO: maybe add counter to ensure N acks will be sent/burst */ + tc->flags |= TCP_CONN_BURSTACK; + } + } + +done: + return error; +} + +void +delack_timers_init (tcp_main_t * tm, u32 thread_index) +{ + tcp_connection_t *tc; + u32 i, *conns; + tw_timer_wheel_16t_2w_512sl_t *tw; + + tw = &tm->timer_wheels[thread_index]; + conns = tm->delack_connections[thread_index]; + for (i = 0; i < vec_len (conns); i++) + { + tc = pool_elt_at_index (tm->connections[thread_index], conns[i]); + ASSERT (0 != tc); + + tc->timers[TCP_TIMER_DELACK] + = tw_timer_start_16t_2w_512sl (tw, conns[i], + TCP_TIMER_DELACK, TCP_DELACK_TIME); + } + vec_reset_length (tm->delack_connections[thread_index]); +} + +always_inline uword +tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index, errors = 0; + tcp_main_t *tm = vnet_get_tcp_main (); + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *th0 = 0; + tcp_connection_t *tc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 n_advance_bytes0, n_data_bytes0; + u32 next0 = TCP_ESTABLISHED_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, + my_thread_index); + + /* Checksum computed by ipx_local no need to compute again */ + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + th0 = ip4_next_header (ip40); + n_advance_bytes0 = (ip4_header_bytes (ip40) + + tcp_header_bytes (th0)); + n_data_bytes0 = clib_net_to_host_u16 (ip40->length) + - n_advance_bytes0; + } + else + { + ip60 = vlib_buffer_get_current (b0); + th0 = ip6_next_header (ip60); + n_advance_bytes0 = tcp_header_bytes (th0); + n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) + - n_advance_bytes0; + n_advance_bytes0 += sizeof (ip60[0]); + } + + /* SYNs, FINs and data consume sequence numbers */ + vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number + + tcp_is_syn (th0) + tcp_is_fin (th0) + n_data_bytes0; + + /* TODO header prediction fast path */ + + /* 1-4: check SEQ, RST, SYN */ + if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, th0, &next0))) + { + error0 = TCP_ERROR_SEGMENT_INVALID; + goto drop; + } + + /* 5: check the ACK field */ + if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0)) + { + goto drop; + } + + /* 6: check the URG bit TODO */ + + /* 7: process the segment text */ + vlib_buffer_advance (b0, n_advance_bytes0); + error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0); + + /* 8: check the FIN bit */ + if (tcp_fin (th0)) + { + /* Send ACK and enter CLOSE-WAIT */ + tcp_make_ack (tc0, b0); + tcp_connection_force_ack (tc0, b0); + next0 = tcp_next_output (tc0->c_is_ip4); + tc0->state = TCP_STATE_CLOSE_WAIT; + stream_session_disconnect_notify (&tc0->connection); + } + + drop: + b0->error = node->errors[error0]; + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + errors = session_manager_flush_enqueue_events (my_thread_index); + if (errors) + { + if (is_ip4) + vlib_node_increment_counter (vm, tcp4_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + else + vlib_node_increment_counter (vm, tcp6_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + } + + delack_timers_init (tm, my_thread_index); + + return from_frame->n_vectors; +} + +static uword +tcp4_established (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_established_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_established (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_established_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_established_node) = +{ + .function = tcp4_established, + .name = "tcp4-established", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR,.error_strings = tcp_error_strings, + .n_next_nodes = TCP_ESTABLISHED_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_established_node, tcp4_established); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_established_node) = +{ + .function = tcp6_established, + .name = "tcp6-established", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_ESTABLISHED_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_established_node, tcp6_established); + +vlib_node_registration_t tcp4_syn_sent_node; +vlib_node_registration_t tcp6_syn_sent_node; + +always_inline uword +tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index, errors = 0; + u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, ack0, seq0; + vlib_buffer_t *b0; + tcp_header_t *tcp0 = 0; + tcp_connection_t *tc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 n_advance_bytes0, n_data_bytes0; + tcp_connection_t *new_tc0; + u32 next0 = TCP_SYN_SENT_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = + tcp_half_open_connection_get (vnet_buffer (b0)-> + tcp.connection_index); + + ack0 = vnet_buffer (b0)->tcp.ack_number; + seq0 = vnet_buffer (b0)->tcp.seq_number; + + /* Checksum computed by ipx_local no need to compute again */ + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + tcp0 = ip4_next_header (ip40); + n_advance_bytes0 = (ip4_header_bytes (ip40) + + tcp_header_bytes (tcp0)); + n_data_bytes0 = clib_net_to_host_u16 (ip40->length) + - n_advance_bytes0; + } + else + { + ip60 = vlib_buffer_get_current (b0); + tcp0 = ip6_next_header (ip60); + n_advance_bytes0 = tcp_header_bytes (tcp0); + n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) + - n_advance_bytes0; + n_advance_bytes0 += sizeof (ip60[0]); + } + + if (PREDICT_FALSE + (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0))) + goto drop; + + /* SYNs, FINs and data consume sequence numbers */ + vnet_buffer (b0)->tcp.seq_end = seq0 + tcp_is_syn (tcp0) + + tcp_is_fin (tcp0) + n_data_bytes0; + + /* + * 1. check the ACK bit + */ + + /* + * If the ACK bit is set + * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless + * the RST bit is set, if so drop the segment and return) + * + * and discard the segment. Return. + * If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable. + */ + if (tcp_ack (tcp0)) + { + if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt) + { + if (!tcp_rst (tcp0)) + tcp_send_reset (b0, is_ip4); + + goto drop; + } + + /* Make sure ACK is valid */ + if (tc0->snd_una > ack0) + goto drop; + } + + /* + * 2. check the RST bit + */ + + if (tcp_rst (tcp0)) + { + /* If ACK is acceptable, signal client that peer is not + * willing to accept connection and drop connection*/ + if (tcp_ack (tcp0)) + { + stream_session_connect_notify (&tc0->connection, sst, + 1 /* fail */ ); + tcp_connection_cleanup (tc0); + } + goto drop; + } + + /* + * 3. check the security and precedence (skipped) + */ + + /* + * 4. check the SYN bit + */ + + /* No SYN flag. Drop. */ + if (!tcp_syn (tcp0)) + goto drop; + + /* Stop connection establishment and retransmit timers */ + tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH); + tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN); + + /* Valid SYN or SYN-ACK. Move connection from half-open pool to + * current thread pool. */ + pool_get (tm->connections[my_thread_index], new_tc0); + clib_memcpy (new_tc0, tc0, sizeof (*new_tc0)); + + new_tc0->c_thread_index = my_thread_index; + + /* Cleanup half-open connection XXX lock */ + pool_put (tm->half_open_connections, tc0); + + new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end; + new_tc0->irs = seq0; + + /* Parse options */ + tcp_options_parse (tcp0, &new_tc0->opt); + tcp_connection_init_vars (new_tc0); + + if (tcp_opts_tstamp (&new_tc0->opt)) + { + new_tc0->tsval_recent = new_tc0->opt.tsval; + new_tc0->tsval_recent_age = tcp_time_now (); + } + + if (tcp_opts_wscale (&new_tc0->opt)) + new_tc0->snd_wscale = new_tc0->opt.wscale; + + new_tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window) + << new_tc0->snd_wscale; + new_tc0->snd_wl1 = seq0; + new_tc0->snd_wl2 = ack0; + + /* SYN-ACK: See if we can switch to ESTABLISHED state */ + if (tcp_ack (tcp0)) + { + /* Our SYN is ACKed: we have iss < ack = snd_una */ + + /* TODO Dequeue acknowledged segments if we support Fast Open */ + new_tc0->snd_una = ack0; + new_tc0->state = TCP_STATE_ESTABLISHED; + + /* Notify app that we have connection */ + stream_session_connect_notify (&new_tc0->connection, sst, 0); + + /* Make sure after data segment processing ACK is sent */ + new_tc0->flags |= TCP_CONN_SNDACK; + } + /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */ + else + { + new_tc0->state = TCP_STATE_SYN_RCVD; + + /* Notify app that we have connection XXX */ + stream_session_connect_notify (&new_tc0->connection, sst, 0); + + tcp_make_synack (new_tc0, b0); + next0 = tcp_next_output (is_ip4); + + goto drop; + } + + /* Read data, if any */ + if (n_data_bytes0) + { + error0 = + tcp_segment_rcv (tm, new_tc0, b0, n_data_bytes0, &next0); + if (error0 == TCP_ERROR_PURE_ACK) + error0 = TCP_ERROR_SYN_ACKS_RCVD; + } + else + { + tcp_make_ack (new_tc0, b0); + next0 = tcp_next_output (new_tc0->c_is_ip4); + } + + drop: + + b0->error = error0 ? node->errors[error0] : 0; + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + errors = session_manager_flush_enqueue_events (my_thread_index); + if (errors) + { + if (is_ip4) + vlib_node_increment_counter (vm, tcp4_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + else + vlib_node_increment_counter (vm, tcp6_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_syn_sent (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_syn_sent_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_syn_sent_rcv (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_syn_sent_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_syn_sent_node) = +{ + .function = tcp4_syn_sent, + .name = "tcp4-syn-sent", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_SYN_SENT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_syn_sent_node, tcp4_syn_sent); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_syn_sent_node) = +{ + .function = tcp6_syn_sent_rcv, + .name = "tcp6-syn-sent", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_SYN_SENT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + } +,}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv); +/** + * Handles reception for all states except LISTEN, SYN-SEND and ESTABLISHED + * as per RFC793 p. 64 + */ +always_inline uword +tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index, errors = 0; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *tcp0 = 0; + tcp_connection_t *tc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 n_advance_bytes0, n_data_bytes0; + u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, + my_thread_index); + + /* Checksum computed by ipx_local no need to compute again */ + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + tcp0 = ip4_next_header (ip40); + n_advance_bytes0 = (ip4_header_bytes (ip40) + + tcp_header_bytes (tcp0)); + n_data_bytes0 = clib_net_to_host_u16 (ip40->length) + - n_advance_bytes0; + } + else + { + ip60 = vlib_buffer_get_current (b0); + tcp0 = ip6_next_header (ip60); + n_advance_bytes0 = tcp_header_bytes (tcp0); + n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) + - n_advance_bytes0; + n_advance_bytes0 += sizeof (ip60[0]); + } + + /* SYNs, FINs and data consume sequence numbers */ + vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number + + tcp_is_syn (tcp0) + tcp_is_fin (tcp0) + n_data_bytes0; + + /* + * Special treatment for CLOSED + */ + switch (tc0->state) + { + case TCP_STATE_CLOSED: + goto drop; + break; + } + + /* + * For all other states (except LISTEN) + */ + + /* 1-4: check SEQ, RST, SYN */ + if (PREDICT_FALSE + (tcp_segment_validate (vm, tc0, b0, tcp0, &next0))) + { + error0 = TCP_ERROR_SEGMENT_INVALID; + goto drop; + } + + /* 5: check the ACK field */ + switch (tc0->state) + { + case TCP_STATE_SYN_RCVD: + /* + * If the segment acknowledgment is not acceptable, form a + * reset segment, + * + * and send it. + */ + if (!tcp_rcv_ack_is_acceptable (tc0, b0)) + { + tcp_send_reset (b0, is_ip4); + goto drop; + } + /* Switch state to ESTABLISHED */ + tc0->state = TCP_STATE_ESTABLISHED; + + /* Initialize session variables */ + tc0->snd_una = vnet_buffer (b0)->tcp.ack_number; + tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window) + << tc0->opt.wscale; + tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; + tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; + + /* Shoulder tap the server */ + stream_session_accept_notify (&tc0->connection); + + tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN); + break; + case TCP_STATE_ESTABLISHED: + /* We can get packets in established state here because they + * were enqueued before state change */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + + break; + case TCP_STATE_FIN_WAIT_1: + /* In addition to the processing for the ESTABLISHED state, if + * our FIN is now acknowledged then enter FIN-WAIT-2 and + * continue processing in that state. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + tc0->state = TCP_STATE_FIN_WAIT_2; + /* Stop all timers, 2MSL will be set lower */ + tcp_connection_timers_reset (tc0); + break; + case TCP_STATE_FIN_WAIT_2: + /* In addition to the processing for the ESTABLISHED state, if + * the retransmission queue is empty, the user's CLOSE can be + * acknowledged ("ok") but do not delete the TCB. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + /* check if rtx queue is empty and ack CLOSE TODO */ + break; + case TCP_STATE_CLOSE_WAIT: + /* Do the same processing as for the ESTABLISHED state. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + break; + case TCP_STATE_CLOSING: + /* In addition to the processing for the ESTABLISHED state, if + * the ACK acknowledges our FIN then enter the TIME-WAIT state, + * otherwise ignore the segment. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + + /* XXX test that send queue empty */ + tc0->state = TCP_STATE_TIME_WAIT; + goto drop; + + break; + case TCP_STATE_LAST_ACK: + /* The only thing that can arrive in this state is an + * acknowledgment of our FIN. If our FIN is now acknowledged, + * delete the TCB, enter the CLOSED state, and return. */ + + if (!tcp_rcv_ack_is_acceptable (tc0, b0)) + goto drop; + + tcp_connection_del (tc0); + goto drop; + + break; + case TCP_STATE_TIME_WAIT: + /* The only thing that can arrive in this state is a + * retransmission of the remote FIN. Acknowledge it, and restart + * the 2 MSL timeout. */ + + /* TODO */ + goto drop; + break; + default: + ASSERT (0); + } + + /* 6: check the URG bit TODO */ + + /* 7: process the segment text */ + switch (tc0->state) + { + case TCP_STATE_ESTABLISHED: + case TCP_STATE_FIN_WAIT_1: + case TCP_STATE_FIN_WAIT_2: + error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0); + break; + case TCP_STATE_CLOSE_WAIT: + case TCP_STATE_CLOSING: + case TCP_STATE_LAST_ACK: + case TCP_STATE_TIME_WAIT: + /* This should not occur, since a FIN has been received from the + * remote side. Ignore the segment text. */ + break; + } + + /* 8: check the FIN bit */ + if (!tcp_fin (tcp0)) + goto drop; + + switch (tc0->state) + { + case TCP_STATE_ESTABLISHED: + case TCP_STATE_SYN_RCVD: + /* Send FIN-ACK notify app and enter CLOSE-WAIT */ + tcp_connection_timers_reset (tc0); + tcp_make_finack (tc0, b0); + next0 = tcp_next_output (tc0->c_is_ip4); + stream_session_disconnect_notify (&tc0->connection); + tc0->state = TCP_STATE_CLOSE_WAIT; + break; + case TCP_STATE_CLOSE_WAIT: + case TCP_STATE_CLOSING: + case TCP_STATE_LAST_ACK: + /* move along .. */ + break; + case TCP_STATE_FIN_WAIT_1: + tc0->state = TCP_STATE_TIME_WAIT; + tcp_connection_timers_reset (tc0); + tcp_timer_set (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME); + break; + case TCP_STATE_FIN_WAIT_2: + /* Got FIN, send ACK! */ + tc0->state = TCP_STATE_TIME_WAIT; + tcp_timer_set (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME); + tcp_make_ack (tc0, b0); + next0 = tcp_next_output (is_ip4); + break; + case TCP_STATE_TIME_WAIT: + /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait + * timeout. + */ + tcp_timer_update (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME); + break; + } + + b0->error = error0 ? node->errors[error0] : 0; + + drop: + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + errors = session_manager_flush_enqueue_events (my_thread_index); + if (errors) + { + if (is_ip4) + vlib_node_increment_counter (vm, tcp4_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + else + vlib_node_increment_counter (vm, tcp6_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_rcv_process (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_rcv_process_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_rcv_process (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_rcv_process_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_rcv_process_node) = +{ + .function = tcp4_rcv_process, + .name = "tcp4-rcv-process", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RCV_PROCESS_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_rcv_process_node, tcp4_rcv_process); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_rcv_process_node) = +{ + .function = tcp6_rcv_process, + .name = "tcp6-rcv-process", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RCV_PROCESS_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_rcv_process_node, tcp6_rcv_process); + +vlib_node_registration_t tcp4_listen_node; +vlib_node_registration_t tcp6_listen_node; + +/** + * LISTEN state processing as per RFC 793 p. 65 + */ +always_inline uword +tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index; + tcp_main_t *tm = vnet_get_tcp_main (); + u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *th0 = 0; + tcp_connection_t *lc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + tcp_connection_t *child0; + u32 error0 = TCP_ERROR_SYNS_RCVD, next0 = TCP_LISTEN_NEXT_DROP; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + lc0 = tcp_listener_get (vnet_buffer (b0)->tcp.connection_index); + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + th0 = ip4_next_header (ip40); + } + else + { + ip60 = vlib_buffer_get_current (b0); + th0 = ip6_next_header (ip60); + } + + /* Create child session. For syn-flood protection use filter */ + + /* 1. first check for an RST */ + if (tcp_rst (th0)) + goto drop; + + /* 2. second check for an ACK */ + if (tcp_ack (th0)) + { + tcp_send_reset (b0, is_ip4); + goto drop; + } + + /* 3. check for a SYN (did that already) */ + + /* Create child session and send SYN-ACK */ + pool_get (tm->connections[my_thread_index], child0); + memset (child0, 0, sizeof (*child0)); + + child0->c_c_index = child0 - tm->connections[my_thread_index]; + child0->c_lcl_port = lc0->c_lcl_port; + child0->c_rmt_port = th0->src_port; + child0->c_is_ip4 = is_ip4; + child0->c_thread_index = my_thread_index; + + if (is_ip4) + { + child0->c_lcl_ip4.as_u32 = ip40->dst_address.as_u32; + child0->c_rmt_ip4.as_u32 = ip40->src_address.as_u32; + } + else + { + clib_memcpy (&child0->c_lcl_ip6, &ip60->dst_address, + sizeof (ip6_address_t)); + clib_memcpy (&child0->c_rmt_ip6, &ip60->src_address, + sizeof (ip6_address_t)); + } + + if (stream_session_accept (&child0->connection, lc0->c_s_index, sst, + 0 /* notify */ )) + { + error0 = TCP_ERROR_CREATE_SESSION_FAIL; + goto drop; + } + + tcp_options_parse (th0, &child0->opt); + tcp_connection_init_vars (child0); + + child0->irs = vnet_buffer (b0)->tcp.seq_number; + child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1; + child0->state = TCP_STATE_SYN_RCVD; + + /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK} + * segments are used to initialize PAWS. */ + if (tcp_opts_tstamp (&child0->opt)) + { + child0->tsval_recent = child0->opt.tsval; + child0->tsval_recent_age = tcp_time_now (); + } + + /* Reuse buffer to make syn-ack and send */ + tcp_make_synack (child0, b0); + next0 = tcp_next_output (is_ip4); + + drop: + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + b0->error = error0 ? node->errors[error0] : 0; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +static uword +tcp4_listen (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_listen_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_listen (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_listen_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_listen_node) = +{ + .function = tcp4_listen, + .name = "tcp4-listen", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_LISTEN_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_LISTEN_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_listen_node, tcp4_listen); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_listen_node) = +{ + .function = tcp6_listen, + .name = "tcp6-listen", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_LISTEN_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_LISTEN_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_listen_node, tcp6_listen); + +vlib_node_registration_t tcp4_input_node; +vlib_node_registration_t tcp6_input_node; + +typedef enum _tcp_input_next +{ + TCP_INPUT_NEXT_DROP, + TCP_INPUT_NEXT_LISTEN, + TCP_INPUT_NEXT_RCV_PROCESS, + TCP_INPUT_NEXT_SYN_SENT, + TCP_INPUT_NEXT_ESTABLISHED, + TCP_INPUT_NEXT_RESET, + TCP_INPUT_N_NEXT +} tcp_input_next_t; + +#define foreach_tcp4_input_next \ + _ (DROP, "error-drop") \ + _ (LISTEN, "tcp4-listen") \ + _ (RCV_PROCESS, "tcp4-rcv-process") \ + _ (SYN_SENT, "tcp4-syn-sent") \ + _ (ESTABLISHED, "tcp4-established") \ + _ (RESET, "tcp4-reset") + +#define foreach_tcp6_input_next \ + _ (DROP, "error-drop") \ + _ (LISTEN, "tcp6-listen") \ + _ (RCV_PROCESS, "tcp6-rcv-process") \ + _ (SYN_SENT, "tcp6-syn-sent") \ + _ (ESTABLISHED, "tcp6-established") \ + _ (RESET, "tcp6-reset") + +typedef struct +{ + u16 src_port; + u16 dst_port; + u8 state; +} tcp_rx_trace_t; + +const char *tcp_fsm_states[] = { +#define _(sym, str) str, + foreach_tcp_fsm_state +#undef _ +}; + +u8 * +format_tcp_state (u8 * s, va_list * args) +{ + tcp_state_t *state = va_arg (*args, tcp_state_t *); + + if (state[0] < TCP_N_STATES) + s = format (s, "%s", tcp_fsm_states[state[0]]); + else + s = format (s, "UNKNOWN"); + + return s; +} + +u8 * +format_tcp_rx_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *); + + s = format (s, "TCP: src-port %d dst-port %U%s\n", + clib_net_to_host_u16 (t->src_port), + clib_net_to_host_u16 (t->dst_port), format_tcp_state, t->state); + + return s; +} + +#define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN) + +always_inline uword +tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index; + tcp_main_t *tm = vnet_get_tcp_main (); + session_manager_main_t *ssm = vnet_get_session_manager_main (); + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *tcp0 = 0; + tcp_connection_t *tc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 error0 = TCP_ERROR_NO_LISTENER, next0 = TCP_INPUT_NEXT_DROP; + u8 flags0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + tcp0 = ip4_next_header (ip40); + + /* lookup session */ + tc0 = + (tcp_connection_t *) stream_session_lookup_transport4 (ssm, + &ip40->dst_address, + &ip40->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP4_TCP, + my_thread_index); + } + else + { + ip60 = vlib_buffer_get_current (b0); + tcp0 = ip6_next_header (ip60); + tc0 = + (tcp_connection_t *) stream_session_lookup_transport6 (ssm, + &ip60->src_address, + &ip60->dst_address, + tcp0->src_port, + tcp0->dst_port, + SESSION_TYPE_IP6_TCP, + my_thread_index); + } + + /* Session exists */ + if (PREDICT_TRUE (0 != tc0)) + { + /* Save connection index */ + vnet_buffer (b0)->tcp.connection_index = tc0->c_c_index; + vnet_buffer (b0)->tcp.seq_number = + clib_net_to_host_u32 (tcp0->seq_number); + vnet_buffer (b0)->tcp.ack_number = + clib_net_to_host_u32 (tcp0->ack_number); + + flags0 = tcp0->flags & filter_flags; + next0 = tm->dispatch_table[tc0->state][flags0].next; + error0 = tm->dispatch_table[tc0->state][flags0].error; + + if (PREDICT_FALSE (error0 == TCP_ERROR_DISPATCH)) + { + /* Overload tcp flags to store state */ + vnet_buffer (b0)->tcp.flags = tc0->state; + } + } + else + { + /* Send reset */ + next0 = TCP_INPUT_NEXT_RESET; + error0 = TCP_ERROR_NO_LISTENER; + vnet_buffer (b0)->tcp.flags = 0; + } + + b0->error = error0 ? node->errors[error0] : 0; + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_input (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_input (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_input_node) = +{ + .function = tcp4_input, + .name = "tcp4-input", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_INPUT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_INPUT_NEXT_##s] = n, + foreach_tcp4_input_next +#undef _ + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_rx_trace, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_input_node, tcp4_input); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_input_node) = +{ + .function = tcp6_input, + .name = "tcp6-input", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_INPUT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_INPUT_NEXT_##s] = n, + foreach_tcp6_input_next +#undef _ + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_rx_trace, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_input_node, tcp6_input); +void +tcp_update_time (f64 now, u32 thread_index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tw_timer_expire_timers_16t_2w_512sl (&tm->timer_wheels[thread_index], now); +} + +static void +tcp_dispatch_table_init (tcp_main_t * tm) +{ + int i, j; + for (i = 0; i < ARRAY_LEN (tm->dispatch_table); i++) + for (j = 0; j < ARRAY_LEN (tm->dispatch_table[i]); j++) + { + tm->dispatch_table[i][j].next = TCP_INPUT_NEXT_DROP; + tm->dispatch_table[i][j].error = TCP_ERROR_DISPATCH; + } + +#define _(t,f,n,e) \ +do { \ + tm->dispatch_table[TCP_STATE_##t][f].next = (n); \ + tm->dispatch_table[TCP_STATE_##t][f].error = (e); \ +} while (0) + + /* SYNs for new connections -> tcp-listen. */ + _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE); + /* ACK for for a SYN-ACK -> tcp-rcv-process. */ + _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + /* SYN-ACK for a SYN */ + _(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, + TCP_ERROR_NONE); + _(SYN_SENT, TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); + _(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); + _(SYN_SENT, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, + TCP_ERROR_NONE); + /* ACK for for established connection -> tcp-established. */ + _(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + /* FIN for for established connection -> tcp-established. */ + _(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, + TCP_ERROR_NONE); + /* ACK or FIN-ACK to our FIN */ + _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); + /* FIN in reply to our FIN from the other side */ + _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + /* FIN confirming that the peer (app) has closed */ + _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); +#undef _ +} + +clib_error_t * +tcp_input_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + tcp_main_t *tm = vnet_get_tcp_main (); + + if ((error = vlib_call_init_function (vm, tcp_init))) + return error; + + /* Initialize dispatch table. */ + tcp_dispatch_table_init (tm); + + return error; +} + +VLIB_INIT_FUNCTION (tcp_input_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c new file mode 100644 index 00000000..856dffe4 --- /dev/null +++ b/src/vnet/tcp/tcp_newreno.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +void +newreno_congestion (tcp_connection_t * tc) +{ + tc->prev_ssthresh = tc->ssthresh; + tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); +} + +void +newreno_recovered (tcp_connection_t * tc) +{ + tc->cwnd = tc->ssthresh; +} + +void +newreno_rcv_ack (tcp_connection_t * tc) +{ + if (tcp_in_slowstart (tc)) + { + tc->cwnd += clib_min (tc->snd_mss, tc->bytes_acked); + } + else + { + /* Round up to 1 if needed */ + tc->cwnd += clib_max (tc->snd_mss * tc->snd_mss / tc->cwnd, 1); + } +} + +void +newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) +{ + if (ack_type == TCP_CC_DUPACK) + { + tc->cwnd += tc->snd_mss; + } + else if (ack_type == TCP_CC_PARTIALACK) + { + tc->cwnd -= tc->bytes_acked; + if (tc->bytes_acked > tc->snd_mss) + tc->bytes_acked += tc->snd_mss; + } +} + +void +newreno_conn_init (tcp_connection_t * tc) +{ + tc->ssthresh = tc->snd_wnd; + tc->cwnd = tcp_initial_cwnd (tc); +} + +const static tcp_cc_algorithm_t tcp_newreno = { + .congestion = newreno_congestion, + .recovered = newreno_recovered, + .rcv_ack = newreno_rcv_ack, + .rcv_cong_ack = newreno_rcv_cong_ack, + .init = newreno_conn_init +}; + +clib_error_t * +newreno_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + + tcp_cc_algo_register (TCP_CC_NEWRENO, &tcp_newreno); + + return error; +} + +VLIB_INIT_FUNCTION (newreno_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c new file mode 100644 index 00000000..dbcf1f74 --- /dev/null +++ b/src/vnet/tcp/tcp_output.c @@ -0,0 +1,1412 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +vlib_node_registration_t tcp4_output_node; +vlib_node_registration_t tcp6_output_node; + +typedef enum _tcp_output_nect +{ + TCP_OUTPUT_NEXT_DROP, + TCP_OUTPUT_NEXT_IP_LOOKUP, + TCP_OUTPUT_N_NEXT +} tcp_output_next_t; + +#define foreach_tcp4_output_next \ + _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip4-lookup") + +#define foreach_tcp6_output_next \ + _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip6-lookup") + +static char *tcp_error_strings[] = { +#define tcp_error(n,s) s, +#include +#undef tcp_error +}; + +typedef struct +{ + u16 src_port; + u16 dst_port; + u8 state; +} tcp_tx_trace_t; + +u16 dummy_mtu = 400; + +u8 * +format_tcp_tx_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + + s = format (s, "TBD\n"); + + return s; +} + +void +tcp_set_snd_mss (tcp_connection_t * tc) +{ + u16 snd_mss; + + /* TODO find our iface MTU */ + snd_mss = dummy_mtu; + + /* TODO cache mss and consider PMTU discovery */ + snd_mss = tc->opt.mss < snd_mss ? tc->opt.mss : snd_mss; + + tc->snd_mss = snd_mss; + + if (tc->snd_mss == 0) + { + clib_warning ("snd mss is 0"); + tc->snd_mss = dummy_mtu; + } +} + +static u8 +tcp_window_compute_scale (u32 available_space) +{ + u8 wnd_scale = 0; + while (wnd_scale < TCP_MAX_WND_SCALE + && (available_space >> wnd_scale) > TCP_WND_MAX) + wnd_scale++; + return wnd_scale; +} + +/** + * Compute initial window and scale factor. As per RFC1323, window field in + * SYN and SYN-ACK segments is never scaled. + */ +u32 +tcp_initial_window_to_advertise (tcp_connection_t * tc) +{ + u32 available_space; + + /* Initial wnd for SYN. Fifos are not allocated yet. + * Use some predefined value */ + if (tc->state != TCP_STATE_SYN_RCVD) + { + return TCP_DEFAULT_RX_FIFO_SIZE; + } + + available_space = stream_session_max_enqueue (&tc->connection); + tc->rcv_wscale = tcp_window_compute_scale (available_space); + tc->rcv_wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale); + + return clib_min (tc->rcv_wnd, TCP_WND_MAX); +} + +/** + * Compute and return window to advertise, scaled as per RFC1323 + */ +u32 +tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state) +{ + u32 available_space, wnd, scaled_space; + + if (state != TCP_STATE_ESTABLISHED) + return tcp_initial_window_to_advertise (tc); + + available_space = stream_session_max_enqueue (&tc->connection); + scaled_space = available_space >> tc->rcv_wscale; + + /* Need to update scale */ + if (PREDICT_FALSE ((scaled_space == 0 && available_space != 0)) + || (scaled_space >= TCP_WND_MAX)) + tc->rcv_wscale = tcp_window_compute_scale (available_space); + + wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale); + tc->rcv_wnd = wnd; + + return wnd >> tc->rcv_wscale; +} + +/** + * Write TCP options to segment. + */ +u32 +tcp_options_write (u8 * data, tcp_options_t * opts) +{ + u32 opts_len = 0; + u32 buf, seq_len = 4; + + if (tcp_opts_mss (opts)) + { + *data++ = TCP_OPTION_MSS; + *data++ = TCP_OPTION_LEN_MSS; + buf = clib_host_to_net_u16 (opts->mss); + clib_memcpy (data, &buf, sizeof (opts->mss)); + data += sizeof (opts->mss); + opts_len += TCP_OPTION_LEN_MSS; + } + + if (tcp_opts_wscale (opts)) + { + *data++ = TCP_OPTION_WINDOW_SCALE; + *data++ = TCP_OPTION_LEN_WINDOW_SCALE; + *data++ = opts->wscale; + opts_len += TCP_OPTION_LEN_WINDOW_SCALE; + } + + if (tcp_opts_sack_permitted (opts)) + { + *data++ = TCP_OPTION_SACK_PERMITTED; + *data++ = TCP_OPTION_LEN_SACK_PERMITTED; + opts_len += TCP_OPTION_LEN_SACK_PERMITTED; + } + + if (tcp_opts_tstamp (opts)) + { + *data++ = TCP_OPTION_TIMESTAMP; + *data++ = TCP_OPTION_LEN_TIMESTAMP; + buf = clib_host_to_net_u32 (opts->tsval); + clib_memcpy (data, &buf, sizeof (opts->tsval)); + data += sizeof (opts->tsval); + buf = clib_host_to_net_u32 (opts->tsecr); + clib_memcpy (data, &buf, sizeof (opts->tsecr)); + data += sizeof (opts->tsecr); + opts_len += TCP_OPTION_LEN_TIMESTAMP; + } + + if (tcp_opts_sack (opts)) + { + int i; + u32 n_sack_blocks = clib_min (vec_len (opts->sacks), + TCP_OPTS_MAX_SACK_BLOCKS); + + if (n_sack_blocks != 0) + { + *data++ = TCP_OPTION_SACK_BLOCK; + *data++ = 2 + n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK; + for (i = 0; i < n_sack_blocks; i++) + { + buf = clib_host_to_net_u32 (opts->sacks[i].start); + clib_memcpy (data, &buf, seq_len); + data += seq_len; + buf = clib_host_to_net_u32 (opts->sacks[i].end); + clib_memcpy (data, &buf, seq_len); + data += seq_len; + } + opts_len += 2 + n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK; + } + } + + /* Terminate TCP options */ + if (opts_len % 4) + { + *data++ = TCP_OPTION_EOL; + opts_len += TCP_OPTION_LEN_EOL; + } + + /* Pad with zeroes to a u32 boundary */ + while (opts_len % 4) + { + *data++ = TCP_OPTION_NOOP; + opts_len += TCP_OPTION_LEN_NOOP; + } + return opts_len; +} + +always_inline int +tcp_make_syn_options (tcp_options_t * opts, u32 initial_wnd) +{ + u8 len = 0; + + opts->flags |= TCP_OPTS_FLAG_MSS; + opts->mss = dummy_mtu; /*XXX discover that */ + len += TCP_OPTION_LEN_MSS; + + opts->flags |= TCP_OPTS_FLAG_WSCALE; + opts->wscale = tcp_window_compute_scale (initial_wnd); + len += TCP_OPTION_LEN_WINDOW_SCALE; + + opts->flags |= TCP_OPTS_FLAG_TSTAMP; + opts->tsval = tcp_time_now (); + opts->tsecr = 0; + len += TCP_OPTION_LEN_TIMESTAMP; + + opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + len += TCP_OPTION_LEN_SACK_PERMITTED; + + /* Align to needed boundary */ + len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; + return len; +} + +always_inline int +tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) +{ + u8 len = 0; + + opts->flags |= TCP_OPTS_FLAG_MSS; + opts->mss = dummy_mtu; /*XXX discover that */ + len += TCP_OPTION_LEN_MSS; + + if (tcp_opts_wscale (&tc->opt)) + { + opts->flags |= TCP_OPTS_FLAG_WSCALE; + opts->wscale = tc->rcv_wscale; + len += TCP_OPTION_LEN_WINDOW_SCALE; + } + + if (tcp_opts_tstamp (&tc->opt)) + { + opts->flags |= TCP_OPTS_FLAG_TSTAMP; + opts->tsval = tcp_time_now (); + opts->tsecr = tc->tsval_recent; + len += TCP_OPTION_LEN_TIMESTAMP; + } + + if (tcp_opts_sack_permitted (&tc->opt)) + { + opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + len += TCP_OPTION_LEN_SACK_PERMITTED; + } + + /* Align to needed boundary */ + len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; + return len; +} + +always_inline int +tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts) +{ + u8 len = 0; + + opts->flags = 0; + + if (tcp_opts_tstamp (&tc->opt)) + { + opts->flags |= TCP_OPTS_FLAG_TSTAMP; + opts->tsval = tcp_time_now (); + opts->tsecr = tc->tsval_recent; + len += TCP_OPTION_LEN_TIMESTAMP; + } + if (tcp_opts_sack_permitted (&tc->opt)) + { + if (vec_len (tc->snd_sacks)) + { + opts->flags |= TCP_OPTS_FLAG_SACK; + opts->sacks = tc->snd_sacks; + opts->n_sack_blocks = vec_len (tc->snd_sacks); + len += 2 + TCP_OPTION_LEN_SACK_BLOCK * opts->n_sack_blocks; + } + } + + /* Align to needed boundary */ + len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; + return len; +} + +always_inline int +tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, + tcp_state_t state) +{ + switch (state) + { + case TCP_STATE_ESTABLISHED: + case TCP_STATE_FIN_WAIT_1: + return tcp_make_established_options (tc, opts); + case TCP_STATE_SYN_RCVD: + return tcp_make_synack_options (tc, opts); + case TCP_STATE_SYN_SENT: + return tcp_make_syn_options (opts, + tcp_initial_window_to_advertise (tc)); + default: + clib_warning ("Not handled!"); + return 0; + } +} + +#define tcp_get_free_buffer_index(tm, bidx) \ +do { \ + u32 *my_tx_buffers, n_free_buffers; \ + u32 cpu_index = tm->vlib_main->cpu_index; \ + my_tx_buffers = tm->tx_buffers[cpu_index]; \ + if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ + { \ + n_free_buffers = 32; /* TODO config or macro */ \ + vec_validate (my_tx_buffers, n_free_buffers - 1); \ + _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ + tm->vlib_main, my_tx_buffers, n_free_buffers, \ + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ + tm->tx_buffers[cpu_index] = my_tx_buffers; \ + } \ + /* buffer shortage */ \ + if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ + return; \ + *bidx = my_tx_buffers[_vec_len (my_tx_buffers)-1]; \ + _vec_len (my_tx_buffers) -= 1; \ +} while (0) + +always_inline void +tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) +{ + vlib_buffer_t *it = b; + do + { + it->current_data = 0; + it->current_length = 0; + it->total_length_not_including_first_buffer = 0; + } + while ((it->flags & VLIB_BUFFER_NEXT_PRESENT) + && (it = vlib_get_buffer (vm, it->next_buffer))); + + /* Leave enough space for headers */ + vlib_buffer_make_headroom (b, MAX_HDRS_LEN); +} + +/** + * Prepare ACK + */ +void +tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state, + u8 flags) +{ + tcp_options_t _snd_opts, *snd_opts = &_snd_opts; + u8 tcp_opts_len, tcp_hdr_opts_len; + tcp_header_t *th; + u16 wnd; + + wnd = tcp_window_to_advertise (tc, state); + + /* Make and write options */ + tcp_opts_len = tcp_make_established_options (tc, snd_opts); + tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, + tc->rcv_nxt, tcp_hdr_opts_len, flags, wnd); + + tcp_options_write ((u8 *) (th + 1), snd_opts); + + /* Mark as ACK */ + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; +} + +/** + * Convert buffer to ACK + */ +void +tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + + tcp_reuse_buffer (vm, b); + tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK); + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; +} + +/** + * Convert buffer to FIN-ACK + */ +void +tcp_make_finack (tcp_connection_t * tc, vlib_buffer_t * b) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + + tcp_reuse_buffer (vm, b); + tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK | TCP_FLAG_FIN); + + /* Reset flags, make sure ack is sent */ + tc->flags = TCP_CONN_SNDACK; + vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK; + + tc->snd_nxt += 1; +} + +/** + * Convert buffer to SYN-ACK + */ +void +tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + tcp_options_t _snd_opts, *snd_opts = &_snd_opts; + u8 tcp_opts_len, tcp_hdr_opts_len; + tcp_header_t *th; + u16 initial_wnd; + u32 time_now; + + memset (snd_opts, 0, sizeof (*snd_opts)); + + tcp_reuse_buffer (vm, b); + + /* Set random initial sequence */ + time_now = tcp_time_now (); + + tc->iss = random_u32 (&time_now); + tc->snd_una = tc->iss; + tc->snd_nxt = tc->iss + 1; + tc->snd_una_max = tc->snd_nxt; + + initial_wnd = tcp_initial_window_to_advertise (tc); + + /* Make and write options */ + tcp_opts_len = tcp_make_synack_options (tc, snd_opts); + tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, + tc->rcv_nxt, tcp_hdr_opts_len, + TCP_FLAG_SYN | TCP_FLAG_ACK, initial_wnd); + + tcp_options_write ((u8 *) (th + 1), snd_opts); + + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; + + /* Init retransmit timer */ + tcp_retransmit_timer_set (tm, tc); +} + +always_inline void +tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4) +{ + u32 *to_next, next_index; + vlib_frame_t *f; + + b->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b->error = 0; + + /* Default FIB for now */ + vnet_buffer (b)->sw_if_index[VLIB_TX] = 0; + + /* Send to IP lookup */ + next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index; + f = vlib_get_frame_to_node (vm, next_index); + + /* Enqueue the packet */ + to_next = vlib_frame_vector_args (f); + to_next[0] = bi; + f->n_vectors = 1; + vlib_put_frame_to_node (vm, next_index, f); +} + +int +tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, + tcp_state_t state, u32 my_thread_index, u8 is_ip4) +{ + u8 tcp_hdr_len = sizeof (tcp_header_t); + ip4_header_t *ih4; + ip6_header_t *ih6; + tcp_header_t *th0; + ip4_address_t src_ip40; + ip6_address_t src_ip60; + u16 src_port0; + u32 tmp; + + /* Find IP and TCP headers */ + if (is_ip4) + { + ih4 = vlib_buffer_get_current (b0); + th0 = ip4_next_header (ih4); + } + else + { + ih6 = vlib_buffer_get_current (b0); + th0 = ip6_next_header (ih6); + } + + /* Swap src and dst ip */ + if (is_ip4) + { + ASSERT ((ih4->ip_version_and_header_length & 0xF0) == 0x40); + src_ip40.as_u32 = ih4->src_address.as_u32; + ih4->src_address.as_u32 = ih4->dst_address.as_u32; + ih4->dst_address.as_u32 = src_ip40.as_u32; + + /* Chop the end of the pkt */ + b0->current_length += ip4_header_bytes (ih4) + tcp_hdr_len; + } + else + { + ASSERT ((ih6->ip_version_traffic_class_and_flow_label & 0xF0) == 0x60); + clib_memcpy (&src_ip60, &ih6->src_address, sizeof (ip6_address_t)); + clib_memcpy (&ih6->src_address, &ih6->dst_address, + sizeof (ip6_address_t)); + clib_memcpy (&ih6->dst_address, &src_ip60, sizeof (ip6_address_t)); + + /* Chop the end of the pkt */ + b0->current_length += sizeof (ip6_header_t) + tcp_hdr_len; + } + + /* Try to determine what/why we're actually resetting and swap + * src and dst ports */ + if (state == TCP_STATE_CLOSED) + { + if (!tcp_syn (th0)) + return -1; + + tmp = clib_net_to_host_u32 (th0->seq_number); + + /* Got a SYN for no listener. */ + th0->flags = TCP_FLAG_RST | TCP_FLAG_ACK; + th0->ack_number = clib_host_to_net_u32 (tmp + 1); + th0->seq_number = 0; + + } + else if (state >= TCP_STATE_SYN_SENT) + { + th0->flags = TCP_FLAG_RST | TCP_FLAG_ACK; + th0->seq_number = th0->ack_number; + th0->ack_number = 0; + } + + src_port0 = th0->src_port; + th0->src_port = th0->dst_port; + th0->dst_port = src_port0; + th0->window = 0; + th0->data_offset_and_reserved = (tcp_hdr_len >> 2) << 4; + th0->urgent_pointer = 0; + + /* Compute checksum */ + if (is_ip4) + { + th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih4); + } + else + { + int bogus = ~0; + th0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ih6, &bogus); + ASSERT (!bogus); + } + + return 0; +} + +/** + * Send reset without reusing existing buffer + */ +void +tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4) +{ + vlib_buffer_t *b; + u32 bi; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + u8 tcp_hdr_len, flags = 0; + tcp_header_t *th, *pkt_th; + u32 seq, ack; + ip4_header_t *ih4, *pkt_ih4; + ip6_header_t *ih6, *pkt_ih6; + + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + /* Leave enough space for headers */ + vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + + /* Make and write options */ + tcp_hdr_len = sizeof (tcp_header_t); + + if (is_ip4) + { + pkt_ih4 = vlib_buffer_get_current (pkt); + pkt_th = ip4_next_header (pkt_ih4); + } + else + { + pkt_ih6 = vlib_buffer_get_current (pkt); + pkt_th = ip6_next_header (pkt_ih6); + } + + if (tcp_ack (pkt_th)) + { + flags = TCP_FLAG_RST; + seq = pkt_th->ack_number; + ack = 0; + } + else + { + flags = TCP_FLAG_RST | TCP_FLAG_ACK; + seq = 0; + ack = clib_host_to_net_u32 (vnet_buffer (pkt)->tcp.seq_end); + } + + th = vlib_buffer_push_tcp_net_order (b, pkt_th->dst_port, pkt_th->src_port, + seq, ack, tcp_hdr_len, flags, 0); + + /* Swap src and dst ip */ + if (is_ip4) + { + ASSERT ((pkt_ih4->ip_version_and_header_length & 0xF0) == 0x40); + ih4 = vlib_buffer_push_ip4 (vm, b, &pkt_ih4->dst_address, + &pkt_ih4->src_address, IP_PROTOCOL_TCP); + th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4); + } + else + { + int bogus = ~0; + pkt_ih6 = (ip6_header_t *) (pkt_th - 1); + ASSERT ((pkt_ih6->ip_version_traffic_class_and_flow_label & 0xF0) == + 0x60); + ih6 = + vlib_buffer_push_ip6 (vm, b, &pkt_ih6->dst_address, + &pkt_ih6->src_address, IP_PROTOCOL_TCP); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus); + ASSERT (!bogus); + } + + tcp_enqueue_to_ip_lookup (vm, b, bi, is_ip4); +} + +void +tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b) +{ + tcp_header_t *th = vlib_buffer_get_current (b); + + if (tc->c_is_ip4) + { + ip4_header_t *ih; + ih = vlib_buffer_push_ip4 (tm->vlib_main, b, &tc->c_lcl_ip4, + &tc->c_rmt_ip4, IP_PROTOCOL_TCP); + th->checksum = ip4_tcp_udp_compute_checksum (tm->vlib_main, b, ih); + } + else + { + ip6_header_t *ih; + int bogus = ~0; + + ih = vlib_buffer_push_ip6 (tm->vlib_main, b, &tc->c_lcl_ip6, + &tc->c_rmt_ip6, IP_PROTOCOL_TCP); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (tm->vlib_main, b, ih, + &bogus); + ASSERT (!bogus); + } +} + +/** + * Send SYN + * + * Builds a SYN packet for a half-open connection and sends it to ipx_lookup. + * The packet is not forwarded through tcpx_output to avoid doing lookups + * in the half_open pool. + */ +void +tcp_send_syn (tcp_connection_t * tc) +{ + vlib_buffer_t *b; + u32 bi; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + u8 tcp_hdr_opts_len, tcp_opts_len; + tcp_header_t *th; + u32 time_now; + u16 initial_wnd; + tcp_options_t snd_opts; + + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + /* Leave enough space for headers */ + vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + + /* Set random initial sequence */ + time_now = tcp_time_now (); + + tc->iss = random_u32 (&time_now); + tc->snd_una = tc->iss; + tc->snd_una_max = tc->snd_nxt = tc->iss + 1; + + initial_wnd = tcp_initial_window_to_advertise (tc); + + /* Make and write options */ + memset (&snd_opts, 0, sizeof (snd_opts)); + tcp_opts_len = tcp_make_syn_options (&snd_opts, initial_wnd); + tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, + tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN, + initial_wnd); + + tcp_options_write ((u8 *) (th + 1), &snd_opts); + + /* Measure RTT with this */ + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + + /* Start retransmit trimer */ + tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN, tc->rto * TCP_TO_TIMER_TICK); + tc->rto_boff = 0; + + /* Set the connection establishment timer */ + tcp_timer_set (tc, TCP_TIMER_ESTABLISH, TCP_ESTABLISH_TIME); + + tcp_push_ip_hdr (tm, tc, b); + tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); +} + +always_inline void +tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) +{ + u32 *to_next, next_index; + vlib_frame_t *f; + + b->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b->error = 0; + + /* Decide where to send the packet */ + next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; + f = vlib_get_frame_to_node (vm, next_index); + + /* Enqueue the packet */ + to_next = vlib_frame_vector_args (f); + to_next[0] = bi; + f->n_vectors = 1; + vlib_put_frame_to_node (vm, next_index, f); +} + +/** + * Send FIN + */ +void +tcp_send_fin (tcp_connection_t * tc) +{ + vlib_buffer_t *b; + u32 bi; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + /* Leave enough space for headers */ + vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + + tcp_make_finack (tc, b); + + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); +} + +always_inline u8 +tcp_make_state_flags (tcp_state_t next_state) +{ + switch (next_state) + { + case TCP_STATE_ESTABLISHED: + return TCP_FLAG_ACK; + case TCP_STATE_SYN_RCVD: + return TCP_FLAG_SYN | TCP_FLAG_ACK; + case TCP_STATE_SYN_SENT: + return TCP_FLAG_SYN; + case TCP_STATE_LAST_ACK: + case TCP_STATE_FIN_WAIT_1: + return TCP_FLAG_FIN; + default: + clib_warning ("Shouldn't be here!"); + } + return 0; +} + +/** + * Push TCP header and update connection variables + */ +static void +tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, + tcp_state_t next_state) +{ + u32 advertise_wnd, data_len; + u8 tcp_opts_len, tcp_hdr_opts_len, opts_write_len, flags; + tcp_options_t _snd_opts, *snd_opts = &_snd_opts; + tcp_header_t *th; + + data_len = b->current_length; + vnet_buffer (b)->tcp.flags = 0; + + /* Make and write options */ + memset (snd_opts, 0, sizeof (*snd_opts)); + tcp_opts_len = tcp_make_options (tc, snd_opts, next_state); + tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + + /* Get rcv window to advertise */ + advertise_wnd = tcp_window_to_advertise (tc, next_state); + flags = tcp_make_state_flags (next_state); + + /* Push header and options */ + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, + tc->rcv_nxt, tcp_hdr_opts_len, flags, + advertise_wnd); + + opts_write_len = tcp_options_write ((u8 *) (th + 1), snd_opts); + + ASSERT (opts_write_len == tcp_opts_len); + + /* Tag the buffer with the connection index */ + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + + tc->snd_nxt += data_len; +} + +/* Send delayed ACK when timer expires */ +void +tcp_timer_delack_handler (u32 index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + u32 thread_index = os_get_cpu_number (); + tcp_connection_t *tc; + vlib_buffer_t *b; + u32 bi; + + tc = tcp_connection_get (index, thread_index); + + /* Get buffer */ + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + /* Fill in the ACK */ + tcp_make_ack (tc, b); + + tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID; + tc->flags &= ~TCP_CONN_DELACK; + + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); +} + +/** Build a retransmit segment + * + * @return the number of bytes in the segment or 0 if there's nothing to + * retransmit + * */ +u32 +tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, + u32 max_bytes) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + u32 n_bytes, offset = 0; + sack_scoreboard_hole_t *hole; + u32 hole_size; + + tcp_reuse_buffer (vm, b); + + ASSERT (tc->state == TCP_STATE_ESTABLISHED); + ASSERT (max_bytes != 0); + + if (tcp_opts_sack_permitted (&tc->opt)) + { + /* XXX get first hole not retransmitted yet */ + hole = scoreboard_first_hole (&tc->sack_sb); + if (!hole) + return 0; + + offset = hole->start - tc->snd_una; + hole_size = hole->end - hole->start; + + ASSERT (hole_size); + + if (hole_size < max_bytes) + max_bytes = hole_size; + } + else + { + if (seq_geq (tc->snd_nxt, tc->snd_una_max)) + return 0; + } + + n_bytes = stream_session_peek_bytes (&tc->connection, + vlib_buffer_get_current (b), offset, + max_bytes); + ASSERT (n_bytes != 0); + + tc->snd_nxt += n_bytes; + tcp_push_hdr_i (tc, b, tc->state); + + return n_bytes; +} + +static void +tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + u32 thread_index = os_get_cpu_number (); + tcp_connection_t *tc; + vlib_buffer_t *b; + u32 bi, max_bytes, snd_space; + + if (is_syn) + { + tc = tcp_half_open_connection_get (index); + } + else + { + tc = tcp_connection_get (index, thread_index); + } + + /* Make sure timer handle is set to invalid */ + tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; + + /* Increment RTO backoff (also equal to number of retries) */ + tc->rto_boff += 1; + + /* Go back to first un-acked byte */ + tc->snd_nxt = tc->snd_una; + + /* Get buffer */ + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + if (tc->state == TCP_STATE_ESTABLISHED) + { + tcp_fastrecovery_off (tc); + + /* Exponential backoff */ + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + + /* Figure out what and how many bytes we can send */ + snd_space = tcp_available_snd_space (tc); + max_bytes = clib_min (tc->snd_mss, snd_space); + tcp_prepare_retransmit_segment (tc, b, max_bytes); + + tc->rtx_bytes += max_bytes; + + /* No fancy recovery for now! */ + scoreboard_clear (&tc->sack_sb); + } + else + { + /* Retransmit for SYN/SYNACK */ + ASSERT (tc->state == TCP_STATE_SYN_RCVD + || tc->state == TCP_STATE_SYN_SENT); + + /* Try without increasing RTO a number of times. If this fails, + * start growing RTO exponentially */ + if (tc->rto_boff > TCP_RTO_SYN_RETRIES) + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + + vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + tcp_push_hdr_i (tc, b, tc->state); + } + + if (!is_syn) + { + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + + /* Re-enable retransmit timer */ + tcp_retransmit_timer_set (tm, tc); + } + else + { + ASSERT (tc->state == TCP_STATE_SYN_SENT); + + /* This goes straight to ipx_lookup */ + tcp_push_ip_hdr (tm, tc, b); + tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); + + /* Re-enable retransmit timer */ + tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN, + tc->rto * TCP_TO_TIMER_TICK); + } +} + +void +tcp_timer_retransmit_handler (u32 index) +{ + tcp_timer_retransmit_handler_i (index, 0); +} + +void +tcp_timer_retransmit_syn_handler (u32 index) +{ + tcp_timer_retransmit_handler_i (index, 1); +} + +/** + * Retansmit first unacked segment */ +void +tcp_retransmit_first_unacked (tcp_connection_t * tc) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 snd_nxt = tc->snd_nxt; + vlib_buffer_t *b; + u32 bi; + + tc->snd_nxt = tc->snd_una; + + /* Get buffer */ + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (tm->vlib_main, bi); + + tcp_prepare_retransmit_segment (tc, b, tc->snd_mss); + tcp_enqueue_to_output (tm->vlib_main, b, bi, tc->c_is_ip4); + + tc->snd_nxt = snd_nxt; + tc->rtx_bytes += tc->snd_mss; +} + +void +tcp_fast_retransmit (tcp_connection_t * tc) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 snd_space, max_bytes, n_bytes, bi; + vlib_buffer_t *b; + + ASSERT (tcp_in_fastrecovery (tc)); + + clib_warning ("fast retransmit!"); + + /* Start resending from first un-acked segment */ + tc->snd_nxt = tc->snd_una; + + snd_space = tcp_available_snd_space (tc); + + while (snd_space) + { + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (tm->vlib_main, bi); + + max_bytes = clib_min (tc->snd_mss, snd_space); + n_bytes = tcp_prepare_retransmit_segment (tc, b, max_bytes); + + /* Nothing left to retransmit */ + if (n_bytes == 0) + return; + + tcp_enqueue_to_output (tm->vlib_main, b, bi, tc->c_is_ip4); + + snd_space -= n_bytes; + } + + /* If window allows, send new data */ + tc->snd_nxt = tc->snd_una_max; +} + +always_inline u32 +tcp_session_has_ooo_data (tcp_connection_t * tc) +{ + stream_session_t *s = + stream_session_get (tc->c_s_index, tc->c_thread_index); + return svm_fifo_has_ooo_data (s->server_rx_fifo); +} + +always_inline uword +tcp46_output_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_connection_t *tc0; + tcp_header_t *th0; + u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, + my_thread_index); + th0 = vlib_buffer_get_current (b0); + + if (is_ip4) + { + ip4_header_t *ih0; + ih0 = vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, + &tc0->c_rmt_ip4, IP_PROTOCOL_TCP); + th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih0); + } + else + { + ip6_header_t *ih0; + int bogus = ~0; + + ih0 = vlib_buffer_push_ip6 (vm, b0, &tc0->c_lcl_ip6, + &tc0->c_rmt_ip6, IP_PROTOCOL_TCP); + th0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ih0, + &bogus); + ASSERT (!bogus); + } + + /* Filter out DUPACKs if there are no OOO segments left */ + if (PREDICT_FALSE + (vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK)) + { + tc0->snt_dupacks--; + ASSERT (tc0->snt_dupacks >= 0); + if (!tcp_session_has_ooo_data (tc0)) + { + error0 = TCP_ERROR_FILTERED_DUPACKS; + next0 = TCP_OUTPUT_NEXT_DROP; + goto done; + } + } + + /* Retransmitted SYNs do reach this but it should be harmless */ + tc0->rcv_las = tc0->rcv_nxt; + + /* Stop DELACK timer and fix flags */ + tc0->flags &= + ~(TCP_CONN_SNDACK | TCP_CONN_DELACK | TCP_CONN_BURSTACK); + if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK)) + { + tcp_timer_reset (tc0, TCP_TIMER_DELACK); + } + + /* If not retransmitting + * 1) update snd_una_max (SYN, SYNACK, new data, FIN) + * 2) If we're not tracking an ACK, start tracking */ + if (seq_lt (tc0->snd_una_max, tc0->snd_nxt)) + { + tc0->snd_una_max = tc0->snd_nxt; + if (tc0->rtt_ts == 0) + { + tc0->rtt_ts = tcp_time_now (); + tc0->rtt_seq = tc0->snd_nxt; + } + } + + /* Set the retransmit timer if not set already and not + * doing a pure ACK */ + if (!tcp_timer_is_active (tc0, TCP_TIMER_RETRANSMIT) + && tc0->snd_nxt != tc0->snd_una) + { + tcp_retransmit_timer_set (tm, tc0); + tc0->rto_boff = 0; + } + + /* set fib index to default and lookup node */ + /* XXX network virtualization (vrf/vni) */ + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + + b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + + done: + b0->error = error0 != 0 ? node->errors[error0] : 0; + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_output (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_output_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_output (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_output_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +VLIB_REGISTER_NODE (tcp4_output_node) = +{ + .function = tcp4_output,.name = "tcp4-output", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32),.n_errors = TCP_N_ERROR,.error_strings = + tcp_error_strings,.n_next_nodes = TCP_OUTPUT_N_NEXT,.next_nodes = + { +#define _(s,n) [TCP_OUTPUT_NEXT_##s] = n, + foreach_tcp4_output_next +#undef _ + } +,.format_buffer = format_tcp_header,.format_trace = format_tcp_tx_trace,}; + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_output_node, tcp4_output) +VLIB_REGISTER_NODE (tcp6_output_node) = +{ + .function = tcp6_output,.name = "tcp6-output", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32),.n_errors = TCP_N_ERROR,.error_strings = + tcp_error_strings,.n_next_nodes = TCP_OUTPUT_N_NEXT,.next_nodes = + { +#define _(s,n) [TCP_OUTPUT_NEXT_##s] = n, + foreach_tcp6_output_next +#undef _ + } +,.format_buffer = format_tcp_header,.format_trace = format_tcp_tx_trace,}; + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_output_node, tcp6_output) u32 +tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) +{ + tcp_connection_t *tc; + + tc = (tcp_connection_t *) tconn; + tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED); + return 0; +} + +typedef enum _tcp_reset_next +{ + TCP_RESET_NEXT_DROP, + TCP_RESET_NEXT_IP_LOOKUP, + TCP_RESET_N_NEXT +} tcp_reset_next_t; + +#define foreach_tcp4_reset_next \ + _(DROP, "error-drop") \ + _(IP_LOOKUP, "ip4-lookup") + +#define foreach_tcp6_reset_next \ + _(DROP, "error-drop") \ + _(IP_LOOKUP, "ip6-lookup") + +static uword +tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, u8 is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 error0 = TCP_ERROR_RST_SENT, next0 = TCP_RESET_NEXT_IP_LOOKUP; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + if (tcp_make_reset_in_place (vm, b0, vnet_buffer (b0)->tcp.flags, + my_thread_index, is_ip4)) + { + error0 = TCP_ERROR_LOOKUP_DROPS; + next0 = TCP_RESET_NEXT_DROP; + goto done; + } + + /* Prepare to send to IP lookup */ + vnet_buffer (b0)->sw_if_index[VLIB_TX] = 0; + next0 = TCP_RESET_NEXT_IP_LOOKUP; + + done: + b0->error = error0 != 0 ? node->errors[error0] : 0; + b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +static uword +tcp4_send_reset (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_send_reset_inline (vm, node, from_frame, 1); +} + +static uword +tcp6_send_reset (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_send_reset_inline (vm, node, from_frame, 0); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_reset_node) = { + .function = tcp4_send_reset, + .name = "tcp4-reset", + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RESET_N_NEXT, + .next_nodes = { +#define _(s,n) [TCP_RESET_NEXT_##s] = n, + foreach_tcp4_reset_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_reset_node) = { + .function = tcp6_send_reset, + .name = "tcp6-reset", + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RESET_N_NEXT, + .next_nodes = { +#define _(s,n) [TCP_RESET_NEXT_##s] = n, + foreach_tcp6_reset_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h new file mode 100644 index 00000000..866c5fd6 --- /dev/null +++ b/src/vnet/tcp/tcp_packet.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_tcp_packet_h +#define included_tcp_packet_h + +#include + +/* TCP flags bit 0 first. */ +#define foreach_tcp_flag \ + _ (FIN) /**< No more data from sender. */ \ + _ (SYN) /**< Synchronize sequence numbers. */ \ + _ (RST) /**< Reset the connection. */ \ + _ (PSH) /**< Push function. */ \ + _ (ACK) /**< Ack field significant. */ \ + _ (URG) /**< Urgent pointer field significant. */ \ + _ (ECE) /**< ECN-echo. Receiver got CE packet */ \ + _ (CWR) /**< Sender reduced congestion window */ + +enum +{ +#define _(f) TCP_FLAG_BIT_##f, + foreach_tcp_flag +#undef _ + TCP_N_FLAG_BITS, +}; + +enum +{ +#define _(f) TCP_FLAG_##f = 1 << TCP_FLAG_BIT_##f, + foreach_tcp_flag +#undef _ +}; + +typedef struct _tcp_header +{ + union + { + struct + { + u16 src_port; /**< Source port. */ + u16 dst_port; /**< Destination port. */ + }; + struct + { + u16 src, dst; + }; + }; + + u32 seq_number; /**< Sequence number of the first data octet in this + * segment, except when SYN is present. If SYN + * is present the seq number is is the ISN and the + * first data octet is ISN+1 */ + u32 ack_number; /**< Acknowledgement number if ACK is set. It contains + * the value of the next sequence number the sender + * of the segment is expecting to receive. */ + u8 data_offset_and_reserved; + u8 flags; /**< Flags: see the macro above */ + u16 window; /**< Number of bytes sender is willing to receive. */ + + u16 checksum; /**< Checksum of TCP pseudo header and data. */ + u16 urgent_pointer; /**< Seq number of the byte after the urgent data. */ +} __attribute__ ((packed)) tcp_header_t; + +/* Flag tests that return 0 or !0 */ +#define tcp_doff(_th) ((_th)->data_offset_and_reserved >> 4) +#define tcp_fin(_th) ((_th)->flags & TCP_FLAG_FIN) +#define tcp_syn(_th) ((_th)->flags & TCP_FLAG_SYN) +#define tcp_rst(_th) ((_th)->flags & TCP_FLAG_RST) +#define tcp_psh(_th) ((_th)->flags & TCP_FLAG_PSH) +#define tcp_ack(_th) ((_th)->flags & TCP_FLAG_ACK) +#define tcp_urg(_th) ((_th)->flags & TCP_FLAG_URG) +#define tcp_ece(_th) ((_th)->flags & TCP_FLAG_ECE) +#define tcp_cwr(_th) ((_th)->flags & TCP_FLAG_CWR) + +/* Flag tests that return 0 or 1 */ +#define tcp_is_syn(_th) !!((_th)->flags & TCP_FLAG_SYN) +#define tcp_is_fin(_th) !!((_th)->flags & TCP_FLAG_FIN) + +always_inline int +tcp_header_bytes (tcp_header_t * t) +{ + return tcp_doff (t) * sizeof (u32); +} + +/* + * TCP options. + */ + +typedef enum tcp_option_type +{ + TCP_OPTION_EOL = 0, /**< End of options. */ + TCP_OPTION_NOOP = 1, /**< No operation. */ + TCP_OPTION_MSS = 2, /**< Limit MSS. */ + TCP_OPTION_WINDOW_SCALE = 3, /**< Window scale. */ + TCP_OPTION_SACK_PERMITTED = 4, /**< Selective Ack permitted. */ + TCP_OPTION_SACK_BLOCK = 5, /**< Selective Ack block. */ + TCP_OPTION_TIMESTAMP = 8, /**< Timestamps. */ + TCP_OPTION_UTO = 28, /**< User timeout. */ + TCP_OPTION_AO = 29, /**< Authentication Option. */ +} tcp_option_type_t; + +#define foreach_tcp_options_flag \ + _ (MSS) /**< MSS advertised in SYN */ \ + _ (TSTAMP) /**< Timestamp capability advertised in SYN */ \ + _ (WSCALE) /**< Wnd scale capability advertised in SYN */ \ + _ (SACK_PERMITTED) /**< SACK capability advertised in SYN */ \ + _ (SACK) /**< SACK present */ + +enum +{ +#define _(f) TCP_OPTS_FLAG_BIT_##f, + foreach_tcp_options_flag +#undef _ + TCP_OPTIONS_N_FLAG_BITS, +}; + +enum +{ +#define _(f) TCP_OPTS_FLAG_##f = 1 << TCP_OPTS_FLAG_BIT_##f, + foreach_tcp_options_flag +#undef _ +}; + +typedef struct _sack_block +{ + u32 start; /**< Start sequence number */ + u32 end; /**< End sequence number */ +} sack_block_t; + +typedef struct +{ + u8 flags; /** Option flags, see above */ + + /* Received options */ + u16 mss; /**< Maximum segment size advertised by peer */ + u8 wscale; /**< Window scale advertised by peer */ + u32 tsval; /**< Peer's timestamp value */ + u32 tsecr; /**< Echoed/reflected time stamp */ + sack_block_t *sacks; /**< SACK blocks received */ + u8 n_sack_blocks; /**< Number of SACKs blocks */ +} tcp_options_t; + +/* Flag tests that return 0 or !0 */ +#define tcp_opts_mss(_to) ((_to)->flags & TCP_OPTS_FLAG_MSS) +#define tcp_opts_tstamp(_to) ((_to)->flags & TCP_OPTS_FLAG_TSTAMP) +#define tcp_opts_wscale(_to) ((_to)->flags & TCP_OPTS_FLAG_WSCALE) +#define tcp_opts_sack(_to) ((_to)->flags & TCP_OPTS_FLAG_SACK) +#define tcp_opts_sack_permitted(_to) ((_to)->flags & TCP_OPTS_FLAG_SACK_PERMITTED) + +/* TCP option lengths */ +#define TCP_OPTION_LEN_EOL 1 +#define TCP_OPTION_LEN_NOOP 1 +#define TCP_OPTION_LEN_MSS 4 +#define TCP_OPTION_LEN_WINDOW_SCALE 3 +#define TCP_OPTION_LEN_SACK_PERMITTED 2 +#define TCP_OPTION_LEN_TIMESTAMP 10 +#define TCP_OPTION_LEN_SACK_BLOCK 8 + +#define TCP_WND_MAX 65535U +#define TCP_MAX_WND_SCALE 14 /* See RFC 1323 */ +#define TCP_OPTS_ALIGN 4 +#define TCP_OPTS_MAX_SACK_BLOCKS 3 +#endif /* included_tcp_packet_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_pg.c b/src/vnet/tcp/tcp_pg.c new file mode 100644 index 00000000..dc324049 --- /dev/null +++ b/src/vnet/tcp/tcp_pg.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/tcp_pg: TCP packet-generator interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +/* TCP flags bit 0 first. */ +#define foreach_tcp_flag \ + _ (FIN) \ + _ (SYN) \ + _ (RST) \ + _ (PSH) \ + _ (ACK) \ + _ (URG) \ + _ (ECE) \ + _ (CWR) + +static void +tcp_pg_edit_function (pg_main_t * pg, + pg_stream_t * s, + pg_edit_group_t * g, + u32 * packets, + u32 n_packets) +{ + vlib_main_t * vm = vlib_get_main(); + u32 ip_offset, tcp_offset; + + tcp_offset = g->start_byte_offset; + ip_offset = (g-1)->start_byte_offset; + + while (n_packets >= 1) + { + vlib_buffer_t * p0; + ip4_header_t * ip0; + tcp_header_t * tcp0; + ip_csum_t sum0; + u32 tcp_len0; + + p0 = vlib_get_buffer (vm, packets[0]); + n_packets -= 1; + packets += 1; + + ASSERT (p0->current_data == 0); + ip0 = (void *) (p0->data + ip_offset); + tcp0 = (void *) (p0->data + tcp_offset); + tcp_len0 = clib_net_to_host_u16 (ip0->length) - sizeof (ip0[0]); + + /* Initialize checksum with header. */ + if (BITS (sum0) == 32) + { + sum0 = clib_mem_unaligned (&ip0->src_address, u32); + sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->dst_address, u32)); + } + else + sum0 = clib_mem_unaligned (&ip0->src_address, u64); + + sum0 = ip_csum_with_carry + (sum0, clib_host_to_net_u32 (tcp_len0 + (ip0->protocol << 16))); + + /* Invalidate possibly old checksum. */ + tcp0->checksum = 0; + + sum0 = ip_incremental_checksum_buffer (vm, p0, tcp_offset, tcp_len0, sum0); + + tcp0->checksum = ~ ip_csum_fold (sum0); + } +} + +typedef struct { + pg_edit_t src, dst; + pg_edit_t seq_number, ack_number; + pg_edit_t data_offset_and_reserved; +#define _(f) pg_edit_t f##_flag; + foreach_tcp_flag +#undef _ + pg_edit_t window; + pg_edit_t checksum; + pg_edit_t urgent_pointer; +} pg_tcp_header_t; + +static inline void +pg_tcp_header_init (pg_tcp_header_t * p) +{ + /* Initialize fields that are not bit fields in the IP header. */ +#define _(f) pg_edit_init (&p->f, tcp_header_t, f); + _ (src); + _ (dst); + _ (seq_number); + _ (ack_number); + _ (window); + _ (checksum); + _ (urgent_pointer); +#undef _ + + /* Initialize bit fields. */ +#define _(f) \ + pg_edit_init_bitfield (&p->f##_flag, tcp_header_t, \ + flags, \ + TCP_FLAG_BIT_##f, 1); + + foreach_tcp_flag +#undef _ + + pg_edit_init_bitfield (&p->data_offset_and_reserved, tcp_header_t, + data_offset_and_reserved, + 4, 4); +} + +uword +unformat_pg_tcp_header (unformat_input_t * input, va_list * args) +{ + pg_stream_t * s = va_arg (*args, pg_stream_t *); + pg_tcp_header_t * p; + u32 group_index; + + p = pg_create_edit_group (s, sizeof (p[0]), sizeof (tcp_header_t), + &group_index); + pg_tcp_header_init (p); + + /* Defaults. */ + pg_edit_set_fixed (&p->seq_number, 0); + pg_edit_set_fixed (&p->ack_number, 0); + + pg_edit_set_fixed (&p->data_offset_and_reserved, + sizeof (tcp_header_t) / sizeof (u32)); + + pg_edit_set_fixed (&p->window, 4096); + pg_edit_set_fixed (&p->urgent_pointer, 0); + +#define _(f) pg_edit_set_fixed (&p->f##_flag, 0); + foreach_tcp_flag +#undef _ + + p->checksum.type = PG_EDIT_UNSPECIFIED; + + if (! unformat (input, "TCP: %U -> %U", + unformat_pg_edit, + unformat_tcp_udp_port, &p->src, + unformat_pg_edit, + unformat_tcp_udp_port, &p->dst)) + goto error; + + /* Parse options. */ + while (1) + { + if (unformat (input, "window %U", + unformat_pg_edit, + unformat_pg_number, &p->window)) + ; + + else if (unformat (input, "checksum %U", + unformat_pg_edit, + unformat_pg_number, &p->checksum)) + ; + + /* Flags. */ +#define _(f) else if (unformat (input, #f)) pg_edit_set_fixed (&p->f##_flag, 1); + foreach_tcp_flag +#undef _ + + /* Can't parse input: try next protocol level. */ + else + break; + } + + { + ip_main_t * im = &ip_main; + u16 dst_port; + tcp_udp_port_info_t * pi; + + pi = 0; + if (p->dst.type == PG_EDIT_FIXED) + { + dst_port = pg_edit_get_value (&p->dst, PG_EDIT_LO); + pi = ip_get_tcp_udp_port_info (im, dst_port); + } + + if (pi && pi->unformat_pg_edit + && unformat_user (input, pi->unformat_pg_edit, s)) + ; + + else if (! unformat_user (input, unformat_pg_payload, s)) + goto error; + + if (p->checksum.type == PG_EDIT_UNSPECIFIED) + { + pg_edit_group_t * g = pg_stream_get_group (s, group_index); + g->edit_function = tcp_pg_edit_function; + g->edit_function_opaque = 0; + } + + return 1; + } + + error: + /* Free up any edits we may have added. */ + pg_free_edit_group (s); + return 0; +} + diff --git a/src/vnet/tcp/tcp_syn_filter4.c b/src/vnet/tcp/tcp_syn_filter4.c new file mode 100644 index 00000000..c7605a30 --- /dev/null +++ b/src/vnet/tcp/tcp_syn_filter4.c @@ -0,0 +1,542 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +typedef struct +{ + f64 next_reset; + f64 reset_interval; + u8 *syn_counts; +} syn_filter4_runtime_t; + +typedef struct +{ + u32 next_index; + int not_a_syn; + u8 filter_value; +} syn_filter4_trace_t; + +/* packet trace format function */ +static u8 * +format_syn_filter4_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + syn_filter4_trace_t *t = va_arg (*args, syn_filter4_trace_t *); + + s = format (s, "SYN_FILTER4: next index %d, %s", + t->next_index, t->not_a_syn ? "not a syn" : "syn"); + if (t->not_a_syn == 0) + s = format (s, ", filter value %d\n", t->filter_value); + else + s = format (s, "\n"); + return s; +} + +static vlib_node_registration_t syn_filter4_node; + +#define foreach_syn_filter_error \ +_(THROTTLED, "TCP SYN packet throttle drops") \ +_(OK, "TCP SYN packets passed") + +typedef enum +{ +#define _(sym,str) SYN_FILTER_ERROR_##sym, + foreach_syn_filter_error +#undef _ + SYN_FILTER_N_ERROR, +} syn_filter_error_t; + +static char *syn_filter4_error_strings[] = { +#define _(sym,string) string, + foreach_syn_filter_error +#undef _ +}; + +typedef enum +{ + SYN_FILTER_NEXT_DROP, + SYN_FILTER_N_NEXT, +} syn_filter_next_t; + +extern vnet_feature_arc_registration_t vnet_feat_arc_ip4_local; + +static uword +syn_filter4_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + u32 n_left_from, *from, *to_next; + syn_filter_next_t next_index; + u32 ok_syn_packets = 0; + vnet_feature_main_t *fm = &feature_main; + u8 arc_index = vnet_feat_arc_ip4_local.feature_arc_index; + vnet_feature_config_main_t *cm = &fm->feature_config_mains[arc_index]; + syn_filter4_runtime_t *rt = (syn_filter4_runtime_t *) node->runtime_data; + f64 now = vlib_time_now (vm); + /* Shut up spurious gcc warnings. */ + u8 *c0 = 0, *c1 = 0, *c2 = 0, *c3 = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (now > rt->next_reset) + { + memset (rt->syn_counts, 0, vec_len (rt->syn_counts)); + rt->next_reset = now + rt->reset_interval; + } + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 8 && n_left_to_next >= 4) + { + u32 bi0, bi1, bi2, bi3; + vlib_buffer_t *b0, *b1, *b2, *b3; + u32 next0, next1, next2, next3; + ip4_header_t *ip0, *ip1, *ip2, *ip3; + tcp_header_t *tcp0, *tcp1, *tcp2, *tcp3; + u32 not_a_syn0 = 1, not_a_syn1 = 1, not_a_syn2 = 1, not_a_syn3 = 1; + u64 hash0, hash1, hash2, hash3; + + /* Prefetch next iteration. */ + { + vlib_buffer_t *p4, *p5, *p6, *p7; + + p4 = vlib_get_buffer (vm, from[4]); + p5 = vlib_get_buffer (vm, from[5]); + p6 = vlib_get_buffer (vm, from[6]); + p7 = vlib_get_buffer (vm, from[7]); + + vlib_prefetch_buffer_header (p4, LOAD); + vlib_prefetch_buffer_header (p5, LOAD); + vlib_prefetch_buffer_header (p6, LOAD); + vlib_prefetch_buffer_header (p7, LOAD); + + CLIB_PREFETCH (p4->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p5->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p6->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p7->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + /* speculatively enqueue b0 and b1 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + to_next[2] = bi2 = from[2]; + to_next[3] = bi3 = from[3]; + from += 4; + to_next += 4; + n_left_from -= 4; + n_left_to_next -= 4; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + b2 = vlib_get_buffer (vm, bi2); + b3 = vlib_get_buffer (vm, bi3); + + vnet_get_config_data + (&cm->config_main, &b0->current_config_index, + &next0, 0 /* sizeof (c0[0]) */ ); + vnet_get_config_data + (&cm->config_main, &b1->current_config_index, + &next1, 0 /* sizeof (c0[0]) */ ); + vnet_get_config_data + (&cm->config_main, &b2->current_config_index, + &next2, 0 /* sizeof (c0[0]) */ ); + vnet_get_config_data + (&cm->config_main, &b3->current_config_index, + &next3, 0 /* sizeof (c0[0]) */ ); + + /* Not TCP? */ + ip0 = vlib_buffer_get_current (b0); + if (ip0->protocol != IP_PROTOCOL_TCP) + goto trace00; + + tcp0 = ip4_next_header (ip0); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. + */ + if (PREDICT_TRUE (!(tcp0->flags & 0x2))) + goto trace00; + + not_a_syn0 = 0; + hash0 = clib_xxhash ((u64) ip0->src_address.as_u32); + c0 = &rt->syn_counts[hash0 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c0 >= 0x80)) + { + next0 = SYN_FILTER_NEXT_DROP; + b0->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace00; + } + *c0 += 1; + ok_syn_packets++; + + trace00: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->not_a_syn = not_a_syn0; + t->next_index = next0; + t->filter_value = not_a_syn0 ? 0 : *c0; + } + + /* Not TCP? */ + ip1 = vlib_buffer_get_current (b1); + if (ip1->protocol != IP_PROTOCOL_TCP) + goto trace01; + + tcp1 = ip4_next_header (ip1); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. + */ + if (PREDICT_TRUE (!(tcp1->flags & 0x2))) + goto trace01; + + not_a_syn1 = 0; + hash1 = clib_xxhash ((u64) ip1->src_address.as_u32); + c1 = &rt->syn_counts[hash1 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c1 >= 0x80)) + { + next1 = SYN_FILTER_NEXT_DROP; + b1->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace01; + } + *c1 += 1; + ok_syn_packets++; + + trace01: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b1->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->not_a_syn = not_a_syn1; + t->next_index = next1; + t->filter_value = not_a_syn1 ? 0 : *c1; + } + + /* Not TCP? */ + ip2 = vlib_buffer_get_current (b2); + if (ip2->protocol != IP_PROTOCOL_TCP) + goto trace02; + + tcp2 = ip4_next_header (ip2); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. + */ + if (PREDICT_TRUE (!(tcp2->flags & 0x2))) + goto trace02; + + not_a_syn2 = 0; + hash2 = clib_xxhash ((u64) ip2->src_address.as_u32); + c2 = &rt->syn_counts[hash2 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c2 >= 0x80)) + { + next2 = SYN_FILTER_NEXT_DROP; + b2->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace02; + } + *c2 += 1; + ok_syn_packets++; + + trace02: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b2->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b2, sizeof (*t)); + t->not_a_syn = not_a_syn2; + t->next_index = next2; + t->filter_value = not_a_syn2 ? 0 : *c2; + } + + /* Not TCP? */ + ip3 = vlib_buffer_get_current (b3); + if (ip3->protocol != IP_PROTOCOL_TCP) + goto trace03; + + tcp3 = ip4_next_header (ip3); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. + */ + if (PREDICT_TRUE (!(tcp3->flags & 0x2))) + goto trace03; + + not_a_syn3 = 0; + hash3 = clib_xxhash ((u64) ip3->src_address.as_u32); + c3 = &rt->syn_counts[hash3 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c3 >= 0x80)) + { + next3 = SYN_FILTER_NEXT_DROP; + b3->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace03; + } + *c3 += 1; + ok_syn_packets++; + + trace03: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b3->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b3, sizeof (*t)); + t->not_a_syn = not_a_syn3; + t->next_index = next3; + t->filter_value = not_a_syn3 ? 0 : *c3; + } + vlib_validate_buffer_enqueue_x4 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, bi2, bi3, + next0, next1, next2, next3); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0; + ip4_header_t *ip0; + tcp_header_t *tcp0; + u32 not_a_syn0 = 1; + u32 hash0; + u8 *c0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + vnet_get_config_data + (&cm->config_main, &b0->current_config_index, + &next0, 0 /* sizeof (c0[0]) */ ); + + /* Not TCP? */ + ip0 = vlib_buffer_get_current (b0); + if (ip0->protocol != IP_PROTOCOL_TCP) + goto trace0; + + tcp0 = ip4_next_header (ip0); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. + */ + if (PREDICT_TRUE (!(tcp0->flags & 0x2))) + goto trace0; + + not_a_syn0 = 0; + hash0 = clib_xxhash ((u64) ip0->src_address.as_u32); + c0 = &rt->syn_counts[hash0 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c0 >= 0x80)) + { + next0 = SYN_FILTER_NEXT_DROP; + b0->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace0; + } + *c0 += 1; + ok_syn_packets++; + + trace0: + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->not_a_syn = not_a_syn0; + t->next_index = next0; + t->filter_value = not_a_syn0 ? 0 : *c0; + } + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, syn_filter4_node.index, + SYN_FILTER_ERROR_OK, ok_syn_packets); + return frame->n_vectors; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (syn_filter4_node, static) = +{ + .function = syn_filter4_node_fn, + .name = "syn-filter-4", + .vector_size = sizeof (u32), + .format_trace = format_syn_filter4_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .runtime_data_bytes = sizeof (syn_filter4_runtime_t), + .n_errors = ARRAY_LEN(syn_filter4_error_strings), + .error_strings = syn_filter4_error_strings, + + .n_next_nodes = SYN_FILTER_N_NEXT, + + /* edit / add dispositions here */ + .next_nodes = { + [SYN_FILTER_NEXT_DROP] = "error-drop", + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (syn_filter4_node, syn_filter4_node_fn); + +/* *INDENT-OFF* */ +VNET_FEATURE_INIT (syn_filter_4, static) = +{ + .arc_name = "ip4-local", + .node_name = "syn-filter-4", + .runs_before = VNET_FEATURES("ip4-local-end-of-arc"), +}; +/* *INDENT-ON* */ + +int +syn_filter_enable_disable (u32 sw_if_index, int enable_disable) +{ + vnet_main_t *vnm = vnet_get_main (); + vnet_sw_interface_t *sw; + int rv = 0; + + /* Utterly wrong? */ + if (pool_is_free_index (vnm->interface_main.sw_interfaces, sw_if_index)) + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + + /* Not a physical port? */ + sw = vnet_get_sw_interface (vnm, sw_if_index); + if (sw->type != VNET_SW_INTERFACE_TYPE_HARDWARE) + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + + if (enable_disable) + { + vlib_main_t *vm = vlib_get_main (); + syn_filter4_runtime_t *rt; + + rt = vlib_node_get_runtime_data (vm, syn_filter4_node.index); + vec_validate (rt->syn_counts, 1023); + /* + * Given perfect disperson / optimal hashing results: + * Allow 128k (successful) syns/sec. 1024, buckets each of which + * absorb 128 syns before filtering. Reset table once a second. + * Reality bites, lets try resetting once every 100ms. + */ + rt->reset_interval = 0.1; /* reset interval in seconds */ + } + + rv = vnet_feature_enable_disable ("ip4-local", "syn-filter-4", + sw_if_index, enable_disable, 0, 0); + + return rv; +} + +static clib_error_t * +syn_filter_enable_disable_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vnet_main_t *vnm = vnet_get_main (); + u32 sw_if_index = ~0; + int enable_disable = 1; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "disable")) + enable_disable = 0; + else if (unformat (input, "%U", unformat_vnet_sw_interface, + vnm, &sw_if_index)) + ; + else + break; + } + + if (sw_if_index == ~0) + return clib_error_return (0, "Please specify an interface..."); + + rv = syn_filter_enable_disable (sw_if_index, enable_disable); + + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_INVALID_SW_IF_INDEX: + return clib_error_return + (0, "Invalid interface, only works on physical ports"); + break; + + case VNET_API_ERROR_UNIMPLEMENTED: + return clib_error_return (0, + "Device driver doesn't support redirection"); + break; + + case VNET_API_ERROR_INVALID_VALUE: + return clib_error_return (0, "feature arc not found"); + + case VNET_API_ERROR_INVALID_VALUE_2: + return clib_error_return (0, "feature node not found"); + + default: + return clib_error_return (0, "syn_filter_enable_disable returned %d", + rv); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (sr_content_command, static) = +{ + .path = "ip syn filter", + .short_help = "ip syn filter [disable]", + .function = syn_filter_enable_disable_command_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_timer.h b/src/vnet/tcp/tcp_timer.h new file mode 100644 index 00000000..fa25268c --- /dev/null +++ b/src/vnet/tcp/tcp_timer.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_tcp_timer_h__ +#define __included_tcp_timer_h__ + +#include +#include + +#endif /* __included_tcp_timer_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/builtin_server.c b/src/vnet/udp/builtin_server.c new file mode 100644 index 00000000..afa66ba4 --- /dev/null +++ b/src/vnet/udp/builtin_server.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** @file + udp builtin server +*/ + +#include +#include +#include + +/** per-worker built-in server copy buffers */ +u8 **copy_buffers; + +static int +builtin_session_create_callback (stream_session_t * s) +{ + /* Simple version: declare session ready-to-go... */ + s->session_state = SESSION_STATE_READY; + return 0; +} + +static void +builtin_session_disconnect_callback (stream_session_t * s) +{ + stream_session_disconnect (s); +} + +static int +builtin_server_rx_callback (stream_session_t * s) +{ + svm_fifo_t *rx_fifo, *tx_fifo; + u32 this_transfer; + int actual_transfer; + u8 *my_copy_buffer; + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + + my_copy_buffer = copy_buffers[s->thread_index]; + rx_fifo = s->server_rx_fifo; + tx_fifo = s->server_tx_fifo; + + this_transfer = svm_fifo_max_enqueue (tx_fifo) + < svm_fifo_max_dequeue (rx_fifo) ? + svm_fifo_max_enqueue (tx_fifo) : svm_fifo_max_dequeue (rx_fifo); + + vec_validate (my_copy_buffer, this_transfer - 1); + _vec_len (my_copy_buffer) = this_transfer; + + actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, 0, this_transfer, + my_copy_buffer); + ASSERT (actual_transfer == this_transfer); + actual_transfer = svm_fifo_enqueue_nowait (tx_fifo, 0, this_transfer, + my_copy_buffer); + + copy_buffers[s->thread_index] = my_copy_buffer; + + /* Fabricate TX event, send to ourselves */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + /* $$$$ for event logging */ + evt.enqueue_length = actual_transfer; + evt.event_id = 0; + q = session_manager_get_vpp_event_queue (s->thread_index); + unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* do wait for mutex */ ); + + return 0; +} + +/* *INDENT-OFF* */ +static session_cb_vft_t builtin_server = { + .session_accept_callback = builtin_session_create_callback, + .session_disconnect_callback = builtin_session_disconnect_callback, + .builtin_server_rx_callback = builtin_server_rx_callback +}; +/* *INDENT-ON* */ + +static int +bind_builtin_uri_server (u8 * uri) +{ + vnet_bind_args_t _a, *a = &_a; + char segment_name[128]; + u32 segment_name_length; + int rv; + u64 options[16]; + + segment_name_length = ARRAY_LEN (segment_name); + + memset (a, 0, sizeof (*a)); + memset (options, 0, sizeof (options)); + + a->uri = (char *) uri; + a->api_client_index = ~0; /* built-in server */ + a->segment_name = segment_name; + a->segment_name_length = segment_name_length; + a->session_cb_vft = &builtin_server; + + options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678; + options[SESSION_OPTIONS_SEGMENT_SIZE] = (2 << 30); /*$$$$ config / arg */ + a->options = options; + + rv = vnet_bind_uri (a); + + return rv; +} + +static int +unbind_builtin_uri_server (u8 * uri) +{ + int rv; + + rv = vnet_unbind_uri ((char *) uri, ~0 /* client_index */ ); + + return rv; +} + +static clib_error_t * +builtin_server_init (vlib_main_t * vm) +{ + vlib_thread_main_t *vtm = vlib_get_thread_main (); + u32 num_threads; + + num_threads = 1 /* main thread */ + vtm->n_threads; + + vec_validate (copy_buffers, num_threads - 1); + return 0; +} + +VLIB_INIT_FUNCTION (builtin_server_init); + +static clib_error_t * +builtin_uri_bind_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 *uri = 0; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "uri %s", &uri)) + ; + else + break; + } + + if (uri == 0) + return clib_error_return (0, "uri to bind not specified..."); + + rv = bind_builtin_uri_server (uri); + + vec_free (uri); + + switch (rv) + { + case 0: + break; + + default: + return clib_error_return (0, "bind_uri_server returned %d", rv); + break; + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (builtin_uri_bind_command, static) = +{ + .path = "builtin uri bind", + .short_help = "builtin uri bind", + .function = builtin_uri_bind_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +builtin_uri_unbind_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 *uri = 0; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "uri %s", &uri)) + ; + else + break; + } + + if (uri == 0) + return clib_error_return (0, "uri to unbind not specified..."); + + rv = unbind_builtin_uri_server (uri); + + vec_free (uri); + + switch (rv) + { + case 0: + break; + + default: + return clib_error_return (0, "unbind_uri_server returned %d", rv); + break; + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (builtin_uri_unbind_command, static) = +{ + .path = "builtin uri unbind", + .short_help = "builtin uri unbind", + .function = builtin_uri_unbind_command_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/udp.c b/src/vnet/udp/udp.c new file mode 100644 index 00000000..9e740466 --- /dev/null +++ b/src/vnet/udp/udp.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** @file + udp state machine, etc. +*/ + +#include +#include +#include +#include + +udp_uri_main_t udp_uri_main; + +u32 +udp_session_bind_ip4 (vlib_main_t * vm, u32 session_index, + ip46_address_t * ip, u16 port_number_host_byte_order) +{ + udp_uri_main_t *um = vnet_get_udp_main (); + udp_connection_t *listener; + + pool_get (um->udp_listeners, listener); + memset (listener, 0, sizeof (udp_connection_t)); + listener->c_lcl_port = clib_host_to_net_u16 (port_number_host_byte_order); + listener->c_lcl_ip4.as_u32 = ip->ip4.as_u32; + listener->c_proto = SESSION_TYPE_IP4_UDP; + udp_register_dst_port (um->vlib_main, port_number_host_byte_order, + udp4_uri_input_node.index, 1 /* is_ipv4 */ ); + return 0; +} + +u32 +udp_session_bind_ip6 (vlib_main_t * vm, u32 session_index, + ip46_address_t * ip, u16 port_number_host_byte_order) +{ + udp_uri_main_t *um = vnet_get_udp_main (); + udp_connection_t *listener; + + pool_get (um->udp_listeners, listener); + listener->c_lcl_port = clib_host_to_net_u16 (port_number_host_byte_order); + clib_memcpy (&listener->c_lcl_ip6, &ip->ip6, sizeof (ip6_address_t)); + listener->c_proto = SESSION_TYPE_IP6_UDP; + udp_register_dst_port (um->vlib_main, port_number_host_byte_order, + udp4_uri_input_node.index, 0 /* is_ipv4 */ ); + return 0; +} + +u32 +udp_session_unbind_ip4 (vlib_main_t * vm, u32 listener_index) +{ + udp_connection_t *listener; + listener = udp_listener_get (listener_index); + + /* deregister the udp_local mapping */ + udp_unregister_dst_port (vm, listener->c_lcl_port, 1 /* is_ipv4 */ ); + return 0; +} + +u32 +udp_session_unbind_ip6 (vlib_main_t * vm, u32 listener_index) +{ + udp_connection_t *listener; + + listener = udp_listener_get (listener_index); + + /* deregister the udp_local mapping */ + udp_unregister_dst_port (vm, listener->c_lcl_port, 0 /* is_ipv4 */ ); + return 0; +} + +transport_connection_t * +udp_session_get_listener (u32 listener_index) +{ + udp_connection_t *us; + + us = udp_listener_get (listener_index); + return &us->connection; +} + +u32 +udp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) +{ + udp_connection_t *us; + u8 *data; + udp_header_t *udp; + + us = (udp_connection_t *) tconn; + + if (tconn->is_ip4) + { + ip4_header_t *ip; + + data = vlib_buffer_get_current (b); + udp = (udp_header_t *) (data - sizeof (udp_header_t)); + ip = (ip4_header_t *) ((u8 *) udp - sizeof (ip4_header_t)); + + /* Build packet header, swap rx key src + dst fields */ + ip->src_address.as_u32 = us->c_lcl_ip4.as_u32; + ip->dst_address.as_u32 = us->c_rmt_ip4.as_u32; + ip->ip_version_and_header_length = 0x45; + ip->ttl = 254; + ip->protocol = IP_PROTOCOL_UDP; + ip->length = clib_host_to_net_u16 (b->current_length + sizeof (*udp)); + ip->checksum = ip4_header_checksum (ip); + + udp->src_port = us->c_lcl_port; + udp->dst_port = us->c_rmt_port; + udp->length = clib_host_to_net_u16 (b->current_length); + udp->checksum = 0; + + b->current_length = sizeof (*ip) + sizeof (*udp); + return SESSION_QUEUE_NEXT_IP4_LOOKUP; + } + else + { + vlib_main_t *vm = vlib_get_main (); + ip6_header_t *ip; + u16 payload_length; + int bogus = ~0; + + data = vlib_buffer_get_current (b); + udp = (udp_header_t *) (data - sizeof (udp_header_t)); + ip = (ip6_header_t *) ((u8 *) udp - sizeof (ip6_header_t)); + + /* Build packet header, swap rx key src + dst fields */ + clib_memcpy (&ip->src_address, &us->c_lcl_ip6, sizeof (ip6_address_t)); + clib_memcpy (&ip->dst_address, &us->c_rmt_ip6, sizeof (ip6_address_t)); + + ip->ip_version_traffic_class_and_flow_label = + clib_host_to_net_u32 (0x6 << 28); + + ip->hop_limit = 0xff; + ip->protocol = IP_PROTOCOL_UDP; + + payload_length = vlib_buffer_length_in_chain (vm, b); + payload_length -= sizeof (*ip); + + ip->payload_length = clib_host_to_net_u16 (payload_length); + + udp->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ip, &bogus); + ASSERT (!bogus); + + udp->src_port = us->c_lcl_port; + udp->dst_port = us->c_rmt_port; + udp->length = clib_host_to_net_u16 (b->current_length); + udp->checksum = 0; + + b->current_length = sizeof (*ip) + sizeof (*udp); + + return SESSION_QUEUE_NEXT_IP6_LOOKUP; + } +} + +transport_connection_t * +udp_session_get (u32 connection_index, u32 my_thread_index) +{ + udp_uri_main_t *um = vnet_get_udp_main (); + + udp_connection_t *us; + us = + pool_elt_at_index (um->udp_sessions[my_thread_index], connection_index); + return &us->connection; +} + +void +udp_session_close (u32 connection_index, u32 my_thread_index) +{ + udp_uri_main_t *um = vnet_get_udp_main (); + pool_put_index (um->udp_sessions[my_thread_index], connection_index); +} + +u8 * +format_udp_session_ip4 (u8 * s, va_list * args) +{ + u32 uci = va_arg (*args, u32); + u32 thread_index = va_arg (*args, u32); + udp_connection_t *u4; + + u4 = udp_connection_get (uci, thread_index); + + s = format (s, "[%s] %U:%d->%U:%d", "udp", format_ip4_address, + &u4->c_lcl_ip4, clib_net_to_host_u16 (u4->c_lcl_port), + format_ip4_address, &u4->c_rmt_ip4, + clib_net_to_host_u16 (u4->c_rmt_port)); + return s; +} + +u8 * +format_udp_session_ip6 (u8 * s, va_list * args) +{ + u32 uci = va_arg (*args, u32); + u32 thread_index = va_arg (*args, u32); + udp_connection_t *tc = udp_connection_get (uci, thread_index); + s = format (s, "[%s] %U:%d->%U:%d", "udp", format_ip6_address, + &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip6_address, &tc->c_rmt_ip6, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u8 * +format_udp_listener_session_ip4 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + udp_connection_t *tc = udp_listener_get (tci); + s = format (s, "[%s] %U:%d->%U:%d", "udp", format_ip4_address, + &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip4_address, &tc->c_rmt_ip4, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u8 * +format_udp_listener_session_ip6 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + udp_connection_t *tc = udp_listener_get (tci); + s = format (s, "[%s] %U:%d->%U:%d", "udp", format_ip6_address, + &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip6_address, &tc->c_rmt_ip6, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u16 +udp_send_mss_uri (transport_connection_t * t) +{ + /* TODO figure out MTU of output interface */ + return 400; +} + +u32 +udp_send_space_uri (transport_connection_t * t) +{ + /* No constraint on TX window */ + return ~0; +} + +int +udp_open_connection (ip46_address_t * addr, u16 port) +{ + clib_warning ("Not implemented"); + return 0; +} + +/* *INDENT-OFF* */ +const static transport_proto_vft_t udp4_proto = { + .bind = udp_session_bind_ip4, + .open = udp_open_connection, + .unbind = udp_session_unbind_ip4, + .push_header = udp_push_header, + .get_connection = udp_session_get, + .get_listener = udp_session_get_listener, + .close = udp_session_close, + .send_mss = udp_send_mss_uri, + .send_space = udp_send_space_uri, + .format_connection = format_udp_session_ip4, + .format_listener = format_udp_listener_session_ip4 +}; + +const static transport_proto_vft_t udp6_proto = { + .bind = udp_session_bind_ip6, + .open = udp_open_connection, + .unbind = udp_session_unbind_ip6, + .push_header = udp_push_header, + .get_connection = udp_session_get, + .get_listener = udp_session_get_listener, + .close = udp_session_close, + .send_mss = udp_send_mss_uri, + .send_space = udp_send_space_uri, + .format_connection = format_udp_session_ip6, + .format_listener = format_udp_listener_session_ip6 +}; +/* *INDENT-ON* */ + +static clib_error_t * +udp_init (vlib_main_t * vm) +{ + udp_uri_main_t *um = vnet_get_udp_main (); + ip_main_t *im = &ip_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + u32 num_threads; + clib_error_t *error = 0; + ip_protocol_info_t *pi; + + um->vlib_main = vm; + um->vnet_main = vnet_get_main (); + + if ((error = vlib_call_init_function (vm, ip_main_init))) + return error; + if ((error = vlib_call_init_function (vm, ip4_lookup_init))) + return error; + if ((error = vlib_call_init_function (vm, ip6_lookup_init))) + return error; + + /* + * Registrations + */ + + /* IP registration */ + pi = ip_get_protocol_info (im, IP_PROTOCOL_UDP); + if (pi == 0) + return clib_error_return (0, "UDP protocol info AWOL"); + pi->format_header = format_udp_header; + pi->unformat_pg_edit = unformat_pg_udp_header; + + + /* Register as transport with URI */ + session_register_transport (SESSION_TYPE_IP4_UDP, &udp4_proto); + session_register_transport (SESSION_TYPE_IP6_UDP, &udp6_proto); + + /* + * Initialize data structures + */ + + num_threads = 1 /* main thread */ + tm->n_threads; + vec_validate (um->udp_sessions, num_threads - 1); + + return error; +} + +VLIB_INIT_FUNCTION (udp_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/udp.h b/src/vnet/udp/udp.h new file mode 100644 index 00000000..7ab26ce9 --- /dev/null +++ b/src/vnet/udp/udp.h @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_udp_h__ +#define __included_udp_h__ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +typedef struct +{ + transport_connection_t connection; /** must be first */ + + /** ersatz MTU to limit fifo pushes to test data size */ + u32 mtu; +} udp_connection_t; + +typedef struct _udp_uri_main +{ + /* Per-worker thread udp connection pools */ + udp_connection_t **udp_sessions; + udp_connection_t *udp_listeners; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + ip4_main_t *ip4_main; + ip6_main_t *ip6_main; +} udp_uri_main_t; + +extern udp_uri_main_t udp_uri_main; +extern vlib_node_registration_t udp4_uri_input_node; + +always_inline udp_uri_main_t * +vnet_get_udp_main () +{ + return &udp_uri_main; +} + +always_inline udp_connection_t * +udp_connection_get (u32 conn_index, u32 thread_index) +{ + return pool_elt_at_index (udp_uri_main.udp_sessions[thread_index], + conn_index); +} + +always_inline udp_connection_t * +udp_listener_get (u32 conn_index) +{ + return pool_elt_at_index (udp_uri_main.udp_listeners, conn_index); +} + +typedef enum +{ +#define udp_error(n,s) UDP_ERROR_##n, +#include +#undef udp_error + UDP_N_ERROR, +} udp_error_t; + +#define foreach_udp4_dst_port \ +_ (67, dhcp_to_server) \ +_ (68, dhcp_to_client) \ +_ (500, ikev2) \ +_ (3784, bfd4) \ +_ (3785, bfd_echo4) \ +_ (4341, lisp_gpe) \ +_ (4342, lisp_cp) \ +_ (4739, ipfix) \ +_ (4789, vxlan) \ +_ (4789, vxlan6) \ +_ (4790, vxlan_gpe) \ +_ (6633, vpath_3) + + +#define foreach_udp6_dst_port \ +_ (547, dhcpv6_to_server) \ +_ (546, dhcpv6_to_client) \ +_ (3784, bfd6) \ +_ (3785, bfd_echo6) \ +_ (4341, lisp_gpe6) \ +_ (4342, lisp_cp6) \ +_ (4790, vxlan6_gpe) \ +_ (6633, vpath6_3) + +typedef enum +{ +#define _(n,f) UDP_DST_PORT_##f = n, + foreach_udp4_dst_port foreach_udp6_dst_port +#undef _ +} udp_dst_port_t; + +typedef enum +{ +#define _(n,f) UDP6_DST_PORT_##f = n, + foreach_udp6_dst_port +#undef _ +} udp6_dst_port_t; + +typedef struct +{ + /* Name (a c string). */ + char *name; + + /* GRE protocol type in host byte order. */ + udp_dst_port_t dst_port; + + /* Node which handles this type. */ + u32 node_index; + + /* Next index for this type. */ + u32 next_index; +} udp_dst_port_info_t; + +typedef enum +{ + UDP_IP6 = 0, + UDP_IP4, /* the code is full of is_ip4... */ + N_UDP_AF, +} udp_af_t; + +typedef struct +{ + udp_dst_port_info_t *dst_port_infos[N_UDP_AF]; + + /* Hash tables mapping name/protocol to protocol info index. */ + uword *dst_port_info_by_name[N_UDP_AF]; + uword *dst_port_info_by_dst_port[N_UDP_AF]; + + /* convenience */ + vlib_main_t *vlib_main; +} udp_main_t; + +always_inline udp_dst_port_info_t * +udp_get_dst_port_info (udp_main_t * um, udp_dst_port_t dst_port, u8 is_ip4) +{ + uword *p = hash_get (um->dst_port_info_by_dst_port[is_ip4], dst_port); + return p ? vec_elt_at_index (um->dst_port_infos[is_ip4], p[0]) : 0; +} + +format_function_t format_udp_header; +format_function_t format_udp_rx_trace; + +unformat_function_t unformat_udp_header; + +void udp_register_dst_port (vlib_main_t * vm, + udp_dst_port_t dst_port, + u32 node_index, u8 is_ip4); + +void +udp_unregister_dst_port (vlib_main_t * vm, + udp_dst_port_t dst_port, u8 is_ip4); + +void udp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add); + +always_inline void +ip_udp_fixup_one (vlib_main_t * vm, vlib_buffer_t * b0, u8 is_ip4) +{ + u16 new_l0; + udp_header_t *udp0; + + if (is_ip4) + { + ip4_header_t *ip0; + ip_csum_t sum0; + u16 old_l0 = 0; + + ip0 = vlib_buffer_get_current (b0); + + /* fix the ing outer-IP checksum */ + sum0 = ip0->checksum; + /* old_l0 always 0, see the rewrite setup */ + new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)); + + sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t, + length /* changed member */ ); + ip0->checksum = ip_csum_fold (sum0); + ip0->length = new_l0; + + /* Fix UDP length */ + udp0 = (udp_header_t *) (ip0 + 1); + new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) + - sizeof (*ip0)); + udp0->length = new_l0; + } + else + { + ip6_header_t *ip0; + int bogus0; + + ip0 = vlib_buffer_get_current (b0); + + new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) + - sizeof (*ip0)); + ip0->payload_length = new_l0; + + /* Fix UDP length */ + udp0 = (udp_header_t *) (ip0 + 1); + udp0->length = new_l0; + + udp0->checksum = + ip6_tcp_udp_icmp_compute_checksum (vm, b0, ip0, &bogus0); + ASSERT (bogus0 == 0); + + if (udp0->checksum == 0) + udp0->checksum = 0xffff; + } +} + +always_inline void +ip_udp_encap_one (vlib_main_t * vm, vlib_buffer_t * b0, u8 * ec0, word ec_len, + u8 is_ip4) +{ + vlib_buffer_advance (b0, -ec_len); + + if (is_ip4) + { + ip4_header_t *ip0; + + ip0 = vlib_buffer_get_current (b0); + + /* Apply the encap string. */ + clib_memcpy (ip0, ec0, ec_len); + ip_udp_fixup_one (vm, b0, 1); + } + else + { + ip6_header_t *ip0; + + ip0 = vlib_buffer_get_current (b0); + + /* Apply the encap string. */ + clib_memcpy (ip0, ec0, ec_len); + ip_udp_fixup_one (vm, b0, 0); + } +} + +always_inline void +ip_udp_encap_two (vlib_main_t * vm, vlib_buffer_t * b0, vlib_buffer_t * b1, + u8 * ec0, u8 * ec1, word ec_len, u8 is_v4) +{ + u16 new_l0, new_l1; + udp_header_t *udp0, *udp1; + + ASSERT (_vec_len (ec0) == _vec_len (ec1)); + + vlib_buffer_advance (b0, -ec_len); + vlib_buffer_advance (b1, -ec_len); + + if (is_v4) + { + ip4_header_t *ip0, *ip1; + ip_csum_t sum0, sum1; + u16 old_l0 = 0, old_l1 = 0; + + ip0 = vlib_buffer_get_current (b0); + ip1 = vlib_buffer_get_current (b1); + + /* Apply the encap string */ + clib_memcpy (ip0, ec0, ec_len); + clib_memcpy (ip1, ec1, ec_len); + + /* fix the ing outer-IP checksum */ + sum0 = ip0->checksum; + sum1 = ip1->checksum; + + /* old_l0 always 0, see the rewrite setup */ + new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)); + new_l1 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1)); + + sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t, + length /* changed member */ ); + sum1 = ip_csum_update (sum1, old_l1, new_l1, ip4_header_t, + length /* changed member */ ); + + ip0->checksum = ip_csum_fold (sum0); + ip1->checksum = ip_csum_fold (sum1); + + ip0->length = new_l0; + ip1->length = new_l1; + + /* Fix UDP length */ + udp0 = (udp_header_t *) (ip0 + 1); + udp1 = (udp_header_t *) (ip1 + 1); + + new_l0 = + clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) - + sizeof (*ip0)); + new_l1 = + clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1) - + sizeof (*ip1)); + udp0->length = new_l0; + udp1->length = new_l1; + } + else + { + ip6_header_t *ip0, *ip1; + int bogus0, bogus1; + + ip0 = vlib_buffer_get_current (b0); + ip1 = vlib_buffer_get_current (b1); + + /* Apply the encap string. */ + clib_memcpy (ip0, ec0, ec_len); + clib_memcpy (ip1, ec1, ec_len); + + new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) + - sizeof (*ip0)); + new_l1 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1) + - sizeof (*ip1)); + ip0->payload_length = new_l0; + ip1->payload_length = new_l1; + + /* Fix UDP length */ + udp0 = (udp_header_t *) (ip0 + 1); + udp1 = (udp_header_t *) (ip1 + 1); + + udp0->length = new_l0; + udp1->length = new_l1; + + udp0->checksum = + ip6_tcp_udp_icmp_compute_checksum (vm, b0, ip0, &bogus0); + udp1->checksum = + ip6_tcp_udp_icmp_compute_checksum (vm, b1, ip1, &bogus1); + ASSERT (bogus0 == 0); + ASSERT (bogus1 == 0); + + if (udp0->checksum == 0) + udp0->checksum = 0xffff; + if (udp1->checksum == 0) + udp1->checksum = 0xffff; + } +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ + +#endif /* __included_udp_h__ */ diff --git a/src/vnet/udp/udp_error.def b/src/vnet/udp/udp_error.def new file mode 100644 index 00000000..bfdae0ac --- /dev/null +++ b/src/vnet/udp/udp_error.def @@ -0,0 +1,21 @@ +/* + * udp_error.def: udp errors + * + * Copyright (c) 2013-2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +udp_error (NONE, "no error") +udp_error (NO_LISTENER, "no listener for dst port") +udp_error (LENGTH_ERROR, "UDP packets with length errors") +udp_error (PUNT, "no listener punt") diff --git a/src/vnet/udp/udp_format.c b/src/vnet/udp/udp_format.c new file mode 100644 index 00000000..abdf561e --- /dev/null +++ b/src/vnet/udp/udp_format.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/udp_format.c: udp formatting + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +/* Format UDP header. */ +u8 * +format_udp_header (u8 * s, va_list * args) +{ + udp_header_t *udp = va_arg (*args, udp_header_t *); + u32 max_header_bytes = va_arg (*args, u32); + uword indent; + u32 header_bytes = sizeof (udp[0]); + + /* Nothing to do. */ + if (max_header_bytes < sizeof (udp[0])) + return format (s, "UDP header truncated"); + + indent = format_get_indent (s); + indent += 2; + + s = format (s, "UDP: %d -> %d", + clib_net_to_host_u16 (udp->src_port), + clib_net_to_host_u16 (udp->dst_port)); + + s = format (s, "\n%Ulength %d, checksum 0x%04x", + format_white_space, indent, + clib_net_to_host_u16 (udp->length), + clib_net_to_host_u16 (udp->checksum)); + + /* Recurse into next protocol layer. */ + if (max_header_bytes != 0 && header_bytes < max_header_bytes) + { + ip_main_t *im = &ip_main; + tcp_udp_port_info_t *pi; + + pi = ip_get_tcp_udp_port_info (im, udp->dst_port); + + if (pi && pi->format_header) + s = format (s, "\n%U%U", + format_white_space, indent - 2, pi->format_header, + /* next protocol header */ (udp + 1), + max_header_bytes - sizeof (udp[0])); + } + + return s; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c new file mode 100644 index 00000000..4d509335 --- /dev/null +++ b/src/vnet/udp/udp_input.c @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include "../session/application_interface.h" + +vlib_node_registration_t udp4_uri_input_node; + +typedef struct +{ + u32 session; + u32 disposition; + u32 thread_index; +} udp4_uri_input_trace_t; + +/* packet trace format function */ +static u8 * +format_udp4_uri_input_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + udp4_uri_input_trace_t *t = va_arg (*args, udp4_uri_input_trace_t *); + + s = format (s, "UDP4_URI_INPUT: session %d, disposition %d, thread %d", + t->session, t->disposition, t->thread_index); + return s; +} + +typedef enum +{ + UDP4_URI_INPUT_NEXT_DROP, + UDP4_URI_INPUT_N_NEXT, +} udp4_uri_input_next_t; + +static char *udp4_uri_input_error_strings[] = { +#define _(sym,string) string, + foreach_session_input_error +#undef _ +}; + +static uword +udp4_uri_input_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + u32 n_left_from, *from, *to_next; + udp4_uri_input_next_t next_index; + udp_uri_main_t *um = vnet_get_udp_main (); + session_manager_main_t *smm = vnet_get_session_manager_main (); + u32 my_thread_index = vm->cpu_index; + u8 my_enqueue_epoch; + u32 *session_indices_to_enqueue; + static u32 serial_number; + int i; + + my_enqueue_epoch = ++smm->current_enqueue_epoch[my_thread_index]; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0 = UDP4_URI_INPUT_NEXT_DROP; + u32 error0 = SESSION_ERROR_ENQUEUED; + udp_header_t *udp0; + ip4_header_t *ip0; + stream_session_t *s0; + svm_fifo_t *f0; + u16 udp_len0; + u8 *data0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + /* udp_local hands us a pointer to the udp data */ + + data0 = vlib_buffer_get_current (b0); + udp0 = (udp_header_t *) (data0 - sizeof (*udp0)); + + /* $$$$ fixme: udp_local doesn't do ip options correctly anyhow */ + ip0 = (ip4_header_t *) (((u8 *) udp0) - sizeof (*ip0)); + s0 = 0; + + /* lookup session */ + s0 = stream_session_lookup4 (&ip0->dst_address, &ip0->src_address, + udp0->dst_port, udp0->src_port, + SESSION_TYPE_IP4_UDP, my_thread_index); + + /* no listener */ + if (PREDICT_FALSE (s0 == 0)) + { + error0 = SESSION_ERROR_NO_LISTENER; + goto trace0; + } + + f0 = s0->server_rx_fifo; + + /* established hit */ + if (PREDICT_TRUE (s0->session_state == SESSION_STATE_READY)) + { + udp_len0 = clib_net_to_host_u16 (udp0->length); + + if (PREDICT_FALSE (udp_len0 > svm_fifo_max_enqueue (f0))) + { + error0 = SESSION_ERROR_FIFO_FULL; + goto trace0; + } + + svm_fifo_enqueue_nowait (f0, 0 /* pid */ , + udp_len0 - sizeof (*udp0), + (u8 *) (udp0 + 1)); + + b0->error = node->errors[SESSION_ERROR_ENQUEUED]; + + /* We need to send an RX event on this fifo */ + if (s0->enqueue_epoch != my_enqueue_epoch) + { + s0->enqueue_epoch = my_enqueue_epoch; + + vec_add1 (smm->session_indices_to_enqueue_by_thread + [my_thread_index], + s0 - smm->sessions[my_thread_index]); + } + } + /* listener hit */ + else if (s0->session_state == SESSION_STATE_LISTENING) + { + udp_connection_t *us; + int rv; + + error0 = SESSION_ERROR_NOT_READY; + + /* + * create udp transport session + */ + pool_get (um->udp_sessions[my_thread_index], us); + + us->mtu = 1024; /* $$$$ policy */ + + us->c_lcl_ip4.as_u32 = ip0->dst_address.as_u32; + us->c_rmt_ip4.as_u32 = ip0->src_address.as_u32; + us->c_lcl_port = udp0->dst_port; + us->c_rmt_port = udp0->src_port; + us->c_proto = SESSION_TYPE_IP4_UDP; + us->c_c_index = us - um->udp_sessions[my_thread_index]; + + /* + * create stream session and attach the udp session to it + */ + rv = stream_session_accept (&us->connection, s0->session_index, + SESSION_TYPE_IP4_UDP, + 1 /*notify */ ); + if (rv) + error0 = rv; + + } + else + { + + error0 = SESSION_ERROR_NOT_READY; + goto trace0; + } + + trace0: + b0->error = node->errors[error0]; + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + udp4_uri_input_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + + t->session = ~0; + if (s0) + t->session = s0 - smm->sessions[my_thread_index]; + t->disposition = error0; + t->thread_index = my_thread_index; + } + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + /* Send enqueue events */ + + session_indices_to_enqueue = + smm->session_indices_to_enqueue_by_thread[my_thread_index]; + + for (i = 0; i < vec_len (session_indices_to_enqueue); i++) + { + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + stream_session_t *s0; + application_t *server0; + + /* Get session */ + s0 = pool_elt_at_index (smm->sessions[my_thread_index], + session_indices_to_enqueue[i]); + + /* Get session's server */ + server0 = application_get (s0->app_index); + + /* Built-in server? Deliver the goods... */ + if (server0->cb_fns.builtin_server_rx_callback) + { + server0->cb_fns.builtin_server_rx_callback (s0); + continue; + } + + /* Fabricate event */ + evt.fifo = s0->server_rx_fifo; + evt.event_type = FIFO_EVENT_SERVER_RX; + evt.event_id = serial_number++; + evt.enqueue_length = svm_fifo_max_dequeue (s0->server_rx_fifo); + + /* Add event to server's event queue */ + q = server0->event_queue; + + /* Don't block for lack of space */ + if (PREDICT_TRUE (q->cursize < q->maxsize)) + unix_shared_memory_queue_add (server0->event_queue, (u8 *) & evt, + 0 /* do wait for mutex */ ); + else + { + vlib_node_increment_counter (vm, udp4_uri_input_node.index, + SESSION_ERROR_FIFO_FULL, 1); + } + if (1) + { + ELOG_TYPE_DECLARE (e) = + { + .format = "evt-enqueue: id %d length %d",.format_args = "i4i4",}; + struct + { + u32 data[2]; + } *ed; + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->data[0] = evt.event_id; + ed->data[1] = evt.enqueue_length; + } + } + + vec_reset_length (session_indices_to_enqueue); + + smm->session_indices_to_enqueue_by_thread[my_thread_index] = + session_indices_to_enqueue; + + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (udp4_uri_input_node) = +{ + .function = udp4_uri_input_node_fn,.name = "udp4-uri-input",.vector_size = + sizeof (u32),.format_trace = format_udp4_uri_input_trace,.type = + VLIB_NODE_TYPE_INTERNAL,.n_errors = + ARRAY_LEN (udp4_uri_input_error_strings),.error_strings = + udp4_uri_input_error_strings,.n_next_nodes = UDP4_URI_INPUT_N_NEXT, + /* edit / add dispositions here */ + .next_nodes = + { + [UDP4_URI_INPUT_NEXT_DROP] = "error-drop",} +,}; + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/udp_local.c b/src/vnet/udp/udp_local.c new file mode 100644 index 00000000..6b239f73 --- /dev/null +++ b/src/vnet/udp/udp_local.c @@ -0,0 +1,666 @@ +/* + * node.c: udp packet processing + * + * Copyright (c) 2013 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +udp_main_t udp_main; + +#define foreach_udp_input_next \ + _ (PUNT, "error-punt") \ + _ (DROP, "error-drop") \ + _ (ICMP4_ERROR, "ip4-icmp-error") \ + _ (ICMP6_ERROR, "ip6-icmp-error") + +typedef enum +{ +#define _(s,n) UDP_INPUT_NEXT_##s, + foreach_udp_input_next +#undef _ + UDP_INPUT_N_NEXT, +} udp_input_next_t; + +typedef struct +{ + u16 src_port; + u16 dst_port; + u8 bound; +} udp_rx_trace_t; + +u8 * +format_udp_rx_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + udp_rx_trace_t *t = va_arg (*args, udp_rx_trace_t *); + + s = format (s, "UDP: src-port %d dst-port %d%s", + clib_net_to_host_u16 (t->src_port), + clib_net_to_host_u16 (t->dst_port), + t->bound ? "" : " (no listener)"); + return s; +} + +typedef struct +{ + /* Sparse vector mapping udp dst_port in network byte order + to next index. */ + u16 *next_by_dst_port; + u8 punt_unknown; +} udp_input_runtime_t; + +vlib_node_registration_t udp4_input_node; +vlib_node_registration_t udp6_input_node; + +always_inline uword +udp46_input_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + udp_input_runtime_t *rt = is_ip4 ? + (void *) vlib_node_get_runtime_data (vm, udp4_input_node.index) + : (void *) vlib_node_get_runtime_data (vm, udp6_input_node.index); + __attribute__ ((unused)) u32 n_left_from, next_index, *from, *to_next; + word n_no_listener = 0; + u8 punt_unknown = rt->punt_unknown; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + udp_header_t *h0 = 0, *h1 = 0; + u32 i0, i1, dst_port0, dst_port1; + u32 advance0, advance1; + u32 error0, next0, error1, next1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, sizeof (h0[0]), LOAD); + CLIB_PREFETCH (p3->data, sizeof (h1[0]), LOAD); + } + + bi0 = from[0]; + bi1 = from[1]; + to_next[0] = bi0; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_to_next -= 2; + n_left_from -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + /* ip4/6_local hands us the ip header, not the udp header */ + if (is_ip4) + { + advance0 = sizeof (ip4_header_t); + advance1 = sizeof (ip4_header_t); + } + else + { + advance0 = sizeof (ip6_header_t); + advance1 = sizeof (ip6_header_t); + } + + if (PREDICT_FALSE (b0->current_length < advance0 + sizeof (*h0))) + { + error0 = UDP_ERROR_LENGTH_ERROR; + next0 = UDP_INPUT_NEXT_DROP; + } + else + { + vlib_buffer_advance (b0, advance0); + h0 = vlib_buffer_get_current (b0); + error0 = next0 = 0; + if (PREDICT_FALSE (clib_net_to_host_u16 (h0->length) > + vlib_buffer_length_in_chain (vm, b0))) + { + error0 = UDP_ERROR_LENGTH_ERROR; + next0 = UDP_INPUT_NEXT_DROP; + } + } + + if (PREDICT_FALSE (b1->current_length < advance1 + sizeof (*h1))) + { + error1 = UDP_ERROR_LENGTH_ERROR; + next1 = UDP_INPUT_NEXT_DROP; + } + else + { + vlib_buffer_advance (b1, advance1); + h1 = vlib_buffer_get_current (b1); + error1 = next1 = 0; + if (PREDICT_FALSE (clib_net_to_host_u16 (h1->length) > + vlib_buffer_length_in_chain (vm, b1))) + { + error1 = UDP_ERROR_LENGTH_ERROR; + next1 = UDP_INPUT_NEXT_DROP; + } + } + + /* Index sparse array with network byte order. */ + dst_port0 = (error0 == 0) ? h0->dst_port : 0; + dst_port1 = (error1 == 0) ? h1->dst_port : 0; + sparse_vec_index2 (rt->next_by_dst_port, dst_port0, dst_port1, + &i0, &i1); + next0 = (error0 == 0) ? vec_elt (rt->next_by_dst_port, i0) : next0; + next1 = (error1 == 0) ? vec_elt (rt->next_by_dst_port, i1) : next1; + + if (PREDICT_FALSE (i0 == SPARSE_VEC_INVALID_INDEX)) + { + // move the pointer back so icmp-error can find the + // ip packet header + vlib_buffer_advance (b0, -(word) advance0); + + if (PREDICT_FALSE (punt_unknown)) + { + b0->error = node->errors[UDP_ERROR_PUNT]; + next0 = UDP_INPUT_NEXT_PUNT; + } + else if (is_ip4) + { + icmp4_error_set_vnet_buffer (b0, + ICMP4_destination_unreachable, + ICMP4_destination_unreachable_port_unreachable, + 0); + next0 = UDP_INPUT_NEXT_ICMP4_ERROR; + n_no_listener++; + } + else + { + icmp6_error_set_vnet_buffer (b0, + ICMP6_destination_unreachable, + ICMP6_destination_unreachable_port_unreachable, + 0); + next0 = UDP_INPUT_NEXT_ICMP6_ERROR; + n_no_listener++; + } + } + else + { + b0->error = node->errors[UDP_ERROR_NONE]; + // advance to the payload + vlib_buffer_advance (b0, sizeof (*h0)); + } + + if (PREDICT_FALSE (i1 == SPARSE_VEC_INVALID_INDEX)) + { + // move the pointer back so icmp-error can find the + // ip packet header + vlib_buffer_advance (b1, -(word) advance1); + + if (PREDICT_FALSE (punt_unknown)) + { + b1->error = node->errors[UDP_ERROR_PUNT]; + next1 = UDP_INPUT_NEXT_PUNT; + } + else if (is_ip4) + { + icmp4_error_set_vnet_buffer (b1, + ICMP4_destination_unreachable, + ICMP4_destination_unreachable_port_unreachable, + 0); + next1 = UDP_INPUT_NEXT_ICMP4_ERROR; + n_no_listener++; + } + else + { + icmp6_error_set_vnet_buffer (b1, + ICMP6_destination_unreachable, + ICMP6_destination_unreachable_port_unreachable, + 0); + next1 = UDP_INPUT_NEXT_ICMP6_ERROR; + n_no_listener++; + } + } + else + { + b1->error = node->errors[UDP_ERROR_NONE]; + // advance to the payload + vlib_buffer_advance (b1, sizeof (*h1)); + } + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + udp_rx_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + if (b0->error != node->errors[UDP_ERROR_LENGTH_ERROR]) + { + tr->src_port = h0 ? h0->src_port : 0; + tr->dst_port = h0 ? h0->dst_port : 0; + tr->bound = (next0 != UDP_INPUT_NEXT_ICMP4_ERROR && + next0 != UDP_INPUT_NEXT_ICMP6_ERROR); + } + } + if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED)) + { + udp_rx_trace_t *tr = vlib_add_trace (vm, node, + b1, sizeof (*tr)); + if (b1->error != node->errors[UDP_ERROR_LENGTH_ERROR]) + { + tr->src_port = h1 ? h1->src_port : 0; + tr->dst_port = h1 ? h1->dst_port : 0; + tr->bound = (next1 != UDP_INPUT_NEXT_ICMP4_ERROR && + next1 != UDP_INPUT_NEXT_ICMP6_ERROR); + } + } + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + udp_header_t *h0 = 0; + u32 i0, next0; + u32 advance0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + /* ip4/6_local hands us the ip header, not the udp header */ + if (is_ip4) + advance0 = sizeof (ip4_header_t); + else + advance0 = sizeof (ip6_header_t); + + if (PREDICT_FALSE (b0->current_length < advance0 + sizeof (*h0))) + { + b0->error = node->errors[UDP_ERROR_LENGTH_ERROR]; + next0 = UDP_INPUT_NEXT_DROP; + goto trace_x1; + } + + vlib_buffer_advance (b0, advance0); + + h0 = vlib_buffer_get_current (b0); + + if (PREDICT_TRUE (clib_net_to_host_u16 (h0->length) <= + vlib_buffer_length_in_chain (vm, b0))) + { + i0 = sparse_vec_index (rt->next_by_dst_port, h0->dst_port); + next0 = vec_elt (rt->next_by_dst_port, i0); + + if (PREDICT_FALSE (i0 == SPARSE_VEC_INVALID_INDEX)) + { + // move the pointer back so icmp-error can find the + // ip packet header + vlib_buffer_advance (b0, -(word) advance0); + + if (PREDICT_FALSE (punt_unknown)) + { + b0->error = node->errors[UDP_ERROR_PUNT]; + next0 = UDP_INPUT_NEXT_PUNT; + } + else if (is_ip4) + { + icmp4_error_set_vnet_buffer (b0, + ICMP4_destination_unreachable, + ICMP4_destination_unreachable_port_unreachable, + 0); + next0 = UDP_INPUT_NEXT_ICMP4_ERROR; + n_no_listener++; + } + else + { + icmp6_error_set_vnet_buffer (b0, + ICMP6_destination_unreachable, + ICMP6_destination_unreachable_port_unreachable, + 0); + next0 = UDP_INPUT_NEXT_ICMP6_ERROR; + n_no_listener++; + } + } + else + { + b0->error = node->errors[UDP_ERROR_NONE]; + // advance to the payload + vlib_buffer_advance (b0, sizeof (*h0)); + } + } + else + { + b0->error = node->errors[UDP_ERROR_LENGTH_ERROR]; + next0 = UDP_INPUT_NEXT_DROP; + } + + trace_x1: + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + udp_rx_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + if (b0->error != node->errors[UDP_ERROR_LENGTH_ERROR]) + { + tr->src_port = h0->src_port; + tr->dst_port = h0->dst_port; + tr->bound = (next0 != UDP_INPUT_NEXT_ICMP4_ERROR && + next0 != UDP_INPUT_NEXT_ICMP6_ERROR); + } + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + vlib_error_count (vm, node->node_index, UDP_ERROR_NO_LISTENER, + n_no_listener); + return from_frame->n_vectors; +} + +static char *udp_error_strings[] = { +#define udp_error(n,s) s, +#include "udp_error.def" +#undef udp_error +}; + +static uword +udp4_input (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * from_frame) +{ + return udp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +udp6_input (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * from_frame) +{ + return udp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (udp4_input_node) = { + .function = udp4_input, + .name = "ip4-udp-lookup", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + + .runtime_data_bytes = sizeof (udp_input_runtime_t), + + .n_errors = UDP_N_ERROR, + .error_strings = udp_error_strings, + + .n_next_nodes = UDP_INPUT_N_NEXT, + .next_nodes = { +#define _(s,n) [UDP_INPUT_NEXT_##s] = n, + foreach_udp_input_next +#undef _ + }, + + .format_buffer = format_udp_header, + .format_trace = format_udp_rx_trace, + .unformat_buffer = unformat_udp_header, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (udp4_input_node, udp4_input); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (udp6_input_node) = { + .function = udp6_input, + .name = "ip6-udp-lookup", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + + .runtime_data_bytes = sizeof (udp_input_runtime_t), + + .n_errors = UDP_N_ERROR, + .error_strings = udp_error_strings, + + .n_next_nodes = UDP_INPUT_N_NEXT, + .next_nodes = { +#define _(s,n) [UDP_INPUT_NEXT_##s] = n, + foreach_udp_input_next +#undef _ + }, + + .format_buffer = format_udp_header, + .format_trace = format_udp_rx_trace, + .unformat_buffer = unformat_udp_header, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (udp6_input_node, udp6_input); + +static void +add_dst_port (udp_main_t * um, + udp_dst_port_t dst_port, char *dst_port_name, u8 is_ip4) +{ + udp_dst_port_info_t *pi; + u32 i; + + vec_add2 (um->dst_port_infos[is_ip4], pi, 1); + i = pi - um->dst_port_infos[is_ip4]; + + pi->name = dst_port_name; + pi->dst_port = dst_port; + pi->next_index = pi->node_index = ~0; + + hash_set (um->dst_port_info_by_dst_port[is_ip4], dst_port, i); + + if (pi->name) + hash_set_mem (um->dst_port_info_by_name[is_ip4], pi->name, i); +} + +void +udp_register_dst_port (vlib_main_t * vm, + udp_dst_port_t dst_port, u32 node_index, u8 is_ip4) +{ + udp_main_t *um = &udp_main; + udp_dst_port_info_t *pi; + udp_input_runtime_t *rt; + u16 *n; + + { + clib_error_t *error = vlib_call_init_function (vm, udp_local_init); + if (error) + clib_error_report (error); + } + + pi = udp_get_dst_port_info (um, dst_port, is_ip4); + if (!pi) + { + add_dst_port (um, dst_port, 0, is_ip4); + pi = udp_get_dst_port_info (um, dst_port, is_ip4); + ASSERT (pi); + } + + pi->node_index = node_index; + pi->next_index = vlib_node_add_next (vm, + is_ip4 ? udp4_input_node.index + : udp6_input_node.index, node_index); + + /* Setup udp protocol -> next index sparse vector mapping. */ + rt = vlib_node_get_runtime_data + (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index); + n = sparse_vec_validate (rt->next_by_dst_port, + clib_host_to_net_u16 (dst_port)); + n[0] = pi->next_index; +} + +void +udp_unregister_dst_port (vlib_main_t * vm, udp_dst_port_t dst_port, u8 is_ip4) +{ + udp_main_t *um = &udp_main; + udp_dst_port_info_t *pi; + udp_input_runtime_t *rt; + u16 *n; + + pi = udp_get_dst_port_info (um, dst_port, is_ip4); + /* Not registered? Fagedaboudit */ + if (!pi) + return; + + /* Kill the mapping. Don't bother killing the pi, it may be back. */ + rt = vlib_node_get_runtime_data + (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index); + n = sparse_vec_validate (rt->next_by_dst_port, + clib_host_to_net_u16 (dst_port)); + n[0] = SPARSE_VEC_INVALID_INDEX; +} + +void +udp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add) +{ + udp_input_runtime_t *rt; + + { + clib_error_t *error = vlib_call_init_function (vm, udp_local_init); + if (error) + clib_error_report (error); + } + + rt = vlib_node_get_runtime_data + (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index); + + rt->punt_unknown = is_add; +} + +/* Parse a UDP header. */ +uword +unformat_udp_header (unformat_input_t * input, va_list * args) +{ + u8 **result = va_arg (*args, u8 **); + udp_header_t *udp; + __attribute__ ((unused)) int old_length; + u16 src_port, dst_port; + + /* Allocate space for IP header. */ + { + void *p; + + old_length = vec_len (*result); + vec_add2 (*result, p, sizeof (ip4_header_t)); + udp = p; + } + + memset (udp, 0, sizeof (udp[0])); + if (unformat (input, "src-port %d dst-port %d", &src_port, &dst_port)) + { + udp->src_port = clib_host_to_net_u16 (src_port); + udp->dst_port = clib_host_to_net_u16 (dst_port); + return 1; + } + return 0; +} + +static void +udp_setup_node (vlib_main_t * vm, u32 node_index) +{ + vlib_node_t *n = vlib_get_node (vm, node_index); + pg_node_t *pn = pg_get_node (node_index); + + n->format_buffer = format_udp_header; + n->unformat_buffer = unformat_udp_header; + pn->unformat_edit = unformat_pg_udp_header; +} + +clib_error_t * +udp_local_init (vlib_main_t * vm) +{ + udp_input_runtime_t *rt; + udp_main_t *um = &udp_main; + int i; + + { + clib_error_t *error; + error = vlib_call_init_function (vm, udp_init); + if (error) + clib_error_report (error); + } + + + for (i = 0; i < 2; i++) + { + um->dst_port_info_by_name[i] = hash_create_string (0, sizeof (uword)); + um->dst_port_info_by_dst_port[i] = hash_create (0, sizeof (uword)); + } + + udp_setup_node (vm, udp4_input_node.index); + udp_setup_node (vm, udp6_input_node.index); + + rt = vlib_node_get_runtime_data (vm, udp4_input_node.index); + + rt->next_by_dst_port = sparse_vec_new + ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]), + /* bits in index */ BITS (((udp_header_t *) 0)->dst_port)); + + rt->punt_unknown = 0; + +#define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 1 /* is_ip4 */); + foreach_udp4_dst_port +#undef _ + rt = vlib_node_get_runtime_data (vm, udp6_input_node.index); + + rt->next_by_dst_port = sparse_vec_new + ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]), + /* bits in index */ BITS (((udp_header_t *) 0)->dst_port)); + + rt->punt_unknown = 0; + +#define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 0 /* is_ip4 */); + foreach_udp6_dst_port +#undef _ + ip4_register_protocol (IP_PROTOCOL_UDP, udp4_input_node.index); + /* Note: ip6 differs from ip4, UDP is hotwired to ip6-udp-lookup */ + return 0; +} + +VLIB_INIT_FUNCTION (udp_local_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/udp_packet.h b/src/vnet/udp/udp_packet.h new file mode 100644 index 00000000..beea3059 --- /dev/null +++ b/src/vnet/udp/udp_packet.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip4/udp_packet.h: UDP packet format + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_udp_packet_h +#define included_udp_packet_h + +typedef struct +{ + /* Source and destination port. */ + u16 src_port, dst_port; + + /* Length of UDP header plus payload. */ + u16 length; + + /* Checksum of UDP pseudo-header and data or + zero if checksum is disabled. */ + u16 checksum; +} udp_header_t; + +#endif /* included_udp_packet_h */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/udp_pg.c b/src/vnet/udp/udp_pg.c new file mode 100644 index 00000000..c9d8d38c --- /dev/null +++ b/src/vnet/udp/udp_pg.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/udp_pg: UDP packet-generator interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include /* for unformat_udp_udp_port */ + +#define UDP_PG_EDIT_LENGTH (1 << 0) +#define UDP_PG_EDIT_CHECKSUM (1 << 1) + +always_inline void +udp_pg_edit_function_inline (pg_main_t * pg, + pg_stream_t * s, + pg_edit_group_t * g, + u32 * packets, u32 n_packets, u32 flags) +{ + vlib_main_t *vm = vlib_get_main (); + u32 ip_offset, udp_offset; + + udp_offset = g->start_byte_offset; + ip_offset = (g - 1)->start_byte_offset; + + while (n_packets >= 1) + { + vlib_buffer_t *p0; + ip4_header_t *ip0; + udp_header_t *udp0; + u32 udp_len0; + + p0 = vlib_get_buffer (vm, packets[0]); + n_packets -= 1; + packets += 1; + + ip0 = (void *) (p0->data + ip_offset); + udp0 = (void *) (p0->data + udp_offset); + udp_len0 = clib_net_to_host_u16 (ip0->length) - sizeof (ip0[0]); + + if (flags & UDP_PG_EDIT_LENGTH) + udp0->length = + clib_net_to_host_u16 (vlib_buffer_length_in_chain (vm, p0) + - ip_offset); + + /* Initialize checksum with header. */ + if (flags & UDP_PG_EDIT_CHECKSUM) + { + ip_csum_t sum0; + + sum0 = clib_mem_unaligned (&ip0->src_address, u64); + + sum0 = ip_csum_with_carry + (sum0, clib_host_to_net_u32 (udp_len0 + (ip0->protocol << 16))); + + /* Invalidate possibly old checksum. */ + udp0->checksum = 0; + + sum0 = + ip_incremental_checksum_buffer (vm, p0, udp_offset, udp_len0, + sum0); + + sum0 = ~ip_csum_fold (sum0); + + /* Zero checksum means checksumming disabled. */ + sum0 = sum0 != 0 ? sum0 : 0xffff; + + udp0->checksum = sum0; + } + } +} + +static void +udp_pg_edit_function (pg_main_t * pg, + pg_stream_t * s, + pg_edit_group_t * g, u32 * packets, u32 n_packets) +{ + switch (g->edit_function_opaque) + { + case UDP_PG_EDIT_LENGTH: + udp_pg_edit_function_inline (pg, s, g, packets, n_packets, + UDP_PG_EDIT_LENGTH); + break; + + case UDP_PG_EDIT_CHECKSUM: + udp_pg_edit_function_inline (pg, s, g, packets, n_packets, + UDP_PG_EDIT_CHECKSUM); + break; + + case UDP_PG_EDIT_CHECKSUM | UDP_PG_EDIT_LENGTH: + udp_pg_edit_function_inline (pg, s, g, packets, n_packets, + UDP_PG_EDIT_CHECKSUM | UDP_PG_EDIT_LENGTH); + break; + + default: + ASSERT (0); + break; + } +} + +typedef struct +{ + pg_edit_t src_port, dst_port; + pg_edit_t length; + pg_edit_t checksum; +} pg_udp_header_t; + +static inline void +pg_udp_header_init (pg_udp_header_t * p) +{ + /* Initialize fields that are not bit fields in the IP header. */ +#define _(f) pg_edit_init (&p->f, udp_header_t, f); + _(src_port); + _(dst_port); + _(length); + _(checksum); +#undef _ +} + +uword +unformat_pg_udp_header (unformat_input_t * input, va_list * args) +{ + pg_stream_t *s = va_arg (*args, pg_stream_t *); + pg_udp_header_t *p; + u32 group_index; + + p = pg_create_edit_group (s, sizeof (p[0]), sizeof (udp_header_t), + &group_index); + pg_udp_header_init (p); + + /* Defaults. */ + p->checksum.type = PG_EDIT_UNSPECIFIED; + p->length.type = PG_EDIT_UNSPECIFIED; + + if (!unformat (input, "UDP: %U -> %U", + unformat_pg_edit, + unformat_tcp_udp_port, &p->src_port, + unformat_pg_edit, unformat_tcp_udp_port, &p->dst_port)) + goto error; + + /* Parse options. */ + while (1) + { + if (unformat (input, "length %U", + unformat_pg_edit, unformat_pg_number, &p->length)) + ; + + else if (unformat (input, "checksum %U", + unformat_pg_edit, unformat_pg_number, &p->checksum)) + ; + + /* Can't parse input: try next protocol level. */ + else + break; + } + + { + ip_main_t *im = &ip_main; + u16 dst_port; + tcp_udp_port_info_t *pi; + + pi = 0; + if (p->dst_port.type == PG_EDIT_FIXED) + { + dst_port = pg_edit_get_value (&p->dst_port, PG_EDIT_LO); + pi = ip_get_tcp_udp_port_info (im, dst_port); + } + + if (pi && pi->unformat_pg_edit + && unformat_user (input, pi->unformat_pg_edit, s)) + ; + + else if (!unformat_user (input, unformat_pg_payload, s)) + goto error; + + p = pg_get_edit_group (s, group_index); + if (p->checksum.type == PG_EDIT_UNSPECIFIED + || p->length.type == PG_EDIT_UNSPECIFIED) + { + pg_edit_group_t *g = pg_stream_get_group (s, group_index); + g->edit_function = udp_pg_edit_function; + g->edit_function_opaque = 0; + if (p->checksum.type == PG_EDIT_UNSPECIFIED) + g->edit_function_opaque |= UDP_PG_EDIT_CHECKSUM; + if (p->length.type == PG_EDIT_UNSPECIFIED) + g->edit_function_opaque |= UDP_PG_EDIT_LENGTH; + } + + return 1; + } + +error: + /* Free up any edits we may have added. */ + pg_free_edit_group (s); + return 0; +} + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/vnet_all_api_h.h b/src/vnet/vnet_all_api_h.h index 142acedc..c4075db6 100644 --- a/src/vnet/vnet_all_api_h.h +++ b/src/vnet/vnet_all_api_h.h @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include diff --git a/src/vnet/vxlan-gpe/vxlan_gpe.h b/src/vnet/vxlan-gpe/vxlan_gpe.h index 1b4bc44e..e768d230 100644 --- a/src/vnet/vxlan-gpe/vxlan_gpe.h +++ b/src/vnet/vxlan-gpe/vxlan_gpe.h @@ -29,7 +29,7 @@ #include #include #include -#include +#include /** * @brief VXLAN GPE header struct diff --git a/src/vnet/vxlan/vxlan.h b/src/vnet/vxlan/vxlan.h index adfa3a8e..dca1cd12 100644 --- a/src/vnet/vxlan/vxlan.h +++ b/src/vnet/vxlan/vxlan.h @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/vpp/api/vpe.api b/src/vpp/api/vpe.api index 24f48293..2d6e4f37 100644 --- a/src/vpp/api/vpe.api +++ b/src/vpp/api/vpe.api @@ -38,6 +38,7 @@ * IPSEC-GRE APIs: see .../src/vnet/ipsec-gre/{ipsec_gre.api, ipsec_gre_api.c} * LISP APIs: see .../src/vnet/lisp/{lisp.api, lisp_api.c} * LISP-GPE APIs: see .../src/vnet/lisp-gpe/{lisp_gpe.api, lisp_gpe_api.c} + * SESSION APIs: .../vnet/session/{session.api session_api.c} * MPLS APIs: see .../src/vnet/mpls/{mpls.api, mpls_api.c} * SR APIs: see .../src/vnet/sr/{sr.api, sr_api.c} * DPDK APIs: see ... /src/vnet/devices/dpdk/{dpdk.api, dpdk_api.c} diff --git a/src/vppinfra.am b/src/vppinfra.am index 8d375958..4b9f0c29 100644 --- a/src/vppinfra.am +++ b/src/vppinfra.am @@ -157,7 +157,9 @@ nobase_include_HEADERS = \ vppinfra/asm_mips.h \ vppinfra/asm_x86.h \ vppinfra/bihash_8_8.h \ + vppinfra/bihash_16_8.h \ vppinfra/bihash_24_8.h \ + vppinfra/bihash_48_8.h \ vppinfra/bihash_template.h \ vppinfra/bihash_template.c \ vppinfra/bitmap.h \ @@ -206,6 +208,7 @@ nobase_include_HEADERS = \ vppinfra/timer.h \ vppinfra/tw_timer_2t_1w_2048sl.h \ vppinfra/tw_timer_16t_2w_512sl.h \ + vppinfra/tw_timer_16t_1w_2048sl.h \ vppinfra/tw_timer_template.h \ vppinfra/tw_timer_template.c \ vppinfra/types.h \ @@ -261,6 +264,8 @@ CLIB_CORE = \ vppinfra/tw_timer_2t_1w_2048sl.c \ vppinfra/tw_timer_16t_2w_512sl.h \ vppinfra/tw_timer_16t_2w_512sl.c \ + vppinfra/tw_timer_16t_1w_2048sl.h \ + vppinfra/tw_timer_16t_1w_2048sl.c \ vppinfra/unformat.c \ vppinfra/vec.c \ vppinfra/vector.c \ diff --git a/src/vppinfra/bihash_16_8.h b/src/vppinfra/bihash_16_8.h new file mode 100644 index 00000000..ce80f70e --- /dev/null +++ b/src/vppinfra/bihash_16_8.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#undef BIHASH_TYPE + +#define BIHASH_TYPE _16_8 +#define BIHASH_KVP_PER_PAGE 4 + +#ifndef __included_bihash_16_8_h__ +#define __included_bihash_16_8_h__ + +#include +#include +#include +#include + +typedef struct +{ + u64 key[2]; + u64 value; +} clib_bihash_kv_16_8_t; + +static inline int +clib_bihash_is_free_16_8 (clib_bihash_kv_16_8_t * v) +{ + /* Free values are memset to 0xff, check a bit... */ + if (v->key[0] == ~0ULL && v->value == ~0ULL) + return 1; + return 0; +} + +#if __SSE4_2__ +#ifndef __defined_crc_u32__ +#define __defined_crc_u32__ +static inline u32 +crc_u32 (u32 data, u32 value) +{ + __asm__ volatile ("crc32l %[data], %[value];":[value] "+r" (value):[data] + "rm" (data)); + return value; +} +#endif /* __defined_crc_u32__ */ + +static inline u64 +clib_bihash_hash_16_8 (clib_bihash_kv_16_8_t * v) +{ + u32 *dp = (u32 *) & v->key[0]; + u32 value = 0; + + value = crc_u32 (dp[0], value); + value = crc_u32 (dp[1], value); + value = crc_u32 (dp[2], value); + value = crc_u32 (dp[3], value); + + return value; +} +#else +static inline u64 +clib_bihash_hash_16_8 (clib_bihash_kv_16_8_t * v) +{ + u64 tmp = v->key[0] ^ v->key[1]; + return clib_xxhash (tmp); +} +#endif + +static inline u8 * +format_bihash_kvp_16_8 (u8 * s, va_list * args) +{ + clib_bihash_kv_16_8_t *v = va_arg (*args, clib_bihash_kv_16_8_t *); + + s = format (s, "key %llu %llu value %llu", v->key[0], v->key[1], v->value); + return s; +} + +static inline int +clib_bihash_key_compare_16_8 (u64 * a, u64 * b) +{ + return ((a[0] ^ b[0]) | (a[1] ^ b[1])) == 0; +} + +#undef __included_bihash_template_h__ +#include + +#endif /* __included_bihash_16_8_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/bihash_48_8.h b/src/vppinfra/bihash_48_8.h new file mode 100644 index 00000000..1a6e7691 --- /dev/null +++ b/src/vppinfra/bihash_48_8.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#undef BIHASH_TYPE + +#define BIHASH_TYPE _48_8 +#define BIHASH_KVP_PER_PAGE 4 + +#ifndef __included_bihash_48_8_h__ +#define __included_bihash_48_8_h__ + +#include +#include +#include +#include + +typedef struct +{ + u64 key[6]; + u64 value; +} clib_bihash_kv_48_8_t; + +static inline int +clib_bihash_is_free_48_8 (const clib_bihash_kv_48_8_t * v) +{ + /* Free values are memset to 0xff, check a bit... */ + if (v->key[0] == ~0ULL && v->value == ~0ULL) + return 1; + return 0; +} + +#if __SSE4_2__ +#ifndef __defined_crc_u32__ +#define __defined_crc_u32__ +static inline u32 +crc_u32 (u32 data, u32 value) +{ + __asm__ volatile ("crc32l %[data], %[value];":[value] "+r" (value):[data] + "rm" (data)); + return value; +} +#endif /* __defined_crc_u32__ */ + +static inline u64 +clib_bihash_hash_48_8 (const clib_bihash_kv_48_8_t * v) +{ + const u32 *dp = (const u32 *) &v->key[0]; + u32 value = 0; + + value = crc_u32 (dp[0], value); + value = crc_u32 (dp[1], value); + value = crc_u32 (dp[2], value); + value = crc_u32 (dp[3], value); + value = crc_u32 (dp[4], value); + value = crc_u32 (dp[5], value); + value = crc_u32 (dp[6], value); + value = crc_u32 (dp[7], value); + value = crc_u32 (dp[8], value); + value = crc_u32 (dp[9], value); + value = crc_u32 (dp[10], value); + value = crc_u32 (dp[11], value); + + return value; +} +#else +static inline u64 +clib_bihash_hash_48_8 (const clib_bihash_kv_48_8_t * v) +{ + u64 tmp = v->key[0] ^ v->key[1] ^ v->key[2] ^ v->key[3] ^ v->key[4] + ^ v->key[5]; + return clib_xxhash (tmp); +} +#endif + +static inline u8 * +format_bihash_kvp_48_8 (u8 * s, va_list * args) +{ + clib_bihash_kv_48_8_t *v = va_arg (*args, clib_bihash_kv_48_8_t *); + + s = format (s, "key %llu %llu %llu %llu %llu %llu value %llu", v->key[0], + v->key[1], v->key[2], v->key[3], v->key[4], v->key[5], + v->value); + return s; +} + +static inline int +clib_bihash_key_compare_48_8 (const u64 * a, const u64 * b) +{ + return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2]) | (a[3] ^ b[3]) + | (a[4] ^ b[4]) | (a[5] ^ b[5])) == 0; +} + +#undef __included_bihash_template_h__ +#include + +#endif /* __included_bihash_48_8_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/tw_timer_16t_1w_2048sl.c b/src/vppinfra/tw_timer_16t_1w_2048sl.c new file mode 100644 index 00000000..3f342045 --- /dev/null +++ b/src/vppinfra/tw_timer_16t_1w_2048sl.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "tw_timer_16t_1w_2048sl.h" +#include "tw_timer_template.c" + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/tw_timer_16t_1w_2048sl.h b/src/vppinfra/tw_timer_16t_1w_2048sl.h new file mode 100644 index 00000000..685ac31e --- /dev/null +++ b/src/vppinfra/tw_timer_16t_1w_2048sl.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __included_tw_timer_16t_2w_512sl_h__ +#define __included_tw_timer_16t_2w_512sl_h__ + +/* ... So that a client app can create multiple wheel geometries */ +#undef TW_TIMER_WHEELS +#undef TW_SLOTS_PER_RING +#undef TW_RING_SHIFT +#undef TW_RING_MASK +#undef TW_TIMERS_PER_OBJECT +#undef LOG2_TW_TIMERS_PER_OBJECT +#undef TW_SUFFIX + +#define TW_TIMER_WHEELS 1 +#define TW_SLOTS_PER_RING 2048 +#define TW_RING_SHIFT 11 +#define TW_RING_MASK (TW_SLOTS_PER_RING -1) +#define TW_TIMERS_PER_OBJECT 16 +#define LOG2_TW_TIMERS_PER_OBJECT 4 +#define TW_SUFFIX _16t_1w_2048sl + +#include + +#endif /* __included_tw_timer_16t_2w_512sl_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ -- cgit 1.2.3-korg From 80f54e20270ed0628ee725e3e3c515731a0188f2 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Wed, 8 Mar 2017 19:08:56 -0500 Subject: vlib_mains == 0 special cases be gone Clean up spurious binary API client link dependency on libvlib.so, which managed to hide behind vlib_mains == 0 checks reached by VLIB_xxx_FUNCTION macros. Change-Id: I5df1f8ab07dca1944250e643ccf06e60a8462325 Signed-off-by: Dave Barach --- src/plugins/dpdk/ipsec/ipsec.c | 8 +- src/vlib-api.am | 4 +- src/vlib/buffer.c | 27 +- src/vlib/global_funcs.h | 2 +- src/vlib/node_cli.c | 28 +- src/vlib/node_funcs.h | 4 +- src/vlib/threads.c | 16 +- src/vlib/threads.h | 43 ++- src/vlibapi/api.h | 4 +- src/vlibapi/api_shared.c | 530 ++--------------------------------- src/vlibapi/node_serialize.c | 15 +- src/vlibmemory/memory_vlib.c | 471 +++++++++++++++++++++++++++++++ src/vnet/devices/virtio/vhost-user.c | 9 +- src/vpp-api-test.am | 2 - src/vpp/api/api.c | 1 - src/vpp/api/gmon.c | 9 +- 16 files changed, 575 insertions(+), 598 deletions(-) (limited to 'src/vlib/buffer.c') diff --git a/src/plugins/dpdk/ipsec/ipsec.c b/src/plugins/dpdk/ipsec/ipsec.c index 16bec20a..b0aaaaec 100644 --- a/src/plugins/dpdk/ipsec/ipsec.c +++ b/src/plugins/dpdk/ipsec/ipsec.c @@ -380,13 +380,9 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, im->cb.check_support_cb = dpdk_ipsec_check_support; im->cb.add_del_sa_sess_cb = add_del_sa_sess; - if (vec_len (vlib_mains) == 0) - vlib_node_set_state (&vlib_global_main, dpdk_crypto_input_node.index, + for (i = 1; i < tm->n_vlib_mains; i++) + vlib_node_set_state (vlib_mains[i], dpdk_crypto_input_node.index, VLIB_NODE_STATE_POLLING); - else - for (i = 1; i < tm->n_vlib_mains; i++) - vlib_node_set_state (vlib_mains[i], dpdk_crypto_input_node.index, - VLIB_NODE_STATE_POLLING); /* TODO cryptodev counters */ diff --git a/src/vlib-api.am b/src/vlib-api.am index c05929b1..4e1dae99 100644 --- a/src/vlib-api.am +++ b/src/vlib-api.am @@ -14,7 +14,7 @@ lib_LTLIBRARIES += libvlibmemory.la libvlibapi.la libvlibmemoryclient.la \ libvlibsocket.la -libvlibmemory_la_DEPENDENCIES = libvppinfra.la libsvm.la libvlib.la +libvlibmemory_la_DEPENDENCIES = libvppinfra.la libsvm.la libvlibmemory_la_LIBADD = $(libvlibmemory_la_DEPENDENCIES) -lpthread libvlibmemory_la_SOURCES = \ vlibmemory/api.h \ @@ -26,7 +26,7 @@ libvlibmemory_la_SOURCES = \ vlibmemory/unix_shared_memory_queue.c \ vlibmemory/unix_shared_memory_queue.h -libvlibapi_la_DEPENDENCIES = libvppinfra.la libvlib.la libvlibmemory.la +libvlibapi_la_DEPENDENCIES = libvppinfra.la libvlibapi_la_LIBADD = $(libvlibapi_la_DEPENDENCIES) libvlibapi_la_SOURCES = \ vlibapi/api.h \ diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 9f26bec7..6ba82584 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -261,7 +261,28 @@ done: return result; } -vlib_main_t **vlib_mains; +/* + * Hand-craft a static vector w/ length 1, so vec_len(vlib_mains) =1 + * and vlib_mains[0] = &vlib_global_main from the beginning of time. + * + * The only place which should ever expand vlib_mains is start_workers() + * in threads.c. It knows about the bootstrap vector. + */ +/* *INDENT-OFF* */ +static struct +{ + vec_header_t h; + vlib_main_t *vm; +} __attribute__ ((packed)) __bootstrap_vlib_main_vector + __attribute__ ((aligned (CLIB_CACHE_LINE_BYTES))) = +{ + .h.len = 1, + .vm = &vlib_global_main, +}; +/* *INDENT-ON* */ + +vlib_main_t **vlib_mains = &__bootstrap_vlib_main_vector.vm; + /* When dubugging validate that given buffers are either known allocated or known free. */ @@ -280,7 +301,7 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, ASSERT (os_get_cpu_number () == 0); /* smp disaster check */ - if (vlib_mains) + if (vec_len (vlib_mains) > 1) ASSERT (vm == vlib_mains[0]); is_free = expected_state == VLIB_BUFFER_KNOWN_ALLOCATED; @@ -956,7 +977,7 @@ show_buffers (vlib_main_t * vm, do { - curr_vm = vec_len (vlib_mains) ? vlib_mains[vm_index] : vm; + curr_vm = vlib_mains[vm_index]; bm = curr_vm->buffer_main; /* *INDENT-OFF* */ diff --git a/src/vlib/global_funcs.h b/src/vlib/global_funcs.h index bbdbdef5..f51ec381 100644 --- a/src/vlib/global_funcs.h +++ b/src/vlib/global_funcs.h @@ -23,7 +23,7 @@ always_inline vlib_main_t * vlib_get_main (void) { vlib_main_t *vm; - vm = vlib_mains ? vlib_mains[os_get_cpu_number ()] : &vlib_global_main; + vm = vlib_mains[os_get_cpu_number ()]; ASSERT (vm); return vm; } diff --git a/src/vlib/node_cli.c b/src/vlib/node_cli.c index 05d0f0b5..62ab2e64 100644 --- a/src/vlib/node_cli.c +++ b/src/vlib/node_cli.c @@ -248,16 +248,11 @@ show_node_runtime (vlib_main_t * vm, if (unformat (input, "max") || unformat (input, "m")) max = 1; - if (vec_len (vlib_mains) == 0) - vec_add1 (stat_vms, vm); - else + for (i = 0; i < vec_len (vlib_mains); i++) { - for (i = 0; i < vec_len (vlib_mains); i++) - { - stat_vm = vlib_mains[i]; - if (stat_vm) - vec_add1 (stat_vms, stat_vm); - } + stat_vm = vlib_mains[i]; + if (stat_vm) + vec_add1 (stat_vms, stat_vm); } /* @@ -331,7 +326,7 @@ show_node_runtime (vlib_main_t * vm, } } - if (vec_len (vlib_mains)) + if (vec_len (vlib_mains) > 1) { vlib_worker_thread_t *w = vlib_worker_threads + j; if (j > 0) @@ -404,16 +399,11 @@ clear_node_runtime (vlib_main_t * vm, vlib_main_t **stat_vms = 0, *stat_vm; vlib_node_runtime_t *r; - if (vec_len (vlib_mains) == 0) - vec_add1 (stat_vms, vm); - else + for (i = 0; i < vec_len (vlib_mains); i++) { - for (i = 0; i < vec_len (vlib_mains); i++) - { - stat_vm = vlib_mains[i]; - if (stat_vm) - vec_add1 (stat_vms, stat_vm); - } + stat_vm = vlib_mains[i]; + if (stat_vm) + vec_add1 (stat_vms, stat_vm); } vlib_worker_thread_barrier_sync (vm); diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index f49a8d6f..8ccfc438 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -201,7 +201,7 @@ vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index) vlib_frame_t *f; u32 cpu_index = frame_index & VLIB_CPU_MASK; u32 offset = frame_index & VLIB_OFFSET_MASK; - vm = vlib_mains ? vlib_mains[cpu_index] : vm; + vm = vlib_mains[cpu_index]; f = vm->heap_base + offset; return f; } @@ -213,7 +213,7 @@ vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f) ASSERT (((uword) f & VLIB_CPU_MASK) == 0); - vm = vlib_mains ? vlib_mains[f->cpu_index] : vm; + vm = vlib_mains[f->cpu_index]; i = ((u8 *) f - (u8 *) vm->heap_base); return i | f->cpu_index; diff --git a/src/vlib/threads.c b/src/vlib/threads.c index e3ea3c9c..4676be97 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -570,9 +570,13 @@ start_workers (vlib_main_t * vm) if (n_vlib_mains > 1) { - vec_validate (vlib_mains, tm->n_vlib_mains - 1); + /* Replace hand-crafted length-1 vector with a real vector */ + vlib_mains = 0; + + vec_validate_aligned (vlib_mains, tm->n_vlib_mains - 1, + CLIB_CACHE_LINE_BYTES); _vec_len (vlib_mains) = 0; - vec_add1 (vlib_mains, vm); + vec_add1_aligned (vlib_mains, vm, CLIB_CACHE_LINE_BYTES); vlib_worker_threads->wait_at_barrier = clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES); @@ -685,7 +689,7 @@ start_workers (vlib_main_t * vm) /* Packet trace buffers are guaranteed to be empty, nothing to do here */ clib_mem_set_heap (oldheap); - vec_add1 (vlib_mains, vm_clone); + vec_add1_aligned (vlib_mains, vm_clone, CLIB_CACHE_LINE_BYTES); vm_clone->error_main.counters = vec_dup (vlib_mains[0]->error_main.counters); @@ -805,7 +809,7 @@ vlib_worker_thread_node_runtime_update (void) ASSERT (os_get_cpu_number () == 0); - if (vec_len (vlib_mains) == 0) + if (vec_len (vlib_mains) == 1) return; vm = vlib_mains[0]; @@ -1148,7 +1152,7 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm) f64 deadline; u32 count; - if (!vlib_mains) + if (vec_len (vlib_mains) < 2) return; count = vec_len (vlib_mains) - 1; @@ -1179,7 +1183,7 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm) { f64 deadline; - if (!vlib_mains) + if (vec_len (vlib_mains) < 2) return; if (--vlib_worker_threads[0].recursion_level > 0) diff --git a/src/vlib/threads.h b/src/vlib/threads.h index 75a5a281..a032311c 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -222,30 +222,25 @@ vlib_worker_thread_barrier_check (void) } } -#define foreach_vlib_main(body) \ -do { \ - vlib_main_t ** __vlib_mains = 0, *this_vlib_main; \ - int ii; \ - \ - if (vec_len (vlib_mains) == 0) \ - vec_add1 (__vlib_mains, &vlib_global_main); \ - else \ - { \ - for (ii = 0; ii < vec_len (vlib_mains); ii++) \ - { \ - this_vlib_main = vlib_mains[ii]; \ - if (this_vlib_main) \ - vec_add1 (__vlib_mains, this_vlib_main); \ - } \ - } \ - \ - for (ii = 0; ii < vec_len (__vlib_mains); ii++) \ - { \ - this_vlib_main = __vlib_mains[ii]; \ - /* body uses this_vlib_main... */ \ - (body); \ - } \ - vec_free (__vlib_mains); \ +#define foreach_vlib_main(body) \ +do { \ + vlib_main_t ** __vlib_mains = 0, *this_vlib_main; \ + int ii; \ + \ + for (ii = 0; ii < vec_len (vlib_mains); ii++) \ + { \ + this_vlib_main = vlib_mains[ii]; \ + if (this_vlib_main) \ + vec_add1 (__vlib_mains, this_vlib_main); \ + } \ + \ + for (ii = 0; ii < vec_len (__vlib_mains); ii++) \ + { \ + this_vlib_main = __vlib_mains[ii]; \ + /* body uses this_vlib_main... */ \ + (body); \ + } \ + vec_free (__vlib_mains); \ } while (0); #define foreach_sched_policy \ diff --git a/src/vlibapi/api.h b/src/vlibapi/api.h index 2cbeb63c..87a56121 100644 --- a/src/vlibapi/api.h +++ b/src/vlibapi/api.h @@ -252,11 +252,13 @@ void vl_msg_api_queue_handler (unix_shared_memory_queue_t * q); vl_api_trace_t *vl_msg_api_trace_get (api_main_t * am, vl_api_trace_which_t which); +void vl_msg_api_barrier_sync (void) __attribute__ ((weak)); +void vl_msg_api_barrier_release (void) __attribute__ ((weak)); void vl_msg_api_free (void *); void vl_noop_handler (void *mp); -clib_error_t *vl_api_init (vlib_main_t * vm); void vl_msg_api_increment_missing_client_counter (void); void vl_msg_api_post_mortem_dump (void); +void vl_msg_api_post_mortem_dump_enable_disable (int enable); void vl_msg_api_register_pd_handler (void *handler, u16 msg_id_host_byte_order); int vl_msg_api_pd_handler (void *mp, int rv); diff --git a/src/vlibapi/api_shared.c b/src/vlibapi/api_shared.c index 69ba10c1..6774e3dd 100644 --- a/src/vlibapi/api_shared.c +++ b/src/vlibapi/api_shared.c @@ -23,11 +23,6 @@ #include #include #include -#include -#include -#include -#include -#include #include #include #include @@ -36,19 +31,14 @@ #include #include -api_main_t api_main; - -void vl_msg_api_barrier_sync (void) __attribute__ ((weak)); -void -vl_msg_api_barrier_sync (void) -{ -} - -void vl_msg_api_barrier_release (void) __attribute__ ((weak)); -void -vl_msg_api_barrier_release (void) -{ -} +/* *INDENT-OFF* */ +api_main_t api_main = + { + .region_name = "/unset", + .api_uid = -1, + .api_gid = -1, + }; +/* *INDENT-ON* */ void vl_msg_api_increment_missing_client_counter (void) @@ -57,14 +47,6 @@ vl_msg_api_increment_missing_client_counter (void) am->missing_clients++; } -typedef enum -{ - DUMP, - CUSTOM_DUMP, - REPLAY, - INITIALIZERS, -} vl_api_replay_t; - int vl_msg_api_rx_trace_enabled (api_main_t * am) { @@ -397,6 +379,16 @@ vl_msg_api_trace_configure (api_main_t * am, vl_api_trace_which_t which, return 0; } +void +vl_msg_api_barrier_sync (void) +{ +} + +void +vl_msg_api_barrier_release (void) +{ +} + always_inline void msg_handler_internal (api_main_t * am, void *the_msg, int trace_it, int do_it, int free_it) @@ -748,495 +740,15 @@ vl_noop_handler (void *mp) { } -clib_error_t * -vl_api_init (vlib_main_t * vm) -{ - static u8 once; - api_main_t *am = &api_main; - - if (once) - return 0; - - once = 1; - - am->region_name = "/unset"; - /* - * Eventually passed to fchown, -1 => "current user" - * instead of 0 => "root". A very fine disctinction at best. - */ - if (am->api_uid == 0) - am->api_uid = -1; - if (am->api_gid == 0) - am->api_gid = -1; - - return (0); -} - -void vl_msg_api_custom_dump_configure (api_main_t * am) - __attribute__ ((weak)); -void -vl_msg_api_custom_dump_configure (api_main_t * am) -{ -} - -VLIB_INIT_FUNCTION (vl_api_init); - -static void -vl_msg_api_process_file (vlib_main_t * vm, u8 * filename, - u32 first_index, u32 last_index, - vl_api_replay_t which) -{ - vl_api_trace_file_header_t *hp; - int i, fd; - struct stat statb; - size_t file_size; - u8 *msg; - u8 endian_swap_needed = 0; - api_main_t *am = &api_main; - u8 *tmpbuf = 0; - u32 nitems; - void **saved_print_handlers = 0; - - fd = open ((char *) filename, O_RDONLY); - - if (fd < 0) - { - vlib_cli_output (vm, "Couldn't open %s\n", filename); - return; - } - - if (fstat (fd, &statb) < 0) - { - vlib_cli_output (vm, "Couldn't stat %s\n", filename); - close (fd); - return; - } - - if (!(statb.st_mode & S_IFREG) || (statb.st_size < sizeof (*hp))) - { - vlib_cli_output (vm, "File not plausible: %s\n", filename); - close (fd); - return; - } - - file_size = statb.st_size; - file_size = (file_size + 4095) & ~(4096); - - hp = mmap (0, file_size, PROT_READ, MAP_PRIVATE, fd, 0); - - if (hp == (vl_api_trace_file_header_t *) MAP_FAILED) - { - vlib_cli_output (vm, "mmap failed: %s\n", filename); - close (fd); - return; - } - close (fd); - - if ((clib_arch_is_little_endian && hp->endian == VL_API_BIG_ENDIAN) - || (clib_arch_is_big_endian && hp->endian == VL_API_LITTLE_ENDIAN)) - endian_swap_needed = 1; - - if (endian_swap_needed) - nitems = ntohl (hp->nitems); - else - nitems = hp->nitems; - - if (last_index == (u32) ~ 0) - { - last_index = nitems - 1; - } - - if (first_index >= nitems || last_index >= nitems) - { - vlib_cli_output (vm, "Range (%d, %d) outside file range (0, %d)\n", - first_index, last_index, nitems - 1); - munmap (hp, file_size); - return; - } - if (hp->wrapped) - vlib_cli_output (vm, - "Note: wrapped/incomplete trace, results may vary\n"); - - if (which == CUSTOM_DUMP) - { - saved_print_handlers = (void **) vec_dup (am->msg_print_handlers); - vl_msg_api_custom_dump_configure (am); - } - - - msg = (u8 *) (hp + 1); - - for (i = 0; i < first_index; i++) - { - trace_cfg_t *cfgp; - int size; - u16 msg_id; - - size = clib_host_to_net_u32 (*(u32 *) msg); - msg += sizeof (u32); - - if (clib_arch_is_little_endian) - msg_id = ntohs (*((u16 *) msg)); - else - msg_id = *((u16 *) msg); - - cfgp = am->api_trace_cfg + msg_id; - if (!cfgp) - { - vlib_cli_output (vm, "Ugh: msg id %d no trace config\n", msg_id); - munmap (hp, file_size); - return; - } - msg += size; - } - - if (which == REPLAY) - am->replay_in_progress = 1; - - for (; i <= last_index; i++) - { - trace_cfg_t *cfgp; - u16 *msg_idp; - u16 msg_id; - int size; - - if (which == DUMP) - vlib_cli_output (vm, "---------- trace %d -----------\n", i); - - size = clib_host_to_net_u32 (*(u32 *) msg); - msg += sizeof (u32); - - if (clib_arch_is_little_endian) - msg_id = ntohs (*((u16 *) msg)); - else - msg_id = *((u16 *) msg); - - cfgp = am->api_trace_cfg + msg_id; - if (!cfgp) - { - vlib_cli_output (vm, "Ugh: msg id %d no trace config\n", msg_id); - munmap (hp, file_size); - vec_free (tmpbuf); - am->replay_in_progress = 0; - return; - } - - /* Copy the buffer (from the read-only mmap'ed file) */ - vec_validate (tmpbuf, size - 1 + sizeof (uword)); - clib_memcpy (tmpbuf + sizeof (uword), msg, size); - memset (tmpbuf, 0xf, sizeof (uword)); - - /* - * Endian swap if needed. All msg data is supposed to be - * in network byte order. All msg handlers are supposed to - * know that. The generic message dumpers don't know that. - * One could fix apigen, I suppose. - */ - if ((which == DUMP && clib_arch_is_little_endian) || endian_swap_needed) - { - void (*endian_fp) (void *); - if (msg_id >= vec_len (am->msg_endian_handlers) - || (am->msg_endian_handlers[msg_id] == 0)) - { - vlib_cli_output (vm, "Ugh: msg id %d no endian swap\n", msg_id); - munmap (hp, file_size); - vec_free (tmpbuf); - am->replay_in_progress = 0; - return; - } - endian_fp = am->msg_endian_handlers[msg_id]; - (*endian_fp) (tmpbuf + sizeof (uword)); - } - - /* msg_id always in network byte order */ - if (clib_arch_is_little_endian) - { - msg_idp = (u16 *) (tmpbuf + sizeof (uword)); - *msg_idp = msg_id; - } - - switch (which) - { - case CUSTOM_DUMP: - case DUMP: - if (msg_id < vec_len (am->msg_print_handlers) && - am->msg_print_handlers[msg_id]) - { - u8 *(*print_fp) (void *, void *); - - print_fp = (void *) am->msg_print_handlers[msg_id]; - (*print_fp) (tmpbuf + sizeof (uword), vm); - } - else - { - vlib_cli_output (vm, "Skipping msg id %d: no print fcn\n", - msg_id); - break; - } - break; - - case INITIALIZERS: - if (msg_id < vec_len (am->msg_print_handlers) && - am->msg_print_handlers[msg_id]) - { - u8 *s; - int j; - u8 *(*print_fp) (void *, void *); - - print_fp = (void *) am->msg_print_handlers[msg_id]; - - vlib_cli_output (vm, "/*"); - - (*print_fp) (tmpbuf + sizeof (uword), vm); - vlib_cli_output (vm, "*/\n"); - - s = format (0, "static u8 * vl_api_%s_%d[%d] = {", - am->msg_names[msg_id], i, - am->api_trace_cfg[msg_id].size); - - for (j = 0; j < am->api_trace_cfg[msg_id].size; j++) - { - if ((j & 7) == 0) - s = format (s, "\n "); - s = format (s, "0x%02x,", tmpbuf[sizeof (uword) + j]); - } - s = format (s, "\n};\n%c", 0); - vlib_cli_output (vm, (char *) s); - vec_free (s); - } - break; - - case REPLAY: - if (msg_id < vec_len (am->msg_print_handlers) && - am->msg_print_handlers[msg_id] && cfgp->replay_enable) - { - void (*handler) (void *); - - handler = (void *) am->msg_handlers[msg_id]; - - if (!am->is_mp_safe[msg_id]) - vl_msg_api_barrier_sync (); - (*handler) (tmpbuf + sizeof (uword)); - if (!am->is_mp_safe[msg_id]) - vl_msg_api_barrier_release (); - } - else - { - if (cfgp->replay_enable) - vlib_cli_output (vm, "Skipping msg id %d: no handler\n", - msg_id); - break; - } - break; - } - - _vec_len (tmpbuf) = 0; - msg += size; - } - - if (saved_print_handlers) - { - clib_memcpy (am->msg_print_handlers, saved_print_handlers, - vec_len (am->msg_print_handlers) * sizeof (void *)); - vec_free (saved_print_handlers); - } - - munmap (hp, file_size); - vec_free (tmpbuf); - am->replay_in_progress = 0; -} - -u8 * -format_vl_msg_api_trace_status (u8 * s, va_list * args) -{ - api_main_t *am = va_arg (*args, api_main_t *); - vl_api_trace_which_t which = va_arg (*args, vl_api_trace_which_t); - vl_api_trace_t *tp; - char *trace_name; - - switch (which) - { - case VL_API_TRACE_TX: - tp = am->tx_trace; - trace_name = "TX trace"; - break; - - case VL_API_TRACE_RX: - tp = am->rx_trace; - trace_name = "RX trace"; - break; - - default: - abort (); - } - - if (tp == 0) - { - s = format (s, "%s: not yet configured.\n", trace_name); - return s; - } - - s = format (s, "%s: used %d of %d items, %s enabled, %s wrapped\n", - trace_name, vec_len (tp->traces), tp->nitems, - tp->enabled ? "is" : "is not", tp->wrapped ? "has" : "has not"); - return s; -} static u8 post_mortem_dump_enabled; -static clib_error_t * -api_trace_command_fn (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - u32 nitems = 256 << 10; - api_main_t *am = &api_main; - vl_api_trace_which_t which = VL_API_TRACE_RX; - u8 *filename; - u32 first = 0; - u32 last = (u32) ~ 0; - FILE *fp; - int rv; - - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - { - if (unformat (input, "on") || unformat (input, "enable")) - { - if (unformat (input, "nitems %d", &nitems)) - ; - vl_msg_api_trace_configure (am, which, nitems); - vl_msg_api_trace_onoff (am, which, 1 /* on */ ); - } - else if (unformat (input, "off")) - { - vl_msg_api_trace_onoff (am, which, 0); - } - else if (unformat (input, "save %s", &filename)) - { - u8 *chroot_filename; - if (strstr ((char *) filename, "..") - || index ((char *) filename, '/')) - { - vlib_cli_output (vm, "illegal characters in filename '%s'", - filename); - return 0; - } - - chroot_filename = format (0, "/tmp/%s%c", filename, 0); - - vec_free (filename); - - fp = fopen ((char *) chroot_filename, "w"); - if (fp == NULL) - { - vlib_cli_output (vm, "Couldn't create %s\n", chroot_filename); - return 0; - } - rv = vl_msg_api_trace_save (am, which, fp); - fclose (fp); - if (rv == -1) - vlib_cli_output (vm, "API Trace data not present\n"); - else if (rv == -2) - vlib_cli_output (vm, "File for writing is closed\n"); - else if (rv == -10) - vlib_cli_output (vm, "Error while writing header to file\n"); - else if (rv == -11) - vlib_cli_output (vm, "Error while writing trace to file\n"); - else if (rv == -12) - vlib_cli_output (vm, - "Error while writing end of buffer trace to file\n"); - else if (rv == -13) - vlib_cli_output (vm, - "Error while writing start of buffer trace to file\n"); - else if (rv < 0) - vlib_cli_output (vm, "Unkown error while saving: %d", rv); - else - vlib_cli_output (vm, "API trace saved to %s\n", chroot_filename); - vec_free (chroot_filename); - } - else if (unformat (input, "dump %s", &filename)) - { - vl_msg_api_process_file (vm, filename, first, last, DUMP); - } - else if (unformat (input, "custom-dump %s", &filename)) - { - vl_msg_api_process_file (vm, filename, first, last, CUSTOM_DUMP); - } - else if (unformat (input, "replay %s", &filename)) - { - vl_msg_api_process_file (vm, filename, first, last, REPLAY); - } - else if (unformat (input, "initializers %s", &filename)) - { - vl_msg_api_process_file (vm, filename, first, last, INITIALIZERS); - } - else if (unformat (input, "tx")) - { - which = VL_API_TRACE_TX; - } - else if (unformat (input, "first %d", &first)) - { - ; - } - else if (unformat (input, "last %d", &last)) - { - ; - } - else if (unformat (input, "status")) - { - vlib_cli_output (vm, "%U", format_vl_msg_api_trace_status, - am, which); - } - else if (unformat (input, "free")) - { - vl_msg_api_trace_onoff (am, which, 0); - vl_msg_api_trace_free (am, which); - } - else if (unformat (input, "post-mortem-on")) - post_mortem_dump_enabled = 1; - else if (unformat (input, "post-mortem-off")) - post_mortem_dump_enabled = 0; - else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); - } - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (api_trace_command, static) = { - .path = "api trace", - .short_help = - "api trace [on|off][dump|save|replay ][status][free][post-mortem-on]", - .function = api_trace_command_fn, -}; -/* *INDENT-ON* */ - -static clib_error_t * -api_config_fn (vlib_main_t * vm, unformat_input_t * input) +void +vl_msg_api_post_mortem_dump_enable_disable (int enable) { - u32 nitems = 256 << 10; - vl_api_trace_which_t which = VL_API_TRACE_RX; - api_main_t *am = &api_main; - - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - { - if (unformat (input, "on") || unformat (input, "enable")) - { - if (unformat (input, "nitems %d", &nitems)) - ; - vl_msg_api_trace_configure (am, which, nitems); - vl_msg_api_trace_onoff (am, which, 1 /* on */ ); - post_mortem_dump_enabled = 1; - } - else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); - } - return 0; + post_mortem_dump_enabled = enable; } -VLIB_CONFIG_FUNCTION (api_config_fn, "api-trace"); - void vl_msg_api_post_mortem_dump (void) { diff --git a/src/vlibapi/node_serialize.c b/src/vlibapi/node_serialize.c index 4dc1a7d2..50e5c41c 100644 --- a/src/vlibapi/node_serialize.c +++ b/src/vlibapi/node_serialize.c @@ -73,16 +73,11 @@ vlib_node_serialize (vlib_node_main_t * nm, u8 * vector, if (vec_len (stat_vms) == 0) { - if (vec_len (vlib_mains) == 0) - vec_add1 (stat_vms, vm); - else + for (i = 0; i < vec_len (vlib_mains); i++) { - for (i = 0; i < vec_len (vlib_mains); i++) - { - stat_vm = vlib_mains[i]; - if (stat_vm) - vec_add1 (stat_vms, stat_vm); - } + stat_vm = vlib_mains[i]; + if (stat_vm) + vec_add1 (stat_vms, stat_vm); } } @@ -286,7 +281,7 @@ vlib_node_unserialize (u8 * vector) return nodes_by_thread; } -#if CLIB_DEBUG > 0 +#if TEST_CODE static clib_error_t * test_node_serialize_command_fn (vlib_main_t * vm, diff --git a/src/vlibmemory/memory_vlib.c b/src/vlibmemory/memory_vlib.c index 3a7415c0..d2e05968 100644 --- a/src/vlibmemory/memory_vlib.c +++ b/src/vlibmemory/memory_vlib.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include @@ -1437,6 +1439,475 @@ rpc_api_hookup (vlib_main_t * vm) VLIB_API_INIT_FUNCTION (rpc_api_hookup); +typedef enum +{ + DUMP, + CUSTOM_DUMP, + REPLAY, + INITIALIZERS, +} vl_api_replay_t; + +u8 * +format_vl_msg_api_trace_status (u8 * s, va_list * args) +{ + api_main_t *am = va_arg (*args, api_main_t *); + vl_api_trace_which_t which = va_arg (*args, vl_api_trace_which_t); + vl_api_trace_t *tp; + char *trace_name; + + switch (which) + { + case VL_API_TRACE_TX: + tp = am->tx_trace; + trace_name = "TX trace"; + break; + + case VL_API_TRACE_RX: + tp = am->rx_trace; + trace_name = "RX trace"; + break; + + default: + abort (); + } + + if (tp == 0) + { + s = format (s, "%s: not yet configured.\n", trace_name); + return s; + } + + s = format (s, "%s: used %d of %d items, %s enabled, %s wrapped\n", + trace_name, vec_len (tp->traces), tp->nitems, + tp->enabled ? "is" : "is not", tp->wrapped ? "has" : "has not"); + return s; +} + +void vl_msg_api_custom_dump_configure (api_main_t * am) + __attribute__ ((weak)); +void +vl_msg_api_custom_dump_configure (api_main_t * am) +{ +} + +static void +vl_msg_api_process_file (vlib_main_t * vm, u8 * filename, + u32 first_index, u32 last_index, + vl_api_replay_t which) +{ + vl_api_trace_file_header_t *hp; + int i, fd; + struct stat statb; + size_t file_size; + u8 *msg; + u8 endian_swap_needed = 0; + api_main_t *am = &api_main; + u8 *tmpbuf = 0; + u32 nitems; + void **saved_print_handlers = 0; + + fd = open ((char *) filename, O_RDONLY); + + if (fd < 0) + { + vlib_cli_output (vm, "Couldn't open %s\n", filename); + return; + } + + if (fstat (fd, &statb) < 0) + { + vlib_cli_output (vm, "Couldn't stat %s\n", filename); + close (fd); + return; + } + + if (!(statb.st_mode & S_IFREG) || (statb.st_size < sizeof (*hp))) + { + vlib_cli_output (vm, "File not plausible: %s\n", filename); + close (fd); + return; + } + + file_size = statb.st_size; + file_size = (file_size + 4095) & ~(4096); + + hp = mmap (0, file_size, PROT_READ, MAP_PRIVATE, fd, 0); + + if (hp == (vl_api_trace_file_header_t *) MAP_FAILED) + { + vlib_cli_output (vm, "mmap failed: %s\n", filename); + close (fd); + return; + } + close (fd); + + if ((clib_arch_is_little_endian && hp->endian == VL_API_BIG_ENDIAN) + || (clib_arch_is_big_endian && hp->endian == VL_API_LITTLE_ENDIAN)) + endian_swap_needed = 1; + + if (endian_swap_needed) + nitems = ntohl (hp->nitems); + else + nitems = hp->nitems; + + if (last_index == (u32) ~ 0) + { + last_index = nitems - 1; + } + + if (first_index >= nitems || last_index >= nitems) + { + vlib_cli_output (vm, "Range (%d, %d) outside file range (0, %d)\n", + first_index, last_index, nitems - 1); + munmap (hp, file_size); + return; + } + if (hp->wrapped) + vlib_cli_output (vm, + "Note: wrapped/incomplete trace, results may vary\n"); + + if (which == CUSTOM_DUMP) + { + saved_print_handlers = (void **) vec_dup (am->msg_print_handlers); + vl_msg_api_custom_dump_configure (am); + } + + + msg = (u8 *) (hp + 1); + + for (i = 0; i < first_index; i++) + { + trace_cfg_t *cfgp; + int size; + u16 msg_id; + + size = clib_host_to_net_u32 (*(u32 *) msg); + msg += sizeof (u32); + + if (clib_arch_is_little_endian) + msg_id = ntohs (*((u16 *) msg)); + else + msg_id = *((u16 *) msg); + + cfgp = am->api_trace_cfg + msg_id; + if (!cfgp) + { + vlib_cli_output (vm, "Ugh: msg id %d no trace config\n", msg_id); + munmap (hp, file_size); + return; + } + msg += size; + } + + if (which == REPLAY) + am->replay_in_progress = 1; + + for (; i <= last_index; i++) + { + trace_cfg_t *cfgp; + u16 *msg_idp; + u16 msg_id; + int size; + + if (which == DUMP) + vlib_cli_output (vm, "---------- trace %d -----------\n", i); + + size = clib_host_to_net_u32 (*(u32 *) msg); + msg += sizeof (u32); + + if (clib_arch_is_little_endian) + msg_id = ntohs (*((u16 *) msg)); + else + msg_id = *((u16 *) msg); + + cfgp = am->api_trace_cfg + msg_id; + if (!cfgp) + { + vlib_cli_output (vm, "Ugh: msg id %d no trace config\n", msg_id); + munmap (hp, file_size); + vec_free (tmpbuf); + am->replay_in_progress = 0; + return; + } + + /* Copy the buffer (from the read-only mmap'ed file) */ + vec_validate (tmpbuf, size - 1 + sizeof (uword)); + clib_memcpy (tmpbuf + sizeof (uword), msg, size); + memset (tmpbuf, 0xf, sizeof (uword)); + + /* + * Endian swap if needed. All msg data is supposed to be + * in network byte order. All msg handlers are supposed to + * know that. The generic message dumpers don't know that. + * One could fix apigen, I suppose. + */ + if ((which == DUMP && clib_arch_is_little_endian) || endian_swap_needed) + { + void (*endian_fp) (void *); + if (msg_id >= vec_len (am->msg_endian_handlers) + || (am->msg_endian_handlers[msg_id] == 0)) + { + vlib_cli_output (vm, "Ugh: msg id %d no endian swap\n", msg_id); + munmap (hp, file_size); + vec_free (tmpbuf); + am->replay_in_progress = 0; + return; + } + endian_fp = am->msg_endian_handlers[msg_id]; + (*endian_fp) (tmpbuf + sizeof (uword)); + } + + /* msg_id always in network byte order */ + if (clib_arch_is_little_endian) + { + msg_idp = (u16 *) (tmpbuf + sizeof (uword)); + *msg_idp = msg_id; + } + + switch (which) + { + case CUSTOM_DUMP: + case DUMP: + if (msg_id < vec_len (am->msg_print_handlers) && + am->msg_print_handlers[msg_id]) + { + u8 *(*print_fp) (void *, void *); + + print_fp = (void *) am->msg_print_handlers[msg_id]; + (*print_fp) (tmpbuf + sizeof (uword), vm); + } + else + { + vlib_cli_output (vm, "Skipping msg id %d: no print fcn\n", + msg_id); + break; + } + break; + + case INITIALIZERS: + if (msg_id < vec_len (am->msg_print_handlers) && + am->msg_print_handlers[msg_id]) + { + u8 *s; + int j; + u8 *(*print_fp) (void *, void *); + + print_fp = (void *) am->msg_print_handlers[msg_id]; + + vlib_cli_output (vm, "/*"); + + (*print_fp) (tmpbuf + sizeof (uword), vm); + vlib_cli_output (vm, "*/\n"); + + s = format (0, "static u8 * vl_api_%s_%d[%d] = {", + am->msg_names[msg_id], i, + am->api_trace_cfg[msg_id].size); + + for (j = 0; j < am->api_trace_cfg[msg_id].size; j++) + { + if ((j & 7) == 0) + s = format (s, "\n "); + s = format (s, "0x%02x,", tmpbuf[sizeof (uword) + j]); + } + s = format (s, "\n};\n%c", 0); + vlib_cli_output (vm, (char *) s); + vec_free (s); + } + break; + + case REPLAY: + if (msg_id < vec_len (am->msg_print_handlers) && + am->msg_print_handlers[msg_id] && cfgp->replay_enable) + { + void (*handler) (void *); + + handler = (void *) am->msg_handlers[msg_id]; + + if (!am->is_mp_safe[msg_id]) + vl_msg_api_barrier_sync (); + (*handler) (tmpbuf + sizeof (uword)); + if (!am->is_mp_safe[msg_id]) + vl_msg_api_barrier_release (); + } + else + { + if (cfgp->replay_enable) + vlib_cli_output (vm, "Skipping msg id %d: no handler\n", + msg_id); + break; + } + break; + } + + _vec_len (tmpbuf) = 0; + msg += size; + } + + if (saved_print_handlers) + { + clib_memcpy (am->msg_print_handlers, saved_print_handlers, + vec_len (am->msg_print_handlers) * sizeof (void *)); + vec_free (saved_print_handlers); + } + + munmap (hp, file_size); + vec_free (tmpbuf); + am->replay_in_progress = 0; +} + +static clib_error_t * +api_trace_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + u32 nitems = 256 << 10; + api_main_t *am = &api_main; + vl_api_trace_which_t which = VL_API_TRACE_RX; + u8 *filename; + u32 first = 0; + u32 last = (u32) ~ 0; + FILE *fp; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "on") || unformat (input, "enable")) + { + if (unformat (input, "nitems %d", &nitems)) + ; + vl_msg_api_trace_configure (am, which, nitems); + vl_msg_api_trace_onoff (am, which, 1 /* on */ ); + } + else if (unformat (input, "off")) + { + vl_msg_api_trace_onoff (am, which, 0); + } + else if (unformat (input, "save %s", &filename)) + { + u8 *chroot_filename; + if (strstr ((char *) filename, "..") + || index ((char *) filename, '/')) + { + vlib_cli_output (vm, "illegal characters in filename '%s'", + filename); + return 0; + } + + chroot_filename = format (0, "/tmp/%s%c", filename, 0); + + vec_free (filename); + + fp = fopen ((char *) chroot_filename, "w"); + if (fp == NULL) + { + vlib_cli_output (vm, "Couldn't create %s\n", chroot_filename); + return 0; + } + rv = vl_msg_api_trace_save (am, which, fp); + fclose (fp); + if (rv == -1) + vlib_cli_output (vm, "API Trace data not present\n"); + else if (rv == -2) + vlib_cli_output (vm, "File for writing is closed\n"); + else if (rv == -10) + vlib_cli_output (vm, "Error while writing header to file\n"); + else if (rv == -11) + vlib_cli_output (vm, "Error while writing trace to file\n"); + else if (rv == -12) + vlib_cli_output (vm, + "Error while writing end of buffer trace to file\n"); + else if (rv == -13) + vlib_cli_output (vm, + "Error while writing start of buffer trace to file\n"); + else if (rv < 0) + vlib_cli_output (vm, "Unkown error while saving: %d", rv); + else + vlib_cli_output (vm, "API trace saved to %s\n", chroot_filename); + vec_free (chroot_filename); + } + else if (unformat (input, "dump %s", &filename)) + { + vl_msg_api_process_file (vm, filename, first, last, DUMP); + } + else if (unformat (input, "custom-dump %s", &filename)) + { + vl_msg_api_process_file (vm, filename, first, last, CUSTOM_DUMP); + } + else if (unformat (input, "replay %s", &filename)) + { + vl_msg_api_process_file (vm, filename, first, last, REPLAY); + } + else if (unformat (input, "initializers %s", &filename)) + { + vl_msg_api_process_file (vm, filename, first, last, INITIALIZERS); + } + else if (unformat (input, "tx")) + { + which = VL_API_TRACE_TX; + } + else if (unformat (input, "first %d", &first)) + { + ; + } + else if (unformat (input, "last %d", &last)) + { + ; + } + else if (unformat (input, "status")) + { + vlib_cli_output (vm, "%U", format_vl_msg_api_trace_status, + am, which); + } + else if (unformat (input, "free")) + { + vl_msg_api_trace_onoff (am, which, 0); + vl_msg_api_trace_free (am, which); + } + else if (unformat (input, "post-mortem-on")) + vl_msg_api_post_mortem_dump_enable_disable (1 /* enable */ ); + else if (unformat (input, "post-mortem-off")) + vl_msg_api_post_mortem_dump_enable_disable (0 /* enable */ ); + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (api_trace_command, static) = { + .path = "api trace", + .short_help = + "api trace [on|off][dump|save|replay ][status][free][post-mortem-on]", + .function = api_trace_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +api_config_fn (vlib_main_t * vm, unformat_input_t * input) +{ + u32 nitems = 256 << 10; + vl_api_trace_which_t which = VL_API_TRACE_RX; + api_main_t *am = &api_main; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "on") || unformat (input, "enable")) + { + if (unformat (input, "nitems %d", &nitems)) + ; + vl_msg_api_trace_configure (am, which, nitems); + vl_msg_api_trace_onoff (am, which, 1 /* on */ ); + vl_msg_api_post_mortem_dump_enable_disable (1 /* enable */ ); + } + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + return 0; +} + +VLIB_CONFIG_FUNCTION (api_config_fn, "api-trace"); + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c index b6b4c04a..100ec613 100644 --- a/src/vnet/devices/virtio/vhost-user.c +++ b/src/vnet/devices/virtio/vhost-user.c @@ -374,8 +374,7 @@ vhost_user_rx_thread_placement () for (i = vum->input_cpu_first_index; i < vum->input_cpu_first_index + vum->input_cpu_count; i++) { - vlib_node_set_state (vlib_mains ? vlib_mains[i] : &vlib_global_main, - vhost_user_input_node.index, + vlib_node_set_state (vlib_mains[i], vhost_user_input_node.index, VLIB_NODE_STATE_DISABLED); vec_add1 (workers, i); } @@ -406,9 +405,9 @@ vhost_user_rx_thread_placement () iaq.qid = qid; iaq.vhost_iface_index = vui - vum->vhost_user_interfaces; vec_add1 (vhc->rx_queues, iaq); - vlib_node_set_state (vlib_mains ? vlib_mains[cpu_index] : - &vlib_global_main, vhost_user_input_node.index, - VLIB_NODE_STATE_POLLING); + vlib_node_set_state (vlib_mains[cpu_index], + vhost_user_input_node.index, + VLIB_NODE_STATE_POLLING); } }); /* *INDENT-ON* */ diff --git a/src/vpp-api-test.am b/src/vpp-api-test.am index f0d5df62..ceab687c 100644 --- a/src/vpp-api-test.am +++ b/src/vpp-api-test.am @@ -34,14 +34,12 @@ vpp_json_test_SOURCES = \ vat/json_test.c vpp_api_test_LDADD = \ - libvlib.la \ libvlibmemoryclient.la \ libsvm.la \ libvatplugin.la \ libvppinfra.la \ libvlibapi.la \ libvlibmemory.la \ - libvnet.la \ -lpthread -lm -lrt -ldl -lcrypto vpp_api_test_LDFLAGS = -Wl,--export-dynamic diff --git a/src/vpp/api/api.c b/src/vpp/api/api.c index 828394ed..c85dc680 100644 --- a/src/vpp/api/api.c +++ b/src/vpp/api/api.c @@ -2143,7 +2143,6 @@ vpe_api_init (vlib_main_t * vm) am->oam_events_registration_hash = hash_create (0, sizeof (uword)); am->bfd_events_registration_hash = hash_create (0, sizeof (uword)); - vl_api_init (vm); vl_set_memory_region_name ("/vpe-api"); vl_enable_disable_memory_api (vm, 1 /* enable it */ ); diff --git a/src/vpp/api/gmon.c b/src/vpp/api/gmon.c index 610f40ed..277be8c0 100644 --- a/src/vpp/api/gmon.c +++ b/src/vpp/api/gmon.c @@ -122,13 +122,8 @@ gmon_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) /* Initial wait for the world to settle down */ vlib_process_suspend (vm, 5.0); - if (vec_len (vlib_mains) == 0) - vec_add1 (gm->my_vlib_mains, &vlib_global_main); - else - { - for (i = 0; i < vec_len (vlib_mains); i++) - vec_add1 (gm->my_vlib_mains, vlib_mains[i]); - } + for (i = 0; i < vec_len (vlib_mains); i++) + vec_add1 (gm->my_vlib_mains, vlib_mains[i]); while (1) { -- cgit 1.2.3-korg From 374e2c5fc30a5bfabfd2eb6c2d3ca5797402af16 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 9 Mar 2017 20:38:15 +0100 Subject: Retire vpp_lite vpp_lite platform is not needed anymore as same efect can be achieved with following startup.conf config: plugins { plugin dpdk_plugin.so { disable } } Change-Id: I690ea8ceb1c6e1fe32e01e7da54e9958019a93bf Signed-off-by: Damjan Marion --- Makefile | 16 +- build-data/platforms/vpp.mk | 5 - build-data/platforms/vpp_lite.mk | 50 - src/configure.ac | 4 +- src/plugins/Makefile.am | 6 +- src/plugins/ixge.am | 20 + src/plugins/ixge/ixge.c | 2947 +++++++++++++++++++++++ src/plugins/ixge/ixge.h | 1293 ++++++++++ src/vlib/buffer.c | 6 + src/vnet.am | 14 +- src/vnet/devices/nic/ixge.c | 2938 ---------------------- src/vnet/devices/nic/ixge.h | 1293 ---------- src/vnet/devices/nic/sfp.c | 117 - src/vnet/devices/nic/sfp.h | 117 - src/vnet/ethernet/sfp.c | 117 + src/vnet/ethernet/sfp.h | 117 + src/vpp-api/lua/bench.lua | 4 +- src/vpp-api/lua/examples/cli/lua-cli.lua | 4 +- src/vpp-api/lua/examples/example-classifier.lua | 4 +- src/vpp-api/lua/examples/example-cli.lua | 4 +- src/vpp/vnet/main.c | 3 - test/framework.py | 4 +- 22 files changed, 4529 insertions(+), 4554 deletions(-) delete mode 100644 build-data/platforms/vpp_lite.mk create mode 100644 src/plugins/ixge.am create mode 100644 src/plugins/ixge/ixge.c create mode 100644 src/plugins/ixge/ixge.h delete mode 100644 src/vnet/devices/nic/ixge.c delete mode 100644 src/vnet/devices/nic/ixge.h delete mode 100644 src/vnet/devices/nic/sfp.c delete mode 100644 src/vnet/devices/nic/sfp.h create mode 100644 src/vnet/ethernet/sfp.c create mode 100644 src/vnet/ethernet/sfp.h (limited to 'src/vlib/buffer.c') diff --git a/Makefile b/Makefile index f0173cc1..1527d60a 100644 --- a/Makefile +++ b/Makefile @@ -230,18 +230,18 @@ define test endef test: bootstrap - $(call test,vpp_lite,vpp_lite,test) + $(call test,vpp,vpp,test) test-debug: bootstrap - $(call test,vpp_lite,vpp_lite_debug,test) + $(call test,vpp,vpp_debug,test) test-all: bootstrap $(eval EXTENDED_TESTS=yes) - $(call test,vpp_lite,vpp_lite,test) + $(call test,vpp,vpp,test) test-all-debug: bootstrap $(eval EXTENDED_TESTS=yes) - $(call test,vpp_lite,vpp_lite_debug,test) + $(call test,vpp,vpp_debug,test) test-help: @make -C test help @@ -262,7 +262,7 @@ test-wipe-doc: @make -C test wipe-doc test-cov: bootstrap - $(call test,vpp_lite,vpp_lite_gcov,cov) + $(call test,vpp,vpp_gcov,cov) test-wipe-cov: @make -C test wipe-cov @@ -271,10 +271,10 @@ test-checkstyle: @make -C test checkstyle retest: - $(call test,vpp_lite,vpp_lite,retest) + $(call test,vpp,vpp,retest) retest-debug: - $(call test,vpp_lite,vpp_lite_debug,retest) + $(call test,vpp,vpp_debug,retest) STARTUP_DIR ?= $(PWD) ifeq ("$(wildcard $(STARTUP_CONF))","") @@ -376,8 +376,6 @@ endef verify: install-dep $(BR)/.bootstrap.ok dpdk-install-dev $(call banner,"Building for PLATFORM=vpp using gcc") @make -C build-root PLATFORM=vpp TAG=vpp wipe-all install-packages - $(call banner,"Building for PLATFORM=vpp_lite using gcc") - @make -C build-root PLATFORM=vpp_lite TAG=vpp_lite wipe-all install-packages ifeq ($(OS_ID)-$(OS_VERSION_ID),ubuntu-16.04) $(call banner,"Installing dependencies") @sudo -E apt-get update diff --git a/build-data/platforms/vpp.mk b/build-data/platforms/vpp.mk index 401a383a..c61375d8 100644 --- a/build-data/platforms/vpp.mk +++ b/build-data/platforms/vpp.mk @@ -35,11 +35,6 @@ vpp_uses_dpdk = yes vpp_root_packages = vpp gmod -vpp_configure_args_vpp = --with-dpdk - -# Set these parameters carefully. The vlib_buffer_t is 128 bytes, i.e. -vlib_configure_args_vpp = --with-pre-data=128 - # DPDK configuration parameters # vpp_uses_dpdk_cryptodev_sw = yes # vpp_uses_dpdk_mlx5_pmd = yes diff --git a/build-data/platforms/vpp_lite.mk b/build-data/platforms/vpp_lite.mk deleted file mode 100644 index a556b487..00000000 --- a/build-data/platforms/vpp_lite.mk +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2016 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# vector packet processor -vpp_lite_arch = native -ifeq ($(shell uname -m),x86_64) -vpp_lite_march = corei7 # Nehalem Instruction set -vpp_lite_mtune = corei7-avx # Optimize for Sandy Bridge -else -vpp_lite_march = native -vpp_lite_mtune = generic -endif -vpp_lite_native_tools = vppapigen - -vpp_lite_uses_dpdk = no - -# Uncoment to enable building unit tests -#vpp_lite_enable_tests = yes - -vpp_lite_root_packages = vpp gmod - -vlib_configure_args_vpp_lite = --with-pre-data=128 - -vnet_configure_args_vpp_lite = -vpp_configure_args_vpp_lite = - -vpp_lite_debug_TAG_CFLAGS = -g -O0 -DCLIB_DEBUG -DFORTIFY_SOURCE=2 -march=$(MARCH) \ - -fstack-protector-all -fPIC -Werror -vpp_lite_debug_TAG_LDFLAGS = -g -O0 -DCLIB_DEBUG -DFORTIFY_SOURCE=2 -march=$(MARCH) \ - -fstack-protector-all -fPIC -Werror - -vpp_lite_TAG_CFLAGS = -g -O2 -DFORTIFY_SOURCE=2 -march=$(MARCH) -mtune=$(MTUNE) \ - -fstack-protector -fPIC -Werror -vpp_lite_TAG_LDFLAGS = -g -O2 -DFORTIFY_SOURCE=2 -march=$(MARCH) -mtune=$(MTUNE) \ - -fstack-protector -fPIC -Werror - -vpp_lite_gcov_TAG_CFLAGS = -g -O0 -DCLIB_DEBUG -march=$(MARCH) \ - -fPIC -Werror -fprofile-arcs -ftest-coverage -vpp_lite_gcov_TAG_LDFLAGS = -g -O0 -DCLIB_DEBUG -march=$(MARCH) \ - -fPIC -Werror -coverage diff --git a/src/configure.ac b/src/configure.ac index c22d152e..d90740d9 100644 --- a/src/configure.ac +++ b/src/configure.ac @@ -97,7 +97,6 @@ DISABLE_ARG(papi, [Disable Python API bindings]) DISABLE_ARG(japi, [Disable Java API bindings]) # --with-X -WITH_ARG(dpdk, [Use use DPDK]) WITH_ARG(dpdk_crypto_sw,[Use DPDK cryptodev SW PMDs]) WITH_ARG(dpdk_mlx5_pmd, [Use DPDK with mlx5 PMD]) @@ -130,7 +129,6 @@ AC_ARG_WITH(pre-data, AC_SUBST(PRE_DATA_SIZE, [$with_pre_data]) AC_SUBST(APICLI, [-DVPP_API_TEST_BUILTIN=${n_with_apicli}]) -AC_DEFINE_UNQUOTED(DPDK, [${n_with_dpdk}]) AC_DEFINE_UNQUOTED(DPDK_SHARED_LIB, [${n_enable_dpdk_shared}]) AC_DEFINE_UNQUOTED(DPDK_CRYPTO_SW, [${n_with_dpdk_crypto_sw}]) AC_DEFINE_UNQUOTED(WITH_LIBSSL, [${n_with_libssl}]) @@ -147,9 +145,11 @@ AC_SUBST(AR_FLAGS) # Please keep alphabetical order PLUGIN_ENABLED(acl) +PLUGIN_ENABLED(dpdk) PLUGIN_ENABLED(flowperpkt) PLUGIN_ENABLED(ila) PLUGIN_ENABLED(ioam) +PLUGIN_ENABLED(ixge) PLUGIN_ENABLED(lb) PLUGIN_ENABLED(sixrd) PLUGIN_ENABLED(snat) diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am index 7b36049e..255e644f 100644 --- a/src/plugins/Makefile.am +++ b/src/plugins/Makefile.am @@ -33,7 +33,7 @@ if ENABLE_ACL_PLUGIN include acl.am endif -if WITH_DPDK +if ENABLE_DPDK_PLUGIN include dpdk.am endif @@ -49,6 +49,10 @@ if ENABLE_IOAM_PLUGIN include ioam.am endif +if ENABLE_IXGE_PLUGIN +include ixge.am +endif + if ENABLE_LB_PLUGIN include lb.am endif diff --git a/src/plugins/ixge.am b/src/plugins/ixge.am new file mode 100644 index 00000000..7e61344b --- /dev/null +++ b/src/plugins/ixge.am @@ -0,0 +1,20 @@ +# Copyright (c) 2016 Cisco Systems, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +vppplugins_LTLIBRARIES += ixge_plugin.la + +ixge_plugin_la_SOURCES = ixge/ixge.c + +noinst_HEADERS += ixge/ixge.h + +# vi:syntax=automake diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c new file mode 100644 index 00000000..4eebc457 --- /dev/null +++ b/src/plugins/ixge/ixge.c @@ -0,0 +1,2947 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * WARNING! + * This driver is not intended for production use and it is unsupported. + * It is provided for educational use only. + * Please use supported DPDK driver instead. + */ + +#if __x86_64__ +#include + +#ifndef CLIB_HAVE_VEC128 +#warning HACK: ixge driver wont really work, missing u32x4 +typedef unsigned long long u32x4; +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#define IXGE_ALWAYS_POLL 0 + +#define EVENT_SET_FLAGS 0 +#define IXGE_HWBP_RACE_ELOG 0 + +#define PCI_VENDOR_ID_INTEL 0x8086 + +/* 10 GIG E (XGE) PHY IEEE 802.3 clause 45 definitions. */ +#define XGE_PHY_DEV_TYPE_PMA_PMD 1 +#define XGE_PHY_DEV_TYPE_PHY_XS 4 +#define XGE_PHY_ID1 0x2 +#define XGE_PHY_ID2 0x3 +#define XGE_PHY_CONTROL 0x0 +#define XGE_PHY_CONTROL_RESET (1 << 15) + +ixge_main_t ixge_main; +static vlib_node_registration_t ixge_input_node; +static vlib_node_registration_t ixge_process_node; + +static void +ixge_semaphore_get (ixge_device_t * xd) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_regs_t *r = xd->regs; + u32 i; + + i = 0; + while (!(r->software_semaphore & (1 << 0))) + { + if (i > 0) + vlib_process_suspend (vm, 100e-6); + i++; + } + do + { + r->software_semaphore |= 1 << 1; + } + while (!(r->software_semaphore & (1 << 1))); +} + +static void +ixge_semaphore_release (ixge_device_t * xd) +{ + ixge_regs_t *r = xd->regs; + r->software_semaphore &= ~3; +} + +static void +ixge_software_firmware_sync (ixge_device_t * xd, u32 sw_mask) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_regs_t *r = xd->regs; + u32 fw_mask = sw_mask << 5; + u32 m, done = 0; + + while (!done) + { + ixge_semaphore_get (xd); + m = r->software_firmware_sync; + done = (m & fw_mask) == 0; + if (done) + r->software_firmware_sync = m | sw_mask; + ixge_semaphore_release (xd); + if (!done) + vlib_process_suspend (vm, 10e-3); + } +} + +static void +ixge_software_firmware_sync_release (ixge_device_t * xd, u32 sw_mask) +{ + ixge_regs_t *r = xd->regs; + ixge_semaphore_get (xd); + r->software_firmware_sync &= ~sw_mask; + ixge_semaphore_release (xd); +} + +u32 +ixge_read_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index, + u32 v, u32 is_read) +{ + ixge_regs_t *r = xd->regs; + const u32 busy_bit = 1 << 30; + u32 x; + + ASSERT (xd->phy_index < 2); + ixge_software_firmware_sync (xd, 1 << (1 + xd->phy_index)); + + ASSERT (reg_index < (1 << 16)); + ASSERT (dev_type < (1 << 5)); + if (!is_read) + r->xge_mac.phy_data = v; + + /* Address cycle. */ + x = + reg_index | (dev_type << 16) | (xd-> + phys[xd->phy_index].mdio_address << 21); + r->xge_mac.phy_command = x | busy_bit; + /* Busy wait timed to take 28e-6 secs. No suspend. */ + while (r->xge_mac.phy_command & busy_bit) + ; + + r->xge_mac.phy_command = x | ((is_read ? 2 : 1) << 26) | busy_bit; + while (r->xge_mac.phy_command & busy_bit) + ; + + if (is_read) + v = r->xge_mac.phy_data >> 16; + + ixge_software_firmware_sync_release (xd, 1 << (1 + xd->phy_index)); + + return v; +} + +static u32 +ixge_read_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index) +{ + return ixge_read_write_phy_reg (xd, dev_type, reg_index, 0, /* is_read */ + 1); +} + +static void +ixge_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index, u32 v) +{ + (void) ixge_read_write_phy_reg (xd, dev_type, reg_index, v, /* is_read */ + 0); +} + +static void +ixge_i2c_put_bits (i2c_bus_t * b, int scl, int sda) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data); + u32 v; + + v = 0; + v |= (sda != 0) << 3; + v |= (scl != 0) << 1; + xd->regs->i2c_control = v; +} + +static void +ixge_i2c_get_bits (i2c_bus_t * b, int *scl, int *sda) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data); + u32 v; + + v = xd->regs->i2c_control; + *sda = (v & (1 << 2)) != 0; + *scl = (v & (1 << 0)) != 0; +} + +static u16 +ixge_read_eeprom (ixge_device_t * xd, u32 address) +{ + ixge_regs_t *r = xd->regs; + u32 v; + r->eeprom_read = (( /* start bit */ (1 << 0)) | (address << 2)); + /* Wait for done bit. */ + while (!((v = r->eeprom_read) & (1 << 1))) + ; + return v >> 16; +} + +static void +ixge_sfp_enable_disable_laser (ixge_device_t * xd, uword enable) +{ + u32 tx_disable_bit = 1 << 3; + if (enable) + xd->regs->sdp_control &= ~tx_disable_bit; + else + xd->regs->sdp_control |= tx_disable_bit; +} + +static void +ixge_sfp_enable_disable_10g (ixge_device_t * xd, uword enable) +{ + u32 is_10g_bit = 1 << 5; + if (enable) + xd->regs->sdp_control |= is_10g_bit; + else + xd->regs->sdp_control &= ~is_10g_bit; +} + +static clib_error_t * +ixge_sfp_phy_init_from_eeprom (ixge_device_t * xd, u16 sfp_type) +{ + u16 a, id, reg_values_addr = 0; + + a = ixge_read_eeprom (xd, 0x2b); + if (a == 0 || a == 0xffff) + return clib_error_create ("no init sequence in eeprom"); + + while (1) + { + id = ixge_read_eeprom (xd, ++a); + if (id == 0xffff) + break; + reg_values_addr = ixge_read_eeprom (xd, ++a); + if (id == sfp_type) + break; + } + if (id != sfp_type) + return clib_error_create ("failed to find id 0x%x", sfp_type); + + ixge_software_firmware_sync (xd, 1 << 3); + while (1) + { + u16 v = ixge_read_eeprom (xd, ++reg_values_addr); + if (v == 0xffff) + break; + xd->regs->core_analog_config = v; + } + ixge_software_firmware_sync_release (xd, 1 << 3); + + /* Make sure laser is off. We'll turn on the laser when + the interface is brought up. */ + ixge_sfp_enable_disable_laser (xd, /* enable */ 0); + ixge_sfp_enable_disable_10g (xd, /* is_10g */ 1); + + return 0; +} + +static void +ixge_sfp_device_up_down (ixge_device_t * xd, uword is_up) +{ + u32 v; + + if (is_up) + { + /* pma/pmd 10g serial SFI. */ + xd->regs->xge_mac.auto_negotiation_control2 &= ~(3 << 16); + xd->regs->xge_mac.auto_negotiation_control2 |= 2 << 16; + + v = xd->regs->xge_mac.auto_negotiation_control; + v &= ~(7 << 13); + v |= (0 << 13); + /* Restart autoneg. */ + v |= (1 << 12); + xd->regs->xge_mac.auto_negotiation_control = v; + + while (!(xd->regs->xge_mac.link_partner_ability[0] & 0xf0000)) + ; + + v = xd->regs->xge_mac.auto_negotiation_control; + + /* link mode 10g sfi serdes */ + v &= ~(7 << 13); + v |= (3 << 13); + + /* Restart autoneg. */ + v |= (1 << 12); + xd->regs->xge_mac.auto_negotiation_control = v; + + xd->regs->xge_mac.link_status; + } + + ixge_sfp_enable_disable_laser (xd, /* enable */ is_up); + + /* Give time for link partner to notice that we're up. */ + if (is_up && vlib_in_process_context (vlib_get_main ())) + { + vlib_process_suspend (vlib_get_main (), 300e-3); + } +} + +always_inline ixge_dma_regs_t * +get_dma_regs (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 qi) +{ + ixge_regs_t *r = xd->regs; + ASSERT (qi < 128); + if (rt == VLIB_RX) + return qi < 64 ? &r->rx_dma0[qi] : &r->rx_dma1[qi - 64]; + else + return &r->tx_dma[qi]; +} + +static clib_error_t * +ixge_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index); + uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, hif->dev_instance); + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0); + + if (is_up) + { + xd->regs->rx_enable |= 1; + xd->regs->tx_dma_control |= 1; + dr->control |= 1 << 25; + while (!(dr->control & (1 << 25))) + ; + } + else + { + xd->regs->rx_enable &= ~1; + xd->regs->tx_dma_control &= ~1; + } + + ixge_sfp_device_up_down (xd, is_up); + + return /* no error */ 0; +} + +static void +ixge_sfp_phy_init (ixge_device_t * xd) +{ + ixge_phy_t *phy = xd->phys + xd->phy_index; + i2c_bus_t *ib = &xd->i2c_bus; + + ib->private_data = xd->device_index; + ib->put_bits = ixge_i2c_put_bits; + ib->get_bits = ixge_i2c_get_bits; + vlib_i2c_init (ib); + + vlib_i2c_read_eeprom (ib, 0x50, 0, 128, (u8 *) & xd->sfp_eeprom); + + if (vlib_i2c_bus_timed_out (ib) || !sfp_eeprom_is_valid (&xd->sfp_eeprom)) + xd->sfp_eeprom.id = SFP_ID_unknown; + else + { + /* FIXME 5 => SR/LR eeprom ID. */ + clib_error_t *e = + ixge_sfp_phy_init_from_eeprom (xd, 5 + xd->pci_function); + if (e) + clib_error_report (e); + } + + phy->mdio_address = ~0; +} + +static void +ixge_phy_init (ixge_device_t * xd) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_phy_t *phy = xd->phys + xd->phy_index; + + switch (xd->device_id) + { + case IXGE_82599_sfp: + case IXGE_82599_sfp_em: + case IXGE_82599_sfp_fcoe: + /* others? */ + return ixge_sfp_phy_init (xd); + + default: + break; + } + + /* Probe address of phy. */ + { + u32 i, v; + + phy->mdio_address = ~0; + for (i = 0; i < 32; i++) + { + phy->mdio_address = i; + v = ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1); + if (v != 0xffff && v != 0) + break; + } + + /* No PHY found? */ + if (i >= 32) + return; + } + + phy->id = + ((ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1) << 16) | + ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID2)); + + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, phy id 0x%d mdio address %d",.format_args = "i4i4i4",}; + struct + { + u32 instance, id, address; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->id = phy->id; + ed->address = phy->mdio_address; + } + + /* Reset phy. */ + ixge_write_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL, + XGE_PHY_CONTROL_RESET); + + /* Wait for self-clearning reset bit to clear. */ + do + { + vlib_process_suspend (vm, 1e-3); + } + while (ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL) & + XGE_PHY_CONTROL_RESET); +} + +static u8 * +format_ixge_rx_from_hw_descriptor (u8 * s, va_list * va) +{ + ixge_rx_from_hw_descriptor_t *d = + va_arg (*va, ixge_rx_from_hw_descriptor_t *); + u32 s0 = d->status[0], s2 = d->status[2]; + u32 is_ip4, is_ip6, is_ip, is_tcp, is_udp; + uword indent = format_get_indent (s); + + s = format (s, "%s-owned", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE) ? "sw" : + "hw"); + s = + format (s, ", length this descriptor %d, l3 offset %d", + d->n_packet_bytes_this_descriptor, + IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s0)); + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) + s = format (s, ", end-of-packet"); + + s = format (s, "\n%U", format_white_space, indent); + + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR) + s = format (s, "layer2 error"); + + if (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2) + { + s = format (s, "layer 2 type %d", (s0 & 0x1f)); + return s; + } + + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN) + s = format (s, "vlan header 0x%x\n%U", d->vlan_tag, + format_white_space, indent); + + if ((is_ip4 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4))) + { + s = format (s, "ip4%s", + (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT) ? " options" : + ""); + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED) + s = format (s, " checksum %s", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) ? + "bad" : "ok"); + } + if ((is_ip6 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6))) + s = format (s, "ip6%s", + (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT) ? " extended" : + ""); + is_tcp = is_udp = 0; + if ((is_ip = (is_ip4 | is_ip6))) + { + is_tcp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP) != 0; + is_udp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP) != 0; + if (is_tcp) + s = format (s, ", tcp"); + if (is_udp) + s = format (s, ", udp"); + } + + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED) + s = format (s, ", tcp checksum %s", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR) ? "bad" : + "ok"); + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED) + s = format (s, ", udp checksum %s", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR) ? "bad" : + "ok"); + + return s; +} + +static u8 * +format_ixge_tx_descriptor (u8 * s, va_list * va) +{ + ixge_tx_descriptor_t *d = va_arg (*va, ixge_tx_descriptor_t *); + u32 s0 = d->status0, s1 = d->status1; + uword indent = format_get_indent (s); + u32 v; + + s = format (s, "buffer 0x%Lx, %d packet bytes, %d bytes this buffer", + d->buffer_address, s1 >> 14, d->n_bytes_this_buffer); + + s = format (s, "\n%U", format_white_space, indent); + + if ((v = (s0 >> 0) & 3)) + s = format (s, "reserved 0x%x, ", v); + + if ((v = (s0 >> 2) & 3)) + s = format (s, "mac 0x%x, ", v); + + if ((v = (s0 >> 4) & 0xf) != 3) + s = format (s, "type 0x%x, ", v); + + s = format (s, "%s%s%s%s%s%s%s%s", + (s0 & (1 << 8)) ? "eop, " : "", + (s0 & (1 << 9)) ? "insert-fcs, " : "", + (s0 & (1 << 10)) ? "reserved26, " : "", + (s0 & (1 << 11)) ? "report-status, " : "", + (s0 & (1 << 12)) ? "reserved28, " : "", + (s0 & (1 << 13)) ? "is-advanced, " : "", + (s0 & (1 << 14)) ? "vlan-enable, " : "", + (s0 & (1 << 15)) ? "tx-segmentation, " : ""); + + if ((v = s1 & 0xf) != 0) + s = format (s, "status 0x%x, ", v); + + if ((v = (s1 >> 4) & 0xf)) + s = format (s, "context 0x%x, ", v); + + if ((v = (s1 >> 8) & 0x3f)) + s = format (s, "options 0x%x, ", v); + + return s; +} + +typedef struct +{ + ixge_descriptor_t before, after; + + u32 buffer_index; + + u16 device_index; + + u8 queue_index; + + u8 is_start_of_packet; + + /* Copy of VLIB buffer; packet data stored in pre_data. */ + vlib_buffer_t buffer; +} ixge_rx_dma_trace_t; + +static u8 * +format_ixge_rx_dma_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + vlib_node_t *node = va_arg (*va, vlib_node_t *); + vnet_main_t *vnm = vnet_get_main (); + ixge_rx_dma_trace_t *t = va_arg (*va, ixge_rx_dma_trace_t *); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index); + format_function_t *f; + uword indent = format_get_indent (s); + + { + vnet_sw_interface_t *sw = + vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); + s = + format (s, "%U rx queue %d", format_vnet_sw_interface_name, vnm, sw, + t->queue_index); + } + + s = format (s, "\n%Ubefore: %U", + format_white_space, indent, + format_ixge_rx_from_hw_descriptor, &t->before); + s = format (s, "\n%Uafter : head/tail address 0x%Lx/0x%Lx", + format_white_space, indent, + t->after.rx_to_hw.head_address, t->after.rx_to_hw.tail_address); + + s = format (s, "\n%Ubuffer 0x%x: %U", + format_white_space, indent, + t->buffer_index, format_vlib_buffer, &t->buffer); + + s = format (s, "\n%U", format_white_space, indent); + + f = node->format_buffer; + if (!f || !t->is_start_of_packet) + f = format_hex_bytes; + s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); + + return s; +} + +#define foreach_ixge_error \ + _ (none, "no error") \ + _ (tx_full_drops, "tx ring full drops") \ + _ (ip4_checksum_error, "ip4 checksum errors") \ + _ (rx_alloc_fail, "rx buf alloc from free list failed") \ + _ (rx_alloc_no_physmem, "rx buf alloc failed no physmem") + +typedef enum +{ +#define _(f,s) IXGE_ERROR_##f, + foreach_ixge_error +#undef _ + IXGE_N_ERROR, +} ixge_error_t; + +always_inline void +ixge_rx_next_and_error_from_status_x1 (ixge_device_t * xd, + u32 s00, u32 s02, + u8 * next0, u8 * error0, u32 * flags0) +{ + u8 is0_ip4, is0_ip6, n0, e0; + u32 f0; + + e0 = IXGE_ERROR_none; + n0 = IXGE_RX_NEXT_ETHERNET_INPUT; + + is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; + n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0; + + e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) + ? IXGE_ERROR_ip4_checksum_error : e0); + + is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; + n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0; + + n0 = (xd->per_interface_next_index != ~0) ? + xd->per_interface_next_index : n0; + + /* Check for error. */ + n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0; + + f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED + | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) + ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); + + f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR + | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) + ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); + + *error0 = e0; + *next0 = n0; + *flags0 = f0; +} + +always_inline void +ixge_rx_next_and_error_from_status_x2 (ixge_device_t * xd, + u32 s00, u32 s02, + u32 s10, u32 s12, + u8 * next0, u8 * error0, u32 * flags0, + u8 * next1, u8 * error1, u32 * flags1) +{ + u8 is0_ip4, is0_ip6, n0, e0; + u8 is1_ip4, is1_ip6, n1, e1; + u32 f0, f1; + + e0 = e1 = IXGE_ERROR_none; + n0 = n1 = IXGE_RX_NEXT_IP4_INPUT; + + is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; + is1_ip4 = s12 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; + + n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0; + n1 = is1_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n1; + + e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) + ? IXGE_ERROR_ip4_checksum_error : e0); + e1 = (is1_ip4 && (s12 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) + ? IXGE_ERROR_ip4_checksum_error : e1); + + is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; + is1_ip6 = s10 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; + + n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0; + n1 = is1_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n1; + + n0 = (xd->per_interface_next_index != ~0) ? + xd->per_interface_next_index : n0; + n1 = (xd->per_interface_next_index != ~0) ? + xd->per_interface_next_index : n1; + + /* Check for error. */ + n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0; + n1 = e1 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n1; + + *error0 = e0; + *error1 = e1; + + *next0 = n0; + *next1 = n1; + + f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED + | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) + ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); + f1 = ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED + | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) + ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); + + f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR + | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) + ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); + f1 |= ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR + | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) + ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); + + *flags0 = f0; + *flags1 = f1; +} + +static void +ixge_rx_trace (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + ixge_descriptor_t * before_descriptors, + u32 * before_buffers, + ixge_descriptor_t * after_descriptors, uword n_descriptors) +{ + vlib_main_t *vm = xm->vlib_main; + vlib_node_runtime_t *node = dq->rx.node; + ixge_rx_from_hw_descriptor_t *bd; + ixge_rx_to_hw_descriptor_t *ad; + u32 *b, n_left, is_sop, next_index_sop; + + n_left = n_descriptors; + b = before_buffers; + bd = &before_descriptors->rx_from_hw; + ad = &after_descriptors->rx_to_hw; + is_sop = dq->rx.is_start_of_packet; + next_index_sop = dq->rx.saved_start_of_packet_next_index; + + while (n_left >= 2) + { + u32 bi0, bi1, flags0, flags1; + vlib_buffer_t *b0, *b1; + ixge_rx_dma_trace_t *t0, *t1; + u8 next0, error0, next1, error1; + + bi0 = b[0]; + bi1 = b[1]; + n_left -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + ixge_rx_next_and_error_from_status_x2 (xd, + bd[0].status[0], bd[0].status[2], + bd[1].status[0], bd[1].status[2], + &next0, &error0, &flags0, + &next1, &error1, &flags1); + + next_index_sop = is_sop ? next0 : next_index_sop; + vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0); + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + next_index_sop = is_sop ? next1 : next_index_sop; + vlib_trace_buffer (vm, node, next_index_sop, b1, /* follow_chain */ 0); + t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); + t1->is_start_of_packet = is_sop; + is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t1->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t1->device_index = xd->device_index; + t0->before.rx_from_hw = bd[0]; + t1->before.rx_from_hw = bd[1]; + t0->after.rx_to_hw = ad[0]; + t1->after.rx_to_hw = ad[1]; + t0->buffer_index = bi0; + t1->buffer_index = bi1; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + memcpy (t1->buffer.pre_data, b1->data + b1->current_data, + sizeof (t1->buffer.pre_data)); + + b += 2; + bd += 2; + ad += 2; + } + + while (n_left >= 1) + { + u32 bi0, flags0; + vlib_buffer_t *b0; + ixge_rx_dma_trace_t *t0; + u8 next0, error0; + + bi0 = b[0]; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + ixge_rx_next_and_error_from_status_x1 (xd, + bd[0].status[0], bd[0].status[2], + &next0, &error0, &flags0); + + next_index_sop = is_sop ? next0 : next_index_sop; + vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0); + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t0->before.rx_from_hw = bd[0]; + t0->after.rx_to_hw = ad[0]; + t0->buffer_index = bi0; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + + b += 1; + bd += 1; + ad += 1; + } +} + +typedef struct +{ + ixge_tx_descriptor_t descriptor; + + u32 buffer_index; + + u16 device_index; + + u8 queue_index; + + u8 is_start_of_packet; + + /* Copy of VLIB buffer; packet data stored in pre_data. */ + vlib_buffer_t buffer; +} ixge_tx_dma_trace_t; + +static u8 * +format_ixge_tx_dma_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + ixge_tx_dma_trace_t *t = va_arg (*va, ixge_tx_dma_trace_t *); + vnet_main_t *vnm = vnet_get_main (); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index); + format_function_t *f; + uword indent = format_get_indent (s); + + { + vnet_sw_interface_t *sw = + vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); + s = + format (s, "%U tx queue %d", format_vnet_sw_interface_name, vnm, sw, + t->queue_index); + } + + s = format (s, "\n%Udescriptor: %U", + format_white_space, indent, + format_ixge_tx_descriptor, &t->descriptor); + + s = format (s, "\n%Ubuffer 0x%x: %U", + format_white_space, indent, + t->buffer_index, format_vlib_buffer, &t->buffer); + + s = format (s, "\n%U", format_white_space, indent); + + f = format_ethernet_header_with_length; + if (!f || !t->is_start_of_packet) + f = format_hex_bytes; + s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); + + return s; +} + +typedef struct +{ + vlib_node_runtime_t *node; + + u32 is_start_of_packet; + + u32 n_bytes_in_packet; + + ixge_tx_descriptor_t *start_of_packet_descriptor; +} ixge_tx_state_t; + +static void +ixge_tx_trace (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + ixge_tx_state_t * tx_state, + ixge_tx_descriptor_t * descriptors, + u32 * buffers, uword n_descriptors) +{ + vlib_main_t *vm = xm->vlib_main; + vlib_node_runtime_t *node = tx_state->node; + ixge_tx_descriptor_t *d; + u32 *b, n_left, is_sop; + + n_left = n_descriptors; + b = buffers; + d = descriptors; + is_sop = tx_state->is_start_of_packet; + + while (n_left >= 2) + { + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + ixge_tx_dma_trace_t *t0, *t1; + + bi0 = b[0]; + bi1 = b[1]; + n_left -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); + t1->is_start_of_packet = is_sop; + is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t1->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t1->device_index = xd->device_index; + t0->descriptor = d[0]; + t1->descriptor = d[1]; + t0->buffer_index = bi0; + t1->buffer_index = bi1; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + memcpy (t1->buffer.pre_data, b1->data + b1->current_data, + sizeof (t1->buffer.pre_data)); + + b += 2; + d += 2; + } + + while (n_left >= 1) + { + u32 bi0; + vlib_buffer_t *b0; + ixge_tx_dma_trace_t *t0; + + bi0 = b[0]; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t0->descriptor = d[0]; + t0->buffer_index = bi0; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + + b += 1; + d += 1; + } +} + +always_inline uword +ixge_ring_sub (ixge_dma_queue_t * q, u32 i0, u32 i1) +{ + i32 d = i1 - i0; + ASSERT (i0 < q->n_descriptors); + ASSERT (i1 < q->n_descriptors); + return d < 0 ? q->n_descriptors + d : d; +} + +always_inline uword +ixge_ring_add (ixge_dma_queue_t * q, u32 i0, u32 i1) +{ + u32 d = i0 + i1; + ASSERT (i0 < q->n_descriptors); + ASSERT (i1 < q->n_descriptors); + d -= d >= q->n_descriptors ? q->n_descriptors : 0; + return d; +} + +always_inline uword +ixge_tx_descriptor_matches_template (ixge_main_t * xm, + ixge_tx_descriptor_t * d) +{ + u32 cmp; + + cmp = ((d->status0 & xm->tx_descriptor_template_mask.status0) + ^ xm->tx_descriptor_template.status0); + if (cmp) + return 0; + cmp = ((d->status1 & xm->tx_descriptor_template_mask.status1) + ^ xm->tx_descriptor_template.status1); + if (cmp) + return 0; + + return 1; +} + +static uword +ixge_tx_no_wrap (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + u32 * buffers, + u32 start_descriptor_index, + u32 n_descriptors, ixge_tx_state_t * tx_state) +{ + vlib_main_t *vm = xm->vlib_main; + ixge_tx_descriptor_t *d, *d_sop; + u32 n_left = n_descriptors; + u32 *to_free = vec_end (xm->tx_buffers_pending_free); + u32 *to_tx = + vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index); + u32 is_sop = tx_state->is_start_of_packet; + u32 len_sop = tx_state->n_bytes_in_packet; + u16 template_status = xm->tx_descriptor_template.status0; + u32 descriptor_prefetch_rotor = 0; + + ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors); + d = &dq->descriptors[start_descriptor_index].tx; + d_sop = is_sop ? d : tx_state->start_of_packet_descriptor; + + while (n_left >= 4) + { + vlib_buffer_t *b0, *b1; + u32 bi0, fi0, len0; + u32 bi1, fi1, len1; + u8 is_eop0, is_eop1; + + /* Prefetch next iteration. */ + vlib_prefetch_buffer_with_index (vm, buffers[2], LOAD); + vlib_prefetch_buffer_with_index (vm, buffers[3], LOAD); + + if ((descriptor_prefetch_rotor & 0x3) == 0) + CLIB_PREFETCH (d + 4, CLIB_CACHE_LINE_BYTES, STORE); + + descriptor_prefetch_rotor += 2; + + bi0 = buffers[0]; + bi1 = buffers[1]; + + to_free[0] = fi0 = to_tx[0]; + to_tx[0] = bi0; + to_free += fi0 != 0; + + to_free[0] = fi1 = to_tx[1]; + to_tx[1] = bi1; + to_free += fi1 != 0; + + buffers += 2; + n_left -= 2; + to_tx += 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + is_eop1 = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + len0 = b0->current_length; + len1 = b1->current_length; + + ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0)); + ASSERT (ixge_tx_descriptor_matches_template (xm, d + 1)); + + d[0].buffer_address = + vlib_get_buffer_data_physical_address (vm, bi0) + b0->current_data; + d[1].buffer_address = + vlib_get_buffer_data_physical_address (vm, bi1) + b1->current_data; + + d[0].n_bytes_this_buffer = len0; + d[1].n_bytes_this_buffer = len1; + + d[0].status0 = + template_status | (is_eop0 << + IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); + d[1].status0 = + template_status | (is_eop1 << + IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); + + len_sop = (is_sop ? 0 : len_sop) + len0; + d_sop[0].status1 = + IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); + d += 1; + d_sop = is_eop0 ? d : d_sop; + + is_sop = is_eop0; + + len_sop = (is_sop ? 0 : len_sop) + len1; + d_sop[0].status1 = + IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); + d += 1; + d_sop = is_eop1 ? d : d_sop; + + is_sop = is_eop1; + } + + while (n_left > 0) + { + vlib_buffer_t *b0; + u32 bi0, fi0, len0; + u8 is_eop0; + + bi0 = buffers[0]; + + to_free[0] = fi0 = to_tx[0]; + to_tx[0] = bi0; + to_free += fi0 != 0; + + buffers += 1; + n_left -= 1; + to_tx += 1; + + b0 = vlib_get_buffer (vm, bi0); + + is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + len0 = b0->current_length; + + ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0)); + + d[0].buffer_address = + vlib_get_buffer_data_physical_address (vm, bi0) + b0->current_data; + + d[0].n_bytes_this_buffer = len0; + + d[0].status0 = + template_status | (is_eop0 << + IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); + + len_sop = (is_sop ? 0 : len_sop) + len0; + d_sop[0].status1 = + IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); + d += 1; + d_sop = is_eop0 ? d : d_sop; + + is_sop = is_eop0; + } + + if (tx_state->node->flags & VLIB_NODE_FLAG_TRACE) + { + to_tx = + vec_elt_at_index (dq->descriptor_buffer_indices, + start_descriptor_index); + ixge_tx_trace (xm, xd, dq, tx_state, + &dq->descriptors[start_descriptor_index].tx, to_tx, + n_descriptors); + } + + _vec_len (xm->tx_buffers_pending_free) = + to_free - xm->tx_buffers_pending_free; + + /* When we are done d_sop can point to end of ring. Wrap it if so. */ + { + ixge_tx_descriptor_t *d_start = &dq->descriptors[0].tx; + + ASSERT (d_sop - d_start <= dq->n_descriptors); + d_sop = d_sop - d_start == dq->n_descriptors ? d_start : d_sop; + } + + tx_state->is_start_of_packet = is_sop; + tx_state->start_of_packet_descriptor = d_sop; + tx_state->n_bytes_in_packet = len_sop; + + return n_descriptors; +} + +static uword +ixge_interface_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + ixge_main_t *xm = &ixge_main; + vnet_interface_output_runtime_t *rd = (void *) node->runtime_data; + ixge_device_t *xd = vec_elt_at_index (xm->devices, rd->dev_instance); + ixge_dma_queue_t *dq; + u32 *from, n_left_tx, n_descriptors_to_tx, n_tail_drop; + u32 queue_index = 0; /* fixme parameter */ + ixge_tx_state_t tx_state; + + tx_state.node = node; + tx_state.is_start_of_packet = 1; + tx_state.start_of_packet_descriptor = 0; + tx_state.n_bytes_in_packet = 0; + + from = vlib_frame_vector_args (f); + + dq = vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index); + + dq->head_index = dq->tx.head_index_write_back[0]; + + /* Since head == tail means ring is empty we can send up to dq->n_descriptors - 1. */ + n_left_tx = dq->n_descriptors - 1; + n_left_tx -= ixge_ring_sub (dq, dq->head_index, dq->tail_index); + + _vec_len (xm->tx_buffers_pending_free) = 0; + + n_descriptors_to_tx = f->n_vectors; + n_tail_drop = 0; + if (PREDICT_FALSE (n_descriptors_to_tx > n_left_tx)) + { + i32 i, n_ok, i_eop, i_sop; + + i_sop = i_eop = ~0; + for (i = n_left_tx - 1; i >= 0; i--) + { + vlib_buffer_t *b = vlib_get_buffer (vm, from[i]); + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + if (i_sop != ~0 && i_eop != ~0) + break; + i_eop = i; + i_sop = i + 1; + } + } + if (i == 0) + n_ok = 0; + else + n_ok = i_eop + 1; + + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, ring full to tx %d head %d tail %d",.format_args = + "i2i2i2i2",}; + struct + { + u16 instance, to_tx, head, tail; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->to_tx = n_descriptors_to_tx; + ed->head = dq->head_index; + ed->tail = dq->tail_index; + } + + if (n_ok < n_descriptors_to_tx) + { + n_tail_drop = n_descriptors_to_tx - n_ok; + vec_add (xm->tx_buffers_pending_free, from + n_ok, n_tail_drop); + vlib_error_count (vm, ixge_input_node.index, + IXGE_ERROR_tx_full_drops, n_tail_drop); + } + + n_descriptors_to_tx = n_ok; + } + + dq->tx.n_buffers_on_ring += n_descriptors_to_tx; + + /* Process from tail to end of descriptor ring. */ + if (n_descriptors_to_tx > 0 && dq->tail_index < dq->n_descriptors) + { + u32 n = + clib_min (dq->n_descriptors - dq->tail_index, n_descriptors_to_tx); + n = ixge_tx_no_wrap (xm, xd, dq, from, dq->tail_index, n, &tx_state); + from += n; + n_descriptors_to_tx -= n; + dq->tail_index += n; + ASSERT (dq->tail_index <= dq->n_descriptors); + if (dq->tail_index == dq->n_descriptors) + dq->tail_index = 0; + } + + if (n_descriptors_to_tx > 0) + { + u32 n = + ixge_tx_no_wrap (xm, xd, dq, from, 0, n_descriptors_to_tx, &tx_state); + from += n; + ASSERT (n == n_descriptors_to_tx); + dq->tail_index += n; + ASSERT (dq->tail_index <= dq->n_descriptors); + if (dq->tail_index == dq->n_descriptors) + dq->tail_index = 0; + } + + /* We should only get full packets. */ + ASSERT (tx_state.is_start_of_packet); + + /* Report status when last descriptor is done. */ + { + u32 i = dq->tail_index == 0 ? dq->n_descriptors - 1 : dq->tail_index - 1; + ixge_tx_descriptor_t *d = &dq->descriptors[i].tx; + d->status0 |= IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS; + } + + /* Give new descriptors to hardware. */ + { + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_TX, queue_index); + + CLIB_MEMORY_BARRIER (); + + dr->tail_index = dq->tail_index; + } + + /* Free any buffers that are done. */ + { + u32 n = _vec_len (xm->tx_buffers_pending_free); + if (n > 0) + { + vlib_buffer_free_no_next (vm, xm->tx_buffers_pending_free, n); + _vec_len (xm->tx_buffers_pending_free) = 0; + ASSERT (dq->tx.n_buffers_on_ring >= n); + dq->tx.n_buffers_on_ring -= (n - n_tail_drop); + } + } + + return f->n_vectors; +} + +static uword +ixge_rx_queue_no_wrap (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + u32 start_descriptor_index, u32 n_descriptors) +{ + vlib_main_t *vm = xm->vlib_main; + vlib_node_runtime_t *node = dq->rx.node; + ixge_descriptor_t *d; + static ixge_descriptor_t *d_trace_save; + static u32 *d_trace_buffers; + u32 n_descriptors_left = n_descriptors; + u32 *to_rx = + vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index); + u32 *to_add; + u32 bi_sop = dq->rx.saved_start_of_packet_buffer_index; + u32 bi_last = dq->rx.saved_last_buffer_index; + u32 next_index_sop = dq->rx.saved_start_of_packet_next_index; + u32 is_sop = dq->rx.is_start_of_packet; + u32 next_index, n_left_to_next, *to_next; + u32 n_packets = 0; + u32 n_bytes = 0; + u32 n_trace = vlib_get_trace_count (vm, node); + vlib_buffer_t *b_last, b_dummy; + + ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors); + d = &dq->descriptors[start_descriptor_index]; + + b_last = bi_last != ~0 ? vlib_get_buffer (vm, bi_last) : &b_dummy; + next_index = dq->rx.next_index; + + if (n_trace > 0) + { + u32 n = clib_min (n_trace, n_descriptors); + if (d_trace_save) + { + _vec_len (d_trace_save) = 0; + _vec_len (d_trace_buffers) = 0; + } + vec_add (d_trace_save, (ixge_descriptor_t *) d, n); + vec_add (d_trace_buffers, to_rx, n); + } + + { + uword l = vec_len (xm->rx_buffers_to_add); + + if (l < n_descriptors_left) + { + u32 n_to_alloc = 2 * dq->n_descriptors - l; + u32 n_allocated; + + vec_resize (xm->rx_buffers_to_add, n_to_alloc); + + _vec_len (xm->rx_buffers_to_add) = l; + n_allocated = vlib_buffer_alloc_from_free_list + (vm, xm->rx_buffers_to_add + l, n_to_alloc, + xm->vlib_buffer_free_list_index); + _vec_len (xm->rx_buffers_to_add) += n_allocated; + + /* Handle transient allocation failure */ + if (PREDICT_FALSE (l + n_allocated <= n_descriptors_left)) + { + if (n_allocated == 0) + vlib_error_count (vm, ixge_input_node.index, + IXGE_ERROR_rx_alloc_no_physmem, 1); + else + vlib_error_count (vm, ixge_input_node.index, + IXGE_ERROR_rx_alloc_fail, 1); + + n_descriptors_left = l + n_allocated; + } + n_descriptors = n_descriptors_left; + } + + /* Add buffers from end of vector going backwards. */ + to_add = vec_end (xm->rx_buffers_to_add) - 1; + } + + while (n_descriptors_left > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_descriptors_left >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t *b0, *b1; + u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0; + u32 bi1, fi1, len1, l3_offset1, s21, s01, flags1; + u8 is_eop0, error0, next0; + u8 is_eop1, error1, next1; + ixge_descriptor_t d0, d1; + + vlib_prefetch_buffer_with_index (vm, to_rx[2], STORE); + vlib_prefetch_buffer_with_index (vm, to_rx[3], STORE); + + CLIB_PREFETCH (d + 2, 32, STORE); + + d0.as_u32x4 = d[0].as_u32x4; + d1.as_u32x4 = d[1].as_u32x4; + + s20 = d0.rx_from_hw.status[2]; + s21 = d1.rx_from_hw.status[2]; + + s00 = d0.rx_from_hw.status[0]; + s01 = d1.rx_from_hw.status[0]; + + if (! + ((s20 & s21) & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE)) + goto found_hw_owned_descriptor_x2; + + bi0 = to_rx[0]; + bi1 = to_rx[1]; + + ASSERT (to_add - 1 >= xm->rx_buffers_to_add); + fi0 = to_add[0]; + fi1 = to_add[-1]; + + to_rx[0] = fi0; + to_rx[1] = fi1; + to_rx += 2; + to_add -= 2; + + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, bi0)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, bi1)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, fi0)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, fi1)); + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... See main.c... + */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); + + CLIB_PREFETCH (b0->data, CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (b1->data, CLIB_CACHE_LINE_BYTES, LOAD); + + is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; + is_eop1 = (s21 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; + + ixge_rx_next_and_error_from_status_x2 (xd, s00, s20, s01, s21, + &next0, &error0, &flags0, + &next1, &error1, &flags1); + + next0 = is_sop ? next0 : next_index_sop; + next1 = is_eop0 ? next1 : next0; + next_index_sop = next1; + + b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT); + b1->flags |= flags1 | (!is_eop1 << VLIB_BUFFER_LOG2_NEXT_PRESENT); + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b1)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; + + b0->error = node->errors[error0]; + b1->error = node->errors[error1]; + + len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor; + len1 = d1.rx_from_hw.n_packet_bytes_this_descriptor; + n_bytes += len0 + len1; + n_packets += is_eop0 + is_eop1; + + /* Give new buffers to hardware. */ + d0.rx_to_hw.tail_address = + vlib_get_buffer_data_physical_address (vm, fi0); + d1.rx_to_hw.tail_address = + vlib_get_buffer_data_physical_address (vm, fi1); + d0.rx_to_hw.head_address = d[0].rx_to_hw.tail_address; + d1.rx_to_hw.head_address = d[1].rx_to_hw.tail_address; + d[0].as_u32x4 = d0.as_u32x4; + d[1].as_u32x4 = d1.as_u32x4; + + d += 2; + n_descriptors_left -= 2; + + /* Point to either l2 or l3 header depending on next. */ + l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT)) + ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0; + l3_offset1 = (is_eop0 && (next1 != IXGE_RX_NEXT_ETHERNET_INPUT)) + ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s01) : 0; + + b0->current_length = len0 - l3_offset0; + b1->current_length = len1 - l3_offset1; + b0->current_data = l3_offset0; + b1->current_data = l3_offset1; + + b_last->next_buffer = is_sop ? ~0 : bi0; + b0->next_buffer = is_eop0 ? ~0 : bi1; + bi_last = bi1; + b_last = b1; + + if (CLIB_DEBUG > 0) + { + u32 bi_sop0 = is_sop ? bi0 : bi_sop; + u32 bi_sop1 = is_eop0 ? bi1 : bi_sop0; + + if (is_eop0) + { + u8 *msg = vlib_validate_buffer (vm, bi_sop0, + /* follow_buffer_next */ 1); + ASSERT (!msg); + } + if (is_eop1) + { + u8 *msg = vlib_validate_buffer (vm, bi_sop1, + /* follow_buffer_next */ 1); + ASSERT (!msg); + } + } + if (0) /* "Dave" version */ + { + u32 bi_sop0 = is_sop ? bi0 : bi_sop; + u32 bi_sop1 = is_eop0 ? bi1 : bi_sop0; + + if (is_eop0) + { + to_next[0] = bi_sop0; + to_next++; + n_left_to_next--; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi_sop0, next0); + } + if (is_eop1) + { + to_next[0] = bi_sop1; + to_next++; + n_left_to_next--; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi_sop1, next1); + } + is_sop = is_eop1; + bi_sop = bi_sop1; + } + if (1) /* "Eliot" version */ + { + /* Speculatively enqueue to cached next. */ + u8 saved_is_sop = is_sop; + u32 bi_sop_save = bi_sop; + + bi_sop = saved_is_sop ? bi0 : bi_sop; + to_next[0] = bi_sop; + to_next += is_eop0; + n_left_to_next -= is_eop0; + + bi_sop = is_eop0 ? bi1 : bi_sop; + to_next[0] = bi_sop; + to_next += is_eop1; + n_left_to_next -= is_eop1; + + is_sop = is_eop1; + + if (PREDICT_FALSE + (!(next0 == next_index && next1 == next_index))) + { + /* Undo speculation. */ + to_next -= is_eop0 + is_eop1; + n_left_to_next += is_eop0 + is_eop1; + + /* Re-do both descriptors being careful about where we enqueue. */ + bi_sop = saved_is_sop ? bi0 : bi_sop_save; + if (is_eop0) + { + if (next0 != next_index) + vlib_set_next_frame_buffer (vm, node, next0, bi_sop); + else + { + to_next[0] = bi_sop; + to_next += 1; + n_left_to_next -= 1; + } + } + + bi_sop = is_eop0 ? bi1 : bi_sop; + if (is_eop1) + { + if (next1 != next_index) + vlib_set_next_frame_buffer (vm, node, next1, bi_sop); + else + { + to_next[0] = bi_sop; + to_next += 1; + n_left_to_next -= 1; + } + } + + /* Switch cached next index when next for both packets is the same. */ + if (is_eop0 && is_eop1 && next0 == next1) + { + vlib_put_next_frame (vm, node, next_index, + n_left_to_next); + next_index = next0; + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + } + } + } + } + + /* Bail out of dual loop and proceed with single loop. */ + found_hw_owned_descriptor_x2: + + while (n_descriptors_left > 0 && n_left_to_next > 0) + { + vlib_buffer_t *b0; + u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0; + u8 is_eop0, error0, next0; + ixge_descriptor_t d0; + + d0.as_u32x4 = d[0].as_u32x4; + + s20 = d0.rx_from_hw.status[2]; + s00 = d0.rx_from_hw.status[0]; + + if (!(s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE)) + goto found_hw_owned_descriptor_x1; + + bi0 = to_rx[0]; + ASSERT (to_add >= xm->rx_buffers_to_add); + fi0 = to_add[0]; + + to_rx[0] = fi0; + to_rx += 1; + to_add -= 1; + + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, bi0)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, fi0)); + + b0 = vlib_get_buffer (vm, bi0); + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... + */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + + is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; + ixge_rx_next_and_error_from_status_x1 + (xd, s00, s20, &next0, &error0, &flags0); + + next0 = is_sop ? next0 : next_index_sop; + next_index_sop = next0; + + b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT); + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + + b0->error = node->errors[error0]; + + len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor; + n_bytes += len0; + n_packets += is_eop0; + + /* Give new buffer to hardware. */ + d0.rx_to_hw.tail_address = + vlib_get_buffer_data_physical_address (vm, fi0); + d0.rx_to_hw.head_address = d0.rx_to_hw.tail_address; + d[0].as_u32x4 = d0.as_u32x4; + + d += 1; + n_descriptors_left -= 1; + + /* Point to either l2 or l3 header depending on next. */ + l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT)) + ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0; + b0->current_length = len0 - l3_offset0; + b0->current_data = l3_offset0; + + b_last->next_buffer = is_sop ? ~0 : bi0; + bi_last = bi0; + b_last = b0; + + bi_sop = is_sop ? bi0 : bi_sop; + + if (CLIB_DEBUG > 0 && is_eop0) + { + u8 *msg = + vlib_validate_buffer (vm, bi_sop, /* follow_buffer_next */ 1); + ASSERT (!msg); + } + + if (0) /* "Dave" version */ + { + if (is_eop0) + { + to_next[0] = bi_sop; + to_next++; + n_left_to_next--; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi_sop, next0); + } + } + if (1) /* "Eliot" version */ + { + if (PREDICT_TRUE (next0 == next_index)) + { + to_next[0] = bi_sop; + to_next += is_eop0; + n_left_to_next -= is_eop0; + } + else + { + if (next0 != next_index && is_eop0) + vlib_set_next_frame_buffer (vm, node, next0, bi_sop); + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + next_index = next0; + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + } + } + is_sop = is_eop0; + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + +found_hw_owned_descriptor_x1: + if (n_descriptors_left > 0) + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + + _vec_len (xm->rx_buffers_to_add) = (to_add + 1) - xm->rx_buffers_to_add; + + { + u32 n_done = n_descriptors - n_descriptors_left; + + if (n_trace > 0 && n_done > 0) + { + u32 n = clib_min (n_trace, n_done); + ixge_rx_trace (xm, xd, dq, + d_trace_save, + d_trace_buffers, + &dq->descriptors[start_descriptor_index], n); + vlib_set_trace_count (vm, node, n_trace - n); + } + if (d_trace_save) + { + _vec_len (d_trace_save) = 0; + _vec_len (d_trace_buffers) = 0; + } + + /* Don't keep a reference to b_last if we don't have to. + Otherwise we can over-write a next_buffer pointer after already haven + enqueued a packet. */ + if (is_sop) + { + b_last->next_buffer = ~0; + bi_last = ~0; + } + + dq->rx.n_descriptors_done_this_call = n_done; + dq->rx.n_descriptors_done_total += n_done; + dq->rx.is_start_of_packet = is_sop; + dq->rx.saved_start_of_packet_buffer_index = bi_sop; + dq->rx.saved_last_buffer_index = bi_last; + dq->rx.saved_start_of_packet_next_index = next_index_sop; + dq->rx.next_index = next_index; + dq->rx.n_bytes += n_bytes; + + return n_packets; + } +} + +static uword +ixge_rx_queue (ixge_main_t * xm, + ixge_device_t * xd, + vlib_node_runtime_t * node, u32 queue_index) +{ + ixge_dma_queue_t *dq = + vec_elt_at_index (xd->dma_queues[VLIB_RX], queue_index); + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, dq->queue_index); + uword n_packets = 0; + u32 hw_head_index, sw_head_index; + + /* One time initialization. */ + if (!dq->rx.node) + { + dq->rx.node = node; + dq->rx.is_start_of_packet = 1; + dq->rx.saved_start_of_packet_buffer_index = ~0; + dq->rx.saved_last_buffer_index = ~0; + } + + dq->rx.next_index = node->cached_next_index; + + dq->rx.n_descriptors_done_total = 0; + dq->rx.n_descriptors_done_this_call = 0; + dq->rx.n_bytes = 0; + + /* Fetch head from hardware and compare to where we think we are. */ + hw_head_index = dr->head_index; + sw_head_index = dq->head_index; + + if (hw_head_index == sw_head_index) + goto done; + + if (hw_head_index < sw_head_index) + { + u32 n_tried = dq->n_descriptors - sw_head_index; + n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried); + sw_head_index = + ixge_ring_add (dq, sw_head_index, + dq->rx.n_descriptors_done_this_call); + + if (dq->rx.n_descriptors_done_this_call != n_tried) + goto done; + } + if (hw_head_index >= sw_head_index) + { + u32 n_tried = hw_head_index - sw_head_index; + n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried); + sw_head_index = + ixge_ring_add (dq, sw_head_index, + dq->rx.n_descriptors_done_this_call); + } + +done: + dq->head_index = sw_head_index; + dq->tail_index = + ixge_ring_add (dq, dq->tail_index, dq->rx.n_descriptors_done_total); + + /* Give tail back to hardware. */ + CLIB_MEMORY_BARRIER (); + + dr->tail_index = dq->tail_index; + + vlib_increment_combined_counter (vnet_main. + interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + 0 /* cpu_index */ , + xd->vlib_sw_if_index, n_packets, + dq->rx.n_bytes); + + return n_packets; +} + +static void +ixge_interrupt (ixge_main_t * xm, ixge_device_t * xd, u32 i) +{ + vlib_main_t *vm = xm->vlib_main; + ixge_regs_t *r = xd->regs; + + if (i != 20) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, %s",.format_args = "i1t1",.n_enum_strings = + 16,.enum_strings = + { + "flow director", + "rx miss", + "pci exception", + "mailbox", + "link status change", + "linksec key exchange", + "manageability event", + "reserved23", + "sdp0", + "sdp1", + "sdp2", + "sdp3", + "ecc", "descriptor handler error", "tcp timer", "other",},}; + struct + { + u8 instance; + u8 index; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->index = i - 16; + } + else + { + u32 v = r->xge_mac.link_status; + uword is_up = (v & (1 << 30)) != 0; + + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, link status change 0x%x",.format_args = "i4i4",}; + struct + { + u32 instance, link_status; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->link_status = v; + xd->link_status_at_last_link_change = v; + + vlib_process_signal_event (vm, ixge_process_node.index, + EVENT_SET_FLAGS, + ((is_up << 31) | xd->vlib_hw_if_index)); + } +} + +always_inline u32 +clean_block (u32 * b, u32 * t, u32 n_left) +{ + u32 *t0 = t; + + while (n_left >= 4) + { + u32 bi0, bi1, bi2, bi3; + + t[0] = bi0 = b[0]; + b[0] = 0; + t += bi0 != 0; + + t[0] = bi1 = b[1]; + b[1] = 0; + t += bi1 != 0; + + t[0] = bi2 = b[2]; + b[2] = 0; + t += bi2 != 0; + + t[0] = bi3 = b[3]; + b[3] = 0; + t += bi3 != 0; + + b += 4; + n_left -= 4; + } + + while (n_left > 0) + { + u32 bi0; + + t[0] = bi0 = b[0]; + b[0] = 0; + t += bi0 != 0; + b += 1; + n_left -= 1; + } + + return t - t0; +} + +static void +ixge_tx_queue (ixge_main_t * xm, ixge_device_t * xd, u32 queue_index) +{ + vlib_main_t *vm = xm->vlib_main; + ixge_dma_queue_t *dq = + vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index); + u32 n_clean, *b, *t, *t0; + i32 n_hw_owned_descriptors; + i32 first_to_clean, last_to_clean; + u64 hwbp_race = 0; + + /* Handle case where head write back pointer update + * arrives after the interrupt during high PCI bus loads. + */ + while ((dq->head_index == dq->tx.head_index_write_back[0]) && + dq->tx.n_buffers_on_ring && (dq->head_index != dq->tail_index)) + { + hwbp_race++; + if (IXGE_HWBP_RACE_ELOG && (hwbp_race == 1)) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d tx head index race: head %4d, tail %4d, buffs %4d",.format_args + = "i4i4i4i4",}; + struct + { + u32 instance, head_index, tail_index, n_buffers_on_ring; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->head_index = dq->head_index; + ed->tail_index = dq->tail_index; + ed->n_buffers_on_ring = dq->tx.n_buffers_on_ring; + } + } + + dq->head_index = dq->tx.head_index_write_back[0]; + n_hw_owned_descriptors = ixge_ring_sub (dq, dq->head_index, dq->tail_index); + ASSERT (dq->tx.n_buffers_on_ring >= n_hw_owned_descriptors); + n_clean = dq->tx.n_buffers_on_ring - n_hw_owned_descriptors; + + if (IXGE_HWBP_RACE_ELOG && hwbp_race) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d tx head index race: head %4d, hw_owned %4d, n_clean %4d, retries %d",.format_args + = "i4i4i4i4i4",}; + struct + { + u32 instance, head_index, n_hw_owned_descriptors, n_clean, retries; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->head_index = dq->head_index; + ed->n_hw_owned_descriptors = n_hw_owned_descriptors; + ed->n_clean = n_clean; + ed->retries = hwbp_race; + } + + /* + * This function used to wait until hardware owned zero descriptors. + * At high PPS rates, that doesn't happen until the TX ring is + * completely full of descriptors which need to be cleaned up. + * That, in turn, causes TX ring-full drops and/or long RX service + * interruptions. + */ + if (n_clean == 0) + return; + + /* Clean the n_clean descriptors prior to the reported hardware head */ + last_to_clean = dq->head_index - 1; + last_to_clean = (last_to_clean < 0) ? last_to_clean + dq->n_descriptors : + last_to_clean; + + first_to_clean = (last_to_clean) - (n_clean - 1); + first_to_clean = (first_to_clean < 0) ? first_to_clean + dq->n_descriptors : + first_to_clean; + + vec_resize (xm->tx_buffers_pending_free, dq->n_descriptors - 1); + t0 = t = xm->tx_buffers_pending_free; + b = dq->descriptor_buffer_indices + first_to_clean; + + /* Wrap case: clean from first to end, then start to last */ + if (first_to_clean > last_to_clean) + { + t += clean_block (b, t, (dq->n_descriptors - 1) - first_to_clean); + first_to_clean = 0; + b = dq->descriptor_buffer_indices; + } + + /* Typical case: clean from first to last */ + if (first_to_clean <= last_to_clean) + t += clean_block (b, t, (last_to_clean - first_to_clean) + 1); + + if (t > t0) + { + u32 n = t - t0; + vlib_buffer_free_no_next (vm, t0, n); + ASSERT (dq->tx.n_buffers_on_ring >= n); + dq->tx.n_buffers_on_ring -= n; + _vec_len (xm->tx_buffers_pending_free) = 0; + } +} + +/* RX queue interrupts 0 thru 7; TX 8 thru 15. */ +always_inline uword +ixge_interrupt_is_rx_queue (uword i) +{ + return i < 8; +} + +always_inline uword +ixge_interrupt_is_tx_queue (uword i) +{ + return i >= 8 && i < 16; +} + +always_inline uword +ixge_tx_queue_to_interrupt (uword i) +{ + return 8 + i; +} + +always_inline uword +ixge_rx_queue_to_interrupt (uword i) +{ + return 0 + i; +} + +always_inline uword +ixge_interrupt_rx_queue (uword i) +{ + ASSERT (ixge_interrupt_is_rx_queue (i)); + return i - 0; +} + +always_inline uword +ixge_interrupt_tx_queue (uword i) +{ + ASSERT (ixge_interrupt_is_tx_queue (i)); + return i - 8; +} + +static uword +ixge_device_input (ixge_main_t * xm, + ixge_device_t * xd, vlib_node_runtime_t * node) +{ + ixge_regs_t *r = xd->regs; + u32 i, s; + uword n_rx_packets = 0; + + s = r->interrupt.status_write_1_to_set; + if (s) + r->interrupt.status_write_1_to_clear = s; + + /* *INDENT-OFF* */ + foreach_set_bit (i, s, ({ + if (ixge_interrupt_is_rx_queue (i)) + n_rx_packets += ixge_rx_queue (xm, xd, node, ixge_interrupt_rx_queue (i)); + + else if (ixge_interrupt_is_tx_queue (i)) + ixge_tx_queue (xm, xd, ixge_interrupt_tx_queue (i)); + + else + ixge_interrupt (xm, xd, i); + })); + /* *INDENT-ON* */ + + return n_rx_packets; +} + +static uword +ixge_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd; + uword n_rx_packets = 0; + + if (node->state == VLIB_NODE_STATE_INTERRUPT) + { + uword i; + + /* Loop over devices with interrupts. */ + /* *INDENT-OFF* */ + foreach_set_bit (i, node->runtime_data[0], ({ + xd = vec_elt_at_index (xm->devices, i); + n_rx_packets += ixge_device_input (xm, xd, node); + + /* Re-enable interrupts since we're going to stay in interrupt mode. */ + if (! (node->flags & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)) + xd->regs->interrupt.enable_write_1_to_set = ~0; + })); + /* *INDENT-ON* */ + + /* Clear mask of devices with pending interrupts. */ + node->runtime_data[0] = 0; + } + else + { + /* Poll all devices for input/interrupts. */ + vec_foreach (xd, xm->devices) + { + n_rx_packets += ixge_device_input (xm, xd, node); + + /* Re-enable interrupts when switching out of polling mode. */ + if (node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) + xd->regs->interrupt.enable_write_1_to_set = ~0; + } + } + + return n_rx_packets; +} + +static char *ixge_error_strings[] = { +#define _(n,s) s, + foreach_ixge_error +#undef _ +}; + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ixge_input_node, static) = { + .function = ixge_input, + .type = VLIB_NODE_TYPE_INPUT, + .name = "ixge-input", + + /* Will be enabled if/when hardware is detected. */ + .state = VLIB_NODE_STATE_DISABLED, + + .format_buffer = format_ethernet_header_with_length, + .format_trace = format_ixge_rx_dma_trace, + + .n_errors = IXGE_N_ERROR, + .error_strings = ixge_error_strings, + + .n_next_nodes = IXGE_RX_N_NEXT, + .next_nodes = { + [IXGE_RX_NEXT_DROP] = "error-drop", + [IXGE_RX_NEXT_ETHERNET_INPUT] = "ethernet-input", + [IXGE_RX_NEXT_IP4_INPUT] = "ip4-input", + [IXGE_RX_NEXT_IP6_INPUT] = "ip6-input", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH_CLONE (ixge_input) +CLIB_MULTIARCH_SELECT_FN (ixge_input) +/* *INDENT-ON* */ + +static u8 * +format_ixge_device_name (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, i); + return format (s, "TenGigabitEthernet%U", + format_vlib_pci_handle, &xd->pci_device.bus_address); +} + +#define IXGE_COUNTER_IS_64_BIT (1 << 0) +#define IXGE_COUNTER_NOT_CLEAR_ON_READ (1 << 1) + +static u8 ixge_counter_flags[] = { +#define _(a,f) 0, +#define _64(a,f) IXGE_COUNTER_IS_64_BIT, + foreach_ixge_counter +#undef _ +#undef _64 +}; + +static void +ixge_update_counters (ixge_device_t * xd) +{ + /* Byte offset for counter registers. */ + static u32 reg_offsets[] = { +#define _(a,f) (a) / sizeof (u32), +#define _64(a,f) _(a,f) + foreach_ixge_counter +#undef _ +#undef _64 + }; + volatile u32 *r = (volatile u32 *) xd->regs; + int i; + + for (i = 0; i < ARRAY_LEN (xd->counters); i++) + { + u32 o = reg_offsets[i]; + xd->counters[i] += r[o]; + if (ixge_counter_flags[i] & IXGE_COUNTER_NOT_CLEAR_ON_READ) + r[o] = 0; + if (ixge_counter_flags[i] & IXGE_COUNTER_IS_64_BIT) + xd->counters[i] += (u64) r[o + 1] << (u64) 32; + } +} + +static u8 * +format_ixge_device_id (u8 * s, va_list * args) +{ + u32 device_id = va_arg (*args, u32); + char *t = 0; + switch (device_id) + { +#define _(f,n) case n: t = #f; break; + foreach_ixge_pci_device_id; +#undef _ + default: + t = 0; + break; + } + if (t == 0) + s = format (s, "unknown 0x%x", device_id); + else + s = format (s, "%s", t); + return s; +} + +static u8 * +format_ixge_link_status (u8 * s, va_list * args) +{ + ixge_device_t *xd = va_arg (*args, ixge_device_t *); + u32 v = xd->link_status_at_last_link_change; + + s = format (s, "%s", (v & (1 << 30)) ? "up" : "down"); + + { + char *modes[] = { + "1g", "10g parallel", "10g serial", "autoneg", + }; + char *speeds[] = { + "unknown", "100m", "1g", "10g", + }; + s = format (s, ", mode %s, speed %s", + modes[(v >> 26) & 3], speeds[(v >> 28) & 3]); + } + + return s; +} + +static u8 * +format_ixge_device (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + CLIB_UNUSED (int verbose) = va_arg (*args, int); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, dev_instance); + ixge_phy_t *phy = xd->phys + xd->phy_index; + uword indent = format_get_indent (s); + + ixge_update_counters (xd); + xd->link_status_at_last_link_change = xd->regs->xge_mac.link_status; + + s = format (s, "Intel 8259X: id %U\n%Ulink %U", + format_ixge_device_id, xd->device_id, + format_white_space, indent + 2, format_ixge_link_status, xd); + + { + + s = format (s, "\n%UPCIe %U", format_white_space, indent + 2, + format_vlib_pci_link_speed, &xd->pci_device); + } + + s = format (s, "\n%U", format_white_space, indent + 2); + if (phy->mdio_address != ~0) + s = format (s, "PHY address %d, id 0x%x", phy->mdio_address, phy->id); + else if (xd->sfp_eeprom.id == SFP_ID_sfp) + s = format (s, "SFP %U", format_sfp_eeprom, &xd->sfp_eeprom); + else + s = format (s, "PHY not found"); + + /* FIXME */ + { + ixge_dma_queue_t *dq = vec_elt_at_index (xd->dma_queues[VLIB_RX], 0); + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0); + u32 hw_head_index = dr->head_index; + u32 sw_head_index = dq->head_index; + u32 nitems; + + nitems = ixge_ring_sub (dq, hw_head_index, sw_head_index); + s = format (s, "\n%U%d unprocessed, %d total buffers on rx queue 0 ring", + format_white_space, indent + 2, nitems, dq->n_descriptors); + + s = format (s, "\n%U%d buffers in driver rx cache", + format_white_space, indent + 2, + vec_len (xm->rx_buffers_to_add)); + + s = format (s, "\n%U%d buffers on tx queue 0 ring", + format_white_space, indent + 2, + xd->dma_queues[VLIB_TX][0].tx.n_buffers_on_ring); + } + { + u32 i; + u64 v; + static char *names[] = { +#define _(a,f) #f, +#define _64(a,f) _(a,f) + foreach_ixge_counter +#undef _ +#undef _64 + }; + + for (i = 0; i < ARRAY_LEN (names); i++) + { + v = xd->counters[i] - xd->counters_last_clear[i]; + if (v != 0) + s = format (s, "\n%U%-40U%16Ld", + format_white_space, indent + 2, + format_c_identifier, names[i], v); + } + } + + return s; +} + +static void +ixge_clear_hw_interface_counters (u32 instance) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, instance); + ixge_update_counters (xd); + memcpy (xd->counters_last_clear, xd->counters, sizeof (xd->counters)); +} + +/* + * Dynamically redirect all pkts from a specific interface + * to the specified node + */ +static void +ixge_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, + u32 node_index) +{ + ixge_main_t *xm = &ixge_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + ixge_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance); + + /* Shut off redirection */ + if (node_index == ~0) + { + xd->per_interface_next_index = node_index; + return; + } + + xd->per_interface_next_index = + vlib_node_add_next (xm->vlib_main, ixge_input_node.index, node_index); +} + + +/* *INDENT-OFF* */ +VNET_DEVICE_CLASS (ixge_device_class) = { + .name = "ixge", + .tx_function = ixge_interface_tx, + .format_device_name = format_ixge_device_name, + .format_device = format_ixge_device, + .format_tx_trace = format_ixge_tx_dma_trace, + .clear_counters = ixge_clear_hw_interface_counters, + .admin_up_down_function = ixge_interface_admin_up_down, + .rx_redirect_to_node = ixge_set_interface_next_node, + .flatten_output_chains = 1, +}; +/* *INDENT-ON* */ + +#define IXGE_N_BYTES_IN_RX_BUFFER (2048) // DAW-HACK: Set Rx buffer size so all packets < ETH_MTU_SIZE fit in the buffer (i.e. sop & eop for all descriptors). + +static clib_error_t * +ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_dma_queue_t *dq; + clib_error_t *error = 0; + + vec_validate (xd->dma_queues[rt], queue_index); + dq = vec_elt_at_index (xd->dma_queues[rt], queue_index); + + if (!xm->n_descriptors_per_cache_line) + xm->n_descriptors_per_cache_line = + CLIB_CACHE_LINE_BYTES / sizeof (dq->descriptors[0]); + + if (!xm->n_bytes_in_rx_buffer) + xm->n_bytes_in_rx_buffer = IXGE_N_BYTES_IN_RX_BUFFER; + xm->n_bytes_in_rx_buffer = round_pow2 (xm->n_bytes_in_rx_buffer, 1024); + if (!xm->vlib_buffer_free_list_index) + { + xm->vlib_buffer_free_list_index = + vlib_buffer_get_or_create_free_list (vm, xm->n_bytes_in_rx_buffer, + "ixge rx"); + ASSERT (xm->vlib_buffer_free_list_index != 0); + } + + if (!xm->n_descriptors[rt]) + xm->n_descriptors[rt] = 4 * VLIB_FRAME_SIZE; + + dq->queue_index = queue_index; + dq->n_descriptors = + round_pow2 (xm->n_descriptors[rt], xm->n_descriptors_per_cache_line); + dq->head_index = dq->tail_index = 0; + + dq->descriptors = vlib_physmem_alloc_aligned (vm, &error, + dq->n_descriptors * + sizeof (dq->descriptors[0]), + 128 /* per chip spec */ ); + if (error) + return error; + + memset (dq->descriptors, 0, + dq->n_descriptors * sizeof (dq->descriptors[0])); + vec_resize (dq->descriptor_buffer_indices, dq->n_descriptors); + + if (rt == VLIB_RX) + { + u32 n_alloc, i; + + n_alloc = vlib_buffer_alloc_from_free_list + (vm, dq->descriptor_buffer_indices, + vec_len (dq->descriptor_buffer_indices), + xm->vlib_buffer_free_list_index); + ASSERT (n_alloc == vec_len (dq->descriptor_buffer_indices)); + for (i = 0; i < n_alloc; i++) + { + vlib_buffer_t *b = + vlib_get_buffer (vm, dq->descriptor_buffer_indices[i]); + dq->descriptors[i].rx_to_hw.tail_address = + vlib_physmem_virtual_to_physical (vm, b->data); + } + } + else + { + u32 i; + + dq->tx.head_index_write_back = + vlib_physmem_alloc (vm, &error, CLIB_CACHE_LINE_BYTES); + + for (i = 0; i < dq->n_descriptors; i++) + dq->descriptors[i].tx = xm->tx_descriptor_template; + + vec_validate (xm->tx_buffers_pending_free, dq->n_descriptors - 1); + } + + { + ixge_dma_regs_t *dr = get_dma_regs (xd, rt, queue_index); + u64 a; + + a = vlib_physmem_virtual_to_physical (vm, dq->descriptors); + dr->descriptor_address[0] = a & 0xFFFFFFFF; + dr->descriptor_address[1] = a >> (u64) 32; + dr->n_descriptor_bytes = dq->n_descriptors * sizeof (dq->descriptors[0]); + dq->head_index = dq->tail_index = 0; + + if (rt == VLIB_RX) + { + ASSERT ((xm->n_bytes_in_rx_buffer / 1024) < 32); + dr->rx_split_control = + ( /* buffer size */ ((xm->n_bytes_in_rx_buffer / 1024) << 0) + | ( /* lo free descriptor threshold (units of 64 descriptors) */ + (1 << 22)) | ( /* descriptor type: advanced one buffer */ + (1 << 25)) | ( /* drop if no descriptors available */ + (1 << 28))); + + /* Give hardware all but last 16 cache lines' worth of descriptors. */ + dq->tail_index = dq->n_descriptors - + 16 * xm->n_descriptors_per_cache_line; + } + else + { + /* Make sure its initialized before hardware can get to it. */ + dq->tx.head_index_write_back[0] = dq->head_index; + + a = + vlib_physmem_virtual_to_physical (vm, dq->tx.head_index_write_back); + dr->tx.head_index_write_back_address[0] = /* enable bit */ 1 | a; + dr->tx.head_index_write_back_address[1] = (u64) a >> (u64) 32; + } + + /* DMA on 82599 does not work with [13] rx data write relaxed ordering + and [12] undocumented set. */ + if (rt == VLIB_RX) + dr->dca_control &= ~((1 << 13) | (1 << 12)); + + CLIB_MEMORY_BARRIER (); + + if (rt == VLIB_TX) + { + xd->regs->tx_dma_control |= (1 << 0); + dr->control |= ((32 << 0) /* prefetch threshold */ + | (64 << 8) /* host threshold */ + | (0 << 16) /* writeback threshold */ ); + } + + /* Enable this queue and wait for hardware to initialize + before adding to tail. */ + if (rt == VLIB_TX) + { + dr->control |= 1 << 25; + while (!(dr->control & (1 << 25))) + ; + } + + /* Set head/tail indices and enable DMA. */ + dr->head_index = dq->head_index; + dr->tail_index = dq->tail_index; + } + + return error; +} + +static u32 +ixge_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags) +{ + ixge_device_t *xd; + ixge_regs_t *r; + u32 old; + ixge_main_t *xm = &ixge_main; + + xd = vec_elt_at_index (xm->devices, hw->dev_instance); + r = xd->regs; + + old = r->filter_control; + + if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) + r->filter_control = old | (1 << 9) /* unicast promiscuous */ ; + else + r->filter_control = old & ~(1 << 9); + + return old; +} + +static void +ixge_device_init (ixge_main_t * xm) +{ + vnet_main_t *vnm = vnet_get_main (); + ixge_device_t *xd; + + /* Reset chip(s). */ + vec_foreach (xd, xm->devices) + { + ixge_regs_t *r = xd->regs; + const u32 reset_bit = (1 << 26) | (1 << 3); + + r->control |= reset_bit; + + /* No need to suspend. Timed to take ~1e-6 secs */ + while (r->control & reset_bit) + ; + + /* Software loaded. */ + r->extended_control |= (1 << 28); + + ixge_phy_init (xd); + + /* Register ethernet interface. */ + { + u8 addr8[6]; + u32 i, addr32[2]; + clib_error_t *error; + + addr32[0] = r->rx_ethernet_address0[0][0]; + addr32[1] = r->rx_ethernet_address0[0][1]; + for (i = 0; i < 6; i++) + addr8[i] = addr32[i / 4] >> ((i % 4) * 8); + + error = ethernet_register_interface + (vnm, ixge_device_class.index, xd->device_index, + /* ethernet address */ addr8, + &xd->vlib_hw_if_index, ixge_flag_change); + if (error) + clib_error_report (error); + } + + { + vnet_sw_interface_t *sw = + vnet_get_hw_sw_interface (vnm, xd->vlib_hw_if_index); + xd->vlib_sw_if_index = sw->sw_if_index; + } + + ixge_dma_init (xd, VLIB_RX, /* queue_index */ 0); + + xm->n_descriptors[VLIB_TX] = 20 * VLIB_FRAME_SIZE; + + ixge_dma_init (xd, VLIB_TX, /* queue_index */ 0); + + /* RX/TX queue 0 gets mapped to interrupt bits 0 & 8. */ + r->interrupt.queue_mapping[0] = (( /* valid bit */ (1 << 7) | + ixge_rx_queue_to_interrupt (0)) << 0); + + r->interrupt.queue_mapping[0] |= (( /* valid bit */ (1 << 7) | + ixge_tx_queue_to_interrupt (0)) << 8); + + /* No use in getting too many interrupts. + Limit them to one every 3/4 ring size at line rate + min sized packets. + No need for this since kernel/vlib main loop provides adequate interrupt + limiting scheme. */ + if (0) + { + f64 line_rate_max_pps = + 10e9 / (8 * (64 + /* interframe padding */ 20)); + ixge_throttle_queue_interrupt (r, 0, + .75 * xm->n_descriptors[VLIB_RX] / + line_rate_max_pps); + } + + /* Accept all multicast and broadcast packets. Should really add them + to the dst_ethernet_address register array. */ + r->filter_control |= (1 << 10) | (1 << 8); + + /* Enable frames up to size in mac frame size register. */ + r->xge_mac.control |= 1 << 2; + r->xge_mac.rx_max_frame_size = (9216 + 14) << 16; + + /* Enable all interrupts. */ + if (!IXGE_ALWAYS_POLL) + r->interrupt.enable_write_1_to_set = ~0; + } +} + +static uword +ixge_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + vnet_main_t *vnm = vnet_get_main (); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd; + uword event_type, *event_data = 0; + f64 timeout, link_debounce_deadline; + + ixge_device_init (xm); + + /* Clear all counters. */ + vec_foreach (xd, xm->devices) + { + ixge_update_counters (xd); + memset (xd->counters, 0, sizeof (xd->counters)); + } + + timeout = 30.0; + link_debounce_deadline = 1e70; + + while (1) + { + /* 36 bit stat counters could overflow in ~50 secs. + We poll every 30 secs to be conservative. */ + vlib_process_wait_for_event_or_clock (vm, timeout); + + event_type = vlib_process_get_events (vm, &event_data); + + switch (event_type) + { + case EVENT_SET_FLAGS: + /* 1 ms */ + link_debounce_deadline = vlib_time_now (vm) + 1e-3; + timeout = 1e-3; + break; + + case ~0: + /* No events found: timer expired. */ + if (vlib_time_now (vm) > link_debounce_deadline) + { + vec_foreach (xd, xm->devices) + { + ixge_regs_t *r = xd->regs; + u32 v = r->xge_mac.link_status; + uword is_up = (v & (1 << 30)) != 0; + + vnet_hw_interface_set_flags + (vnm, xd->vlib_hw_if_index, + is_up ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0); + } + link_debounce_deadline = 1e70; + timeout = 30.0; + } + break; + + default: + ASSERT (0); + } + + if (event_data) + _vec_len (event_data) = 0; + + /* Query stats every 30 secs. */ + { + f64 now = vlib_time_now (vm); + if (now - xm->time_last_stats_update > 30) + { + xm->time_last_stats_update = now; + vec_foreach (xd, xm->devices) ixge_update_counters (xd); + } + } + } + + return 0; +} + +static vlib_node_registration_t ixge_process_node = { + .function = ixge_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "ixge-process", +}; + +clib_error_t * +ixge_init (vlib_main_t * vm) +{ + ixge_main_t *xm = &ixge_main; + clib_error_t *error; + + xm->vlib_main = vm; + memset (&xm->tx_descriptor_template, 0, + sizeof (xm->tx_descriptor_template)); + memset (&xm->tx_descriptor_template_mask, 0, + sizeof (xm->tx_descriptor_template_mask)); + xm->tx_descriptor_template.status0 = + (IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED | + IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED | + IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS); + xm->tx_descriptor_template_mask.status0 = 0xffff; + xm->tx_descriptor_template_mask.status1 = 0x00003fff; + + xm->tx_descriptor_template_mask.status0 &= + ~(IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET + | IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS); + xm->tx_descriptor_template_mask.status1 &= + ~(IXGE_TX_DESCRIPTOR_STATUS1_DONE); + + error = vlib_call_init_function (vm, pci_bus_init); + + return error; +} + +VLIB_INIT_FUNCTION (ixge_init); + + +static void +ixge_pci_intr_handler (vlib_pci_device_t * dev) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + + vlib_node_set_interrupt_pending (vm, ixge_input_node.index); + + /* Let node know which device is interrupting. */ + { + vlib_node_runtime_t *rt = + vlib_node_get_runtime (vm, ixge_input_node.index); + rt->runtime_data[0] |= 1 << dev->private_data; + } +} + +static clib_error_t * +ixge_pci_init (vlib_main_t * vm, vlib_pci_device_t * dev) +{ + ixge_main_t *xm = &ixge_main; + clib_error_t *error; + void *r; + ixge_device_t *xd; + + /* Device found: make sure we have dma memory. */ + if (unix_physmem_is_fake (vm)) + return clib_error_return (0, "no physical memory available"); + + error = vlib_pci_map_resource (dev, 0, &r); + if (error) + return error; + + vec_add2 (xm->devices, xd, 1); + + if (vec_len (xm->devices) == 1) + { + ixge_input_node.function = ixge_input_multiarch_select (); + } + + xd->pci_device = dev[0]; + xd->device_id = xd->pci_device.config0.header.device_id; + xd->regs = r; + xd->device_index = xd - xm->devices; + xd->pci_function = dev->bus_address.function; + xd->per_interface_next_index = ~0; + + + /* Chip found so enable node. */ + { + vlib_node_set_state (vm, ixge_input_node.index, + (IXGE_ALWAYS_POLL + ? VLIB_NODE_STATE_POLLING + : VLIB_NODE_STATE_INTERRUPT)); + + dev->private_data = xd->device_index; + } + + if (vec_len (xm->devices) == 1) + { + vlib_register_node (vm, &ixge_process_node); + xm->process_node_index = ixge_process_node.index; + } + + error = vlib_pci_bus_master_enable (dev); + + if (error) + return error; + + return vlib_pci_intr_enable (dev); +} + +/* *INDENT-OFF* */ +PCI_REGISTER_DEVICE (ixge_pci_device_registration,static) = { + .init_function = ixge_pci_init, + .interrupt_handler = ixge_pci_intr_handler, + .supported_devices = { +#define _(t,i) { .vendor_id = PCI_VENDOR_ID_INTEL, .device_id = i, }, + foreach_ixge_pci_device_id +#undef _ + { 0 }, + }, +}; +/* *INDENT-ON* */ + +void +ixge_set_next_node (ixge_rx_next_t next, char *name) +{ + vlib_node_registration_t *r = &ixge_input_node; + + switch (next) + { + case IXGE_RX_NEXT_IP4_INPUT: + case IXGE_RX_NEXT_IP6_INPUT: + case IXGE_RX_NEXT_ETHERNET_INPUT: + r->next_nodes[next] = name; + break; + + default: + clib_warning ("%s: illegal next %d\n", __FUNCTION__, next); + break; + } +} +#endif + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .default_disabled = 1, +}; + +/* *INDENT-ON* */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ixge/ixge.h b/src/plugins/ixge/ixge.h new file mode 100644 index 00000000..779603b3 --- /dev/null +++ b/src/plugins/ixge/ixge.h @@ -0,0 +1,1293 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_ixge_h +#define included_ixge_h + +#include +#include +#include +#include +#include +#include + +typedef volatile struct +{ + /* [31:7] 128 byte aligned. */ + u32 descriptor_address[2]; + u32 n_descriptor_bytes; + + /* [5] rx/tx descriptor dca enable + [6] rx packet head dca enable + [7] rx packet tail dca enable + [9] rx/tx descriptor relaxed order + [11] rx/tx descriptor write back relaxed order + [13] rx/tx data write/read relaxed order + [15] rx head data write relaxed order + [31:24] apic id for cpu's cache. */ + u32 dca_control; + + u32 head_index; + + /* [4:0] tail buffer size (in 1k byte units) + [13:8] head buffer size (in 64 byte units) + [24:22] lo free descriptors threshold (units of 64 descriptors) + [27:25] descriptor type 0 = legacy, 1 = advanced one buffer (e.g. tail), + 2 = advanced header splitting (head + tail), 5 = advanced header + splitting (head only). + [28] drop if no descriptors available. */ + u32 rx_split_control; + + u32 tail_index; + CLIB_PAD_FROM_TO (0x1c, 0x28); + + /* [7:0] rx/tx prefetch threshold + [15:8] rx/tx host threshold + [24:16] rx/tx write back threshold + [25] rx/tx enable + [26] tx descriptor writeback flush + [30] rx strip vlan enable */ + u32 control; + + u32 rx_coallesce_control; + + union + { + struct + { + /* packets bytes lo hi */ + u32 stats[3]; + + u32 unused; + } rx; + + struct + { + u32 unused[2]; + + /* [0] enables head write back. */ + u32 head_index_write_back_address[2]; + } tx; + }; +} ixge_dma_regs_t; + +/* Only advanced descriptors are supported. */ +typedef struct +{ + u64 tail_address; + u64 head_address; +} ixge_rx_to_hw_descriptor_t; + +typedef struct +{ + u32 status[3]; + u16 n_packet_bytes_this_descriptor; + u16 vlan_tag; +} ixge_rx_from_hw_descriptor_t; + +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2 (1 << (4 + 11)) +/* Valid if not layer2. */ +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4 (1 << (4 + 0)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT (1 << (4 + 1)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6 (1 << (4 + 2)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT (1 << (4 + 3)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP (1 << (4 + 4)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP (1 << (4 + 5)) +#define IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET(s) (((s) >> 21) & 0x3ff) + +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE (1 << (0 + 0)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET (1 << (0 + 1)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN (1 << (0 + 3)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED (1 << (0 + 4)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED (1 << (0 + 5)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED (1 << (0 + 6)) +#define IXGE_RX_DESCRIPTOR_STATUS2_NOT_UNICAST (1 << (0 + 7)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_DOUBLE_VLAN (1 << (0 + 9)) +#define IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR (1 << (0 + 10)) +#define IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR (1 << (20 + 9)) +#define IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR (1 << (20 + 10)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR (1 << (20 + 11)) + +/* For layer2 packets stats0 bottom 3 bits give ether type index from filter. */ +#define IXGE_RX_DESCRIPTOR_STATUS0_LAYER2_ETHERNET_TYPE(s) ((s) & 7) + +typedef struct +{ + u64 buffer_address; + u16 n_bytes_this_buffer; + u16 status0; + u32 status1; +#define IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED (3 << 4) +#define IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED (1 << (8 + 5)) +#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS (8 + 3) +#define IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS) +#define IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS (1 << (8 + 1)) +#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET (8 + 0) +#define IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET) +#define IXGE_TX_DESCRIPTOR_STATUS1_DONE (1 << 0) +#define IXGE_TX_DESCRIPTOR_STATUS1_CONTEXT(i) (/* valid */ (1 << 7) | ((i) << 4)) +#define IXGE_TX_DESCRIPTOR_STATUS1_IPSEC_OFFLOAD (1 << (8 + 2)) +#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_TCP_UDP_CHECKSUM (1 << (8 + 1)) +#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_IP4_CHECKSUM (1 << (8 + 0)) +#define IXGE_TX_DESCRIPTOR_STATUS0_N_BYTES_THIS_BUFFER(l) ((l) << 0) +#define IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET(l) ((l) << 14) +} ixge_tx_descriptor_t; + +typedef struct +{ + struct + { + u8 checksum_start_offset; + u8 checksum_insert_offset; + u16 checksum_end_offset; + } ip, tcp; + u32 status0; + + u8 status1; + + /* Byte offset after UDP/TCP header. */ + u8 payload_offset; + + u16 max_tcp_segment_size; +} __attribute__ ((packed)) ixge_tx_context_descriptor_t; + +typedef union +{ + ixge_rx_to_hw_descriptor_t rx_to_hw; + ixge_rx_from_hw_descriptor_t rx_from_hw; + ixge_tx_descriptor_t tx; + u32x4 as_u32x4; +} ixge_descriptor_t; + +typedef volatile struct +{ + /* [2] pcie master disable + [3] mac reset + [26] global device reset */ + u32 control; + u32 control_alias; + /* [3:2] device id (0 or 1 for dual port chips) + [7] link is up + [17:10] num vfs + [18] io active + [19] pcie master enable status */ + u32 status_read_only; + CLIB_PAD_FROM_TO (0xc, 0x18); + /* [14] pf reset done + [17] relaxed ordering disable + [26] extended vlan enable + [28] driver loaded */ + u32 extended_control; + CLIB_PAD_FROM_TO (0x1c, 0x20); + + /* software definable pins. + sdp_data [7:0] + sdp_is_output [15:8] + sdp_is_native [23:16] + sdp_function [31:24]. + */ + u32 sdp_control; + CLIB_PAD_FROM_TO (0x24, 0x28); + + /* [0] i2c clock in + [1] i2c clock out + [2] i2c data in + [3] i2c data out */ + u32 i2c_control; + CLIB_PAD_FROM_TO (0x2c, 0x4c); + u32 tcp_timer; + + CLIB_PAD_FROM_TO (0x50, 0x200); + + u32 led_control; + + CLIB_PAD_FROM_TO (0x204, 0x600); + u32 core_spare; + CLIB_PAD_FROM_TO (0x604, 0x700); + + struct + { + u32 vflr_events_clear[4]; + u32 mailbox_interrupt_status[4]; + u32 mailbox_interrupt_enable[4]; + CLIB_PAD_FROM_TO (0x730, 0x800); + } pf_foo; + + struct + { + u32 status_write_1_to_clear; + CLIB_PAD_FROM_TO (0x804, 0x808); + u32 status_write_1_to_set; + CLIB_PAD_FROM_TO (0x80c, 0x810); + u32 status_auto_clear_enable; + CLIB_PAD_FROM_TO (0x814, 0x820); + + /* [11:3] minimum inter-interrupt interval + (2e-6 units; 20e-6 units for fast ethernet). + [15] low-latency interrupt moderation enable + [20:16] low-latency interrupt credit + [27:21] interval counter + [31] write disable for credit and counter (write only). */ + u32 throttle0[24]; + + u32 enable_write_1_to_set; + CLIB_PAD_FROM_TO (0x884, 0x888); + u32 enable_write_1_to_clear; + CLIB_PAD_FROM_TO (0x88c, 0x890); + u32 enable_auto_clear; + u32 msi_to_eitr_select; + /* [3:0] spd 0-3 interrupt detection enable + [4] msi-x enable + [5] other clear disable (makes other bits in status not clear on read) + etc. */ + u32 control; + CLIB_PAD_FROM_TO (0x89c, 0x900); + + /* Defines interrupt mapping for 128 rx + 128 tx queues. + 64 x 4 8 bit entries. + For register [i]: + [5:0] bit in interrupt status for rx queue 2*i + 0 + [7] valid bit + [13:8] bit for tx queue 2*i + 0 + [15] valid bit + similar for rx 2*i + 1 and tx 2*i + 1. */ + u32 queue_mapping[64]; + + /* tcp timer [7:0] and other interrupts [15:8] */ + u32 misc_mapping; + CLIB_PAD_FROM_TO (0xa04, 0xa90); + + /* 64 interrupts determined by mappings. */ + u32 status1_write_1_to_clear[4]; + u32 enable1_write_1_to_set[4]; + u32 enable1_write_1_to_clear[4]; + CLIB_PAD_FROM_TO (0xac0, 0xad0); + u32 status1_enable_auto_clear[4]; + CLIB_PAD_FROM_TO (0xae0, 0x1000); + } interrupt; + + ixge_dma_regs_t rx_dma0[64]; + + CLIB_PAD_FROM_TO (0x2000, 0x2140); + u32 dcb_rx_packet_plane_t4_config[8]; + u32 dcb_rx_packet_plane_t4_status[8]; + CLIB_PAD_FROM_TO (0x2180, 0x2300); + + /* reg i defines mapping for 4 rx queues starting at 4*i + 0. */ + u32 rx_queue_stats_mapping[32]; + u32 rx_queue_stats_control; + + CLIB_PAD_FROM_TO (0x2384, 0x2410); + u32 fc_user_descriptor_ptr[2]; + u32 fc_buffer_control; + CLIB_PAD_FROM_TO (0x241c, 0x2420); + u32 fc_rx_dma; + CLIB_PAD_FROM_TO (0x2424, 0x2430); + u32 dcb_packet_plane_control; + CLIB_PAD_FROM_TO (0x2434, 0x2f00); + + u32 rx_dma_control; + u32 pf_queue_drop_enable; + CLIB_PAD_FROM_TO (0x2f08, 0x2f20); + u32 rx_dma_descriptor_cache_config; + CLIB_PAD_FROM_TO (0x2f24, 0x3000); + + /* 1 bit. */ + u32 rx_enable; + CLIB_PAD_FROM_TO (0x3004, 0x3008); + /* [15:0] ether type (little endian) + [31:16] opcode (big endian) */ + u32 flow_control_control; + CLIB_PAD_FROM_TO (0x300c, 0x3020); + /* 3 bit traffic class for each of 8 priorities. */ + u32 rx_priority_to_traffic_class; + CLIB_PAD_FROM_TO (0x3024, 0x3028); + u32 rx_coallesce_data_buffer_control; + CLIB_PAD_FROM_TO (0x302c, 0x3190); + u32 rx_packet_buffer_flush_detect; + CLIB_PAD_FROM_TO (0x3194, 0x3200); + u32 flow_control_tx_timers[4]; /* 2 timer values */ + CLIB_PAD_FROM_TO (0x3210, 0x3220); + u32 flow_control_rx_threshold_lo[8]; + CLIB_PAD_FROM_TO (0x3240, 0x3260); + u32 flow_control_rx_threshold_hi[8]; + CLIB_PAD_FROM_TO (0x3280, 0x32a0); + u32 flow_control_refresh_threshold; + CLIB_PAD_FROM_TO (0x32a4, 0x3c00); + /* For each of 8 traffic classes (units of bytes). */ + u32 rx_packet_buffer_size[8]; + CLIB_PAD_FROM_TO (0x3c20, 0x3d00); + u32 flow_control_config; + CLIB_PAD_FROM_TO (0x3d04, 0x4200); + + struct + { + u32 pcs_config; + CLIB_PAD_FROM_TO (0x4204, 0x4208); + u32 link_control; + u32 link_status; + u32 pcs_debug[2]; + u32 auto_negotiation; + u32 link_partner_ability; + u32 auto_negotiation_tx_next_page; + u32 auto_negotiation_link_partner_next_page; + CLIB_PAD_FROM_TO (0x4228, 0x4240); + } gige_mac; + + struct + { + /* [0] tx crc enable + [2] enable frames up to max frame size register [31:16] + [10] pad frames < 64 bytes if specified by user + [15] loopback enable + [16] mdc hi speed + [17] turn off mdc between mdio packets */ + u32 control; + + /* [5] rx symbol error (all bits clear on read) + [6] rx illegal symbol + [7] rx idle error + [8] rx local fault + [9] rx remote fault */ + u32 status; + + u32 pause_and_pace_control; + CLIB_PAD_FROM_TO (0x424c, 0x425c); + u32 phy_command; + u32 phy_data; + CLIB_PAD_FROM_TO (0x4264, 0x4268); + + /* [31:16] max frame size in bytes. */ + u32 rx_max_frame_size; + CLIB_PAD_FROM_TO (0x426c, 0x4288); + + /* [0] + [2] pcs receive link up? (latch lo) + [7] local fault + [1] + [0] pcs 10g base r capable + [1] pcs 10g base x capable + [2] pcs 10g base w capable + [10] rx local fault + [11] tx local fault + [15:14] 2 => device present at this address (else not present) */ + u32 xgxs_status[2]; + + u32 base_x_pcs_status; + + /* [0] pass unrecognized flow control frames + [1] discard pause frames + [2] rx priority flow control enable (only in dcb mode) + [3] rx flow control enable. */ + u32 flow_control; + + /* [3:0] tx lanes change polarity + [7:4] rx lanes change polarity + [11:8] swizzle tx lanes + [15:12] swizzle rx lanes + 4 x 2 bit tx lane swap + 4 x 2 bit rx lane swap. */ + u32 serdes_control; + + u32 fifo_control; + + /* [0] force link up + [1] autoneg ack2 bit to transmit + [6:2] autoneg selector field to transmit + [8:7] 10g pma/pmd type 0 => xaui, 1 kx4, 2 cx4 + [9] 1g pma/pmd type 0 => sfi, 1 => kx/bx + [10] disable 10g on without main power + [11] restart autoneg on transition to dx power state + [12] restart autoneg + [15:13] link mode: + 0 => 1g no autoneg + 1 => 10g kx4 parallel link no autoneg + 2 => 1g bx autoneg + 3 => 10g sfi serdes + 4 => kx4/kx/kr + 5 => xgmii 1g/100m + 6 => kx4/kx/kr 1g an + 7 kx4/kx/kr sgmii. + [16] kr support + [17] fec requested + [18] fec ability + etc. */ + u32 auto_negotiation_control; + + /* [0] signal detect 1g/100m + [1] fec signal detect + [2] 10g serial pcs fec block lock + [3] 10g serial high error rate + [4] 10g serial pcs block lock + [5] kx/kx4/kr autoneg next page received + [6] kx/kx4/kr backplane autoneg next page received + [7] link status clear to read + [11:8] 10g signal detect (4 lanes) (for serial just lane 0) + [12] 10g serial signal detect + [16:13] 10g parallel lane sync status + [17] 10g parallel align status + [18] 1g sync status + [19] kx/kx4/kr backplane autoneg is idle + [20] 1g autoneg enabled + [21] 1g pcs enabled for sgmii + [22] 10g xgxs enabled + [23] 10g serial fec enabled (forward error detection) + [24] 10g kr pcs enabled + [25] sgmii enabled + [27:26] mac link mode + 0 => 1g + 1 => 10g parallel + 2 => 10g serial + 3 => autoneg + [29:28] link speed + 1 => 100m + 2 => 1g + 3 => 10g + [30] link is up + [31] kx/kx4/kr backplane autoneg completed successfully. */ + u32 link_status; + + /* [17:16] pma/pmd for 10g serial + 0 => kr, 2 => sfi + [18] disable dme pages */ + u32 auto_negotiation_control2; + + CLIB_PAD_FROM_TO (0x42ac, 0x42b0); + u32 link_partner_ability[2]; + CLIB_PAD_FROM_TO (0x42b8, 0x42d0); + u32 manageability_control; + u32 link_partner_next_page[2]; + CLIB_PAD_FROM_TO (0x42dc, 0x42e0); + u32 kr_pcs_control; + u32 kr_pcs_status; + u32 fec_status[2]; + CLIB_PAD_FROM_TO (0x42f0, 0x4314); + u32 sgmii_control; + CLIB_PAD_FROM_TO (0x4318, 0x4324); + u32 link_status2; + CLIB_PAD_FROM_TO (0x4328, 0x4900); + } xge_mac; + + u32 tx_dcb_control; + u32 tx_dcb_descriptor_plane_queue_select; + u32 tx_dcb_descriptor_plane_t1_config; + u32 tx_dcb_descriptor_plane_t1_status; + CLIB_PAD_FROM_TO (0x4910, 0x4950); + + /* For each TC in units of 1k bytes. */ + u32 tx_packet_buffer_thresholds[8]; + CLIB_PAD_FROM_TO (0x4970, 0x4980); + struct + { + u32 mmw; + u32 config; + u32 status; + u32 rate_drift; + } dcb_tx_rate_scheduler; + CLIB_PAD_FROM_TO (0x4990, 0x4a80); + u32 tx_dma_control; + CLIB_PAD_FROM_TO (0x4a84, 0x4a88); + u32 tx_dma_tcp_flags_control[2]; + CLIB_PAD_FROM_TO (0x4a90, 0x4b00); + u32 pf_mailbox[64]; + CLIB_PAD_FROM_TO (0x4c00, 0x5000); + + /* RX */ + u32 checksum_control; + CLIB_PAD_FROM_TO (0x5004, 0x5008); + u32 rx_filter_control; + CLIB_PAD_FROM_TO (0x500c, 0x5010); + u32 management_vlan_tag[8]; + u32 management_udp_tcp_ports[8]; + CLIB_PAD_FROM_TO (0x5050, 0x5078); + /* little endian. */ + u32 extended_vlan_ether_type; + CLIB_PAD_FROM_TO (0x507c, 0x5080); + /* [1] store/dma bad packets + [8] accept all multicast + [9] accept all unicast + [10] accept all broadcast. */ + u32 filter_control; + CLIB_PAD_FROM_TO (0x5084, 0x5088); + /* [15:0] vlan ethernet type (0x8100) little endian + [28] cfi bit expected + [29] drop packets with unexpected cfi bit + [30] vlan filter enable. */ + u32 vlan_control; + CLIB_PAD_FROM_TO (0x508c, 0x5090); + /* [1:0] hi bit of ethernet address for 12 bit index into multicast table + 0 => 47, 1 => 46, 2 => 45, 3 => 43. + [2] enable multicast filter + */ + u32 multicast_control; + CLIB_PAD_FROM_TO (0x5094, 0x5100); + u32 fcoe_rx_control; + CLIB_PAD_FROM_TO (0x5104, 0x5108); + u32 fc_flt_context; + CLIB_PAD_FROM_TO (0x510c, 0x5110); + u32 fc_filter_control; + CLIB_PAD_FROM_TO (0x5114, 0x5120); + u32 rx_message_type_lo; + CLIB_PAD_FROM_TO (0x5124, 0x5128); + /* [15:0] ethernet type (little endian) + [18:16] matche pri in vlan tag + [19] priority match enable + [25:20] virtualization pool + [26] pool enable + [27] is fcoe + [30] ieee 1588 timestamp enable + [31] filter enable. + (See ethernet_type_queue_select.) */ + u32 ethernet_type_queue_filter[8]; + CLIB_PAD_FROM_TO (0x5148, 0x5160); + /* [7:0] l2 ethernet type and + [15:8] l2 ethernet type or */ + u32 management_decision_filters1[8]; + u32 vf_vm_tx_switch_loopback_enable[2]; + u32 rx_time_sync_control; + CLIB_PAD_FROM_TO (0x518c, 0x5190); + u32 management_ethernet_type_filters[4]; + u32 rx_timestamp_attributes_lo; + u32 rx_timestamp_hi; + u32 rx_timestamp_attributes_hi; + CLIB_PAD_FROM_TO (0x51ac, 0x51b0); + u32 pf_virtual_control; + CLIB_PAD_FROM_TO (0x51b4, 0x51d8); + u32 fc_offset_parameter; + CLIB_PAD_FROM_TO (0x51dc, 0x51e0); + u32 vf_rx_enable[2]; + u32 rx_timestamp_lo; + CLIB_PAD_FROM_TO (0x51ec, 0x5200); + /* 12 bits determined by multicast_control + lookup bits in this vector. */ + u32 multicast_enable[128]; + + /* [0] ethernet address [31:0] + [1] [15:0] ethernet address [47:32] + [31] valid bit. + Index 0 is read from eeprom after reset. */ + u32 rx_ethernet_address0[16][2]; + + CLIB_PAD_FROM_TO (0x5480, 0x5800); + u32 wake_up_control; + CLIB_PAD_FROM_TO (0x5804, 0x5808); + u32 wake_up_filter_control; + CLIB_PAD_FROM_TO (0x580c, 0x5818); + u32 multiple_rx_queue_command_82598; + CLIB_PAD_FROM_TO (0x581c, 0x5820); + u32 management_control; + u32 management_filter_control; + CLIB_PAD_FROM_TO (0x5828, 0x5838); + u32 wake_up_ip4_address_valid; + CLIB_PAD_FROM_TO (0x583c, 0x5840); + u32 wake_up_ip4_address_table[4]; + u32 management_control_to_host; + CLIB_PAD_FROM_TO (0x5854, 0x5880); + u32 wake_up_ip6_address_table[4]; + + /* unicast_and broadcast_and vlan_and ip_address_and + etc. */ + u32 management_decision_filters[8]; + + u32 management_ip4_or_ip6_address_filters[4][4]; + CLIB_PAD_FROM_TO (0x58f0, 0x5900); + u32 wake_up_packet_length; + CLIB_PAD_FROM_TO (0x5904, 0x5910); + u32 management_ethernet_address_filters[4][2]; + CLIB_PAD_FROM_TO (0x5930, 0x5a00); + u32 wake_up_packet_memory[32]; + CLIB_PAD_FROM_TO (0x5a80, 0x5c00); + u32 redirection_table_82598[32]; + u32 rss_random_keys_82598[10]; + CLIB_PAD_FROM_TO (0x5ca8, 0x6000); + + ixge_dma_regs_t tx_dma[128]; + + u32 pf_vm_vlan_insert[64]; + u32 tx_dma_tcp_max_alloc_size_requests; + CLIB_PAD_FROM_TO (0x8104, 0x8110); + u32 vf_tx_enable[2]; + CLIB_PAD_FROM_TO (0x8118, 0x8120); + /* [0] dcb mode enable + [1] virtualization mode enable + [3:2] number of tcs/qs per pool. */ + u32 multiple_tx_queues_command; + CLIB_PAD_FROM_TO (0x8124, 0x8200); + u32 pf_vf_anti_spoof[8]; + u32 pf_dma_tx_switch_control; + CLIB_PAD_FROM_TO (0x8224, 0x82e0); + u32 tx_strict_low_latency_queues[4]; + CLIB_PAD_FROM_TO (0x82f0, 0x8600); + u32 tx_queue_stats_mapping_82599[32]; + u32 tx_queue_packet_counts[32]; + u32 tx_queue_byte_counts[32][2]; + + struct + { + u32 control; + u32 status; + u32 buffer_almost_full; + CLIB_PAD_FROM_TO (0x880c, 0x8810); + u32 buffer_min_ifg; + CLIB_PAD_FROM_TO (0x8814, 0x8900); + } tx_security; + + struct + { + u32 index; + u32 salt; + u32 key[4]; + CLIB_PAD_FROM_TO (0x8918, 0x8a00); + } tx_ipsec; + + struct + { + u32 capabilities; + u32 control; + u32 tx_sci[2]; + u32 sa; + u32 sa_pn[2]; + u32 key[2][4]; + /* untagged packets, encrypted packets, protected packets, + encrypted bytes, protected bytes */ + u32 stats[5]; + CLIB_PAD_FROM_TO (0x8a50, 0x8c00); + } tx_link_security; + + struct + { + u32 control; + u32 timestamp_value[2]; + u32 system_time[2]; + u32 increment_attributes; + u32 time_adjustment_offset[2]; + u32 aux_control; + u32 target_time[2][2]; + CLIB_PAD_FROM_TO (0x8c34, 0x8c3c); + u32 aux_time_stamp[2][2]; + CLIB_PAD_FROM_TO (0x8c4c, 0x8d00); + } tx_timesync; + + struct + { + u32 control; + u32 status; + CLIB_PAD_FROM_TO (0x8d08, 0x8e00); + } rx_security; + + struct + { + u32 index; + u32 ip_address[4]; + u32 spi; + u32 ip_index; + u32 key[4]; + u32 salt; + u32 mode; + CLIB_PAD_FROM_TO (0x8e34, 0x8f00); + } rx_ipsec; + + struct + { + u32 capabilities; + u32 control; + u32 sci[2]; + u32 sa[2]; + u32 sa_pn[2]; + u32 key[2][4]; + /* see datasheet */ + u32 stats[17]; + CLIB_PAD_FROM_TO (0x8f84, 0x9000); + } rx_link_security; + + /* 4 wake up, 2 management, 2 wake up. */ + u32 flexible_filters[8][16][4]; + CLIB_PAD_FROM_TO (0x9800, 0xa000); + + /* 4096 bits. */ + u32 vlan_filter[128]; + + /* [0] ethernet address [31:0] + [1] [15:0] ethernet address [47:32] + [31] valid bit. + Index 0 is read from eeprom after reset. */ + u32 rx_ethernet_address1[128][2]; + + /* select one of 64 pools for each rx address. */ + u32 rx_ethernet_address_pool_select[128][2]; + CLIB_PAD_FROM_TO (0xaa00, 0xc800); + u32 tx_priority_to_traffic_class; + CLIB_PAD_FROM_TO (0xc804, 0xcc00); + + /* In bytes units of 1k. Total packet buffer is 160k. */ + u32 tx_packet_buffer_size[8]; + + CLIB_PAD_FROM_TO (0xcc20, 0xcd10); + u32 tx_manageability_tc_mapping; + CLIB_PAD_FROM_TO (0xcd14, 0xcd20); + u32 dcb_tx_packet_plane_t2_config[8]; + u32 dcb_tx_packet_plane_t2_status[8]; + CLIB_PAD_FROM_TO (0xcd60, 0xce00); + + u32 tx_flow_control_status; + CLIB_PAD_FROM_TO (0xce04, 0xd000); + + ixge_dma_regs_t rx_dma1[64]; + + struct + { + /* Bigendian ip4 src/dst address. */ + u32 src_address[128]; + u32 dst_address[128]; + + /* TCP/UDP ports [15:0] src [31:16] dst; bigendian. */ + u32 tcp_udp_port[128]; + + /* [1:0] protocol tcp, udp, sctp, other + [4:2] match priority (highest wins) + [13:8] pool + [25] src address match disable + [26] dst address match disable + [27] src port match disable + [28] dst port match disable + [29] protocol match disable + [30] pool match disable + [31] enable. */ + u32 control[128]; + + /* [12] size bypass + [19:13] must be 0x80 + [20] low-latency interrupt + [27:21] rx queue. */ + u32 interrupt[128]; + } ip4_filters; + + CLIB_PAD_FROM_TO (0xea00, 0xeb00); + /* 4 bit rss output index indexed by 7 bit hash. + 128 8 bit fields = 32 registers. */ + u32 redirection_table_82599[32]; + + u32 rss_random_key_82599[10]; + CLIB_PAD_FROM_TO (0xeba8, 0xec00); + /* [15:0] reserved + [22:16] rx queue index + [29] low-latency interrupt on match + [31] enable */ + u32 ethernet_type_queue_select[8]; + CLIB_PAD_FROM_TO (0xec20, 0xec30); + u32 syn_packet_queue_filter; + CLIB_PAD_FROM_TO (0xec34, 0xec60); + u32 immediate_interrupt_rx_vlan_priority; + CLIB_PAD_FROM_TO (0xec64, 0xec70); + u32 rss_queues_per_traffic_class; + CLIB_PAD_FROM_TO (0xec74, 0xec90); + u32 lli_size_threshold; + CLIB_PAD_FROM_TO (0xec94, 0xed00); + + struct + { + u32 control; + CLIB_PAD_FROM_TO (0xed04, 0xed10); + u32 table[8]; + CLIB_PAD_FROM_TO (0xed30, 0xee00); + } fcoe_redirection; + + struct + { + /* [1:0] packet buffer allocation 0 => disabled, else 64k*2^(f-1) + [3] packet buffer initialization done + [4] perfetch match mode + [5] report status in rss field of rx descriptors + [7] report status always + [14:8] drop queue + [20:16] flex 2 byte packet offset (units of 2 bytes) + [27:24] max linked list length + [31:28] full threshold. */ + u32 control; + CLIB_PAD_FROM_TO (0xee04, 0xee0c); + + u32 data[8]; + + /* [1:0] 0 => no action, 1 => add, 2 => remove, 3 => query. + [2] valid filter found by query command + [3] filter update override + [4] ip6 adress table + [6:5] l4 protocol reserved, udp, tcp, sctp + [7] is ip6 + [8] clear head/tail + [9] packet drop action + [10] matched packet generates low-latency interrupt + [11] last in linked list + [12] collision + [15] rx queue enable + [22:16] rx queue + [29:24] pool. */ + u32 command; + + CLIB_PAD_FROM_TO (0xee30, 0xee3c); + /* ip4 dst/src address, tcp ports, udp ports. + set bits mean bit is ignored. */ + u32 ip4_masks[4]; + u32 filter_length; + u32 usage_stats; + u32 failed_usage_stats; + u32 filters_match_stats; + u32 filters_miss_stats; + CLIB_PAD_FROM_TO (0xee60, 0xee68); + /* Lookup, signature. */ + u32 hash_keys[2]; + /* [15:0] ip6 src address 1 bit per byte + [31:16] ip6 dst address. */ + u32 ip6_mask; + /* [0] vlan id + [1] vlan priority + [2] pool + [3] ip protocol + [4] flex + [5] dst ip6. */ + u32 other_mask; + CLIB_PAD_FROM_TO (0xee78, 0xf000); + } flow_director; + + struct + { + u32 l2_control[64]; + u32 vlan_pool_filter[64]; + u32 vlan_pool_filter_bitmap[128]; + u32 dst_ethernet_address[128]; + u32 mirror_rule[4]; + u32 mirror_rule_vlan[8]; + u32 mirror_rule_pool[8]; + CLIB_PAD_FROM_TO (0xf650, 0x10010); + } pf_bar; + + u32 eeprom_flash_control; + /* [0] start + [1] done + [15:2] address + [31:16] read data. */ + u32 eeprom_read; + CLIB_PAD_FROM_TO (0x10018, 0x1001c); + u32 flash_access; + CLIB_PAD_FROM_TO (0x10020, 0x10114); + u32 flash_data; + u32 flash_control; + u32 flash_read_data; + CLIB_PAD_FROM_TO (0x10120, 0x1013c); + u32 flash_opcode; + u32 software_semaphore; + CLIB_PAD_FROM_TO (0x10144, 0x10148); + u32 firmware_semaphore; + CLIB_PAD_FROM_TO (0x1014c, 0x10160); + u32 software_firmware_sync; + CLIB_PAD_FROM_TO (0x10164, 0x10200); + u32 general_rx_control; + CLIB_PAD_FROM_TO (0x10204, 0x11000); + + struct + { + u32 control; + CLIB_PAD_FROM_TO (0x11004, 0x11010); + /* [3:0] enable counters + [7:4] leaky bucket counter mode + [29] reset + [30] stop + [31] start. */ + u32 counter_control; + /* [7:0],[15:8],[23:16],[31:24] event for counters 0-3. + event codes: + 0x0 bad tlp + 0x10 reqs that reached timeout + etc. */ + u32 counter_event; + CLIB_PAD_FROM_TO (0x11018, 0x11020); + u32 counters_clear_on_read[4]; + u32 counter_config[4]; + struct + { + u32 address; + u32 data; + } indirect_access; + CLIB_PAD_FROM_TO (0x11048, 0x11050); + u32 extended_control; + CLIB_PAD_FROM_TO (0x11054, 0x11064); + u32 mirrored_revision_id; + CLIB_PAD_FROM_TO (0x11068, 0x11070); + u32 dca_requester_id_information; + + /* [0] global disable + [4:1] mode: 0 => legacy, 1 => dca 1.0. */ + u32 dca_control; + CLIB_PAD_FROM_TO (0x11078, 0x110b0); + /* [0] pci completion abort + [1] unsupported i/o address + [2] wrong byte enable + [3] pci timeout */ + u32 pcie_interrupt_status; + CLIB_PAD_FROM_TO (0x110b4, 0x110b8); + u32 pcie_interrupt_enable; + CLIB_PAD_FROM_TO (0x110bc, 0x110c0); + u32 msi_x_pba_clear[8]; + CLIB_PAD_FROM_TO (0x110e0, 0x12300); + } pcie; + + u32 interrupt_throttle1[128 - 24]; + CLIB_PAD_FROM_TO (0x124a0, 0x14f00); + + u32 core_analog_config; + CLIB_PAD_FROM_TO (0x14f04, 0x14f10); + u32 core_common_config; + CLIB_PAD_FROM_TO (0x14f14, 0x15f14); + + u32 link_sec_software_firmware_interface; +} ixge_regs_t; + +typedef union +{ + struct + { + /* Addresses bigendian. */ + union + { + struct + { + ip6_address_t src_address; + u32 unused[1]; + } ip6; + struct + { + u32 unused[3]; + ip4_address_t src_address, dst_address; + } ip4; + }; + + /* [15:0] src port (little endian). + [31:16] dst port. */ + u32 tcp_udp_ports; + + /* [15:0] vlan (cfi bit set to 0). + [31:16] flex bytes. bigendian. */ + u32 vlan_and_flex_word; + + /* [14:0] hash + [15] bucket valid + [31:16] signature (signature filers)/sw-index (perfect match). */ + u32 hash; + }; + + u32 as_u32[8]; +} ixge_flow_director_key_t; + +always_inline void +ixge_throttle_queue_interrupt (ixge_regs_t * r, + u32 queue_interrupt_index, + f64 inter_interrupt_interval_in_secs) +{ + volatile u32 *tr = + (queue_interrupt_index < ARRAY_LEN (r->interrupt.throttle0) + ? &r->interrupt.throttle0[queue_interrupt_index] + : &r->interrupt_throttle1[queue_interrupt_index]); + ASSERT (queue_interrupt_index < 128); + u32 v; + i32 i, mask = (1 << 9) - 1; + + i = flt_round_nearest (inter_interrupt_interval_in_secs / 2e-6); + i = i < 1 ? 1 : i; + i = i >= mask ? mask : i; + + v = tr[0]; + v &= ~(mask << 3); + v |= i << 3; + tr[0] = v; +} + +#define foreach_ixge_counter \ + _ (0x40d0, rx_total_packets) \ + _64 (0x40c0, rx_total_bytes) \ + _ (0x41b0, rx_good_packets_before_filtering) \ + _64 (0x41b4, rx_good_bytes_before_filtering) \ + _ (0x2f50, rx_dma_good_packets) \ + _64 (0x2f54, rx_dma_good_bytes) \ + _ (0x2f5c, rx_dma_duplicated_good_packets) \ + _64 (0x2f60, rx_dma_duplicated_good_bytes) \ + _ (0x2f68, rx_dma_good_loopback_packets) \ + _64 (0x2f6c, rx_dma_good_loopback_bytes) \ + _ (0x2f74, rx_dma_good_duplicated_loopback_packets) \ + _64 (0x2f78, rx_dma_good_duplicated_loopback_bytes) \ + _ (0x4074, rx_good_packets) \ + _64 (0x4088, rx_good_bytes) \ + _ (0x407c, rx_multicast_packets) \ + _ (0x4078, rx_broadcast_packets) \ + _ (0x405c, rx_64_byte_packets) \ + _ (0x4060, rx_65_127_byte_packets) \ + _ (0x4064, rx_128_255_byte_packets) \ + _ (0x4068, rx_256_511_byte_packets) \ + _ (0x406c, rx_512_1023_byte_packets) \ + _ (0x4070, rx_gt_1023_byte_packets) \ + _ (0x4000, rx_crc_errors) \ + _ (0x4120, rx_ip_checksum_errors) \ + _ (0x4004, rx_illegal_symbol_errors) \ + _ (0x4008, rx_error_symbol_errors) \ + _ (0x4034, rx_mac_local_faults) \ + _ (0x4038, rx_mac_remote_faults) \ + _ (0x4040, rx_length_errors) \ + _ (0x41a4, rx_xons) \ + _ (0x41a8, rx_xoffs) \ + _ (0x40a4, rx_undersize_packets) \ + _ (0x40a8, rx_fragments) \ + _ (0x40ac, rx_oversize_packets) \ + _ (0x40b0, rx_jabbers) \ + _ (0x40b4, rx_management_packets) \ + _ (0x40b8, rx_management_drops) \ + _ (0x3fa0, rx_missed_packets_pool_0) \ + _ (0x40d4, tx_total_packets) \ + _ (0x4080, tx_good_packets) \ + _64 (0x4090, tx_good_bytes) \ + _ (0x40f0, tx_multicast_packets) \ + _ (0x40f4, tx_broadcast_packets) \ + _ (0x87a0, tx_dma_good_packets) \ + _64 (0x87a4, tx_dma_good_bytes) \ + _ (0x40d8, tx_64_byte_packets) \ + _ (0x40dc, tx_65_127_byte_packets) \ + _ (0x40e0, tx_128_255_byte_packets) \ + _ (0x40e4, tx_256_511_byte_packets) \ + _ (0x40e8, tx_512_1023_byte_packets) \ + _ (0x40ec, tx_gt_1023_byte_packets) \ + _ (0x4010, tx_undersize_drops) \ + _ (0x8780, switch_security_violation_packets) \ + _ (0x5118, fc_crc_errors) \ + _ (0x241c, fc_rx_drops) \ + _ (0x2424, fc_last_error_count) \ + _ (0x2428, fcoe_rx_packets) \ + _ (0x242c, fcoe_rx_dwords) \ + _ (0x8784, fcoe_tx_packets) \ + _ (0x8788, fcoe_tx_dwords) \ + _ (0x1030, queue_0_rx_count) \ + _ (0x1430, queue_0_drop_count) \ + _ (0x1070, queue_1_rx_count) \ + _ (0x1470, queue_1_drop_count) \ + _ (0x10b0, queue_2_rx_count) \ + _ (0x14b0, queue_2_drop_count) \ + _ (0x10f0, queue_3_rx_count) \ + _ (0x14f0, queue_3_drop_count) \ + _ (0x1130, queue_4_rx_count) \ + _ (0x1530, queue_4_drop_count) \ + _ (0x1170, queue_5_rx_count) \ + _ (0x1570, queue_5_drop_count) \ + _ (0x11b0, queue_6_rx_count) \ + _ (0x15b0, queue_6_drop_count) \ + _ (0x11f0, queue_7_rx_count) \ + _ (0x15f0, queue_7_drop_count) \ + _ (0x1230, queue_8_rx_count) \ + _ (0x1630, queue_8_drop_count) \ + _ (0x1270, queue_9_rx_count) \ + _ (0x1270, queue_9_drop_count) + + + + +typedef enum +{ +#define _(a,f) IXGE_COUNTER_##f, +#define _64(a,f) _(a,f) + foreach_ixge_counter +#undef _ +#undef _64 + IXGE_N_COUNTER, +} ixge_counter_type_t; + +typedef struct +{ + u32 mdio_address; + + /* 32 bit ID read from ID registers. */ + u32 id; +} ixge_phy_t; + +typedef struct +{ + /* Cache aligned descriptors. */ + ixge_descriptor_t *descriptors; + + /* Number of descriptors in table. */ + u32 n_descriptors; + + /* Software head and tail pointers into descriptor ring. */ + u32 head_index, tail_index; + + /* Index into dma_queues vector. */ + u32 queue_index; + + /* Buffer indices corresponding to each active descriptor. */ + u32 *descriptor_buffer_indices; + + union + { + struct + { + u32 *volatile head_index_write_back; + + u32 n_buffers_on_ring; + } tx; + + struct + { + /* Buffer indices to use to replenish each descriptor. */ + u32 *replenish_buffer_indices; + + vlib_node_runtime_t *node; + u32 next_index; + + u32 saved_start_of_packet_buffer_index; + + u32 saved_start_of_packet_next_index; + u32 saved_last_buffer_index; + + u32 is_start_of_packet; + + u32 n_descriptors_done_total; + + u32 n_descriptors_done_this_call; + + u32 n_bytes; + } rx; + }; +} ixge_dma_queue_t; + +#define foreach_ixge_pci_device_id \ + _ (82598, 0x10b6) \ + _ (82598_bx, 0x1508) \ + _ (82598af_dual_port, 0x10c6) \ + _ (82598af_single_port, 0x10c7) \ + _ (82598at, 0x10c8) \ + _ (82598at2, 0x150b) \ + _ (82598eb_sfp_lom, 0x10db) \ + _ (82598eb_cx4, 0x10dd) \ + _ (82598_cx4_dual_port, 0x10ec) \ + _ (82598_da_dual_port, 0x10f1) \ + _ (82598_sr_dual_port_em, 0x10e1) \ + _ (82598eb_xf_lr, 0x10f4) \ + _ (82599_kx4, 0x10f7) \ + _ (82599_kx4_mezz, 0x1514) \ + _ (82599_kr, 0x1517) \ + _ (82599_combo_backplane, 0x10f8) \ + _ (82599_cx4, 0x10f9) \ + _ (82599_sfp, 0x10fb) \ + _ (82599_backplane_fcoe, 0x152a) \ + _ (82599_sfp_fcoe, 0x1529) \ + _ (82599_sfp_em, 0x1507) \ + _ (82599_xaui_lom, 0x10fc) \ + _ (82599_t3_lom, 0x151c) \ + _ (x540t, 0x1528) + +typedef enum +{ +#define _(f,n) IXGE_##f = n, + foreach_ixge_pci_device_id +#undef _ +} ixge_pci_device_id_t; + +typedef struct +{ + /* registers */ + ixge_regs_t *regs; + + /* Specific next index when using dynamic redirection */ + u32 per_interface_next_index; + + /* PCI bus info. */ + vlib_pci_device_t pci_device; + + /* From PCI config space header. */ + ixge_pci_device_id_t device_id; + + u16 device_index; + + /* 0 or 1. */ + u16 pci_function; + + /* VLIB interface for this instance. */ + u32 vlib_hw_if_index, vlib_sw_if_index; + + ixge_dma_queue_t *dma_queues[VLIB_N_RX_TX]; + + /* Phy index (0 or 1) and address on MDI bus. */ + u32 phy_index; + ixge_phy_t phys[2]; + + /* Value of link_status register at last link change. */ + u32 link_status_at_last_link_change; + + i2c_bus_t i2c_bus; + sfp_eeprom_t sfp_eeprom; + + /* Counters. */ + u64 counters[IXGE_N_COUNTER], counters_last_clear[IXGE_N_COUNTER]; +} ixge_device_t; + +typedef struct +{ + vlib_main_t *vlib_main; + + /* Vector of devices. */ + ixge_device_t *devices; + + /* Descriptor ring sizes. */ + u32 n_descriptors[VLIB_N_RX_TX]; + + /* RX buffer size. Must be at least 1k; will be rounded to + next largest 1k size. */ + u32 n_bytes_in_rx_buffer; + + u32 n_descriptors_per_cache_line; + + u32 vlib_buffer_free_list_index; + + u32 process_node_index; + + /* Template and mask for initializing/validating TX descriptors. */ + ixge_tx_descriptor_t tx_descriptor_template, tx_descriptor_template_mask; + + /* Vector of buffers for which TX is done and can be freed. */ + u32 *tx_buffers_pending_free; + + u32 *rx_buffers_to_add; + + f64 time_last_stats_update; +} ixge_main_t; + +ixge_main_t ixge_main; +vnet_device_class_t ixge_device_class; + +typedef enum +{ + IXGE_RX_NEXT_IP4_INPUT, + IXGE_RX_NEXT_IP6_INPUT, + IXGE_RX_NEXT_ETHERNET_INPUT, + IXGE_RX_NEXT_DROP, + IXGE_RX_N_NEXT, +} ixge_rx_next_t; + +void ixge_set_next_node (ixge_rx_next_t, char *); + +#endif /* included_ixge_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 6ba82584..a517a597 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -44,6 +44,7 @@ */ #include +#include uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, @@ -583,6 +584,11 @@ alloc_from_free_list (vlib_main_t * vm, dst = alloc_buffers; + /* wait with buffer memory allocation as long as possible + in case external buffer manager takes over */ + if (PREDICT_FALSE (vm->os_physmem_alloc_aligned == 0)) + unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ ); + n_filled = fill_free_list (vm, free_list, n_alloc_buffers); if (n_filled == 0) return 0; diff --git a/src/vnet.am b/src/vnet.am index ca24c9c5..bc9655cc 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -114,14 +114,16 @@ libvnet_la_SOURCES += \ vnet/ethernet/init.c \ vnet/ethernet/interface.c \ vnet/ethernet/node.c \ - vnet/ethernet/pg.c + vnet/ethernet/pg.c \ + vnet/ethernet/sfp.c nobase_include_HEADERS += \ vnet/ethernet/arp_packet.h \ vnet/ethernet/error.def \ vnet/ethernet/ethernet.h \ vnet/ethernet/packet.h \ - vnet/ethernet/types.def + vnet/ethernet/types.def \ + vnet/ethernet/sfp.h ######################################## # Layer 2 protocol: Ethernet bridging @@ -792,14 +794,6 @@ nobase_include_HEADERS += \ vnet/pg/pg.h \ vnet/pg/edit.h -if !WITH_DPDK -libvnet_la_SOURCES += \ - vnet/devices/nic/ixge.c \ - vnet/devices/nic/ixge.h \ - vnet/devices/nic/sfp.c \ - vnet/devices/nic/sfp.h -endif - ######################################## # virtio ######################################## diff --git a/src/vnet/devices/nic/ixge.c b/src/vnet/devices/nic/ixge.c deleted file mode 100644 index d4c4c6b7..00000000 --- a/src/vnet/devices/nic/ixge.c +++ /dev/null @@ -1,2938 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * WARNING! - * This driver is not intended for production use and it is unsupported. - * It is provided for educational use only. - * Please use supported DPDK driver instead. - */ - -#if __x86_64__ -#include - -#ifndef CLIB_HAVE_VEC128 -#warning HACK: ixge driver wont really work, missing u32x4 -typedef unsigned long long u32x4; -#endif - -#include -#include -#include -#include -#include -#include - -#define IXGE_ALWAYS_POLL 0 - -#define EVENT_SET_FLAGS 0 -#define IXGE_HWBP_RACE_ELOG 0 - -#define PCI_VENDOR_ID_INTEL 0x8086 - -/* 10 GIG E (XGE) PHY IEEE 802.3 clause 45 definitions. */ -#define XGE_PHY_DEV_TYPE_PMA_PMD 1 -#define XGE_PHY_DEV_TYPE_PHY_XS 4 -#define XGE_PHY_ID1 0x2 -#define XGE_PHY_ID2 0x3 -#define XGE_PHY_CONTROL 0x0 -#define XGE_PHY_CONTROL_RESET (1 << 15) - -ixge_main_t ixge_main; -static vlib_node_registration_t ixge_input_node; -static vlib_node_registration_t ixge_process_node; - -static void -ixge_semaphore_get (ixge_device_t * xd) -{ - ixge_main_t *xm = &ixge_main; - vlib_main_t *vm = xm->vlib_main; - ixge_regs_t *r = xd->regs; - u32 i; - - i = 0; - while (!(r->software_semaphore & (1 << 0))) - { - if (i > 0) - vlib_process_suspend (vm, 100e-6); - i++; - } - do - { - r->software_semaphore |= 1 << 1; - } - while (!(r->software_semaphore & (1 << 1))); -} - -static void -ixge_semaphore_release (ixge_device_t * xd) -{ - ixge_regs_t *r = xd->regs; - r->software_semaphore &= ~3; -} - -static void -ixge_software_firmware_sync (ixge_device_t * xd, u32 sw_mask) -{ - ixge_main_t *xm = &ixge_main; - vlib_main_t *vm = xm->vlib_main; - ixge_regs_t *r = xd->regs; - u32 fw_mask = sw_mask << 5; - u32 m, done = 0; - - while (!done) - { - ixge_semaphore_get (xd); - m = r->software_firmware_sync; - done = (m & fw_mask) == 0; - if (done) - r->software_firmware_sync = m | sw_mask; - ixge_semaphore_release (xd); - if (!done) - vlib_process_suspend (vm, 10e-3); - } -} - -static void -ixge_software_firmware_sync_release (ixge_device_t * xd, u32 sw_mask) -{ - ixge_regs_t *r = xd->regs; - ixge_semaphore_get (xd); - r->software_firmware_sync &= ~sw_mask; - ixge_semaphore_release (xd); -} - -u32 -ixge_read_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index, - u32 v, u32 is_read) -{ - ixge_regs_t *r = xd->regs; - const u32 busy_bit = 1 << 30; - u32 x; - - ASSERT (xd->phy_index < 2); - ixge_software_firmware_sync (xd, 1 << (1 + xd->phy_index)); - - ASSERT (reg_index < (1 << 16)); - ASSERT (dev_type < (1 << 5)); - if (!is_read) - r->xge_mac.phy_data = v; - - /* Address cycle. */ - x = - reg_index | (dev_type << 16) | (xd-> - phys[xd->phy_index].mdio_address << 21); - r->xge_mac.phy_command = x | busy_bit; - /* Busy wait timed to take 28e-6 secs. No suspend. */ - while (r->xge_mac.phy_command & busy_bit) - ; - - r->xge_mac.phy_command = x | ((is_read ? 2 : 1) << 26) | busy_bit; - while (r->xge_mac.phy_command & busy_bit) - ; - - if (is_read) - v = r->xge_mac.phy_data >> 16; - - ixge_software_firmware_sync_release (xd, 1 << (1 + xd->phy_index)); - - return v; -} - -static u32 -ixge_read_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index) -{ - return ixge_read_write_phy_reg (xd, dev_type, reg_index, 0, /* is_read */ - 1); -} - -static void -ixge_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index, u32 v) -{ - (void) ixge_read_write_phy_reg (xd, dev_type, reg_index, v, /* is_read */ - 0); -} - -static void -ixge_i2c_put_bits (i2c_bus_t * b, int scl, int sda) -{ - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data); - u32 v; - - v = 0; - v |= (sda != 0) << 3; - v |= (scl != 0) << 1; - xd->regs->i2c_control = v; -} - -static void -ixge_i2c_get_bits (i2c_bus_t * b, int *scl, int *sda) -{ - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data); - u32 v; - - v = xd->regs->i2c_control; - *sda = (v & (1 << 2)) != 0; - *scl = (v & (1 << 0)) != 0; -} - -static u16 -ixge_read_eeprom (ixge_device_t * xd, u32 address) -{ - ixge_regs_t *r = xd->regs; - u32 v; - r->eeprom_read = (( /* start bit */ (1 << 0)) | (address << 2)); - /* Wait for done bit. */ - while (!((v = r->eeprom_read) & (1 << 1))) - ; - return v >> 16; -} - -static void -ixge_sfp_enable_disable_laser (ixge_device_t * xd, uword enable) -{ - u32 tx_disable_bit = 1 << 3; - if (enable) - xd->regs->sdp_control &= ~tx_disable_bit; - else - xd->regs->sdp_control |= tx_disable_bit; -} - -static void -ixge_sfp_enable_disable_10g (ixge_device_t * xd, uword enable) -{ - u32 is_10g_bit = 1 << 5; - if (enable) - xd->regs->sdp_control |= is_10g_bit; - else - xd->regs->sdp_control &= ~is_10g_bit; -} - -static clib_error_t * -ixge_sfp_phy_init_from_eeprom (ixge_device_t * xd, u16 sfp_type) -{ - u16 a, id, reg_values_addr = 0; - - a = ixge_read_eeprom (xd, 0x2b); - if (a == 0 || a == 0xffff) - return clib_error_create ("no init sequence in eeprom"); - - while (1) - { - id = ixge_read_eeprom (xd, ++a); - if (id == 0xffff) - break; - reg_values_addr = ixge_read_eeprom (xd, ++a); - if (id == sfp_type) - break; - } - if (id != sfp_type) - return clib_error_create ("failed to find id 0x%x", sfp_type); - - ixge_software_firmware_sync (xd, 1 << 3); - while (1) - { - u16 v = ixge_read_eeprom (xd, ++reg_values_addr); - if (v == 0xffff) - break; - xd->regs->core_analog_config = v; - } - ixge_software_firmware_sync_release (xd, 1 << 3); - - /* Make sure laser is off. We'll turn on the laser when - the interface is brought up. */ - ixge_sfp_enable_disable_laser (xd, /* enable */ 0); - ixge_sfp_enable_disable_10g (xd, /* is_10g */ 1); - - return 0; -} - -static void -ixge_sfp_device_up_down (ixge_device_t * xd, uword is_up) -{ - u32 v; - - if (is_up) - { - /* pma/pmd 10g serial SFI. */ - xd->regs->xge_mac.auto_negotiation_control2 &= ~(3 << 16); - xd->regs->xge_mac.auto_negotiation_control2 |= 2 << 16; - - v = xd->regs->xge_mac.auto_negotiation_control; - v &= ~(7 << 13); - v |= (0 << 13); - /* Restart autoneg. */ - v |= (1 << 12); - xd->regs->xge_mac.auto_negotiation_control = v; - - while (!(xd->regs->xge_mac.link_partner_ability[0] & 0xf0000)) - ; - - v = xd->regs->xge_mac.auto_negotiation_control; - - /* link mode 10g sfi serdes */ - v &= ~(7 << 13); - v |= (3 << 13); - - /* Restart autoneg. */ - v |= (1 << 12); - xd->regs->xge_mac.auto_negotiation_control = v; - - xd->regs->xge_mac.link_status; - } - - ixge_sfp_enable_disable_laser (xd, /* enable */ is_up); - - /* Give time for link partner to notice that we're up. */ - if (is_up && vlib_in_process_context (vlib_get_main ())) - { - vlib_process_suspend (vlib_get_main (), 300e-3); - } -} - -always_inline ixge_dma_regs_t * -get_dma_regs (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 qi) -{ - ixge_regs_t *r = xd->regs; - ASSERT (qi < 128); - if (rt == VLIB_RX) - return qi < 64 ? &r->rx_dma0[qi] : &r->rx_dma1[qi - 64]; - else - return &r->tx_dma[qi]; -} - -static clib_error_t * -ixge_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) -{ - vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index); - uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, hif->dev_instance); - ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0); - - if (is_up) - { - xd->regs->rx_enable |= 1; - xd->regs->tx_dma_control |= 1; - dr->control |= 1 << 25; - while (!(dr->control & (1 << 25))) - ; - } - else - { - xd->regs->rx_enable &= ~1; - xd->regs->tx_dma_control &= ~1; - } - - ixge_sfp_device_up_down (xd, is_up); - - return /* no error */ 0; -} - -static void -ixge_sfp_phy_init (ixge_device_t * xd) -{ - ixge_phy_t *phy = xd->phys + xd->phy_index; - i2c_bus_t *ib = &xd->i2c_bus; - - ib->private_data = xd->device_index; - ib->put_bits = ixge_i2c_put_bits; - ib->get_bits = ixge_i2c_get_bits; - vlib_i2c_init (ib); - - vlib_i2c_read_eeprom (ib, 0x50, 0, 128, (u8 *) & xd->sfp_eeprom); - - if (vlib_i2c_bus_timed_out (ib) || !sfp_eeprom_is_valid (&xd->sfp_eeprom)) - xd->sfp_eeprom.id = SFP_ID_unknown; - else - { - /* FIXME 5 => SR/LR eeprom ID. */ - clib_error_t *e = - ixge_sfp_phy_init_from_eeprom (xd, 5 + xd->pci_function); - if (e) - clib_error_report (e); - } - - phy->mdio_address = ~0; -} - -static void -ixge_phy_init (ixge_device_t * xd) -{ - ixge_main_t *xm = &ixge_main; - vlib_main_t *vm = xm->vlib_main; - ixge_phy_t *phy = xd->phys + xd->phy_index; - - switch (xd->device_id) - { - case IXGE_82599_sfp: - case IXGE_82599_sfp_em: - case IXGE_82599_sfp_fcoe: - /* others? */ - return ixge_sfp_phy_init (xd); - - default: - break; - } - - /* Probe address of phy. */ - { - u32 i, v; - - phy->mdio_address = ~0; - for (i = 0; i < 32; i++) - { - phy->mdio_address = i; - v = ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1); - if (v != 0xffff && v != 0) - break; - } - - /* No PHY found? */ - if (i >= 32) - return; - } - - phy->id = - ((ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1) << 16) | - ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID2)); - - { - ELOG_TYPE_DECLARE (e) = - { - .function = (char *) __FUNCTION__,.format = - "ixge %d, phy id 0x%d mdio address %d",.format_args = "i4i4i4",}; - struct - { - u32 instance, id, address; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->instance = xd->device_index; - ed->id = phy->id; - ed->address = phy->mdio_address; - } - - /* Reset phy. */ - ixge_write_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL, - XGE_PHY_CONTROL_RESET); - - /* Wait for self-clearning reset bit to clear. */ - do - { - vlib_process_suspend (vm, 1e-3); - } - while (ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL) & - XGE_PHY_CONTROL_RESET); -} - -static u8 * -format_ixge_rx_from_hw_descriptor (u8 * s, va_list * va) -{ - ixge_rx_from_hw_descriptor_t *d = - va_arg (*va, ixge_rx_from_hw_descriptor_t *); - u32 s0 = d->status[0], s2 = d->status[2]; - u32 is_ip4, is_ip6, is_ip, is_tcp, is_udp; - uword indent = format_get_indent (s); - - s = format (s, "%s-owned", - (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE) ? "sw" : - "hw"); - s = - format (s, ", length this descriptor %d, l3 offset %d", - d->n_packet_bytes_this_descriptor, - IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s0)); - if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) - s = format (s, ", end-of-packet"); - - s = format (s, "\n%U", format_white_space, indent); - - if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR) - s = format (s, "layer2 error"); - - if (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2) - { - s = format (s, "layer 2 type %d", (s0 & 0x1f)); - return s; - } - - if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN) - s = format (s, "vlan header 0x%x\n%U", d->vlan_tag, - format_white_space, indent); - - if ((is_ip4 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4))) - { - s = format (s, "ip4%s", - (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT) ? " options" : - ""); - if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED) - s = format (s, " checksum %s", - (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) ? - "bad" : "ok"); - } - if ((is_ip6 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6))) - s = format (s, "ip6%s", - (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT) ? " extended" : - ""); - is_tcp = is_udp = 0; - if ((is_ip = (is_ip4 | is_ip6))) - { - is_tcp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP) != 0; - is_udp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP) != 0; - if (is_tcp) - s = format (s, ", tcp"); - if (is_udp) - s = format (s, ", udp"); - } - - if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED) - s = format (s, ", tcp checksum %s", - (s2 & IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR) ? "bad" : - "ok"); - if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED) - s = format (s, ", udp checksum %s", - (s2 & IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR) ? "bad" : - "ok"); - - return s; -} - -static u8 * -format_ixge_tx_descriptor (u8 * s, va_list * va) -{ - ixge_tx_descriptor_t *d = va_arg (*va, ixge_tx_descriptor_t *); - u32 s0 = d->status0, s1 = d->status1; - uword indent = format_get_indent (s); - u32 v; - - s = format (s, "buffer 0x%Lx, %d packet bytes, %d bytes this buffer", - d->buffer_address, s1 >> 14, d->n_bytes_this_buffer); - - s = format (s, "\n%U", format_white_space, indent); - - if ((v = (s0 >> 0) & 3)) - s = format (s, "reserved 0x%x, ", v); - - if ((v = (s0 >> 2) & 3)) - s = format (s, "mac 0x%x, ", v); - - if ((v = (s0 >> 4) & 0xf) != 3) - s = format (s, "type 0x%x, ", v); - - s = format (s, "%s%s%s%s%s%s%s%s", - (s0 & (1 << 8)) ? "eop, " : "", - (s0 & (1 << 9)) ? "insert-fcs, " : "", - (s0 & (1 << 10)) ? "reserved26, " : "", - (s0 & (1 << 11)) ? "report-status, " : "", - (s0 & (1 << 12)) ? "reserved28, " : "", - (s0 & (1 << 13)) ? "is-advanced, " : "", - (s0 & (1 << 14)) ? "vlan-enable, " : "", - (s0 & (1 << 15)) ? "tx-segmentation, " : ""); - - if ((v = s1 & 0xf) != 0) - s = format (s, "status 0x%x, ", v); - - if ((v = (s1 >> 4) & 0xf)) - s = format (s, "context 0x%x, ", v); - - if ((v = (s1 >> 8) & 0x3f)) - s = format (s, "options 0x%x, ", v); - - return s; -} - -typedef struct -{ - ixge_descriptor_t before, after; - - u32 buffer_index; - - u16 device_index; - - u8 queue_index; - - u8 is_start_of_packet; - - /* Copy of VLIB buffer; packet data stored in pre_data. */ - vlib_buffer_t buffer; -} ixge_rx_dma_trace_t; - -static u8 * -format_ixge_rx_dma_trace (u8 * s, va_list * va) -{ - CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); - vlib_node_t *node = va_arg (*va, vlib_node_t *); - vnet_main_t *vnm = vnet_get_main (); - ixge_rx_dma_trace_t *t = va_arg (*va, ixge_rx_dma_trace_t *); - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index); - format_function_t *f; - uword indent = format_get_indent (s); - - { - vnet_sw_interface_t *sw = - vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); - s = - format (s, "%U rx queue %d", format_vnet_sw_interface_name, vnm, sw, - t->queue_index); - } - - s = format (s, "\n%Ubefore: %U", - format_white_space, indent, - format_ixge_rx_from_hw_descriptor, &t->before); - s = format (s, "\n%Uafter : head/tail address 0x%Lx/0x%Lx", - format_white_space, indent, - t->after.rx_to_hw.head_address, t->after.rx_to_hw.tail_address); - - s = format (s, "\n%Ubuffer 0x%x: %U", - format_white_space, indent, - t->buffer_index, format_vlib_buffer, &t->buffer); - - s = format (s, "\n%U", format_white_space, indent); - - f = node->format_buffer; - if (!f || !t->is_start_of_packet) - f = format_hex_bytes; - s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); - - return s; -} - -#define foreach_ixge_error \ - _ (none, "no error") \ - _ (tx_full_drops, "tx ring full drops") \ - _ (ip4_checksum_error, "ip4 checksum errors") \ - _ (rx_alloc_fail, "rx buf alloc from free list failed") \ - _ (rx_alloc_no_physmem, "rx buf alloc failed no physmem") - -typedef enum -{ -#define _(f,s) IXGE_ERROR_##f, - foreach_ixge_error -#undef _ - IXGE_N_ERROR, -} ixge_error_t; - -always_inline void -ixge_rx_next_and_error_from_status_x1 (ixge_device_t * xd, - u32 s00, u32 s02, - u8 * next0, u8 * error0, u32 * flags0) -{ - u8 is0_ip4, is0_ip6, n0, e0; - u32 f0; - - e0 = IXGE_ERROR_none; - n0 = IXGE_RX_NEXT_ETHERNET_INPUT; - - is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; - n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0; - - e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) - ? IXGE_ERROR_ip4_checksum_error : e0); - - is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; - n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0; - - n0 = (xd->per_interface_next_index != ~0) ? - xd->per_interface_next_index : n0; - - /* Check for error. */ - n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0; - - f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED - | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) - ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); - - f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR - | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) - ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); - - *error0 = e0; - *next0 = n0; - *flags0 = f0; -} - -always_inline void -ixge_rx_next_and_error_from_status_x2 (ixge_device_t * xd, - u32 s00, u32 s02, - u32 s10, u32 s12, - u8 * next0, u8 * error0, u32 * flags0, - u8 * next1, u8 * error1, u32 * flags1) -{ - u8 is0_ip4, is0_ip6, n0, e0; - u8 is1_ip4, is1_ip6, n1, e1; - u32 f0, f1; - - e0 = e1 = IXGE_ERROR_none; - n0 = n1 = IXGE_RX_NEXT_IP4_INPUT; - - is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; - is1_ip4 = s12 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; - - n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0; - n1 = is1_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n1; - - e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) - ? IXGE_ERROR_ip4_checksum_error : e0); - e1 = (is1_ip4 && (s12 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) - ? IXGE_ERROR_ip4_checksum_error : e1); - - is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; - is1_ip6 = s10 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; - - n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0; - n1 = is1_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n1; - - n0 = (xd->per_interface_next_index != ~0) ? - xd->per_interface_next_index : n0; - n1 = (xd->per_interface_next_index != ~0) ? - xd->per_interface_next_index : n1; - - /* Check for error. */ - n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0; - n1 = e1 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n1; - - *error0 = e0; - *error1 = e1; - - *next0 = n0; - *next1 = n1; - - f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED - | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) - ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); - f1 = ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED - | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) - ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); - - f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR - | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) - ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); - f1 |= ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR - | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) - ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); - - *flags0 = f0; - *flags1 = f1; -} - -static void -ixge_rx_trace (ixge_main_t * xm, - ixge_device_t * xd, - ixge_dma_queue_t * dq, - ixge_descriptor_t * before_descriptors, - u32 * before_buffers, - ixge_descriptor_t * after_descriptors, uword n_descriptors) -{ - vlib_main_t *vm = xm->vlib_main; - vlib_node_runtime_t *node = dq->rx.node; - ixge_rx_from_hw_descriptor_t *bd; - ixge_rx_to_hw_descriptor_t *ad; - u32 *b, n_left, is_sop, next_index_sop; - - n_left = n_descriptors; - b = before_buffers; - bd = &before_descriptors->rx_from_hw; - ad = &after_descriptors->rx_to_hw; - is_sop = dq->rx.is_start_of_packet; - next_index_sop = dq->rx.saved_start_of_packet_next_index; - - while (n_left >= 2) - { - u32 bi0, bi1, flags0, flags1; - vlib_buffer_t *b0, *b1; - ixge_rx_dma_trace_t *t0, *t1; - u8 next0, error0, next1, error1; - - bi0 = b[0]; - bi1 = b[1]; - n_left -= 2; - - b0 = vlib_get_buffer (vm, bi0); - b1 = vlib_get_buffer (vm, bi1); - - ixge_rx_next_and_error_from_status_x2 (xd, - bd[0].status[0], bd[0].status[2], - bd[1].status[0], bd[1].status[2], - &next0, &error0, &flags0, - &next1, &error1, &flags1); - - next_index_sop = is_sop ? next0 : next_index_sop; - vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0); - t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); - t0->is_start_of_packet = is_sop; - is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - next_index_sop = is_sop ? next1 : next_index_sop; - vlib_trace_buffer (vm, node, next_index_sop, b1, /* follow_chain */ 0); - t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); - t1->is_start_of_packet = is_sop; - is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - t0->queue_index = dq->queue_index; - t1->queue_index = dq->queue_index; - t0->device_index = xd->device_index; - t1->device_index = xd->device_index; - t0->before.rx_from_hw = bd[0]; - t1->before.rx_from_hw = bd[1]; - t0->after.rx_to_hw = ad[0]; - t1->after.rx_to_hw = ad[1]; - t0->buffer_index = bi0; - t1->buffer_index = bi1; - memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); - memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b0->pre_data)); - memcpy (t0->buffer.pre_data, b0->data + b0->current_data, - sizeof (t0->buffer.pre_data)); - memcpy (t1->buffer.pre_data, b1->data + b1->current_data, - sizeof (t1->buffer.pre_data)); - - b += 2; - bd += 2; - ad += 2; - } - - while (n_left >= 1) - { - u32 bi0, flags0; - vlib_buffer_t *b0; - ixge_rx_dma_trace_t *t0; - u8 next0, error0; - - bi0 = b[0]; - n_left -= 1; - - b0 = vlib_get_buffer (vm, bi0); - - ixge_rx_next_and_error_from_status_x1 (xd, - bd[0].status[0], bd[0].status[2], - &next0, &error0, &flags0); - - next_index_sop = is_sop ? next0 : next_index_sop; - vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0); - t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); - t0->is_start_of_packet = is_sop; - is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - t0->queue_index = dq->queue_index; - t0->device_index = xd->device_index; - t0->before.rx_from_hw = bd[0]; - t0->after.rx_to_hw = ad[0]; - t0->buffer_index = bi0; - memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); - memcpy (t0->buffer.pre_data, b0->data + b0->current_data, - sizeof (t0->buffer.pre_data)); - - b += 1; - bd += 1; - ad += 1; - } -} - -typedef struct -{ - ixge_tx_descriptor_t descriptor; - - u32 buffer_index; - - u16 device_index; - - u8 queue_index; - - u8 is_start_of_packet; - - /* Copy of VLIB buffer; packet data stored in pre_data. */ - vlib_buffer_t buffer; -} ixge_tx_dma_trace_t; - -static u8 * -format_ixge_tx_dma_trace (u8 * s, va_list * va) -{ - CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); - CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); - ixge_tx_dma_trace_t *t = va_arg (*va, ixge_tx_dma_trace_t *); - vnet_main_t *vnm = vnet_get_main (); - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index); - format_function_t *f; - uword indent = format_get_indent (s); - - { - vnet_sw_interface_t *sw = - vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); - s = - format (s, "%U tx queue %d", format_vnet_sw_interface_name, vnm, sw, - t->queue_index); - } - - s = format (s, "\n%Udescriptor: %U", - format_white_space, indent, - format_ixge_tx_descriptor, &t->descriptor); - - s = format (s, "\n%Ubuffer 0x%x: %U", - format_white_space, indent, - t->buffer_index, format_vlib_buffer, &t->buffer); - - s = format (s, "\n%U", format_white_space, indent); - - f = format_ethernet_header_with_length; - if (!f || !t->is_start_of_packet) - f = format_hex_bytes; - s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); - - return s; -} - -typedef struct -{ - vlib_node_runtime_t *node; - - u32 is_start_of_packet; - - u32 n_bytes_in_packet; - - ixge_tx_descriptor_t *start_of_packet_descriptor; -} ixge_tx_state_t; - -static void -ixge_tx_trace (ixge_main_t * xm, - ixge_device_t * xd, - ixge_dma_queue_t * dq, - ixge_tx_state_t * tx_state, - ixge_tx_descriptor_t * descriptors, - u32 * buffers, uword n_descriptors) -{ - vlib_main_t *vm = xm->vlib_main; - vlib_node_runtime_t *node = tx_state->node; - ixge_tx_descriptor_t *d; - u32 *b, n_left, is_sop; - - n_left = n_descriptors; - b = buffers; - d = descriptors; - is_sop = tx_state->is_start_of_packet; - - while (n_left >= 2) - { - u32 bi0, bi1; - vlib_buffer_t *b0, *b1; - ixge_tx_dma_trace_t *t0, *t1; - - bi0 = b[0]; - bi1 = b[1]; - n_left -= 2; - - b0 = vlib_get_buffer (vm, bi0); - b1 = vlib_get_buffer (vm, bi1); - - t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); - t0->is_start_of_packet = is_sop; - is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); - t1->is_start_of_packet = is_sop; - is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - t0->queue_index = dq->queue_index; - t1->queue_index = dq->queue_index; - t0->device_index = xd->device_index; - t1->device_index = xd->device_index; - t0->descriptor = d[0]; - t1->descriptor = d[1]; - t0->buffer_index = bi0; - t1->buffer_index = bi1; - memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); - memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b0->pre_data)); - memcpy (t0->buffer.pre_data, b0->data + b0->current_data, - sizeof (t0->buffer.pre_data)); - memcpy (t1->buffer.pre_data, b1->data + b1->current_data, - sizeof (t1->buffer.pre_data)); - - b += 2; - d += 2; - } - - while (n_left >= 1) - { - u32 bi0; - vlib_buffer_t *b0; - ixge_tx_dma_trace_t *t0; - - bi0 = b[0]; - n_left -= 1; - - b0 = vlib_get_buffer (vm, bi0); - - t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); - t0->is_start_of_packet = is_sop; - is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - t0->queue_index = dq->queue_index; - t0->device_index = xd->device_index; - t0->descriptor = d[0]; - t0->buffer_index = bi0; - memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); - memcpy (t0->buffer.pre_data, b0->data + b0->current_data, - sizeof (t0->buffer.pre_data)); - - b += 1; - d += 1; - } -} - -always_inline uword -ixge_ring_sub (ixge_dma_queue_t * q, u32 i0, u32 i1) -{ - i32 d = i1 - i0; - ASSERT (i0 < q->n_descriptors); - ASSERT (i1 < q->n_descriptors); - return d < 0 ? q->n_descriptors + d : d; -} - -always_inline uword -ixge_ring_add (ixge_dma_queue_t * q, u32 i0, u32 i1) -{ - u32 d = i0 + i1; - ASSERT (i0 < q->n_descriptors); - ASSERT (i1 < q->n_descriptors); - d -= d >= q->n_descriptors ? q->n_descriptors : 0; - return d; -} - -always_inline uword -ixge_tx_descriptor_matches_template (ixge_main_t * xm, - ixge_tx_descriptor_t * d) -{ - u32 cmp; - - cmp = ((d->status0 & xm->tx_descriptor_template_mask.status0) - ^ xm->tx_descriptor_template.status0); - if (cmp) - return 0; - cmp = ((d->status1 & xm->tx_descriptor_template_mask.status1) - ^ xm->tx_descriptor_template.status1); - if (cmp) - return 0; - - return 1; -} - -static uword -ixge_tx_no_wrap (ixge_main_t * xm, - ixge_device_t * xd, - ixge_dma_queue_t * dq, - u32 * buffers, - u32 start_descriptor_index, - u32 n_descriptors, ixge_tx_state_t * tx_state) -{ - vlib_main_t *vm = xm->vlib_main; - ixge_tx_descriptor_t *d, *d_sop; - u32 n_left = n_descriptors; - u32 *to_free = vec_end (xm->tx_buffers_pending_free); - u32 *to_tx = - vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index); - u32 is_sop = tx_state->is_start_of_packet; - u32 len_sop = tx_state->n_bytes_in_packet; - u16 template_status = xm->tx_descriptor_template.status0; - u32 descriptor_prefetch_rotor = 0; - - ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors); - d = &dq->descriptors[start_descriptor_index].tx; - d_sop = is_sop ? d : tx_state->start_of_packet_descriptor; - - while (n_left >= 4) - { - vlib_buffer_t *b0, *b1; - u32 bi0, fi0, len0; - u32 bi1, fi1, len1; - u8 is_eop0, is_eop1; - - /* Prefetch next iteration. */ - vlib_prefetch_buffer_with_index (vm, buffers[2], LOAD); - vlib_prefetch_buffer_with_index (vm, buffers[3], LOAD); - - if ((descriptor_prefetch_rotor & 0x3) == 0) - CLIB_PREFETCH (d + 4, CLIB_CACHE_LINE_BYTES, STORE); - - descriptor_prefetch_rotor += 2; - - bi0 = buffers[0]; - bi1 = buffers[1]; - - to_free[0] = fi0 = to_tx[0]; - to_tx[0] = bi0; - to_free += fi0 != 0; - - to_free[0] = fi1 = to_tx[1]; - to_tx[1] = bi1; - to_free += fi1 != 0; - - buffers += 2; - n_left -= 2; - to_tx += 2; - - b0 = vlib_get_buffer (vm, bi0); - b1 = vlib_get_buffer (vm, bi1); - - is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - is_eop1 = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - len0 = b0->current_length; - len1 = b1->current_length; - - ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0)); - ASSERT (ixge_tx_descriptor_matches_template (xm, d + 1)); - - d[0].buffer_address = - vlib_get_buffer_data_physical_address (vm, bi0) + b0->current_data; - d[1].buffer_address = - vlib_get_buffer_data_physical_address (vm, bi1) + b1->current_data; - - d[0].n_bytes_this_buffer = len0; - d[1].n_bytes_this_buffer = len1; - - d[0].status0 = - template_status | (is_eop0 << - IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); - d[1].status0 = - template_status | (is_eop1 << - IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); - - len_sop = (is_sop ? 0 : len_sop) + len0; - d_sop[0].status1 = - IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); - d += 1; - d_sop = is_eop0 ? d : d_sop; - - is_sop = is_eop0; - - len_sop = (is_sop ? 0 : len_sop) + len1; - d_sop[0].status1 = - IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); - d += 1; - d_sop = is_eop1 ? d : d_sop; - - is_sop = is_eop1; - } - - while (n_left > 0) - { - vlib_buffer_t *b0; - u32 bi0, fi0, len0; - u8 is_eop0; - - bi0 = buffers[0]; - - to_free[0] = fi0 = to_tx[0]; - to_tx[0] = bi0; - to_free += fi0 != 0; - - buffers += 1; - n_left -= 1; - to_tx += 1; - - b0 = vlib_get_buffer (vm, bi0); - - is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - len0 = b0->current_length; - - ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0)); - - d[0].buffer_address = - vlib_get_buffer_data_physical_address (vm, bi0) + b0->current_data; - - d[0].n_bytes_this_buffer = len0; - - d[0].status0 = - template_status | (is_eop0 << - IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); - - len_sop = (is_sop ? 0 : len_sop) + len0; - d_sop[0].status1 = - IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); - d += 1; - d_sop = is_eop0 ? d : d_sop; - - is_sop = is_eop0; - } - - if (tx_state->node->flags & VLIB_NODE_FLAG_TRACE) - { - to_tx = - vec_elt_at_index (dq->descriptor_buffer_indices, - start_descriptor_index); - ixge_tx_trace (xm, xd, dq, tx_state, - &dq->descriptors[start_descriptor_index].tx, to_tx, - n_descriptors); - } - - _vec_len (xm->tx_buffers_pending_free) = - to_free - xm->tx_buffers_pending_free; - - /* When we are done d_sop can point to end of ring. Wrap it if so. */ - { - ixge_tx_descriptor_t *d_start = &dq->descriptors[0].tx; - - ASSERT (d_sop - d_start <= dq->n_descriptors); - d_sop = d_sop - d_start == dq->n_descriptors ? d_start : d_sop; - } - - tx_state->is_start_of_packet = is_sop; - tx_state->start_of_packet_descriptor = d_sop; - tx_state->n_bytes_in_packet = len_sop; - - return n_descriptors; -} - -static uword -ixge_interface_tx (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * f) -{ - ixge_main_t *xm = &ixge_main; - vnet_interface_output_runtime_t *rd = (void *) node->runtime_data; - ixge_device_t *xd = vec_elt_at_index (xm->devices, rd->dev_instance); - ixge_dma_queue_t *dq; - u32 *from, n_left_tx, n_descriptors_to_tx, n_tail_drop; - u32 queue_index = 0; /* fixme parameter */ - ixge_tx_state_t tx_state; - - tx_state.node = node; - tx_state.is_start_of_packet = 1; - tx_state.start_of_packet_descriptor = 0; - tx_state.n_bytes_in_packet = 0; - - from = vlib_frame_vector_args (f); - - dq = vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index); - - dq->head_index = dq->tx.head_index_write_back[0]; - - /* Since head == tail means ring is empty we can send up to dq->n_descriptors - 1. */ - n_left_tx = dq->n_descriptors - 1; - n_left_tx -= ixge_ring_sub (dq, dq->head_index, dq->tail_index); - - _vec_len (xm->tx_buffers_pending_free) = 0; - - n_descriptors_to_tx = f->n_vectors; - n_tail_drop = 0; - if (PREDICT_FALSE (n_descriptors_to_tx > n_left_tx)) - { - i32 i, n_ok, i_eop, i_sop; - - i_sop = i_eop = ~0; - for (i = n_left_tx - 1; i >= 0; i--) - { - vlib_buffer_t *b = vlib_get_buffer (vm, from[i]); - if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) - { - if (i_sop != ~0 && i_eop != ~0) - break; - i_eop = i; - i_sop = i + 1; - } - } - if (i == 0) - n_ok = 0; - else - n_ok = i_eop + 1; - - { - ELOG_TYPE_DECLARE (e) = - { - .function = (char *) __FUNCTION__,.format = - "ixge %d, ring full to tx %d head %d tail %d",.format_args = - "i2i2i2i2",}; - struct - { - u16 instance, to_tx, head, tail; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->instance = xd->device_index; - ed->to_tx = n_descriptors_to_tx; - ed->head = dq->head_index; - ed->tail = dq->tail_index; - } - - if (n_ok < n_descriptors_to_tx) - { - n_tail_drop = n_descriptors_to_tx - n_ok; - vec_add (xm->tx_buffers_pending_free, from + n_ok, n_tail_drop); - vlib_error_count (vm, ixge_input_node.index, - IXGE_ERROR_tx_full_drops, n_tail_drop); - } - - n_descriptors_to_tx = n_ok; - } - - dq->tx.n_buffers_on_ring += n_descriptors_to_tx; - - /* Process from tail to end of descriptor ring. */ - if (n_descriptors_to_tx > 0 && dq->tail_index < dq->n_descriptors) - { - u32 n = - clib_min (dq->n_descriptors - dq->tail_index, n_descriptors_to_tx); - n = ixge_tx_no_wrap (xm, xd, dq, from, dq->tail_index, n, &tx_state); - from += n; - n_descriptors_to_tx -= n; - dq->tail_index += n; - ASSERT (dq->tail_index <= dq->n_descriptors); - if (dq->tail_index == dq->n_descriptors) - dq->tail_index = 0; - } - - if (n_descriptors_to_tx > 0) - { - u32 n = - ixge_tx_no_wrap (xm, xd, dq, from, 0, n_descriptors_to_tx, &tx_state); - from += n; - ASSERT (n == n_descriptors_to_tx); - dq->tail_index += n; - ASSERT (dq->tail_index <= dq->n_descriptors); - if (dq->tail_index == dq->n_descriptors) - dq->tail_index = 0; - } - - /* We should only get full packets. */ - ASSERT (tx_state.is_start_of_packet); - - /* Report status when last descriptor is done. */ - { - u32 i = dq->tail_index == 0 ? dq->n_descriptors - 1 : dq->tail_index - 1; - ixge_tx_descriptor_t *d = &dq->descriptors[i].tx; - d->status0 |= IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS; - } - - /* Give new descriptors to hardware. */ - { - ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_TX, queue_index); - - CLIB_MEMORY_BARRIER (); - - dr->tail_index = dq->tail_index; - } - - /* Free any buffers that are done. */ - { - u32 n = _vec_len (xm->tx_buffers_pending_free); - if (n > 0) - { - vlib_buffer_free_no_next (vm, xm->tx_buffers_pending_free, n); - _vec_len (xm->tx_buffers_pending_free) = 0; - ASSERT (dq->tx.n_buffers_on_ring >= n); - dq->tx.n_buffers_on_ring -= (n - n_tail_drop); - } - } - - return f->n_vectors; -} - -static uword -ixge_rx_queue_no_wrap (ixge_main_t * xm, - ixge_device_t * xd, - ixge_dma_queue_t * dq, - u32 start_descriptor_index, u32 n_descriptors) -{ - vlib_main_t *vm = xm->vlib_main; - vlib_node_runtime_t *node = dq->rx.node; - ixge_descriptor_t *d; - static ixge_descriptor_t *d_trace_save; - static u32 *d_trace_buffers; - u32 n_descriptors_left = n_descriptors; - u32 *to_rx = - vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index); - u32 *to_add; - u32 bi_sop = dq->rx.saved_start_of_packet_buffer_index; - u32 bi_last = dq->rx.saved_last_buffer_index; - u32 next_index_sop = dq->rx.saved_start_of_packet_next_index; - u32 is_sop = dq->rx.is_start_of_packet; - u32 next_index, n_left_to_next, *to_next; - u32 n_packets = 0; - u32 n_bytes = 0; - u32 n_trace = vlib_get_trace_count (vm, node); - vlib_buffer_t *b_last, b_dummy; - - ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors); - d = &dq->descriptors[start_descriptor_index]; - - b_last = bi_last != ~0 ? vlib_get_buffer (vm, bi_last) : &b_dummy; - next_index = dq->rx.next_index; - - if (n_trace > 0) - { - u32 n = clib_min (n_trace, n_descriptors); - if (d_trace_save) - { - _vec_len (d_trace_save) = 0; - _vec_len (d_trace_buffers) = 0; - } - vec_add (d_trace_save, (ixge_descriptor_t *) d, n); - vec_add (d_trace_buffers, to_rx, n); - } - - { - uword l = vec_len (xm->rx_buffers_to_add); - - if (l < n_descriptors_left) - { - u32 n_to_alloc = 2 * dq->n_descriptors - l; - u32 n_allocated; - - vec_resize (xm->rx_buffers_to_add, n_to_alloc); - - _vec_len (xm->rx_buffers_to_add) = l; - n_allocated = vlib_buffer_alloc_from_free_list - (vm, xm->rx_buffers_to_add + l, n_to_alloc, - xm->vlib_buffer_free_list_index); - _vec_len (xm->rx_buffers_to_add) += n_allocated; - - /* Handle transient allocation failure */ - if (PREDICT_FALSE (l + n_allocated <= n_descriptors_left)) - { - if (n_allocated == 0) - vlib_error_count (vm, ixge_input_node.index, - IXGE_ERROR_rx_alloc_no_physmem, 1); - else - vlib_error_count (vm, ixge_input_node.index, - IXGE_ERROR_rx_alloc_fail, 1); - - n_descriptors_left = l + n_allocated; - } - n_descriptors = n_descriptors_left; - } - - /* Add buffers from end of vector going backwards. */ - to_add = vec_end (xm->rx_buffers_to_add) - 1; - } - - while (n_descriptors_left > 0) - { - vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - - while (n_descriptors_left >= 4 && n_left_to_next >= 2) - { - vlib_buffer_t *b0, *b1; - u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0; - u32 bi1, fi1, len1, l3_offset1, s21, s01, flags1; - u8 is_eop0, error0, next0; - u8 is_eop1, error1, next1; - ixge_descriptor_t d0, d1; - - vlib_prefetch_buffer_with_index (vm, to_rx[2], STORE); - vlib_prefetch_buffer_with_index (vm, to_rx[3], STORE); - - CLIB_PREFETCH (d + 2, 32, STORE); - - d0.as_u32x4 = d[0].as_u32x4; - d1.as_u32x4 = d[1].as_u32x4; - - s20 = d0.rx_from_hw.status[2]; - s21 = d1.rx_from_hw.status[2]; - - s00 = d0.rx_from_hw.status[0]; - s01 = d1.rx_from_hw.status[0]; - - if (! - ((s20 & s21) & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE)) - goto found_hw_owned_descriptor_x2; - - bi0 = to_rx[0]; - bi1 = to_rx[1]; - - ASSERT (to_add - 1 >= xm->rx_buffers_to_add); - fi0 = to_add[0]; - fi1 = to_add[-1]; - - to_rx[0] = fi0; - to_rx[1] = fi1; - to_rx += 2; - to_add -= 2; - - ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == - vlib_buffer_is_known (vm, bi0)); - ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == - vlib_buffer_is_known (vm, bi1)); - ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == - vlib_buffer_is_known (vm, fi0)); - ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == - vlib_buffer_is_known (vm, fi1)); - - b0 = vlib_get_buffer (vm, bi0); - b1 = vlib_get_buffer (vm, bi1); - - /* - * Turn this on if you run into - * "bad monkey" contexts, and you want to know exactly - * which nodes they've visited... See main.c... - */ - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); - - CLIB_PREFETCH (b0->data, CLIB_CACHE_LINE_BYTES, LOAD); - CLIB_PREFETCH (b1->data, CLIB_CACHE_LINE_BYTES, LOAD); - - is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; - is_eop1 = (s21 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; - - ixge_rx_next_and_error_from_status_x2 (xd, s00, s20, s01, s21, - &next0, &error0, &flags0, - &next1, &error1, &flags1); - - next0 = is_sop ? next0 : next_index_sop; - next1 = is_eop0 ? next1 : next0; - next_index_sop = next1; - - b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT); - b1->flags |= flags1 | (!is_eop1 << VLIB_BUFFER_LOG2_NEXT_PRESENT); - - vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; - vnet_buffer (b1)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; - vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; - vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; - - b0->error = node->errors[error0]; - b1->error = node->errors[error1]; - - len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor; - len1 = d1.rx_from_hw.n_packet_bytes_this_descriptor; - n_bytes += len0 + len1; - n_packets += is_eop0 + is_eop1; - - /* Give new buffers to hardware. */ - d0.rx_to_hw.tail_address = - vlib_get_buffer_data_physical_address (vm, fi0); - d1.rx_to_hw.tail_address = - vlib_get_buffer_data_physical_address (vm, fi1); - d0.rx_to_hw.head_address = d[0].rx_to_hw.tail_address; - d1.rx_to_hw.head_address = d[1].rx_to_hw.tail_address; - d[0].as_u32x4 = d0.as_u32x4; - d[1].as_u32x4 = d1.as_u32x4; - - d += 2; - n_descriptors_left -= 2; - - /* Point to either l2 or l3 header depending on next. */ - l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT)) - ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0; - l3_offset1 = (is_eop0 && (next1 != IXGE_RX_NEXT_ETHERNET_INPUT)) - ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s01) : 0; - - b0->current_length = len0 - l3_offset0; - b1->current_length = len1 - l3_offset1; - b0->current_data = l3_offset0; - b1->current_data = l3_offset1; - - b_last->next_buffer = is_sop ? ~0 : bi0; - b0->next_buffer = is_eop0 ? ~0 : bi1; - bi_last = bi1; - b_last = b1; - - if (CLIB_DEBUG > 0) - { - u32 bi_sop0 = is_sop ? bi0 : bi_sop; - u32 bi_sop1 = is_eop0 ? bi1 : bi_sop0; - - if (is_eop0) - { - u8 *msg = vlib_validate_buffer (vm, bi_sop0, - /* follow_buffer_next */ 1); - ASSERT (!msg); - } - if (is_eop1) - { - u8 *msg = vlib_validate_buffer (vm, bi_sop1, - /* follow_buffer_next */ 1); - ASSERT (!msg); - } - } - if (0) /* "Dave" version */ - { - u32 bi_sop0 = is_sop ? bi0 : bi_sop; - u32 bi_sop1 = is_eop0 ? bi1 : bi_sop0; - - if (is_eop0) - { - to_next[0] = bi_sop0; - to_next++; - n_left_to_next--; - - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - bi_sop0, next0); - } - if (is_eop1) - { - to_next[0] = bi_sop1; - to_next++; - n_left_to_next--; - - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - bi_sop1, next1); - } - is_sop = is_eop1; - bi_sop = bi_sop1; - } - if (1) /* "Eliot" version */ - { - /* Speculatively enqueue to cached next. */ - u8 saved_is_sop = is_sop; - u32 bi_sop_save = bi_sop; - - bi_sop = saved_is_sop ? bi0 : bi_sop; - to_next[0] = bi_sop; - to_next += is_eop0; - n_left_to_next -= is_eop0; - - bi_sop = is_eop0 ? bi1 : bi_sop; - to_next[0] = bi_sop; - to_next += is_eop1; - n_left_to_next -= is_eop1; - - is_sop = is_eop1; - - if (PREDICT_FALSE - (!(next0 == next_index && next1 == next_index))) - { - /* Undo speculation. */ - to_next -= is_eop0 + is_eop1; - n_left_to_next += is_eop0 + is_eop1; - - /* Re-do both descriptors being careful about where we enqueue. */ - bi_sop = saved_is_sop ? bi0 : bi_sop_save; - if (is_eop0) - { - if (next0 != next_index) - vlib_set_next_frame_buffer (vm, node, next0, bi_sop); - else - { - to_next[0] = bi_sop; - to_next += 1; - n_left_to_next -= 1; - } - } - - bi_sop = is_eop0 ? bi1 : bi_sop; - if (is_eop1) - { - if (next1 != next_index) - vlib_set_next_frame_buffer (vm, node, next1, bi_sop); - else - { - to_next[0] = bi_sop; - to_next += 1; - n_left_to_next -= 1; - } - } - - /* Switch cached next index when next for both packets is the same. */ - if (is_eop0 && is_eop1 && next0 == next1) - { - vlib_put_next_frame (vm, node, next_index, - n_left_to_next); - next_index = next0; - vlib_get_next_frame (vm, node, next_index, - to_next, n_left_to_next); - } - } - } - } - - /* Bail out of dual loop and proceed with single loop. */ - found_hw_owned_descriptor_x2: - - while (n_descriptors_left > 0 && n_left_to_next > 0) - { - vlib_buffer_t *b0; - u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0; - u8 is_eop0, error0, next0; - ixge_descriptor_t d0; - - d0.as_u32x4 = d[0].as_u32x4; - - s20 = d0.rx_from_hw.status[2]; - s00 = d0.rx_from_hw.status[0]; - - if (!(s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE)) - goto found_hw_owned_descriptor_x1; - - bi0 = to_rx[0]; - ASSERT (to_add >= xm->rx_buffers_to_add); - fi0 = to_add[0]; - - to_rx[0] = fi0; - to_rx += 1; - to_add -= 1; - - ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == - vlib_buffer_is_known (vm, bi0)); - ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == - vlib_buffer_is_known (vm, fi0)); - - b0 = vlib_get_buffer (vm, bi0); - - /* - * Turn this on if you run into - * "bad monkey" contexts, and you want to know exactly - * which nodes they've visited... - */ - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); - - is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; - ixge_rx_next_and_error_from_status_x1 - (xd, s00, s20, &next0, &error0, &flags0); - - next0 = is_sop ? next0 : next_index_sop; - next_index_sop = next0; - - b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT); - - vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; - vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; - - b0->error = node->errors[error0]; - - len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor; - n_bytes += len0; - n_packets += is_eop0; - - /* Give new buffer to hardware. */ - d0.rx_to_hw.tail_address = - vlib_get_buffer_data_physical_address (vm, fi0); - d0.rx_to_hw.head_address = d0.rx_to_hw.tail_address; - d[0].as_u32x4 = d0.as_u32x4; - - d += 1; - n_descriptors_left -= 1; - - /* Point to either l2 or l3 header depending on next. */ - l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT)) - ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0; - b0->current_length = len0 - l3_offset0; - b0->current_data = l3_offset0; - - b_last->next_buffer = is_sop ? ~0 : bi0; - bi_last = bi0; - b_last = b0; - - bi_sop = is_sop ? bi0 : bi_sop; - - if (CLIB_DEBUG > 0 && is_eop0) - { - u8 *msg = - vlib_validate_buffer (vm, bi_sop, /* follow_buffer_next */ 1); - ASSERT (!msg); - } - - if (0) /* "Dave" version */ - { - if (is_eop0) - { - to_next[0] = bi_sop; - to_next++; - n_left_to_next--; - - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - bi_sop, next0); - } - } - if (1) /* "Eliot" version */ - { - if (PREDICT_TRUE (next0 == next_index)) - { - to_next[0] = bi_sop; - to_next += is_eop0; - n_left_to_next -= is_eop0; - } - else - { - if (next0 != next_index && is_eop0) - vlib_set_next_frame_buffer (vm, node, next0, bi_sop); - - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - next_index = next0; - vlib_get_next_frame (vm, node, next_index, - to_next, n_left_to_next); - } - } - is_sop = is_eop0; - } - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - } - -found_hw_owned_descriptor_x1: - if (n_descriptors_left > 0) - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - - _vec_len (xm->rx_buffers_to_add) = (to_add + 1) - xm->rx_buffers_to_add; - - { - u32 n_done = n_descriptors - n_descriptors_left; - - if (n_trace > 0 && n_done > 0) - { - u32 n = clib_min (n_trace, n_done); - ixge_rx_trace (xm, xd, dq, - d_trace_save, - d_trace_buffers, - &dq->descriptors[start_descriptor_index], n); - vlib_set_trace_count (vm, node, n_trace - n); - } - if (d_trace_save) - { - _vec_len (d_trace_save) = 0; - _vec_len (d_trace_buffers) = 0; - } - - /* Don't keep a reference to b_last if we don't have to. - Otherwise we can over-write a next_buffer pointer after already haven - enqueued a packet. */ - if (is_sop) - { - b_last->next_buffer = ~0; - bi_last = ~0; - } - - dq->rx.n_descriptors_done_this_call = n_done; - dq->rx.n_descriptors_done_total += n_done; - dq->rx.is_start_of_packet = is_sop; - dq->rx.saved_start_of_packet_buffer_index = bi_sop; - dq->rx.saved_last_buffer_index = bi_last; - dq->rx.saved_start_of_packet_next_index = next_index_sop; - dq->rx.next_index = next_index; - dq->rx.n_bytes += n_bytes; - - return n_packets; - } -} - -static uword -ixge_rx_queue (ixge_main_t * xm, - ixge_device_t * xd, - vlib_node_runtime_t * node, u32 queue_index) -{ - ixge_dma_queue_t *dq = - vec_elt_at_index (xd->dma_queues[VLIB_RX], queue_index); - ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, dq->queue_index); - uword n_packets = 0; - u32 hw_head_index, sw_head_index; - - /* One time initialization. */ - if (!dq->rx.node) - { - dq->rx.node = node; - dq->rx.is_start_of_packet = 1; - dq->rx.saved_start_of_packet_buffer_index = ~0; - dq->rx.saved_last_buffer_index = ~0; - } - - dq->rx.next_index = node->cached_next_index; - - dq->rx.n_descriptors_done_total = 0; - dq->rx.n_descriptors_done_this_call = 0; - dq->rx.n_bytes = 0; - - /* Fetch head from hardware and compare to where we think we are. */ - hw_head_index = dr->head_index; - sw_head_index = dq->head_index; - - if (hw_head_index == sw_head_index) - goto done; - - if (hw_head_index < sw_head_index) - { - u32 n_tried = dq->n_descriptors - sw_head_index; - n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried); - sw_head_index = - ixge_ring_add (dq, sw_head_index, - dq->rx.n_descriptors_done_this_call); - - if (dq->rx.n_descriptors_done_this_call != n_tried) - goto done; - } - if (hw_head_index >= sw_head_index) - { - u32 n_tried = hw_head_index - sw_head_index; - n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried); - sw_head_index = - ixge_ring_add (dq, sw_head_index, - dq->rx.n_descriptors_done_this_call); - } - -done: - dq->head_index = sw_head_index; - dq->tail_index = - ixge_ring_add (dq, dq->tail_index, dq->rx.n_descriptors_done_total); - - /* Give tail back to hardware. */ - CLIB_MEMORY_BARRIER (); - - dr->tail_index = dq->tail_index; - - vlib_increment_combined_counter (vnet_main. - interface_main.combined_sw_if_counters + - VNET_INTERFACE_COUNTER_RX, - 0 /* cpu_index */ , - xd->vlib_sw_if_index, n_packets, - dq->rx.n_bytes); - - return n_packets; -} - -static void -ixge_interrupt (ixge_main_t * xm, ixge_device_t * xd, u32 i) -{ - vlib_main_t *vm = xm->vlib_main; - ixge_regs_t *r = xd->regs; - - if (i != 20) - { - ELOG_TYPE_DECLARE (e) = - { - .function = (char *) __FUNCTION__,.format = - "ixge %d, %s",.format_args = "i1t1",.n_enum_strings = - 16,.enum_strings = - { - "flow director", - "rx miss", - "pci exception", - "mailbox", - "link status change", - "linksec key exchange", - "manageability event", - "reserved23", - "sdp0", - "sdp1", - "sdp2", - "sdp3", - "ecc", "descriptor handler error", "tcp timer", "other",},}; - struct - { - u8 instance; - u8 index; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->instance = xd->device_index; - ed->index = i - 16; - } - else - { - u32 v = r->xge_mac.link_status; - uword is_up = (v & (1 << 30)) != 0; - - ELOG_TYPE_DECLARE (e) = - { - .function = (char *) __FUNCTION__,.format = - "ixge %d, link status change 0x%x",.format_args = "i4i4",}; - struct - { - u32 instance, link_status; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->instance = xd->device_index; - ed->link_status = v; - xd->link_status_at_last_link_change = v; - - vlib_process_signal_event (vm, ixge_process_node.index, - EVENT_SET_FLAGS, - ((is_up << 31) | xd->vlib_hw_if_index)); - } -} - -always_inline u32 -clean_block (u32 * b, u32 * t, u32 n_left) -{ - u32 *t0 = t; - - while (n_left >= 4) - { - u32 bi0, bi1, bi2, bi3; - - t[0] = bi0 = b[0]; - b[0] = 0; - t += bi0 != 0; - - t[0] = bi1 = b[1]; - b[1] = 0; - t += bi1 != 0; - - t[0] = bi2 = b[2]; - b[2] = 0; - t += bi2 != 0; - - t[0] = bi3 = b[3]; - b[3] = 0; - t += bi3 != 0; - - b += 4; - n_left -= 4; - } - - while (n_left > 0) - { - u32 bi0; - - t[0] = bi0 = b[0]; - b[0] = 0; - t += bi0 != 0; - b += 1; - n_left -= 1; - } - - return t - t0; -} - -static void -ixge_tx_queue (ixge_main_t * xm, ixge_device_t * xd, u32 queue_index) -{ - vlib_main_t *vm = xm->vlib_main; - ixge_dma_queue_t *dq = - vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index); - u32 n_clean, *b, *t, *t0; - i32 n_hw_owned_descriptors; - i32 first_to_clean, last_to_clean; - u64 hwbp_race = 0; - - /* Handle case where head write back pointer update - * arrives after the interrupt during high PCI bus loads. - */ - while ((dq->head_index == dq->tx.head_index_write_back[0]) && - dq->tx.n_buffers_on_ring && (dq->head_index != dq->tail_index)) - { - hwbp_race++; - if (IXGE_HWBP_RACE_ELOG && (hwbp_race == 1)) - { - ELOG_TYPE_DECLARE (e) = - { - .function = (char *) __FUNCTION__,.format = - "ixge %d tx head index race: head %4d, tail %4d, buffs %4d",.format_args - = "i4i4i4i4",}; - struct - { - u32 instance, head_index, tail_index, n_buffers_on_ring; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->instance = xd->device_index; - ed->head_index = dq->head_index; - ed->tail_index = dq->tail_index; - ed->n_buffers_on_ring = dq->tx.n_buffers_on_ring; - } - } - - dq->head_index = dq->tx.head_index_write_back[0]; - n_hw_owned_descriptors = ixge_ring_sub (dq, dq->head_index, dq->tail_index); - ASSERT (dq->tx.n_buffers_on_ring >= n_hw_owned_descriptors); - n_clean = dq->tx.n_buffers_on_ring - n_hw_owned_descriptors; - - if (IXGE_HWBP_RACE_ELOG && hwbp_race) - { - ELOG_TYPE_DECLARE (e) = - { - .function = (char *) __FUNCTION__,.format = - "ixge %d tx head index race: head %4d, hw_owned %4d, n_clean %4d, retries %d",.format_args - = "i4i4i4i4i4",}; - struct - { - u32 instance, head_index, n_hw_owned_descriptors, n_clean, retries; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->instance = xd->device_index; - ed->head_index = dq->head_index; - ed->n_hw_owned_descriptors = n_hw_owned_descriptors; - ed->n_clean = n_clean; - ed->retries = hwbp_race; - } - - /* - * This function used to wait until hardware owned zero descriptors. - * At high PPS rates, that doesn't happen until the TX ring is - * completely full of descriptors which need to be cleaned up. - * That, in turn, causes TX ring-full drops and/or long RX service - * interruptions. - */ - if (n_clean == 0) - return; - - /* Clean the n_clean descriptors prior to the reported hardware head */ - last_to_clean = dq->head_index - 1; - last_to_clean = (last_to_clean < 0) ? last_to_clean + dq->n_descriptors : - last_to_clean; - - first_to_clean = (last_to_clean) - (n_clean - 1); - first_to_clean = (first_to_clean < 0) ? first_to_clean + dq->n_descriptors : - first_to_clean; - - vec_resize (xm->tx_buffers_pending_free, dq->n_descriptors - 1); - t0 = t = xm->tx_buffers_pending_free; - b = dq->descriptor_buffer_indices + first_to_clean; - - /* Wrap case: clean from first to end, then start to last */ - if (first_to_clean > last_to_clean) - { - t += clean_block (b, t, (dq->n_descriptors - 1) - first_to_clean); - first_to_clean = 0; - b = dq->descriptor_buffer_indices; - } - - /* Typical case: clean from first to last */ - if (first_to_clean <= last_to_clean) - t += clean_block (b, t, (last_to_clean - first_to_clean) + 1); - - if (t > t0) - { - u32 n = t - t0; - vlib_buffer_free_no_next (vm, t0, n); - ASSERT (dq->tx.n_buffers_on_ring >= n); - dq->tx.n_buffers_on_ring -= n; - _vec_len (xm->tx_buffers_pending_free) = 0; - } -} - -/* RX queue interrupts 0 thru 7; TX 8 thru 15. */ -always_inline uword -ixge_interrupt_is_rx_queue (uword i) -{ - return i < 8; -} - -always_inline uword -ixge_interrupt_is_tx_queue (uword i) -{ - return i >= 8 && i < 16; -} - -always_inline uword -ixge_tx_queue_to_interrupt (uword i) -{ - return 8 + i; -} - -always_inline uword -ixge_rx_queue_to_interrupt (uword i) -{ - return 0 + i; -} - -always_inline uword -ixge_interrupt_rx_queue (uword i) -{ - ASSERT (ixge_interrupt_is_rx_queue (i)); - return i - 0; -} - -always_inline uword -ixge_interrupt_tx_queue (uword i) -{ - ASSERT (ixge_interrupt_is_tx_queue (i)); - return i - 8; -} - -static uword -ixge_device_input (ixge_main_t * xm, - ixge_device_t * xd, vlib_node_runtime_t * node) -{ - ixge_regs_t *r = xd->regs; - u32 i, s; - uword n_rx_packets = 0; - - s = r->interrupt.status_write_1_to_set; - if (s) - r->interrupt.status_write_1_to_clear = s; - - /* *INDENT-OFF* */ - foreach_set_bit (i, s, ({ - if (ixge_interrupt_is_rx_queue (i)) - n_rx_packets += ixge_rx_queue (xm, xd, node, ixge_interrupt_rx_queue (i)); - - else if (ixge_interrupt_is_tx_queue (i)) - ixge_tx_queue (xm, xd, ixge_interrupt_tx_queue (i)); - - else - ixge_interrupt (xm, xd, i); - })); - /* *INDENT-ON* */ - - return n_rx_packets; -} - -static uword -ixge_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) -{ - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd; - uword n_rx_packets = 0; - - if (node->state == VLIB_NODE_STATE_INTERRUPT) - { - uword i; - - /* Loop over devices with interrupts. */ - /* *INDENT-OFF* */ - foreach_set_bit (i, node->runtime_data[0], ({ - xd = vec_elt_at_index (xm->devices, i); - n_rx_packets += ixge_device_input (xm, xd, node); - - /* Re-enable interrupts since we're going to stay in interrupt mode. */ - if (! (node->flags & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)) - xd->regs->interrupt.enable_write_1_to_set = ~0; - })); - /* *INDENT-ON* */ - - /* Clear mask of devices with pending interrupts. */ - node->runtime_data[0] = 0; - } - else - { - /* Poll all devices for input/interrupts. */ - vec_foreach (xd, xm->devices) - { - n_rx_packets += ixge_device_input (xm, xd, node); - - /* Re-enable interrupts when switching out of polling mode. */ - if (node->flags & - VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) - xd->regs->interrupt.enable_write_1_to_set = ~0; - } - } - - return n_rx_packets; -} - -static char *ixge_error_strings[] = { -#define _(n,s) s, - foreach_ixge_error -#undef _ -}; - -/* *INDENT-OFF* */ -VLIB_REGISTER_NODE (ixge_input_node, static) = { - .function = ixge_input, - .type = VLIB_NODE_TYPE_INPUT, - .name = "ixge-input", - - /* Will be enabled if/when hardware is detected. */ - .state = VLIB_NODE_STATE_DISABLED, - - .format_buffer = format_ethernet_header_with_length, - .format_trace = format_ixge_rx_dma_trace, - - .n_errors = IXGE_N_ERROR, - .error_strings = ixge_error_strings, - - .n_next_nodes = IXGE_RX_N_NEXT, - .next_nodes = { - [IXGE_RX_NEXT_DROP] = "error-drop", - [IXGE_RX_NEXT_ETHERNET_INPUT] = "ethernet-input", - [IXGE_RX_NEXT_IP4_INPUT] = "ip4-input", - [IXGE_RX_NEXT_IP6_INPUT] = "ip6-input", - }, -}; - -VLIB_NODE_FUNCTION_MULTIARCH_CLONE (ixge_input) -CLIB_MULTIARCH_SELECT_FN (ixge_input) -/* *INDENT-ON* */ - -static u8 * -format_ixge_device_name (u8 * s, va_list * args) -{ - u32 i = va_arg (*args, u32); - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, i); - return format (s, "TenGigabitEthernet%U", - format_vlib_pci_handle, &xd->pci_device.bus_address); -} - -#define IXGE_COUNTER_IS_64_BIT (1 << 0) -#define IXGE_COUNTER_NOT_CLEAR_ON_READ (1 << 1) - -static u8 ixge_counter_flags[] = { -#define _(a,f) 0, -#define _64(a,f) IXGE_COUNTER_IS_64_BIT, - foreach_ixge_counter -#undef _ -#undef _64 -}; - -static void -ixge_update_counters (ixge_device_t * xd) -{ - /* Byte offset for counter registers. */ - static u32 reg_offsets[] = { -#define _(a,f) (a) / sizeof (u32), -#define _64(a,f) _(a,f) - foreach_ixge_counter -#undef _ -#undef _64 - }; - volatile u32 *r = (volatile u32 *) xd->regs; - int i; - - for (i = 0; i < ARRAY_LEN (xd->counters); i++) - { - u32 o = reg_offsets[i]; - xd->counters[i] += r[o]; - if (ixge_counter_flags[i] & IXGE_COUNTER_NOT_CLEAR_ON_READ) - r[o] = 0; - if (ixge_counter_flags[i] & IXGE_COUNTER_IS_64_BIT) - xd->counters[i] += (u64) r[o + 1] << (u64) 32; - } -} - -static u8 * -format_ixge_device_id (u8 * s, va_list * args) -{ - u32 device_id = va_arg (*args, u32); - char *t = 0; - switch (device_id) - { -#define _(f,n) case n: t = #f; break; - foreach_ixge_pci_device_id; -#undef _ - default: - t = 0; - break; - } - if (t == 0) - s = format (s, "unknown 0x%x", device_id); - else - s = format (s, "%s", t); - return s; -} - -static u8 * -format_ixge_link_status (u8 * s, va_list * args) -{ - ixge_device_t *xd = va_arg (*args, ixge_device_t *); - u32 v = xd->link_status_at_last_link_change; - - s = format (s, "%s", (v & (1 << 30)) ? "up" : "down"); - - { - char *modes[] = { - "1g", "10g parallel", "10g serial", "autoneg", - }; - char *speeds[] = { - "unknown", "100m", "1g", "10g", - }; - s = format (s, ", mode %s, speed %s", - modes[(v >> 26) & 3], speeds[(v >> 28) & 3]); - } - - return s; -} - -static u8 * -format_ixge_device (u8 * s, va_list * args) -{ - u32 dev_instance = va_arg (*args, u32); - CLIB_UNUSED (int verbose) = va_arg (*args, int); - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, dev_instance); - ixge_phy_t *phy = xd->phys + xd->phy_index; - uword indent = format_get_indent (s); - - ixge_update_counters (xd); - xd->link_status_at_last_link_change = xd->regs->xge_mac.link_status; - - s = format (s, "Intel 8259X: id %U\n%Ulink %U", - format_ixge_device_id, xd->device_id, - format_white_space, indent + 2, format_ixge_link_status, xd); - - { - - s = format (s, "\n%UPCIe %U", format_white_space, indent + 2, - format_vlib_pci_link_speed, &xd->pci_device); - } - - s = format (s, "\n%U", format_white_space, indent + 2); - if (phy->mdio_address != ~0) - s = format (s, "PHY address %d, id 0x%x", phy->mdio_address, phy->id); - else if (xd->sfp_eeprom.id == SFP_ID_sfp) - s = format (s, "SFP %U", format_sfp_eeprom, &xd->sfp_eeprom); - else - s = format (s, "PHY not found"); - - /* FIXME */ - { - ixge_dma_queue_t *dq = vec_elt_at_index (xd->dma_queues[VLIB_RX], 0); - ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0); - u32 hw_head_index = dr->head_index; - u32 sw_head_index = dq->head_index; - u32 nitems; - - nitems = ixge_ring_sub (dq, hw_head_index, sw_head_index); - s = format (s, "\n%U%d unprocessed, %d total buffers on rx queue 0 ring", - format_white_space, indent + 2, nitems, dq->n_descriptors); - - s = format (s, "\n%U%d buffers in driver rx cache", - format_white_space, indent + 2, - vec_len (xm->rx_buffers_to_add)); - - s = format (s, "\n%U%d buffers on tx queue 0 ring", - format_white_space, indent + 2, - xd->dma_queues[VLIB_TX][0].tx.n_buffers_on_ring); - } - { - u32 i; - u64 v; - static char *names[] = { -#define _(a,f) #f, -#define _64(a,f) _(a,f) - foreach_ixge_counter -#undef _ -#undef _64 - }; - - for (i = 0; i < ARRAY_LEN (names); i++) - { - v = xd->counters[i] - xd->counters_last_clear[i]; - if (v != 0) - s = format (s, "\n%U%-40U%16Ld", - format_white_space, indent + 2, - format_c_identifier, names[i], v); - } - } - - return s; -} - -static void -ixge_clear_hw_interface_counters (u32 instance) -{ - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, instance); - ixge_update_counters (xd); - memcpy (xd->counters_last_clear, xd->counters, sizeof (xd->counters)); -} - -/* - * Dynamically redirect all pkts from a specific interface - * to the specified node - */ -static void -ixge_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, - u32 node_index) -{ - ixge_main_t *xm = &ixge_main; - vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); - ixge_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance); - - /* Shut off redirection */ - if (node_index == ~0) - { - xd->per_interface_next_index = node_index; - return; - } - - xd->per_interface_next_index = - vlib_node_add_next (xm->vlib_main, ixge_input_node.index, node_index); -} - - -/* *INDENT-OFF* */ -VNET_DEVICE_CLASS (ixge_device_class) = { - .name = "ixge", - .tx_function = ixge_interface_tx, - .format_device_name = format_ixge_device_name, - .format_device = format_ixge_device, - .format_tx_trace = format_ixge_tx_dma_trace, - .clear_counters = ixge_clear_hw_interface_counters, - .admin_up_down_function = ixge_interface_admin_up_down, - .rx_redirect_to_node = ixge_set_interface_next_node, - .flatten_output_chains = 1, -}; -/* *INDENT-ON* */ - -#define IXGE_N_BYTES_IN_RX_BUFFER (2048) // DAW-HACK: Set Rx buffer size so all packets < ETH_MTU_SIZE fit in the buffer (i.e. sop & eop for all descriptors). - -static clib_error_t * -ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) -{ - ixge_main_t *xm = &ixge_main; - vlib_main_t *vm = xm->vlib_main; - ixge_dma_queue_t *dq; - clib_error_t *error = 0; - - vec_validate (xd->dma_queues[rt], queue_index); - dq = vec_elt_at_index (xd->dma_queues[rt], queue_index); - - if (!xm->n_descriptors_per_cache_line) - xm->n_descriptors_per_cache_line = - CLIB_CACHE_LINE_BYTES / sizeof (dq->descriptors[0]); - - if (!xm->n_bytes_in_rx_buffer) - xm->n_bytes_in_rx_buffer = IXGE_N_BYTES_IN_RX_BUFFER; - xm->n_bytes_in_rx_buffer = round_pow2 (xm->n_bytes_in_rx_buffer, 1024); - if (!xm->vlib_buffer_free_list_index) - { - xm->vlib_buffer_free_list_index = - vlib_buffer_get_or_create_free_list (vm, xm->n_bytes_in_rx_buffer, - "ixge rx"); - ASSERT (xm->vlib_buffer_free_list_index != 0); - } - - if (!xm->n_descriptors[rt]) - xm->n_descriptors[rt] = 4 * VLIB_FRAME_SIZE; - - dq->queue_index = queue_index; - dq->n_descriptors = - round_pow2 (xm->n_descriptors[rt], xm->n_descriptors_per_cache_line); - dq->head_index = dq->tail_index = 0; - - dq->descriptors = vlib_physmem_alloc_aligned (vm, &error, - dq->n_descriptors * - sizeof (dq->descriptors[0]), - 128 /* per chip spec */ ); - if (error) - return error; - - memset (dq->descriptors, 0, - dq->n_descriptors * sizeof (dq->descriptors[0])); - vec_resize (dq->descriptor_buffer_indices, dq->n_descriptors); - - if (rt == VLIB_RX) - { - u32 n_alloc, i; - - n_alloc = vlib_buffer_alloc_from_free_list - (vm, dq->descriptor_buffer_indices, - vec_len (dq->descriptor_buffer_indices), - xm->vlib_buffer_free_list_index); - ASSERT (n_alloc == vec_len (dq->descriptor_buffer_indices)); - for (i = 0; i < n_alloc; i++) - { - vlib_buffer_t *b = - vlib_get_buffer (vm, dq->descriptor_buffer_indices[i]); - dq->descriptors[i].rx_to_hw.tail_address = - vlib_physmem_virtual_to_physical (vm, b->data); - } - } - else - { - u32 i; - - dq->tx.head_index_write_back = - vlib_physmem_alloc (vm, &error, CLIB_CACHE_LINE_BYTES); - - for (i = 0; i < dq->n_descriptors; i++) - dq->descriptors[i].tx = xm->tx_descriptor_template; - - vec_validate (xm->tx_buffers_pending_free, dq->n_descriptors - 1); - } - - { - ixge_dma_regs_t *dr = get_dma_regs (xd, rt, queue_index); - u64 a; - - a = vlib_physmem_virtual_to_physical (vm, dq->descriptors); - dr->descriptor_address[0] = a & 0xFFFFFFFF; - dr->descriptor_address[1] = a >> (u64) 32; - dr->n_descriptor_bytes = dq->n_descriptors * sizeof (dq->descriptors[0]); - dq->head_index = dq->tail_index = 0; - - if (rt == VLIB_RX) - { - ASSERT ((xm->n_bytes_in_rx_buffer / 1024) < 32); - dr->rx_split_control = - ( /* buffer size */ ((xm->n_bytes_in_rx_buffer / 1024) << 0) - | ( /* lo free descriptor threshold (units of 64 descriptors) */ - (1 << 22)) | ( /* descriptor type: advanced one buffer */ - (1 << 25)) | ( /* drop if no descriptors available */ - (1 << 28))); - - /* Give hardware all but last 16 cache lines' worth of descriptors. */ - dq->tail_index = dq->n_descriptors - - 16 * xm->n_descriptors_per_cache_line; - } - else - { - /* Make sure its initialized before hardware can get to it. */ - dq->tx.head_index_write_back[0] = dq->head_index; - - a = - vlib_physmem_virtual_to_physical (vm, dq->tx.head_index_write_back); - dr->tx.head_index_write_back_address[0] = /* enable bit */ 1 | a; - dr->tx.head_index_write_back_address[1] = (u64) a >> (u64) 32; - } - - /* DMA on 82599 does not work with [13] rx data write relaxed ordering - and [12] undocumented set. */ - if (rt == VLIB_RX) - dr->dca_control &= ~((1 << 13) | (1 << 12)); - - CLIB_MEMORY_BARRIER (); - - if (rt == VLIB_TX) - { - xd->regs->tx_dma_control |= (1 << 0); - dr->control |= ((32 << 0) /* prefetch threshold */ - | (64 << 8) /* host threshold */ - | (0 << 16) /* writeback threshold */ ); - } - - /* Enable this queue and wait for hardware to initialize - before adding to tail. */ - if (rt == VLIB_TX) - { - dr->control |= 1 << 25; - while (!(dr->control & (1 << 25))) - ; - } - - /* Set head/tail indices and enable DMA. */ - dr->head_index = dq->head_index; - dr->tail_index = dq->tail_index; - } - - return error; -} - -static u32 -ixge_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags) -{ - ixge_device_t *xd; - ixge_regs_t *r; - u32 old; - ixge_main_t *xm = &ixge_main; - - xd = vec_elt_at_index (xm->devices, hw->dev_instance); - r = xd->regs; - - old = r->filter_control; - - if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) - r->filter_control = old | (1 << 9) /* unicast promiscuous */ ; - else - r->filter_control = old & ~(1 << 9); - - return old; -} - -static void -ixge_device_init (ixge_main_t * xm) -{ - vnet_main_t *vnm = vnet_get_main (); - ixge_device_t *xd; - - /* Reset chip(s). */ - vec_foreach (xd, xm->devices) - { - ixge_regs_t *r = xd->regs; - const u32 reset_bit = (1 << 26) | (1 << 3); - - r->control |= reset_bit; - - /* No need to suspend. Timed to take ~1e-6 secs */ - while (r->control & reset_bit) - ; - - /* Software loaded. */ - r->extended_control |= (1 << 28); - - ixge_phy_init (xd); - - /* Register ethernet interface. */ - { - u8 addr8[6]; - u32 i, addr32[2]; - clib_error_t *error; - - addr32[0] = r->rx_ethernet_address0[0][0]; - addr32[1] = r->rx_ethernet_address0[0][1]; - for (i = 0; i < 6; i++) - addr8[i] = addr32[i / 4] >> ((i % 4) * 8); - - error = ethernet_register_interface - (vnm, ixge_device_class.index, xd->device_index, - /* ethernet address */ addr8, - &xd->vlib_hw_if_index, ixge_flag_change); - if (error) - clib_error_report (error); - } - - { - vnet_sw_interface_t *sw = - vnet_get_hw_sw_interface (vnm, xd->vlib_hw_if_index); - xd->vlib_sw_if_index = sw->sw_if_index; - } - - ixge_dma_init (xd, VLIB_RX, /* queue_index */ 0); - - xm->n_descriptors[VLIB_TX] = 20 * VLIB_FRAME_SIZE; - - ixge_dma_init (xd, VLIB_TX, /* queue_index */ 0); - - /* RX/TX queue 0 gets mapped to interrupt bits 0 & 8. */ - r->interrupt.queue_mapping[0] = (( /* valid bit */ (1 << 7) | - ixge_rx_queue_to_interrupt (0)) << 0); - - r->interrupt.queue_mapping[0] |= (( /* valid bit */ (1 << 7) | - ixge_tx_queue_to_interrupt (0)) << 8); - - /* No use in getting too many interrupts. - Limit them to one every 3/4 ring size at line rate - min sized packets. - No need for this since kernel/vlib main loop provides adequate interrupt - limiting scheme. */ - if (0) - { - f64 line_rate_max_pps = - 10e9 / (8 * (64 + /* interframe padding */ 20)); - ixge_throttle_queue_interrupt (r, 0, - .75 * xm->n_descriptors[VLIB_RX] / - line_rate_max_pps); - } - - /* Accept all multicast and broadcast packets. Should really add them - to the dst_ethernet_address register array. */ - r->filter_control |= (1 << 10) | (1 << 8); - - /* Enable frames up to size in mac frame size register. */ - r->xge_mac.control |= 1 << 2; - r->xge_mac.rx_max_frame_size = (9216 + 14) << 16; - - /* Enable all interrupts. */ - if (!IXGE_ALWAYS_POLL) - r->interrupt.enable_write_1_to_set = ~0; - } -} - -static uword -ixge_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) -{ - vnet_main_t *vnm = vnet_get_main (); - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd; - uword event_type, *event_data = 0; - f64 timeout, link_debounce_deadline; - - ixge_device_init (xm); - - /* Clear all counters. */ - vec_foreach (xd, xm->devices) - { - ixge_update_counters (xd); - memset (xd->counters, 0, sizeof (xd->counters)); - } - - timeout = 30.0; - link_debounce_deadline = 1e70; - - while (1) - { - /* 36 bit stat counters could overflow in ~50 secs. - We poll every 30 secs to be conservative. */ - vlib_process_wait_for_event_or_clock (vm, timeout); - - event_type = vlib_process_get_events (vm, &event_data); - - switch (event_type) - { - case EVENT_SET_FLAGS: - /* 1 ms */ - link_debounce_deadline = vlib_time_now (vm) + 1e-3; - timeout = 1e-3; - break; - - case ~0: - /* No events found: timer expired. */ - if (vlib_time_now (vm) > link_debounce_deadline) - { - vec_foreach (xd, xm->devices) - { - ixge_regs_t *r = xd->regs; - u32 v = r->xge_mac.link_status; - uword is_up = (v & (1 << 30)) != 0; - - vnet_hw_interface_set_flags - (vnm, xd->vlib_hw_if_index, - is_up ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0); - } - link_debounce_deadline = 1e70; - timeout = 30.0; - } - break; - - default: - ASSERT (0); - } - - if (event_data) - _vec_len (event_data) = 0; - - /* Query stats every 30 secs. */ - { - f64 now = vlib_time_now (vm); - if (now - xm->time_last_stats_update > 30) - { - xm->time_last_stats_update = now; - vec_foreach (xd, xm->devices) ixge_update_counters (xd); - } - } - } - - return 0; -} - -static vlib_node_registration_t ixge_process_node = { - .function = ixge_process, - .type = VLIB_NODE_TYPE_PROCESS, - .name = "ixge-process", -}; - -clib_error_t * -ixge_init (vlib_main_t * vm) -{ - ixge_main_t *xm = &ixge_main; - clib_error_t *error; - - xm->vlib_main = vm; - memset (&xm->tx_descriptor_template, 0, - sizeof (xm->tx_descriptor_template)); - memset (&xm->tx_descriptor_template_mask, 0, - sizeof (xm->tx_descriptor_template_mask)); - xm->tx_descriptor_template.status0 = - (IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED | - IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED | - IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS); - xm->tx_descriptor_template_mask.status0 = 0xffff; - xm->tx_descriptor_template_mask.status1 = 0x00003fff; - - xm->tx_descriptor_template_mask.status0 &= - ~(IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET - | IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS); - xm->tx_descriptor_template_mask.status1 &= - ~(IXGE_TX_DESCRIPTOR_STATUS1_DONE); - - error = vlib_call_init_function (vm, pci_bus_init); - - return error; -} - -VLIB_INIT_FUNCTION (ixge_init); - - -static void -ixge_pci_intr_handler (vlib_pci_device_t * dev) -{ - ixge_main_t *xm = &ixge_main; - vlib_main_t *vm = xm->vlib_main; - - vlib_node_set_interrupt_pending (vm, ixge_input_node.index); - - /* Let node know which device is interrupting. */ - { - vlib_node_runtime_t *rt = - vlib_node_get_runtime (vm, ixge_input_node.index); - rt->runtime_data[0] |= 1 << dev->private_data; - } -} - -static clib_error_t * -ixge_pci_init (vlib_main_t * vm, vlib_pci_device_t * dev) -{ - ixge_main_t *xm = &ixge_main; - clib_error_t *error; - void *r; - ixge_device_t *xd; - - /* Device found: make sure we have dma memory. */ - if (unix_physmem_is_fake (vm)) - return clib_error_return (0, "no physical memory available"); - - error = vlib_pci_map_resource (dev, 0, &r); - if (error) - return error; - - vec_add2 (xm->devices, xd, 1); - - if (vec_len (xm->devices) == 1) - { - ixge_input_node.function = ixge_input_multiarch_select (); - } - - xd->pci_device = dev[0]; - xd->device_id = xd->pci_device.config0.header.device_id; - xd->regs = r; - xd->device_index = xd - xm->devices; - xd->pci_function = dev->bus_address.function; - xd->per_interface_next_index = ~0; - - - /* Chip found so enable node. */ - { - vlib_node_set_state (vm, ixge_input_node.index, - (IXGE_ALWAYS_POLL - ? VLIB_NODE_STATE_POLLING - : VLIB_NODE_STATE_INTERRUPT)); - - dev->private_data = xd->device_index; - } - - if (vec_len (xm->devices) == 1) - { - vlib_register_node (vm, &ixge_process_node); - xm->process_node_index = ixge_process_node.index; - } - - error = vlib_pci_bus_master_enable (dev); - - if (error) - return error; - - return vlib_pci_intr_enable (dev); -} - -/* *INDENT-OFF* */ -PCI_REGISTER_DEVICE (ixge_pci_device_registration,static) = { - .init_function = ixge_pci_init, - .interrupt_handler = ixge_pci_intr_handler, - .supported_devices = { -#define _(t,i) { .vendor_id = PCI_VENDOR_ID_INTEL, .device_id = i, }, - foreach_ixge_pci_device_id -#undef _ - { 0 }, - }, -}; -/* *INDENT-ON* */ - -void -ixge_set_next_node (ixge_rx_next_t next, char *name) -{ - vlib_node_registration_t *r = &ixge_input_node; - - switch (next) - { - case IXGE_RX_NEXT_IP4_INPUT: - case IXGE_RX_NEXT_IP6_INPUT: - case IXGE_RX_NEXT_ETHERNET_INPUT: - r->next_nodes[next] = name; - break; - - default: - clib_warning ("%s: illegal next %d\n", __FUNCTION__, next); - break; - } -} -#endif - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/devices/nic/ixge.h b/src/vnet/devices/nic/ixge.h deleted file mode 100644 index a8e652dc..00000000 --- a/src/vnet/devices/nic/ixge.h +++ /dev/null @@ -1,1293 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef included_ixge_h -#define included_ixge_h - -#include -#include -#include -#include -#include -#include - -typedef volatile struct -{ - /* [31:7] 128 byte aligned. */ - u32 descriptor_address[2]; - u32 n_descriptor_bytes; - - /* [5] rx/tx descriptor dca enable - [6] rx packet head dca enable - [7] rx packet tail dca enable - [9] rx/tx descriptor relaxed order - [11] rx/tx descriptor write back relaxed order - [13] rx/tx data write/read relaxed order - [15] rx head data write relaxed order - [31:24] apic id for cpu's cache. */ - u32 dca_control; - - u32 head_index; - - /* [4:0] tail buffer size (in 1k byte units) - [13:8] head buffer size (in 64 byte units) - [24:22] lo free descriptors threshold (units of 64 descriptors) - [27:25] descriptor type 0 = legacy, 1 = advanced one buffer (e.g. tail), - 2 = advanced header splitting (head + tail), 5 = advanced header - splitting (head only). - [28] drop if no descriptors available. */ - u32 rx_split_control; - - u32 tail_index; - CLIB_PAD_FROM_TO (0x1c, 0x28); - - /* [7:0] rx/tx prefetch threshold - [15:8] rx/tx host threshold - [24:16] rx/tx write back threshold - [25] rx/tx enable - [26] tx descriptor writeback flush - [30] rx strip vlan enable */ - u32 control; - - u32 rx_coallesce_control; - - union - { - struct - { - /* packets bytes lo hi */ - u32 stats[3]; - - u32 unused; - } rx; - - struct - { - u32 unused[2]; - - /* [0] enables head write back. */ - u32 head_index_write_back_address[2]; - } tx; - }; -} ixge_dma_regs_t; - -/* Only advanced descriptors are supported. */ -typedef struct -{ - u64 tail_address; - u64 head_address; -} ixge_rx_to_hw_descriptor_t; - -typedef struct -{ - u32 status[3]; - u16 n_packet_bytes_this_descriptor; - u16 vlan_tag; -} ixge_rx_from_hw_descriptor_t; - -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2 (1 << (4 + 11)) -/* Valid if not layer2. */ -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4 (1 << (4 + 0)) -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT (1 << (4 + 1)) -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6 (1 << (4 + 2)) -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT (1 << (4 + 3)) -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP (1 << (4 + 4)) -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP (1 << (4 + 5)) -#define IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET(s) (((s) >> 21) & 0x3ff) - -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE (1 << (0 + 0)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET (1 << (0 + 1)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN (1 << (0 + 3)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED (1 << (0 + 4)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED (1 << (0 + 5)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED (1 << (0 + 6)) -#define IXGE_RX_DESCRIPTOR_STATUS2_NOT_UNICAST (1 << (0 + 7)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_DOUBLE_VLAN (1 << (0 + 9)) -#define IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR (1 << (0 + 10)) -#define IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR (1 << (20 + 9)) -#define IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR (1 << (20 + 10)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR (1 << (20 + 11)) - -/* For layer2 packets stats0 bottom 3 bits give ether type index from filter. */ -#define IXGE_RX_DESCRIPTOR_STATUS0_LAYER2_ETHERNET_TYPE(s) ((s) & 7) - -typedef struct -{ - u64 buffer_address; - u16 n_bytes_this_buffer; - u16 status0; - u32 status1; -#define IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED (3 << 4) -#define IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED (1 << (8 + 5)) -#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS (8 + 3) -#define IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS) -#define IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS (1 << (8 + 1)) -#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET (8 + 0) -#define IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET) -#define IXGE_TX_DESCRIPTOR_STATUS1_DONE (1 << 0) -#define IXGE_TX_DESCRIPTOR_STATUS1_CONTEXT(i) (/* valid */ (1 << 7) | ((i) << 4)) -#define IXGE_TX_DESCRIPTOR_STATUS1_IPSEC_OFFLOAD (1 << (8 + 2)) -#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_TCP_UDP_CHECKSUM (1 << (8 + 1)) -#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_IP4_CHECKSUM (1 << (8 + 0)) -#define IXGE_TX_DESCRIPTOR_STATUS0_N_BYTES_THIS_BUFFER(l) ((l) << 0) -#define IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET(l) ((l) << 14) -} ixge_tx_descriptor_t; - -typedef struct -{ - struct - { - u8 checksum_start_offset; - u8 checksum_insert_offset; - u16 checksum_end_offset; - } ip, tcp; - u32 status0; - - u8 status1; - - /* Byte offset after UDP/TCP header. */ - u8 payload_offset; - - u16 max_tcp_segment_size; -} __attribute__ ((packed)) ixge_tx_context_descriptor_t; - -typedef union -{ - ixge_rx_to_hw_descriptor_t rx_to_hw; - ixge_rx_from_hw_descriptor_t rx_from_hw; - ixge_tx_descriptor_t tx; - u32x4 as_u32x4; -} ixge_descriptor_t; - -typedef volatile struct -{ - /* [2] pcie master disable - [3] mac reset - [26] global device reset */ - u32 control; - u32 control_alias; - /* [3:2] device id (0 or 1 for dual port chips) - [7] link is up - [17:10] num vfs - [18] io active - [19] pcie master enable status */ - u32 status_read_only; - CLIB_PAD_FROM_TO (0xc, 0x18); - /* [14] pf reset done - [17] relaxed ordering disable - [26] extended vlan enable - [28] driver loaded */ - u32 extended_control; - CLIB_PAD_FROM_TO (0x1c, 0x20); - - /* software definable pins. - sdp_data [7:0] - sdp_is_output [15:8] - sdp_is_native [23:16] - sdp_function [31:24]. - */ - u32 sdp_control; - CLIB_PAD_FROM_TO (0x24, 0x28); - - /* [0] i2c clock in - [1] i2c clock out - [2] i2c data in - [3] i2c data out */ - u32 i2c_control; - CLIB_PAD_FROM_TO (0x2c, 0x4c); - u32 tcp_timer; - - CLIB_PAD_FROM_TO (0x50, 0x200); - - u32 led_control; - - CLIB_PAD_FROM_TO (0x204, 0x600); - u32 core_spare; - CLIB_PAD_FROM_TO (0x604, 0x700); - - struct - { - u32 vflr_events_clear[4]; - u32 mailbox_interrupt_status[4]; - u32 mailbox_interrupt_enable[4]; - CLIB_PAD_FROM_TO (0x730, 0x800); - } pf_foo; - - struct - { - u32 status_write_1_to_clear; - CLIB_PAD_FROM_TO (0x804, 0x808); - u32 status_write_1_to_set; - CLIB_PAD_FROM_TO (0x80c, 0x810); - u32 status_auto_clear_enable; - CLIB_PAD_FROM_TO (0x814, 0x820); - - /* [11:3] minimum inter-interrupt interval - (2e-6 units; 20e-6 units for fast ethernet). - [15] low-latency interrupt moderation enable - [20:16] low-latency interrupt credit - [27:21] interval counter - [31] write disable for credit and counter (write only). */ - u32 throttle0[24]; - - u32 enable_write_1_to_set; - CLIB_PAD_FROM_TO (0x884, 0x888); - u32 enable_write_1_to_clear; - CLIB_PAD_FROM_TO (0x88c, 0x890); - u32 enable_auto_clear; - u32 msi_to_eitr_select; - /* [3:0] spd 0-3 interrupt detection enable - [4] msi-x enable - [5] other clear disable (makes other bits in status not clear on read) - etc. */ - u32 control; - CLIB_PAD_FROM_TO (0x89c, 0x900); - - /* Defines interrupt mapping for 128 rx + 128 tx queues. - 64 x 4 8 bit entries. - For register [i]: - [5:0] bit in interrupt status for rx queue 2*i + 0 - [7] valid bit - [13:8] bit for tx queue 2*i + 0 - [15] valid bit - similar for rx 2*i + 1 and tx 2*i + 1. */ - u32 queue_mapping[64]; - - /* tcp timer [7:0] and other interrupts [15:8] */ - u32 misc_mapping; - CLIB_PAD_FROM_TO (0xa04, 0xa90); - - /* 64 interrupts determined by mappings. */ - u32 status1_write_1_to_clear[4]; - u32 enable1_write_1_to_set[4]; - u32 enable1_write_1_to_clear[4]; - CLIB_PAD_FROM_TO (0xac0, 0xad0); - u32 status1_enable_auto_clear[4]; - CLIB_PAD_FROM_TO (0xae0, 0x1000); - } interrupt; - - ixge_dma_regs_t rx_dma0[64]; - - CLIB_PAD_FROM_TO (0x2000, 0x2140); - u32 dcb_rx_packet_plane_t4_config[8]; - u32 dcb_rx_packet_plane_t4_status[8]; - CLIB_PAD_FROM_TO (0x2180, 0x2300); - - /* reg i defines mapping for 4 rx queues starting at 4*i + 0. */ - u32 rx_queue_stats_mapping[32]; - u32 rx_queue_stats_control; - - CLIB_PAD_FROM_TO (0x2384, 0x2410); - u32 fc_user_descriptor_ptr[2]; - u32 fc_buffer_control; - CLIB_PAD_FROM_TO (0x241c, 0x2420); - u32 fc_rx_dma; - CLIB_PAD_FROM_TO (0x2424, 0x2430); - u32 dcb_packet_plane_control; - CLIB_PAD_FROM_TO (0x2434, 0x2f00); - - u32 rx_dma_control; - u32 pf_queue_drop_enable; - CLIB_PAD_FROM_TO (0x2f08, 0x2f20); - u32 rx_dma_descriptor_cache_config; - CLIB_PAD_FROM_TO (0x2f24, 0x3000); - - /* 1 bit. */ - u32 rx_enable; - CLIB_PAD_FROM_TO (0x3004, 0x3008); - /* [15:0] ether type (little endian) - [31:16] opcode (big endian) */ - u32 flow_control_control; - CLIB_PAD_FROM_TO (0x300c, 0x3020); - /* 3 bit traffic class for each of 8 priorities. */ - u32 rx_priority_to_traffic_class; - CLIB_PAD_FROM_TO (0x3024, 0x3028); - u32 rx_coallesce_data_buffer_control; - CLIB_PAD_FROM_TO (0x302c, 0x3190); - u32 rx_packet_buffer_flush_detect; - CLIB_PAD_FROM_TO (0x3194, 0x3200); - u32 flow_control_tx_timers[4]; /* 2 timer values */ - CLIB_PAD_FROM_TO (0x3210, 0x3220); - u32 flow_control_rx_threshold_lo[8]; - CLIB_PAD_FROM_TO (0x3240, 0x3260); - u32 flow_control_rx_threshold_hi[8]; - CLIB_PAD_FROM_TO (0x3280, 0x32a0); - u32 flow_control_refresh_threshold; - CLIB_PAD_FROM_TO (0x32a4, 0x3c00); - /* For each of 8 traffic classes (units of bytes). */ - u32 rx_packet_buffer_size[8]; - CLIB_PAD_FROM_TO (0x3c20, 0x3d00); - u32 flow_control_config; - CLIB_PAD_FROM_TO (0x3d04, 0x4200); - - struct - { - u32 pcs_config; - CLIB_PAD_FROM_TO (0x4204, 0x4208); - u32 link_control; - u32 link_status; - u32 pcs_debug[2]; - u32 auto_negotiation; - u32 link_partner_ability; - u32 auto_negotiation_tx_next_page; - u32 auto_negotiation_link_partner_next_page; - CLIB_PAD_FROM_TO (0x4228, 0x4240); - } gige_mac; - - struct - { - /* [0] tx crc enable - [2] enable frames up to max frame size register [31:16] - [10] pad frames < 64 bytes if specified by user - [15] loopback enable - [16] mdc hi speed - [17] turn off mdc between mdio packets */ - u32 control; - - /* [5] rx symbol error (all bits clear on read) - [6] rx illegal symbol - [7] rx idle error - [8] rx local fault - [9] rx remote fault */ - u32 status; - - u32 pause_and_pace_control; - CLIB_PAD_FROM_TO (0x424c, 0x425c); - u32 phy_command; - u32 phy_data; - CLIB_PAD_FROM_TO (0x4264, 0x4268); - - /* [31:16] max frame size in bytes. */ - u32 rx_max_frame_size; - CLIB_PAD_FROM_TO (0x426c, 0x4288); - - /* [0] - [2] pcs receive link up? (latch lo) - [7] local fault - [1] - [0] pcs 10g base r capable - [1] pcs 10g base x capable - [2] pcs 10g base w capable - [10] rx local fault - [11] tx local fault - [15:14] 2 => device present at this address (else not present) */ - u32 xgxs_status[2]; - - u32 base_x_pcs_status; - - /* [0] pass unrecognized flow control frames - [1] discard pause frames - [2] rx priority flow control enable (only in dcb mode) - [3] rx flow control enable. */ - u32 flow_control; - - /* [3:0] tx lanes change polarity - [7:4] rx lanes change polarity - [11:8] swizzle tx lanes - [15:12] swizzle rx lanes - 4 x 2 bit tx lane swap - 4 x 2 bit rx lane swap. */ - u32 serdes_control; - - u32 fifo_control; - - /* [0] force link up - [1] autoneg ack2 bit to transmit - [6:2] autoneg selector field to transmit - [8:7] 10g pma/pmd type 0 => xaui, 1 kx4, 2 cx4 - [9] 1g pma/pmd type 0 => sfi, 1 => kx/bx - [10] disable 10g on without main power - [11] restart autoneg on transition to dx power state - [12] restart autoneg - [15:13] link mode: - 0 => 1g no autoneg - 1 => 10g kx4 parallel link no autoneg - 2 => 1g bx autoneg - 3 => 10g sfi serdes - 4 => kx4/kx/kr - 5 => xgmii 1g/100m - 6 => kx4/kx/kr 1g an - 7 kx4/kx/kr sgmii. - [16] kr support - [17] fec requested - [18] fec ability - etc. */ - u32 auto_negotiation_control; - - /* [0] signal detect 1g/100m - [1] fec signal detect - [2] 10g serial pcs fec block lock - [3] 10g serial high error rate - [4] 10g serial pcs block lock - [5] kx/kx4/kr autoneg next page received - [6] kx/kx4/kr backplane autoneg next page received - [7] link status clear to read - [11:8] 10g signal detect (4 lanes) (for serial just lane 0) - [12] 10g serial signal detect - [16:13] 10g parallel lane sync status - [17] 10g parallel align status - [18] 1g sync status - [19] kx/kx4/kr backplane autoneg is idle - [20] 1g autoneg enabled - [21] 1g pcs enabled for sgmii - [22] 10g xgxs enabled - [23] 10g serial fec enabled (forward error detection) - [24] 10g kr pcs enabled - [25] sgmii enabled - [27:26] mac link mode - 0 => 1g - 1 => 10g parallel - 2 => 10g serial - 3 => autoneg - [29:28] link speed - 1 => 100m - 2 => 1g - 3 => 10g - [30] link is up - [31] kx/kx4/kr backplane autoneg completed successfully. */ - u32 link_status; - - /* [17:16] pma/pmd for 10g serial - 0 => kr, 2 => sfi - [18] disable dme pages */ - u32 auto_negotiation_control2; - - CLIB_PAD_FROM_TO (0x42ac, 0x42b0); - u32 link_partner_ability[2]; - CLIB_PAD_FROM_TO (0x42b8, 0x42d0); - u32 manageability_control; - u32 link_partner_next_page[2]; - CLIB_PAD_FROM_TO (0x42dc, 0x42e0); - u32 kr_pcs_control; - u32 kr_pcs_status; - u32 fec_status[2]; - CLIB_PAD_FROM_TO (0x42f0, 0x4314); - u32 sgmii_control; - CLIB_PAD_FROM_TO (0x4318, 0x4324); - u32 link_status2; - CLIB_PAD_FROM_TO (0x4328, 0x4900); - } xge_mac; - - u32 tx_dcb_control; - u32 tx_dcb_descriptor_plane_queue_select; - u32 tx_dcb_descriptor_plane_t1_config; - u32 tx_dcb_descriptor_plane_t1_status; - CLIB_PAD_FROM_TO (0x4910, 0x4950); - - /* For each TC in units of 1k bytes. */ - u32 tx_packet_buffer_thresholds[8]; - CLIB_PAD_FROM_TO (0x4970, 0x4980); - struct - { - u32 mmw; - u32 config; - u32 status; - u32 rate_drift; - } dcb_tx_rate_scheduler; - CLIB_PAD_FROM_TO (0x4990, 0x4a80); - u32 tx_dma_control; - CLIB_PAD_FROM_TO (0x4a84, 0x4a88); - u32 tx_dma_tcp_flags_control[2]; - CLIB_PAD_FROM_TO (0x4a90, 0x4b00); - u32 pf_mailbox[64]; - CLIB_PAD_FROM_TO (0x4c00, 0x5000); - - /* RX */ - u32 checksum_control; - CLIB_PAD_FROM_TO (0x5004, 0x5008); - u32 rx_filter_control; - CLIB_PAD_FROM_TO (0x500c, 0x5010); - u32 management_vlan_tag[8]; - u32 management_udp_tcp_ports[8]; - CLIB_PAD_FROM_TO (0x5050, 0x5078); - /* little endian. */ - u32 extended_vlan_ether_type; - CLIB_PAD_FROM_TO (0x507c, 0x5080); - /* [1] store/dma bad packets - [8] accept all multicast - [9] accept all unicast - [10] accept all broadcast. */ - u32 filter_control; - CLIB_PAD_FROM_TO (0x5084, 0x5088); - /* [15:0] vlan ethernet type (0x8100) little endian - [28] cfi bit expected - [29] drop packets with unexpected cfi bit - [30] vlan filter enable. */ - u32 vlan_control; - CLIB_PAD_FROM_TO (0x508c, 0x5090); - /* [1:0] hi bit of ethernet address for 12 bit index into multicast table - 0 => 47, 1 => 46, 2 => 45, 3 => 43. - [2] enable multicast filter - */ - u32 multicast_control; - CLIB_PAD_FROM_TO (0x5094, 0x5100); - u32 fcoe_rx_control; - CLIB_PAD_FROM_TO (0x5104, 0x5108); - u32 fc_flt_context; - CLIB_PAD_FROM_TO (0x510c, 0x5110); - u32 fc_filter_control; - CLIB_PAD_FROM_TO (0x5114, 0x5120); - u32 rx_message_type_lo; - CLIB_PAD_FROM_TO (0x5124, 0x5128); - /* [15:0] ethernet type (little endian) - [18:16] matche pri in vlan tag - [19] priority match enable - [25:20] virtualization pool - [26] pool enable - [27] is fcoe - [30] ieee 1588 timestamp enable - [31] filter enable. - (See ethernet_type_queue_select.) */ - u32 ethernet_type_queue_filter[8]; - CLIB_PAD_FROM_TO (0x5148, 0x5160); - /* [7:0] l2 ethernet type and - [15:8] l2 ethernet type or */ - u32 management_decision_filters1[8]; - u32 vf_vm_tx_switch_loopback_enable[2]; - u32 rx_time_sync_control; - CLIB_PAD_FROM_TO (0x518c, 0x5190); - u32 management_ethernet_type_filters[4]; - u32 rx_timestamp_attributes_lo; - u32 rx_timestamp_hi; - u32 rx_timestamp_attributes_hi; - CLIB_PAD_FROM_TO (0x51ac, 0x51b0); - u32 pf_virtual_control; - CLIB_PAD_FROM_TO (0x51b4, 0x51d8); - u32 fc_offset_parameter; - CLIB_PAD_FROM_TO (0x51dc, 0x51e0); - u32 vf_rx_enable[2]; - u32 rx_timestamp_lo; - CLIB_PAD_FROM_TO (0x51ec, 0x5200); - /* 12 bits determined by multicast_control - lookup bits in this vector. */ - u32 multicast_enable[128]; - - /* [0] ethernet address [31:0] - [1] [15:0] ethernet address [47:32] - [31] valid bit. - Index 0 is read from eeprom after reset. */ - u32 rx_ethernet_address0[16][2]; - - CLIB_PAD_FROM_TO (0x5480, 0x5800); - u32 wake_up_control; - CLIB_PAD_FROM_TO (0x5804, 0x5808); - u32 wake_up_filter_control; - CLIB_PAD_FROM_TO (0x580c, 0x5818); - u32 multiple_rx_queue_command_82598; - CLIB_PAD_FROM_TO (0x581c, 0x5820); - u32 management_control; - u32 management_filter_control; - CLIB_PAD_FROM_TO (0x5828, 0x5838); - u32 wake_up_ip4_address_valid; - CLIB_PAD_FROM_TO (0x583c, 0x5840); - u32 wake_up_ip4_address_table[4]; - u32 management_control_to_host; - CLIB_PAD_FROM_TO (0x5854, 0x5880); - u32 wake_up_ip6_address_table[4]; - - /* unicast_and broadcast_and vlan_and ip_address_and - etc. */ - u32 management_decision_filters[8]; - - u32 management_ip4_or_ip6_address_filters[4][4]; - CLIB_PAD_FROM_TO (0x58f0, 0x5900); - u32 wake_up_packet_length; - CLIB_PAD_FROM_TO (0x5904, 0x5910); - u32 management_ethernet_address_filters[4][2]; - CLIB_PAD_FROM_TO (0x5930, 0x5a00); - u32 wake_up_packet_memory[32]; - CLIB_PAD_FROM_TO (0x5a80, 0x5c00); - u32 redirection_table_82598[32]; - u32 rss_random_keys_82598[10]; - CLIB_PAD_FROM_TO (0x5ca8, 0x6000); - - ixge_dma_regs_t tx_dma[128]; - - u32 pf_vm_vlan_insert[64]; - u32 tx_dma_tcp_max_alloc_size_requests; - CLIB_PAD_FROM_TO (0x8104, 0x8110); - u32 vf_tx_enable[2]; - CLIB_PAD_FROM_TO (0x8118, 0x8120); - /* [0] dcb mode enable - [1] virtualization mode enable - [3:2] number of tcs/qs per pool. */ - u32 multiple_tx_queues_command; - CLIB_PAD_FROM_TO (0x8124, 0x8200); - u32 pf_vf_anti_spoof[8]; - u32 pf_dma_tx_switch_control; - CLIB_PAD_FROM_TO (0x8224, 0x82e0); - u32 tx_strict_low_latency_queues[4]; - CLIB_PAD_FROM_TO (0x82f0, 0x8600); - u32 tx_queue_stats_mapping_82599[32]; - u32 tx_queue_packet_counts[32]; - u32 tx_queue_byte_counts[32][2]; - - struct - { - u32 control; - u32 status; - u32 buffer_almost_full; - CLIB_PAD_FROM_TO (0x880c, 0x8810); - u32 buffer_min_ifg; - CLIB_PAD_FROM_TO (0x8814, 0x8900); - } tx_security; - - struct - { - u32 index; - u32 salt; - u32 key[4]; - CLIB_PAD_FROM_TO (0x8918, 0x8a00); - } tx_ipsec; - - struct - { - u32 capabilities; - u32 control; - u32 tx_sci[2]; - u32 sa; - u32 sa_pn[2]; - u32 key[2][4]; - /* untagged packets, encrypted packets, protected packets, - encrypted bytes, protected bytes */ - u32 stats[5]; - CLIB_PAD_FROM_TO (0x8a50, 0x8c00); - } tx_link_security; - - struct - { - u32 control; - u32 timestamp_value[2]; - u32 system_time[2]; - u32 increment_attributes; - u32 time_adjustment_offset[2]; - u32 aux_control; - u32 target_time[2][2]; - CLIB_PAD_FROM_TO (0x8c34, 0x8c3c); - u32 aux_time_stamp[2][2]; - CLIB_PAD_FROM_TO (0x8c4c, 0x8d00); - } tx_timesync; - - struct - { - u32 control; - u32 status; - CLIB_PAD_FROM_TO (0x8d08, 0x8e00); - } rx_security; - - struct - { - u32 index; - u32 ip_address[4]; - u32 spi; - u32 ip_index; - u32 key[4]; - u32 salt; - u32 mode; - CLIB_PAD_FROM_TO (0x8e34, 0x8f00); - } rx_ipsec; - - struct - { - u32 capabilities; - u32 control; - u32 sci[2]; - u32 sa[2]; - u32 sa_pn[2]; - u32 key[2][4]; - /* see datasheet */ - u32 stats[17]; - CLIB_PAD_FROM_TO (0x8f84, 0x9000); - } rx_link_security; - - /* 4 wake up, 2 management, 2 wake up. */ - u32 flexible_filters[8][16][4]; - CLIB_PAD_FROM_TO (0x9800, 0xa000); - - /* 4096 bits. */ - u32 vlan_filter[128]; - - /* [0] ethernet address [31:0] - [1] [15:0] ethernet address [47:32] - [31] valid bit. - Index 0 is read from eeprom after reset. */ - u32 rx_ethernet_address1[128][2]; - - /* select one of 64 pools for each rx address. */ - u32 rx_ethernet_address_pool_select[128][2]; - CLIB_PAD_FROM_TO (0xaa00, 0xc800); - u32 tx_priority_to_traffic_class; - CLIB_PAD_FROM_TO (0xc804, 0xcc00); - - /* In bytes units of 1k. Total packet buffer is 160k. */ - u32 tx_packet_buffer_size[8]; - - CLIB_PAD_FROM_TO (0xcc20, 0xcd10); - u32 tx_manageability_tc_mapping; - CLIB_PAD_FROM_TO (0xcd14, 0xcd20); - u32 dcb_tx_packet_plane_t2_config[8]; - u32 dcb_tx_packet_plane_t2_status[8]; - CLIB_PAD_FROM_TO (0xcd60, 0xce00); - - u32 tx_flow_control_status; - CLIB_PAD_FROM_TO (0xce04, 0xd000); - - ixge_dma_regs_t rx_dma1[64]; - - struct - { - /* Bigendian ip4 src/dst address. */ - u32 src_address[128]; - u32 dst_address[128]; - - /* TCP/UDP ports [15:0] src [31:16] dst; bigendian. */ - u32 tcp_udp_port[128]; - - /* [1:0] protocol tcp, udp, sctp, other - [4:2] match priority (highest wins) - [13:8] pool - [25] src address match disable - [26] dst address match disable - [27] src port match disable - [28] dst port match disable - [29] protocol match disable - [30] pool match disable - [31] enable. */ - u32 control[128]; - - /* [12] size bypass - [19:13] must be 0x80 - [20] low-latency interrupt - [27:21] rx queue. */ - u32 interrupt[128]; - } ip4_filters; - - CLIB_PAD_FROM_TO (0xea00, 0xeb00); - /* 4 bit rss output index indexed by 7 bit hash. - 128 8 bit fields = 32 registers. */ - u32 redirection_table_82599[32]; - - u32 rss_random_key_82599[10]; - CLIB_PAD_FROM_TO (0xeba8, 0xec00); - /* [15:0] reserved - [22:16] rx queue index - [29] low-latency interrupt on match - [31] enable */ - u32 ethernet_type_queue_select[8]; - CLIB_PAD_FROM_TO (0xec20, 0xec30); - u32 syn_packet_queue_filter; - CLIB_PAD_FROM_TO (0xec34, 0xec60); - u32 immediate_interrupt_rx_vlan_priority; - CLIB_PAD_FROM_TO (0xec64, 0xec70); - u32 rss_queues_per_traffic_class; - CLIB_PAD_FROM_TO (0xec74, 0xec90); - u32 lli_size_threshold; - CLIB_PAD_FROM_TO (0xec94, 0xed00); - - struct - { - u32 control; - CLIB_PAD_FROM_TO (0xed04, 0xed10); - u32 table[8]; - CLIB_PAD_FROM_TO (0xed30, 0xee00); - } fcoe_redirection; - - struct - { - /* [1:0] packet buffer allocation 0 => disabled, else 64k*2^(f-1) - [3] packet buffer initialization done - [4] perfetch match mode - [5] report status in rss field of rx descriptors - [7] report status always - [14:8] drop queue - [20:16] flex 2 byte packet offset (units of 2 bytes) - [27:24] max linked list length - [31:28] full threshold. */ - u32 control; - CLIB_PAD_FROM_TO (0xee04, 0xee0c); - - u32 data[8]; - - /* [1:0] 0 => no action, 1 => add, 2 => remove, 3 => query. - [2] valid filter found by query command - [3] filter update override - [4] ip6 adress table - [6:5] l4 protocol reserved, udp, tcp, sctp - [7] is ip6 - [8] clear head/tail - [9] packet drop action - [10] matched packet generates low-latency interrupt - [11] last in linked list - [12] collision - [15] rx queue enable - [22:16] rx queue - [29:24] pool. */ - u32 command; - - CLIB_PAD_FROM_TO (0xee30, 0xee3c); - /* ip4 dst/src address, tcp ports, udp ports. - set bits mean bit is ignored. */ - u32 ip4_masks[4]; - u32 filter_length; - u32 usage_stats; - u32 failed_usage_stats; - u32 filters_match_stats; - u32 filters_miss_stats; - CLIB_PAD_FROM_TO (0xee60, 0xee68); - /* Lookup, signature. */ - u32 hash_keys[2]; - /* [15:0] ip6 src address 1 bit per byte - [31:16] ip6 dst address. */ - u32 ip6_mask; - /* [0] vlan id - [1] vlan priority - [2] pool - [3] ip protocol - [4] flex - [5] dst ip6. */ - u32 other_mask; - CLIB_PAD_FROM_TO (0xee78, 0xf000); - } flow_director; - - struct - { - u32 l2_control[64]; - u32 vlan_pool_filter[64]; - u32 vlan_pool_filter_bitmap[128]; - u32 dst_ethernet_address[128]; - u32 mirror_rule[4]; - u32 mirror_rule_vlan[8]; - u32 mirror_rule_pool[8]; - CLIB_PAD_FROM_TO (0xf650, 0x10010); - } pf_bar; - - u32 eeprom_flash_control; - /* [0] start - [1] done - [15:2] address - [31:16] read data. */ - u32 eeprom_read; - CLIB_PAD_FROM_TO (0x10018, 0x1001c); - u32 flash_access; - CLIB_PAD_FROM_TO (0x10020, 0x10114); - u32 flash_data; - u32 flash_control; - u32 flash_read_data; - CLIB_PAD_FROM_TO (0x10120, 0x1013c); - u32 flash_opcode; - u32 software_semaphore; - CLIB_PAD_FROM_TO (0x10144, 0x10148); - u32 firmware_semaphore; - CLIB_PAD_FROM_TO (0x1014c, 0x10160); - u32 software_firmware_sync; - CLIB_PAD_FROM_TO (0x10164, 0x10200); - u32 general_rx_control; - CLIB_PAD_FROM_TO (0x10204, 0x11000); - - struct - { - u32 control; - CLIB_PAD_FROM_TO (0x11004, 0x11010); - /* [3:0] enable counters - [7:4] leaky bucket counter mode - [29] reset - [30] stop - [31] start. */ - u32 counter_control; - /* [7:0],[15:8],[23:16],[31:24] event for counters 0-3. - event codes: - 0x0 bad tlp - 0x10 reqs that reached timeout - etc. */ - u32 counter_event; - CLIB_PAD_FROM_TO (0x11018, 0x11020); - u32 counters_clear_on_read[4]; - u32 counter_config[4]; - struct - { - u32 address; - u32 data; - } indirect_access; - CLIB_PAD_FROM_TO (0x11048, 0x11050); - u32 extended_control; - CLIB_PAD_FROM_TO (0x11054, 0x11064); - u32 mirrored_revision_id; - CLIB_PAD_FROM_TO (0x11068, 0x11070); - u32 dca_requester_id_information; - - /* [0] global disable - [4:1] mode: 0 => legacy, 1 => dca 1.0. */ - u32 dca_control; - CLIB_PAD_FROM_TO (0x11078, 0x110b0); - /* [0] pci completion abort - [1] unsupported i/o address - [2] wrong byte enable - [3] pci timeout */ - u32 pcie_interrupt_status; - CLIB_PAD_FROM_TO (0x110b4, 0x110b8); - u32 pcie_interrupt_enable; - CLIB_PAD_FROM_TO (0x110bc, 0x110c0); - u32 msi_x_pba_clear[8]; - CLIB_PAD_FROM_TO (0x110e0, 0x12300); - } pcie; - - u32 interrupt_throttle1[128 - 24]; - CLIB_PAD_FROM_TO (0x124a0, 0x14f00); - - u32 core_analog_config; - CLIB_PAD_FROM_TO (0x14f04, 0x14f10); - u32 core_common_config; - CLIB_PAD_FROM_TO (0x14f14, 0x15f14); - - u32 link_sec_software_firmware_interface; -} ixge_regs_t; - -typedef union -{ - struct - { - /* Addresses bigendian. */ - union - { - struct - { - ip6_address_t src_address; - u32 unused[1]; - } ip6; - struct - { - u32 unused[3]; - ip4_address_t src_address, dst_address; - } ip4; - }; - - /* [15:0] src port (little endian). - [31:16] dst port. */ - u32 tcp_udp_ports; - - /* [15:0] vlan (cfi bit set to 0). - [31:16] flex bytes. bigendian. */ - u32 vlan_and_flex_word; - - /* [14:0] hash - [15] bucket valid - [31:16] signature (signature filers)/sw-index (perfect match). */ - u32 hash; - }; - - u32 as_u32[8]; -} ixge_flow_director_key_t; - -always_inline void -ixge_throttle_queue_interrupt (ixge_regs_t * r, - u32 queue_interrupt_index, - f64 inter_interrupt_interval_in_secs) -{ - volatile u32 *tr = - (queue_interrupt_index < ARRAY_LEN (r->interrupt.throttle0) - ? &r->interrupt.throttle0[queue_interrupt_index] - : &r->interrupt_throttle1[queue_interrupt_index]); - ASSERT (queue_interrupt_index < 128); - u32 v; - i32 i, mask = (1 << 9) - 1; - - i = flt_round_nearest (inter_interrupt_interval_in_secs / 2e-6); - i = i < 1 ? 1 : i; - i = i >= mask ? mask : i; - - v = tr[0]; - v &= ~(mask << 3); - v |= i << 3; - tr[0] = v; -} - -#define foreach_ixge_counter \ - _ (0x40d0, rx_total_packets) \ - _64 (0x40c0, rx_total_bytes) \ - _ (0x41b0, rx_good_packets_before_filtering) \ - _64 (0x41b4, rx_good_bytes_before_filtering) \ - _ (0x2f50, rx_dma_good_packets) \ - _64 (0x2f54, rx_dma_good_bytes) \ - _ (0x2f5c, rx_dma_duplicated_good_packets) \ - _64 (0x2f60, rx_dma_duplicated_good_bytes) \ - _ (0x2f68, rx_dma_good_loopback_packets) \ - _64 (0x2f6c, rx_dma_good_loopback_bytes) \ - _ (0x2f74, rx_dma_good_duplicated_loopback_packets) \ - _64 (0x2f78, rx_dma_good_duplicated_loopback_bytes) \ - _ (0x4074, rx_good_packets) \ - _64 (0x4088, rx_good_bytes) \ - _ (0x407c, rx_multicast_packets) \ - _ (0x4078, rx_broadcast_packets) \ - _ (0x405c, rx_64_byte_packets) \ - _ (0x4060, rx_65_127_byte_packets) \ - _ (0x4064, rx_128_255_byte_packets) \ - _ (0x4068, rx_256_511_byte_packets) \ - _ (0x406c, rx_512_1023_byte_packets) \ - _ (0x4070, rx_gt_1023_byte_packets) \ - _ (0x4000, rx_crc_errors) \ - _ (0x4120, rx_ip_checksum_errors) \ - _ (0x4004, rx_illegal_symbol_errors) \ - _ (0x4008, rx_error_symbol_errors) \ - _ (0x4034, rx_mac_local_faults) \ - _ (0x4038, rx_mac_remote_faults) \ - _ (0x4040, rx_length_errors) \ - _ (0x41a4, rx_xons) \ - _ (0x41a8, rx_xoffs) \ - _ (0x40a4, rx_undersize_packets) \ - _ (0x40a8, rx_fragments) \ - _ (0x40ac, rx_oversize_packets) \ - _ (0x40b0, rx_jabbers) \ - _ (0x40b4, rx_management_packets) \ - _ (0x40b8, rx_management_drops) \ - _ (0x3fa0, rx_missed_packets_pool_0) \ - _ (0x40d4, tx_total_packets) \ - _ (0x4080, tx_good_packets) \ - _64 (0x4090, tx_good_bytes) \ - _ (0x40f0, tx_multicast_packets) \ - _ (0x40f4, tx_broadcast_packets) \ - _ (0x87a0, tx_dma_good_packets) \ - _64 (0x87a4, tx_dma_good_bytes) \ - _ (0x40d8, tx_64_byte_packets) \ - _ (0x40dc, tx_65_127_byte_packets) \ - _ (0x40e0, tx_128_255_byte_packets) \ - _ (0x40e4, tx_256_511_byte_packets) \ - _ (0x40e8, tx_512_1023_byte_packets) \ - _ (0x40ec, tx_gt_1023_byte_packets) \ - _ (0x4010, tx_undersize_drops) \ - _ (0x8780, switch_security_violation_packets) \ - _ (0x5118, fc_crc_errors) \ - _ (0x241c, fc_rx_drops) \ - _ (0x2424, fc_last_error_count) \ - _ (0x2428, fcoe_rx_packets) \ - _ (0x242c, fcoe_rx_dwords) \ - _ (0x8784, fcoe_tx_packets) \ - _ (0x8788, fcoe_tx_dwords) \ - _ (0x1030, queue_0_rx_count) \ - _ (0x1430, queue_0_drop_count) \ - _ (0x1070, queue_1_rx_count) \ - _ (0x1470, queue_1_drop_count) \ - _ (0x10b0, queue_2_rx_count) \ - _ (0x14b0, queue_2_drop_count) \ - _ (0x10f0, queue_3_rx_count) \ - _ (0x14f0, queue_3_drop_count) \ - _ (0x1130, queue_4_rx_count) \ - _ (0x1530, queue_4_drop_count) \ - _ (0x1170, queue_5_rx_count) \ - _ (0x1570, queue_5_drop_count) \ - _ (0x11b0, queue_6_rx_count) \ - _ (0x15b0, queue_6_drop_count) \ - _ (0x11f0, queue_7_rx_count) \ - _ (0x15f0, queue_7_drop_count) \ - _ (0x1230, queue_8_rx_count) \ - _ (0x1630, queue_8_drop_count) \ - _ (0x1270, queue_9_rx_count) \ - _ (0x1270, queue_9_drop_count) - - - - -typedef enum -{ -#define _(a,f) IXGE_COUNTER_##f, -#define _64(a,f) _(a,f) - foreach_ixge_counter -#undef _ -#undef _64 - IXGE_N_COUNTER, -} ixge_counter_type_t; - -typedef struct -{ - u32 mdio_address; - - /* 32 bit ID read from ID registers. */ - u32 id; -} ixge_phy_t; - -typedef struct -{ - /* Cache aligned descriptors. */ - ixge_descriptor_t *descriptors; - - /* Number of descriptors in table. */ - u32 n_descriptors; - - /* Software head and tail pointers into descriptor ring. */ - u32 head_index, tail_index; - - /* Index into dma_queues vector. */ - u32 queue_index; - - /* Buffer indices corresponding to each active descriptor. */ - u32 *descriptor_buffer_indices; - - union - { - struct - { - u32 *volatile head_index_write_back; - - u32 n_buffers_on_ring; - } tx; - - struct - { - /* Buffer indices to use to replenish each descriptor. */ - u32 *replenish_buffer_indices; - - vlib_node_runtime_t *node; - u32 next_index; - - u32 saved_start_of_packet_buffer_index; - - u32 saved_start_of_packet_next_index; - u32 saved_last_buffer_index; - - u32 is_start_of_packet; - - u32 n_descriptors_done_total; - - u32 n_descriptors_done_this_call; - - u32 n_bytes; - } rx; - }; -} ixge_dma_queue_t; - -#define foreach_ixge_pci_device_id \ - _ (82598, 0x10b6) \ - _ (82598_bx, 0x1508) \ - _ (82598af_dual_port, 0x10c6) \ - _ (82598af_single_port, 0x10c7) \ - _ (82598at, 0x10c8) \ - _ (82598at2, 0x150b) \ - _ (82598eb_sfp_lom, 0x10db) \ - _ (82598eb_cx4, 0x10dd) \ - _ (82598_cx4_dual_port, 0x10ec) \ - _ (82598_da_dual_port, 0x10f1) \ - _ (82598_sr_dual_port_em, 0x10e1) \ - _ (82598eb_xf_lr, 0x10f4) \ - _ (82599_kx4, 0x10f7) \ - _ (82599_kx4_mezz, 0x1514) \ - _ (82599_kr, 0x1517) \ - _ (82599_combo_backplane, 0x10f8) \ - _ (82599_cx4, 0x10f9) \ - _ (82599_sfp, 0x10fb) \ - _ (82599_backplane_fcoe, 0x152a) \ - _ (82599_sfp_fcoe, 0x1529) \ - _ (82599_sfp_em, 0x1507) \ - _ (82599_xaui_lom, 0x10fc) \ - _ (82599_t3_lom, 0x151c) \ - _ (x540t, 0x1528) - -typedef enum -{ -#define _(f,n) IXGE_##f = n, - foreach_ixge_pci_device_id -#undef _ -} ixge_pci_device_id_t; - -typedef struct -{ - /* registers */ - ixge_regs_t *regs; - - /* Specific next index when using dynamic redirection */ - u32 per_interface_next_index; - - /* PCI bus info. */ - vlib_pci_device_t pci_device; - - /* From PCI config space header. */ - ixge_pci_device_id_t device_id; - - u16 device_index; - - /* 0 or 1. */ - u16 pci_function; - - /* VLIB interface for this instance. */ - u32 vlib_hw_if_index, vlib_sw_if_index; - - ixge_dma_queue_t *dma_queues[VLIB_N_RX_TX]; - - /* Phy index (0 or 1) and address on MDI bus. */ - u32 phy_index; - ixge_phy_t phys[2]; - - /* Value of link_status register at last link change. */ - u32 link_status_at_last_link_change; - - i2c_bus_t i2c_bus; - sfp_eeprom_t sfp_eeprom; - - /* Counters. */ - u64 counters[IXGE_N_COUNTER], counters_last_clear[IXGE_N_COUNTER]; -} ixge_device_t; - -typedef struct -{ - vlib_main_t *vlib_main; - - /* Vector of devices. */ - ixge_device_t *devices; - - /* Descriptor ring sizes. */ - u32 n_descriptors[VLIB_N_RX_TX]; - - /* RX buffer size. Must be at least 1k; will be rounded to - next largest 1k size. */ - u32 n_bytes_in_rx_buffer; - - u32 n_descriptors_per_cache_line; - - u32 vlib_buffer_free_list_index; - - u32 process_node_index; - - /* Template and mask for initializing/validating TX descriptors. */ - ixge_tx_descriptor_t tx_descriptor_template, tx_descriptor_template_mask; - - /* Vector of buffers for which TX is done and can be freed. */ - u32 *tx_buffers_pending_free; - - u32 *rx_buffers_to_add; - - f64 time_last_stats_update; -} ixge_main_t; - -ixge_main_t ixge_main; -vnet_device_class_t ixge_device_class; - -typedef enum -{ - IXGE_RX_NEXT_IP4_INPUT, - IXGE_RX_NEXT_IP6_INPUT, - IXGE_RX_NEXT_ETHERNET_INPUT, - IXGE_RX_NEXT_DROP, - IXGE_RX_N_NEXT, -} ixge_rx_next_t; - -void ixge_set_next_node (ixge_rx_next_t, char *); - -#endif /* included_ixge_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/devices/nic/sfp.c b/src/vnet/devices/nic/sfp.c deleted file mode 100644 index 9e9c008d..00000000 --- a/src/vnet/devices/nic/sfp.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -static u8 * -format_space_terminated (u8 * s, va_list * args) -{ - u32 l = va_arg (*args, u32); - u8 *v = va_arg (*args, u8 *); - u8 *p; - - for (p = v + l - 1; p >= v && p[0] == ' '; p--) - ; - vec_add (s, v, clib_min (p - v + 1, l)); - return s; -} - -static u8 * -format_sfp_id (u8 * s, va_list * args) -{ - u32 id = va_arg (*args, u32); - char *t = 0; - switch (id) - { -#define _(f) case SFP_ID_##f: t = #f; break; - foreach_sfp_id -#undef _ - default: - return format (s, "unknown 0x%x", id); - } - return format (s, "%s", t); -} - -static u8 * -format_sfp_compatibility (u8 * s, va_list * args) -{ - u32 c = va_arg (*args, u32); - char *t = 0; - switch (c) - { -#define _(a,b,f) case SFP_COMPATIBILITY_##f: t = #f; break; - foreach_sfp_compatibility -#undef _ - default: - return format (s, "unknown 0x%x", c); - } - return format (s, "%s", t); -} - -u32 -sfp_is_comatible (sfp_eeprom_t * e, sfp_compatibility_t c) -{ - static struct - { - u8 byte, bit; - } t[] = - { -#define _(a,b,f) { .byte = a, .bit = b, }, - foreach_sfp_compatibility -#undef _ - }; - - ASSERT (c < ARRAY_LEN (t)); - return (e->compatibility[t[c].byte] & (1 << t[c].bit)) != 0; -} - -u8 * -format_sfp_eeprom (u8 * s, va_list * args) -{ - sfp_eeprom_t *e = va_arg (*args, sfp_eeprom_t *); - uword indent = format_get_indent (s); - int i; - - if (e->id != SFP_ID_sfp) - s = format (s, "id %U, ", format_sfp_id, e->id); - - s = format (s, "compatibility:"); - for (i = 0; i < SFP_N_COMPATIBILITY; i++) - if (sfp_is_comatible (e, i)) - s = format (s, " %U", format_sfp_compatibility, i); - - s = format (s, "\n%Uvendor: %U, part %U", - format_white_space, indent, - format_space_terminated, sizeof (e->vendor_name), - e->vendor_name, format_space_terminated, - sizeof (e->vendor_part_number), e->vendor_part_number); - s = - format (s, "\n%Urevision: %U, serial: %U, date code: %U", - format_white_space, indent, format_space_terminated, - sizeof (e->vendor_revision), e->vendor_revision, - format_space_terminated, sizeof (e->vendor_serial_number), - e->vendor_serial_number, format_space_terminated, - sizeof (e->vendor_date_code), e->vendor_date_code); - - return s; -} - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/devices/nic/sfp.h b/src/vnet/devices/nic/sfp.h deleted file mode 100644 index a1ac7997..00000000 --- a/src/vnet/devices/nic/sfp.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef included_vnet_optics_sfp_h -#define included_vnet_optics_sfp_h - -#include - -#define foreach_sfp_id \ - _ (unknown) \ - _ (gbic) \ - _ (on_motherboard) \ - _ (sfp) - -typedef enum -{ -#define _(f) SFP_ID_##f, - foreach_sfp_id -#undef _ -} sfp_id_t; - -typedef struct -{ - u8 id; - u8 extended_id; - u8 connector_type; - u8 compatibility[8]; - u8 encoding; - u8 nominal_bit_rate_100mbits_per_sec; - u8 reserved13; - u8 link_length[5]; - u8 reserved19; - u8 vendor_name[16]; - u8 reserved36; - u8 vendor_oui[3]; - u8 vendor_part_number[16]; - u8 vendor_revision[4]; - /* 16 bit value network byte order. */ - u8 laser_wavelength_in_nm[2]; - u8 reserved62; - u8 checksum_0_to_62; - - u8 options[2]; - u8 max_bit_rate_margin_percent; - u8 min_bit_rate_margin_percent; - u8 vendor_serial_number[16]; - u8 vendor_date_code[8]; - u8 reserved92[3]; - u8 checksum_63_to_94; - u8 vendor_specific[32]; - u8 reserved128[384]; - - /* Vendor specific data follows. */ - u8 vendor_specific1[0]; -} sfp_eeprom_t; - -always_inline uword -sfp_eeprom_is_valid (sfp_eeprom_t * e) -{ - int i; - u8 sum = 0; - for (i = 0; i < 63; i++) - sum += ((u8 *) e)[i]; - return sum == e->checksum_0_to_62; -} - -/* _ (byte_index, bit_index, name) */ -#define foreach_sfp_compatibility \ - _ (0, 4, 10g_base_sr) \ - _ (0, 5, 10g_base_lr) \ - _ (1, 2, oc48_long_reach) \ - _ (1, 1, oc48_intermediate_reach) \ - _ (1, 0, oc48_short_reach) \ - _ (2, 6, oc12_long_reach) \ - _ (2, 5, oc12_intermediate_reach) \ - _ (2, 4, oc12_short_reach) \ - _ (2, 2, oc3_long_reach) \ - _ (2, 1, oc3_intermediate_reach) \ - _ (2, 0, oc3_short_reach) \ - _ (3, 3, 1g_base_t) \ - _ (3, 2, 1g_base_cx) \ - _ (3, 1, 1g_base_lx) \ - _ (3, 0, 1g_base_sx) - -typedef enum -{ -#define _(a,b,f) SFP_COMPATIBILITY_##f, - foreach_sfp_compatibility -#undef _ - SFP_N_COMPATIBILITY, -} sfp_compatibility_t; - -u32 sfp_is_comatible (sfp_eeprom_t * e, sfp_compatibility_t c); - -format_function_t format_sfp_eeprom; - -#endif /* included_vnet_optics_sfp_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ethernet/sfp.c b/src/vnet/ethernet/sfp.c new file mode 100644 index 00000000..624740e3 --- /dev/null +++ b/src/vnet/ethernet/sfp.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +static u8 * +format_space_terminated (u8 * s, va_list * args) +{ + u32 l = va_arg (*args, u32); + u8 *v = va_arg (*args, u8 *); + u8 *p; + + for (p = v + l - 1; p >= v && p[0] == ' '; p--) + ; + vec_add (s, v, clib_min (p - v + 1, l)); + return s; +} + +static u8 * +format_sfp_id (u8 * s, va_list * args) +{ + u32 id = va_arg (*args, u32); + char *t = 0; + switch (id) + { +#define _(f) case SFP_ID_##f: t = #f; break; + foreach_sfp_id +#undef _ + default: + return format (s, "unknown 0x%x", id); + } + return format (s, "%s", t); +} + +static u8 * +format_sfp_compatibility (u8 * s, va_list * args) +{ + u32 c = va_arg (*args, u32); + char *t = 0; + switch (c) + { +#define _(a,b,f) case SFP_COMPATIBILITY_##f: t = #f; break; + foreach_sfp_compatibility +#undef _ + default: + return format (s, "unknown 0x%x", c); + } + return format (s, "%s", t); +} + +u32 +sfp_is_comatible (sfp_eeprom_t * e, sfp_compatibility_t c) +{ + static struct + { + u8 byte, bit; + } t[] = + { +#define _(a,b,f) { .byte = a, .bit = b, }, + foreach_sfp_compatibility +#undef _ + }; + + ASSERT (c < ARRAY_LEN (t)); + return (e->compatibility[t[c].byte] & (1 << t[c].bit)) != 0; +} + +u8 * +format_sfp_eeprom (u8 * s, va_list * args) +{ + sfp_eeprom_t *e = va_arg (*args, sfp_eeprom_t *); + uword indent = format_get_indent (s); + int i; + + if (e->id != SFP_ID_sfp) + s = format (s, "id %U, ", format_sfp_id, e->id); + + s = format (s, "compatibility:"); + for (i = 0; i < SFP_N_COMPATIBILITY; i++) + if (sfp_is_comatible (e, i)) + s = format (s, " %U", format_sfp_compatibility, i); + + s = format (s, "\n%Uvendor: %U, part %U", + format_white_space, indent, + format_space_terminated, sizeof (e->vendor_name), + e->vendor_name, format_space_terminated, + sizeof (e->vendor_part_number), e->vendor_part_number); + s = + format (s, "\n%Urevision: %U, serial: %U, date code: %U", + format_white_space, indent, format_space_terminated, + sizeof (e->vendor_revision), e->vendor_revision, + format_space_terminated, sizeof (e->vendor_serial_number), + e->vendor_serial_number, format_space_terminated, + sizeof (e->vendor_date_code), e->vendor_date_code); + + return s; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/ethernet/sfp.h b/src/vnet/ethernet/sfp.h new file mode 100644 index 00000000..a1ac7997 --- /dev/null +++ b/src/vnet/ethernet/sfp.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_vnet_optics_sfp_h +#define included_vnet_optics_sfp_h + +#include + +#define foreach_sfp_id \ + _ (unknown) \ + _ (gbic) \ + _ (on_motherboard) \ + _ (sfp) + +typedef enum +{ +#define _(f) SFP_ID_##f, + foreach_sfp_id +#undef _ +} sfp_id_t; + +typedef struct +{ + u8 id; + u8 extended_id; + u8 connector_type; + u8 compatibility[8]; + u8 encoding; + u8 nominal_bit_rate_100mbits_per_sec; + u8 reserved13; + u8 link_length[5]; + u8 reserved19; + u8 vendor_name[16]; + u8 reserved36; + u8 vendor_oui[3]; + u8 vendor_part_number[16]; + u8 vendor_revision[4]; + /* 16 bit value network byte order. */ + u8 laser_wavelength_in_nm[2]; + u8 reserved62; + u8 checksum_0_to_62; + + u8 options[2]; + u8 max_bit_rate_margin_percent; + u8 min_bit_rate_margin_percent; + u8 vendor_serial_number[16]; + u8 vendor_date_code[8]; + u8 reserved92[3]; + u8 checksum_63_to_94; + u8 vendor_specific[32]; + u8 reserved128[384]; + + /* Vendor specific data follows. */ + u8 vendor_specific1[0]; +} sfp_eeprom_t; + +always_inline uword +sfp_eeprom_is_valid (sfp_eeprom_t * e) +{ + int i; + u8 sum = 0; + for (i = 0; i < 63; i++) + sum += ((u8 *) e)[i]; + return sum == e->checksum_0_to_62; +} + +/* _ (byte_index, bit_index, name) */ +#define foreach_sfp_compatibility \ + _ (0, 4, 10g_base_sr) \ + _ (0, 5, 10g_base_lr) \ + _ (1, 2, oc48_long_reach) \ + _ (1, 1, oc48_intermediate_reach) \ + _ (1, 0, oc48_short_reach) \ + _ (2, 6, oc12_long_reach) \ + _ (2, 5, oc12_intermediate_reach) \ + _ (2, 4, oc12_short_reach) \ + _ (2, 2, oc3_long_reach) \ + _ (2, 1, oc3_intermediate_reach) \ + _ (2, 0, oc3_short_reach) \ + _ (3, 3, 1g_base_t) \ + _ (3, 2, 1g_base_cx) \ + _ (3, 1, 1g_base_lx) \ + _ (3, 0, 1g_base_sx) + +typedef enum +{ +#define _(a,b,f) SFP_COMPATIBILITY_##f, + foreach_sfp_compatibility +#undef _ + SFP_N_COMPATIBILITY, +} sfp_compatibility_t; + +u32 sfp_is_comatible (sfp_eeprom_t * e, sfp_compatibility_t c); + +format_function_t format_sfp_eeprom; + +#endif /* included_vnet_optics_sfp_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vpp-api/lua/bench.lua b/src/vpp-api/lua/bench.lua index 8e5a0b4b..c7231b90 100644 --- a/src/vpp-api/lua/bench.lua +++ b/src/vpp-api/lua/bench.lua @@ -53,9 +53,9 @@ function do_bench() end root_dir = "/home/ubuntu/vpp" -pneum_path = root_dir .. "/build-root/install-vpp_lite_debug-native/vpp-api/lib64/libpneum.so" +pneum_path = root_dir .. "/build-root/install-vpp_debug-native/vpp-api/lib64/libpneum.so" vpp:init({ pneum_path = pneum_path }) -vpp:json_api(root_dir .. "/build-root/install-vpp_lite_debug-native/vpp/vpp-api/vpe.api.json") +vpp:json_api(root_dir .. "/build-root/install-vpp_debug-native/vpp/vpp-api/vpe.api.json") vpp:connect("lua-bench") local n_tests = 10 diff --git a/src/vpp-api/lua/examples/cli/lua-cli.lua b/src/vpp-api/lua/examples/cli/lua-cli.lua index b3a24d7d..4a27af53 100644 --- a/src/vpp-api/lua/examples/cli/lua-cli.lua +++ b/src/vpp-api/lua/examples/cli/lua-cli.lua @@ -557,12 +557,12 @@ end function init_vpp(vpp) local root_dir = "/home/ubuntu/vpp" - local pneum_path = root_dir .. "/build-root/install-vpp_lite_debug-native/vpp-api/lib64/libpneum.so" + local pneum_path = root_dir .. "/build-root/install-vpp_debug-native/vpp-api/lib64/libpneum.so" vpp:init({ pneum_path = pneum_path }) vpp:init({ pneum_path = pneum_path }) - vpp:json_api(root_dir .. "/build-root/install-vpp_lite_debug-native/vpp/vpp-api/vpe.api.json") + vpp:json_api(root_dir .. "/build-root/install-vpp_debug-native/vpp/vpp-api/vpe.api.json") diff --git a/src/vpp-api/lua/examples/example-classifier.lua b/src/vpp-api/lua/examples/example-classifier.lua index ec9c3d3e..b1270757 100644 --- a/src/vpp-api/lua/examples/example-classifier.lua +++ b/src/vpp-api/lua/examples/example-classifier.lua @@ -20,12 +20,12 @@ local vpp = require "vpp-lapi" local bit = require("bit") root_dir = "/home/ubuntu/vpp" -pneum_path = root_dir .. "/build-root/install-vpp_lite_debug-native/vpp-api/lib64/libpneum.so" +pneum_path = root_dir .. "/build-root/install-vpp_debug-native/vpp-api/lib64/libpneum.so" vpp:init({ pneum_path = pneum_path }) -vpp:json_api(root_dir .. "/build-root/install-vpp_lite_debug-native/vpp/vpp-api/vpe.api.json") +vpp:json_api(root_dir .. "/build-root/install-vpp_debug-native/vpp/vpp-api/vpe.api.json") vpp:connect("aytest") diff --git a/src/vpp-api/lua/examples/example-cli.lua b/src/vpp-api/lua/examples/example-cli.lua index 8b84989f..85425caf 100644 --- a/src/vpp-api/lua/examples/example-cli.lua +++ b/src/vpp-api/lua/examples/example-cli.lua @@ -18,11 +18,11 @@ vpp = require "vpp-lapi" root_dir = "/home/ubuntu/vpp" -pneum_path = root_dir .. "/build-root/install-vpp_lite_debug-native/vpp-api/lib64/libpneum.so" +pneum_path = root_dir .. "/build-root/install-vpp_debug-native/vpp-api/lib64/libpneum.so" vpp:init({ pneum_path = pneum_path }) -vpp:json_api(root_dir .. "/build-root/install-vpp_lite_debug-native/vpp/vpp-api/vpe.api.json") +vpp:json_api(root_dir .. "/build-root/install-vpp_debug-native/vpp/vpp-api/vpe.api.json") vpp:connect("aytest") diff --git a/src/vpp/vnet/main.c b/src/vpp/vnet/main.c index a566d956..d6a12325 100644 --- a/src/vpp/vnet/main.c +++ b/src/vpp/vnet/main.c @@ -199,9 +199,6 @@ defaulted: { vm->init_functions_called = hash_create (0, /* value bytes */ 0); vpe_main_init (vm); -#if DPDK == 0 - unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ ); -#endif return vlib_unix_main (argc, argv); } else diff --git a/test/framework.py b/test/framework.py index 6b1799a5..a0284e37 100644 --- a/test/framework.py +++ b/test/framework.py @@ -157,7 +157,9 @@ class VppTestCase(unittest.TestCase): cls.vpp_cmdline = [cls.vpp_bin, "unix", "{", "nodaemon", debug_cli, coredump_size, "}", "api-trace", "{", "on", "}", - "api-segment", "{", "prefix", cls.shm_prefix, "}"] + "api-segment", "{", "prefix", cls.shm_prefix, "}", + "plugins", "{", "plugin", "dpdk_plugin.so", "{", + "disable", "}", "}"] if cls.plugin_path is not None: cls.vpp_cmdline.extend(["plugin_path", cls.plugin_path]) cls.logger.info("vpp_cmdline: %s" % cls.vpp_cmdline) -- cgit 1.2.3-korg From 586afd762bfa149f5ca167bd5fd5a0cd59ce94fe Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 5 Apr 2017 19:18:20 +0200 Subject: Use thread local storage for thread index This patch deprecates stack-based thread identification, Also removes requirement that thread stacks are adjacent. Finally, possibly annoying for some folks, it renames all occurences of cpu_index and cpu_number with thread index. Using word "cpu" is misleading here as thread can be migrated ti different CPU, and also it is not related to linux cpu index. Change-Id: I68cdaf661e701d2336fc953dcb9978d10a70f7c1 Signed-off-by: Damjan Marion --- src/examples/srv6-sample-localsid/node.c | 4 +- src/plugins/dpdk/buffer.c | 2 +- src/plugins/dpdk/device/device.c | 8 +- src/plugins/dpdk/device/dpdk_priv.h | 8 +- src/plugins/dpdk/device/init.c | 2 +- src/plugins/dpdk/device/node.c | 32 +++--- src/plugins/dpdk/hqos/hqos.c | 16 +-- src/plugins/dpdk/ipsec/cli.c | 8 +- src/plugins/dpdk/ipsec/crypto_node.c | 4 +- src/plugins/dpdk/ipsec/esp.h | 4 +- src/plugins/dpdk/ipsec/esp_decrypt.c | 4 +- src/plugins/dpdk/ipsec/esp_encrypt.c | 5 +- src/plugins/dpdk/ipsec/ipsec.c | 2 +- src/plugins/dpdk/ipsec/ipsec.h | 4 +- src/plugins/dpdk/main.c | 2 +- src/plugins/flowperpkt/l2_node.c | 2 +- src/plugins/flowperpkt/node.c | 2 +- src/plugins/ioam/export-common/ioam_export.h | 6 +- .../ioam/ip6/ioam_cache_tunnel_select_node.c | 16 +-- src/plugins/ixge/ixge.c | 2 +- src/plugins/lb/lb.c | 8 +- src/plugins/lb/node.c | 22 ++-- src/plugins/lb/refcount.c | 8 +- src/plugins/lb/refcount.h | 4 +- src/plugins/memif/node.c | 35 +++--- src/plugins/snat/in2out.c | 110 +++++++++--------- src/plugins/snat/out2in.c | 102 ++++++++--------- src/plugins/snat/snat.h | 10 +- src/vlib/buffer.c | 6 +- src/vlib/buffer_funcs.h | 4 +- src/vlib/cli.c | 6 +- src/vlib/counter.h | 16 +-- src/vlib/error.c | 2 +- src/vlib/global_funcs.h | 2 +- src/vlib/main.c | 14 +-- src/vlib/main.h | 2 +- src/vlib/node.c | 2 +- src/vlib/node.h | 6 +- src/vlib/node_funcs.h | 8 +- src/vlib/threads.c | 69 ++++------- src/vlib/threads.h | 21 ++-- src/vlib/unix/cj.c | 7 +- src/vlib/unix/cj.h | 2 +- src/vlib/unix/main.c | 43 +++---- src/vnet/adj/adj_l2.c | 4 +- src/vnet/adj/adj_midchain.c | 8 +- src/vnet/adj/adj_nsh.c | 4 +- src/vnet/classify/vnet_classify.c | 16 +-- src/vnet/cop/ip4_whitelist.c | 8 +- src/vnet/cop/ip6_whitelist.c | 8 +- src/vnet/devices/af_packet/node.c | 20 ++-- src/vnet/devices/devices.c | 61 +++++----- src/vnet/devices/devices.h | 18 +-- src/vnet/devices/netmap/node.c | 24 ++-- src/vnet/devices/ssvm/node.c | 6 +- src/vnet/devices/virtio/vhost-user.c | 127 +++++++++++---------- src/vnet/dpo/lookup_dpo.c | 20 ++-- src/vnet/dpo/replicate_dpo.c | 12 +- src/vnet/ethernet/arp.c | 2 +- src/vnet/ethernet/interface.c | 7 +- src/vnet/ethernet/node.c | 14 +-- src/vnet/gre/node.c | 8 +- src/vnet/interface.h | 2 +- src/vnet/interface_output.c | 53 ++++----- src/vnet/ip/ip4_forward.c | 34 +++--- src/vnet/ip/ip4_input.c | 8 +- src/vnet/ip/ip6_forward.c | 24 ++-- src/vnet/ip/ip6_input.c | 8 +- src/vnet/ip/ip6_neighbor.c | 4 +- src/vnet/ipsec/esp.h | 8 +- src/vnet/ipsec/esp_decrypt.c | 13 ++- src/vnet/ipsec/esp_encrypt.c | 13 ++- src/vnet/ipsec/ikev2.c | 64 ++++++----- src/vnet/ipsec/ipsec.h | 12 +- src/vnet/ipsec/ipsec_if.c | 2 +- src/vnet/l2/l2_bvi.h | 2 +- src/vnet/l2/l2_input.c | 14 +-- src/vnet/l2/l2_output.c | 6 +- src/vnet/l2tp/decap.c | 2 +- src/vnet/l2tp/encap.c | 2 +- src/vnet/l2tp/l2tp.c | 6 +- src/vnet/lisp-gpe/decap.c | 16 +-- src/vnet/lldp/lldp_input.c | 2 +- src/vnet/map/ip4_map.c | 14 +-- src/vnet/map/ip4_map_t.c | 12 +- src/vnet/map/ip6_map.c | 19 +-- src/vnet/map/ip6_map_t.c | 12 +- src/vnet/mpls/mpls_input.c | 8 +- src/vnet/mpls/mpls_lookup.c | 20 ++-- src/vnet/mpls/mpls_output.c | 10 +- src/vnet/pg/input.c | 4 +- src/vnet/replication.c | 20 ++-- src/vnet/replication.h | 2 +- src/vnet/session/node.c | 2 +- src/vnet/sr/sr_localsid.c | 44 +++---- src/vnet/tcp/builtin_client.c | 2 +- src/vnet/tcp/tcp.c | 8 +- src/vnet/tcp/tcp_debug.h | 2 +- src/vnet/tcp/tcp_input.c | 10 +- src/vnet/tcp/tcp_output.c | 20 ++-- src/vnet/udp/udp_input.c | 2 +- src/vnet/unix/tapcli.c | 2 +- src/vnet/unix/tuntap.c | 4 +- src/vnet/vxlan-gpe/decap.c | 10 +- src/vnet/vxlan-gpe/encap.c | 12 +- src/vnet/vxlan/decap.c | 10 +- src/vnet/vxlan/encap.c | 12 +- src/vpp/stats/stats.c | 14 +-- src/vpp/stats/stats.h | 2 +- 109 files changed, 790 insertions(+), 791 deletions(-) (limited to 'src/vlib/buffer.c') diff --git a/src/examples/srv6-sample-localsid/node.c b/src/examples/srv6-sample-localsid/node.c index 7bae9cd7..e83e2352 100644 --- a/src/examples/srv6-sample-localsid/node.c +++ b/src/examples/srv6-sample-localsid/node.c @@ -114,7 +114,7 @@ srv6_localsid_sample_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_fram from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -168,7 +168,7 @@ srv6_localsid_sample_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_fram /* This increments the SRv6 per LocalSID counters.*/ vlib_increment_combined_counter (((next0 == SRV6_SAMPLE_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : &(sm->sr_ls_valid_counters)), - cpu_index, + thread_index, ls0 - sm->localsids, 1, vlib_buffer_length_in_chain (vm, b0)); diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index 2765c292..c80b3fa8 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -132,7 +132,7 @@ dpdk_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) u32 merge_index; int i; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); f = vlib_buffer_get_free_list (vm, free_list_index); diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 50b26689..91661246 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -243,7 +243,7 @@ static_always_inline ASSERT (ring->tx_tail == 0); n_retry = 16; - queue_id = vm->cpu_index; + queue_id = vm->thread_index; do { @@ -266,7 +266,7 @@ static_always_inline { /* no wrap, transmit in one burst */ dpdk_device_hqos_per_worker_thread_t *hqos = - &xd->hqos_wt[vm->cpu_index]; + &xd->hqos_wt[vm->thread_index]; ASSERT (hqos->swq != NULL); @@ -332,7 +332,7 @@ dpdk_buffer_recycle (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b, u32 bi, struct rte_mbuf **mbp) { dpdk_main_t *dm = &dpdk_main; - u32 my_cpu = vm->cpu_index; + u32 my_cpu = vm->thread_index; struct rte_mbuf *mb_new; if (PREDICT_FALSE (b->flags & VLIB_BUFFER_RECYCLE) == 0) @@ -376,7 +376,7 @@ dpdk_interface_tx (vlib_main_t * vm, tx_ring_hdr_t *ring; u32 n_on_ring; - my_cpu = vm->cpu_index; + my_cpu = vm->thread_index; queue_id = my_cpu; diff --git a/src/plugins/dpdk/device/dpdk_priv.h b/src/plugins/dpdk/device/dpdk_priv.h index dd40ff48..52b4ca4b 100644 --- a/src/plugins/dpdk/device/dpdk_priv.h +++ b/src/plugins/dpdk/device/dpdk_priv.h @@ -79,7 +79,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) { vlib_simple_counter_main_t *cm; vnet_main_t *vnm = vnet_get_main (); - u32 my_cpu = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u64 rxerrors, last_rxerrors; /* only update counters for PMD interfaces */ @@ -96,7 +96,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_RX_NO_BUF); - vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, xd->stats.rx_nombuf - xd->last_stats.rx_nombuf); } @@ -107,7 +107,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_RX_MISS); - vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, xd->stats.imissed - xd->last_stats.imissed); } @@ -119,7 +119,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_RX_ERROR); - vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, rxerrors - last_rxerrors); } diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 538db6cb..7eaf8da7 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -324,7 +324,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) int rv; int j; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) { diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index e740fd18..b10e0fad 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -283,7 +283,7 @@ dpdk_buffer_init_from_template (void *d0, void *d1, void *d2, void *d3, */ static_always_inline u32 dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, - vlib_node_runtime_t * node, u32 cpu_index, u16 queue_id, + vlib_node_runtime_t * node, u32 thread_index, u16 queue_id, int maybe_multiseg) { u32 n_buffers; @@ -294,7 +294,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, uword n_rx_bytes = 0; u32 n_trace, trace_cnt __attribute__ ((unused)); vlib_buffer_free_list_t *fl; - vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, cpu_index); + vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, thread_index); if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0) return 0; @@ -306,7 +306,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, return 0; } - vec_reset_length (xd->d_trace_buffers[cpu_index]); + vec_reset_length (xd->d_trace_buffers[thread_index]); trace_cnt = n_trace = vlib_get_trace_count (vm, node); if (n_trace > 0) @@ -318,7 +318,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, { struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index++]; vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb); - vec_add1 (xd->d_trace_buffers[cpu_index], + vec_add1 (xd->d_trace_buffers[thread_index], vlib_get_buffer_index (vm, b)); } } @@ -546,20 +546,22 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, vlib_put_next_frame (vm, node, next_index, n_left_to_next); } - if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[cpu_index]) > 0)) + if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[thread_index]) > 0)) { - dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers[cpu_index], - vec_len (xd->d_trace_buffers[cpu_index])); - vlib_set_trace_count (vm, node, n_trace - - vec_len (xd->d_trace_buffers[cpu_index])); + dpdk_rx_trace (dm, node, xd, queue_id, + xd->d_trace_buffers[thread_index], + vec_len (xd->d_trace_buffers[thread_index])); + vlib_set_trace_count (vm, node, + n_trace - + vec_len (xd->d_trace_buffers[thread_index])); } vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes); + thread_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, mb_index); + vnet_device_increment_rx_packets (thread_index, mb_index); return mb_index; } @@ -630,19 +632,19 @@ dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) dpdk_device_t *xd; uword n_rx_packets = 0; dpdk_device_and_queue_t *dq; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); /* * Poll all devices on this cpu for input/interrupts. */ /* *INDENT-OFF* */ - vec_foreach (dq, dm->devices_by_cpu[cpu_index]) + vec_foreach (dq, dm->devices_by_cpu[thread_index]) { xd = vec_elt_at_index(dm->devices, dq->device); if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG) - n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, /* maybe_multiseg */ 1); + n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 1); else - n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, /* maybe_multiseg */ 0); + n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 0); } /* *INDENT-ON* */ diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c index a288fca7..8b251beb 100644 --- a/src/plugins/dpdk/hqos/hqos.c +++ b/src/plugins/dpdk/hqos/hqos.c @@ -397,7 +397,7 @@ static_always_inline void dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm) { dpdk_main_t *dm = &dpdk_main; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; u32 dev_pos; dev_pos = 0; @@ -405,12 +405,12 @@ dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm) { vlib_worker_thread_barrier_check (); - u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]); + u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]); if (dev_pos >= n_devs) dev_pos = 0; dpdk_device_and_queue_t *dq = - vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos); + vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos); dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device); dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht; @@ -479,7 +479,7 @@ static_always_inline void dpdk_hqos_thread_internal (vlib_main_t * vm) { dpdk_main_t *dm = &dpdk_main; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; u32 dev_pos; dev_pos = 0; @@ -487,7 +487,7 @@ dpdk_hqos_thread_internal (vlib_main_t * vm) { vlib_worker_thread_barrier_check (); - u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]); + u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]); if (PREDICT_FALSE (n_devs == 0)) { dev_pos = 0; @@ -497,7 +497,7 @@ dpdk_hqos_thread_internal (vlib_main_t * vm) dev_pos = 0; dpdk_device_and_queue_t *dq = - vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos); + vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos); dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device); dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht; @@ -586,7 +586,7 @@ dpdk_hqos_thread (vlib_worker_thread_t * w) vm = vlib_get_main (); - ASSERT (vm->cpu_index == os_get_cpu_number ()); + ASSERT (vm->thread_index == vlib_get_thread_index ()); clib_time_init (&vm->clib_time); clib_mem_set_heap (w->thread_mheap); @@ -595,7 +595,7 @@ dpdk_hqos_thread (vlib_worker_thread_t * w) while (tm->worker_thread_release == 0) vlib_worker_thread_barrier_check (); - if (vec_len (dm->devices_by_hqos_cpu[vm->cpu_index]) == 0) + if (vec_len (dm->devices_by_hqos_cpu[vm->thread_index]) == 0) return clib_error ("current I/O TX thread does not have any devices assigned to it"); diff --git a/src/plugins/dpdk/ipsec/cli.c b/src/plugins/dpdk/ipsec/cli.c index cd0a6037..3ae8c9b8 100644 --- a/src/plugins/dpdk/ipsec/cli.c +++ b/src/plugins/dpdk/ipsec/cli.c @@ -42,8 +42,8 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display) for (i = 0; i < tm->n_vlib_mains; i++) { uword key, data; - u32 cpu_index = vlib_mains[i]->cpu_index; - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + u32 thread_index = vlib_mains[i]->thread_index; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; u8 *s = 0; if (skip_master) @@ -57,7 +57,7 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display) i32 last_cdev = -1; crypto_qp_data_t *qpd; - s = format (s, "%u\t", cpu_index); + s = format (s, "%u\t", thread_index); /* *INDENT-OFF* */ vec_foreach (qpd, cwm->qp_data) @@ -95,7 +95,7 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display) cap.sym.auth.algo = p_key->auth_algo; check_algo_is_supported (&cap, auth_str); vlib_cli_output (vm, "%u\t%10s\t%15s\t%3s\t%u\t%u\n", - vlib_mains[i]->cpu_index, cipher_str, auth_str, + vlib_mains[i]->thread_index, cipher_str, auth_str, p_key->is_outbound ? "out" : "in", cwm->qp_data[data].dev_id, cwm->qp_data[data].qp_id); diff --git a/src/plugins/dpdk/ipsec/crypto_node.c b/src/plugins/dpdk/ipsec/crypto_node.c index dc3452b2..a3c45902 100644 --- a/src/plugins/dpdk/ipsec/crypto_node.c +++ b/src/plugins/dpdk/ipsec/crypto_node.c @@ -171,9 +171,9 @@ static uword dpdk_crypto_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); dpdk_crypto_main_t *dcm = &dpdk_crypto_main; - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; crypto_qp_data_t *qpd; u32 n_deq = 0; diff --git a/src/plugins/dpdk/ipsec/esp.h b/src/plugins/dpdk/ipsec/esp.h index 320295b1..56f0c756 100644 --- a/src/plugins/dpdk/ipsec/esp.h +++ b/src/plugins/dpdk/ipsec/esp.h @@ -170,9 +170,9 @@ static_always_inline int create_sym_sess (ipsec_sa_t * sa, crypto_sa_session_t * sa_sess, u8 is_outbound) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); dpdk_crypto_main_t *dcm = &dpdk_crypto_main; - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; struct rte_crypto_sym_xform cipher_xform = { 0 }; struct rte_crypto_sym_xform auth_xform = { 0 }; struct rte_crypto_sym_xform *xfs; diff --git a/src/plugins/dpdk/ipsec/esp_decrypt.c b/src/plugins/dpdk/ipsec/esp_decrypt.c index 286e03f8..bab76e3b 100644 --- a/src/plugins/dpdk/ipsec/esp_decrypt.c +++ b/src/plugins/dpdk/ipsec/esp_decrypt.c @@ -88,7 +88,7 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm, { u32 n_left_from, *from, *to_next, next_index; ipsec_main_t *im = &ipsec_main; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); dpdk_crypto_main_t * dcm = &dpdk_crypto_main; dpdk_esp_main_t * em = &dpdk_esp_main; u32 i; @@ -104,7 +104,7 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm, return n_left_from; } - crypto_worker_main_t *cwm = vec_elt_at_index(dcm->workers_main, cpu_index); + crypto_worker_main_t *cwm = vec_elt_at_index(dcm->workers_main, thread_index); u32 n_qps = vec_len(cwm->qp_data); struct rte_crypto_op ** cops_to_enq[n_qps]; u32 n_cop_qp[n_qps], * bi_to_enq[n_qps]; diff --git a/src/plugins/dpdk/ipsec/esp_encrypt.c b/src/plugins/dpdk/ipsec/esp_encrypt.c index 5b03de73..f996d7df 100644 --- a/src/plugins/dpdk/ipsec/esp_encrypt.c +++ b/src/plugins/dpdk/ipsec/esp_encrypt.c @@ -93,7 +93,7 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, { u32 n_left_from, *from, *to_next, next_index; ipsec_main_t *im = &ipsec_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); dpdk_crypto_main_t *dcm = &dpdk_crypto_main; dpdk_esp_main_t *em = &dpdk_esp_main; u32 i; @@ -111,7 +111,8 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, return n_left_from; } - crypto_worker_main_t *cwm = vec_elt_at_index (dcm->workers_main, cpu_index); + crypto_worker_main_t *cwm = + vec_elt_at_index (dcm->workers_main, thread_index); u32 n_qps = vec_len (cwm->qp_data); struct rte_crypto_op **cops_to_enq[n_qps]; u32 n_cop_qp[n_qps], *bi_to_enq[n_qps]; diff --git a/src/plugins/dpdk/ipsec/ipsec.c b/src/plugins/dpdk/ipsec/ipsec.c index b0aaaaec..5d8f4fba 100644 --- a/src/plugins/dpdk/ipsec/ipsec.c +++ b/src/plugins/dpdk/ipsec/ipsec.c @@ -289,7 +289,7 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, if (!map) { clib_warning ("unable to create hash table for worker %u", - vlib_mains[i]->cpu_index); + vlib_mains[i]->thread_index); goto error; } cwm->algo_qp_map = map; diff --git a/src/plugins/dpdk/ipsec/ipsec.h b/src/plugins/dpdk/ipsec/ipsec.h index 28bffc80..f0f793c0 100644 --- a/src/plugins/dpdk/ipsec/ipsec.h +++ b/src/plugins/dpdk/ipsec/ipsec.h @@ -95,8 +95,8 @@ static_always_inline void crypto_alloc_cops () { dpdk_crypto_main_t *dcm = &dpdk_crypto_main; - u32 cpu_index = os_get_cpu_number (); - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + u32 thread_index = vlib_get_thread_index (); + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; unsigned socket_id = rte_socket_id (); crypto_qp_data_t *qpd; diff --git a/src/plugins/dpdk/main.c b/src/plugins/dpdk/main.c index 7ee2a785..942b8b2d 100644 --- a/src/plugins/dpdk/main.c +++ b/src/plugins/dpdk/main.c @@ -39,7 +39,7 @@ rte_delay_us_override (unsigned us) * thread then do not intercept. (Must not be called from an * independent pthread). */ - if (os_get_cpu_number () == 0) + if (vlib_get_thread_index () == 0) { /* * We're in the vlib main thread or a vlib process. Make sure diff --git a/src/plugins/flowperpkt/l2_node.c b/src/plugins/flowperpkt/l2_node.c index 1c2f681e..fdaf81d1 100644 --- a/src/plugins/flowperpkt/l2_node.c +++ b/src/plugins/flowperpkt/l2_node.c @@ -102,7 +102,7 @@ add_to_flow_record_l2 (vlib_main_t * vm, u8 * src_mac, u8 * dst_mac, u16 ethertype, u64 timestamp, u16 length, int do_flush) { - u32 my_cpu_number = vm->cpu_index; + u32 my_cpu_number = vm->thread_index; flow_report_main_t *frm = &flow_report_main; ip4_header_t *ip; udp_header_t *udp; diff --git a/src/plugins/flowperpkt/node.c b/src/plugins/flowperpkt/node.c index f77f087d..0277682d 100644 --- a/src/plugins/flowperpkt/node.c +++ b/src/plugins/flowperpkt/node.c @@ -101,7 +101,7 @@ add_to_flow_record_ipv4 (vlib_main_t * vm, u32 src_address, u32 dst_address, u8 tos, u64 timestamp, u16 length, int do_flush) { - u32 my_cpu_number = vm->cpu_index; + u32 my_cpu_number = vm->thread_index; flow_report_main_t *frm = &flow_report_main; ip4_header_t *ip; udp_header_t *udp; diff --git a/src/plugins/ioam/export-common/ioam_export.h b/src/plugins/ioam/export-common/ioam_export.h index 2bf3fd54..9de0d13b 100644 --- a/src/plugins/ioam/export-common/ioam_export.h +++ b/src/plugins/ioam/export-common/ioam_export.h @@ -477,8 +477,8 @@ do { \ from = vlib_frame_vector_args (F); \ n_left_from = (F)->n_vectors; \ next_index = (N)->cached_next_index; \ - while (__sync_lock_test_and_set ((EM)->lockp[(VM)->cpu_index], 1)); \ - my_buf = ioam_export_get_my_buffer (EM, (VM)->cpu_index); \ + while (__sync_lock_test_and_set ((EM)->lockp[(VM)->thread_index], 1)); \ + my_buf = ioam_export_get_my_buffer (EM, (VM)->thread_index); \ my_buf->touched_at = vlib_time_now (VM); \ while (n_left_from > 0) \ { \ @@ -620,7 +620,7 @@ do { \ } \ vlib_node_increment_counter (VM, export_node.index, \ EXPORT_ERROR_RECORDED, pkts_recorded); \ - *(EM)->lockp[(VM)->cpu_index] = 0; \ + *(EM)->lockp[(VM)->thread_index] = 0; \ } while(0) #endif /* __included_ioam_export_h__ */ diff --git a/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c b/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c index a56dc040..0cf742c9 100644 --- a/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c +++ b/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c @@ -396,7 +396,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, clib_net_to_host_u32 (tcp0->seq_number) + 1, no_of_responses, now, - vm->cpu_index, &pool_index0)) + vm->thread_index, &pool_index0)) { cache_ts_added++; } @@ -419,7 +419,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, e2e = (ioam_e2e_cache_option_t *) ((u8 *) hbh0 + cm->rewrite_pool_index_offset); - e2e->pool_id = (u8) vm->cpu_index; + e2e->pool_id = (u8) vm->thread_index; e2e->pool_index = pool_index0; ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) ((u8 *) e2e + @@ -455,7 +455,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, clib_net_to_host_u32 (tcp1->seq_number) + 1, no_of_responses, now, - vm->cpu_index, &pool_index1)) + vm->thread_index, &pool_index1)) { cache_ts_added++; } @@ -479,7 +479,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, e2e = (ioam_e2e_cache_option_t *) ((u8 *) hbh1 + cm->rewrite_pool_index_offset); - e2e->pool_id = (u8) vm->cpu_index; + e2e->pool_id = (u8) vm->thread_index; e2e->pool_index = pool_index1; ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) ((u8 *) e2e + @@ -562,7 +562,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, clib_net_to_host_u32 (tcp0->seq_number) + 1, no_of_responses, now, - vm->cpu_index, &pool_index0)) + vm->thread_index, &pool_index0)) { cache_ts_added++; } @@ -585,7 +585,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, e2e = (ioam_e2e_cache_option_t *) ((u8 *) hbh0 + cm->rewrite_pool_index_offset); - e2e->pool_id = (u8) vm->cpu_index; + e2e->pool_id = (u8) vm->thread_index; e2e->pool_index = pool_index0; ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) ((u8 *) e2e + @@ -701,7 +701,7 @@ expired_cache_ts_timer_callback (u32 * expired_timers) ioam_cache_main_t *cm = &ioam_cache_main; int i; u32 pool_index; - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 count = 0; for (i = 0; i < vec_len (expired_timers); i++) @@ -724,7 +724,7 @@ ioam_cache_ts_timer_tick_node_fn (vlib_main_t * vm, vlib_frame_t * f) { ioam_cache_main_t *cm = &ioam_cache_main; - u32 my_thread_index = os_get_cpu_number (); + u32 my_thread_index = vlib_get_thread_index (); struct timespec ts, tsrem; tw_timer_expire_timers_16t_2w_512sl (&cm->timer_wheels[my_thread_index], diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c index f3c5cc09..08f5b692 100644 --- a/src/plugins/ixge/ixge.c +++ b/src/plugins/ixge/ixge.c @@ -1887,7 +1887,7 @@ done: vlib_increment_combined_counter (vnet_main. interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - 0 /* cpu_index */ , + 0 /* thread_index */ , xd->vlib_sw_if_index, n_packets, dq->rx.n_bytes); diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c index add81236..addc2a42 100644 --- a/src/plugins/lb/lb.c +++ b/src/plugins/lb/lb.c @@ -63,11 +63,11 @@ u8 *format_lb_main (u8 * s, va_list * args) s = format(s, " #vips: %u\n", pool_elts(lbm->vips)); s = format(s, " #ass: %u\n", pool_elts(lbm->ass) - 1); - u32 cpu_index; - for(cpu_index = 0; cpu_index < tm->n_vlib_mains; cpu_index++ ) { - lb_hash_t *h = lbm->per_cpu[cpu_index].sticky_ht; + u32 thread_index; + for(thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++ ) { + lb_hash_t *h = lbm->per_cpu[thread_index].sticky_ht; if (h) { - s = format(s, "core %d\n", cpu_index); + s = format(s, "core %d\n", thread_index); s = format(s, " timeout: %ds\n", h->timeout); s = format(s, " usage: %d / %d\n", lb_hash_elts(h, lb_hash_time_now(vlib_get_main())), lb_hash_size(h)); } diff --git a/src/plugins/lb/node.c b/src/plugins/lb/node.c index 8b763c53..3171148b 100644 --- a/src/plugins/lb/node.c +++ b/src/plugins/lb/node.c @@ -60,10 +60,10 @@ format_lb_trace (u8 * s, va_list * args) return s; } -lb_hash_t *lb_get_sticky_table(u32 cpu_index) +lb_hash_t *lb_get_sticky_table(u32 thread_index) { lb_main_t *lbm = &lb_main; - lb_hash_t *sticky_ht = lbm->per_cpu[cpu_index].sticky_ht; + lb_hash_t *sticky_ht = lbm->per_cpu[thread_index].sticky_ht; //Check if size changed if (PREDICT_FALSE(sticky_ht && (lbm->per_cpu_sticky_buckets != lb_hash_nbuckets(sticky_ht)))) { @@ -71,8 +71,8 @@ lb_hash_t *lb_get_sticky_table(u32 cpu_index) lb_hash_bucket_t *b; u32 i; lb_hash_foreach_entry(sticky_ht, b, i) { - vlib_refcount_add(&lbm->as_refcount, cpu_index, b->value[i], -1); - vlib_refcount_add(&lbm->as_refcount, cpu_index, 0, 1); + vlib_refcount_add(&lbm->as_refcount, thread_index, b->value[i], -1); + vlib_refcount_add(&lbm->as_refcount, thread_index, 0, 1); } lb_hash_free(sticky_ht); @@ -81,8 +81,8 @@ lb_hash_t *lb_get_sticky_table(u32 cpu_index) //Create if necessary if (PREDICT_FALSE(sticky_ht == NULL)) { - lbm->per_cpu[cpu_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout); - sticky_ht = lbm->per_cpu[cpu_index].sticky_ht; + lbm->per_cpu[thread_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout); + sticky_ht = lbm->per_cpu[thread_index].sticky_ht; clib_warning("Regenerated sticky table %p", sticky_ht); } @@ -153,10 +153,10 @@ lb_node_fn (vlib_main_t * vm, { lb_main_t *lbm = &lb_main; u32 n_left_from, *from, next_index, *to_next, n_left_to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 lb_time = lb_hash_time_now(vm); - lb_hash_t *sticky_ht = lb_get_sticky_table(cpu_index); + lb_hash_t *sticky_ht = lb_get_sticky_table(thread_index); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; next_index = node->cached_next_index; @@ -240,9 +240,9 @@ lb_node_fn (vlib_main_t * vm, //Configuration may be changed, vectors resized, etc... //Dereference previously used - vlib_refcount_add(&lbm->as_refcount, cpu_index, + vlib_refcount_add(&lbm->as_refcount, thread_index, lb_hash_available_value(sticky_ht, hash0, available_index0), -1); - vlib_refcount_add(&lbm->as_refcount, cpu_index, + vlib_refcount_add(&lbm->as_refcount, thread_index, asindex0, 1); //Add sticky entry @@ -260,7 +260,7 @@ lb_node_fn (vlib_main_t * vm, } vlib_increment_simple_counter(&lbm->vip_counters[counter], - cpu_index, + thread_index, vnet_buffer (p0)->ip.adj_index[VLIB_TX], 1); diff --git a/src/plugins/lb/refcount.c b/src/plugins/lb/refcount.c index 22415c88..6f01ab5a 100644 --- a/src/plugins/lb/refcount.c +++ b/src/plugins/lb/refcount.c @@ -31,10 +31,10 @@ u64 vlib_refcount_get(vlib_refcount_t *r, u32 index) { u64 count = 0; vlib_thread_main_t *tm = vlib_get_thread_main (); - u32 cpu_index; - for (cpu_index = 0; cpu_index < tm->n_vlib_mains; cpu_index++) { - if (r->per_cpu[cpu_index].length > index) - count += r->per_cpu[cpu_index].counters[index]; + u32 thread_index; + for (thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++) { + if (r->per_cpu[thread_index].length > index) + count += r->per_cpu[thread_index].counters[index]; } return count; } diff --git a/src/plugins/lb/refcount.h b/src/plugins/lb/refcount.h index 8c26e7be..dcfcb3fe 100644 --- a/src/plugins/lb/refcount.h +++ b/src/plugins/lb/refcount.h @@ -45,9 +45,9 @@ typedef struct { void __vlib_refcount_resize(vlib_refcount_per_cpu_t *per_cpu, u32 size); static_always_inline -void vlib_refcount_add(vlib_refcount_t *r, u32 cpu_index, u32 counter_index, i32 v) +void vlib_refcount_add(vlib_refcount_t *r, u32 thread_index, u32 counter_index, i32 v) { - vlib_refcount_per_cpu_t *per_cpu = &r->per_cpu[cpu_index]; + vlib_refcount_per_cpu_t *per_cpu = &r->per_cpu[thread_index]; if (PREDICT_FALSE(counter_index >= per_cpu->length)) __vlib_refcount_resize(per_cpu, clib_max(counter_index + 16, per_cpu->length * 2)); diff --git a/src/plugins/memif/node.c b/src/plugins/memif/node.c index 659d5dfb..cee1f3d1 100644 --- a/src/plugins/memif/node.c +++ b/src/plugins/memif/node.c @@ -94,7 +94,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_rx_bytes = 0; u32 *to_next = 0; u32 n_free_bufs; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 bi0, bi1; vlib_buffer_t *b0, *b1; u16 ring_size = 1 << mif->log2_ring_size; @@ -105,14 +105,15 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (mif->per_interface_next_index != ~0) next_index = mif->per_interface_next_index; - n_free_bufs = vec_len (nm->rx_buffers[cpu_index]); + n_free_bufs = vec_len (nm->rx_buffers[thread_index]); if (PREDICT_FALSE (n_free_bufs < ring_size)) { - vec_validate (nm->rx_buffers[cpu_index], ring_size + n_free_bufs - 1); + vec_validate (nm->rx_buffers[thread_index], + ring_size + n_free_bufs - 1); n_free_bufs += - vlib_buffer_alloc (vm, &nm->rx_buffers[cpu_index][n_free_bufs], + vlib_buffer_alloc (vm, &nm->rx_buffers[thread_index][n_free_bufs], ring_size); - _vec_len (nm->rx_buffers[cpu_index]) = n_free_bufs; + _vec_len (nm->rx_buffers[thread_index]) = n_free_bufs; } head = ring->head; @@ -158,15 +159,15 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, CLIB_CACHE_LINE_BYTES, LOAD); } /* get empty buffer */ - u32 last_buf = vec_len (nm->rx_buffers[cpu_index]) - 1; - bi0 = nm->rx_buffers[cpu_index][last_buf]; - bi1 = nm->rx_buffers[cpu_index][last_buf - 1]; - _vec_len (nm->rx_buffers[cpu_index]) -= 2; + u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1; + bi0 = nm->rx_buffers[thread_index][last_buf]; + bi1 = nm->rx_buffers[thread_index][last_buf - 1]; + _vec_len (nm->rx_buffers[thread_index]) -= 2; if (last_buf > 4) { - memif_prefetch (vm, nm->rx_buffers[cpu_index][last_buf - 2]); - memif_prefetch (vm, nm->rx_buffers[cpu_index][last_buf - 3]); + memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 2]); + memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 3]); } /* enqueue buffer */ @@ -256,9 +257,9 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, while (num_slots && n_left_to_next) { /* get empty buffer */ - u32 last_buf = vec_len (nm->rx_buffers[cpu_index]) - 1; - bi0 = nm->rx_buffers[cpu_index][last_buf]; - _vec_len (nm->rx_buffers[cpu_index]) = last_buf; + u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1; + bi0 = nm->rx_buffers[thread_index][last_buf]; + _vec_len (nm->rx_buffers[thread_index]) = last_buf; /* enqueue buffer */ to_next[0] = bi0; @@ -315,7 +316,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, ring->tail = head; vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters - + VNET_INTERFACE_COUNTER_RX, cpu_index, + + VNET_INTERFACE_COUNTER_RX, thread_index, mif->hw_if_index, n_rx_packets, n_rx_bytes); @@ -327,7 +328,7 @@ memif_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { u32 n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); memif_main_t *nm = &memif_main; memif_if_t *mif; @@ -337,7 +338,7 @@ memif_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, if (mif->flags & MEMIF_IF_FLAG_ADMIN_UP && mif->flags & MEMIF_IF_FLAG_CONNECTED && (mif->if_index % nm->input_cpu_count) == - (cpu_index - nm->input_cpu_first_index)) + (thread_index - nm->input_cpu_first_index)) { if (mif->flags & MEMIF_IF_FLAG_IS_SLAVE) n_rx_packets += diff --git a/src/plugins/snat/in2out.c b/src/plugins/snat/in2out.c index b4961365..e5ee965f 100644 --- a/src/plugins/snat/in2out.c +++ b/src/plugins/snat/in2out.c @@ -212,7 +212,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, snat_session_t ** sessionp, vlib_node_runtime_t * node, u32 next0, - u32 cpu_index) + u32 thread_index) { snat_user_t *u; snat_user_key_t user_key; @@ -246,27 +246,27 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, if (clib_bihash_search_8_8 (&sm->user_hash, &kv0, &value0)) { /* no, make a new one */ - pool_get (sm->per_thread_data[cpu_index].users, u); + pool_get (sm->per_thread_data[thread_index].users, u); memset (u, 0, sizeof (*u)); u->addr = ip0->src_address; u->fib_index = rx_fib_index0; - pool_get (sm->per_thread_data[cpu_index].list_pool, per_user_list_head_elt); + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_list_head_elt); u->sessions_per_user_list_head_index = per_user_list_head_elt - - sm->per_thread_data[cpu_index].list_pool; + sm->per_thread_data[thread_index].list_pool; - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index); - kv0.value = u - sm->per_thread_data[cpu_index].users; + kv0.value = u - sm->per_thread_data[thread_index].users; /* add user */ clib_bihash_add_del_8_8 (&sm->user_hash, &kv0, 1 /* is_add */); } else { - u = pool_elt_at_index (sm->per_thread_data[cpu_index].users, + u = pool_elt_at_index (sm->per_thread_data[thread_index].users, value0.value); } @@ -276,25 +276,25 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, /* Remove the oldest dynamic translation */ do { oldest_per_user_translation_list_index = - clib_dlist_remove_head (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove_head (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index); ASSERT (oldest_per_user_translation_list_index != ~0); /* add it back to the end of the LRU list */ - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index, oldest_per_user_translation_list_index); /* Get the list element */ oldest_per_user_translation_list_elt = - pool_elt_at_index (sm->per_thread_data[cpu_index].list_pool, + pool_elt_at_index (sm->per_thread_data[thread_index].list_pool, oldest_per_user_translation_list_index); /* Get the session index from the list element */ session_index = oldest_per_user_translation_list_elt->value; /* Get the session */ - s = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, session_index); } while (snat_is_session_static (s)); @@ -346,7 +346,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, } /* Create a new session */ - pool_get (sm->per_thread_data[cpu_index].sessions, s); + pool_get (sm->per_thread_data[thread_index].sessions, s); memset (s, 0, sizeof (*s)); s->outside_address_index = address_index; @@ -362,22 +362,22 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, } /* Create list elts */ - pool_get (sm->per_thread_data[cpu_index].list_pool, + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt); - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); per_user_translation_list_elt->value = - s - sm->per_thread_data[cpu_index].sessions; + s - sm->per_thread_data[thread_index].sessions; s->per_user_index = per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool; + sm->per_thread_data[thread_index].list_pool; s->per_user_list_head_index = u->sessions_per_user_list_head_index; - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s->per_user_list_head_index, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); } s->in2out = *key0; @@ -388,12 +388,12 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, /* Add to translation hashes */ kv0.key = s->in2out.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->in2out, &kv0, 1 /* is_add */)) clib_warning ("in2out key add failed"); kv0.key = s->out2in.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->out2in, &kv0, 1 /* is_add */)) clib_warning ("out2in key add failed"); @@ -403,7 +403,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, worker_by_out_key.port = s->out2in.port; worker_by_out_key.fib_index = s->out2in.fib_index; kv0.key = worker_by_out_key.as_u64; - kv0.value = cpu_index; + kv0.value = thread_index; clib_bihash_add_del_8_8 (&sm->worker_by_out, &kv0, 1); /* log NAT event */ @@ -465,7 +465,7 @@ snat_in2out_error_t icmp_get_key(icmp46_header_t *icmp0, * * @param[in,out] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -473,7 +473,7 @@ snat_in2out_error_t icmp_get_key(icmp46_header_t *icmp0, * @param d optional parameter */ u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -524,13 +524,13 @@ u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, } next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, - &s0, node, next0, cpu_index); + &s0, node, next0, thread_index); if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) goto out; } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); out: @@ -548,7 +548,7 @@ out: * * @param[in] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -556,7 +556,7 @@ out: * @param d optional parameter */ u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -624,7 +624,7 @@ static inline u32 icmp_in2out (snat_main_t *sm, u32 rx_fib_index0, vlib_node_runtime_t * node, u32 next0, - u32 cpu_index, + u32 thread_index, void *d) { snat_session_key_t key0, sm0; @@ -641,7 +641,7 @@ static inline u32 icmp_in2out (snat_main_t *sm, echo0 = (icmp_echo_header_t *)(icmp0+1); - next0_tmp = sm->icmp_match_in2out_cb(sm, node, cpu_index, b0, + next0_tmp = sm->icmp_match_in2out_cb(sm, node, thread_index, b0, &key0, &sm0, &dont_translate, d); if (next0_tmp != ~0) next0 = next0_tmp; @@ -847,11 +847,11 @@ static inline u32 icmp_in2out_slow_path (snat_main_t *sm, vlib_node_runtime_t * node, u32 next0, f64 now, - u32 cpu_index, + u32 thread_index, snat_session_t ** p_s0) { next0 = icmp_in2out(sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, cpu_index, p_s0); + next0, thread_index, p_s0); snat_session_t * s0 = *p_s0; if (PREDICT_TRUE(next0 != SNAT_IN2OUT_NEXT_DROP && s0)) { @@ -862,9 +862,9 @@ static inline u32 icmp_in2out_slow_path (snat_main_t *sm, /* Per-user LRU list maintenance for dynamic translations */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -884,7 +884,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, snat_runtime_t * rt = (snat_runtime_t *)node->runtime_data; f64 now = vlib_time_now (vm); u32 stats_node_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); stats_node_index = is_slow_path ? snat_in2out_slowpath_node.index : snat_in2out_node.index; @@ -977,7 +977,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, { next0 = icmp_in2out_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, - node, next0, now, cpu_index, &s0); + node, next0, now, thread_index, &s0); goto trace00; } } @@ -1006,7 +1006,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, goto trace00; next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, - &s0, node, next0, cpu_index); + &s0, node, next0, thread_index); if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) goto trace00; } @@ -1017,7 +1017,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->src_address.as_u32; @@ -1063,9 +1063,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -1081,7 +1081,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; @@ -1117,7 +1117,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, { next1 = icmp_in2out_slow_path (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, node, - next1, now, cpu_index, &s1); + next1, now, thread_index, &s1); goto trace01; } } @@ -1146,7 +1146,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, goto trace01; next1 = slow_path (sm, b1, ip1, rx_fib_index1, &key1, - &s1, node, next1, cpu_index); + &s1, node, next1, thread_index); if (PREDICT_FALSE (next1 == SNAT_IN2OUT_NEXT_DROP)) goto trace01; } @@ -1157,7 +1157,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } } else - s1 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s1 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value1.value); old_addr1 = ip1->src_address.as_u32; @@ -1203,9 +1203,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s1)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s1->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s1->per_user_list_head_index, s1->per_user_index); } @@ -1220,7 +1220,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, t->next_index = next1; t->session_index = ~0; if (s1) - t->session_index = s1 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s1 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next1 != SNAT_IN2OUT_NEXT_DROP; @@ -1292,7 +1292,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, { next0 = icmp_in2out_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, cpu_index, &s0); + next0, now, thread_index, &s0); goto trace0; } } @@ -1321,7 +1321,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, goto trace0; next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, - &s0, node, next0, cpu_index); + &s0, node, next0, thread_index); if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) goto trace0; @@ -1333,7 +1333,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->src_address.as_u32; @@ -1379,9 +1379,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -1397,7 +1397,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; @@ -2010,7 +2010,7 @@ snat_in2out_worker_handoff_fn (vlib_main_t * vm, u32 n_left_to_next_worker = 0, *to_next_worker = 0; u32 next_worker_index = 0; u32 current_worker_index = ~0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ASSERT (vec_len (sm->workers)); @@ -2048,7 +2048,7 @@ snat_in2out_worker_handoff_fn (vlib_main_t * vm, next_worker_index = sm->worker_in2out_cb(ip0, rx_fib_index0); - if (PREDICT_FALSE (next_worker_index != cpu_index)) + if (PREDICT_FALSE (next_worker_index != thread_index)) { do_handoff = 1; diff --git a/src/plugins/snat/out2in.c b/src/plugins/snat/out2in.c index 656e42db..5d308d78 100644 --- a/src/plugins/snat/out2in.c +++ b/src/plugins/snat/out2in.c @@ -129,7 +129,7 @@ create_session_for_static_mapping (snat_main_t *sm, snat_session_key_t in2out, snat_session_key_t out2in, vlib_node_runtime_t * node, - u32 cpu_index) + u32 thread_index) { snat_user_t *u; snat_user_key_t user_key; @@ -146,36 +146,36 @@ create_session_for_static_mapping (snat_main_t *sm, if (clib_bihash_search_8_8 (&sm->user_hash, &kv0, &value0)) { /* no, make a new one */ - pool_get (sm->per_thread_data[cpu_index].users, u); + pool_get (sm->per_thread_data[thread_index].users, u); memset (u, 0, sizeof (*u)); u->addr = in2out.addr; u->fib_index = in2out.fib_index; - pool_get (sm->per_thread_data[cpu_index].list_pool, + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_list_head_elt); u->sessions_per_user_list_head_index = per_user_list_head_elt - - sm->per_thread_data[cpu_index].list_pool; + sm->per_thread_data[thread_index].list_pool; - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index); - kv0.value = u - sm->per_thread_data[cpu_index].users; + kv0.value = u - sm->per_thread_data[thread_index].users; /* add user */ clib_bihash_add_del_8_8 (&sm->user_hash, &kv0, 1 /* is_add */); /* add non-traslated packets worker lookup */ - kv0.value = cpu_index; + kv0.value = thread_index; clib_bihash_add_del_8_8 (&sm->worker_by_in, &kv0, 1); } else { - u = pool_elt_at_index (sm->per_thread_data[cpu_index].users, + u = pool_elt_at_index (sm->per_thread_data[thread_index].users, value0.value); } - pool_get (sm->per_thread_data[cpu_index].sessions, s); + pool_get (sm->per_thread_data[thread_index].sessions, s); memset (s, 0, sizeof (*s)); s->outside_address_index = ~0; @@ -183,22 +183,22 @@ create_session_for_static_mapping (snat_main_t *sm, u->nstaticsessions++; /* Create list elts */ - pool_get (sm->per_thread_data[cpu_index].list_pool, + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt); - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); per_user_translation_list_elt->value = - s - sm->per_thread_data[cpu_index].sessions; + s - sm->per_thread_data[thread_index].sessions; s->per_user_index = - per_user_translation_list_elt - sm->per_thread_data[cpu_index].list_pool; + per_user_translation_list_elt - sm->per_thread_data[thread_index].list_pool; s->per_user_list_head_index = u->sessions_per_user_list_head_index; - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s->per_user_list_head_index, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); s->in2out = in2out; s->out2in = out2in; @@ -206,12 +206,12 @@ create_session_for_static_mapping (snat_main_t *sm, /* Add to translation hashes */ kv0.key = s->in2out.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->in2out, &kv0, 1 /* is_add */)) clib_warning ("in2out key add failed"); kv0.key = s->out2in.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->out2in, &kv0, 1 /* is_add */)) clib_warning ("out2in key add failed"); @@ -298,7 +298,7 @@ is_interface_addr(snat_main_t *sm, vlib_node_runtime_t *node, u32 sw_if_index0, * * @param[in,out] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -306,7 +306,7 @@ is_interface_addr(snat_main_t *sm, vlib_node_runtime_t *node, u32 sw_if_index0, * @param d optional parameter */ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -366,7 +366,7 @@ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping(sm, b0, sm0, key0, - node, cpu_index); + node, thread_index); if (!s0) { @@ -375,7 +375,7 @@ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); out: @@ -393,7 +393,7 @@ out: * * @param[in] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -401,7 +401,7 @@ out: * @param d optional parameter */ u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -460,7 +460,7 @@ static inline u32 icmp_out2in (snat_main_t *sm, u32 rx_fib_index0, vlib_node_runtime_t * node, u32 next0, - u32 cpu_index, + u32 thread_index, void *d) { snat_session_key_t key0, sm0; @@ -477,7 +477,7 @@ static inline u32 icmp_out2in (snat_main_t *sm, echo0 = (icmp_echo_header_t *)(icmp0+1); - next0_tmp = sm->icmp_match_out2in_cb(sm, node, cpu_index, b0, + next0_tmp = sm->icmp_match_out2in_cb(sm, node, thread_index, b0, &key0, &sm0, &dont_translate, d); if (next0_tmp != ~0) next0 = next0_tmp; @@ -589,11 +589,11 @@ static inline u32 icmp_out2in_slow_path (snat_main_t *sm, u32 rx_fib_index0, vlib_node_runtime_t * node, u32 next0, f64 now, - u32 cpu_index, + u32 thread_index, snat_session_t ** p_s0) { next0 = icmp_out2in(sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, cpu_index, p_s0); + next0, thread_index, p_s0); snat_session_t * s0 = *p_s0; if (PREDICT_TRUE(next0 != SNAT_OUT2IN_NEXT_DROP && s0)) { @@ -604,9 +604,9 @@ static inline u32 icmp_out2in_slow_path (snat_main_t *sm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -624,7 +624,7 @@ snat_out2in_node_fn (vlib_main_t * vm, u32 pkts_processed = 0; snat_main_t * sm = &snat_main; f64 now = vlib_time_now (vm); - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -712,7 +712,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { next0 = icmp_out2in_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, cpu_index, &s0); + next0, now, thread_index, &s0); goto trace0; } @@ -743,7 +743,7 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping(sm, b0, sm0, key0, node, - cpu_index); + thread_index); if (!s0) { b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; @@ -752,7 +752,7 @@ snat_out2in_node_fn (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->dst_address.as_u32; @@ -796,9 +796,9 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -813,7 +813,7 @@ snat_out2in_node_fn (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; @@ -847,7 +847,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { next1 = icmp_out2in_slow_path (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, node, - next1, now, cpu_index, &s1); + next1, now, thread_index, &s1); goto trace1; } @@ -878,7 +878,7 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Create session initiated by host from external network */ s1 = create_session_for_static_mapping(sm, b1, sm1, key1, node, - cpu_index); + thread_index); if (!s1) { b1->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; @@ -887,7 +887,7 @@ snat_out2in_node_fn (vlib_main_t * vm, } } else - s1 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s1 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value1.value); old_addr1 = ip1->dst_address.as_u32; @@ -931,9 +931,9 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s1)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s1->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s1->per_user_list_head_index, s1->per_user_index); } @@ -948,7 +948,7 @@ snat_out2in_node_fn (vlib_main_t * vm, t->next_index = next1; t->session_index = ~0; if (s1) - t->session_index = s1 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s1 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next1 != SNAT_OUT2IN_NEXT_DROP; @@ -1016,7 +1016,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { next0 = icmp_out2in_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, cpu_index, &s0); + next0, now, thread_index, &s0); goto trace00; } @@ -1048,7 +1048,7 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping(sm, b0, sm0, key0, node, - cpu_index); + thread_index); if (!s0) { b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; @@ -1057,7 +1057,7 @@ snat_out2in_node_fn (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->dst_address.as_u32; @@ -1101,9 +1101,9 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -1118,7 +1118,7 @@ snat_out2in_node_fn (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; @@ -1599,7 +1599,7 @@ snat_out2in_worker_handoff_fn (vlib_main_t * vm, u32 n_left_to_next_worker = 0, *to_next_worker = 0; u32 next_worker_index = 0; u32 current_worker_index = ~0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ASSERT (vec_len (sm->workers)); @@ -1637,7 +1637,7 @@ snat_out2in_worker_handoff_fn (vlib_main_t * vm, next_worker_index = sm->worker_out2in_cb(ip0, rx_fib_index0); - if (PREDICT_FALSE (next_worker_index != cpu_index)) + if (PREDICT_FALSE (next_worker_index != thread_index)) { do_handoff = 1; diff --git a/src/plugins/snat/snat.h b/src/plugins/snat/snat.h index 017825c0..f4e1c5c0 100644 --- a/src/plugins/snat/snat.h +++ b/src/plugins/snat/snat.h @@ -221,7 +221,7 @@ struct snat_main_s; typedef u32 snat_icmp_match_function_t (struct snat_main_s *sm, vlib_node_runtime_t *node, - u32 cpu_index, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, @@ -402,22 +402,22 @@ typedef struct { } tcp_udp_header_t; u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index a517a597..be3b41ef 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -299,7 +299,7 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, if (CLIB_DEBUG == 0) return; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); /* smp disaster check */ if (vec_len (vlib_mains) > 1) @@ -355,7 +355,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, vlib_buffer_free_list_t *f; int i; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0) { @@ -474,7 +474,7 @@ vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index) u32 merge_index; int i; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); f = vlib_buffer_get_free_list (vm, free_list_index); diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 394c336a..328660a3 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -209,7 +209,7 @@ always_inline vlib_buffer_known_state_t vlib_buffer_is_known (vlib_main_t * vm, u32 buffer_index) { vlib_buffer_main_t *bm = vm->buffer_main; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); uword *p = hash_get (bm->buffer_known_hash, buffer_index); return p ? p[0] : VLIB_BUFFER_UNKNOWN; @@ -221,7 +221,7 @@ vlib_buffer_set_known_state (vlib_main_t * vm, vlib_buffer_known_state_t state) { vlib_buffer_main_t *bm = vm->buffer_main; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); hash_set (bm->buffer_known_hash, buffer_index, state); } diff --git a/src/vlib/cli.c b/src/vlib/cli.c index f853f655..3cc95076 100644 --- a/src/vlib/cli.c +++ b/src/vlib/cli.c @@ -709,7 +709,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input, { /* *INDENT-OFF* */ foreach_vlib_main({ - heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + heap = clib_per_cpu_mheaps[this_vlib_main->thread_index]; mheap = mheap_header(heap); mheap->flags |= MHEAP_FLAG_VALIDATE; // Turn off small object cache because it delays detection of errors @@ -722,7 +722,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input, { /* *INDENT-OFF* */ foreach_vlib_main({ - heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + heap = clib_per_cpu_mheaps[this_vlib_main->thread_index]; mheap = mheap_header(heap); mheap->flags &= ~MHEAP_FLAG_VALIDATE; mheap->flags |= MHEAP_FLAG_SMALL_OBJECT_CACHE; @@ -733,7 +733,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input, { /* *INDENT-OFF* */ foreach_vlib_main({ - heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + heap = clib_per_cpu_mheaps[this_vlib_main->thread_index]; mheap = mheap_header(heap); mheap_validate(heap); }); diff --git a/src/vlib/counter.h b/src/vlib/counter.h index 17a85217..60e2055d 100644 --- a/src/vlib/counter.h +++ b/src/vlib/counter.h @@ -70,17 +70,17 @@ u32 vlib_simple_counter_n_counters (const vlib_simple_counter_main_t * cm); /** Increment a simple counter @param cm - (vlib_simple_counter_main_t *) simple counter main pointer - @param cpu_index - (u32) the current cpu index + @param thread_index - (u32) the current cpu index @param index - (u32) index of the counter to increment @param increment - (u64) quantitiy to add to the counter */ always_inline void vlib_increment_simple_counter (vlib_simple_counter_main_t * cm, - u32 cpu_index, u32 index, u64 increment) + u32 thread_index, u32 index, u64 increment) { counter_t *my_counters; - my_counters = cm->counters[cpu_index]; + my_counters = cm->counters[thread_index]; my_counters[index] += increment; } @@ -201,7 +201,7 @@ void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm); /** Increment a combined counter @param cm - (vlib_combined_counter_main_t *) comined counter main pointer - @param cpu_index - (u32) the current cpu index + @param thread_index - (u32) the current cpu index @param index - (u32) index of the counter to increment @param packet_increment - (u64) number of packets to add to the counter @param byte_increment - (u64) number of bytes to add to the counter @@ -209,13 +209,13 @@ void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm); always_inline void vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, - u32 cpu_index, + u32 thread_index, u32 index, u64 n_packets, u64 n_bytes) { vlib_counter_t *my_counters; /* Use this CPU's counter array */ - my_counters = cm->counters[cpu_index]; + my_counters = cm->counters[thread_index]; my_counters[index].packets += n_packets; my_counters[index].bytes += n_bytes; @@ -224,14 +224,14 @@ vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, /** Pre-fetch a per-thread combined counter for the given object index */ always_inline void vlib_prefetch_combined_counter (const vlib_combined_counter_main_t * cm, - u32 cpu_index, u32 index) + u32 thread_index, u32 index) { vlib_counter_t *cpu_counters; /* * This CPU's index is assumed to already be in cache */ - cpu_counters = cm->counters[cpu_index]; + cpu_counters = cm->counters[thread_index]; CLIB_PREFETCH (cpu_counters + index, CLIB_CACHE_LINE_BYTES, STORE); } diff --git a/src/vlib/error.c b/src/vlib/error.c index a2c23176..e4ed4ee3 100644 --- a/src/vlib/error.c +++ b/src/vlib/error.c @@ -149,7 +149,7 @@ vlib_register_errors (vlib_main_t * vm, vlib_node_t *n = vlib_get_node (vm, node_index); uword l; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); /* Free up any previous error strings. */ if (n->n_errors > 0) diff --git a/src/vlib/global_funcs.h b/src/vlib/global_funcs.h index f51ec381..9dd01fbf 100644 --- a/src/vlib/global_funcs.h +++ b/src/vlib/global_funcs.h @@ -23,7 +23,7 @@ always_inline vlib_main_t * vlib_get_main (void) { vlib_main_t *vm; - vm = vlib_mains[os_get_cpu_number ()]; + vm = vlib_mains[vlib_get_thread_index ()]; ASSERT (vm); return vm; } diff --git a/src/vlib/main.c b/src/vlib/main.c index b22203f0..422d3e26 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -136,18 +136,18 @@ vlib_frame_alloc_to_node (vlib_main_t * vm, u32 to_node_index, else { f = clib_mem_alloc_aligned_no_fail (n, VLIB_FRAME_ALIGN); - f->cpu_index = vm->cpu_index; + f->thread_index = vm->thread_index; fi = vlib_frame_index_no_check (vm, f); } /* Poison frame when debugging. */ if (CLIB_DEBUG > 0) { - u32 save_cpu_index = f->cpu_index; + u32 save_thread_index = f->thread_index; memset (f, 0xfe, n); - f->cpu_index = save_cpu_index; + f->thread_index = save_thread_index; } /* Insert magic number. */ @@ -517,7 +517,7 @@ vlib_put_next_frame (vlib_main_t * vm, * a dangling frame reference. Each thread has its own copy of * the next_frames vector. */ - if (0 && r->cpu_index != next_runtime->cpu_index) + if (0 && r->thread_index != next_runtime->thread_index) { nf->frame_index = ~0; nf->flags &= ~(VLIB_FRAME_PENDING | VLIB_FRAME_IS_ALLOCATED); @@ -866,7 +866,7 @@ vlib_elog_main_loop_event (vlib_main_t * vm, : evm->node_call_elog_event_types, node_index), /* track */ - (vm->cpu_index ? &vlib_worker_threads[vm->cpu_index]. + (vm->thread_index ? &vlib_worker_threads[vm->thread_index]. elog_track : &em->default_track), /* data to log */ n_vectors); } @@ -963,7 +963,7 @@ dispatch_node (vlib_main_t * vm, vm->cpu_time_last_node_dispatch = last_time_stamp; - if (1 /* || vm->cpu_index == node->cpu_index */ ) + if (1 /* || vm->thread_index == node->thread_index */ ) { vlib_main_t *stat_vm; @@ -1029,7 +1029,7 @@ dispatch_node (vlib_main_t * vm, { u32 node_name, vector_length, is_polling; } *ed; - vlib_worker_thread_t *w = vlib_worker_threads + vm->cpu_index; + vlib_worker_thread_t *w = vlib_worker_threads + vm->thread_index; #endif if ((dispatch_state == VLIB_NODE_STATE_INTERRUPT diff --git a/src/vlib/main.h b/src/vlib/main.h index 0197b4f3..329bf073 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -156,7 +156,7 @@ typedef struct vlib_main_t uword *init_functions_called; /* to compare with node runtime */ - u32 cpu_index; + u32 thread_index; void **mbuf_alloc_list; diff --git a/src/vlib/node.c b/src/vlib/node.c index dc0a4de5..bbd3a42e 100644 --- a/src/vlib/node.c +++ b/src/vlib/node.c @@ -99,7 +99,7 @@ vlib_node_runtime_update (vlib_main_t * vm, u32 node_index, u32 next_index) vlib_pending_frame_t *pf; i32 i, j, n_insert; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); vlib_worker_thread_barrier_sync (vm); diff --git a/src/vlib/node.h b/src/vlib/node.h index fc7e7da2..1e2f4c38 100644 --- a/src/vlib/node.h +++ b/src/vlib/node.h @@ -344,8 +344,8 @@ typedef struct vlib_frame_t /* Number of vector elements currently in frame. */ u16 n_vectors; - /* Owner cpuid / heap id */ - u16 cpu_index; + /* Owner thread / heap id */ + u16 thread_index; /* Scalar and vector arguments to next node. */ u8 arguments[0]; @@ -459,7 +459,7 @@ typedef struct vlib_node_runtime_t zero before first run of this node. */ - u16 cpu_index; /**< CPU this node runs on */ + u16 thread_index; /**< thread this node runs on */ u8 runtime_data[0]; /**< Function dependent node-runtime data. This data is diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index 1f7d94e1..54e36874 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -201,9 +201,9 @@ always_inline vlib_frame_t * vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index) { vlib_frame_t *f; - u32 cpu_index = frame_index & VLIB_CPU_MASK; + u32 thread_index = frame_index & VLIB_CPU_MASK; u32 offset = frame_index & VLIB_OFFSET_MASK; - vm = vlib_mains[cpu_index]; + vm = vlib_mains[thread_index]; f = vm->heap_base + offset; return f; } @@ -215,10 +215,10 @@ vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f) ASSERT (((uword) f & VLIB_CPU_MASK) == 0); - vm = vlib_mains[f->cpu_index]; + vm = vlib_mains[f->thread_index]; i = ((u8 *) f - (u8 *) vm->heap_base); - return i | f->cpu_index; + return i | f->thread_index; } always_inline vlib_frame_t * diff --git a/src/vlib/threads.c b/src/vlib/threads.c index ef3a24d3..4a111f8d 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -35,27 +35,12 @@ vl (void *p) vlib_worker_thread_t *vlib_worker_threads; vlib_thread_main_t vlib_thread_main; +__thread uword vlib_thread_index = 0; + uword os_get_cpu_number (void) { - void *sp; - uword n; - u32 len; - - len = vec_len (vlib_thread_stacks); - if (len == 0) - return 0; - - /* Get any old stack address. */ - sp = &sp; - - n = ((uword) sp - (uword) vlib_thread_stacks[0]) - >> VLIB_LOG2_THREAD_STACK_SIZE; - - /* "processes" have their own stacks, and they always run in thread 0 */ - n = n >= len ? 0 : n; - - return n; + return vlib_thread_index; } uword @@ -275,21 +260,6 @@ vlib_thread_init (vlib_main_t * vm) return 0; } -vlib_worker_thread_t * -vlib_alloc_thread (vlib_main_t * vm) -{ - vlib_worker_thread_t *w; - - if (vec_len (vlib_worker_threads) >= vec_len (vlib_thread_stacks)) - { - clib_warning ("out of worker threads... Quitting..."); - exit (1); - } - vec_add2 (vlib_worker_threads, w, 1); - w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; - return w; -} - vlib_frame_queue_t * vlib_frame_queue_alloc (int nelts) { @@ -427,7 +397,7 @@ vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index, f64 b4 = vlib_time_now_ticks (vm, before); vlib_worker_thread_barrier_check (vm, b4); /* Bad idea. Dequeue -> enqueue -> dequeue -> trouble */ - // vlib_frame_queue_dequeue (vm->cpu_index, vm, nm); + // vlib_frame_queue_dequeue (vm->thread_index, vm, nm); } elt = fq->elts + (new_tail & (fq->nelts - 1)); @@ -497,6 +467,8 @@ vlib_worker_thread_bootstrap_fn (void *arg) w->lwp = syscall (SYS_gettid); w->thread_id = pthread_self (); + vlib_thread_index = w - vlib_worker_threads; + rv = (void *) clib_calljmp ((uword (*)(uword)) w->thread_function, (uword) arg, w->thread_stack + VLIB_THREAD_STACK_SIZE); @@ -610,7 +582,9 @@ start_workers (vlib_main_t * vm) mheap_alloc (0 /* use VM */ , tr->mheap_size); else w->thread_mheap = main_heap; - w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + + w->thread_stack = + vlib_thread_stack_init (w - vlib_worker_threads); w->thread_function = tr->function; w->thread_function_arg = w; w->instance_id = k; @@ -630,7 +604,7 @@ start_workers (vlib_main_t * vm) vm_clone = clib_mem_alloc (sizeof (*vm_clone)); clib_memcpy (vm_clone, vlib_mains[0], sizeof (*vm_clone)); - vm_clone->cpu_index = worker_thread_index; + vm_clone->thread_index = worker_thread_index; vm_clone->heap_base = w->thread_mheap; vm_clone->mbuf_alloc_list = 0; vm_clone->init_functions_called = @@ -679,7 +653,7 @@ start_workers (vlib_main_t * vm) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy initial runtime_data from node */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -692,7 +666,7 @@ start_workers (vlib_main_t * vm) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy initial runtime_data from node */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -756,7 +730,8 @@ start_workers (vlib_main_t * vm) mheap_alloc (0 /* use VM */ , tr->mheap_size); else w->thread_mheap = main_heap; - w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + w->thread_stack = + vlib_thread_stack_init (w - vlib_worker_threads); w->thread_function = tr->function; w->thread_function_arg = w; w->instance_id = j; @@ -827,7 +802,7 @@ vlib_worker_thread_node_runtime_update (void) uword n_calls, uword n_vectors, uword n_clocks); - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (vec_len (vlib_mains) == 1) return; @@ -835,7 +810,7 @@ vlib_worker_thread_node_runtime_update (void) vm = vlib_mains[0]; nm = &vm->node_main; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); ASSERT (*vlib_worker_threads->wait_at_barrier == 1); /* @@ -955,7 +930,7 @@ vlib_worker_thread_node_runtime_update (void) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy runtime_data, will be overwritten later for existing rt */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -981,7 +956,7 @@ vlib_worker_thread_node_runtime_update (void) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy runtime_data, will be overwritten later for existing rt */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -1180,7 +1155,7 @@ vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which) if (vlib_mains == 0) return; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); vlib_worker_thread_barrier_sync (vm); switch (which) @@ -1212,7 +1187,7 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm) vlib_worker_threads[0].barrier_sync_count++; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; @@ -1260,7 +1235,7 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm) int vlib_frame_queue_dequeue (vlib_main_t * vm, vlib_frame_queue_main_t * fqm) { - u32 thread_id = vm->cpu_index; + u32 thread_id = vm->thread_index; vlib_frame_queue_t *fq = fqm->vlib_frame_queues[thread_id]; vlib_frame_queue_elt_t *elt; u32 *from, *to; @@ -1393,7 +1368,7 @@ vlib_worker_thread_fn (void *arg) vlib_main_t *vm = vlib_get_main (); clib_error_t *e; - ASSERT (vm->cpu_index == os_get_cpu_number ()); + ASSERT (vm->thread_index == vlib_get_thread_index ()); vlib_worker_thread_init (w); clib_time_init (&vm->clib_time); diff --git a/src/vlib/threads.h b/src/vlib/threads.h index eca4fc26..101d3d4a 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -153,8 +153,6 @@ typedef struct /* Called early, in thread 0's context */ clib_error_t *vlib_thread_init (vlib_main_t * vm); -vlib_worker_thread_t *vlib_alloc_thread (vlib_main_t * vm); - int vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index, u32 frame_queue_index, vlib_frame_t * frame, vlib_frame_queue_msg_type_t type); @@ -183,12 +181,19 @@ u32 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts); void vlib_worker_thread_barrier_sync (vlib_main_t * vm); void vlib_worker_thread_barrier_release (vlib_main_t * vm); +extern __thread uword vlib_thread_index; +static_always_inline uword +vlib_get_thread_index (void) +{ + return vlib_thread_index; +} + always_inline void vlib_smp_unsafe_warning (void) { if (CLIB_DEBUG > 0) { - if (os_get_cpu_number ()) + if (vlib_get_thread_index ()) fformat (stderr, "%s: SMP unsafe warning...\n", __FUNCTION__); } } @@ -331,21 +336,21 @@ vlib_num_workers () } always_inline u32 -vlib_get_worker_cpu_index (u32 worker_index) +vlib_get_worker_thread_index (u32 worker_index) { return worker_index + 1; } always_inline u32 -vlib_get_worker_index (u32 cpu_index) +vlib_get_worker_index (u32 thread_index) { - return cpu_index - 1; + return thread_index - 1; } always_inline u32 vlib_get_current_worker_index () { - return os_get_cpu_number () - 1; + return vlib_get_thread_index () - 1; } static inline void @@ -467,6 +472,8 @@ vlib_get_worker_handoff_queue_elt (u32 frame_queue_index, return elt; } +u8 *vlib_thread_stack_init (uword thread_index); + int vlib_thread_cb_register (struct vlib_main_t *vm, vlib_thread_callbacks_t * cb); diff --git a/src/vlib/unix/cj.c b/src/vlib/unix/cj.c index 33ba163a..7c1e9475 100644 --- a/src/vlib/unix/cj.c +++ b/src/vlib/unix/cj.c @@ -48,7 +48,7 @@ cj_log (u32 type, void *data0, void *data1) r = (cj_record_t *) & (cjm->records[new_tail & (cjm->num_records - 1)]); r->time = vlib_time_now (cjm->vlib_main); - r->cpu = os_get_cpu_number (); + r->thread_index = vlib_get_thread_index (); r->type = type; r->data[0] = pointer_to_uword (data0); r->data[1] = pointer_to_uword (data1); @@ -133,7 +133,8 @@ static inline void cj_dump_one_record (cj_record_t * r) { fprintf (stderr, "[%d]: %10.6f T%02d %llx %llx\n", - r->cpu, r->time, r->type, (long long unsigned int) r->data[0], + r->thread_index, r->time, r->type, + (long long unsigned int) r->data[0], (long long unsigned int) r->data[1]); } @@ -161,7 +162,7 @@ cj_dump_internal (u8 filter0_enable, u64 filter0, index = (cjm->tail + 1) & (cjm->num_records - 1); r = &(cjm->records[index]); - if (r->cpu != (u32) ~ 0) + if (r->thread_index != (u32) ~ 0) { /* Yes, dump from tail + 1 to the end */ for (i = index; i < cjm->num_records; i++) diff --git a/src/vlib/unix/cj.h b/src/vlib/unix/cj.h index 67626afe..d0a1d46e 100644 --- a/src/vlib/unix/cj.h +++ b/src/vlib/unix/cj.h @@ -23,7 +23,7 @@ typedef struct { f64 time; - u32 cpu; + u32 thread_index; u32 type; u64 data[2]; } cj_record_t; diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index 6b96cc0d..db5ddd64 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -510,13 +510,28 @@ thread0 (uword arg) return i; } +u8 * +vlib_thread_stack_init (uword thread_index) +{ + vec_validate (vlib_thread_stacks, thread_index); + vlib_thread_stacks[thread_index] = clib_mem_alloc_aligned + (VLIB_THREAD_STACK_SIZE, VLIB_THREAD_STACK_SIZE); + + /* + * Disallow writes to the bottom page of the stack, to + * catch stack overflows. + */ + if (mprotect (vlib_thread_stacks[thread_index], + clib_mem_get_page_size (), PROT_READ) < 0) + clib_unix_warning ("thread stack"); + return vlib_thread_stacks[thread_index]; +} + int vlib_unix_main (int argc, char *argv[]) { vlib_main_t *vm = &vlib_global_main; /* one and only time for this! */ - vlib_thread_main_t *tm = &vlib_thread_main; unformat_input_t input; - u8 *thread_stacks; clib_error_t *e; int i; @@ -548,29 +563,9 @@ vlib_unix_main (int argc, char *argv[]) } unformat_free (&input); - /* - * allocate n x VLIB_THREAD_STACK_SIZE stacks, aligned to a - * VLIB_THREAD_STACK_SIZE boundary - * See also: os_get_cpu_number() in vlib/vlib/threads.c - */ - thread_stacks = clib_mem_alloc_aligned - ((uword) tm->n_thread_stacks * VLIB_THREAD_STACK_SIZE, - VLIB_THREAD_STACK_SIZE); - - vec_validate (vlib_thread_stacks, tm->n_thread_stacks - 1); - for (i = 0; i < vec_len (vlib_thread_stacks); i++) - { - vlib_thread_stacks[i] = thread_stacks; - - /* - * Disallow writes to the bottom page of the stack, to - * catch stack overflows. - */ - if (mprotect (thread_stacks, clib_mem_get_page_size (), PROT_READ) < 0) - clib_unix_warning ("thread stack"); + vlib_thread_stack_init (0); - thread_stacks += VLIB_THREAD_STACK_SIZE; - } + vlib_thread_index = 0; i = clib_calljmp (thread0, (uword) vm, (void *) (vlib_thread_stacks[0] + diff --git a/src/vnet/adj/adj_l2.c b/src/vnet/adj/adj_l2.c index f68e54e0..20d70dd4 100644 --- a/src/vnet/adj/adj_l2.c +++ b/src/vnet/adj/adj_l2.c @@ -52,7 +52,7 @@ adj_l2_rewrite_inline (vlib_main_t * vm, { u32 * from = vlib_frame_vector_args (frame); u32 n_left_from, n_left_to_next, * to_next, next_index; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); ethernet_main_t * em = ðernet_main; n_left_from = frame->n_vectors; @@ -93,7 +93,7 @@ adj_l2_rewrite_inline (vlib_main_t * vm, vnet_buffer(p0)->sw_if_index[VLIB_TX] = adj0->rewrite_header.sw_if_index; vlib_increment_combined_counter(&adjacency_counters, - cpu_index, + thread_index, adj_index0, /* packet increment */ 0, /* byte increment */ rw_len0); diff --git a/src/vnet/adj/adj_midchain.c b/src/vnet/adj/adj_midchain.c index e8087f08..5756de43 100644 --- a/src/vnet/adj/adj_midchain.c +++ b/src/vnet/adj/adj_midchain.c @@ -49,7 +49,7 @@ adj_midchain_tx_inline (vlib_main_t * vm, u32 next_index; vnet_main_t *vnm = vnet_get_main (); vnet_interface_main_t *im = &vnm->interface_main; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; /* Vector of buffer / pkt indices we're supposed to process */ from = vlib_frame_vector_args (frame); @@ -124,13 +124,13 @@ adj_midchain_tx_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, adj0->rewrite_header.sw_if_index, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, adj1->rewrite_header.sw_if_index, 1, vlib_buffer_length_in_chain (vm, b1)); @@ -181,7 +181,7 @@ adj_midchain_tx_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, adj0->rewrite_header.sw_if_index, 1, vlib_buffer_length_in_chain (vm, b0)); diff --git a/src/vnet/adj/adj_nsh.c b/src/vnet/adj/adj_nsh.c index 9a0f9d8b..128570b0 100644 --- a/src/vnet/adj/adj_nsh.c +++ b/src/vnet/adj/adj_nsh.c @@ -53,7 +53,7 @@ adj_nsh_rewrite_inline (vlib_main_t * vm, { u32 * from = vlib_frame_vector_args (frame); u32 n_left_from, n_left_to_next, * to_next, next_index; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); n_left_from = frame->n_vectors; next_index = node->cached_next_index; @@ -94,7 +94,7 @@ adj_nsh_rewrite_inline (vlib_main_t * vm, vnet_buffer(p0)->ip.save_rewrite_length = rw_len0; vlib_increment_combined_counter(&adjacency_counters, - cpu_index, + thread_index, adj_index0, /* packet increment */ 0, /* byte increment */ rw_len0); diff --git a/src/vnet/classify/vnet_classify.c b/src/vnet/classify/vnet_classify.c index 98842a48..70a189b0 100644 --- a/src/vnet/classify/vnet_classify.c +++ b/src/vnet/classify/vnet_classify.c @@ -251,12 +251,12 @@ static inline void make_working_copy vnet_classify_entry_##size##_t * working_copy##size = 0; foreach_size_in_u32x4; #undef _ - u32 cpu_number = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); - if (cpu_number >= vec_len (t->working_copies)) + if (thread_index >= vec_len (t->working_copies)) { oldheap = clib_mem_set_heap (t->mheap); - vec_validate (t->working_copies, cpu_number); + vec_validate (t->working_copies, thread_index); clib_mem_set_heap (oldheap); } @@ -265,7 +265,7 @@ static inline void make_working_copy * updates from multiple threads will not result in sporadic, spurious * lookup failures. */ - working_copy = t->working_copies[cpu_number]; + working_copy = t->working_copies[thread_index]; t->saved_bucket.as_u64 = b->as_u64; oldheap = clib_mem_set_heap (t->mheap); @@ -290,7 +290,7 @@ static inline void make_working_copy default: abort(); } - t->working_copies[cpu_number] = working_copy; + t->working_copies[thread_index] = working_copy; } _vec_len(working_copy) = (1<log2_pages)*t->entries_per_page; @@ -318,7 +318,7 @@ static inline void make_working_copy working_bucket.offset = vnet_classify_get_offset (t, working_copy); CLIB_MEMORY_BARRIER(); b->as_u64 = working_bucket.as_u64; - t->working_copies[cpu_number] = working_copy; + t->working_copies[thread_index] = working_copy; } static vnet_classify_entry_t * @@ -387,7 +387,7 @@ int vnet_classify_add_del (vnet_classify_table_t * t, int i; u64 hash, new_hash; u32 new_log2_pages; - u32 cpu_number = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u8 * key_minus_skip; ASSERT ((add_v->flags & VNET_CLASSIFY_ENTRY_FREE) == 0); @@ -498,7 +498,7 @@ int vnet_classify_add_del (vnet_classify_table_t * t, new_log2_pages = t->saved_bucket.log2_pages + 1; expand_again: - working_copy = t->working_copies[cpu_number]; + working_copy = t->working_copies[thread_index]; new_v = split_and_rehash (t, working_copy, new_log2_pages); if (new_v == 0) diff --git a/src/vnet/cop/ip4_whitelist.c b/src/vnet/cop/ip4_whitelist.c index 6ef3d7d7..1b5e336b 100644 --- a/src/vnet/cop/ip4_whitelist.c +++ b/src/vnet/cop/ip4_whitelist.c @@ -60,7 +60,7 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm, cop_feature_type_t next_index; cop_main_t *cm = &cop_main; vlib_combined_counter_main_t * vcm = &load_balance_main.lbm_via_counters; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -177,12 +177,12 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm, dpo1 = load_balance_get_bucket_i(lb1, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); vlib_increment_combined_counter - (vcm, cpu_index, lb_index1, 1, + (vcm, thread_index, lb_index1, 1, vlib_buffer_length_in_chain (vm, b1) + sizeof(ethernet_header_t)); @@ -273,7 +273,7 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm, dpo0 = load_balance_get_bucket_i(lb0, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); diff --git a/src/vnet/cop/ip6_whitelist.c b/src/vnet/cop/ip6_whitelist.c index c2e16ccf..f3fe62e3 100644 --- a/src/vnet/cop/ip6_whitelist.c +++ b/src/vnet/cop/ip6_whitelist.c @@ -61,7 +61,7 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm, cop_main_t *cm = &cop_main; ip6_main_t * im6 = &ip6_main; vlib_combined_counter_main_t * vcm = &load_balance_main.lbm_via_counters; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -153,12 +153,12 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm, dpo1 = load_balance_get_bucket_i(lb1, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); vlib_increment_combined_counter - (vcm, cpu_index, lb_index1, 1, + (vcm, thread_index, lb_index1, 1, vlib_buffer_length_in_chain (vm, b1) + sizeof(ethernet_header_t)); @@ -233,7 +233,7 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm, dpo0 = load_balance_get_bucket_i(lb0, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c index ba337f3f..76980102 100644 --- a/src/vnet/devices/af_packet/node.c +++ b/src/vnet/devices/af_packet/node.c @@ -124,7 +124,7 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, u32 frame_num = apif->rx_req->tp_frame_nr; u8 *block_start = apif->rx_ring + block * block_size; uword n_trace = vlib_get_trace_count (vm, node); - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); u32 min_bufs = apif->rx_req->tp_frame_size / n_buffer_bytes; @@ -132,15 +132,15 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, if (apif->per_interface_next_index != ~0) next_index = apif->per_interface_next_index; - n_free_bufs = vec_len (apm->rx_buffers[cpu_index]); + n_free_bufs = vec_len (apm->rx_buffers[thread_index]); if (PREDICT_FALSE (n_free_bufs < VLIB_FRAME_SIZE)) { - vec_validate (apm->rx_buffers[cpu_index], + vec_validate (apm->rx_buffers[thread_index], VLIB_FRAME_SIZE + n_free_bufs - 1); n_free_bufs += - vlib_buffer_alloc (vm, &apm->rx_buffers[cpu_index][n_free_bufs], + vlib_buffer_alloc (vm, &apm->rx_buffers[thread_index][n_free_bufs], VLIB_FRAME_SIZE); - _vec_len (apm->rx_buffers[cpu_index]) = n_free_bufs; + _vec_len (apm->rx_buffers[thread_index]) = n_free_bufs; } rx_frame = apif->next_rx_frame; @@ -163,11 +163,11 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, { /* grab free buffer */ u32 last_empty_buffer = - vec_len (apm->rx_buffers[cpu_index]) - 1; + vec_len (apm->rx_buffers[thread_index]) - 1; prev_bi0 = bi0; - bi0 = apm->rx_buffers[cpu_index][last_empty_buffer]; + bi0 = apm->rx_buffers[thread_index][last_empty_buffer]; b0 = vlib_get_buffer (vm, bi0); - _vec_len (apm->rx_buffers[cpu_index]) = last_empty_buffer; + _vec_len (apm->rx_buffers[thread_index]) = last_empty_buffer; n_free_bufs--; /* copy data */ @@ -236,9 +236,9 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), apif->hw_if_index, n_rx_packets, n_rx_bytes); + vlib_get_thread_index (), apif->hw_if_index, n_rx_packets, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, n_rx_packets); + vnet_device_increment_rx_packets (thread_index, n_rx_packets); return n_rx_packets; } diff --git a/src/vnet/devices/devices.c b/src/vnet/devices/devices.c index 41645220..5e5e812c 100644 --- a/src/vnet/devices/devices.c +++ b/src/vnet/devices/devices.c @@ -104,7 +104,7 @@ vnet_device_queue_sort (void *a1, void *a2) void vnet_device_input_assign_thread (u32 hw_if_index, - u16 queue_id, uword cpu_index) + u16 queue_id, uword thread_index) { vnet_main_t *vnm = vnet_get_main (); vnet_device_main_t *vdm = &vnet_device_main; @@ -115,19 +115,19 @@ vnet_device_input_assign_thread (u32 hw_if_index, ASSERT (hw->input_node_index > 0); - if (vdm->first_worker_cpu_index == 0) - cpu_index = 0; + if (vdm->first_worker_thread_index == 0) + thread_index = 0; - if (cpu_index != 0 && - (cpu_index < vdm->first_worker_cpu_index || - cpu_index > vdm->last_worker_cpu_index)) + if (thread_index != 0 && + (thread_index < vdm->first_worker_thread_index || + thread_index > vdm->last_worker_thread_index)) { - cpu_index = vdm->next_worker_cpu_index++; - if (vdm->next_worker_cpu_index > vdm->last_worker_cpu_index) - vdm->next_worker_cpu_index = vdm->first_worker_cpu_index; + thread_index = vdm->next_worker_thread_index++; + if (vdm->next_worker_thread_index > vdm->last_worker_thread_index) + vdm->next_worker_thread_index = vdm->first_worker_thread_index; } - vm = vlib_mains[cpu_index]; + vm = vlib_mains[thread_index]; rt = vlib_node_get_runtime_data (vm, hw->input_node_index); vec_add2 (rt->devices_and_queues, dq, 1); @@ -136,33 +136,33 @@ vnet_device_input_assign_thread (u32 hw_if_index, dq->queue_id = queue_id; vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort); - vec_validate (hw->input_node_cpu_index_by_queue, queue_id); - hw->input_node_cpu_index_by_queue[queue_id] = cpu_index; + vec_validate (hw->input_node_thread_index_by_queue, queue_id); + hw->input_node_thread_index_by_queue[queue_id] = thread_index; } static int vnet_device_input_unassign_thread (u32 hw_if_index, u16 queue_id, - uword cpu_index) + uword thread_index) { vnet_main_t *vnm = vnet_get_main (); vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); vnet_device_input_runtime_t *rt; vnet_device_and_queue_t *dq; - uword old_cpu_index; + uword old_thread_index; - if (hw->input_node_cpu_index_by_queue == 0) + if (hw->input_node_thread_index_by_queue == 0) return VNET_API_ERROR_INVALID_INTERFACE; - if (vec_len (hw->input_node_cpu_index_by_queue) < queue_id + 1) + if (vec_len (hw->input_node_thread_index_by_queue) < queue_id + 1) return VNET_API_ERROR_INVALID_INTERFACE; - old_cpu_index = hw->input_node_cpu_index_by_queue[queue_id]; + old_thread_index = hw->input_node_thread_index_by_queue[queue_id]; - if (old_cpu_index == cpu_index) + if (old_thread_index == thread_index) return 0; rt = - vlib_node_get_runtime_data (vlib_mains[old_cpu_index], + vlib_node_get_runtime_data (vlib_mains[old_thread_index], hw->input_node_index); vec_foreach (dq, rt->devices_and_queues) @@ -240,7 +240,7 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, vnet_device_main_t *vdm = &vnet_device_main; u32 hw_if_index = (u32) ~ 0; u32 queue_id = (u32) 0; - u32 cpu_index = (u32) ~ 0; + u32 thread_index = (u32) ~ 0; int rv; if (!unformat_user (input, unformat_line_input, line_input)) @@ -253,10 +253,10 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, ; else if (unformat (line_input, "queue %d", &queue_id)) ; - else if (unformat (line_input, "main", &cpu_index)) - cpu_index = 0; - else if (unformat (line_input, "worker %d", &cpu_index)) - cpu_index += vdm->first_worker_cpu_index; + else if (unformat (line_input, "main", &thread_index)) + thread_index = 0; + else if (unformat (line_input, "worker %d", &thread_index)) + thread_index += vdm->first_worker_thread_index; else { error = clib_error_return (0, "parse error: '%U'", @@ -271,16 +271,17 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, if (hw_if_index == (u32) ~ 0) return clib_error_return (0, "please specify valid interface name"); - if (cpu_index > vdm->last_worker_cpu_index) + if (thread_index > vdm->last_worker_thread_index) return clib_error_return (0, "please specify valid worker thread or main"); - rv = vnet_device_input_unassign_thread (hw_if_index, queue_id, cpu_index); + rv = + vnet_device_input_unassign_thread (hw_if_index, queue_id, thread_index); if (rv) return clib_error_return (0, "not found"); - vnet_device_input_assign_thread (hw_if_index, queue_id, cpu_index); + vnet_device_input_assign_thread (hw_if_index, queue_id, thread_index); return 0; } @@ -326,9 +327,9 @@ vnet_device_init (vlib_main_t * vm) tr = p ? (vlib_thread_registration_t *) p[0] : 0; if (tr && tr->count > 0) { - vdm->first_worker_cpu_index = tr->first_index; - vdm->next_worker_cpu_index = tr->first_index; - vdm->last_worker_cpu_index = tr->first_index + tr->count - 1; + vdm->first_worker_thread_index = tr->first_index; + vdm->next_worker_thread_index = tr->first_index; + vdm->last_worker_thread_index = tr->first_index + tr->count - 1; } return 0; } diff --git a/src/vnet/devices/devices.h b/src/vnet/devices/devices.h index bbb29fe3..966f8302 100644 --- a/src/vnet/devices/devices.h +++ b/src/vnet/devices/devices.h @@ -50,9 +50,9 @@ typedef struct typedef struct { vnet_device_per_worker_data_t *workers; - uword first_worker_cpu_index; - uword last_worker_cpu_index; - uword next_worker_cpu_index; + uword first_worker_thread_index; + uword last_worker_thread_index; + uword next_worker_thread_index; } vnet_device_main_t; typedef struct @@ -80,7 +80,7 @@ vnet_set_device_input_node (u32 hw_if_index, u32 node_index) } void vnet_device_input_assign_thread (u32 hw_if_index, u16 queue_id, - uword cpu_index); + uword thread_index); static inline u64 vnet_get_aggregate_rx_packets (void) @@ -95,12 +95,12 @@ vnet_get_aggregate_rx_packets (void) } static inline void -vnet_device_increment_rx_packets (u32 cpu_index, u64 count) +vnet_device_increment_rx_packets (u32 thread_index, u64 count) { vnet_device_main_t *vdm = &vnet_device_main; vnet_device_per_worker_data_t *pwd; - pwd = vec_elt_at_index (vdm->workers, cpu_index); + pwd = vec_elt_at_index (vdm->workers, thread_index); pwd->aggregate_rx_packets += count; } @@ -117,9 +117,9 @@ vnet_device_input_set_interrupt_pending (vnet_main_t * vnm, u32 hw_if_index, { vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); - ASSERT (queue_id < vec_len (hw->input_node_cpu_index_by_queue)); - u32 cpu_index = hw->input_node_cpu_index_by_queue[queue_id]; - vlib_node_set_interrupt_pending (vlib_mains[cpu_index], + ASSERT (queue_id < vec_len (hw->input_node_thread_index_by_queue)); + u32 thread_index = hw->input_node_thread_index_by_queue[queue_id]; + vlib_node_set_interrupt_pending (vlib_mains[thread_index], hw->input_node_index); } diff --git a/src/vnet/devices/netmap/node.c b/src/vnet/devices/netmap/node.c index 68ea7832..e120eeae 100644 --- a/src/vnet/devices/netmap/node.c +++ b/src/vnet/devices/netmap/node.c @@ -98,22 +98,22 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_free_bufs; struct netmap_ring *ring; int cur_ring; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); if (nif->per_interface_next_index != ~0) next_index = nif->per_interface_next_index; - n_free_bufs = vec_len (nm->rx_buffers[cpu_index]); + n_free_bufs = vec_len (nm->rx_buffers[thread_index]); if (PREDICT_FALSE (n_free_bufs < VLIB_FRAME_SIZE)) { - vec_validate (nm->rx_buffers[cpu_index], + vec_validate (nm->rx_buffers[thread_index], VLIB_FRAME_SIZE + n_free_bufs - 1); n_free_bufs += - vlib_buffer_alloc (vm, &nm->rx_buffers[cpu_index][n_free_bufs], + vlib_buffer_alloc (vm, &nm->rx_buffers[thread_index][n_free_bufs], VLIB_FRAME_SIZE); - _vec_len (nm->rx_buffers[cpu_index]) = n_free_bufs; + _vec_len (nm->rx_buffers[thread_index]) = n_free_bufs; } cur_ring = nif->first_rx_ring; @@ -163,11 +163,11 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t *b0; /* grab free buffer */ u32 last_empty_buffer = - vec_len (nm->rx_buffers[cpu_index]) - 1; + vec_len (nm->rx_buffers[thread_index]) - 1; prev_bi0 = bi0; - bi0 = nm->rx_buffers[cpu_index][last_empty_buffer]; + bi0 = nm->rx_buffers[thread_index][last_empty_buffer]; b0 = vlib_get_buffer (vm, bi0); - _vec_len (nm->rx_buffers[cpu_index]) = last_empty_buffer; + _vec_len (nm->rx_buffers[thread_index]) = last_empty_buffer; n_free_bufs--; /* copy data */ @@ -247,9 +247,9 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), nif->hw_if_index, n_rx_packets, n_rx_bytes); + vlib_get_thread_index (), nif->hw_if_index, n_rx_packets, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, n_rx_packets); + vnet_device_increment_rx_packets (thread_index, n_rx_packets); return n_rx_packets; } @@ -260,7 +260,7 @@ netmap_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, { int i; u32 n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); netmap_main_t *nm = &netmap_main; netmap_if_t *nmi; @@ -269,7 +269,7 @@ netmap_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, nmi = vec_elt_at_index (nm->interfaces, i); if (nmi->is_admin_up && (i % nm->input_cpu_count) == - (cpu_index - nm->input_cpu_first_index)) + (thread_index - nm->input_cpu_first_index)) n_rx_packets += netmap_device_input_fn (vm, node, frame, nmi); } diff --git a/src/vnet/devices/ssvm/node.c b/src/vnet/devices/ssvm/node.c index a6c9dfd7..539b4161 100644 --- a/src/vnet/devices/ssvm/node.c +++ b/src/vnet/devices/ssvm/node.c @@ -89,7 +89,7 @@ ssvm_eth_device_input (ssvm_eth_main_t * em, ethernet_header_t *eh0; u16 type0; u32 n_rx_bytes = 0, l3_offset0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 trace_cnt __attribute__ ((unused)) = vlib_get_trace_count (vm, node); volatile u32 *lock; u32 *elt_indices; @@ -284,10 +284,10 @@ out: vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters - + VNET_INTERFACE_COUNTER_RX, cpu_index, + + VNET_INTERFACE_COUNTER_RX, thread_index, intfc->vlib_hw_if_index, rx_queue_index, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, rx_queue_index); + vnet_device_increment_rx_packets (thread_index, rx_queue_index); return rx_queue_index; } diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c index 00807dc0..5e720f65 100644 --- a/src/vnet/devices/virtio/vhost-user.c +++ b/src/vnet/devices/virtio/vhost-user.c @@ -331,7 +331,7 @@ vhost_user_tx_thread_placement (vhost_user_intf_t * vui) { //Let's try to assign one queue to each thread u32 qid = 0; - u32 cpu_index = 0; + u32 thread_index = 0; vui->use_tx_spinlock = 0; while (1) { @@ -341,20 +341,21 @@ vhost_user_tx_thread_placement (vhost_user_intf_t * vui) if (!rxvq->started || !rxvq->enabled) continue; - vui->per_cpu_tx_qid[cpu_index] = qid; - cpu_index++; - if (cpu_index == vlib_get_thread_main ()->n_vlib_mains) + vui->per_cpu_tx_qid[thread_index] = qid; + thread_index++; + if (thread_index == vlib_get_thread_main ()->n_vlib_mains) return; } //We need to loop, meaning the spinlock has to be used vui->use_tx_spinlock = 1; - if (cpu_index == 0) + if (thread_index == 0) { //Could not find a single valid one - for (cpu_index = 0; - cpu_index < vlib_get_thread_main ()->n_vlib_mains; cpu_index++) + for (thread_index = 0; + thread_index < vlib_get_thread_main ()->n_vlib_mains; + thread_index++) { - vui->per_cpu_tx_qid[cpu_index] = 0; + vui->per_cpu_tx_qid[thread_index] = 0; } return; } @@ -368,7 +369,7 @@ vhost_user_rx_thread_placement () vhost_user_intf_t *vui; vhost_cpu_t *vhc; u32 *workers = 0; - u32 cpu_index; + u32 thread_index; vlib_main_t *vm; //Let's list all workers cpu indexes @@ -400,9 +401,9 @@ vhost_user_rx_thread_placement () continue; i %= vec_len (vui_workers); - cpu_index = vui_workers[i]; + thread_index = vui_workers[i]; i++; - vhc = &vum->cpus[cpu_index]; + vhc = &vum->cpus[thread_index]; iaq.qid = qid; iaq.vhost_iface_index = vui - vum->vhost_user_interfaces; @@ -429,14 +430,14 @@ vhost_user_rx_thread_placement () vhc->operation_mode = mode; } - for (cpu_index = vum->input_cpu_first_index; - cpu_index < vum->input_cpu_first_index + vum->input_cpu_count; - cpu_index++) + for (thread_index = vum->input_cpu_first_index; + thread_index < vum->input_cpu_first_index + vum->input_cpu_count; + thread_index++) { vlib_node_state_t state = VLIB_NODE_STATE_POLLING; - vhc = &vum->cpus[cpu_index]; - vm = vlib_mains ? vlib_mains[cpu_index] : &vlib_global_main; + vhc = &vum->cpus[thread_index]; + vm = vlib_mains ? vlib_mains[thread_index] : &vlib_global_main; switch (vhc->operation_mode) { case VHOST_USER_INTERRUPT_MODE: @@ -532,7 +533,7 @@ vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq) { vhost_user_main_t *vum = &vhost_user_main; vhost_cpu_t *vhc; - u32 cpu_index; + u32 thread_index; vhost_iface_and_queue_t *vhiq; vlib_main_t *vm; u32 ifq2; @@ -553,8 +554,8 @@ vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq) if ((vhiq->vhost_iface_index == (ifq >> 8)) && (VHOST_VRING_IDX_TX (vhiq->qid) == (ifq & 0xff))) { - cpu_index = vhc - vum->cpus; - vm = vlib_mains ? vlib_mains[cpu_index] : &vlib_global_main; + thread_index = vhc - vum->cpus; + vm = vlib_mains ? vlib_mains[thread_index] : &vlib_global_main; /* * Convert RX virtqueue number in the lower byte to vring * queue index for the input node process. Top bytes contain @@ -1592,7 +1593,7 @@ vhost_user_if_input (vlib_main_t * vm, u32 n_trace = vlib_get_trace_count (vm, node); u16 qsz_mask; u32 map_hint = 0; - u16 cpu_index = os_get_cpu_number (); + u16 thread_index = vlib_get_thread_index (); u16 copy_len = 0; { @@ -1651,32 +1652,32 @@ vhost_user_if_input (vlib_main_t * vm, * in the loop and come back later. This is not an issue as for big packet, * processing cost really comes from the memory copy. */ - if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len < n_left + 1)) + if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len < n_left + 1)) { - u32 curr_len = vum->cpus[cpu_index].rx_buffers_len; - vum->cpus[cpu_index].rx_buffers_len += + u32 curr_len = vum->cpus[thread_index].rx_buffers_len; + vum->cpus[thread_index].rx_buffers_len += vlib_buffer_alloc_from_free_list (vm, - vum->cpus[cpu_index].rx_buffers + + vum->cpus[thread_index].rx_buffers + curr_len, VHOST_USER_RX_BUFFERS_N - curr_len, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); if (PREDICT_FALSE - (vum->cpus[cpu_index].rx_buffers_len < + (vum->cpus[thread_index].rx_buffers_len < VHOST_USER_RX_BUFFER_STARVATION)) { /* In case of buffer starvation, discard some packets from the queue * and log the event. * We keep doing best effort for the remaining packets. */ - u32 flush = (n_left + 1 > vum->cpus[cpu_index].rx_buffers_len) ? - n_left + 1 - vum->cpus[cpu_index].rx_buffers_len : 1; + u32 flush = (n_left + 1 > vum->cpus[thread_index].rx_buffers_len) ? + n_left + 1 - vum->cpus[thread_index].rx_buffers_len : 1; flush = vhost_user_rx_discard_packet (vm, vui, txvq, flush); n_left -= flush; vlib_increment_simple_counter (vnet_main. interface_main.sw_if_counters + VNET_INTERFACE_COUNTER_DROP, - os_get_cpu_number (), + vlib_get_thread_index (), vui->sw_if_index, flush); vlib_error_count (vm, vhost_user_input_node.index, @@ -1696,7 +1697,7 @@ vhost_user_if_input (vlib_main_t * vm, u32 desc_data_offset; vring_desc_t *desc_table = txvq->desc; - if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len <= 1)) + if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len <= 1)) { /* Not enough rx_buffers * Note: We yeld on 1 so we don't need to do an additional @@ -1707,17 +1708,18 @@ vhost_user_if_input (vlib_main_t * vm, } desc_current = txvq->avail->ring[txvq->last_avail_idx & qsz_mask]; - vum->cpus[cpu_index].rx_buffers_len--; - bi_current = (vum->cpus[cpu_index].rx_buffers) - [vum->cpus[cpu_index].rx_buffers_len]; + vum->cpus[thread_index].rx_buffers_len--; + bi_current = (vum->cpus[thread_index].rx_buffers) + [vum->cpus[thread_index].rx_buffers_len]; b_head = b_current = vlib_get_buffer (vm, bi_current); to_next[0] = bi_current; //We do that now so we can forget about bi_current to_next++; n_left_to_next--; vlib_prefetch_buffer_with_index (vm, - (vum->cpus[cpu_index].rx_buffers) - [vum->cpus[cpu_index]. + (vum-> + cpus[thread_index].rx_buffers) + [vum->cpus[thread_index]. rx_buffers_len - 1], LOAD); /* Just preset the used descriptor id and length for later */ @@ -1791,7 +1793,7 @@ vhost_user_if_input (vlib_main_t * vm, (b_current->current_length == VLIB_BUFFER_DATA_SIZE)) { if (PREDICT_FALSE - (vum->cpus[cpu_index].rx_buffers_len == 0)) + (vum->cpus[thread_index].rx_buffers_len == 0)) { /* Cancel speculation */ to_next--; @@ -1805,17 +1807,18 @@ vhost_user_if_input (vlib_main_t * vm, * but valid. */ vhost_user_input_rewind_buffers (vm, - &vum->cpus[cpu_index], + &vum->cpus + [thread_index], b_head); n_left = 0; goto stop; } /* Get next output */ - vum->cpus[cpu_index].rx_buffers_len--; + vum->cpus[thread_index].rx_buffers_len--; u32 bi_next = - (vum->cpus[cpu_index].rx_buffers)[vum->cpus - [cpu_index].rx_buffers_len]; + (vum->cpus[thread_index].rx_buffers)[vum->cpus + [thread_index].rx_buffers_len]; b_current->next_buffer = bi_next; b_current->flags |= VLIB_BUFFER_NEXT_PRESENT; bi_current = bi_next; @@ -1823,7 +1826,7 @@ vhost_user_if_input (vlib_main_t * vm, } /* Prepare a copy order executed later for the data */ - vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; u32 desc_data_l = desc_table[desc_current].len - desc_data_offset; @@ -1880,7 +1883,7 @@ vhost_user_if_input (vlib_main_t * vm, if (PREDICT_FALSE (copy_len >= VHOST_USER_RX_COPY_THRESHOLD)) { if (PREDICT_FALSE - (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_input_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { clib_warning @@ -1905,7 +1908,7 @@ vhost_user_if_input (vlib_main_t * vm, /* Do the memory copies */ if (PREDICT_FALSE - (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_input_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { clib_warning ("Memory mapping error on interface hw_if_index=%d " @@ -1933,9 +1936,9 @@ vhost_user_if_input (vlib_main_t * vm, vlib_increment_combined_counter (vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), vui->sw_if_index, n_rx_packets, n_rx_bytes); + vlib_get_thread_index (), vui->sw_if_index, n_rx_packets, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, n_rx_packets); + vnet_device_increment_rx_packets (thread_index, n_rx_packets); return n_rx_packets; } @@ -1946,15 +1949,15 @@ vhost_user_input (vlib_main_t * vm, { vhost_user_main_t *vum = &vhost_user_main; uword n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); vhost_iface_and_queue_t *vhiq; vhost_user_intf_t *vui; vhost_cpu_t *vhc; - vhc = &vum->cpus[cpu_index]; + vhc = &vum->cpus[thread_index]; if (PREDICT_TRUE (vhc->operation_mode == VHOST_USER_POLLING_MODE)) { - vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues) + vec_foreach (vhiq, vum->cpus[thread_index].rx_queues) { vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; n_rx_packets += vhost_user_if_input (vm, vum, vui, vhiq->qid, node); @@ -2096,7 +2099,7 @@ vhost_user_tx (vlib_main_t * vm, vhost_user_vring_t *rxvq; u16 qsz_mask; u8 error; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 map_hint = 0; u8 retry = 8; u16 copy_len; @@ -2116,7 +2119,7 @@ vhost_user_tx (vlib_main_t * vm, qid = VHOST_VRING_IDX_RX (*vec_elt_at_index - (vui->per_cpu_tx_qid, os_get_cpu_number ())); + (vui->per_cpu_tx_qid, vlib_get_thread_index ())); rxvq = &vui->vrings[qid]; if (PREDICT_FALSE (vui->use_tx_spinlock)) vhost_user_vring_lock (vui, qid); @@ -2143,10 +2146,10 @@ retry: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - vum->cpus[cpu_index].current_trace = + vum->cpus[thread_index].current_trace = vlib_add_trace (vm, node, b0, - sizeof (*vum->cpus[cpu_index].current_trace)); - vhost_user_tx_trace (vum->cpus[cpu_index].current_trace, + sizeof (*vum->cpus[thread_index].current_trace)); + vhost_user_tx_trace (vum->cpus[thread_index].current_trace, vui, qid / 2, b0, rxvq); } @@ -2188,14 +2191,14 @@ retry: { // Get a header from the header array virtio_net_hdr_mrg_rxbuf_t *hdr = - &vum->cpus[cpu_index].tx_headers[tx_headers_len]; + &vum->cpus[thread_index].tx_headers[tx_headers_len]; tx_headers_len++; hdr->hdr.flags = 0; hdr->hdr.gso_type = 0; hdr->num_buffers = 1; //This is local, no need to check // Prepare a copy order executed later for the header - vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; cpy->len = vui->virtio_net_hdr_sz; cpy->dst = buffer_map_addr; @@ -2220,7 +2223,7 @@ retry: else if (vui->virtio_net_hdr_sz == 12) //MRG is available { virtio_net_hdr_mrg_rxbuf_t *hdr = - &vum->cpus[cpu_index].tx_headers[tx_headers_len - 1]; + &vum->cpus[thread_index].tx_headers[tx_headers_len - 1]; //Move from available to used buffer rxvq->used->ring[rxvq->last_used_idx & qsz_mask].id = @@ -2282,7 +2285,7 @@ retry: } { - vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; cpy->len = bytes_left; cpy->len = (cpy->len > buffer_len) ? buffer_len : cpy->len; @@ -2325,8 +2328,8 @@ retry: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - vum->cpus[cpu_index].current_trace->hdr = - vum->cpus[cpu_index].tx_headers[tx_headers_len - 1]; + vum->cpus[thread_index].current_trace->hdr = + vum->cpus[thread_index].tx_headers[tx_headers_len - 1]; } n_left--; //At the end for error counting when 'goto done' is invoked @@ -2336,7 +2339,7 @@ retry: done: //Do the memory copies if (PREDICT_FALSE - (vhost_user_tx_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_tx_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { clib_warning ("Memory mapping error on interface hw_if_index=%d " @@ -2386,7 +2389,7 @@ done3: vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters + VNET_INTERFACE_COUNTER_DROP, - os_get_cpu_number (), vui->sw_if_index, n_left); + vlib_get_thread_index (), vui->sw_if_index, n_left); } vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors); @@ -2773,11 +2776,11 @@ vhost_user_send_interrupt_process (vlib_main_t * vm, case ~0: vec_foreach (vhc, vum->cpus) { - u32 cpu_index = vhc - vum->cpus; + u32 thread_index = vhc - vum->cpus; f64 next_timeout; next_timeout = timeout; - vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues) + vec_foreach (vhiq, vum->cpus[thread_index].rx_queues) { vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; vhost_user_vring_t *rxvq = diff --git a/src/vnet/dpo/lookup_dpo.c b/src/vnet/dpo/lookup_dpo.c index e94e871c..97ad0a44 100644 --- a/src/vnet/dpo/lookup_dpo.c +++ b/src/vnet/dpo/lookup_dpo.c @@ -266,7 +266,7 @@ lookup_dpo_ip4_inline (vlib_main_t * vm, int table_from_interface) { u32 n_left_from, next_index, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; from = vlib_frame_vector_args (from_frame); @@ -407,10 +407,10 @@ lookup_dpo_ip4_inline (vlib_main_t * vm, vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b1)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -511,7 +511,7 @@ lookup_dpo_ip4_inline (vlib_main_t * vm, vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -606,7 +606,7 @@ lookup_dpo_ip6_inline (vlib_main_t * vm, { vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; u32 n_left_from, next_index, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -749,10 +749,10 @@ lookup_dpo_ip6_inline (vlib_main_t * vm, vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b1)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -853,7 +853,7 @@ lookup_dpo_ip6_inline (vlib_main_t * vm, vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -930,7 +930,7 @@ lookup_dpo_mpls_inline (vlib_main_t * vm, int table_from_interface) { u32 n_left_from, next_index, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; from = vlib_frame_vector_args (from_frame); @@ -994,7 +994,7 @@ lookup_dpo_mpls_inline (vlib_main_t * vm, vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/vnet/dpo/replicate_dpo.c b/src/vnet/dpo/replicate_dpo.c index a9f334be..e25ceae9 100644 --- a/src/vnet/dpo/replicate_dpo.c +++ b/src/vnet/dpo/replicate_dpo.c @@ -627,7 +627,7 @@ replicate_inline (vlib_main_t * vm, vlib_combined_counter_main_t * cm = &replicate_main.repm_counters; replicate_main_t * rm = &replicate_main; u32 n_left_from, * from, * to_next, next_index; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -657,12 +657,12 @@ replicate_inline (vlib_main_t * vm, rep0 = replicate_get(repi0); vlib_increment_combined_counter( - cm, cpu_index, repi0, 1, + cm, thread_index, repi0, 1, vlib_buffer_length_in_chain(vm, b0)); - vec_validate (rm->clones[cpu_index], rep0->rep_n_buckets - 1); + vec_validate (rm->clones[thread_index], rep0->rep_n_buckets - 1); - num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[cpu_index], rep0->rep_n_buckets, 128); + num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[thread_index], rep0->rep_n_buckets, 128); if (num_cloned != rep0->rep_n_buckets) { @@ -673,7 +673,7 @@ replicate_inline (vlib_main_t * vm, for (bucket = 0; bucket < num_cloned; bucket++) { - ci0 = rm->clones[cpu_index][bucket]; + ci0 = rm->clones[thread_index][bucket]; c0 = vlib_get_buffer(vm, ci0); to_next[0] = ci0; @@ -700,7 +700,7 @@ replicate_inline (vlib_main_t * vm, vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); } } - vec_reset_length (rm->clones[cpu_index]); + vec_reset_length (rm->clones[thread_index]); } vlib_put_next_frame (vm, node, next_index, n_left_to_next); diff --git a/src/vnet/ethernet/arp.c b/src/vnet/ethernet/arp.c index ee757505..c74a097e 100644 --- a/src/vnet/ethernet/arp.c +++ b/src/vnet/ethernet/arp.c @@ -1771,7 +1771,7 @@ set_ip4_over_ethernet_rpc_callback (vnet_arp_set_ip4_over_ethernet_rpc_args_t * a) { vnet_main_t *vm = vnet_get_main (); - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (a->flags & ETHERNET_ARP_ARGS_REMOVE) vnet_arp_unset_ip4_over_ethernet_internal (vm, a); diff --git a/src/vnet/ethernet/interface.c b/src/vnet/ethernet/interface.c index 9894e3c8..335e3f9f 100644 --- a/src/vnet/ethernet/interface.c +++ b/src/vnet/ethernet/interface.c @@ -362,7 +362,7 @@ simulated_ethernet_interface_tx (vlib_main_t * vm, u32 next_index = VNET_SIMULATED_ETHERNET_TX_NEXT_ETHERNET_INPUT; u32 i, next_node_index, bvi_flag, sw_if_index; u32 n_pkts = 0, n_bytes = 0; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; vnet_main_t *vnm = vnet_get_main (); vnet_interface_main_t *im = &vnm->interface_main; vlib_node_main_t *nm = &vm->node_main; @@ -420,8 +420,9 @@ simulated_ethernet_interface_tx (vlib_main_t * vm, /* increment TX interface stat */ vlib_increment_combined_counter (im->combined_sw_if_counters + - VNET_INTERFACE_COUNTER_TX, cpu_index, - sw_if_index, n_pkts, n_bytes); + VNET_INTERFACE_COUNTER_TX, + thread_index, sw_if_index, n_pkts, + n_bytes); } return n_left_from; diff --git a/src/vnet/ethernet/node.c b/src/vnet/ethernet/node.c index b699e381..f7787ed2 100755 --- a/src/vnet/ethernet/node.c +++ b/src/vnet/ethernet/node.c @@ -291,7 +291,7 @@ ethernet_input_inline (vlib_main_t * vm, vlib_node_runtime_t *error_node; u32 n_left_from, next_index, *from, *to_next; u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 cached_sw_if_index = ~0; u32 cached_is_l2 = 0; /* shut up gcc */ vnet_hw_interface_t *hi = NULL; /* used for main interface only */ @@ -510,7 +510,7 @@ ethernet_input_inline (vlib_main_t * vm, interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, new_sw_if_index0, 1, len0); if (new_sw_if_index1 != old_sw_if_index1 @@ -519,7 +519,7 @@ ethernet_input_inline (vlib_main_t * vm, interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, new_sw_if_index1, 1, len1); @@ -530,7 +530,7 @@ ethernet_input_inline (vlib_main_t * vm, vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = stats_n_bytes = 0; @@ -696,13 +696,13 @@ ethernet_input_inline (vlib_main_t * vm, vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, new_sw_if_index0, 1, len0); + thread_index, new_sw_if_index0, 1, len0); if (stats_n_packets > 0) { vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = stats_n_bytes = 0; } @@ -734,7 +734,7 @@ ethernet_input_inline (vlib_main_t * vm, vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/gre/node.c b/src/vnet/gre/node.c index 2683586e..acf15f24 100644 --- a/src/vnet/gre/node.c +++ b/src/vnet/gre/node.c @@ -75,7 +75,7 @@ gre_input (vlib_main_t * vm, u64 cached_tunnel_key6[4]; u32 cached_tunnel_sw_if_index = 0, tunnel_sw_if_index = 0; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 len; vnet_interface_main_t *im = &gm->vnet_main->interface_main; @@ -257,7 +257,7 @@ gre_input (vlib_main_t * vm, len = vlib_buffer_length_in_chain (vm, b0); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, tunnel_sw_if_index, 1 /* packets */, len /* bytes */); @@ -324,7 +324,7 @@ drop0: len = vlib_buffer_length_in_chain (vm, b1); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, tunnel_sw_if_index, 1 /* packets */, len /* bytes */); @@ -502,7 +502,7 @@ drop1: len = vlib_buffer_length_in_chain (vm, b0); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, tunnel_sw_if_index, 1 /* packets */, len /* bytes */); diff --git a/src/vnet/interface.h b/src/vnet/interface.h index a1ea2d61..08f08b10 100644 --- a/src/vnet/interface.h +++ b/src/vnet/interface.h @@ -468,7 +468,7 @@ typedef struct vnet_hw_interface_t u32 input_node_index; /* input node cpu index by queue */ - u32 *input_node_cpu_index_by_queue; + u32 *input_node_thread_index_by_queue; } vnet_hw_interface_t; diff --git a/src/vnet/interface_output.c b/src/vnet/interface_output.c index 03f2cdca..663dc309 100644 --- a/src/vnet/interface_output.c +++ b/src/vnet/interface_output.c @@ -196,7 +196,7 @@ slow_path (vlib_main_t * vm, */ static_always_inline void incr_output_stats (vnet_main_t * vnm, - u32 cpu_index, + u32 thread_index, u32 length, u32 sw_if_index, u32 * last_sw_if_index, u32 * n_packets, u32 * n_bytes) @@ -216,7 +216,7 @@ incr_output_stats (vnet_main_t * vnm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, *last_sw_if_index, *n_packets, *n_bytes); } @@ -240,7 +240,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, u32 n_left_to_tx, *from, *from_end, *to_tx; u32 n_bytes, n_buffers, n_packets; u32 last_sw_if_index; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; n_buffers = frame->n_vectors; @@ -266,7 +266,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_TX_ERROR); - vlib_increment_simple_counter (cm, cpu_index, + vlib_increment_simple_counter (cm, thread_index, rt->sw_if_index, n_buffers); return vlib_error_drop_buffers (vm, node, from, /* buffer stride */ 1, @@ -341,18 +341,18 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, from += 1; to_tx += n_buffers; n_left_to_tx -= n_buffers; - incr_output_stats (vnm, cpu_index, n_slow_bytes, + incr_output_stats (vnm, thread_index, n_slow_bytes, vnet_buffer (b)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); } } else { - incr_output_stats (vnm, cpu_index, + incr_output_stats (vnm, thread_index, vlib_buffer_length_in_chain (vm, b0), vnet_buffer (b0)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); - incr_output_stats (vnm, cpu_index, + incr_output_stats (vnm, thread_index, vlib_buffer_length_in_chain (vm, b0), vnet_buffer (b1)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); @@ -396,7 +396,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, to_tx += n_buffers; n_left_to_tx -= n_buffers; } - incr_output_stats (vnm, cpu_index, + incr_output_stats (vnm, thread_index, vlib_buffer_length_in_chain (vm, b0), vnet_buffer (b0)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); @@ -408,7 +408,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, } /* Final update of interface stats. */ - incr_output_stats (vnm, cpu_index, 0, ~0, /* ~0 will flush stats */ + incr_output_stats (vnm, thread_index, 0, ~0, /* ~0 will flush stats */ &last_sw_if_index, &n_packets, &n_bytes); return n_buffers; @@ -428,7 +428,7 @@ vnet_interface_output_node (vlib_main_t * vm, u32 n_left_to_tx, *from, *from_end, *to_tx; u32 n_bytes, n_buffers, n_packets; u32 n_bytes_b0, n_bytes_b1, n_bytes_b2, n_bytes_b3; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; vnet_interface_main_t *im = &vnm->interface_main; u32 next_index = VNET_INTERFACE_OUTPUT_NEXT_TX; u32 current_config_index = ~0; @@ -458,7 +458,7 @@ vnet_interface_output_node (vlib_main_t * vm, cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_TX_ERROR); - vlib_increment_simple_counter (cm, cpu_index, + vlib_increment_simple_counter (cm, thread_index, rt->sw_if_index, n_buffers); return vlib_error_drop_buffers (vm, node, from, @@ -558,7 +558,7 @@ vnet_interface_output_node (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif0, 1, + thread_index, tx_swif0, 1, n_bytes_b0); } @@ -567,7 +567,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif1, 1, + thread_index, tx_swif1, 1, n_bytes_b1); } @@ -576,7 +576,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif2, 1, + thread_index, tx_swif2, 1, n_bytes_b2); } if (PREDICT_FALSE (tx_swif3 != rt->sw_if_index)) @@ -584,7 +584,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif3, 1, + thread_index, tx_swif3, 1, n_bytes_b3); } } @@ -623,7 +623,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif0, 1, + thread_index, tx_swif0, 1, n_bytes_b0); } } @@ -634,7 +634,7 @@ vnet_interface_output_node (vlib_main_t * vm, /* Update main interface stats. */ vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, rt->sw_if_index, n_packets, n_bytes); return n_buffers; } @@ -893,7 +893,7 @@ process_drop_punt (vlib_main_t * vm, u32 current_sw_if_index, n_errors_current_sw_if_index; u64 current_counter; vlib_simple_counter_main_t *cm; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; static vlib_error_t memory[VNET_ERROR_N_DISPOSITION]; static char memory_init[VNET_ERROR_N_DISPOSITION]; @@ -965,19 +965,19 @@ process_drop_punt (vlib_main_t * vm, current_counter -= 2; n_errors_current_sw_if_index -= 2; - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); /* Increment super-interface drop/punt counters for sub-interfaces. */ sw_if0 = vnet_get_sw_interface (vnm, sw_if_index0); vlib_increment_simple_counter - (cm, cpu_index, sw_if0->sup_sw_if_index, + (cm, thread_index, sw_if0->sup_sw_if_index, sw_if0->sup_sw_if_index != sw_if_index0); sw_if1 = vnet_get_sw_interface (vnm, sw_if_index1); vlib_increment_simple_counter - (cm, cpu_index, sw_if1->sup_sw_if_index, + (cm, thread_index, sw_if1->sup_sw_if_index, sw_if1->sup_sw_if_index != sw_if_index1); em->counters[current_counter_index] = current_counter; @@ -1013,11 +1013,12 @@ process_drop_punt (vlib_main_t * vm, sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; /* Increment drop/punt counters. */ - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); /* Increment super-interface drop/punt counters for sub-interfaces. */ sw_if0 = vnet_get_sw_interface (vnm, sw_if_index0); - vlib_increment_simple_counter (cm, cpu_index, sw_if0->sup_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, + sw_if0->sup_sw_if_index, sw_if0->sup_sw_if_index != sw_if_index0); if (PREDICT_FALSE (e0 != current_error)) @@ -1041,12 +1042,12 @@ process_drop_punt (vlib_main_t * vm, { vnet_sw_interface_t *si; - vlib_increment_simple_counter (cm, cpu_index, current_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, current_sw_if_index, n_errors_current_sw_if_index); si = vnet_get_sw_interface (vnm, current_sw_if_index); if (si->sup_sw_if_index != current_sw_if_index) - vlib_increment_simple_counter (cm, cpu_index, si->sup_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, si->sup_sw_if_index, n_errors_current_sw_if_index); } diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index ee1703e7..fdfe7f63 100644 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -75,7 +75,7 @@ ip4_lookup_inline (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_to_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -292,19 +292,19 @@ ip4_lookup_inline (vlib_main_t * vm, vnet_buffer (p3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lb_index0, 1, + (cm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, p0) + sizeof (ethernet_header_t)); vlib_increment_combined_counter - (cm, cpu_index, lb_index1, 1, + (cm, thread_index, lb_index1, 1, vlib_buffer_length_in_chain (vm, p1) + sizeof (ethernet_header_t)); vlib_increment_combined_counter - (cm, cpu_index, lb_index2, 1, + (cm, thread_index, lb_index2, 1, vlib_buffer_length_in_chain (vm, p2) + sizeof (ethernet_header_t)); vlib_increment_combined_counter - (cm, cpu_index, lb_index3, 1, + (cm, thread_index, lb_index3, 1, vlib_buffer_length_in_chain (vm, p3) + sizeof (ethernet_header_t)); @@ -392,7 +392,7 @@ ip4_lookup_inline (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); from += 1; to_next += 1; @@ -479,7 +479,7 @@ ip4_load_balance (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -584,9 +584,9 @@ ip4_load_balance (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); vlib_validate_buffer_enqueue_x2 (vm, node, next, to_next, n_left_to_next, @@ -639,7 +639,7 @@ ip4_load_balance (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_validate_buffer_enqueue_x1 (vm, node, next, to_next, n_left_to_next, @@ -2330,7 +2330,7 @@ ip4_rewrite_inline (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -2379,9 +2379,9 @@ ip4_rewrite_inline (vlib_main_t * vm, if (do_counters) { vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index0); + thread_index, adj_index0); vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index1); + thread_index, adj_index1); } ip0 = vlib_buffer_get_current (p0); @@ -2527,13 +2527,13 @@ ip4_rewrite_inline (vlib_main_t * vm, { vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index1, 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); } @@ -2618,7 +2618,7 @@ ip4_rewrite_inline (vlib_main_t * vm, if (do_counters) vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index0); + thread_index, adj_index0); /* Guess we are only writing on simple Ethernet header. */ vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t)); @@ -2637,7 +2637,7 @@ ip4_rewrite_inline (vlib_main_t * vm, if (do_counters) vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, 1, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); /* Check MTU of outgoing interface. */ diff --git a/src/vnet/ip/ip4_input.c b/src/vnet/ip/ip4_input.c index ba200a9f..3b08f4b0 100644 --- a/src/vnet/ip/ip4_input.c +++ b/src/vnet/ip/ip4_input.c @@ -85,7 +85,7 @@ ip4_input_inline (vlib_main_t * vm, vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip4_input_node.index); vlib_simple_counter_main_t *cm; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -178,8 +178,8 @@ ip4_input_inline (vlib_main_t * vm, vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); vnet_feature_arc_start (arc1, sw_if_index1, &next1, p1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); /* Punt packets with options or wrong version. */ if (PREDICT_FALSE (ip0->ip_version_and_header_length != 0x45)) @@ -299,7 +299,7 @@ ip4_input_inline (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0; vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); /* Punt packets with options or wrong version. */ if (PREDICT_FALSE (ip0->ip_version_and_header_length != 0x45)) diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c index c120f12c..c2fc4f87 100644 --- a/src/vnet/ip/ip6_forward.c +++ b/src/vnet/ip/ip6_forward.c @@ -74,7 +74,7 @@ ip6_lookup_inline (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_to_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -185,9 +185,9 @@ ip6_lookup_inline (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); from += 2; to_next += 2; @@ -291,7 +291,7 @@ ip6_lookup_inline (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); from += 1; to_next += 1; @@ -703,7 +703,7 @@ ip6_load_balance (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ip6_main_t *im = &ip6_main; from = vlib_frame_vector_args (frame); @@ -824,9 +824,9 @@ ip6_load_balance (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); vlib_validate_buffer_enqueue_x2 (vm, node, next, to_next, n_left_to_next, @@ -886,7 +886,7 @@ ip6_load_balance (vlib_main_t * vm, } vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_validate_buffer_enqueue_x1 (vm, node, next, to_next, n_left_to_next, @@ -1897,7 +1897,7 @@ ip6_rewrite_inline (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -2019,11 +2019,11 @@ ip6_rewrite_inline (vlib_main_t * vm, { vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, 1, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index1, 1, + thread_index, adj_index1, 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); } @@ -2156,7 +2156,7 @@ ip6_rewrite_inline (vlib_main_t * vm, { vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, 1, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); } diff --git a/src/vnet/ip/ip6_input.c b/src/vnet/ip/ip6_input.c index 20306088..ffdc4727 100644 --- a/src/vnet/ip/ip6_input.c +++ b/src/vnet/ip/ip6_input.c @@ -82,7 +82,7 @@ ip6_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip6_input_node.index); vlib_simple_counter_main_t *cm; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -171,8 +171,8 @@ ip6_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); vnet_feature_arc_start (arc1, sw_if_index1, &next1, p1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); error0 = error1 = IP6_ERROR_NONE; @@ -270,7 +270,7 @@ ip6_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0; vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); error0 = IP6_ERROR_NONE; /* Version != 6? Drop it. */ diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c index 5d1fb6f8..2af546df 100644 --- a/src/vnet/ip/ip6_neighbor.c +++ b/src/vnet/ip/ip6_neighbor.c @@ -581,7 +581,7 @@ vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm, u32 next_index; pending_resolution_t *pr, *mc; - if (os_get_cpu_number ()) + if (vlib_get_thread_index ()) { set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address, 1 /* set new neighbor */ , is_static, @@ -722,7 +722,7 @@ vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm, uword *p; int rv = 0; - if (os_get_cpu_number ()) + if (vlib_get_thread_index ()) { set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address, 0 /* unset */ , 0, 0); diff --git a/src/vnet/ipsec/esp.h b/src/vnet/ipsec/esp.h index 50cac806..799003b9 100644 --- a/src/vnet/ipsec/esp.h +++ b/src/vnet/ipsec/esp.h @@ -282,8 +282,8 @@ hmac_calc (ipsec_integ_alg_t alg, u8 * data, int data_len, u8 * signature, u8 use_esn, u32 seq_hi) { esp_main_t *em = &esp_main; - u32 cpu_index = os_get_cpu_number (); - HMAC_CTX *ctx = &(em->per_thread_data[cpu_index].hmac_ctx); + u32 thread_index = vlib_get_thread_index (); + HMAC_CTX *ctx = &(em->per_thread_data[thread_index].hmac_ctx); const EVP_MD *md = NULL; unsigned int len; @@ -292,10 +292,10 @@ hmac_calc (ipsec_integ_alg_t alg, if (PREDICT_FALSE (em->esp_integ_algs[alg].md == 0)) return 0; - if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_integ_alg)) + if (PREDICT_FALSE (alg != em->per_thread_data[thread_index].last_integ_alg)) { md = em->esp_integ_algs[alg].md; - em->per_thread_data[cpu_index].last_integ_alg = alg; + em->per_thread_data[thread_index].last_integ_alg = alg; } HMAC_Init (ctx, key, key_len, md); diff --git a/src/vnet/ipsec/esp_decrypt.c b/src/vnet/ipsec/esp_decrypt.c index 7289b260..925d2b45 100644 --- a/src/vnet/ipsec/esp_decrypt.c +++ b/src/vnet/ipsec/esp_decrypt.c @@ -85,8 +85,8 @@ esp_decrypt_aes_cbc (ipsec_crypto_alg_t alg, u8 * in, u8 * out, size_t in_len, u8 * key, u8 * iv) { esp_main_t *em = &esp_main; - u32 cpu_index = os_get_cpu_number (); - EVP_CIPHER_CTX *ctx = &(em->per_thread_data[cpu_index].decrypt_ctx); + u32 thread_index = vlib_get_thread_index (); + EVP_CIPHER_CTX *ctx = &(em->per_thread_data[thread_index].decrypt_ctx); const EVP_CIPHER *cipher = NULL; int out_len; @@ -95,10 +95,11 @@ esp_decrypt_aes_cbc (ipsec_crypto_alg_t alg, if (PREDICT_FALSE (em->esp_crypto_algs[alg].type == 0)) return; - if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_decrypt_alg)) + if (PREDICT_FALSE + (alg != em->per_thread_data[thread_index].last_decrypt_alg)) { cipher = em->esp_crypto_algs[alg].type; - em->per_thread_data[cpu_index].last_decrypt_alg = alg; + em->per_thread_data[thread_index].last_decrypt_alg = alg; } EVP_DecryptInit_ex (ctx, cipher, NULL, key, iv); @@ -117,11 +118,11 @@ esp_decrypt_node_fn (vlib_main_t * vm, u32 *recycle = 0; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ipsec_alloc_empty_buffers (vm, im); - u32 *empty_buffers = im->empty_buffers[cpu_index]; + u32 *empty_buffers = im->empty_buffers[thread_index]; if (PREDICT_FALSE (vec_len (empty_buffers) < n_left_from)) { diff --git a/src/vnet/ipsec/esp_encrypt.c b/src/vnet/ipsec/esp_encrypt.c index 44ae2297..b2bc4e0b 100644 --- a/src/vnet/ipsec/esp_encrypt.c +++ b/src/vnet/ipsec/esp_encrypt.c @@ -88,8 +88,8 @@ esp_encrypt_aes_cbc (ipsec_crypto_alg_t alg, u8 * in, u8 * out, size_t in_len, u8 * key, u8 * iv) { esp_main_t *em = &esp_main; - u32 cpu_index = os_get_cpu_number (); - EVP_CIPHER_CTX *ctx = &(em->per_thread_data[cpu_index].encrypt_ctx); + u32 thread_index = vlib_get_thread_index (); + EVP_CIPHER_CTX *ctx = &(em->per_thread_data[thread_index].encrypt_ctx); const EVP_CIPHER *cipher = NULL; int out_len; @@ -98,10 +98,11 @@ esp_encrypt_aes_cbc (ipsec_crypto_alg_t alg, if (PREDICT_FALSE (em->esp_crypto_algs[alg].type == IPSEC_CRYPTO_ALG_NONE)) return; - if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_encrypt_alg)) + if (PREDICT_FALSE + (alg != em->per_thread_data[thread_index].last_encrypt_alg)) { cipher = em->esp_crypto_algs[alg].type; - em->per_thread_data[cpu_index].last_encrypt_alg = alg; + em->per_thread_data[thread_index].last_encrypt_alg = alg; } EVP_EncryptInit_ex (ctx, cipher, NULL, key, iv); @@ -119,11 +120,11 @@ esp_encrypt_node_fn (vlib_main_t * vm, n_left_from = from_frame->n_vectors; ipsec_main_t *im = &ipsec_main; u32 *recycle = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ipsec_alloc_empty_buffers (vm, im); - u32 *empty_buffers = im->empty_buffers[cpu_index]; + u32 *empty_buffers = im->empty_buffers[thread_index]; if (PREDICT_FALSE (vec_len (empty_buffers) < n_left_from)) { diff --git a/src/vnet/ipsec/ikev2.c b/src/vnet/ipsec/ikev2.c index 2c1074d8..3f9978a7 100644 --- a/src/vnet/ipsec/ikev2.c +++ b/src/vnet/ipsec/ikev2.c @@ -303,16 +303,16 @@ static void ikev2_delete_sa (ikev2_sa_t * sa) { ikev2_main_t *km = &ikev2_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); uword *p; ikev2_sa_free_all_vec (sa); - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, sa->rspi); + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, sa->rspi); if (p) { - hash_unset (km->per_thread_data[cpu_index].sa_by_rspi, sa->rspi); - pool_put (km->per_thread_data[cpu_index].sas, sa); + hash_unset (km->per_thread_data[thread_index].sa_by_rspi, sa->rspi); + pool_put (km->per_thread_data[thread_index].sas, sa); } } @@ -776,29 +776,31 @@ ikev2_initial_contact_cleanup (ikev2_sa_t * sa) ikev2_sa_t *tmp; u32 i, *delete = 0; ikev2_child_sa_t *c; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); if (!sa->initial_contact) return; /* find old IKE SAs with the same authenticated identity */ /* *INDENT-OFF* */ - pool_foreach (tmp, km->per_thread_data[cpu_index].sas, ({ + pool_foreach (tmp, km->per_thread_data[thread_index].sas, ({ if (tmp->i_id.type != sa->i_id.type || vec_len(tmp->i_id.data) != vec_len(sa->i_id.data) || memcmp(sa->i_id.data, tmp->i_id.data, vec_len(sa->i_id.data))) continue; if (sa->rspi != tmp->rspi) - vec_add1(delete, tmp - km->per_thread_data[cpu_index].sas); + vec_add1(delete, tmp - km->per_thread_data[thread_index].sas); })); /* *INDENT-ON* */ for (i = 0; i < vec_len (delete); i++) { - tmp = pool_elt_at_index (km->per_thread_data[cpu_index].sas, delete[i]); - vec_foreach (c, tmp->childs) - ikev2_delete_tunnel_interface (km->vnet_main, tmp, c); + tmp = + pool_elt_at_index (km->per_thread_data[thread_index].sas, delete[i]); + vec_foreach (c, + tmp->childs) ikev2_delete_tunnel_interface (km->vnet_main, + tmp, c); ikev2_delete_sa (tmp); } @@ -1922,10 +1924,10 @@ ikev2_retransmit_sa_init (ike_header_t * ike, { ikev2_main_t *km = &ikev2_main; ikev2_sa_t *sa; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); /* *INDENT-OFF* */ - pool_foreach (sa, km->per_thread_data[cpu_index].sas, ({ + pool_foreach (sa, km->per_thread_data[thread_index].sas, ({ if (sa->ispi == clib_net_to_host_u64(ike->ispi) && sa->iaddr.as_u32 == iaddr.as_u32 && sa->raddr.as_u32 == raddr.as_u32) @@ -2036,7 +2038,7 @@ ikev2_node_fn (vlib_main_t * vm, u32 n_left_from, *from, *to_next; ikev2_next_t next_index; ikev2_main_t *km = &ikev2_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -2134,11 +2136,14 @@ ikev2_node_fn (vlib_main_t * vm, if (sa0->state == IKEV2_STATE_SA_INIT) { /* add SA to the pool */ - pool_get (km->per_thread_data[cpu_index].sas, sa0); + pool_get (km->per_thread_data[thread_index].sas, + sa0); clib_memcpy (sa0, &sa, sizeof (*sa0)); - hash_set (km->per_thread_data[cpu_index].sa_by_rspi, + hash_set (km-> + per_thread_data[thread_index].sa_by_rspi, sa0->rspi, - sa0 - km->per_thread_data[cpu_index].sas); + sa0 - + km->per_thread_data[thread_index].sas); } else { @@ -2169,11 +2174,11 @@ ikev2_node_fn (vlib_main_t * vm, if (sa0->state == IKEV2_STATE_SA_INIT) { /* add SA to the pool */ - pool_get (km->per_thread_data[cpu_index].sas, sa0); + pool_get (km->per_thread_data[thread_index].sas, sa0); clib_memcpy (sa0, &sa, sizeof (*sa0)); - hash_set (km->per_thread_data[cpu_index].sa_by_rspi, + hash_set (km->per_thread_data[thread_index].sa_by_rspi, sa0->rspi, - sa0 - km->per_thread_data[cpu_index].sas); + sa0 - km->per_thread_data[thread_index].sas); } else { @@ -2184,12 +2189,13 @@ ikev2_node_fn (vlib_main_t * vm, else if (ike0->exchange == IKEV2_EXCHANGE_IKE_AUTH) { uword *p; - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, clib_net_to_host_u64 (ike0->rspi)); if (p) { - sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas, - p[0]); + sa0 = + pool_elt_at_index (km->per_thread_data[thread_index].sas, + p[0]); r = ikev2_retransmit_resp (sa0, ike0); if (r == 1) @@ -2240,12 +2246,13 @@ ikev2_node_fn (vlib_main_t * vm, else if (ike0->exchange == IKEV2_EXCHANGE_INFORMATIONAL) { uword *p; - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, clib_net_to_host_u64 (ike0->rspi)); if (p) { - sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas, - p[0]); + sa0 = + pool_elt_at_index (km->per_thread_data[thread_index].sas, + p[0]); r = ikev2_retransmit_resp (sa0, ike0); if (r == 1) @@ -2305,12 +2312,13 @@ ikev2_node_fn (vlib_main_t * vm, else if (ike0->exchange == IKEV2_EXCHANGE_CREATE_CHILD_SA) { uword *p; - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, clib_net_to_host_u64 (ike0->rspi)); if (p) { - sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas, - p[0]); + sa0 = + pool_elt_at_index (km->per_thread_data[thread_index].sas, + p[0]); r = ikev2_retransmit_resp (sa0, ike0); if (r == 1) diff --git a/src/vnet/ipsec/ipsec.h b/src/vnet/ipsec/ipsec.h index 58f0f145..c884e360 100644 --- a/src/vnet/ipsec/ipsec.h +++ b/src/vnet/ipsec/ipsec.h @@ -324,21 +324,21 @@ int ipsec_set_interface_key (vnet_main_t * vnm, u32 hw_if_index, always_inline void ipsec_alloc_empty_buffers (vlib_main_t * vm, ipsec_main_t * im) { - u32 cpu_index = os_get_cpu_number (); - uword l = vec_len (im->empty_buffers[cpu_index]); + u32 thread_index = vlib_get_thread_index (); + uword l = vec_len (im->empty_buffers[thread_index]); uword n_alloc = 0; if (PREDICT_FALSE (l < VLIB_FRAME_SIZE)) { - if (!im->empty_buffers[cpu_index]) + if (!im->empty_buffers[thread_index]) { - vec_alloc (im->empty_buffers[cpu_index], 2 * VLIB_FRAME_SIZE); + vec_alloc (im->empty_buffers[thread_index], 2 * VLIB_FRAME_SIZE); } - n_alloc = vlib_buffer_alloc (vm, im->empty_buffers[cpu_index] + l, + n_alloc = vlib_buffer_alloc (vm, im->empty_buffers[thread_index] + l, 2 * VLIB_FRAME_SIZE - l); - _vec_len (im->empty_buffers[cpu_index]) = l + n_alloc; + _vec_len (im->empty_buffers[thread_index]) = l + n_alloc; } } diff --git a/src/vnet/ipsec/ipsec_if.c b/src/vnet/ipsec/ipsec_if.c index dc882004..ed124894 100644 --- a/src/vnet/ipsec/ipsec_if.c +++ b/src/vnet/ipsec/ipsec_if.c @@ -99,7 +99,7 @@ static int ipsec_add_del_tunnel_if_rpc_callback (ipsec_add_del_tunnel_args_t * a) { vnet_main_t *vnm = vnet_get_main (); - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); return ipsec_add_del_tunnel_if_internal (vnm, a); } diff --git a/src/vnet/l2/l2_bvi.h b/src/vnet/l2/l2_bvi.h index dd1130a6..e21a1616 100644 --- a/src/vnet/l2/l2_bvi.h +++ b/src/vnet/l2/l2_bvi.h @@ -97,7 +97,7 @@ l2_to_bvi (vlib_main_t * vlib_main, vlib_increment_combined_counter (vnet_main->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - vlib_main->cpu_index, + vlib_main->thread_index, vnet_buffer (b0)->sw_if_index[VLIB_RX], 1, vlib_buffer_length_in_chain (vlib_main, b0)); return TO_BVI_ERR_OK; diff --git a/src/vnet/l2/l2_input.c b/src/vnet/l2/l2_input.c index 041ff38d..e5d6878a 100644 --- a/src/vnet/l2/l2_input.c +++ b/src/vnet/l2/l2_input.c @@ -117,7 +117,7 @@ typedef enum static_always_inline void classify_and_dispatch (vlib_main_t * vm, vlib_node_runtime_t * node, - u32 cpu_index, + u32 thread_index, l2input_main_t * msm, vlib_buffer_t * b0, u32 * next0) { /* @@ -237,7 +237,7 @@ l2input_node_inline (vlib_main_t * vm, u32 n_left_from, *from, *to_next; l2input_next_t next_index; l2input_main_t *msm = &l2input_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; /* number of packets to process */ @@ -350,10 +350,10 @@ l2input_node_inline (vlib_main_t * vm, vlib_node_increment_counter (vm, l2input_node.index, L2INPUT_ERROR_L2INPUT, 4); - classify_and_dispatch (vm, node, cpu_index, msm, b0, &next0); - classify_and_dispatch (vm, node, cpu_index, msm, b1, &next1); - classify_and_dispatch (vm, node, cpu_index, msm, b2, &next2); - classify_and_dispatch (vm, node, cpu_index, msm, b3, &next3); + classify_and_dispatch (vm, node, thread_index, msm, b0, &next0); + classify_and_dispatch (vm, node, thread_index, msm, b1, &next1); + classify_and_dispatch (vm, node, thread_index, msm, b2, &next2); + classify_and_dispatch (vm, node, thread_index, msm, b3, &next3); /* verify speculative enqueues, maybe switch current next frame */ /* if next0==next1==next_index then nothing special needs to be done */ @@ -393,7 +393,7 @@ l2input_node_inline (vlib_main_t * vm, vlib_node_increment_counter (vm, l2input_node.index, L2INPUT_ERROR_L2INPUT, 1); - classify_and_dispatch (vm, node, cpu_index, msm, b0, &next0); + classify_and_dispatch (vm, node, thread_index, msm, b0, &next0); /* verify speculative enqueue, maybe switch current next frame */ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, diff --git a/src/vnet/l2/l2_output.c b/src/vnet/l2/l2_output.c index 00f22571..e17b2a16 100644 --- a/src/vnet/l2/l2_output.c +++ b/src/vnet/l2/l2_output.c @@ -643,11 +643,11 @@ l2output_create_output_node_mapping (vlib_main_t * vlib_main, vnet_main_t * vnet hw0 = vnet_get_sup_hw_interface (vnet_main, sw_if_index); - uword cpu_number; + uword thread_index; - cpu_number = os_get_cpu_number (); + thread_index = vlib_get_thread_index (); - if (cpu_number) + if (thread_index) { u32 oldflags; diff --git a/src/vnet/l2tp/decap.c b/src/vnet/l2tp/decap.c index e8986935..46104129 100644 --- a/src/vnet/l2tp/decap.c +++ b/src/vnet/l2tp/decap.c @@ -149,7 +149,7 @@ last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, u32 bi) /* per-mapping byte stats include the ethernet header */ vlib_increment_combined_counter (&lm->counter_main, - os_get_cpu_number (), + vlib_get_thread_index (), counter_index, 1 /* packet_increment */ , vlib_buffer_length_in_chain (vm, b) + sizeof (ethernet_header_t)); diff --git a/src/vnet/l2tp/encap.c b/src/vnet/l2tp/encap.c index ed7a9580..dcdfde4b 100644 --- a/src/vnet/l2tp/encap.c +++ b/src/vnet/l2tp/encap.c @@ -124,7 +124,7 @@ last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, u32 bi) /* per-mapping byte stats include the ethernet header */ vlib_increment_combined_counter (&lm->counter_main, - os_get_cpu_number (), + vlib_get_thread_index (), counter_index, 1 /* packet_increment */ , vlib_buffer_length_in_chain (vm, b)); diff --git a/src/vnet/l2tp/l2tp.c b/src/vnet/l2tp/l2tp.c index cb94d7e7..3dedc447 100644 --- a/src/vnet/l2tp/l2tp.c +++ b/src/vnet/l2tp/l2tp.c @@ -157,7 +157,7 @@ test_counters_command_fn (vlib_main_t * vm, u32 session_index; u32 counter_index; u32 nincr = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); /* *INDENT-OFF* */ pool_foreach (session, lm->sessions, @@ -167,11 +167,11 @@ test_counters_command_fn (vlib_main_t * vm, session_index_to_counter_index (session_index, SESSION_COUNTER_USER_TO_NETWORK); vlib_increment_combined_counter (&lm->counter_main, - cpu_index, + thread_index, counter_index, 1/*pkt*/, 1111 /*bytes*/); vlib_increment_combined_counter (&lm->counter_main, - cpu_index, + thread_index, counter_index+1, 1/*pkt*/, 2222 /*bytes*/); nincr++; diff --git a/src/vnet/lisp-gpe/decap.c b/src/vnet/lisp-gpe/decap.c index d887a95f..68769710 100644 --- a/src/vnet/lisp-gpe/decap.c +++ b/src/vnet/lisp-gpe/decap.c @@ -103,7 +103,7 @@ next_index_to_iface (lisp_gpe_main_t * lgm, u32 next_index) } static_always_inline void -incr_decap_stats (vnet_main_t * vnm, u32 cpu_index, u32 length, +incr_decap_stats (vnet_main_t * vnm, u32 thread_index, u32 length, u32 sw_if_index, u32 * last_sw_if_index, u32 * n_packets, u32 * n_bytes) { @@ -122,7 +122,7 @@ incr_decap_stats (vnet_main_t * vnm, u32 cpu_index, u32 length, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, *last_sw_if_index, + thread_index, *last_sw_if_index, *n_packets, *n_bytes); } *last_sw_if_index = sw_if_index; @@ -150,11 +150,11 @@ static uword lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, u8 is_v4) { - u32 n_left_from, next_index, *from, *to_next, cpu_index; + u32 n_left_from, next_index, *from, *to_next, thread_index; u32 n_bytes = 0, n_packets = 0, last_sw_if_index = ~0, drops = 0; lisp_gpe_main_t *lgm = vnet_lisp_gpe_get_main (); - cpu_index = os_get_cpu_number (); + thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -267,7 +267,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (si0) { - incr_decap_stats (lgm->vnet_main, cpu_index, + incr_decap_stats (lgm->vnet_main, thread_index, vlib_buffer_length_in_chain (vm, b0), si0[0], &last_sw_if_index, &n_packets, &n_bytes); vnet_buffer (b0)->sw_if_index[VLIB_RX] = si0[0]; @@ -282,7 +282,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (si1) { - incr_decap_stats (lgm->vnet_main, cpu_index, + incr_decap_stats (lgm->vnet_main, thread_index, vlib_buffer_length_in_chain (vm, b1), si1[0], &last_sw_if_index, &n_packets, &n_bytes); vnet_buffer (b1)->sw_if_index[VLIB_RX] = si1[0]; @@ -397,7 +397,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (si0) { - incr_decap_stats (lgm->vnet_main, cpu_index, + incr_decap_stats (lgm->vnet_main, thread_index, vlib_buffer_length_in_chain (vm, b0), si0[0], &last_sw_if_index, &n_packets, &n_bytes); vnet_buffer (b0)->sw_if_index[VLIB_RX] = si0[0]; @@ -430,7 +430,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* flush iface stats */ - incr_decap_stats (lgm->vnet_main, cpu_index, 0, ~0, &last_sw_if_index, + incr_decap_stats (lgm->vnet_main, thread_index, 0, ~0, &last_sw_if_index, &n_packets, &n_bytes); vlib_node_increment_counter (vm, lisp_gpe_ip4_input_node.index, LISP_GPE_ERROR_NO_TUNNEL, drops); diff --git a/src/vnet/lldp/lldp_input.c b/src/vnet/lldp/lldp_input.c index 762743d0..e88f6fdb 100644 --- a/src/vnet/lldp/lldp_input.c +++ b/src/vnet/lldp/lldp_input.c @@ -35,7 +35,7 @@ typedef struct static void lldp_rpc_update_peer_cb (const lldp_intf_update_t * a) { - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); lldp_intf_t *n = lldp_get_intf (&lldp_main, a->hw_if_index); if (!n) diff --git a/src/vnet/map/ip4_map.c b/src/vnet/map/ip4_map.c index 1a20d704..e39b6f14 100644 --- a/src/vnet/map/ip4_map.c +++ b/src/vnet/map/ip4_map.c @@ -248,7 +248,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) next_index = node->cached_next_index; map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -377,7 +377,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip40) ? IP4_MAP_NEXT_IP6_REWRITE : next0; vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip6h0->payload_length) + @@ -409,7 +409,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip41) ? IP4_MAP_NEXT_IP6_REWRITE : next1; vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, map_domain_index1, 1, clib_net_to_host_u16 (ip6h1->payload_length) + @@ -520,7 +520,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip40) ? IP4_MAP_NEXT_IP6_REWRITE : next0; vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip6h0->payload_length) + @@ -564,7 +564,7 @@ ip4_map_reass (vlib_main_t * vm, next_index = node->cached_next_index; map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 *fragments_to_drop = NULL; u32 *fragments_to_loopback = NULL; @@ -694,8 +694,8 @@ ip4_map_reass (vlib_main_t * vm, { if (error0 == MAP_ERROR_NONE) vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, map_domain_index0, - 1, + thread_index, + map_domain_index0, 1, clib_net_to_host_u16 (ip60->payload_length) + 40); next0 = diff --git a/src/vnet/map/ip4_map_t.c b/src/vnet/map/ip4_map_t.c index b63d76bf..5f2bcbf9 100644 --- a/src/vnet/map/ip4_map_t.c +++ b/src/vnet/map/ip4_map_t.c @@ -477,7 +477,7 @@ ip4_map_t_icmp (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -520,7 +520,7 @@ ip4_map_t_icmp (vlib_main_t * vm, if (PREDICT_TRUE (error0 == MAP_ERROR_NONE)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p0)->map_t. map_domain_index, 1, len0); } @@ -1051,7 +1051,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) n_left_from = frame->n_vectors; next_index = node->cached_next_index; vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -1158,7 +1158,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP4_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p0)->map_t. map_domain_index, 1, clib_net_to_host_u16 (ip40-> @@ -1169,7 +1169,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error1 == MAP_ERROR_NONE && next1 != IP4_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p1)->map_t. map_domain_index, 1, clib_net_to_host_u16 (ip41-> @@ -1252,7 +1252,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP4_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p0)->map_t. map_domain_index, 1, clib_net_to_host_u16 (ip40-> diff --git a/src/vnet/map/ip6_map.c b/src/vnet/map/ip6_map.c index f7eb768f..63ada962 100644 --- a/src/vnet/map/ip6_map.c +++ b/src/vnet/map/ip6_map.c @@ -172,7 +172,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vlib_node_get_runtime (vm, ip6_map_node.index); map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -319,7 +319,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) IP6_MAP_NEXT_IP4_REWRITE : next0; } vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip40->length)); @@ -352,7 +352,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) IP6_MAP_NEXT_IP4_REWRITE : next1; } vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, map_domain_index1, 1, clib_net_to_host_u16 (ip41->length)); @@ -505,7 +505,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) IP6_MAP_NEXT_IP4_REWRITE : next0; } vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip40->length)); @@ -820,7 +820,7 @@ ip6_map_ip4_reass (vlib_main_t * vm, vlib_node_get_runtime (vm, ip6_map_ip4_reass_node.index); map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 *fragments_to_drop = NULL; u32 *fragments_to_loopback = NULL; @@ -958,8 +958,8 @@ ip6_map_ip4_reass (vlib_main_t * vm, { if (error0 == MAP_ERROR_NONE) vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, map_domain_index0, - 1, + thread_index, + map_domain_index0, 1, clib_net_to_host_u16 (ip40->length)); next0 = @@ -1015,7 +1015,7 @@ ip6_map_icmp_relay (vlib_main_t * vm, vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip6_map_icmp_relay_node.index); map_main_t *mm = &map_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u16 *fragment_ids, *fid; from = vlib_frame_vector_args (frame); @@ -1143,7 +1143,8 @@ ip6_map_icmp_relay (vlib_main_t * vm, ip_csum_t sum = ip_incremental_checksum (0, new_icmp40, nlen - 20); new_icmp40->checksum = ~ip_csum_fold (sum); - vlib_increment_simple_counter (&mm->icmp_relayed, cpu_index, 0, 1); + vlib_increment_simple_counter (&mm->icmp_relayed, thread_index, 0, + 1); error: if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/vnet/map/ip6_map_t.c b/src/vnet/map/ip6_map_t.c index eb3996c2..99151678 100644 --- a/src/vnet/map/ip6_map_t.c +++ b/src/vnet/map/ip6_map_t.c @@ -448,7 +448,7 @@ ip6_map_t_icmp (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -493,7 +493,7 @@ ip6_map_t_icmp (vlib_main_t * vm, if (PREDICT_TRUE (error0 == MAP_ERROR_NONE)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p0)-> map_t.map_domain_index, 1, len0); @@ -1051,7 +1051,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip6_map_t_node.index); vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -1218,7 +1218,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP6_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p0)-> map_t.map_domain_index, 1, clib_net_to_host_u16 @@ -1229,7 +1229,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error1 == MAP_ERROR_NONE && next1 != IP6_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p1)-> map_t.map_domain_index, 1, clib_net_to_host_u16 @@ -1403,7 +1403,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP6_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p0)-> map_t.map_domain_index, 1, clib_net_to_host_u16 diff --git a/src/vnet/mpls/mpls_input.c b/src/vnet/mpls/mpls_input.c index 893c4511..1b9bdd05 100644 --- a/src/vnet/mpls/mpls_input.c +++ b/src/vnet/mpls/mpls_input.c @@ -76,7 +76,7 @@ mpls_input_inline (vlib_main_t * vm, u32 n_left_from, next_index, * from, * to_next; mpls_input_runtime_t * rt; mpls_main_t * mm; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); vlib_simple_counter_main_t * cm; vnet_main_t * vnm = vnet_get_main(); @@ -151,7 +151,7 @@ mpls_input_inline (vlib_main_t * vm, next0 = MPLS_INPUT_NEXT_LOOKUP; vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index0, &next0, b0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); } if (PREDICT_FALSE(h1[3] == 0)) @@ -164,7 +164,7 @@ mpls_input_inline (vlib_main_t * vm, next1 = MPLS_INPUT_NEXT_LOOKUP; vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index1, &next1, b1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); } if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -215,7 +215,7 @@ mpls_input_inline (vlib_main_t * vm, { next0 = MPLS_INPUT_NEXT_LOOKUP; vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index0, &next0, b0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); } if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/vnet/mpls/mpls_lookup.c b/src/vnet/mpls/mpls_lookup.c index 475bb204..ace6a70f 100644 --- a/src/vnet/mpls/mpls_lookup.c +++ b/src/vnet/mpls/mpls_lookup.c @@ -67,7 +67,7 @@ mpls_lookup (vlib_main_t * vm, vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; u32 n_left_from, next_index, * from, * to_next; mpls_main_t * mm = &mpls_main; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -220,16 +220,16 @@ mpls_lookup (vlib_main_t * vm, vnet_buffer (b3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b1)); vlib_increment_combined_counter - (cm, cpu_index, lbi2, 1, + (cm, thread_index, lbi2, 1, vlib_buffer_length_in_chain (vm, b2)); vlib_increment_combined_counter - (cm, cpu_index, lbi3, 1, + (cm, thread_index, lbi3, 1, vlib_buffer_length_in_chain (vm, b3)); /* @@ -351,7 +351,7 @@ mpls_lookup (vlib_main_t * vm, vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); /* @@ -440,7 +440,7 @@ mpls_load_balance (vlib_main_t * vm, { vlib_combined_counter_main_t * cm = &load_balance_main.lbm_via_counters; u32 n_left_from, n_left_to_next, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 next; from = vlib_frame_vector_args (frame); @@ -536,10 +536,10 @@ mpls_load_balance (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) @@ -597,7 +597,7 @@ mpls_load_balance (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_validate_buffer_enqueue_x1 (vm, node, next, diff --git a/src/vnet/mpls/mpls_output.c b/src/vnet/mpls/mpls_output.c index 08018fd1..d90dec21 100644 --- a/src/vnet/mpls/mpls_output.c +++ b/src/vnet/mpls/mpls_output.c @@ -64,12 +64,12 @@ mpls_output_inline (vlib_main_t * vm, vlib_frame_t * from_frame, int is_midchain) { - u32 n_left_from, next_index, * from, * to_next, cpu_index; + u32 n_left_from, next_index, * from, * to_next, thread_index; vlib_node_runtime_t * error_node; u32 n_left_to_next; mpls_main_t *mm; - cpu_index = os_get_cpu_number(); + thread_index = vlib_get_thread_index(); error_node = vlib_node_get_runtime (vm, mpls_output_node.index); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -137,13 +137,13 @@ mpls_output_inline (vlib_main_t * vm, /* Bump the adj counters for packet and bytes */ vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index1, 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); @@ -245,7 +245,7 @@ mpls_output_inline (vlib_main_t * vm, vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); diff --git a/src/vnet/pg/input.c b/src/vnet/pg/input.c index 2649798b..597ae060 100644 --- a/src/vnet/pg/input.c +++ b/src/vnet/pg/input.c @@ -893,7 +893,7 @@ pg_generate_set_lengths (pg_main_t * pg, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), + vlib_get_thread_index (), si->sw_if_index, n_buffers, length_sum); } @@ -1266,7 +1266,7 @@ pg_stream_fill_helper (pg_main_t * pg, l += vlib_buffer_index_length_in_chain (vm, buffers[i]); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), + vlib_get_thread_index (), si->sw_if_index, n_alloc, l); s->current_replay_packet_index += n_alloc; s->current_replay_packet_index %= diff --git a/src/vnet/replication.c b/src/vnet/replication.c index 86d922b5..233a8c2f 100644 --- a/src/vnet/replication.c +++ b/src/vnet/replication.c @@ -31,16 +31,16 @@ replication_prep (vlib_main_t * vm, { replication_main_t *rm = &replication_main; replication_context_t *ctx; - uword cpu_number = vm->cpu_index; + uword thread_index = vm->thread_index; ip4_header_t *ip; u32 ctx_id; /* Allocate a context, reserve context 0 */ - if (PREDICT_FALSE (rm->contexts[cpu_number] == 0)) - pool_get_aligned (rm->contexts[cpu_number], ctx, CLIB_CACHE_LINE_BYTES); + if (PREDICT_FALSE (rm->contexts[thread_index] == 0)) + pool_get_aligned (rm->contexts[thread_index], ctx, CLIB_CACHE_LINE_BYTES); - pool_get_aligned (rm->contexts[cpu_number], ctx, CLIB_CACHE_LINE_BYTES); - ctx_id = ctx - rm->contexts[cpu_number]; + pool_get_aligned (rm->contexts[thread_index], ctx, CLIB_CACHE_LINE_BYTES); + ctx_id = ctx - rm->contexts[thread_index]; /* Save state from vlib buffer */ ctx->saved_free_list_index = b0->free_list_index; @@ -94,11 +94,11 @@ replication_recycle (vlib_main_t * vm, vlib_buffer_t * b0, u32 is_last) { replication_main_t *rm = &replication_main; replication_context_t *ctx; - uword cpu_number = vm->cpu_index; + uword thread_index = vm->thread_index; ip4_header_t *ip; /* Get access to the replication context */ - ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count); + ctx = pool_elt_at_index (rm->contexts[thread_index], b0->recycle_count); /* Restore vnet buffer state */ clib_memcpy (vnet_buffer (b0), ctx->vnet_buffer, @@ -133,7 +133,7 @@ replication_recycle (vlib_main_t * vm, vlib_buffer_t * b0, u32 is_last) b0->flags &= ~VLIB_BUFFER_RECYCLE; /* Free context back to its pool */ - pool_put (rm->contexts[cpu_number], ctx); + pool_put (rm->contexts[thread_index], ctx); } return ctx; @@ -160,7 +160,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) replication_main_t *rm = &replication_main; replication_context_t *ctx; u32 feature_node_index = 0; - uword cpu_number = vm->cpu_index; + uword thread_index = vm->thread_index; /* * All buffers in the list are destined to the same recycle node. @@ -172,7 +172,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) { bi0 = fl->buffers[0]; b0 = vlib_get_buffer (vm, bi0); - ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count); + ctx = pool_elt_at_index (rm->contexts[thread_index], b0->recycle_count); feature_node_index = ctx->recycle_node_index; } diff --git a/src/vnet/replication.h b/src/vnet/replication.h index 5dc554c9..ce4b3ff1 100644 --- a/src/vnet/replication.h +++ b/src/vnet/replication.h @@ -100,7 +100,7 @@ replication_get_ctx (vlib_buffer_t * b0) replication_main_t *rm = &replication_main; return replication_is_recycled (b0) ? - pool_elt_at_index (rm->contexts[os_get_cpu_number ()], + pool_elt_at_index (rm->contexts[vlib_get_thread_index ()], b0->recycle_count) : 0; } diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index b86e87d9..dd211c51 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -311,7 +311,7 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, unix_shared_memory_queue_t *q; application_t *app; int n_tx_packets = 0; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; int i, rv; f64 now = vlib_time_now (vm); diff --git a/src/vnet/sr/sr_localsid.c b/src/vnet/sr/sr_localsid.c index 2e3d56de..6d72a506 100755 --- a/src/vnet/sr/sr_localsid.c +++ b/src/vnet/sr/sr_localsid.c @@ -887,7 +887,7 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node, from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -974,26 +974,26 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter (((next1 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls1 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b1)); + &(sm->sr_ls_valid_counters)), thread_index, ls1 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b1)); vlib_increment_combined_counter (((next2 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls2 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b2)); + &(sm->sr_ls_valid_counters)), thread_index, ls2 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b2)); vlib_increment_combined_counter (((next3 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls3 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b3)); + &(sm->sr_ls_valid_counters)), thread_index, ls3 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b3)); vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next, n_left_to_next, bi0, bi1, bi2, bi3, @@ -1062,8 +1062,8 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); @@ -1103,7 +1103,7 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node, from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -1205,26 +1205,26 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter (((next1 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls1 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b1)); + &(sm->sr_ls_valid_counters)), thread_index, ls1 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b1)); vlib_increment_combined_counter (((next2 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls2 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b2)); + &(sm->sr_ls_valid_counters)), thread_index, ls2 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b2)); vlib_increment_combined_counter (((next3 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls3 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b3)); + &(sm->sr_ls_valid_counters)), thread_index, ls3 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b3)); vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next, n_left_to_next, bi0, bi1, bi2, bi3, @@ -1295,8 +1295,8 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index e3705060..c1567aa0 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -174,7 +174,7 @@ tclient_thread_fn (void *arg) pthread_sigmask (SIG_SETMASK, &s, 0); } - clib_per_cpu_mheaps[os_get_cpu_number ()] = clib_per_cpu_mheaps[0]; + clib_per_cpu_mheaps[vlib_get_thread_index ()] = clib_per_cpu_mheaps[0]; while (1) { diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index b2a371e2..b6c34828 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -646,10 +646,10 @@ const static transport_proto_vft_t tcp6_proto = { void tcp_timer_keep_handler (u32 conn_index) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; - tc = tcp_connection_get (conn_index, cpu_index); + tc = tcp_connection_get (conn_index, thread_index); tc->timers[TCP_TIMER_KEEP] = TCP_TIMER_HANDLE_INVALID; tcp_connection_close (tc); @@ -675,10 +675,10 @@ tcp_timer_establish_handler (u32 conn_index) void tcp_timer_waitclose_handler (u32 conn_index) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; - tc = tcp_connection_get (conn_index, cpu_index); + tc = tcp_connection_get (conn_index, thread_index); tc->timers[TCP_TIMER_WAITCLOSE] = TCP_TIMER_HANDLE_INVALID; /* Session didn't come back with a close(). Send FIN either way diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 0090e15e..eaca672c 100644 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -343,7 +343,7 @@ typedef enum _tcp_dbg_evt } \ else \ { \ - u32 _thread_index = os_get_cpu_number (); \ + u32 _thread_index = vlib_get_thread_index (); \ _tc = tcp_connection_get (_tc_index, _thread_index); \ } \ ELOG_TYPE_DECLARE (_e) = \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index a8224dc2..7e9fa47b 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1142,7 +1142,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; tcp_main_t *tm = vnet_get_tcp_main (); from = vlib_frame_vector_args (from_frame); @@ -1332,7 +1332,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { tcp_main_t *tm = vnet_get_tcp_main (); u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; from = vlib_frame_vector_args (from_frame); @@ -1634,7 +1634,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { tcp_main_t *tm = vnet_get_tcp_main (); u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1989,7 +1989,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; tcp_main_t *tm = vnet_get_tcp_main (); u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; @@ -2243,7 +2243,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; tcp_main_t *tm = vnet_get_tcp_main (); from = vlib_frame_vector_args (from_frame); diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index ea157bd7..e18bfad7 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -387,8 +387,8 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 cpu_index = os_get_cpu_number(); \ - my_tx_buffers = tm->tx_buffers[cpu_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ n_free_buffers = 32; /* TODO config or macro */ \ @@ -396,7 +396,7 @@ do { \ _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ tm->vlib_main, my_tx_buffers, n_free_buffers, \ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[cpu_index] = my_tx_buffers; \ + tm->tx_buffers[thread_index] = my_tx_buffers; \ } \ /* buffer shortage */ \ if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ @@ -408,8 +408,8 @@ do { \ #define tcp_return_buffer(tm) \ do { \ u32 *my_tx_buffers; \ - u32 cpu_index = os_get_cpu_number(); \ - my_tx_buffers = tm->tx_buffers[cpu_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ _vec_len (my_tx_buffers) +=1; \ } while (0) @@ -942,7 +942,7 @@ tcp_send_ack (tcp_connection_t * tc) void tcp_timer_delack_handler (u32 index) { - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; tc = tcp_connection_get (index, thread_index); @@ -1022,7 +1022,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; u32 bi, snd_space, n_bytes; @@ -1152,7 +1152,7 @@ tcp_timer_persist_handler (u32 index) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; u32 bi, n_bytes; @@ -1313,7 +1313,7 @@ tcp46_output_inline (vlib_main_t * vm, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1524,7 +1524,7 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, u8 is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; diff --git a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c index 4b22109b..810278e6 100644 --- a/src/vnet/udp/udp_input.c +++ b/src/vnet/udp/udp_input.c @@ -70,7 +70,7 @@ udp4_uri_input_node_fn (vlib_main_t * vm, udp4_uri_input_next_t next_index; udp_uri_main_t *um = vnet_get_udp_main (); session_manager_main_t *smm = vnet_get_session_manager_main (); - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; u8 my_enqueue_epoch; u32 *session_indices_to_enqueue; static u32 serial_number; diff --git a/src/vnet/unix/tapcli.c b/src/vnet/unix/tapcli.c index fb1a8bac..0fc62f6c 100644 --- a/src/vnet/unix/tapcli.c +++ b/src/vnet/unix/tapcli.c @@ -366,7 +366,7 @@ static uword tapcli_rx_iface(vlib_main_t * vm, vlib_increment_combined_counter ( vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number(), ti->sw_if_index, + vlib_get_thread_index(), ti->sw_if_index, 1, n_bytes_in_packet); if (PREDICT_FALSE(n_trace > 0)) { diff --git a/src/vnet/unix/tuntap.c b/src/vnet/unix/tuntap.c index 2cfcc92f..ac674653 100644 --- a/src/vnet/unix/tuntap.c +++ b/src/vnet/unix/tuntap.c @@ -189,7 +189,7 @@ tuntap_tx (vlib_main_t * vm, /* Update tuntap interface output stats. */ vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - vm->cpu_index, + vm->thread_index, tm->sw_if_index, n_packets, n_bytes); @@ -297,7 +297,7 @@ tuntap_rx (vlib_main_t * vm, vlib_increment_combined_counter (vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number(), + vlib_get_thread_index(), tm->sw_if_index, 1, n_bytes_in_packet); diff --git a/src/vnet/vxlan-gpe/decap.c b/src/vnet/vxlan-gpe/decap.c index 22ab4b62..d4fe4231 100644 --- a/src/vnet/vxlan-gpe/decap.c +++ b/src/vnet/vxlan-gpe/decap.c @@ -115,7 +115,7 @@ vxlan_gpe_input (vlib_main_t * vm, vxlan4_gpe_tunnel_key_t last_key4; vxlan6_gpe_tunnel_key_t last_key6; u32 pkts_decapsulated = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; if (is_ip4) @@ -342,7 +342,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; stats_sw_if_index = sw_if_index0; @@ -427,7 +427,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len1; stats_sw_if_index = sw_if_index1; @@ -588,7 +588,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; stats_sw_if_index = sw_if_index0; @@ -615,7 +615,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) { vlib_increment_combined_counter ( - im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, cpu_index, + im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/vxlan-gpe/encap.c b/src/vnet/vxlan-gpe/encap.c index 3a486e56..67ed94b4 100644 --- a/src/vnet/vxlan-gpe/encap.c +++ b/src/vnet/vxlan-gpe/encap.c @@ -151,7 +151,7 @@ vxlan_gpe_encap (vlib_main_t * vm, vnet_main_t * vnm = ngm->vnet_main; vnet_interface_main_t * im = &vnm->interface_main; u32 pkts_encapsulated = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; from = vlib_frame_vector_args (from_frame); @@ -253,7 +253,7 @@ vxlan_gpe_encap (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_sw_if_index = sw_if_index0; stats_n_packets = 2; stats_n_bytes = len0 + len1; @@ -262,10 +262,10 @@ vxlan_gpe_encap (vlib_main_t * vm, { vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index0, 1, len0); + thread_index, sw_if_index0, 1, len0); vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index1, 1, len1); + thread_index, sw_if_index1, 1, len1); } } @@ -335,7 +335,7 @@ vxlan_gpe_encap (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; stats_sw_if_index = sw_if_index0; @@ -359,7 +359,7 @@ vxlan_gpe_encap (vlib_main_t * vm, if (stats_n_packets) { vlib_increment_combined_counter ( - im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, cpu_index, + im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/vxlan/decap.c b/src/vnet/vxlan/decap.c index 514b2c99..2acb1f6f 100644 --- a/src/vnet/vxlan/decap.c +++ b/src/vnet/vxlan/decap.c @@ -81,7 +81,7 @@ vxlan_input (vlib_main_t * vm, vxlan4_tunnel_key_t last_key4; vxlan6_tunnel_key_t last_key6; u32 pkts_decapsulated = 0; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; if (is_ip4) @@ -314,7 +314,7 @@ vxlan_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; @@ -468,7 +468,7 @@ vxlan_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len1; @@ -674,7 +674,7 @@ vxlan_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; @@ -711,7 +711,7 @@ vxlan_input (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/vxlan/encap.c b/src/vnet/vxlan/encap.c index 5b63064a..4cfbbc23 100644 --- a/src/vnet/vxlan/encap.c +++ b/src/vnet/vxlan/encap.c @@ -77,7 +77,7 @@ vxlan_encap_inline (vlib_main_t * vm, vnet_interface_main_t * im = &vnm->interface_main; u32 pkts_encapsulated = 0; u16 old_l0 = 0, old_l1 = 0; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; u32 sw_if_index0 = 0, sw_if_index1 = 0; u32 next0 = 0, next1 = 0; @@ -301,7 +301,7 @@ vxlan_encap_inline (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_sw_if_index = sw_if_index0; stats_n_packets = 2; @@ -311,10 +311,10 @@ vxlan_encap_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index0, 1, len0); + thread_index, sw_if_index0, 1, len0); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index1, 1, len1); + thread_index, sw_if_index1, 1, len1); } } @@ -464,7 +464,7 @@ vxlan_encap_inline (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; @@ -496,7 +496,7 @@ vxlan_encap_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vpp/stats/stats.c b/src/vpp/stats/stats.c index 042d02e2..4309cd51 100644 --- a/src/vpp/stats/stats.c +++ b/src/vpp/stats/stats.c @@ -66,14 +66,14 @@ _(VNET_IP6_NBR_COUNTERS, vnet_ip6_nbr_counters) void dslock (stats_main_t * sm, int release_hint, int tag) { - u32 thread_id; + u32 thread_index; data_structure_lock_t *l = sm->data_structure_lock; if (PREDICT_FALSE (l == 0)) return; - thread_id = os_get_cpu_number (); - if (l->lock && l->thread_id == thread_id) + thread_index = vlib_get_thread_index (); + if (l->lock && l->thread_index == thread_index) { l->count++; return; @@ -85,7 +85,7 @@ dslock (stats_main_t * sm, int release_hint, int tag) while (__sync_lock_test_and_set (&l->lock, 1)) /* zzzz */ ; l->tag = tag; - l->thread_id = thread_id; + l->thread_index = thread_index; l->count = 1; } @@ -99,14 +99,14 @@ stats_dslock_with_hint (int hint, int tag) void dsunlock (stats_main_t * sm) { - u32 thread_id; + u32 thread_index; data_structure_lock_t *l = sm->data_structure_lock; if (PREDICT_FALSE (l == 0)) return; - thread_id = os_get_cpu_number (); - ASSERT (l->lock && l->thread_id == thread_id); + thread_index = vlib_get_thread_index (); + ASSERT (l->lock && l->thread_index == thread_index); l->count--; if (l->count == 0) { diff --git a/src/vpp/stats/stats.h b/src/vpp/stats/stats.h index 118115be..024dc78e 100644 --- a/src/vpp/stats/stats.h +++ b/src/vpp/stats/stats.h @@ -30,7 +30,7 @@ typedef struct { volatile u32 lock; volatile u32 release_hint; - u32 thread_id; + u32 thread_index; u32 count; int tag; } data_structure_lock_t; -- cgit 1.2.3-korg From 7447d07719bb4c9c58501d78aa4a453d6a044863 Mon Sep 17 00:00:00 2001 From: Chris Luke Date: Wed, 5 Jul 2017 12:57:10 -0400 Subject: Buffer name inconsistently used a cstring/vec (VPP-901) Spotted in the output of CLI command "show buffers", the name field sometimes had trailing garbage, the hall sign of a string not being terminated. In this case it was being inconsistently used as a cstring or a vec. - CLI printf needs %v to print the vec srring - vlib_buffer_create_free_list_helper tried to use clib_mem_is_heap_object() to detect a vec object, wheras it should use clib_mem_is_vec() Change-Id: Ib8b242a0c5a18924b8af7e8e1432784eebcf572c Signed-off-by: Chris Luke --- src/vlib/buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/vlib/buffer.c') diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index be3b41ef..e50064ae 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -383,7 +383,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, f->index = f - bm->buffer_free_list_pool; f->n_data_bytes = vlib_buffer_round_size (n_data_bytes); f->min_n_buffers_each_physmem_alloc = VLIB_FRAME_SIZE; - f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name); + f->name = clib_mem_is_vec (name) ? name : format (0, "%s", name); /* Setup free buffer template. */ f->buffer_init_template.free_list_index = f->index; @@ -774,7 +774,7 @@ vlib_packet_template_init (vlib_main_t * vm, { vlib_buffer_main_t *bm = vm->buffer_main; va_list va; - __attribute__ ((unused)) u8 *name; + u8 *name; vlib_buffer_free_list_t *fl; va_start (va, fmt); @@ -962,7 +962,7 @@ format_vlib_buffer_free_list (u8 * s, va_list * va) bytes_alloc = size * f->n_alloc; bytes_free = size * n_free; - s = format (s, "%7d%30s%12d%12d%=12U%=12U%=12d%=12d", threadnum, + s = format (s, "%7d%30v%12d%12d%=12U%=12U%=12d%=12d", threadnum, f->name, f->index, f->n_data_bytes, format_memory_size, bytes_alloc, format_memory_size, bytes_free, f->n_alloc, n_free); -- cgit 1.2.3-korg From 04a7f05e91e919f51eaecaee476435484076655b Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 10 Jul 2017 15:06:17 +0200 Subject: vlib: store buffer memory information in the buffer_main Currently, buffer index is calculated as a offset to the physmem region shifted by log2_cacheline size. When DPDK is used we "hack" physmem data with information taken from dpdk mempool. This makes physmem code not usable with DPDK. This change makes buffer memory start and size independent of physmem basically allowing physmem to be used when DPDK plugin is loaded. Change-Id: Ieb399d398f147583b9baab467152a352d58c9c31 Signed-off-by: Damjan Marion --- src/plugins/dpdk/buffer.c | 69 +++-------------------------------------------- src/vlib/buffer.c | 57 ++++++++++++++++++++++----------------- src/vlib/buffer.h | 32 +++++++++++++++++----- src/vlib/buffer_funcs.h | 13 ++++++--- src/vlib/main.c | 20 ++++++++++++-- src/vlib/unix/physmem.c | 9 ------- src/vnet/pg/input.c | 6 ++--- src/vnet/pg/stream.c | 2 +- src/vnet/replication.c | 2 +- 9 files changed, 96 insertions(+), 114 deletions(-) (limited to 'src/vlib/buffer.c') diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index fd1d8415..aa73eb6c 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -431,7 +431,6 @@ vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, unsigned socket_id) { dpdk_main_t *dm = &dpdk_main; - vlib_physmem_main_t *vpm = &vm->physmem_main; struct rte_mempool *rmp; int i; @@ -453,63 +452,10 @@ vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, if (rmp) { { - uword this_pool_end; - uword this_pool_start; - uword this_pool_size; - uword save_vpm_start, save_vpm_end, save_vpm_size; struct rte_mempool_memhdr *memhdr; - this_pool_start = ~0; - this_pool_end = 0; - STAILQ_FOREACH (memhdr, &rmp->mem_list, next) - { - if (((uword) (memhdr->addr + memhdr->len)) > this_pool_end) - this_pool_end = (uword) (memhdr->addr + memhdr->len); - if (((uword) memhdr->addr) < this_pool_start) - this_pool_start = (uword) (memhdr->addr); - } - ASSERT (this_pool_start < ~0 && this_pool_end > 0); - this_pool_size = this_pool_end - this_pool_start; - - if (CLIB_DEBUG > 1) - { - clib_warning ("%s: pool start %llx pool end %llx pool size %lld", - pool_name, this_pool_start, this_pool_end, - this_pool_size); - clib_warning - ("before: virtual.start %llx virtual.end %llx virtual.size %lld", - vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); - } - - save_vpm_start = vpm->virtual.start; - save_vpm_end = vpm->virtual.end; - save_vpm_size = vpm->virtual.size; - - if ((this_pool_start < vpm->virtual.start) || vpm->virtual.start == 0) - vpm->virtual.start = this_pool_start; - if (this_pool_end > vpm->virtual.end) - vpm->virtual.end = this_pool_end; - - vpm->virtual.size = vpm->virtual.end - vpm->virtual.start; - - if (CLIB_DEBUG > 1) - { - clib_warning - ("after: virtual.start %llx virtual.end %llx virtual.size %lld", - vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); - } - - /* check if fits into buffer index range */ - if ((u64) vpm->virtual.size > - ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES))) - { - clib_warning ("physmem: virtual size out of range!"); - vpm->virtual.start = save_vpm_start; - vpm->virtual.end = save_vpm_end; - vpm->virtual.size = save_vpm_size; - rmp = 0; - } + vlib_buffer_add_mem_range (vm, (uword) memhdr->addr, memhdr->len); } if (rmp) { @@ -564,7 +510,8 @@ buffer_state_validation_init (vlib_main_t * vm) VLIB_INIT_FUNCTION (buffer_state_validation_init); #endif -static vlib_buffer_callbacks_t callbacks = { +/* *INDENT-OFF* */ +VLIB_BUFFER_REGISTER_CALLBACKS (dpdk, static) = { .vlib_buffer_alloc_cb = &dpdk_buffer_alloc, .vlib_buffer_alloc_from_free_list_cb = &dpdk_buffer_alloc_from_free_list, .vlib_buffer_free_cb = &dpdk_buffer_free, @@ -572,15 +519,7 @@ static vlib_buffer_callbacks_t callbacks = { .vlib_packet_template_init_cb = &dpdk_packet_template_init, .vlib_buffer_delete_free_list_cb = &dpdk_buffer_delete_free_list, }; - -static clib_error_t * -dpdk_buffer_init (vlib_main_t * vm) -{ - vlib_buffer_cb_register (vm, &callbacks); - return 0; -} - -VLIB_INIT_FUNCTION (dpdk_buffer_init); +/* *INDENT-ON* */ /** @endcond */ /* diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index e50064ae..b2a095cf 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -46,6 +46,8 @@ #include #include +vlib_buffer_callbacks_t *vlib_buffer_callbacks = 0; + uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, vlib_buffer_t * b_first) @@ -584,11 +586,6 @@ alloc_from_free_list (vlib_main_t * vm, dst = alloc_buffers; - /* wait with buffer memory allocation as long as possible - in case external buffer manager takes over */ - if (PREDICT_FALSE (vm->os_physmem_alloc_aligned == 0)) - unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ ); - n_filled = fill_free_list (vm, free_list, n_alloc_buffers); if (n_filled == 0) return 0; @@ -944,6 +941,36 @@ vlib_buffer_chain_append_data_with_alloc (vlib_main_t * vm, return copied; } +void +vlib_buffer_add_mem_range (vlib_main_t * vm, uword start, uword size) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + if (bm->buffer_mem_size == 0) + { + bm->buffer_mem_start = start; + bm->buffer_mem_size = size; + } + else if (start < bm->buffer_mem_start) + { + bm->buffer_mem_size += bm->buffer_mem_start - start; + bm->buffer_mem_start = start; + if (size > bm->buffer_mem_size) + bm->buffer_mem_size = size; + } + else if (start > bm->buffer_mem_start) + { + uword new_size = start - bm->buffer_mem_start + size; + if (new_size > bm->buffer_mem_size) + bm->buffer_mem_size = new_size; + } + + if ((u64) bm->buffer_mem_size > + ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES))) + { + clib_panic ("buffer memory size out of range!"); + } +} static u8 * format_vlib_buffer_free_list (u8 * s, va_list * va) @@ -1011,6 +1038,7 @@ void vlib_buffer_cb_init (struct vlib_main_t *vm) { vlib_buffer_main_t *bm = vm->buffer_main; + bm->cb.vlib_buffer_alloc_cb = &vlib_buffer_alloc_internal; bm->cb.vlib_buffer_alloc_from_free_list_cb = &vlib_buffer_alloc_from_free_list_internal; @@ -1018,25 +1046,6 @@ vlib_buffer_cb_init (struct vlib_main_t *vm) bm->cb.vlib_buffer_free_no_next_cb = &vlib_buffer_free_no_next_internal; bm->cb.vlib_buffer_delete_free_list_cb = &vlib_buffer_delete_free_list_internal; - bm->extern_buffer_mgmt = 0; -} - -int -vlib_buffer_cb_register (struct vlib_main_t *vm, vlib_buffer_callbacks_t * cb) -{ - vlib_buffer_main_t *bm = vm->buffer_main; - if (bm->extern_buffer_mgmt) - return -1; - -#define _(x) bm->cb.x = cb->x - _(vlib_buffer_alloc_cb); - _(vlib_buffer_alloc_from_free_list_cb); - _(vlib_buffer_free_cb); - _(vlib_buffer_free_no_next_cb); - _(vlib_buffer_delete_free_list_cb); -#undef _ - bm->extern_buffer_mgmt = 1; - return 0; } /** @endcond */ diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 18e2437d..b20538b7 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -388,12 +388,20 @@ typedef struct u32 free_list_index); } vlib_buffer_callbacks_t; +extern vlib_buffer_callbacks_t *vlib_buffer_callbacks; + typedef struct { + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + /* Virtual memory address and size of buffer memory, used for calculating + buffer index */ + uword buffer_mem_start; + uword buffer_mem_size; + /* Buffer free callback, for subversive activities */ - u32 (*buffer_free_callback) (struct vlib_main_t * vm, - u32 * buffers, - u32 n_buffers, u32 follow_buffer_next); + u32 (*buffer_free_callback) (struct vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 follow_buffer_next); /* Pool of buffer free lists. Multiple free lists exist for packet generator which uses separate free lists for each packet stream --- so as to avoid @@ -417,12 +425,12 @@ typedef struct /* Callbacks */ vlib_buffer_callbacks_t cb; - int extern_buffer_mgmt; + int callbacks_registered; } vlib_buffer_main_t; +void vlib_buffer_add_mem_range (struct vlib_main_t *vm, uword start, + uword size); void vlib_buffer_cb_init (struct vlib_main_t *vm); -int vlib_buffer_cb_register (struct vlib_main_t *vm, - vlib_buffer_callbacks_t * cb); typedef struct { @@ -498,6 +506,18 @@ serialize_vlib_buffer_n_bytes (serialize_main_t * m) #endif /* included_vlib_buffer_h */ +#define VLIB_BUFFER_REGISTER_CALLBACKS(x,...) \ + __VA_ARGS__ vlib_buffer_callbacks_t __##x##_buffer_callbacks; \ +static void __vlib_add_buffer_callbacks_t_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_buffer_callbacks_t_##x (void) \ +{ \ + if (vlib_buffer_callbacks) \ + clib_panic ("vlib buffer callbacks already registered"); \ + vlib_buffer_callbacks = &__##x##_buffer_callbacks; \ +} \ +__VA_ARGS__ vlib_buffer_callbacks_t __##x##_buffer_callbacks + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 328660a3..79e3e69c 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -56,8 +56,11 @@ always_inline vlib_buffer_t * vlib_get_buffer (vlib_main_t * vm, u32 buffer_index) { - return vlib_physmem_at_offset (&vm->physmem_main, ((uword) buffer_index) - << CLIB_LOG2_CACHE_LINE_BYTES); + vlib_buffer_main_t *bm = vm->buffer_main; + uword offset = ((uword) buffer_index) << CLIB_LOG2_CACHE_LINE_BYTES; + ASSERT (offset < bm->buffer_mem_size); + + return uword_to_pointer (bm->buffer_mem_start + offset, void *); } /** \brief Translate buffer pointer into buffer index @@ -66,10 +69,14 @@ vlib_get_buffer (vlib_main_t * vm, u32 buffer_index) @param p - (void *) buffer pointer @return - (u32) buffer index */ + always_inline u32 vlib_get_buffer_index (vlib_main_t * vm, void *p) { - uword offset = vlib_physmem_offset_of (&vm->physmem_main, p); + vlib_buffer_main_t *bm = vm->buffer_main; + uword offset = pointer_to_uword (p) - bm->buffer_mem_start; + ASSERT (pointer_to_uword (p) >= bm->buffer_mem_start); + ASSERT (offset < bm->buffer_mem_size); ASSERT ((offset % (1 << CLIB_LOG2_CACHE_LINE_BYTES)) == 0); return offset >> CLIB_LOG2_CACHE_LINE_BYTES; } diff --git a/src/vlib/main.c b/src/vlib/main.c index 19d70232..73548fbe 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -43,6 +43,7 @@ #include #include +#include #include CJ_GLOBAL_LOG_PROTOTYPE; @@ -466,7 +467,7 @@ vlib_put_next_frame (vlib_main_t * vm, vlib_frame_t *f; u32 n_vectors_in_frame; - if (vm->buffer_main->extern_buffer_mgmt == 0 && CLIB_DEBUG > 0) + if (vm->buffer_main->callbacks_registered == 0 && CLIB_DEBUG > 0) vlib_put_next_frame_validate (vm, r, next_index, n_vectors_left); nf = vlib_node_runtime_get_next_frame (vm, r, next_index); @@ -1712,7 +1713,22 @@ vlib_main (vlib_main_t * volatile vm, unformat_input_t * input) vm->name = "VLIB"; vec_validate (vm->buffer_main, 0); - vlib_buffer_cb_init (vm); + if (vlib_buffer_callbacks) + { + /* external plugin has registered own buffer callbacks + so we just copy them */ + vlib_buffer_main_t *bm = vm->buffer_main; + clib_memcpy (&bm->cb, vlib_buffer_callbacks, + sizeof (vlib_buffer_callbacks_t)); + bm->callbacks_registered = 1; + } + else + { + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_buffer_cb_init (vm); + unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ ); + vlib_buffer_add_mem_range (vm, vpm->virtual.start, vpm->virtual.size); + } if ((error = vlib_thread_init (vm))) { diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c index 8d10ad2e..933cc63b 100644 --- a/src/vlib/unix/physmem.c +++ b/src/vlib/unix/physmem.c @@ -45,14 +45,10 @@ static void * unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, uword alignment) { - vlib_main_t *vm = vlib_get_main (); physmem_main_t *pm = &physmem_main; uword lo_offset, hi_offset; uword *to_free = 0; - if (vm->buffer_main->extern_buffer_mgmt) - clib_warning ("unsafe alloc!"); - /* IO memory is always at least cache aligned. */ alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); @@ -270,11 +266,6 @@ show_physmem (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { physmem_main_t *pm = &physmem_main; - if (vm->buffer_main->extern_buffer_mgmt) - { - vlib_cli_output (vm, "Not supported with external buffer management."); - return 0; - } if (pm->heap) vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1); diff --git a/src/vnet/pg/input.c b/src/vnet/pg/input.c index 597ae060..c3738a6a 100644 --- a/src/vnet/pg/input.c +++ b/src/vnet/pg/input.c @@ -1214,7 +1214,7 @@ pg_stream_fill_helper (pg_main_t * pg, * Historically, the pg maintained its own free lists and * device drivers tx paths would return pkts. */ - if (vm->buffer_main->extern_buffer_mgmt == 0 && + if (vm->buffer_main->callbacks_registered == 0 && !(s->flags & PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE)) f->buffer_init_function = pg_buffer_init; f->buffer_init_function_opaque = @@ -1238,7 +1238,7 @@ pg_stream_fill_helper (pg_main_t * pg, n_alloc = n_allocated; /* Reinitialize buffers */ - if (vm->buffer_main->extern_buffer_mgmt == 0 || CLIB_DEBUG > 0 + if (vm->buffer_main->callbacks_registered == 0 || CLIB_DEBUG > 0 || (s->flags & PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE)) init_buffers_inline (vm, s, @@ -1246,7 +1246,7 @@ pg_stream_fill_helper (pg_main_t * pg, n_alloc, (bi - s->buffer_indices) * s->buffer_bytes /* data offset */ , s->buffer_bytes, /* set_data */ - vm->buffer_main->extern_buffer_mgmt != 0 + vm->buffer_main->callbacks_registered != 0 || (s->flags & PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE) != 0); if (next_buffers) diff --git a/src/vnet/pg/stream.c b/src/vnet/pg/stream.c index 05d820a3..a540b32b 100644 --- a/src/vnet/pg/stream.c +++ b/src/vnet/pg/stream.c @@ -438,7 +438,7 @@ pg_stream_add (pg_main_t * pg, pg_stream_t * s_init) pg_buffer_index_t *bi; int n; - if (vm->buffer_main->extern_buffer_mgmt) + if (vm->buffer_main->callbacks_registered) s->buffer_bytes = VLIB_BUFFER_DATA_SIZE; if (!s->buffer_bytes) diff --git a/src/vnet/replication.c b/src/vnet/replication.c index 233a8c2f..1c6f28d2 100644 --- a/src/vnet/replication.c +++ b/src/vnet/replication.c @@ -214,7 +214,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) b0->flags |= VLIB_BUFFER_IS_RECYCLED; #if (CLIB_DEBUG > 0) - if (vm->buffer_main->extern_buffer_mgmt == 0) + if (vm->buffer_main->callbacks_registered == 0) vlib_buffer_set_known_state (vm, bi0, VLIB_BUFFER_KNOWN_ALLOCATED); #endif -- cgit 1.2.3-korg From 072401e8096c648b91f958bd911f64ce24fecff9 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 13 Jul 2017 18:53:27 +0200 Subject: Introduce l{2,3,4}_hdr_offset fields in the buffer metadata To save space in the first cacheline following is changed: - total_length_not_including_first_buffer moved to the 2nd cacheline. This field is used only when VLIB_BUFFER_TOTAL_LENGTH_VALID and VLIB_BUFFER_NEXT_PRESENT are both set. - free_list_index is now stored in 4bits inside flags, which allows up to 16 free lists. In case we need more we can store index in the 2nd cachelin Change-Id: Ic8521350819391af470d31d3fa1013e67ecb7681 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/node.c | 8 ++++++- src/vlib/buffer.c | 16 ++++++++----- src/vlib/buffer.h | 40 +++++++++++++++++--------------- src/vlib/buffer_funcs.h | 50 +++++++++++++++++++++++++++++----------- src/vnet/bfd/bfd_udp.c | 4 ++-- src/vnet/buffer.h | 14 +++-------- src/vnet/dhcp/dhcp4_proxy_node.c | 2 +- src/vnet/dhcp/dhcp6_proxy_node.c | 2 +- src/vnet/ethernet/ethernet.h | 3 +-- src/vnet/ethernet/node.c | 23 ++++++++---------- src/vnet/ip/ip4_forward.c | 6 ++--- src/vnet/ip/ip6_forward.c | 6 ++--- src/vnet/ip/ip6_neighbor.c | 19 +++++++-------- src/vnet/l2/l2_bvi.h | 2 +- src/vnet/lisp-cp/control.c | 2 +- src/vnet/replication.c | 6 ++--- 16 files changed, 111 insertions(+), 92 deletions(-) (limited to 'src/vlib/buffer.c') diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index 69acc529..74fb8da1 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -208,7 +208,13 @@ dpdk_process_subseq_segs (vlib_main_t * vm, vlib_buffer_t * b, mb_seg = mb->next; b_chain = b; - while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs)) + if (mb->nb_segs < 2) + return; + + b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + b->total_length_not_including_first_buffer = 0; + + while (nb_seg < mb->nb_segs) { ASSERT (mb_seg != 0); diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index b2a095cf..53b60c16 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -72,8 +72,8 @@ format_vlib_buffer (u8 * s, va_list * args) uword indent = format_get_indent (s); s = format (s, "current data %d, length %d, free-list %d, clone-count %u", - b->current_data, b->current_length, b->free_list_index, - b->n_add_refs); + b->current_data, b->current_length, + vlib_buffer_get_free_list_index (b), b->n_add_refs); if (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID) s = format (s, ", totlen-nifb %d", @@ -163,10 +163,14 @@ vlib_validate_buffer_helper (vlib_main_t * vm, vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *fl; - if (pool_is_free_index (bm->buffer_free_list_pool, b->free_list_index)) - return format (0, "unknown free list 0x%x", b->free_list_index); + if (pool_is_free_index + (bm->buffer_free_list_pool, vlib_buffer_get_free_list_index (b))) + return format (0, "unknown free list 0x%x", + vlib_buffer_get_free_list_index (b)); - fl = pool_elt_at_index (bm->buffer_free_list_pool, b->free_list_index); + fl = + pool_elt_at_index (bm->buffer_free_list_pool, + vlib_buffer_get_free_list_index (b)); if ((signed) b->current_data < (signed) -VLIB_BUFFER_PRE_DATA_SIZE) return format (0, "current data %d before pre-data", b->current_data); @@ -388,7 +392,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, f->name = clib_mem_is_vec (name) ? name : format (0, "%s", name); /* Setup free buffer template. */ - f->buffer_init_template.free_list_index = f->index; + vlib_buffer_set_free_list_index (&f->buffer_init_template, f->index); f->buffer_init_template.n_add_refs = 0; if (is_public) diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index b20538b7..c810db4e 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -72,6 +72,7 @@ typedef struct the end of this buffer. */ u32 flags; /**< buffer flags: +
VLIB_BUFFER_FREE_LIST_INDEX_MASK: bits used to store free list index,
VLIB_BUFFER_IS_TRACED: trace this buffer.
VLIB_BUFFER_NEXT_PRESENT: this is a multi-chunk buffer.
VLIB_BUFFER_TOTAL_LENGTH_VALID: as it says @@ -82,28 +83,26 @@ typedef struct set to avoid adding it to a flow report
VLIB_BUFFER_FLAG_USER(n): user-defined bit N */ -#define VLIB_BUFFER_IS_TRACED (1 << 0) -#define VLIB_BUFFER_LOG2_NEXT_PRESENT (1) + +/* any change to the following line requres update of + * vlib_buffer_get_free_list_index(...) and + * vlib_buffer_set_free_list_index(...) functions */ +#define VLIB_BUFFER_FREE_LIST_INDEX_MASK ((1 << 4) - 1) + +#define VLIB_BUFFER_IS_TRACED (1 << 4) +#define VLIB_BUFFER_LOG2_NEXT_PRESENT (5) #define VLIB_BUFFER_NEXT_PRESENT (1 << VLIB_BUFFER_LOG2_NEXT_PRESENT) -#define VLIB_BUFFER_IS_RECYCLED (1 << 2) -#define VLIB_BUFFER_TOTAL_LENGTH_VALID (1 << 3) -#define VLIB_BUFFER_REPL_FAIL (1 << 4) -#define VLIB_BUFFER_RECYCLE (1 << 5) -#define VLIB_BUFFER_FLOW_REPORT (1 << 6) -#define VLIB_BUFFER_EXT_HDR_VALID (1 << 7) +#define VLIB_BUFFER_IS_RECYCLED (1 << 6) +#define VLIB_BUFFER_TOTAL_LENGTH_VALID (1 << 7) +#define VLIB_BUFFER_REPL_FAIL (1 << 8) +#define VLIB_BUFFER_RECYCLE (1 << 9) +#define VLIB_BUFFER_FLOW_REPORT (1 << 10) +#define VLIB_BUFFER_EXT_HDR_VALID (1 << 11) /* User defined buffer flags. */ #define LOG2_VLIB_BUFFER_FLAG_USER(n) (32 - (n)) #define VLIB_BUFFER_FLAG_USER(n) (1 << LOG2_VLIB_BUFFER_FLAG_USER(n)) - u32 free_list_index; /**< Buffer free list that this buffer was - allocated from and will be freed to. - */ - - u32 total_length_not_including_first_buffer; - /**< Only valid for first buffer in chain. Current length plus - total length given here give total number of bytes in buffer chain. - */ STRUCT_MARK (template_end); u32 next_buffer; /**< Next buffer for this linked-list of buffers. @@ -128,7 +127,7 @@ typedef struct Before allocating any of it, discussion required! */ - u32 opaque[8]; /**< Opaque data used by sub-graphs for their own purposes. + u32 opaque[10]; /**< Opaque data used by sub-graphs for their own purposes. See .../vnet/vnet/buffer.h */ CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); @@ -137,7 +136,12 @@ typedef struct if VLIB_PACKET_IS_TRACED flag is set. */ u32 recycle_count; /**< Used by L2 path recycle code */ - u32 opaque2[14]; /**< More opaque data, currently unused */ + + u32 total_length_not_including_first_buffer; + /**< Only valid for first buffer in chain. Current length plus + total length given here give total number of bytes in buffer chain. + */ + u32 opaque2[13]; /**< More opaque data, currently unused */ /***** end of second cache line */ CLIB_CACHE_LINE_ALIGN_MARK (cacheline2); diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 97442e12..1aaac0b2 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -106,12 +106,15 @@ uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, always_inline uword vlib_buffer_length_in_chain (vlib_main_t * vm, vlib_buffer_t * b) { - uword l = b->current_length + b->total_length_not_including_first_buffer; - if (PREDICT_FALSE ((b->flags & (VLIB_BUFFER_NEXT_PRESENT - | VLIB_BUFFER_TOTAL_LENGTH_VALID)) - == VLIB_BUFFER_NEXT_PRESENT)) - return vlib_buffer_length_in_chain_slow_path (vm, b); - return l; + uword len = b->current_length; + + if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0)) + return len; + + if (PREDICT_TRUE (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID)) + return len + b->total_length_not_including_first_buffer; + + return vlib_buffer_length_in_chain_slow_path (vm, b); } /** \brief Get length in bytes of the buffer index buffer chain @@ -261,6 +264,24 @@ vlib_buffer_round_size (u32 size) return round_pow2 (size, sizeof (vlib_buffer_t)); } +always_inline u32 +vlib_buffer_get_free_list_index (vlib_buffer_t * b) +{ + return b->flags & VLIB_BUFFER_FREE_LIST_INDEX_MASK; +} + +always_inline void +vlib_buffer_set_free_list_index (vlib_buffer_t * b, u32 index) +{ + /* if there is an need for more free lists we should consider + storig data in the 2nd cacheline */ + ASSERT (VLIB_BUFFER_FREE_LIST_INDEX_MASK & 1); + ASSERT (index <= VLIB_BUFFER_FREE_LIST_INDEX_MASK); + + b->flags &= ~VLIB_BUFFER_FREE_LIST_INDEX_MASK; + b->flags |= index & VLIB_BUFFER_FREE_LIST_INDEX_MASK; +} + /** \brief Allocate buffers from specific freelist into supplied array @param vm - (vlib_main_t *) vlib main data structure pointer @@ -381,7 +402,7 @@ vlib_buffer_get_buffer_free_list (vlib_main_t * vm, vlib_buffer_t * b, vlib_buffer_main_t *bm = vm->buffer_main; u32 i; - *index = i = b->free_list_index; + *index = i = vlib_buffer_get_free_list_index (b); return pool_elt_at_index (bm->buffer_free_list_pool, i); } @@ -569,7 +590,8 @@ vlib_buffer_clone (vlib_main_t * vm, u32 src_buffer, u32 * buffers, } n_buffers = vlib_buffer_alloc_from_free_list (vm, buffers, n_buffers, - s->free_list_index); + vlib_buffer_get_free_list_index + (s)); if (PREDICT_FALSE (n_buffers == 0)) { buffers[0] = src_buffer; @@ -581,7 +603,8 @@ vlib_buffer_clone (vlib_main_t * vm, u32 src_buffer, u32 * buffers, vlib_buffer_t *d = vlib_get_buffer (vm, buffers[i]); d->current_data = s->current_data; d->current_length = head_end_offset; - d->free_list_index = s->free_list_index; + vlib_buffer_set_free_list_index (d, + vlib_buffer_get_free_list_index (s)); d->total_length_not_including_first_buffer = s->total_length_not_including_first_buffer + s->current_length - head_end_offset; @@ -615,7 +638,8 @@ vlib_buffer_attach_clone (vlib_main_t * vm, vlib_buffer_t * head, vlib_buffer_t * tail) { ASSERT ((head->flags & VLIB_BUFFER_NEXT_PRESENT) == 0); - ASSERT (head->free_list_index == tail->free_list_index); + ASSERT (vlib_buffer_get_free_list_index (head) == + vlib_buffer_get_free_list_index (tail)); head->flags |= VLIB_BUFFER_NEXT_PRESENT; head->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID; @@ -791,7 +815,7 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst, CLIB_CACHE_LINE_BYTES * 2); /* Make sure buffer template is sane. */ - ASSERT (fl->index == fl->buffer_init_template.free_list_index); + ASSERT (fl->index == vlib_buffer_get_free_list_index (src)); clib_memcpy (STRUCT_MARK_PTR (dst, template_start), STRUCT_MARK_PTR (src, template_start), @@ -806,7 +830,6 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst, _(current_data); _(current_length); _(flags); - _(free_list_index); #undef _ ASSERT (dst->total_length_not_including_first_buffer == 0); ASSERT (dst->n_add_refs == 0); @@ -832,7 +855,7 @@ vlib_buffer_init_two_for_free_list (vlib_buffer_t * dst0, vlib_buffer_t *src = &fl->buffer_init_template; /* Make sure buffer template is sane. */ - ASSERT (fl->index == fl->buffer_init_template.free_list_index); + ASSERT (fl->index == vlib_buffer_get_free_list_index (src)); clib_memcpy (STRUCT_MARK_PTR (dst0, template_start), STRUCT_MARK_PTR (src, template_start), @@ -853,7 +876,6 @@ vlib_buffer_init_two_for_free_list (vlib_buffer_t * dst0, _(current_data); _(current_length); _(flags); - _(free_list_index); #undef _ ASSERT (dst0->total_length_not_including_first_buffer == 0); diff --git a/src/vnet/bfd/bfd_udp.c b/src/vnet/bfd/bfd_udp.c index 346c5495..06b843c6 100644 --- a/src/vnet/bfd/bfd_udp.c +++ b/src/vnet/bfd/bfd_udp.c @@ -843,7 +843,7 @@ bfd_udp4_find_headers (vlib_buffer_t * b, ip4_header_t ** ip4, udp_header_t ** udp) { /* sanity check first */ - const i32 start = vnet_buffer (b)->ip.start_of_ip_header; + const i32 start = vnet_buffer (b)->l3_hdr_offset; if (start < 0 && start < sizeof (b->pre_data)) { BFD_ERR ("Start of ip header is before pre_data, ignoring"); @@ -1000,7 +1000,7 @@ bfd_udp6_find_headers (vlib_buffer_t * b, ip6_header_t ** ip6, udp_header_t ** udp) { /* sanity check first */ - const i32 start = vnet_buffer (b)->ip.start_of_ip_header; + const i32 start = vnet_buffer (b)->l3_hdr_offset; if (start < 0 && start < sizeof (b->pre_data)) { BFD_ERR ("Start of ip header is before pre_data, ignoring"); diff --git a/src/vnet/buffer.h b/src/vnet/buffer.h index 9aba34da..8647db00 100644 --- a/src/vnet/buffer.h +++ b/src/vnet/buffer.h @@ -71,7 +71,6 @@ #define VNET_BUFFER_SPAN_CLONE (1 << LOG2_VNET_BUFFER_SPAN_CLONE) #define foreach_buffer_opaque_union_subtype \ -_(ethernet) \ _(ip) \ _(swt) \ _(l2) \ @@ -100,16 +99,12 @@ _(tcp) typedef struct { u32 sw_if_index[VLIB_N_RX_TX]; + i16 l2_hdr_offset; + i16 l3_hdr_offset; + i16 l4_hdr_offset; union { - /* Ethernet. */ - struct - { - /* Saved value of current header by ethernet-input. */ - i32 start_of_ethernet_header; - } ethernet; - /* IP4/6 buffer opaque. */ struct { @@ -143,9 +138,6 @@ typedef struct u8 code; u32 data; } icmp; - - /* IP header offset from vlib_buffer.data - saved by ip*_local nodes */ - i32 start_of_ip_header; }; } ip; diff --git a/src/vnet/dhcp/dhcp4_proxy_node.c b/src/vnet/dhcp/dhcp4_proxy_node.c index 26e1e65c..1b59cdea 100644 --- a/src/vnet/dhcp/dhcp4_proxy_node.c +++ b/src/vnet/dhcp/dhcp4_proxy_node.c @@ -231,7 +231,7 @@ dhcp_proxy_to_server_input (vlib_main_t * vm, o = (dhcp_option_t *) (((uword) o) + (o->length + 2)); } - fl = vlib_buffer_get_free_list (vm, b0->free_list_index); + fl = vlib_buffer_get_free_list (vm, vlib_buffer_get_free_list_index (b0)); // start write at (option*)o, some packets have padding if (((u8 *)o - (u8 *)b0->data + VPP_DHCP_OPTION82_SIZE) > fl->n_data_bytes) { diff --git a/src/vnet/dhcp/dhcp6_proxy_node.c b/src/vnet/dhcp/dhcp6_proxy_node.c index 885313a5..e109cc4c 100644 --- a/src/vnet/dhcp/dhcp6_proxy_node.c +++ b/src/vnet/dhcp/dhcp6_proxy_node.c @@ -306,7 +306,7 @@ dhcpv6_proxy_to_server_input (vlib_main_t * vm, copy_ip6_address(&r1->link_addr, ia0); link_address_set: - fl = vlib_buffer_get_free_list (vm, b0->free_list_index); + fl = vlib_buffer_get_free_list (vm, vlib_buffer_get_free_list_index (b0)); if ((b0->current_length+sizeof(*id1)+sizeof(*vss1)+sizeof(*cmac)) > fl->n_data_bytes) diff --git a/src/vnet/ethernet/ethernet.h b/src/vnet/ethernet/ethernet.h index dcc656a7..2fc5b804 100644 --- a/src/vnet/ethernet/ethernet.h +++ b/src/vnet/ethernet/ethernet.h @@ -344,8 +344,7 @@ ethernet_setup_node (vlib_main_t * vm, u32 node_index) always_inline ethernet_header_t * ethernet_buffer_get_header (vlib_buffer_t * b) { - return (void *) - (b->data + vnet_buffer (b)->ethernet.start_of_ethernet_header); + return (void *) (b->data + vnet_buffer (b)->l2_hdr_offset); } /** Returns the number of VLAN headers in the current Ethernet frame in the diff --git a/src/vnet/ethernet/node.c b/src/vnet/ethernet/node.c index d9fdff48..421d501a 100755 --- a/src/vnet/ethernet/node.c +++ b/src/vnet/ethernet/node.c @@ -101,7 +101,7 @@ parse_header (ethernet_input_variant_t variant, e0 = (void *) (b0->data + b0->current_data); - vnet_buffer (b0)->ethernet.start_of_ethernet_header = b0->current_data; + vnet_buffer (b0)->l2_hdr_offset = b0->current_data; vlib_buffer_advance (b0, sizeof (e0[0])); @@ -205,9 +205,7 @@ identify_subint (vnet_hw_interface_t * hi, if (!(*is_l2)) { ethernet_header_t *e0; - e0 = - (void *) (b0->data + - vnet_buffer (b0)->ethernet.start_of_ethernet_header); + e0 = (void *) (b0->data + vnet_buffer (b0)->l2_hdr_offset); if (!(ethernet_address_cast (e0->dst_address))) { @@ -238,7 +236,7 @@ determine_next_node (ethernet_main_t * em, { *next0 = em->l2_next; // record the L2 len and reset the buffer so the L2 header is preserved - u32 eth_start = vnet_buffer (b0)->ethernet.start_of_ethernet_header; + u32 eth_start = vnet_buffer (b0)->l2_hdr_offset; vnet_buffer (b0)->l2.l2_len = b0->current_data - eth_start; ASSERT (vnet_buffer (b0)->l2.l2_len == ethernet_buffer_header_size (b0)); @@ -424,10 +422,8 @@ ethernet_input_inline (vlib_main_t * vm, cached_is_l2 = is_l20 = subint0->flags & SUBINT_CONFIG_L2; } - vnet_buffer (b0)->ethernet.start_of_ethernet_header = - b0->current_data; - vnet_buffer (b1)->ethernet.start_of_ethernet_header = - b1->current_data; + vnet_buffer (b0)->l2_hdr_offset = b0->current_data; + vnet_buffer (b1)->l2_hdr_offset = b1->current_data; if (PREDICT_TRUE (is_l20 != 0)) { @@ -519,9 +515,9 @@ ethernet_input_inline (vlib_main_t * vm, { len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data - - vnet_buffer (b0)->ethernet.start_of_ethernet_header; + - vnet_buffer (b0)->l2_hdr_offset; len1 = vlib_buffer_length_in_chain (vm, b1) + b1->current_data - - vnet_buffer (b1)->ethernet.start_of_ethernet_header; + - vnet_buffer (b1)->l2_hdr_offset; stats_n_packets += 2; stats_n_bytes += len0 + len1; @@ -646,8 +642,7 @@ ethernet_input_inline (vlib_main_t * vm, cached_is_l2 = is_l20 = subint0->flags & SUBINT_CONFIG_L2; } - vnet_buffer (b0)->ethernet.start_of_ethernet_header = - b0->current_data; + vnet_buffer (b0)->l2_hdr_offset = b0->current_data; if (PREDICT_TRUE (is_l20 != 0)) { @@ -710,7 +705,7 @@ ethernet_input_inline (vlib_main_t * vm, { len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data - - vnet_buffer (b0)->ethernet.start_of_ethernet_header; + - vnet_buffer (b0)->l2_hdr_offset; stats_n_packets += 1; stats_n_bytes += len0; diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 8263e01c..b8dfa847 100755 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -1585,8 +1585,8 @@ ip4_local_inline (vlib_main_t * vm, ip0 = vlib_buffer_get_current (p0); ip1 = vlib_buffer_get_current (p1); - vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; - vnet_buffer (p1)->ip.start_of_ip_header = p1->current_data; + vnet_buffer (p0)->l3_hdr_offset = p0->current_data; + vnet_buffer (p1)->l3_hdr_offset = p1->current_data; sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; sw_if_index1 = vnet_buffer (p1)->sw_if_index[VLIB_RX]; @@ -1788,7 +1788,7 @@ ip4_local_inline (vlib_main_t * vm, ip0 = vlib_buffer_get_current (p0); - vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; + vnet_buffer (p0)->l3_hdr_offset = p0->current_data; sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c index 4b574b9a..2b8c2bd2 100644 --- a/src/vnet/ip/ip6_forward.c +++ b/src/vnet/ip/ip6_forward.c @@ -1362,8 +1362,8 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip0 = vlib_buffer_get_current (p0); ip1 = vlib_buffer_get_current (p1); - vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; - vnet_buffer (p1)->ip.start_of_ip_header = p1->current_data; + vnet_buffer (p0)->l3_hdr_offset = p0->current_data; + vnet_buffer (p1)->l3_hdr_offset = p1->current_data; type0 = lm->builtin_protocol_by_ip_protocol[ip0->protocol]; type1 = lm->builtin_protocol_by_ip_protocol[ip1->protocol]; @@ -1493,7 +1493,7 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip0 = vlib_buffer_get_current (p0); - vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; + vnet_buffer (p0)->l3_hdr_offset = p0->current_data; type0 = lm->builtin_protocol_by_ip_protocol[ip0->protocol]; next0 = lm->local_next_by_ip_protocol[ip0->protocol]; diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c index b8f6f9b1..68a8cbbc 100644 --- a/src/vnet/ip/ip6_neighbor.c +++ b/src/vnet/ip/ip6_neighbor.c @@ -1479,9 +1479,8 @@ icmp6_router_solicitation (vlib_main_t * vm, sizeof (icmp6_router_advertisement_header_t); vlib_buffer_add_data (vm, - p0->free_list_index, - bi0, - (void *) &rh, + vlib_buffer_get_free_list_index + (p0), bi0, (void *) &rh, sizeof (icmp6_router_advertisement_header_t)); @@ -1499,9 +1498,8 @@ icmp6_router_solicitation (vlib_main_t * vm, eth_if0->address, 6); vlib_buffer_add_data (vm, - p0->free_list_index, - bi0, - (void *) &h, + vlib_buffer_get_free_list_index + (p0), bi0, (void *) &h, sizeof (icmp6_neighbor_discovery_ethernet_link_layer_address_option_t)); @@ -1525,9 +1523,8 @@ icmp6_router_solicitation (vlib_main_t * vm, sizeof (icmp6_neighbor_discovery_mtu_option_t); vlib_buffer_add_data (vm, - p0->free_list_index, - bi0, - (void *) &h, + vlib_buffer_get_free_list_index + (p0), bi0, (void *) &h, sizeof (icmp6_neighbor_discovery_mtu_option_t)); } @@ -1579,7 +1576,7 @@ icmp6_router_solicitation (vlib_main_t * vm, payload_length += sizeof( icmp6_neighbor_discovery_prefix_information_option_t); vlib_buffer_add_data (vm, - p0->free_list_index, + vlib_buffer_get_free_list_index (p0), bi0, (void *)&h, sizeof(icmp6_neighbor_discovery_prefix_information_option_t)); @@ -2326,7 +2323,7 @@ ip6_neighbor_send_mldpv2_report (u32 sw_if_index) num_addr_records++; vlib_buffer_add_data - (vm, b0->free_list_index, bo0, + (vm, vlib_buffer_get_free_list_index (b0), bo0, (void *)&rr, sizeof(icmp6_multicast_address_record_t)); payload_length += sizeof( icmp6_multicast_address_record_t); diff --git a/src/vnet/l2/l2_bvi.h b/src/vnet/l2/l2_bvi.h index e21a1616..662ec402 100644 --- a/src/vnet/l2/l2_bvi.h +++ b/src/vnet/l2/l2_bvi.h @@ -57,7 +57,7 @@ l2_to_bvi (vlib_main_t * vlib_main, } /* Save L2 header position which may be changed due to packet replication */ - vnet_buffer (b0)->ethernet.start_of_ethernet_header = b0->current_data; + vnet_buffer (b0)->l2_hdr_offset = b0->current_data; /* Strip L2 header */ l2_len = vnet_buffer (b0)->l2.l2_len; diff --git a/src/vnet/lisp-cp/control.c b/src/vnet/lisp-cp/control.c index 22b5c82c..d8a1372d 100644 --- a/src/vnet/lisp-cp/control.c +++ b/src/vnet/lisp-cp/control.c @@ -3706,7 +3706,7 @@ send_map_reply (lisp_cp_main_t * lcm, u32 mi, ip_address_t * dst, static void find_ip_header (vlib_buffer_t * b, u8 ** ip_hdr) { - const i32 start = vnet_buffer (b)->ip.start_of_ip_header; + const i32 start = vnet_buffer (b)->l3_hdr_offset; if (start < 0 && start < -sizeof (b->pre_data)) { *ip_hdr = 0; diff --git a/src/vnet/replication.c b/src/vnet/replication.c index 1c6f28d2..0fdca0bf 100644 --- a/src/vnet/replication.c +++ b/src/vnet/replication.c @@ -43,12 +43,12 @@ replication_prep (vlib_main_t * vm, ctx_id = ctx - rm->contexts[thread_index]; /* Save state from vlib buffer */ - ctx->saved_free_list_index = b0->free_list_index; + ctx->saved_free_list_index = vlib_buffer_get_free_list_index (b0); ctx->current_data = b0->current_data; /* Set up vlib buffer hooks */ b0->recycle_count = ctx_id; - b0->free_list_index = rm->recycle_list_index; + vlib_buffer_set_free_list_index (b0, rm->recycle_list_index); b0->flags |= VLIB_BUFFER_RECYCLE; /* Save feature state */ @@ -129,7 +129,7 @@ replication_recycle (vlib_main_t * vm, vlib_buffer_t * b0, u32 is_last) * This is the last replication in the list. * Restore original buffer free functionality. */ - b0->free_list_index = ctx->saved_free_list_index; + vlib_buffer_set_free_list_index (b0, ctx->saved_free_list_index); b0->flags &= ~VLIB_BUFFER_RECYCLE; /* Free context back to its pool */ -- cgit 1.2.3-korg From 6b0f5892833254438adaaf786b7195c82e3fdd30 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 27 Jul 2017 04:01:24 -0400 Subject: Thread safe internal buffer manager Change-Id: I45845b952aa42a854e1c2c396b85f905de987020 Signed-off-by: Damjan Marion --- src/vlib/buffer.c | 7 +------ src/vlib/buffer.h | 2 ++ src/vlib/buffer_funcs.h | 15 +++++++++++++-- src/vlib/unix/physmem.c | 3 ++- 4 files changed, 18 insertions(+), 9 deletions(-) (limited to 'src/vlib/buffer.c') diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 53b60c16..a5c955c7 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -305,12 +305,6 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, if (CLIB_DEBUG == 0) return; - ASSERT (vlib_get_thread_index () == 0); - - /* smp disaster check */ - if (vec_len (vlib_mains) > 1) - ASSERT (vm == vlib_mains[0]); - is_free = expected_state == VLIB_BUFFER_KNOWN_ALLOCATED; b = buffers; for (i = 0; i < n_buffers; i++) @@ -1050,6 +1044,7 @@ vlib_buffer_cb_init (struct vlib_main_t *vm) bm->cb.vlib_buffer_free_no_next_cb = &vlib_buffer_free_no_next_internal; bm->cb.vlib_buffer_delete_free_list_cb = &vlib_buffer_delete_free_list_internal; + clib_spinlock_init (&bm->buffer_known_hash_lockp); } /** @endcond */ diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 77528e77..9047ca9a 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -44,6 +44,7 @@ #include #include #include +#include #include /* for vlib_error_t */ #include /* for __PRE_DATA_SIZE */ @@ -423,6 +424,7 @@ typedef struct If buffer index is not in hash table then this buffer has never been allocated. */ uword *buffer_known_hash; + clib_spinlock_t buffer_known_hash_lockp; /* List of free-lists needing Blue Light Special announcements */ vlib_buffer_free_list_t **announce_list; diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 1aaac0b2..72008dad 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -219,9 +219,10 @@ always_inline vlib_buffer_known_state_t vlib_buffer_is_known (vlib_main_t * vm, u32 buffer_index) { vlib_buffer_main_t *bm = vm->buffer_main; - ASSERT (vlib_get_thread_index () == 0); + clib_spinlock_lock (&bm->buffer_known_hash_lockp); uword *p = hash_get (bm->buffer_known_hash, buffer_index); + clib_spinlock_unlock (&bm->buffer_known_hash_lockp); return p ? p[0] : VLIB_BUFFER_UNKNOWN; } @@ -231,8 +232,9 @@ vlib_buffer_set_known_state (vlib_main_t * vm, vlib_buffer_known_state_t state) { vlib_buffer_main_t *bm = vm->buffer_main; - ASSERT (vlib_get_thread_index () == 0); + clib_spinlock_lock (&bm->buffer_known_hash_lockp); hash_set (bm->buffer_known_hash, buffer_index, state); + clib_spinlock_unlock (&bm->buffer_known_hash_lockp); } /* Validates sanity of a single buffer. @@ -841,10 +843,19 @@ vlib_buffer_add_to_free_list (vlib_main_t * vm, u32 buffer_index, u8 do_init) { vlib_buffer_t *b; + u32 i; b = vlib_get_buffer (vm, buffer_index); if (PREDICT_TRUE (do_init)) vlib_buffer_init_for_free_list (b, f); vec_add1_aligned (f->buffers, buffer_index, CLIB_CACHE_LINE_BYTES); + + if (vec_len (f->buffers) > 3 * VLIB_FRAME_SIZE) + { + /* keep last stored buffers, as they are more likely hot in the cache */ + for (i = 0; i < VLIB_FRAME_SIZE; i++) + vm->os_physmem_free (vlib_get_buffer (vm, i)); + vec_delete (f->buffers, VLIB_FRAME_SIZE, 0); + } } always_inline void diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c index 933cc63b..27a5bacf 100644 --- a/src/vlib/unix/physmem.c +++ b/src/vlib/unix/physmem.c @@ -155,7 +155,8 @@ htlb_init (vlib_main_t * vm) pm->heap = mheap_alloc_with_flags (pm->mem, pm->mem_size, /* Don't want mheap mmap/munmap with IO memory. */ - MHEAP_FLAG_DISABLE_VM); + MHEAP_FLAG_DISABLE_VM | + MHEAP_FLAG_THREAD_SAFE); cur = pointer_to_uword (pm->mem); i = 0; -- cgit 1.2.3-korg From b6a8ed7fa0709bbf8e091826803f50e6330689cf Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 29 Aug 2017 00:15:35 +0200 Subject: Thread safe internal buffer manager, take two First attempt to make internal buffer manager thread safe was not succesfull, so trying again. This time with more testing. Change-Id: I01b8385a9c26d233934a3339255ea4bd31c865ac Signed-off-by: Damjan Marion --- src/vlib/buffer.c | 19 +++++++++++++++++++ src/vlib/buffer.h | 6 ++++++ src/vlib/buffer_funcs.h | 11 +++++++---- 3 files changed, 32 insertions(+), 4 deletions(-) (limited to 'src/vlib/buffer.c') diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index a5c955c7..908368c0 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -396,6 +396,8 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, hash_set (bm->free_list_by_size, f->n_data_bytes, f->index); } + clib_spinlock_init (&f->global_buffers_lock); + for (i = 1; i < vec_len (vlib_mains); i++) { vlib_buffer_main_t *wbm = vlib_mains[i]->buffer_main; @@ -509,6 +511,7 @@ fill_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * fl, uword min_free_buffers) { vlib_buffer_t *buffers, *b; + vlib_buffer_free_list_t *mfl; int n, n_bytes, i; u32 *bi; u32 n_remaining, n_alloc, n_this_chunk; @@ -518,6 +521,22 @@ fill_free_list (vlib_main_t * vm, if (n <= 0) return min_free_buffers; + mfl = vlib_buffer_get_free_list (vlib_mains[0], fl->index); + if (vec_len (mfl->global_buffers) > 0) + { + int n_copy, n_left; + clib_spinlock_lock (&mfl->global_buffers_lock); + n_copy = clib_min (vec_len (mfl->global_buffers), n); + n_left = vec_len (mfl->global_buffers) - n_copy; + vec_add_aligned (fl->buffers, mfl->global_buffers + n_left, n_copy, + CLIB_CACHE_LINE_BYTES); + _vec_len (mfl->global_buffers) = n_left; + clib_spinlock_unlock (&mfl->global_buffers_lock); + n = min_free_buffers - vec_len (fl->buffers); + if (n <= 0) + return min_free_buffers; + } + /* Always allocate round number of buffers. */ n = round_pow2 (n, CLIB_CACHE_LINE_BYTES / sizeof (u32)); diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 9047ca9a..5504bf7c 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -350,6 +350,12 @@ typedef struct vlib_buffer_free_list_t /* Vector of free buffers. Each element is a byte offset into I/O heap. */ u32 *buffers; + /* global vector of free buffers, used only on main thread. + Bufers are returned to global buffers only in case when number of + buffers on free buffers list grows about threshold */ + u32 *global_buffers; + clib_spinlock_t global_buffers_lock; + /* Memory chunks allocated for this free list recorded here so they can be freed when free list is deleted. */ diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 6a662416..78bf9317 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -848,18 +848,21 @@ vlib_buffer_add_to_free_list (vlib_main_t * vm, u32 buffer_index, u8 do_init) { vlib_buffer_t *b; - u32 i; b = vlib_get_buffer (vm, buffer_index); if (PREDICT_TRUE (do_init)) vlib_buffer_init_for_free_list (b, f); vec_add1_aligned (f->buffers, buffer_index, CLIB_CACHE_LINE_BYTES); - if (vec_len (f->buffers) > 3 * VLIB_FRAME_SIZE) + if (vec_len (f->buffers) > 4 * VLIB_FRAME_SIZE) { + vlib_buffer_free_list_t *mf; + mf = vlib_buffer_get_free_list (vlib_mains[0], f->index); + clib_spinlock_lock (&mf->global_buffers_lock); /* keep last stored buffers, as they are more likely hot in the cache */ - for (i = 0; i < VLIB_FRAME_SIZE; i++) - vm->os_physmem_free (vlib_get_buffer (vm, i)); + vec_add_aligned (mf->global_buffers, f->buffers, VLIB_FRAME_SIZE, + CLIB_CACHE_LINE_BYTES); vec_delete (f->buffers, VLIB_FRAME_SIZE, 0); + clib_spinlock_unlock (&mf->global_buffers_lock); } } -- cgit 1.2.3-korg From 49d66f1f42cbc310e4fa0dc526b9fdb91d0ca220 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 20 Jul 2017 18:10:35 +0200 Subject: vlib physmem rework This patch adds supprot support for multiple numa-aware physmem regions. Change-Id: I5c69a6f4da33c8ee21bdb8604d52fd2886f2327e Signed-off-by: Damjan Marion --- src/examples/vlib/main_stub.c | 3 +- src/plugins/dpdk/api/dpdk_api.c | 1 - src/plugins/dpdk/device/init.c | 17 +- src/plugins/dpdk/hqos/hqos.c | 1 - src/plugins/ixge/ixge.c | 32 ++- src/plugins/ixge/ixge.h | 2 + src/vlib.am | 4 +- src/vlib/buffer.c | 72 ++++- src/vlib/buffer.h | 3 +- src/vlib/buffer_funcs.h | 39 +-- src/vlib/main.c | 20 +- src/vlib/main.h | 16 +- src/vlib/physmem.h | 69 ++--- src/vlib/physmem_funcs.h | 161 +++++++++++ src/vlib/unix/physmem.c | 572 +++++++++++++++++++--------------------- src/vlib/unix/physmem.h | 65 ----- src/vlib/unix/unix.h | 24 +- src/vlib/unix/util.c | 113 +++++++- src/vlib/vlib.h | 3 +- 19 files changed, 700 insertions(+), 517 deletions(-) create mode 100644 src/vlib/physmem_funcs.h delete mode 100644 src/vlib/unix/physmem.h (limited to 'src/vlib/buffer.c') diff --git a/src/examples/vlib/main_stub.c b/src/examples/vlib/main_stub.c index 4d74bd77..3b19c53f 100644 --- a/src/examples/vlib/main_stub.c +++ b/src/examples/vlib/main_stub.c @@ -27,8 +27,7 @@ main_stub_init (vlib_main_t * vm) { clib_error_t *error; - if ((error = - unix_physmem_init (vm, /* fail_if_physical_memory_not_present */ 0))) + if ((error = unix_physmem_init (vm))) return error; if ((error = vlib_call_init_function (vm, unix_cli_init))) diff --git a/src/plugins/dpdk/api/dpdk_api.c b/src/plugins/dpdk/api/dpdk_api.c index 08afdd70..97c4bc75 100755 --- a/src/plugins/dpdk/api/dpdk_api.c +++ b/src/plugins/dpdk/api/dpdk_api.c @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index a795ba0e..e23542f7 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -17,10 +17,10 @@ #include #include #include +#include #include #include -#include #include #include @@ -1026,21 +1026,28 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) clib_bitmap_foreach (c, tm->cpu_socket_bitmap, ( { int pages_avail, page_size, mem; + clib_error_t *e = 0; vec_validate(mem_by_socket, c); mem = mem_by_socket[c]; page_size = 1024; - pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024); + e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); - if (pages_avail < 0 || page_size * pages_avail < mem) + if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem) use_1g = 0; + if (e) + clib_error_free (e); + page_size = 2; - pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024); + e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); - if (pages_avail < 0 || page_size * pages_avail < mem) + if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem) use_2m = 0; + + if (e) + clib_error_free (e); })); /* *INDENT-ON* */ diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c index 813eb91c..c9b85652 100644 --- a/src/plugins/dpdk/hqos/hqos.c +++ b/src/plugins/dpdk/hqos/hqos.c @@ -29,7 +29,6 @@ #include #include -#include #include #include #include /* enumerate all vlib messages */ diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c index e0150f41..222c148c 100644 --- a/src/plugins/ixge/ixge.c +++ b/src/plugins/ixge/ixge.c @@ -2493,10 +2493,11 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) round_pow2 (xm->n_descriptors[rt], xm->n_descriptors_per_cache_line); dq->head_index = dq->tail_index = 0; - dq->descriptors = vlib_physmem_alloc_aligned (vm, &error, - dq->n_descriptors * - sizeof (dq->descriptors[0]), - 128 /* per chip spec */ ); + dq->descriptors = + vlib_physmem_alloc_aligned (vm, xm->physmem_region, &error, + dq->n_descriptors * + sizeof (dq->descriptors[0]), + 128 /* per chip spec */ ); if (error) return error; @@ -2518,7 +2519,8 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) vlib_buffer_t *b = vlib_get_buffer (vm, dq->descriptor_buffer_indices[i]); dq->descriptors[i].rx_to_hw.tail_address = - vlib_physmem_virtual_to_physical (vm, b->data); + vlib_physmem_virtual_to_physical (vm, xm->physmem_region, + b->data); } } else @@ -2526,7 +2528,8 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) u32 i; dq->tx.head_index_write_back = - vlib_physmem_alloc (vm, &error, CLIB_CACHE_LINE_BYTES); + vlib_physmem_alloc (vm, vm->buffer_main->physmem_region, &error, + CLIB_CACHE_LINE_BYTES); for (i = 0; i < dq->n_descriptors; i++) dq->descriptors[i].tx = xm->tx_descriptor_template; @@ -2538,7 +2541,9 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) ixge_dma_regs_t *dr = get_dma_regs (xd, rt, queue_index); u64 a; - a = vlib_physmem_virtual_to_physical (vm, dq->descriptors); + a = + vlib_physmem_virtual_to_physical (vm, vm->buffer_main->physmem_region, + dq->descriptors); dr->descriptor_address[0] = a & 0xFFFFFFFF; dr->descriptor_address[1] = a >> (u64) 32; dr->n_descriptor_bytes = dq->n_descriptors * sizeof (dq->descriptors[0]); @@ -2564,7 +2569,9 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) dq->tx.head_index_write_back[0] = dq->head_index; a = - vlib_physmem_virtual_to_physical (vm, dq->tx.head_index_write_back); + vlib_physmem_virtual_to_physical (vm, + vm->buffer_main->physmem_region, + dq->tx.head_index_write_back); dr->tx.head_index_write_back_address[0] = /* enable bit */ 1 | a; dr->tx.head_index_write_back_address[1] = (u64) a >> (u64) 32; } @@ -2850,9 +2857,12 @@ ixge_pci_init (vlib_main_t * vm, vlib_pci_device_t * dev) void *r; ixge_device_t *xd; - /* Device found: make sure we have dma memory. */ - if (unix_physmem_is_fake (vm)) - return clib_error_return (0, "no physical memory available"); + /* Allocate physmem region for DMA buffers */ + error = vlib_physmem_region_alloc (vm, "ixge decriptors", 2 << 20, 0, + VLIB_PHYSMEM_F_INIT_MHEAP, + &xm->physmem_region); + if (error) + return error; error = vlib_pci_map_resource (dev, 0, &r); if (error) diff --git a/src/plugins/ixge/ixge.h b/src/plugins/ixge/ixge.h index 779603b3..42c1bfa5 100644 --- a/src/plugins/ixge/ixge.h +++ b/src/plugins/ixge/ixge.h @@ -1266,6 +1266,8 @@ typedef struct u32 *rx_buffers_to_add; f64 time_last_stats_update; + + vlib_physmem_region_index_t physmem_region; } ixge_main_t; ixge_main_t ixge_main; diff --git a/src/vlib.am b/src/vlib.am index 111dcfa3..cab90e2d 100644 --- a/src/vlib.am +++ b/src/vlib.am @@ -13,7 +13,7 @@ lib_LTLIBRARIES += libvlib.la -libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread +libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread -lnuma libvlib_la_DEPENDENCIES = libvppinfra.la BUILT_SOURCES += vlib/config.h @@ -65,6 +65,7 @@ nobase_include_HEADERS += \ vlib/physmem.h \ vlib/pci/pci.h \ vlib/pci/pci_config.h \ + vlib/physmem_funcs.h \ vlib/threads.h \ vlib/trace_funcs.h \ vlib/trace.h \ @@ -84,7 +85,6 @@ libvlib_la_SOURCES += \ nobase_include_HEADERS += \ vlib/unix/cj.h \ vlib/unix/mc_socket.h \ - vlib/unix/physmem.h \ vlib/unix/plugin.h \ vlib/unix/unix.h diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 908368c0..a5ec0e0a 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -47,6 +47,7 @@ #include vlib_buffer_callbacks_t *vlib_buffer_callbacks = 0; +static u32 vlib_buffer_physmem_sz = 32 << 20; uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, @@ -461,7 +462,8 @@ del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) u32 i; for (i = 0; i < vec_len (f->buffer_memory_allocated); i++) - vm->os_physmem_free (f->buffer_memory_allocated[i]); + vm->os_physmem_free (vm, vm->buffer_main->physmem_region, + f->buffer_memory_allocated[i]); vec_free (f->name); vec_free (f->buffer_memory_allocated); vec_free (f->buffers); @@ -552,9 +554,9 @@ fill_free_list (vlib_main_t * vm, n_bytes = n_this_chunk * (sizeof (b[0]) + fl->n_data_bytes); /* drb: removed power-of-2 ASSERT */ - buffers = vm->os_physmem_alloc_aligned (&vm->physmem_main, - n_bytes, - sizeof (vlib_buffer_t)); + buffers = + vm->os_physmem_alloc_aligned (vm, vm->buffer_main->physmem_region, + n_bytes, sizeof (vlib_buffer_t)); if (!buffers) return n_alloc; @@ -1051,10 +1053,25 @@ VLIB_CLI_COMMAND (show_buffers_command, static) = { }; /* *INDENT-ON* */ -void -vlib_buffer_cb_init (struct vlib_main_t *vm) +clib_error_t * +vlib_buffer_main_init (struct vlib_main_t * vm) { - vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_main_t *bm; + clib_error_t *error; + + vec_validate (vm->buffer_main, 0); + bm = vm->buffer_main; + + if (vlib_buffer_callbacks) + { + /* external plugin has registered own buffer callbacks + so we just copy them and quit */ + vlib_buffer_main_t *bm = vm->buffer_main; + clib_memcpy (&bm->cb, vlib_buffer_callbacks, + sizeof (vlib_buffer_callbacks_t)); + bm->callbacks_registered = 1; + return 0; + } bm->cb.vlib_buffer_alloc_cb = &vlib_buffer_alloc_internal; bm->cb.vlib_buffer_alloc_from_free_list_cb = @@ -1064,8 +1081,49 @@ vlib_buffer_cb_init (struct vlib_main_t *vm) bm->cb.vlib_buffer_delete_free_list_cb = &vlib_buffer_delete_free_list_internal; clib_spinlock_init (&bm->buffer_known_hash_lockp); + + /* allocate default region */ + error = vlib_physmem_region_alloc (vm, "buffers", + vlib_buffer_physmem_sz, 0, + VLIB_PHYSMEM_F_INIT_MHEAP | + VLIB_PHYSMEM_F_HAVE_BUFFERS, + &bm->physmem_region); + + if (error == 0) + return 0; + + clib_error_free (error); + + /* we my be running unpriviledged, so try to allocate fake physmem */ + error = vlib_physmem_region_alloc (vm, "buffers (fake)", + vlib_buffer_physmem_sz, 0, + VLIB_PHYSMEM_F_FAKE | + VLIB_PHYSMEM_F_INIT_MHEAP | + VLIB_PHYSMEM_F_HAVE_BUFFERS, + &bm->physmem_region); + return error; } +static clib_error_t * +vlib_buffers_configure (vlib_main_t * vm, unformat_input_t * input) +{ + u32 size_in_mb; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "memory-size-in-mb %d", &size_in_mb)) + vlib_buffer_physmem_sz = size_in_mb << 20; + else + return unformat_parse_error (input); + } + + unformat_free (input); + return 0; +} + +VLIB_EARLY_CONFIG_FUNCTION (vlib_buffers_configure, "buffers"); + + /** @endcond */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 5504bf7c..e47dbc6d 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -408,6 +408,7 @@ typedef struct buffer index */ uword buffer_mem_start; uword buffer_mem_size; + vlib_physmem_region_index_t physmem_region; /* Buffer free callback, for subversive activities */ u32 (*buffer_free_callback) (struct vlib_main_t * vm, @@ -442,7 +443,7 @@ typedef struct void vlib_buffer_add_mem_range (struct vlib_main_t *vm, uword start, uword size); -void vlib_buffer_cb_init (struct vlib_main_t *vm); +clib_error_t *vlib_buffer_main_init (struct vlib_main_t *vm); typedef struct { diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 78bf9317..d51de6be 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -162,7 +162,7 @@ vlib_buffer_contents (vlib_main_t * vm, u32 buffer_index, u8 * contents) always_inline u64 vlib_get_buffer_data_physical_address (vlib_main_t * vm, u32 buffer_index) { - return vlib_physmem_offset_to_physical (&vm->physmem_main, + return vlib_physmem_offset_to_physical (vm, vm->buffer_main->physmem_region, (((uword) buffer_index) << CLIB_LOG2_CACHE_LINE_BYTES) + STRUCT_OFFSET_OF (vlib_buffer_t, @@ -455,43 +455,6 @@ vlib_copy_buffers (u32 * dst, u32 * src, u32 n) } } -always_inline void * -vlib_physmem_alloc_aligned (vlib_main_t * vm, clib_error_t ** error, - uword n_bytes, uword alignment) -{ - void *r = - vm->os_physmem_alloc_aligned (&vm->physmem_main, n_bytes, alignment); - if (!r) - *error = - clib_error_return (0, "failed to allocate %wd bytes of I/O memory", - n_bytes); - else - *error = 0; - return r; -} - -/* By default allocate I/O memory with cache line alignment. */ -always_inline void * -vlib_physmem_alloc (vlib_main_t * vm, clib_error_t ** error, uword n_bytes) -{ - return vlib_physmem_alloc_aligned (vm, error, n_bytes, - CLIB_CACHE_LINE_BYTES); -} - -always_inline void -vlib_physmem_free (vlib_main_t * vm, void *mem) -{ - return vm->os_physmem_free (mem); -} - -always_inline u64 -vlib_physmem_virtual_to_physical (vlib_main_t * vm, void *mem) -{ - vlib_physmem_main_t *pm = &vm->physmem_main; - uword o = pointer_to_uword (mem) - pm->virtual.start; - return vlib_physmem_offset_to_physical (pm, o); -} - /* Append given data to end of buffer, possibly allocating new buffers. */ u32 vlib_buffer_add_data (vlib_main_t * vm, u32 free_list_index, diff --git a/src/vlib/main.c b/src/vlib/main.c index 5d99e899..7875f62a 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -1705,22 +1705,16 @@ vlib_main (vlib_main_t * volatile vm, unformat_input_t * input) if (!vm->name) vm->name = "VLIB"; - vec_validate (vm->buffer_main, 0); - if (vlib_buffer_callbacks) + if ((error = unix_physmem_init (vm))) { - /* external plugin has registered own buffer callbacks - so we just copy them */ - vlib_buffer_main_t *bm = vm->buffer_main; - clib_memcpy (&bm->cb, vlib_buffer_callbacks, - sizeof (vlib_buffer_callbacks_t)); - bm->callbacks_registered = 1; + clib_error_report (error); + goto done; } - else + + if ((error = vlib_buffer_main_init (vm))) { - vlib_physmem_main_t *vpm = &vm->physmem_main; - vlib_buffer_cb_init (vm); - unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ ); - vlib_buffer_add_mem_range (vm, vpm->virtual.start, vpm->virtual.size); + clib_error_report (error); + goto done; } if ((error = vlib_thread_init (vm))) diff --git a/src/vlib/main.h b/src/vlib/main.h index b63c63fa..4c0cde3f 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -107,9 +107,21 @@ typedef struct vlib_main_t /* Allocate/free buffer memory for DMA transfers, descriptor rings, etc. buffer memory is guaranteed to be cache-aligned. */ - void *(*os_physmem_alloc_aligned) (vlib_physmem_main_t * pm, + + clib_error_t *(*os_physmem_region_alloc) (struct vlib_main_t * vm, + char *name, u32 size, + u8 numa_node, u32 flags, + vlib_physmem_region_index_t * + idx); + + void (*os_physmem_region_free) (struct vlib_main_t * vm, + vlib_physmem_region_index_t idx); + + void *(*os_physmem_alloc_aligned) (struct vlib_main_t * vm, + vlib_physmem_region_index_t idx, uword n_bytes, uword alignment); - void (*os_physmem_free) (void *x); + void (*os_physmem_free) (struct vlib_main_t * vm, + vlib_physmem_region_index_t idx, void *x); /* Node graph main structure. */ vlib_node_main_t node_main; diff --git a/src/vlib/physmem.h b/src/vlib/physmem.h index 9e7d52a6..a7fed124 100644 --- a/src/vlib/physmem.h +++ b/src/vlib/physmem.h @@ -40,62 +40,35 @@ #ifndef included_vlib_physmem_h #define included_vlib_physmem_h -typedef struct -{ - uword start, end, size; -} vlib_physmem_region_t; +typedef u8 vlib_physmem_region_index_t; typedef struct { - vlib_physmem_region_t virtual; - - uword log2_n_bytes_per_page; - - /* 1 << log2_n_bytes_per_page - 1. */ - uword page_mask; - + vlib_physmem_region_index_t index; + void *mem; + uword size; + int fd; + u8 log2_page_size; + u16 n_pages; + u32 page_mask; + + void *heap; + u32 flags; +#define VLIB_PHYSMEM_F_INIT_MHEAP (1<<0) +#define VLIB_PHYSMEM_F_HAVE_BUFFERS (1<<1) +#define VLIB_PHYSMEM_F_FAKE (1<<2) + + u8 numa_node; u64 *page_table; + u8 *name; +} vlib_physmem_region_t; - /* is fake physmem */ - u8 is_fake; -} vlib_physmem_main_t; - -always_inline u64 -vlib_physmem_offset_to_physical (vlib_physmem_main_t * pm, uword o) -{ - uword page_index = o >> pm->log2_n_bytes_per_page; - ASSERT (o < pm->virtual.size); - ASSERT (pm->page_table[page_index] != 0); - return (vec_elt (pm->page_table, page_index) + (o & pm->page_mask)); -} - -always_inline int -vlib_physmem_is_virtual (vlib_physmem_main_t * pm, uword p) -{ - return p >= pm->virtual.start && p < pm->virtual.end; -} - -always_inline uword -vlib_physmem_offset_of (vlib_physmem_main_t * pm, void *p) -{ - uword a = pointer_to_uword (p); - uword o; - - ASSERT (vlib_physmem_is_virtual (pm, a)); - o = a - pm->virtual.start; - - /* Offset must fit in 32 bits. */ - ASSERT ((uword) o == a - pm->virtual.start); - return o; -} -always_inline void * -vlib_physmem_at_offset (vlib_physmem_main_t * pm, uword offset) +typedef struct { - ASSERT (offset < pm->virtual.size); - return uword_to_pointer (pm->virtual.start + offset, void *); -} + vlib_physmem_region_t *regions; +} vlib_physmem_main_t; #endif /* included_vlib_physmem_h */ diff --git a/src/vlib/physmem_funcs.h b/src/vlib/physmem_funcs.h new file mode 100644 index 00000000..dbb8d9de --- /dev/null +++ b/src/vlib/physmem_funcs.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * physmem.h: virtual <-> physical memory mapping for VLIB buffers + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_physmem_funcs_h +#define included_vlib_physmem_funcs_h + +always_inline vlib_physmem_region_t * +vlib_physmem_get_region (vlib_main_t * vm, u8 index) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + return pool_elt_at_index (vpm->regions, index); +} + +always_inline u64 +vlib_physmem_offset_to_physical (vlib_main_t * vm, + vlib_physmem_region_index_t idx, uword o) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + uword page_index = o >> pr->log2_page_size; + ASSERT (o < pr->size); + ASSERT (pr->page_table[page_index] != 0); + return (vec_elt (pr->page_table, page_index) + (o & pr->page_mask)); +} + +always_inline int +vlib_physmem_is_virtual (vlib_main_t * vm, vlib_physmem_region_index_t idx, + uword p) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + return p >= pointer_to_uword (pr->mem) + && p < (pointer_to_uword (pr->mem) + pr->size); +} + +always_inline uword +vlib_physmem_offset_of (vlib_main_t * vm, vlib_physmem_region_index_t idx, + void *p) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + uword a = pointer_to_uword (p); + uword o; + + ASSERT (vlib_physmem_is_virtual (vm, idx, a)); + o = a - pointer_to_uword (pr->mem); + + /* Offset must fit in 32 bits. */ + ASSERT ((uword) o == a - pointer_to_uword (pr->mem)); + + return o; +} + +always_inline void * +vlib_physmem_at_offset (vlib_main_t * vm, vlib_physmem_region_index_t idx, + uword offset) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + ASSERT (offset < pr->size); + return uword_to_pointer (pointer_to_uword (pr->mem) + offset, void *); +} + +always_inline void * +vlib_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, + clib_error_t ** error, + uword n_bytes, uword alignment) +{ + void *r = vm->os_physmem_alloc_aligned (vm, idx, n_bytes, alignment); + if (!r) + *error = + clib_error_return (0, "failed to allocate %wd bytes of I/O memory", + n_bytes); + else + *error = 0; + return r; +} + +/* By default allocate I/O memory with cache line alignment. */ +always_inline void * +vlib_physmem_alloc (vlib_main_t * vm, vlib_physmem_region_index_t idx, + clib_error_t ** error, uword n_bytes) +{ + return vlib_physmem_alloc_aligned (vm, idx, error, n_bytes, + CLIB_CACHE_LINE_BYTES); +} + +always_inline void +vlib_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, + void *mem) +{ + return vm->os_physmem_free (vm, idx, mem); +} + +always_inline u64 +vlib_physmem_virtual_to_physical (vlib_main_t * vm, + vlib_physmem_region_index_t idx, void *mem) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr = pool_elt_at_index (vpm->regions, idx); + uword o = mem - pr->mem; + return vlib_physmem_offset_to_physical (vm, idx, o); +} + + +always_inline clib_error_t * +vlib_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, + u8 numa_node, u32 flags, + vlib_physmem_region_index_t * idx) +{ + return vm->os_physmem_region_alloc (vm, name, size, numa_node, flags, idx); +} + +always_inline void +vlib_physmem_region_free (struct vlib_main_t *vm, + vlib_physmem_region_index_t idx) +{ + vm->os_physmem_region_free (vm, idx); +} + +#endif /* included_vlib_physmem_funcs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c index 27a5bacf..d5d5d6c8 100644 --- a/src/vlib/unix/physmem.c +++ b/src/vlib/unix/physmem.c @@ -37,24 +37,66 @@ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifndef __NR_memfd_create +#if defined __x86_64__ +#define __NR_memfd_create 319 +#elif defined __arm__ +#define __NR_memfd_create 385 +#elif defined __aarch64__ +#define __NR_memfd_create 279 +#else +#error "__NR_memfd_create unknown for this architecture" +#endif +#endif + +static inline int +memfd_create (const char *name, unsigned int flags) +{ + return syscall (__NR_memfd_create, name, flags); +} + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif +#define MFD_ALLOW_SEALING 0x0002U +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) -static physmem_main_t physmem_main; +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ static void * -unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, - uword alignment) +unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, + uword n_bytes, uword alignment) { - physmem_main_t *pm = &physmem_main; + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); uword lo_offset, hi_offset; uword *to_free = 0; + if (pr->heap == 0) + return 0; + /* IO memory is always at least cache aligned. */ alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); while (1) { - mheap_get_aligned (pm->heap, n_bytes, + mheap_get_aligned (pr->heap, n_bytes, /* align */ alignment, /* align offset */ 0, &lo_offset); @@ -63,11 +105,14 @@ unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, if (lo_offset == ~0) break; + if (pr->flags & VLIB_PHYSMEM_F_FAKE) + break; + /* Make sure allocation does not span DMA physical chunk boundary. */ hi_offset = lo_offset + n_bytes - 1; - if ((lo_offset >> vpm->log2_n_bytes_per_page) == - (hi_offset >> vpm->log2_n_bytes_per_page)) + if ((lo_offset >> pr->log2_page_size) == + (hi_offset >> pr->log2_page_size)) break; /* Allocation would span chunk boundary, queue it to be freed as soon as @@ -79,380 +124,311 @@ unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, { uword i; for (i = 0; i < vec_len (to_free); i++) - mheap_put (pm->heap, to_free[i]); + mheap_put (pr->heap, to_free[i]); vec_free (to_free); } - return lo_offset != ~0 ? pm->heap + lo_offset : 0; + return lo_offset != ~0 ? pr->heap + lo_offset : 0; } static void -unix_physmem_free (void *x) +unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x) { - physmem_main_t *pm = &physmem_main; - + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); /* Return object to region's heap. */ - mheap_put (pm->heap, x - pm->heap); + mheap_put (pr->heap, x - pr->heap); } -static void -htlb_shutdown (void) +static u64 +get_page_paddr (int fd, uword addr) { - physmem_main_t *pm = &physmem_main; - - if (!pm->shmid) - return; - shmctl (pm->shmid, IPC_RMID, 0); - pm->shmid = 0; -} + int pagesize = sysconf (_SC_PAGESIZE); + u64 seek, pagemap = 0; -/* try to use huge TLB pgs if possible */ -static int -htlb_init (vlib_main_t * vm) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - physmem_main_t *pm = &physmem_main; - u64 hugepagesize, pagesize; - u64 pfn, seek_loc; - u64 cur, physaddr, ptbits; - int fd, i; - - pm->shmid = shmget (11 /* key, my amp goes to 11 */ , pm->mem_size, - IPC_CREAT | SHM_HUGETLB | SHM_R | SHM_W); - if (pm->shmid < 0) + seek = ((u64) addr / pagesize) * sizeof (u64); + if (lseek (fd, seek, SEEK_SET) != seek) { - clib_unix_warning ("shmget"); + clib_unix_warning ("lseek to 0x%llx", seek); return 0; } - - pm->mem = shmat (pm->shmid, NULL, 0 /* flags */ ); - if (pm->mem == 0) + if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap))) { - shmctl (pm->shmid, IPC_RMID, 0); + clib_unix_warning ("read ptbits"); return 0; } + if ((pagemap & (1ULL << 63)) == 0) + return 0; - memset (pm->mem, 0, pm->mem_size); + pagemap &= pow2_mask (55); - /* $$$ get page size info from /proc/meminfo */ - hugepagesize = 2 << 20; - pagesize = 4 << 10; - vpm->log2_n_bytes_per_page = min_log2 (hugepagesize); - vec_resize (vpm->page_table, pm->mem_size / hugepagesize); + return pagemap * pagesize; +} - vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page); - vpm->virtual.start = pointer_to_uword (pm->mem); - vpm->virtual.size = pm->mem_size; - vpm->virtual.end = vpm->virtual.start + vpm->virtual.size; +static clib_error_t * +unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, + u8 numa_node, u32 flags, + vlib_physmem_region_index_t * idx) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr; + clib_error_t *error = 0; + int pagemap_fd = -1; + u8 *mount_dir = 0; + u8 *filename = 0; + struct stat st; + int old_mpol; + int mmap_flags; + struct bitmask *old_mask = numa_allocate_nodemask (); - fd = open ("/proc/self/pagemap", O_RDONLY); + if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0) + return clib_error_return (0, "not allowed"); - if (fd < 0) + pool_get (vpm->regions, pr); + + if ((pr - vpm->regions) >= 256) { - (void) shmdt (pm->mem); - return 0; + error = clib_error_return (0, "maximum number of regions reached"); + goto error; } - pm->heap = mheap_alloc_with_flags (pm->mem, pm->mem_size, - /* Don't want mheap mmap/munmap with IO memory. */ - MHEAP_FLAG_DISABLE_VM | - MHEAP_FLAG_THREAD_SAFE); + pr->index = pr - vpm->regions; + pr->fd = -1; + pr->flags = flags; - cur = pointer_to_uword (pm->mem); - i = 0; + if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0) + == -1) + { + error = clib_error_return_unix (0, "get_mempolicy"); + goto error; + } - while (cur < pointer_to_uword (pm->mem) + pm->mem_size) + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - pfn = (u64) cur / pagesize; - seek_loc = pfn * sizeof (u64); - if (lseek (fd, seek_loc, SEEK_SET) != seek_loc) - { - clib_unix_warning ("lseek to 0x%llx", seek_loc); - shmctl (pm->shmid, IPC_RMID, 0); - close (fd); - return 0; - } - if (read (fd, &ptbits, sizeof (ptbits)) != (sizeof (ptbits))) + if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) { - clib_unix_warning ("read ptbits"); - shmctl (pm->shmid, IPC_RMID, 0); - close (fd); - return 0; + error = clib_error_return_unix (0, "open '/proc/self/pagemap'"); + goto error; } - /* bits 0-54 are the physical page number */ - physaddr = (ptbits & 0x7fffffffffffffULL) * pagesize; - if (CLIB_DEBUG > 1) - fformat (stderr, "pm: virtual 0x%llx physical 0x%llx\n", - cur, physaddr); - vpm->page_table[i++] = physaddr; + mount_dir = format (0, "%s/physmem_region%d%c", + vlib_unix_get_runtime_dir (), pr->index, 0); + filename = format (0, "%s/mem%c", mount_dir, 0); - cur += hugepagesize; - } - close (fd); - atexit (htlb_shutdown); - return 1; -} - -int vlib_app_physmem_init (vlib_main_t * vm, - physmem_main_t * pm, int) __attribute__ ((weak)); -int -vlib_app_physmem_init (vlib_main_t * vm, physmem_main_t * pm, int x) -{ - return 0; -} - -clib_error_t * -unix_physmem_init (vlib_main_t * vm, int physical_memory_required) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - physmem_main_t *pm = &physmem_main; - clib_error_t *error = 0; - - /* Avoid multiple calls. */ - if (vm->os_physmem_alloc_aligned) - return error; + unlink ((char *) mount_dir); - vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; - vm->os_physmem_free = unix_physmem_free; - pm->mem = MAP_FAILED; + error = vlib_unix_recursive_mkdir ((char *) mount_dir); + if (error) + goto error; - if (pm->mem_size == 0) - pm->mem_size = 16 << 20; + if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL)) + { + error = clib_error_return_unix (0, "mount hugetlb directory '%s'", + mount_dir); + goto error; + } - /* OK, Mr. App, you tell us */ - if (vlib_app_physmem_init (vm, pm, physical_memory_required)) - return 0; + if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1) + { + error = clib_error_return_unix (0, "open"); + goto error; + } - if (!pm->no_hugepages && htlb_init (vm)) + mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED; + } + else { - fformat (stderr, "%s: use huge pages\n", __FUNCTION__); - return 0; + if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1) + return clib_error_return_unix (0, "memfd_create"); + + if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) + { + error = + clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)"); + goto error; + } + mmap_flags = MAP_SHARED; } - pm->mem = - mmap (0, pm->mem_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (pm->mem == MAP_FAILED) + if (fstat (pr->fd, &st)) { - error = clib_error_return_unix (0, "mmap"); - goto done; + error = clib_error_return_unix (0, "fstat"); + goto error; } - pm->heap = mheap_alloc (pm->mem, pm->mem_size); - - /* Identity map with a single page. */ - vpm->log2_n_bytes_per_page = min_log2 (pm->mem_size); - vec_add1 (vpm->page_table, pointer_to_uword (pm->mem)); - - vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page); - vpm->virtual.start = pointer_to_uword (pm->mem); - vpm->virtual.size = pm->mem_size; - vpm->virtual.end = vpm->virtual.start + vpm->virtual.size; - vpm->is_fake = 1; + pr->log2_page_size = min_log2 (st.st_blksize); + pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1; + size = pr->n_pages * (1 << pr->log2_page_size); - fformat (stderr, "%s: use fake dma pages\n", __FUNCTION__); + if ((ftruncate (pr->fd, size)) == -1) + { + error = clib_error_return_unix (0, "ftruncate length: %d", size); + goto error; + } -done: - if (error) + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - if (pm->mem != MAP_FAILED) - munmap (pm->mem, pm->mem_size); + error = vlib_sysfs_prealloc_hugepages (numa_node, + 1 << (pr->log2_page_size - 10), + pr->n_pages); + if (error) + goto error; } - return error; -} -static clib_error_t * -show_physmem (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - physmem_main_t *pm = &physmem_main; + numa_set_preferred (numa_node); - if (pm->heap) - vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1); - else - vlib_cli_output (vm, "No physmem allocated."); - return 0; -} + pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0); -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_physmem_command, static) = { - .path = "show physmem", - .short_help = "Show physical memory allocation", - .function = show_physmem, -}; -/* *INDENT-ON* */ + if (pr->mem == MAP_FAILED) + { + pr->mem = 0; + error = clib_error_return_unix (0, "mmap"); + goto error; + } -static clib_error_t * -show_affinity (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - cpu_set_t set; - cpu_set_t *setp = &set; - int i, rv; - u8 *s = 0; - int first_set_bit_in_run = -1; - int last_set_bit_in_run = -1; - int output_done = 0; - - rv = sched_getaffinity (0 /* pid, 0 = this proc */ , - sizeof (*setp), setp); - if (rv < 0) + if (set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1) { - vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", - strerror (errno)); - return 0; + error = clib_error_return_unix (0, "set_mempolicy"); + goto error; } - for (i = 0; i < 64; i++) + pr->size = pr->n_pages << pr->log2_page_size; + pr->page_mask = (1 << pr->log2_page_size) - 1; + pr->numa_node = numa_node; + pr->name = format (0, "%s", name); + + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - if (CPU_ISSET (i, setp)) - { - if (first_set_bit_in_run == -1) - { - first_set_bit_in_run = i; - last_set_bit_in_run = i; - if (output_done) - s = format (s, ","); - s = format (s, "%d-", i); - output_done = 1; - } - else - { - if (i == (last_set_bit_in_run + 1)) - last_set_bit_in_run = i; - } - } - else + int i; + for (i = 0; i < pr->n_pages; i++) { - if (first_set_bit_in_run != -1) + void *ptr = pr->mem + (i << pr->log2_page_size); + int node; + move_pages (0, 1, &ptr, 0, &node, 0); + if (numa_node != node) { - if (first_set_bit_in_run == (i - 1)) - { - _vec_len (s) -= 2 + ((first_set_bit_in_run / 10)); - } - s = format (s, "%d", last_set_bit_in_run); - first_set_bit_in_run = -1; - last_set_bit_in_run = -1; + clib_warning + ("physmem page for region \'%s\' allocated on the wrong" + " numa node (requested %u actual %u)", pr->name, + pr->numa_node, node, i); + break; } } } - if (first_set_bit_in_run != -1) - s = format (s, "%d", first_set_bit_in_run); - - vlib_cli_output (vm, "Process runs on: %v", s); - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_affinity_command, static) = { - .path = "show affinity", - .short_help = "Show process cpu affinity", - .function = show_affinity, -}; -/* *INDENT-ON* */ + if (flags & VLIB_PHYSMEM_F_INIT_MHEAP) + { + pr->heap = mheap_alloc_with_flags (pr->mem, pr->size, + /* Don't want mheap mmap/munmap with IO memory. */ + MHEAP_FLAG_DISABLE_VM | + MHEAP_FLAG_THREAD_SAFE); + fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1); + } -static clib_error_t * -set_affinity (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - cpu_set_t set; - cpu_set_t *setp = &set; - int i, rv; - int another_round; - u32 first, last; + if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS) + { + vlib_buffer_add_mem_range (vm, pointer_to_uword (pr->mem), pr->size); + } - memset (setp, 0, sizeof (*setp)); + *idx = pr->index; - do + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - another_round = 0; - if (unformat (input, "%d-%d,", &first, &last)) + int i; + for (i = 0; i < pr->n_pages; i++) { - if (first > 64 || last > 64) - { - barf1: - vlib_cli_output (vm, "range %d-%d invalid", first, last); - return 0; - } - - for (i = first; i <= last; i++) - CPU_SET (i, setp); - another_round = 1; + uword vaddr = + pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size); + u64 page_paddr = get_page_paddr (pagemap_fd, vaddr); + vec_add1 (pr->page_table, page_paddr); } - else if (unformat (input, "%d-%d", &first, &last)) - { - if (first > 64 || last > 64) - goto barf1; + } - for (i = first; i <= last; i++) - CPU_SET (i, setp); - } - else if (unformat (input, "%d,", &first)) - { - if (first > 64) - { - barf2: - vlib_cli_output (vm, "cpu %d invalid", first); - return 0; - } - CPU_SET (first, setp); - another_round = 1; - } - else if (unformat (input, "%d", &first)) - { - if (first > 64) - goto barf2; + goto done; - CPU_SET (first, setp); - } - } - while (another_round); +error: + if (pr->fd > -1) + close (pr->fd); - rv = sched_setaffinity (0 /* pid, 0 = this proc */ , - sizeof (*setp), setp); + if (pr->mem) + munmap (pr->mem, size); - if (rv < 0) + memset (pr, 0, sizeof (*pr)); + pool_put (vpm->regions, pr); + +done: + if (mount_dir) { - vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", - strerror (errno)); - return 0; + umount2 ((char *) mount_dir, MNT_DETACH); + rmdir ((char *) mount_dir); + vec_free (mount_dir); } - return show_affinity (vm, input, cmd); + numa_free_cpumask (old_mask); + vec_free (filename); + if (pagemap_fd > -1) + close (pagemap_fd); + return error; } -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (set_affinity_command, static) = { - .path = "set affinity", - .short_help = "Set process cpu affinity", - .function = set_affinity, -}; -/* *INDENT-ON* */ +static void +unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + + if (pr->fd > 0) + close (pr->fd); + munmap (pr->mem, pr->size); + vec_free (pr->name); + pool_put (vpm->regions, pr); +} + +clib_error_t * +unix_physmem_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + + /* Avoid multiple calls. */ + if (vm->os_physmem_alloc_aligned) + return error; + + vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; + vm->os_physmem_free = unix_physmem_free; + vm->os_physmem_region_alloc = unix_physmem_region_alloc; + vm->os_physmem_region_free = unix_physmem_region_free; + + return error; +} static clib_error_t * -vlib_physmem_configure (vlib_main_t * vm, unformat_input_t * input) +show_physmem (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) { - physmem_main_t *pm = &physmem_main; - u32 size_in_mb; + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr; - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + /* *INDENT-OFF* */ + pool_foreach (pr, vpm->regions, ( { - if (unformat (input, "no-huge") || unformat (input, "no-huge-pages")) - pm->no_hugepages = 1; - - else if (unformat (input, "size-in-mb %d", &size_in_mb) || - unformat (input, "size %d", &size_in_mb)) - pm->mem_size = size_in_mb << 20; + vlib_cli_output (vm, "index %u name '%s' page-size %uKB num-pages %d " + "numa-node %u fd %d\n", + pr->index, pr->name, (1 << (pr->log2_page_size -10)), + pr->n_pages, pr->numa_node, pr->fd); + if (pr->heap) + vlib_cli_output (vm, " %U", format_mheap, pr->heap, /* verbose */ 1); else - return unformat_parse_error (input); - } - - unformat_free (input); + vlib_cli_output (vm, " no heap\n"); + })); + /* *INDENT-ON* */ return 0; } -VLIB_EARLY_CONFIG_FUNCTION (vlib_physmem_configure, "physmem"); +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_physmem_command, static) = { + .path = "show physmem", + .short_help = "Show physical memory allocation", + .function = show_physmem, +}; +/* *INDENT-ON* */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vlib/unix/physmem.h b/src/vlib/unix/physmem.h deleted file mode 100644 index 5519a7d6..00000000 --- a/src/vlib/unix/physmem.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __included_physmem_h__ -#define __included_physmem_h__ - -/* Manage I/O physical memory. */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include - -#include -#include - -#include /* for open */ -#include /* for flock */ -#include -#include -#include -#include -#include -#include - -typedef struct -{ - /* Virtual memory via mmaped. */ - void *mem; - - /* Size in bytes. */ - uword mem_size; - - /* Heap allocated out of virtual memory. */ - void *heap; - - /* huge TLB segment id */ - int shmid; - - /* should we try to use htlb ? */ - int no_hugepages; - -} physmem_main_t; - -#endif /* __included_physmem_h__ */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index 97f58944..b5a33427 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -195,18 +195,7 @@ unix_save_error (unix_main_t * um, clib_error_t * error) /* Main function for Unix VLIB. */ int vlib_unix_main (int argc, char *argv[]); -/* Call to allocate/initialize physical DMA memory subsystem. - This is not an init function so that users can explicitly enable/disable - physmem when its not needed. */ -clib_error_t *unix_physmem_init (vlib_main_t * vm, - int fail_if_physical_memory_not_present); - -static inline int -unix_physmem_is_fake (vlib_main_t * vm) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - return vpm->is_fake; -} +clib_error_t *unix_physmem_init (vlib_main_t * vm); /* Set prompt for CLI. */ void vlib_unix_cli_set_prompt (char *prompt); @@ -234,7 +223,16 @@ clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...); u8 *vlib_sysfs_link_to_name (char *link); -int vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size); +clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node, + int page_size, int nr); +clib_error_t *vlib_sysfs_get_nr_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node, + int page_size, int nr); clib_error_t *foreach_directory_file (char *dir_name, clib_error_t * (*f) (void *arg, diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c index 312cc9b5..0e252aca 100644 --- a/src/vlib/unix/util.c +++ b/src/vlib/unix/util.c @@ -189,37 +189,132 @@ vlib_sysfs_link_to_name (char *link) return s; } -int -vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size) +clib_error_t * +vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr) { + clib_error_t *error = 0; struct stat sb; u8 *p = 0; - int r = -1; p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); if (stat ((char *) p, &sb) == 0) { if (S_ISDIR (sb.st_mode) == 0) - goto done; + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } } else if (numa_node == 0) { vec_reset_length (p); p = format (p, "/sys/kernel/mm%c", 0); if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) - goto done; + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } } else - goto done; + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } _vec_len (p) -= 1; - p = format (p, "/hugepages/hugepages-%ukB/free_hugepages%c", page_size, 0); - vlib_sysfs_read ((char *) p, "%d", &r); + p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0); + vlib_sysfs_write ((char *) p, "%d", nr); done: vec_free (p); - return r; + return error; +} + + +static clib_error_t * +vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node, + int page_size, int *val) +{ + clib_error_t *error = 0; + struct stat sb; + u8 *p = 0; + + p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); + + if (stat ((char *) p, &sb) == 0) + { + if (S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } + } + else if (numa_node == 0) + { + vec_reset_length (p); + p = format (p, "/sys/kernel/mm%c", 0); + if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } + } + else + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } + + _vec_len (p) -= 1; + p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size, + type, 0); + error = vlib_sysfs_read ((char *) p, "%d", val); + +done: + vec_free (p); + return error; +} + +clib_error_t * +vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, int page_size, + int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr) +{ + clib_error_t *error = 0; + int n, needed; + error = vlib_sysfs_get_free_hugepages (numa_node, page_size, &n); + if (error) + return error; + needed = nr - n; + if (needed <= 0) + return 0; + + error = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &n); + if (error) + return error; + clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u", + needed, page_size, numa_node); + return vlib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed); } clib_error_t * diff --git a/src/vlib/vlib.h b/src/vlib/vlib.h index b146a49b..eed5c5bc 100644 --- a/src/vlib/vlib.h +++ b/src/vlib/vlib.h @@ -50,6 +50,7 @@ struct vlib_main_t; /* All includes in alphabetical order. */ +#include #include #include #include @@ -57,7 +58,6 @@ struct vlib_main_t; #include #include #include -#include #include /* Main include depends on other vlib/ includes so we put it last. */ @@ -65,6 +65,7 @@ struct vlib_main_t; /* Inline/extern function declarations. */ #include +#include #include #include #include -- cgit 1.2.3-korg From 2f9b0c05fca7ca829ea438da1d87e2bf93969500 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Mon, 11 Sep 2017 20:54:15 -0400 Subject: dpdk: cli to check for buffer leakage Use buffer pre_data and existing buffer trace trajectory code to find out dpdk buffer leakages. Change-Id: I26a5d8bd2f23d01cb6070ffc3ddcc6d3d863b575 Signed-off-by: Florin Coras --- src/plugins/dpdk/buffer.c | 63 +++++++++++++++++++++++++++++++++++++++- src/plugins/dpdk/device/cli.c | 53 +++++++++++++++++++++++++++++++++ src/plugins/dpdk/device/device.c | 6 ++++ src/plugins/dpdk/device/dpdk.h | 5 ++++ src/vlib/buffer.c | 2 +- src/vnet/ip/ip4_forward.c | 1 + src/vnet/tcp/tcp_output.c | 10 ++++--- 7 files changed, 134 insertions(+), 6 deletions(-) (limited to 'src/vlib/buffer.c') diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index 28af100a..e09d8019 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -340,7 +340,7 @@ vlib_buffer_free_inline (vlib_main_t * vm, vlib_buffer_t *b; b = vlib_get_buffer (vm, buffers[i]); - + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b); fl = vlib_buffer_get_buffer_free_list (vm, b, &fi); /* The only current use of this callback: multicast recycle */ @@ -493,6 +493,67 @@ buffer_state_validation_init (vlib_main_t * vm) VLIB_INIT_FUNCTION (buffer_state_validation_init); #endif +#if CLI_DEBUG +struct dpdk_validate_buf_result +{ + u32 invalid; + u32 uninitialized; +}; + +#define DPDK_TRAJECTORY_POISON 31 + +static void +dpdk_buffer_validate_trajectory (struct rte_mempool *mp, void *opaque, + void *obj, unsigned obj_idx) +{ + vlib_buffer_t *b; + struct dpdk_validate_buf_result *counter = opaque; + b = vlib_buffer_from_rte_mbuf ((struct rte_mbuf *) obj); + if (b->pre_data[0] != 0) + { + if (b->pre_data[0] == DPDK_TRAJECTORY_POISON) + counter->uninitialized++; + else + counter->invalid++; + } +} + +int +dpdk_buffer_validate_trajectory_all (u32 * uninitialized) +{ + dpdk_main_t *dm = &dpdk_main; + struct dpdk_validate_buf_result counter = { 0 }; + int i; + + for (i = 0; i < vec_len (dm->pktmbuf_pools); i++) + rte_mempool_obj_iter (dm->pktmbuf_pools[i], + dpdk_buffer_validate_trajectory, &counter); + if (uninitialized) + *uninitialized = counter.uninitialized; + return counter.invalid; +} + +static void +dpdk_buffer_poison_trajectory (struct rte_mempool *mp, void *opaque, + void *obj, unsigned obj_idx) +{ + vlib_buffer_t *b; + b = vlib_buffer_from_rte_mbuf ((struct rte_mbuf *) obj); + b->pre_data[0] = DPDK_TRAJECTORY_POISON; +} + +void +dpdk_buffer_poison_trajectory_all (void) +{ + dpdk_main_t *dm = &dpdk_main; + int i; + + for (i = 0; i < vec_len (dm->pktmbuf_pools); i++) + rte_mempool_obj_iter (dm->pktmbuf_pools[i], dpdk_buffer_poison_trajectory, + 0); +} +#endif + /* *INDENT-OFF* */ VLIB_BUFFER_REGISTER_CALLBACKS (dpdk, static) = { .vlib_buffer_alloc_cb = &dpdk_buffer_alloc, diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c index aeeb772d..c9fcea5c 100644 --- a/src/plugins/dpdk/device/cli.c +++ b/src/plugins/dpdk/device/cli.c @@ -1885,6 +1885,59 @@ VLIB_CLI_COMMAND (show_vpe_version_command, static) = { }; /* *INDENT-ON* */ +#if CLI_DEBUG + +static clib_error_t * +dpdk_validate_buffers_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd_arg) +{ + u32 n_invalid_bufs = 0, uninitialized = 0; + u32 is_poison = 0, is_test = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "poison")) + is_poison = 1; + else if (unformat (input, "trajectory")) + is_test = 1; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (VLIB_BUFFER_TRACE_TRAJECTORY == 0) + { + vlib_cli_output (vm, "Trajectory not enabled. Recompile with " + "VLIB_BUFFER_TRACE_TRAJECTORY 1"); + return 0; + } + if (is_poison) + { + dpdk_buffer_poison_trajectory_all (); + } + if (is_test) + { + n_invalid_bufs = dpdk_buffer_validate_trajectory_all (&uninitialized); + if (!n_invalid_bufs) + vlib_cli_output (vm, "All buffers are valid %d uninitialized", + uninitialized); + else + vlib_cli_output (vm, "Found %d invalid buffers and %d uninitialized", + n_invalid_bufs, uninitialized); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (test_dpdk_buffers_command, static) = +{ + .path = "test dpdk buffers", + .short_help = "test dpdk buffers [poison] [trajectory]", + .function = dpdk_validate_buffers_fn, +}; +/* *INDENT-ON* */ + +#endif + clib_error_t * dpdk_cli_init (vlib_main_t * vm) { diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 97c13630..aa134327 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -462,6 +462,11 @@ dpdk_interface_tx (vlib_main_t * vm, or_flags = b0->flags | b1->flags | b2->flags | b3->flags; + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b2); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b3); + if (or_flags & VLIB_BUFFER_NEXT_PRESENT) { dpdk_validate_rte_mbuf (vm, b0, 1); @@ -556,6 +561,7 @@ dpdk_interface_tx (vlib_main_t * vm, from++; b0 = vlib_get_buffer (vm, bi0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); dpdk_validate_rte_mbuf (vm, b0, 1); diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 849e687b..9762c713 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -467,6 +467,11 @@ admin_up_down_process (vlib_main_t * vm, clib_error_t *dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, unsigned socket_id); +#if CLI_DEBUG +int dpdk_buffer_validate_trajectory_all (u32 * uninitialized); +void dpdk_buffer_poison_trajectory_all (void); +#endif + #endif /* __included_dpdk_h__ */ /* diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index a5ec0e0a..7399b618 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -686,7 +686,7 @@ vlib_buffer_free_inline (vlib_main_t * vm, u32 bi = buffers[i]; b = vlib_get_buffer (vm, bi); - + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b); fl = vlib_buffer_get_buffer_free_list (vm, b, &fi); /* The only current use of this callback: multicast recycle */ diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index b3de1201..c526003c 100755 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -2131,6 +2131,7 @@ ip4_arp_inline (vlib_main_t * vm, vlib_buffer_copy_trace_flag (vm, p0, bi0); b0 = vlib_get_buffer (vm, bi0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0; vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes); diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 9cb3e779..b843c926 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -439,14 +439,16 @@ tcp_init_mss (tcp_connection_t * tc) always_inline int tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers) { + vlib_main_t *vm = vlib_get_main (); u32 current_length = vec_len (tm->tx_buffers[thread_index]); + u32 n_allocated; vec_validate (tm->tx_buffers[thread_index], current_length + n_free_buffers - 1); - _vec_len (tm->tx_buffers[thread_index]) = current_length - + vlib_buffer_alloc (vlib_get_main (), - &tm->tx_buffers[thread_index][current_length], - n_free_buffers); + n_allocated = + vlib_buffer_alloc (vm, &tm->tx_buffers[thread_index][current_length], + n_free_buffers); + _vec_len (tm->tx_buffers[thread_index]) = current_length + n_allocated; /* buffer shortage, report failure */ if (vec_len (tm->tx_buffers[thread_index]) == 0) { -- cgit 1.2.3-korg