From 7cd468a3d7dee7d6c92f69a0bb7061ae208ec727 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 19 Dec 2016 23:05:39 +0100 Subject: Reorganize source tree to use single autotools instance Change-Id: I7b51f88292e057c6443b12224486f2d0c9f8ae23 Signed-off-by: Damjan Marion --- src/vnet/fib/fib_walk.c | 1108 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1108 insertions(+) create mode 100644 src/vnet/fib/fib_walk.c (limited to 'src/vnet/fib/fib_walk.c') diff --git a/src/vnet/fib/fib_walk.c b/src/vnet/fib/fib_walk.c new file mode 100644 index 00000000..938f7b8c --- /dev/null +++ b/src/vnet/fib/fib_walk.c @@ -0,0 +1,1108 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +/** + * The flags on a walk + */ +typedef enum fib_walk_flags_t_ +{ + /** + * A synchronous walk. + * This walk will run to completion, i.e. visit ALL the children. + * It is a depth first traversal of the graph. + */ + FIB_WALK_FLAG_SYNC = (1 << 0), + /** + * An asynchronous walk. + * This walk will be scheduled to run in the background. It will thus visits + * the children at a later point in time. + * It is a depth first traversal of the graph. + */ + FIB_WALK_FLAG_ASYNC = (1 << 1), + /** + * An indication that the walk is currently executing. + */ + FIB_WALK_FLAG_EXECUTING = (1 << 2), +} fib_walk_flags_t; + +/** + * A representation of a graph walk from a parent object to its children + */ +typedef struct fib_walk_t_ +{ + /** + * FIB node linkage. This object is not in the FIB object graph, + * but it is present in other node's dependency lists, so it needs to + * be pointerable to. + */ + fib_node_t fw_node; + + /** + * the walk's flags + */ + fib_walk_flags_t fw_flags; + + /** + * Sibling index in the dependency list + */ + u32 fw_dep_sibling; + + /** + * Sibling index in the list of all walks + */ + u32 fw_prio_sibling; + + /** + * Pointer to the node whose dependants this walk is walking + */ + fib_node_ptr_t fw_parent; + + /** + * Number of nodes visited by this walk. saved for debugging purposes. + */ + u32 fw_n_visits; + + /** + * Time the walk started + */ + f64 fw_start_time; + + /** + * The reasons this walk is occuring. + * This is a vector ordered in time. The reasons and the front were started + * first, and so should be acted first when a node is visisted. 
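+ *
+ * As an illustration (using names defined later in this file), a
+ * child visit applies the contexts oldest-first:
+ *
+ *    fib_node_back_walk_ctx_t *ctx;
+ *
+ *    vec_foreach(ctx, fwalk->fw_ctx)   // index 0 = oldest reason
+ *        fib_node_back_walk_one(&sibling, ctx);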
+ */ + fib_node_back_walk_ctx_t *fw_ctx; +} fib_walk_t; + +/** + * @brief The pool of all walk objects + */ +static fib_walk_t *fib_walk_pool; + +/** + * @brief There's only one event type sent to the walk process + */ +#define FIB_WALK_EVENT 0 + +/** + * Statistics maintained per-walk queue + */ +typedef enum fib_walk_queue_stats_t_ +{ + FIB_WALK_SCHEDULED, + FIB_WALK_COMPLETED, +} fib_walk_queue_stats_t; +#define FIB_WALK_QUEUE_STATS_NUM ((fib_walk_queue_stats_t)(FIB_WALK_COMPLETED+1)) + +#define FIB_WALK_QUEUE_STATS { \ + [FIB_WALK_SCHEDULED] = "scheduled", \ + [FIB_WALK_COMPLETED] = "completed", \ +} + +#define FOR_EACH_FIB_WALK_QUEUE_STATS(_wqs) \ + for ((_wqs) = FIB_WALK_SCHEDULED; \ + (_wqs) < FIB_WALK_QUEUE_STATS_NUM; \ + (_wqs)++) + +/** + * The names of the walk stats + */ +static const char * const fib_walk_queue_stats_names[] = FIB_WALK_QUEUE_STATS; +/** + * The names of the walk reasons + */ +static const char * const fib_node_bw_reason_names[] = FIB_NODE_BW_REASONS; + +/** + * A represenation of one queue of walk + */ +typedef struct fib_walk_queue_t_ +{ + /** + * Qeuee stats + */ + u64 fwq_stats[FIB_WALK_QUEUE_STATS_NUM]; + + /** + * The node list which acts as the queue + */ + fib_node_list_t fwq_queue; +} fib_walk_queue_t; + +/** + * A set of priority queues for outstanding walks + */ +typedef struct fib_walk_queues_t_ +{ + fib_walk_queue_t fwqs_queues[FIB_WALK_PRIORITY_NUM]; +} fib_walk_queues_t; + +/** + * The global queues of outstanding walks + */ +static fib_walk_queues_t fib_walk_queues; + +/** + * The names of the walk priorities + */ +static const char * const fib_walk_priority_names[] = FIB_WALK_PRIORITIES; + +/** + * @brief Histogram stats on the lenths of each walk in elemenets visisted. + * Store upto 1<<23 elements in increments of 1<<10 + */ +#define HISTOGRAM_VISITS_PER_WALK_MAX (1<<23) +#define HISTOGRAM_VISITS_PER_WALK_INCR (1<<10) +#define HISTOGRAM_VISITS_PER_WALK_N_BUCKETS \ + (HISTOGRAM_VISITS_PER_WALK_MAX/HISTOGRAM_VISITS_PER_WALK_INCR) +static u64 fib_walk_hist_vists_per_walk[HISTOGRAM_VISITS_PER_WALK_N_BUCKETS]; + +/** + * @brief History of state for the last 128 walks + */ +#define HISTORY_N_WALKS 128 +#define MAX_HISTORY_REASONS 16 +static u32 history_last_walk_pos; +typedef struct fib_walk_history_t_ { + u32 fwh_n_visits; + f64 fwh_duration; + f64 fwh_completed; + fib_node_ptr_t fwh_parent; + fib_walk_flags_t fwh_flags; + fib_node_bw_reason_flag_t fwh_reason[MAX_HISTORY_REASONS]; +} fib_walk_history_t; +static fib_walk_history_t fib_walk_history[HISTORY_N_WALKS]; + +u8* +format_fib_walk_priority (u8 *s, va_list ap) +{ + fib_walk_priority_t prio = va_arg(ap, fib_walk_priority_t); + + ASSERT(prio < FIB_WALK_PRIORITY_NUM); + + return (format(s, "%s", fib_walk_priority_names[prio])); +} +static u8* +format_fib_walk_queue_stats (u8 *s, va_list ap) +{ + fib_walk_queue_stats_t wqs = va_arg(ap, fib_walk_queue_stats_t); + + ASSERT(wqs < FIB_WALK_QUEUE_STATS_NUM); + + return (format(s, "%s", fib_walk_queue_stats_names[wqs])); +} + +static index_t +fib_walk_get_index (fib_walk_t *fwalk) +{ + return (fwalk - fib_walk_pool); +} + +static fib_walk_t * +fib_walk_get (index_t fwi) +{ + return (pool_elt_at_index(fib_walk_pool, fwi)); +} + +/* + * not static so it can be used in the unit tests + */ +u32 +fib_walk_queue_get_size (fib_walk_priority_t prio) +{ + return (fib_node_list_get_size(fib_walk_queues.fwqs_queues[prio].fwq_queue)); +} + +static fib_node_index_t +fib_walk_queue_get_front (fib_walk_priority_t prio) +{ + fib_node_ptr_t wp; + + 
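+    /*
+     * peek at, but do not pop, the walk at the head of this priority's
+     * queue; the list element yields the walk's {type, index} pair.
+     */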
fib_node_list_get_front(fib_walk_queues.fwqs_queues[prio].fwq_queue, &wp); + + return (wp.fnp_index); +} + +static void +fib_walk_destroy (fib_walk_t *fwalk) +{ + u32 bucket, ii; + + if (FIB_NODE_INDEX_INVALID != fwalk->fw_prio_sibling) + { + fib_node_list_elt_remove(fwalk->fw_prio_sibling); + } + fib_node_child_remove(fwalk->fw_parent.fnp_type, + fwalk->fw_parent.fnp_index, + fwalk->fw_dep_sibling); + + /* + * add the stats to the continuous histogram collection. + */ + bucket = (fwalk->fw_n_visits / HISTOGRAM_VISITS_PER_WALK_INCR); + bucket = (bucket >= HISTOGRAM_VISITS_PER_WALK_N_BUCKETS ? + HISTOGRAM_VISITS_PER_WALK_N_BUCKETS - 1 : + bucket); + fib_walk_hist_vists_per_walk[bucket]++; + + /* + * save stats to the recent history + */ + + fib_walk_history[history_last_walk_pos].fwh_n_visits = + fwalk->fw_n_visits; + fib_walk_history[history_last_walk_pos].fwh_completed = + vlib_time_now(vlib_get_main()); + fib_walk_history[history_last_walk_pos].fwh_duration = + fib_walk_history[history_last_walk_pos].fwh_completed - + fwalk->fw_start_time; + fib_walk_history[history_last_walk_pos].fwh_parent = + fwalk->fw_parent; + fib_walk_history[history_last_walk_pos].fwh_flags = + fwalk->fw_flags; + + vec_foreach_index(ii, fwalk->fw_ctx) + { + if (ii < MAX_HISTORY_REASONS) + { + fib_walk_history[history_last_walk_pos].fwh_reason[ii] = + fwalk->fw_ctx[ii].fnbw_reason; + } + } + + history_last_walk_pos = (history_last_walk_pos + 1) % HISTORY_N_WALKS; + + fib_node_deinit(&fwalk->fw_node); + vec_free(fwalk->fw_ctx); + pool_put(fib_walk_pool, fwalk); +} + +/** + * return code when advancing a walk + */ +typedef enum fib_walk_advance_rc_t_ +{ + /** + * The walk is complete + */ + FIB_WALK_ADVANCE_DONE, + /** + * the walk has more work + */ + FIB_WALK_ADVANCE_MORE, + /** + * The walk merged with the one in front + */ + FIB_WALK_ADVANCE_MERGE, +} fib_walk_advance_rc_t; + +/** + * @brief Advance the walk one element in its work list + */ +static fib_walk_advance_rc_t +fib_walk_advance (fib_node_index_t fwi) +{ + fib_node_back_walk_ctx_t *ctx, *old; + fib_node_back_walk_rc_t wrc; + fib_node_ptr_t sibling; + fib_walk_t *fwalk; + int more_elts; + + /* + * this walk function is re-entrant - walks acan spawn walks. + * fib_walk_t objects come from a pool, so they can realloc. we need + * to retch from said pool at the appropriate times. + */ + fwalk = fib_walk_get(fwi); + + more_elts = fib_node_list_elt_get_next(fwalk->fw_dep_sibling, &sibling); + + if (more_elts) + { + old = fwalk->fw_ctx; + + vec_foreach(ctx, fwalk->fw_ctx) + { + wrc = fib_node_back_walk_one(&sibling, ctx); + + fwalk = fib_walk_get(fwi); + fwalk->fw_n_visits++; + + if (FIB_NODE_BACK_WALK_MERGE == wrc) + { + /* + * this walk has merged with the one further along the node's + * dependecy list. 
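+		 * The merge itself happened in fib_walk_back_walk_notify(),
+		 * which copied this walk's context onto the walk in front.
+		 * This walk object is now redundant; the caller destroys it
+		 * (or, for a sync walk, continues with the one merged into).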
+ */ + return (FIB_WALK_ADVANCE_MERGE); + } + if (old != fwalk->fw_ctx) + { + /* + * nasty re-entrant addition of a walk has realloc'd the vector + * break out + */ + return (FIB_WALK_ADVANCE_MERGE); + } + } + /* + * move foward to the next node to visit + */ + more_elts = fib_node_list_advance(fwalk->fw_dep_sibling); + } + + if (more_elts) + { + return (FIB_WALK_ADVANCE_MORE); + } + + return (FIB_WALK_ADVANCE_DONE); +} + +/** + * @breif Enurmerate the times of sleep between walks + */ +typedef enum fib_walk_sleep_type_t_ +{ + FIB_WALK_SHORT_SLEEP, + FIB_WALK_LONG_SLEEP, +} fib_walk_sleep_type_t; + +#define FIB_WALK_N_SLEEP (FIB_WALK_LONG_SLEEP+1) + +/** + * @brief Durations for the sleep types + */ +static f64 fib_walk_sleep_duration[] = { + [FIB_WALK_LONG_SLEEP] = 1e-3, + [FIB_WALK_SHORT_SLEEP] = 1e-8, +}; + +/** + * @brief The time quota for a walk. When more than this amount of time is + * spent, the walk process will yield. + */ +static f64 quota = 1e-4; + +/** + * Histogram on the amount of work done (in msecs) in each walk + */ +#define N_TIME_BUCKETS 128 +#define TIME_INCREMENTS (N_TIME_BUCKETS/2) +static u64 fib_walk_work_time_taken[N_TIME_BUCKETS]; + +/** + * Histogram on the number of nodes visted in each quota + */ +#define N_ELTS_BUCKETS 128 +static u32 fib_walk_work_nodes_visisted_incr = 2; +static u64 fib_walk_work_nodes_visited[N_ELTS_BUCKETS]; + +/** + * Histogram of the sleep lengths + */ +static u64 fib_walk_sleep_lengths[2]; + +/** + * @brief Service the queues + * This is not declared static so that it can be unit tested - i know i know... + */ +f64 +fib_walk_process_queues (vlib_main_t * vm, + const f64 quota) +{ + f64 start_time, consumed_time; + fib_walk_sleep_type_t sleep; + fib_walk_priority_t prio; + fib_walk_advance_rc_t rc; + fib_node_index_t fwi; + fib_walk_t *fwalk; + u32 n_elts; + i32 bucket; + + consumed_time = 0; + start_time = vlib_time_now(vm); + n_elts = 0; + + FOR_EACH_FIB_WALK_PRIORITY(prio) + { + while (0 != fib_walk_queue_get_size(prio)) + { + fwi = fib_walk_queue_get_front(prio); + + /* + * set this walk as executing + */ + fwalk = fib_walk_get(fwi); + fwalk->fw_flags |= FIB_WALK_FLAG_EXECUTING; + + do + { + rc = fib_walk_advance(fwi); + n_elts++; + consumed_time = (vlib_time_now(vm) - start_time); + } while ((consumed_time < quota) && + (FIB_WALK_ADVANCE_MORE == rc)); + + /* + * if this walk has no more work then pop it from the queue + * and move on to the next. + */ + if (FIB_WALK_ADVANCE_MORE != rc) + { + fwalk = fib_walk_get(fwi); + fib_walk_destroy(fwalk); + fib_walk_queues.fwqs_queues[prio].fwq_stats[FIB_WALK_COMPLETED]++; + } + else + { + /* + * passed our work quota. sleep time. + */ + fwalk = fib_walk_get(fwi); + fwalk->fw_flags &= ~FIB_WALK_FLAG_EXECUTING; + sleep = FIB_WALK_SHORT_SLEEP; + goto that_will_do_for_now; + } + } + } + /* + * got to the end of all the work + */ + sleep = FIB_WALK_LONG_SLEEP; + +that_will_do_for_now: + + /* + * collect the stats: + * - for the number of nodes visisted we store 128 increments + * - for the time consumed we store quota/TIME_INCREMENTS increments. + */ + bucket = ((n_elts/fib_walk_work_nodes_visisted_incr) > N_ELTS_BUCKETS ? + N_ELTS_BUCKETS-1 : + n_elts/fib_walk_work_nodes_visisted_incr); + ++fib_walk_work_nodes_visited[bucket]; + + bucket = (consumed_time - quota) / (quota / TIME_INCREMENTS); + bucket += N_TIME_BUCKETS/2; + bucket = (bucket < 0 ? 0 : bucket); + bucket = (bucket > N_TIME_BUCKETS-1 ? 
N_TIME_BUCKETS-1 : bucket); + ++fib_walk_work_time_taken[bucket]; + + ++fib_walk_sleep_lengths[sleep]; + + return (fib_walk_sleep_duration[sleep]); +} + +/** + * @brief The 'fib-walk' process's main loop. + */ +static uword +fib_walk_process (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * f) +{ + f64 sleep_time; + + sleep_time = fib_walk_sleep_duration[FIB_WALK_SHORT_SLEEP]; + + while (1) + { + vlib_process_wait_for_event_or_clock(vm, sleep_time); + + /* + * there may be lots of event queued between the processes, + * but the walks we want to schedule are in the priority queues, + * so we ignore the process events. + */ + vlib_process_get_events(vm, NULL); + + sleep_time = fib_walk_process_queues(vm, quota); + } + + /* + * Unreached + */ + ASSERT(!"WTF"); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (fib_walk_process_node,static) = { + .function = fib_walk_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "fib-walk", +}; +/* *INDENT-ON* */ + +/** + * @brief Allocate a new walk object + */ +static fib_walk_t * +fib_walk_alloc (fib_node_type_t parent_type, + fib_node_index_t parent_index, + fib_walk_flags_t flags, + fib_node_back_walk_ctx_t *ctx) +{ + fib_walk_t *fwalk; + + pool_get(fib_walk_pool, fwalk); + + fib_node_init(&fwalk->fw_node, FIB_NODE_TYPE_WALK); + + fwalk->fw_flags = flags; + fwalk->fw_dep_sibling = FIB_NODE_INDEX_INVALID; + fwalk->fw_prio_sibling = FIB_NODE_INDEX_INVALID; + fwalk->fw_parent.fnp_index = parent_index; + fwalk->fw_parent.fnp_type = parent_type; + fwalk->fw_ctx = NULL; + fwalk->fw_start_time = vlib_time_now(vlib_get_main()); + fwalk->fw_n_visits = 0; + + /* + * make a copy of the backwalk context so the depth count remains + * the same for each sibling visitsed. This is important in the case + * where a parent has a loop via one child, but all the others are not. + * if the looped child were visited first, the depth count would exceed, the + * max and the walk would terminate before it reached the other siblings. + */ + vec_add1(fwalk->fw_ctx, *ctx); + + return (fwalk); +} + +/** + * @brief Enqueue a walk onto the appropriate priority queue. Then signal + * the background process there is work to do. + */ +static index_t +fib_walk_prio_queue_enquue (fib_walk_priority_t prio, + fib_walk_t *fwalk) +{ + index_t sibling; + + sibling = fib_node_list_push_front(fib_walk_queues.fwqs_queues[prio].fwq_queue, + 0, + FIB_NODE_TYPE_WALK, + fib_walk_get_index(fwalk)); + fib_walk_queues.fwqs_queues[prio].fwq_stats[FIB_WALK_SCHEDULED]++; + + /* + * poke the fib-walk process to perform the async walk. + * we are not passing it specific data, hence the last two args, + * the process will drain the queues + */ + vlib_process_signal_event(vlib_get_main(), + fib_walk_process_node.index, + FIB_WALK_EVENT, + FIB_WALK_EVENT); + + return (sibling); +} + +void +fib_walk_async (fib_node_type_t parent_type, + fib_node_index_t parent_index, + fib_walk_priority_t prio, + fib_node_back_walk_ctx_t *ctx) +{ + fib_walk_t *fwalk; + + if (FIB_NODE_GRAPH_MAX_DEPTH < ++ctx->fnbw_depth) + { + /* + * The walk has reached the maximum depth. there is a loop in the graph. + * bail. + */ + return; + } + if (0 == fib_node_get_n_children(parent_type, + parent_index)) + { + /* + * no children to walk - quit now + */ + return; + } + if (ctx->fnbw_flags & FIB_NODE_BW_FLAG_FORCE_SYNC) + { + /* + * the originator of the walk wanted it to be synchronous, but the + * parent object chose async - denied. 
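+	 * The FORCE_SYNC flag wins; the request is redirected to the
+	 * synchronous walk below instead of being queued.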
+ */ + return (fib_walk_sync(parent_type, parent_index, ctx)); + } + + + fwalk = fib_walk_alloc(parent_type, + parent_index, + FIB_WALK_FLAG_ASYNC, + ctx); + + fwalk->fw_dep_sibling = fib_node_child_add(parent_type, + parent_index, + FIB_NODE_TYPE_WALK, + fib_walk_get_index(fwalk)); + + fwalk->fw_prio_sibling = fib_walk_prio_queue_enquue(prio, fwalk); +} + +/** + * @brief Back walk all the children of a FIB node. + * + * note this is a synchronous depth first walk. Children visited may propagate + * the walk to thier children. Other children node types may not propagate, + * synchronously but instead queue the walk for later async completion. + */ +void +fib_walk_sync (fib_node_type_t parent_type, + fib_node_index_t parent_index, + fib_node_back_walk_ctx_t *ctx) +{ + fib_walk_advance_rc_t rc; + fib_node_index_t fwi; + fib_walk_t *fwalk; + + if (FIB_NODE_GRAPH_MAX_DEPTH < ++ctx->fnbw_depth) + { + /* + * The walk has reached the maximum depth. there is a loop in the graph. + * bail. + */ + return; + } + if (0 == fib_node_get_n_children(parent_type, + parent_index)) + { + /* + * no children to walk - quit now + */ + return; + } + + fwalk = fib_walk_alloc(parent_type, + parent_index, + FIB_WALK_FLAG_SYNC, + ctx); + + fwalk->fw_dep_sibling = fib_node_child_add(parent_type, + parent_index, + FIB_NODE_TYPE_WALK, + fib_walk_get_index(fwalk)); + fwi = fib_walk_get_index(fwalk); + + while (1) + { + /* + * set this walk as executing + */ + fwalk->fw_flags |= FIB_WALK_FLAG_EXECUTING; + + do + { + rc = fib_walk_advance(fwi); + } while (FIB_WALK_ADVANCE_MORE == rc); + + + /* + * this walk function is re-entrant - walks can spawn walks. + * fib_walk_t objects come from a pool, so they can realloc. we need + * to re-fetch from said pool at the appropriate times. + */ + fwalk = fib_walk_get(fwi); + + if (FIB_WALK_ADVANCE_MERGE == rc) + { + /* + * this sync walk merged with an walk in front. + * by reqeusting a sync walk the client wanted all children walked, + * so we ditch the walk object in hand and continue with the one + * we merged into + */ + fib_node_ptr_t merged_walk; + + fib_node_list_elt_get_next(fwalk->fw_dep_sibling, &merged_walk); + + ASSERT(FIB_NODE_INDEX_INVALID != merged_walk.fnp_index); + ASSERT(FIB_NODE_TYPE_WALK == merged_walk.fnp_type); + + fib_walk_destroy(fwalk); + + fwi = merged_walk.fnp_index; + fwalk = fib_walk_get(fwi); + + if (FIB_WALK_FLAG_EXECUTING & fwalk->fw_flags) + { + /* + * we are executing a sync walk, and we have met with another + * walk that is also executing. since only one walk executs at once + * (there is no multi-threading) this implies we have met ourselves + * and hence the is a loop in the graph. + * This function is re-entrant, so the walk object we met is being + * acted on in a stack frame below this one. We must therefore not + * continue with it now, but let the stack unwind and along the + * appropriate frame to read the depth count and bail. + */ + fwalk = NULL; + break; + } + } + else + { + /* + * the walk reached the end of the depdency list. + */ + break; + } + } + + if (NULL != fwalk) + { + fib_walk_destroy(fwalk); + } +} + +static fib_node_t * +fib_walk_get_node (fib_node_index_t index) +{ + fib_walk_t *fwalk; + + fwalk = fib_walk_get(index); + + return (&(fwalk->fw_node)); +} + +/** + * Walk objects are not parents, nor are they locked. 
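+ * The lock and child management virtual functions below therefore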
+ * are no-ops + */ +static void +fib_walk_last_lock_gone (fib_node_t *node) +{ + ASSERT(0); +} + +static fib_walk_t* +fib_walk_get_from_node (fib_node_t *node) +{ + return ((fib_walk_t*)(((char*)node) - + STRUCT_OFFSET_OF(fib_walk_t, fw_node))); +} + +/** + * @brief Another back walk has reach this walk. + * Megre them so there is only one left. It is this node being + * visited that will remain, so copy or merge the context onto it. + */ +static fib_node_back_walk_rc_t +fib_walk_back_walk_notify (fib_node_t *node, + fib_node_back_walk_ctx_t *ctx) +{ + fib_node_back_walk_ctx_t *last; + fib_walk_t *fwalk; + + fwalk = fib_walk_get_from_node(node); + + /* + * check whether the walk context can be merged with the most recent. + * the most recent was the one last added and is thus at the back of the vector. + * we can merge walks if the reason for the walk is the same. + */ + last = vec_end(fwalk->fw_ctx) - 1; + + if (last->fnbw_reason == ctx->fnbw_reason) + { + /* + * copy the largest of the depth values. in the presence of a loop, + * the same walk will merge with itself. if we take the smaller depth + * then it will never end. + */ + last->fnbw_depth = ((last->fnbw_depth >= ctx->fnbw_depth) ? + last->fnbw_depth : + ctx->fnbw_depth); + } + else + { + /* + * walks could not be merged, this means that the walk infront needs to + * perform different action to this one that has caught up. the one in + * front was scheduled first so append the new walk context to the back + * of the list. + */ + vec_add1(fwalk->fw_ctx, *ctx); + } + + return (FIB_NODE_BACK_WALK_MERGE); +} + +/** + * The FIB walk's graph node virtual function table + */ +static const fib_node_vft_t fib_walk_vft = { + .fnv_get = fib_walk_get_node, + .fnv_last_lock = fib_walk_last_lock_gone, + .fnv_back_walk = fib_walk_back_walk_notify, +}; + +void +fib_walk_module_init (void) +{ + fib_walk_priority_t prio; + + FOR_EACH_FIB_WALK_PRIORITY(prio) + { + fib_walk_queues.fwqs_queues[prio].fwq_queue = fib_node_list_create(); + } + + fib_node_register_type(FIB_NODE_TYPE_WALK, &fib_walk_vft); +} + +static u8* +format_fib_walk (u8* s, va_list ap) +{ + fib_node_index_t fwi = va_arg(ap, fib_node_index_t); + fib_walk_t *fwalk; + + fwalk = fib_walk_get(fwi); + + return (format(s, " parent:{%s:%d} visits:%d flags:%d", + fib_node_type_get_name(fwalk->fw_parent.fnp_type), + fwalk->fw_parent.fnp_index, + fwalk->fw_n_visits, + fwalk->fw_flags)); +} + +static clib_error_t * +fib_walk_show (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + fib_walk_queue_stats_t wqs; + fib_walk_priority_t prio; + fib_node_ptr_t sibling; + fib_node_index_t fwi; + fib_walk_t *fwalk; + int more_elts, ii; + u8 *s = NULL; + +#define USEC 1000000 + vlib_cli_output(vm, "FIB Walk Quota = %.2fusec:", quota * USEC); + vlib_cli_output(vm, "FIB Walk queues:"); + + FOR_EACH_FIB_WALK_PRIORITY(prio) + { + vlib_cli_output(vm, " %U priority queue:", + format_fib_walk_priority, prio); + vlib_cli_output(vm, " Stats: "); + + FOR_EACH_FIB_WALK_QUEUE_STATS(wqs) + { + vlib_cli_output(vm, " %U:%d", + format_fib_walk_queue_stats, wqs, + fib_walk_queues.fwqs_queues[prio].fwq_stats[wqs]); + } + vlib_cli_output(vm, " Occupancy:%d", + fib_node_list_get_size( + fib_walk_queues.fwqs_queues[prio].fwq_queue)); + + more_elts = fib_node_list_get_front( + fib_walk_queues.fwqs_queues[prio].fwq_queue, + &sibling); + + while (more_elts) + { + ASSERT(FIB_NODE_INDEX_INVALID != sibling.fnp_index); + ASSERT(FIB_NODE_TYPE_WALK == sibling.fnp_type); + + fwi = sibling.fnp_index; + fwalk 
= fib_walk_get(fwi); + + vlib_cli_output(vm, " %U", format_fib_walk, fwi); + + more_elts = fib_node_list_elt_get_next(fwalk->fw_prio_sibling, + &sibling); + } + } + + vlib_cli_output(vm, "Histogram Statistics:"); + vlib_cli_output(vm, " Number of Elements visit per-quota:"); + for (ii = 0; ii < N_ELTS_BUCKETS; ii++) + { + if (0 != fib_walk_work_nodes_visited[ii]) + s = format(s, "%d:%d ", + (ii * fib_walk_work_nodes_visisted_incr), + fib_walk_work_nodes_visited[ii]); + } + vlib_cli_output(vm, " %v", s); + vec_free(s); + + vlib_cli_output(vm, " Time consumed per-quota (Quota=%f usec):", quota*USEC); + s = format(s, "0:%d ", fib_walk_work_time_taken[0]); + for (ii = 1; ii < N_TIME_BUCKETS; ii++) + { + if (0 != fib_walk_work_time_taken[ii]) + s = format(s, "%d:%d ", (u32)((((ii - N_TIME_BUCKETS/2) * + (quota / TIME_INCREMENTS)) + quota) * + USEC), + fib_walk_work_time_taken[ii]); + } + vlib_cli_output(vm, " %v", s); + vec_free(s); + + vlib_cli_output(vm, " Sleep Types:"); + vlib_cli_output(vm, " Short Long:"); + vlib_cli_output(vm, " %d %d:", + fib_walk_sleep_lengths[FIB_WALK_SHORT_SLEEP], + fib_walk_sleep_lengths[FIB_WALK_LONG_SLEEP]); + + vlib_cli_output(vm, " Number of Elements visited per-walk:"); + for (ii = 0; ii < HISTOGRAM_VISITS_PER_WALK_N_BUCKETS; ii++) + { + if (0 != fib_walk_hist_vists_per_walk[ii]) + s = format(s, "%d:%d ", + ii*HISTOGRAM_VISITS_PER_WALK_INCR, + fib_walk_hist_vists_per_walk[ii]); + } + vlib_cli_output(vm, " %v", s); + vec_free(s); + + + vlib_cli_output(vm, "Brief History (last %d walks):", HISTORY_N_WALKS); + ii = history_last_walk_pos - 1; + if (ii < 0) + ii = HISTORY_N_WALKS - 1; + + while (ii != history_last_walk_pos) + { + if (0 != fib_walk_history[ii].fwh_reason[0]) + { + fib_node_back_walk_reason_t reason; + u8 *s = NULL; + u32 jj; + + s = format(s, "[@%d]: %s:%d visits:%d duration:%.2f completed:%.2f ", + ii, fib_node_type_get_name(fib_walk_history[ii].fwh_parent.fnp_type), + fib_walk_history[ii].fwh_parent.fnp_index, + fib_walk_history[ii].fwh_n_visits, + fib_walk_history[ii].fwh_duration, + fib_walk_history[ii].fwh_completed); + if (FIB_WALK_FLAG_SYNC & fib_walk_history[ii].fwh_flags) + s = format(s, "sync, "); + if (FIB_WALK_FLAG_ASYNC & fib_walk_history[ii].fwh_flags) + s = format(s, "async, "); + + s = format(s, "reason:"); + jj = 0; + while (0 != fib_walk_history[ii].fwh_reason[jj]) + { + FOR_EACH_FIB_NODE_BW_REASON(reason) { + if ((1< Date: Tue, 18 Apr 2017 09:09:40 -0700 Subject: Improve Load-Balance MAPs - only build them for popular path-lists (where popular means more than 64 children) the reason to have a map is to improve convergence speed for recursive prefixes - if there are only a few this technique is not needed - only build them when there is at least one path that has recursive constraints, i.e. a path that can 'fail' in a PIC scenario. - Use the MAPS in the switch path. 
- PIC test cases for functionality (not convergence performance) Change-Id: I70705444c8469d22b07ae34be82cfb6a01358e10 Signed-off-by: Neale Ranns --- src/vnet/dpo/load_balance.c | 3 +- src/vnet/dpo/load_balance_map.h | 31 ++++ src/vnet/fib/fib_entry_src.c | 15 +- src/vnet/fib/fib_entry_src_rr.c | 2 +- src/vnet/fib/fib_path.c | 6 +- src/vnet/fib/fib_path.h | 2 +- src/vnet/fib/fib_path_list.c | 73 ++++++-- src/vnet/fib/fib_path_list.h | 24 +-- src/vnet/fib/fib_table.c | 12 +- src/vnet/fib/fib_test.c | 116 +++++++++++-- src/vnet/fib/fib_walk.c | 122 +++++++++++--- src/vnet/fib/fib_walk.h | 3 + src/vnet/ip/ip4_forward.c | 137 ++++++++------- src/vnet/ip/ip6_forward.c | 104 ++++++++---- src/vnet/ip/ip6_neighbor.c | 2 +- src/vnet/mpls/mpls_lookup.c | 118 ++++++++----- test/test_mpls.py | 359 ++++++++++++++++++++++++++++++++++++++++ test/vpp_ip_route.py | 53 ++++-- 18 files changed, 966 insertions(+), 216 deletions(-) (limited to 'src/vnet/fib/fib_walk.c') diff --git a/src/vnet/dpo/load_balance.c b/src/vnet/dpo/load_balance.c index 6b0eda0e..af054f1c 100644 --- a/src/vnet/dpo/load_balance.c +++ b/src/vnet/dpo/load_balance.c @@ -118,7 +118,8 @@ load_balance_format (index_t lbi, buckets = load_balance_get_buckets(lb); s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE); - s = format(s, "[index:%d buckets:%d ", lbi, lb->lb_n_buckets); + s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto); + s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets); s = format(s, "uRPF:%d ", lb->lb_urpf); s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes); if (0 != via.packets) diff --git a/src/vnet/dpo/load_balance_map.h b/src/vnet/dpo/load_balance_map.h index 454bf4b3..237f24b0 100644 --- a/src/vnet/dpo/load_balance_map.h +++ b/src/vnet/dpo/load_balance_map.h @@ -73,6 +73,37 @@ load_balance_map_get (index_t lbmi) return (pool_elt_at_index(load_balance_map_pool, lbmi)); } +static inline u16 +load_balance_map_translate (index_t lbmi, + u16 bucket) +{ + load_balance_map_t*lbm; + + lbm = load_balance_map_get(lbmi); + + return (lbm->lbm_buckets[bucket]); +} + +static inline const dpo_id_t * +load_balance_get_fwd_bucket (const load_balance_t *lb, + u16 bucket) +{ + ASSERT(bucket < lb->lb_n_buckets); + + if (INDEX_INVALID != lb->lb_map) + { + bucket = load_balance_map_translate(lb->lb_map, bucket); + } + + if (PREDICT_TRUE(LB_HAS_INLINE_BUCKETS(lb))) + { + return (&lb->lb_buckets_inline[bucket]); + } + else + { + return (&lb->lb_buckets[bucket]); + } +} extern void load_balance_map_module_init(void); diff --git a/src/vnet/fib/fib_entry_src.c b/src/vnet/fib/fib_entry_src.c index a700282e..fd80497c 100644 --- a/src/vnet/fib/fib_entry_src.c +++ b/src/vnet/fib/fib_entry_src.c @@ -192,7 +192,7 @@ typedef struct fib_entry_src_collect_forwarding_ctx_t_ const fib_entry_t *fib_entry; const fib_entry_src_t *esrc; fib_forward_chain_type_t fct; - int is_recursive; + int n_recursive_constrained; } fib_entry_src_collect_forwarding_ctx_t; /** @@ -203,10 +203,11 @@ load_balance_flags_t fib_entry_calc_lb_flags (fib_entry_src_collect_forwarding_ctx_t *ctx) { /** - * We'll use a LB map is the path-list has recursive paths. + * We'll use a LB map if the path-list has multiple recursive paths. * recursive paths implies BGP, and hence scale. 
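 * A path counts as 'recursive constrained' when it is recursive AND
 * resolve-via-host or resolve-via-attached, i.e. a path that can fail
 * and be repaired in a PIC event - exactly what the map accelerates.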
*/ - if (ctx->is_recursive) + if (ctx->n_recursive_constrained > 1 && + fib_path_list_is_popular(ctx->esrc->fes_pl)) { return (LOAD_BALANCE_FLAG_USES_MAP); } @@ -282,9 +283,9 @@ fib_entry_src_collect_forwarding (fib_node_index_t pl_index, return (!0); } - if (fib_path_is_recursive(path_index)) + if (fib_path_is_recursive_constrained(path_index)) { - ctx->is_recursive = 1; + ctx->n_recursive_constrained += 1; } /* @@ -397,7 +398,7 @@ fib_entry_src_mk_lb (fib_entry_t *fib_entry, .esrc = esrc, .fib_entry = fib_entry, .next_hops = NULL, - .is_recursive = 0, + .n_recursive_constrained = 0, .fct = fct, }; @@ -409,7 +410,7 @@ fib_entry_src_mk_lb (fib_entry_t *fib_entry, vec_validate(ctx.next_hops, fib_path_list_get_n_paths(esrc->fes_pl)); vec_reset_length(ctx.next_hops); - lb_proto = fib_proto_to_dpo(fib_entry->fe_prefix.fp_proto); + lb_proto = fib_forw_chain_type_to_dpo_proto(fct); fib_path_list_walk(esrc->fes_pl, fib_entry_src_collect_forwarding, diff --git a/src/vnet/fib/fib_entry_src_rr.c b/src/vnet/fib/fib_entry_src_rr.c index ff15c54e..c145aaa2 100644 --- a/src/vnet/fib/fib_entry_src_rr.c +++ b/src/vnet/fib/fib_entry_src_rr.c @@ -103,7 +103,7 @@ fib_entry_src_rr_activate (fib_entry_src_t *src, fib_entry_cover_track(cover, fib_entry_get_index(fib_entry)); /* - * if the ocver is attached then install an attached-host path + * if the cover is attached then install an attached-host path * (like an adj-fib). Otherwise inherit the forwarding from the cover */ if (FIB_ENTRY_FLAG_ATTACHED & fib_entry_get_flags_i(cover)) diff --git a/src/vnet/fib/fib_path.c b/src/vnet/fib/fib_path.c index 70c87905..889317fd 100644 --- a/src/vnet/fib/fib_path.c +++ b/src/vnet/fib/fib_path.c @@ -2025,13 +2025,15 @@ fib_path_append_nh_for_multipath_hash (fib_node_index_t path_index, } int -fib_path_is_recursive (fib_node_index_t path_index) +fib_path_is_recursive_constrained (fib_node_index_t path_index) { fib_path_t *path; path = fib_path_get(path_index); - return (FIB_PATH_TYPE_RECURSIVE == path->fp_type); + return ((FIB_PATH_TYPE_RECURSIVE == path->fp_type) && + ((path->fp_cfg_flags & FIB_PATH_CFG_FLAG_RESOLVE_ATTACHED) || + (path->fp_cfg_flags & FIB_PATH_CFG_FLAG_RESOLVE_HOST))); } int diff --git a/src/vnet/fib/fib_path.h b/src/vnet/fib/fib_path.h index 334be6f5..b6bf1e4f 100644 --- a/src/vnet/fib/fib_path.h +++ b/src/vnet/fib/fib_path.h @@ -144,7 +144,7 @@ extern fib_node_index_t fib_path_copy(fib_node_index_t path_index, fib_node_index_t path_list_index); extern int fib_path_resolve(fib_node_index_t path_index); extern int fib_path_is_resolved(fib_node_index_t path_index); -extern int fib_path_is_recursive(fib_node_index_t path_index); +extern int fib_path_is_recursive_constrained(fib_node_index_t path_index); extern int fib_path_is_exclusive(fib_node_index_t path_index); extern int fib_path_is_deag(fib_node_index_t path_index); extern int fib_path_is_looped(fib_node_index_t path_index); diff --git a/src/vnet/fib/fib_path_list.c b/src/vnet/fib/fib_path_list.c index ea6565dd..64917f95 100644 --- a/src/vnet/fib/fib_path_list.c +++ b/src/vnet/fib/fib_path_list.c @@ -25,6 +25,16 @@ #include #include +/** + * The magic number of child entries that make a path-list popular. + * There's a trade-off here between convergnece and forwarding speed. + * Popular path-lists generate load-balance maps for the entires that + * use them. If the map is present there is a switch path cost to indirect + * through the map - this indirection provides the fast convergence - so + * without the map convergence is slower. 
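+ * e.g. with the threshold at 64, a path-list shared by tens of
+ * thousands of BGP prefixes gets a map (one indirection that buys
+ * fast repair for all of them), while one shared by a handful of
+ * routes keeps the cheaper direct bucket lookup.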
+ */ +#define FIB_PATH_LIST_POPULAR 64 + /** * FIB path-list * A representation of the list/set of path trough which a prefix is reachable @@ -454,14 +464,7 @@ fib_path_list_back_walk (fib_node_index_t path_list_index, /* * propagate the backwalk further */ - if (32 >= fib_node_list_get_size(path_list->fpl_node.fn_children)) - { - /* - * only a few children. continue the walk synchronously - */ - fib_walk_sync(FIB_NODE_TYPE_PATH_LIST, path_list_index, ctx); - } - else + if (path_list->fpl_flags & FIB_PATH_LIST_FLAG_POPULAR) { /* * many children. schedule a async walk @@ -471,6 +474,13 @@ fib_path_list_back_walk (fib_node_index_t path_list_index, FIB_WALK_PRIORITY_LOW, ctx); } + else + { + /* + * only a few children. continue the walk synchronously + */ + fib_walk_sync(FIB_NODE_TYPE_PATH_LIST, path_list_index, ctx); + } } /* @@ -625,6 +635,16 @@ fib_path_list_is_looped (fib_node_index_t path_list_index) return (path_list->fpl_flags & FIB_PATH_LIST_FLAG_LOOPED); } +int +fib_path_list_is_popular (fib_node_index_t path_list_index) +{ + fib_path_list_t *path_list; + + path_list = fib_path_list_get(path_list_index); + + return (path_list->fpl_flags & FIB_PATH_LIST_FLAG_POPULAR); +} + static fib_path_list_flags_t fib_path_list_flags_fixup (fib_path_list_flags_t flags) { @@ -807,6 +827,7 @@ fib_path_list_path_add (fib_node_index_t path_list_index, */ if (0 == fib_path_cmp(new_path_index, *orig_path_index)) { + fib_path_destroy(new_path_index); return (*orig_path_index); } } @@ -1173,10 +1194,38 @@ fib_path_list_child_add (fib_node_index_t path_list_index, fib_node_type_t child_type, fib_node_index_t child_index) { - return (fib_node_child_add(FIB_NODE_TYPE_PATH_LIST, - path_list_index, - child_type, - child_index)); + u32 sibling; + + sibling = fib_node_child_add(FIB_NODE_TYPE_PATH_LIST, + path_list_index, + child_type, + child_index); + + if (FIB_PATH_LIST_POPULAR == fib_node_get_n_children(FIB_NODE_TYPE_PATH_LIST, + path_list_index)) + { + /* + * Set the popular flag on the path-list once we pass the magic + * threshold. then walk children to update. + * We don't undo this action. The rational being that the number + * of entries using this prefix is large enough such that it is a + * non-trival amount of effort to converge them. If we get into the + * situation where we are adding and removing entries such that we + * flip-flop over the threshold, then this non-trivial work is added + * to each of those routes adds/deletes - not a situation we want. + */ + fib_node_back_walk_ctx_t ctx = { + .fnbw_reason = FIB_NODE_BW_REASON_FLAG_EVALUATE, + }; + fib_path_list_t *path_list; + + path_list = fib_path_list_get(path_list_index); + path_list->fpl_flags |= FIB_PATH_LIST_FLAG_POPULAR; + + fib_walk_sync(FIB_NODE_TYPE_PATH_LIST, path_list_index, &ctx); + } + + return (sibling); } void diff --git a/src/vnet/fib/fib_path_list.h b/src/vnet/fib/fib_path_list.h index 9d246211..376cb726 100644 --- a/src/vnet/fib/fib_path_list.h +++ b/src/vnet/fib/fib_path_list.h @@ -38,11 +38,6 @@ typedef enum fib_path_list_attribute_t_ { * be searched for each route update. */ FIB_PATH_LIST_ATTRIBUTE_SHARED = FIB_PATH_LIST_ATTRIBUTE_FIRST, - /** - * Indexed means the path-list keeps a hash table of all paths for - * fast lookup. The lookup result is the fib_node_index of the path. - */ - FIB_PATH_LIST_ATTRIBUTE_INDEXED, /** * explicit drop path-list. Used when the entry source needs to * force a drop, despite the fact the path info is present. @@ -65,6 +60,12 @@ typedef enum fib_path_list_attribute_t_ { * looped path-list. 
one path looped implies the whole list is */ FIB_PATH_LIST_ATTRIBUTE_LOOPED, + /** + * a popular path-ist is one that is shared amongst many entries. + * Path list become popular as they gain more children, but they + * don't become unpopular as they lose them. + */ + FIB_PATH_LIST_ATTRIBUTE_POPULAR, /** * no uRPF - do not generate unicast RPF list for this path-list */ @@ -72,30 +73,30 @@ typedef enum fib_path_list_attribute_t_ { /** * Marher. Add new flags before this one, and then update it. */ - FIB_PATH_LIST_ATTRIBUTE_LAST = FIB_PATH_LIST_ATTRIBUTE_LOOPED, + FIB_PATH_LIST_ATTRIBUTE_LAST = FIB_PATH_LIST_ATTRIBUTE_NO_URPF, } fib_path_list_attribute_t; typedef enum fib_path_list_flags_t_ { FIB_PATH_LIST_FLAG_NONE = 0, FIB_PATH_LIST_FLAG_SHARED = (1 << FIB_PATH_LIST_ATTRIBUTE_SHARED), - FIB_PATH_LIST_FLAG_INDEXED = (1 << FIB_PATH_LIST_ATTRIBUTE_INDEXED), FIB_PATH_LIST_FLAG_DROP = (1 << FIB_PATH_LIST_ATTRIBUTE_DROP), FIB_PATH_LIST_FLAG_LOCAL = (1 << FIB_PATH_LIST_ATTRIBUTE_LOCAL), FIB_PATH_LIST_FLAG_EXCLUSIVE = (1 << FIB_PATH_LIST_ATTRIBUTE_EXCLUSIVE), FIB_PATH_LIST_FLAG_RESOLVED = (1 << FIB_PATH_LIST_ATTRIBUTE_RESOLVED), FIB_PATH_LIST_FLAG_LOOPED = (1 << FIB_PATH_LIST_ATTRIBUTE_LOOPED), + FIB_PATH_LIST_FLAG_POPULAR = (1 << FIB_PATH_LIST_ATTRIBUTE_POPULAR), FIB_PATH_LIST_FLAG_NO_URPF = (1 << FIB_PATH_LIST_ATTRIBUTE_NO_URPF), } fib_path_list_flags_t; #define FIB_PATH_LIST_ATTRIBUTES { \ [FIB_PATH_LIST_ATTRIBUTE_SHARED] = "shared", \ - [FIB_PATH_LIST_ATTRIBUTE_INDEXED] = "indexed", \ [FIB_PATH_LIST_ATTRIBUTE_RESOLVED] = "resolved", \ [FIB_PATH_LIST_ATTRIBUTE_DROP] = "drop", \ [FIB_PATH_LIST_ATTRIBUTE_EXCLUSIVE] = "exclusive", \ - [FIB_PATH_LIST_ATTRIBUTE_LOCAL] = "local", \ - [FIB_PATH_LIST_ATTRIBUTE_LOOPED] = "looped", \ - [FIB_PATH_LIST_ATTRIBUTE_NO_URPF] = "no-uRPF", \ + [FIB_PATH_LIST_ATTRIBUTE_LOCAL] = "local", \ + [FIB_PATH_LIST_ATTRIBUTE_LOOPED] = "looped", \ + [FIB_PATH_LIST_ATTRIBUTE_POPULAR] = "popular", \ + [FIB_PATH_LIST_ATTRIBUTE_NO_URPF] = "no-uRPF", \ } #define FOR_EACH_PATH_LIST_ATTRIBUTE(_item) \ @@ -148,6 +149,7 @@ extern int fib_path_list_recursive_loop_detect(fib_node_index_t path_list_index, fib_node_index_t **entry_indicies); extern u32 fib_path_list_get_resolving_interface(fib_node_index_t path_list_index); extern int fib_path_list_is_looped(fib_node_index_t path_list_index); +extern int fib_path_list_is_popular(fib_node_index_t path_list_index); extern fib_protocol_t fib_path_list_get_proto(fib_node_index_t path_list_index); extern u8 * fib_path_list_format(fib_node_index_t pl_index, u8 * s); diff --git a/src/vnet/fib/fib_table.c b/src/vnet/fib/fib_table.c index 0938ce9b..ff428049 100644 --- a/src/vnet/fib/fib_table.c +++ b/src/vnet/fib/fib_table.c @@ -608,11 +608,19 @@ fib_table_entry_path_remove2 (u32 fib_index, fib_entry_src_flag_t src_flag; int was_sourced; - /* + /* + * if it's not sourced, then there's nowt to remove + */ + was_sourced = fib_entry_is_sourced(fib_entry_index, source); + if (!was_sourced) + { + return; + } + + /* * don't nobody go nowhere */ fib_entry_lock(fib_entry_index); - was_sourced = fib_entry_is_sourced(fib_entry_index, source); for (ii = 0; ii < vec_len(rpath); ii++) { diff --git a/src/vnet/fib/fib_test.c b/src/vnet/fib/fib_test.c index cbb5640a..d3bdfa35 100644 --- a/src/vnet/fib/fib_test.c +++ b/src/vnet/fib/fib_test.c @@ -729,6 +729,9 @@ fib_test_v4 (void) .ip4.as_u32 = clib_host_to_net_u32(0x0a0a0a02), }; + FIB_TEST((0 == pool_elts(load_balance_map_pool)), "LB-map pool size is %d", + pool_elts(load_balance_map_pool)); + tm = 
&test_main; /* record the nubmer of load-balances in use before we start */ @@ -3090,6 +3093,43 @@ fib_test_v4 (void) NULL, FIB_ROUTE_PATH_RESOLVE_VIA_HOST); + /* + * add a bunch load more entries using this path combo so that we get + * an LB-map created. + */ +#define N_P 128 + fib_prefix_t bgp_78s[N_P]; + for (ii = 0; ii < N_P; ii++) + { + bgp_78s[ii].fp_len = 32; + bgp_78s[ii].fp_proto = FIB_PROTOCOL_IP4; + bgp_78s[ii].fp_addr.ip4.as_u32 = clib_host_to_net_u32(0x4e000000+ii); + + + fib_table_entry_path_add(fib_index, + &bgp_78s[ii], + FIB_SOURCE_API, + FIB_ENTRY_FLAG_NONE, + FIB_PROTOCOL_IP4, + &pfx_1_1_1_3_s_32.fp_addr, + ~0, + fib_index, + 1, + NULL, + FIB_ROUTE_PATH_RESOLVE_VIA_HOST); + fib_table_entry_path_add(fib_index, + &bgp_78s[ii], + FIB_SOURCE_API, + FIB_ENTRY_FLAG_NONE, + FIB_PROTOCOL_IP4, + &nh_1_1_1_1, + ~0, + fib_index, + 1, + NULL, + FIB_ROUTE_PATH_RESOLVE_VIA_HOST); + } + fei = fib_table_lookup_exact_match(fib_index, &bgp_200_pfx); dpo = fib_entry_contribute_ip_forwarding(fei); @@ -3138,6 +3178,9 @@ fib_test_v4 (void) 1, FIB_ROUTE_PATH_FLAG_NONE); + /* suspend so the update walk kicks int */ + vlib_process_suspend(vlib_get_main(), 1e-5); + fei = fib_table_lookup_exact_match(fib_index, &bgp_200_pfx); FIB_TEST(!dpo_cmp(dpo, fib_entry_contribute_ip_forwarding(fei)), "post PIC 200.200.200.200/32 was inplace modified"); @@ -3175,6 +3218,9 @@ fib_test_v4 (void) NULL, FIB_ROUTE_PATH_FLAG_NONE); + /* suspend so the update walk kicks in */ + vlib_process_suspend(vlib_get_main(), 1e-5); + FIB_TEST(!dpo_cmp(dpo2, load_balance_get_bucket_i(lb, 0)), "post PIC recovery adj for 200.200.200.200/32 is recursive " "via adj for 1.1.1.1"); @@ -3201,6 +3247,20 @@ fib_test_v4 (void) 1, NULL, FIB_ROUTE_PATH_RESOLVE_VIA_HOST); + for (ii = 0; ii < N_P; ii++) + { + fib_table_entry_path_add(fib_index, + &bgp_78s[ii], + FIB_SOURCE_API, + FIB_ENTRY_FLAG_NONE, + FIB_PROTOCOL_IP4, + &pfx_1_1_1_2_s_32.fp_addr, + ~0, + fib_index, + 1, + NULL, + FIB_ROUTE_PATH_RESOLVE_VIA_HOST); + } fei = fib_table_lookup_exact_match(fib_index, &bgp_200_pfx); dpo = fib_entry_contribute_ip_forwarding(fei); @@ -3233,6 +3293,8 @@ fib_test_v4 (void) ~0, 1, FIB_ROUTE_PATH_FLAG_NONE); + /* suspend so the update walk kicks int */ + vlib_process_suspend(vlib_get_main(), 1e-5); fei = fib_table_lookup_exact_match(fib_index, &bgp_200_pfx); dpo = fib_entry_contribute_ip_forwarding(fei); @@ -3270,6 +3332,16 @@ fib_test_v4 (void) NULL, FIB_ROUTE_PATH_FLAG_NONE); + for (ii = 0; ii < N_P; ii++) + { + fib_table_entry_delete(fib_index, + &bgp_78s[ii], + FIB_SOURCE_API); + FIB_TEST((FIB_NODE_INDEX_INVALID == + fib_table_lookup_exact_match(fib_index, &bgp_78s[ii])), + "%U removed", + format_fib_prefix, &bgp_78s[ii]); + } fib_table_entry_path_remove(fib_index, &bgp_200_pfx, FIB_SOURCE_API, @@ -3303,6 +3375,8 @@ fib_test_v4 (void) fib_table_entry_delete(fib_index, &pfx_1_1_1_0_s_28, FIB_SOURCE_API); + /* suspend so the update walk kicks int */ + vlib_process_suspend(vlib_get_main(), 1e-5); FIB_TEST((FIB_NODE_INDEX_INVALID == fib_table_lookup_exact_match(fib_index, &pfx_1_1_1_0_s_28)), "1.1.1.1/28 removed"); @@ -3821,7 +3895,7 @@ fib_test_v4 (void) /* * -2 entries and -2 non-shared path-list */ - FIB_TEST((0 == fib_path_list_db_size()), "path list DB population:%d", + FIB_TEST((0 == fib_path_list_db_size()), "path list DB population:%d", fib_path_list_db_size()); FIB_TEST((PNBR == fib_path_list_pool_size()), "path list pool size is %d", fib_path_list_pool_size()); @@ -3855,7 +3929,7 @@ fib_test_v4 (void) FIB_TEST((ENBR-5 == 
pool_elts(fib_urpf_list_pool)), "uRPF pool size is %d", pool_elts(fib_urpf_list_pool)); FIB_TEST((0 == pool_elts(load_balance_map_pool)), "LB-map pool size is %d", - pool_elts(load_balance_map_pool)); + pool_elts(load_balance_map_pool)); FIB_TEST((lb_count == pool_elts(load_balance_pool)), "LB pool size is %d", pool_elts(load_balance_pool)); @@ -5900,6 +5974,12 @@ fib_test_label (void) .adj = DPO_PROTO_IP4, }, }; + fib_test_lb_bucket_t mpls_bucket_drop = { + .type = FT_LB_SPECIAL, + .special = { + .adj = DPO_PROTO_MPLS, + }, + }; fib_table_entry_path_remove(fib_index, &pfx_1_1_1_1_s_32, @@ -5932,9 +6012,9 @@ fib_test_label (void) &pfx_24001_neos); FIB_TEST(fib_test_validate_entry(fei, FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS, - 1, - &bucket_drop), - "24001/eos LB 1 buckets via: DROP"); + 1, + &mpls_bucket_drop), + "24001/neos LB 1 buckets via: DROP"); /* * add back the path with the valid label @@ -7707,6 +7787,12 @@ lfib_test (void) * A recursive via a label that does not exist */ fib_test_lb_bucket_t bucket_drop = { + .type = FT_LB_SPECIAL, + .special = { + .adj = DPO_PROTO_IP4, + }, + }; + fib_test_lb_bucket_t mpls_bucket_drop = { .type = FT_LB_SPECIAL, .special = { .adj = DPO_PROTO_MPLS, @@ -7735,7 +7821,12 @@ lfib_test (void) FIB_FORW_CHAIN_TYPE_UNICAST_IP4, 1, &bucket_drop), - "2.2.2.4/32 LB 1 buckets via: ip4-DROP"); + "1200/neos LB 1 buckets via: ip4-DROP"); + FIB_TEST(fib_test_validate_entry(lfe, + FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS, + 1, + &mpls_bucket_drop), + "1200/neos LB 1 buckets via: mpls-DROP"); fib_table_entry_delete(fib_index, &pfx_2_2_2_4_s_32, FIB_SOURCE_API); @@ -7940,18 +8031,19 @@ fib_test (vlib_main_t * vm, } else { - /* - * These walk UT aren't run as part of the full suite, since the - * fib-walk process must be disabled in order for the tests to work - * - * fib_test_walk(); - */ res += fib_test_v4(); res += fib_test_v6(); res += fib_test_ae(); res += fib_test_bfd(); res += fib_test_label(); res += lfib_test(); + + /* + * fib-walk process must be disabled in order for the walk tests to work + */ + fib_walk_process_disable(); + res += fib_test_walk(); + fib_walk_process_enable(); } if (res) diff --git a/src/vnet/fib/fib_walk.c b/src/vnet/fib/fib_walk.c index 938f7b8c..c570476d 100644 --- a/src/vnet/fib/fib_walk.c +++ b/src/vnet/fib/fib_walk.c @@ -95,11 +95,6 @@ typedef struct fib_walk_t_ */ static fib_walk_t *fib_walk_pool; -/** - * @brief There's only one event type sent to the walk process - */ -#define FIB_WALK_EVENT 0 - /** * Statistics maintained per-walk queue */ @@ -240,10 +235,13 @@ fib_walk_queue_get_front (fib_walk_priority_t prio) } static void -fib_walk_destroy (fib_walk_t *fwalk) +fib_walk_destroy (index_t fwi) { + fib_walk_t *fwalk; u32 bucket, ii; + fwalk = fib_walk_get(fwi); + if (FIB_NODE_INDEX_INVALID != fwalk->fw_prio_sibling) { fib_node_list_elt_remove(fwalk->fw_prio_sibling); @@ -252,6 +250,12 @@ fib_walk_destroy (fib_walk_t *fwalk) fwalk->fw_parent.fnp_index, fwalk->fw_dep_sibling); + /* + * refetch the walk object. More walks could have been spawned as a result + * of releasing the lock on the parent. + */ + fwalk = fib_walk_get(fwi); + /* * add the stats to the continuous histogram collection. 
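      * e.g. with 1<<10 sized buckets, a walk that visited 5000 children
      * lands in bucket 5000/1024 = 4; walks beyond the 1<<23 maximum
      * saturate into the last bucket.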
*/ @@ -466,8 +470,7 @@ fib_walk_process_queues (vlib_main_t * vm, */ if (FIB_WALK_ADVANCE_MORE != rc) { - fwalk = fib_walk_get(fwi); - fib_walk_destroy(fwalk); + fib_walk_destroy(fwi); fib_walk_queues.fwqs_queues[prio].fwq_stats[FIB_WALK_COMPLETED]++; } else @@ -510,6 +513,16 @@ that_will_do_for_now: return (fib_walk_sleep_duration[sleep]); } +/** + * Events sent to the FIB walk process + */ +typedef enum fib_walk_process_event_t_ +{ + FIB_WALK_PROCESS_EVENT_DATA, + FIB_WALK_PROCESS_EVENT_ENABLE, + FIB_WALK_PROCESS_EVENT_DISABLE, +} fib_walk_process_event; + /** * @brief The 'fib-walk' process's main loop. */ @@ -518,22 +531,47 @@ fib_walk_process (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) { + uword event_type, *event_data = 0; f64 sleep_time; + int enabled; + enabled = 1; sleep_time = fib_walk_sleep_duration[FIB_WALK_SHORT_SLEEP]; while (1) { - vlib_process_wait_for_event_or_clock(vm, sleep_time); + /* + * the feature to disable/enable this walk process is only + * for testing purposes + */ + if (enabled) + { + vlib_process_wait_for_event_or_clock(vm, sleep_time); + } + else + { + vlib_process_wait_for_event(vm); + } - /* - * there may be lots of event queued between the processes, - * but the walks we want to schedule are in the priority queues, - * so we ignore the process events. - */ - vlib_process_get_events(vm, NULL); + event_type = vlib_process_get_events(vm, &event_data); + vec_reset_length(event_data); + + switch (event_type) + { + case FIB_WALK_PROCESS_EVENT_ENABLE: + enabled = 1; + break; + case FIB_WALK_PROCESS_EVENT_DISABLE: + enabled = 0; + break; + default: + break; + } - sleep_time = fib_walk_process_queues(vm, quota); + if (enabled) + { + sleep_time = fib_walk_process_queues(vm, quota); + } } /* @@ -610,8 +648,8 @@ fib_walk_prio_queue_enquue (fib_walk_priority_t prio, */ vlib_process_signal_event(vlib_get_main(), fib_walk_process_node.index, - FIB_WALK_EVENT, - FIB_WALK_EVENT); + FIB_WALK_PROCESS_EVENT_DATA, + 0); return (sibling); } @@ -742,7 +780,7 @@ fib_walk_sync (fib_node_type_t parent_type, ASSERT(FIB_NODE_INDEX_INVALID != merged_walk.fnp_index); ASSERT(FIB_NODE_TYPE_WALK == merged_walk.fnp_type); - fib_walk_destroy(fwalk); + fib_walk_destroy(fwi); fwi = merged_walk.fnp_index; fwalk = fib_walk_get(fwi); @@ -774,7 +812,7 @@ fib_walk_sync (fib_node_type_t parent_type, if (NULL != fwalk) { - fib_walk_destroy(fwalk); + fib_walk_destroy(fwi); } } @@ -1106,3 +1144,47 @@ VLIB_CLI_COMMAND (fib_walk_clear_command, static) = { .short_help = "clear fib walk", .function = fib_walk_clear, }; + +void +fib_walk_process_enable (void) +{ + vlib_process_signal_event(vlib_get_main(), + fib_walk_process_node.index, + FIB_WALK_PROCESS_EVENT_ENABLE, + 0); +} + +void +fib_walk_process_disable (void) +{ + vlib_process_signal_event(vlib_get_main(), + fib_walk_process_node.index, + FIB_WALK_PROCESS_EVENT_DISABLE, + 0); +} + +static clib_error_t * +fib_walk_process_enable_disable (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + if (unformat (input, "enable")) + { + fib_walk_process_enable(); + } + else if (unformat (input, "disable")) + { + fib_walk_process_disable(); + } + else + { + return clib_error_return(0, "choose enable or disable"); + } + return (NULL); +} + +VLIB_CLI_COMMAND (fib_walk_process_command, static) = { + .path = "test fib-walk-process", + .short_help = "test fib-walk-process [enable|disable]", + .function = fib_walk_process_enable_disable, +}; diff --git a/src/vnet/fib/fib_walk.h b/src/vnet/fib/fib_walk.h index 
7413d8a2..fdf2f10c 100644 --- a/src/vnet/fib/fib_walk.h +++ b/src/vnet/fib/fib_walk.h @@ -54,5 +54,8 @@ extern void fib_walk_sync(fib_node_type_t parent_type, extern u8* format_fib_walk_priority(u8 *s, va_list ap); +extern void fib_walk_process_enable(void); +extern void fib_walk_process_disable(void); + #endif diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 0f562037..697d2169 100644 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -49,6 +49,7 @@ #include /* for FIB uRPF check */ #include #include +#include #include #include /* for mFIB table and entry creation */ @@ -89,7 +90,6 @@ ip4_lookup_inline (vlib_main_t * vm, { vlib_buffer_t *p0, *p1, *p2, *p3; ip4_header_t *ip0, *ip1, *ip2, *ip3; - __attribute__ ((unused)) tcp_header_t *tcp0, *tcp1, *tcp2, *tcp3; ip_lookup_next_t next0, next1, next2, next3; const load_balance_t *lb0, *lb1, *lb2, *lb3; ip4_fib_mtrie_t *mtrie0, *mtrie1, *mtrie2, *mtrie3; @@ -188,11 +188,6 @@ ip4_lookup_inline (vlib_main_t * vm, leaf3 = ip4_fib_mtrie_lookup_step_one (mtrie3, dst_addr3); } - tcp0 = (void *) (ip0 + 1); - tcp1 = (void *) (ip1 + 1); - tcp2 = (void *) (ip2 + 1); - tcp3 = (void *) (ip3 + 1); - if (!lookup_for_responses_to_locally_received_packets) { leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2); @@ -230,6 +225,15 @@ ip4_lookup_inline (vlib_main_t * vm, lb2 = load_balance_get (lb_index2); lb3 = load_balance_get (lb_index3); + ASSERT (lb0->lb_n_buckets > 0); + ASSERT (is_pow2 (lb0->lb_n_buckets)); + ASSERT (lb1->lb_n_buckets > 0); + ASSERT (is_pow2 (lb1->lb_n_buckets)); + ASSERT (lb2->lb_n_buckets > 0); + ASSERT (is_pow2 (lb2->lb_n_buckets)); + ASSERT (lb3->lb_n_buckets > 0); + ASSERT (is_pow2 (lb3->lb_n_buckets)); + /* Use flow hash to compute multipath adjacency. 
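	     The hash is folded into a bucket with (hash & (n_buckets-1)) -
	     hence the power-of-2 asserts above - and, in this commit,
	     load_balance_get_fwd_bucket() applies the LB-map indirection,
	     when present, before returning the bucket's DPO.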
*/ hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0; hash_c1 = vnet_buffer (p1)->ip.flow_hash = 0; @@ -240,47 +244,57 @@ ip4_lookup_inline (vlib_main_t * vm, flow_hash_config0 = lb0->lb_hash_config; hash_c0 = vnet_buffer (p0)->ip.flow_hash = ip4_compute_flow_hash (ip0, flow_hash_config0); + dpo0 = + load_balance_get_fwd_bucket (lb0, + (hash_c0 & + (lb0->lb_n_buckets_minus_1))); + } + else + { + dpo0 = load_balance_get_bucket_i (lb0, 0); } if (PREDICT_FALSE (lb1->lb_n_buckets > 1)) { flow_hash_config1 = lb1->lb_hash_config; hash_c1 = vnet_buffer (p1)->ip.flow_hash = ip4_compute_flow_hash (ip1, flow_hash_config1); + dpo1 = + load_balance_get_fwd_bucket (lb1, + (hash_c1 & + (lb1->lb_n_buckets_minus_1))); + } + else + { + dpo1 = load_balance_get_bucket_i (lb1, 0); } if (PREDICT_FALSE (lb2->lb_n_buckets > 1)) { flow_hash_config2 = lb2->lb_hash_config; hash_c2 = vnet_buffer (p2)->ip.flow_hash = ip4_compute_flow_hash (ip2, flow_hash_config2); + dpo2 = + load_balance_get_fwd_bucket (lb2, + (hash_c2 & + (lb2->lb_n_buckets_minus_1))); + } + else + { + dpo2 = load_balance_get_bucket_i (lb2, 0); } if (PREDICT_FALSE (lb3->lb_n_buckets > 1)) { flow_hash_config3 = lb3->lb_hash_config; hash_c3 = vnet_buffer (p3)->ip.flow_hash = ip4_compute_flow_hash (ip3, flow_hash_config3); + dpo3 = + load_balance_get_fwd_bucket (lb3, + (hash_c3 & + (lb3->lb_n_buckets_minus_1))); + } + else + { + dpo3 = load_balance_get_bucket_i (lb3, 0); } - - ASSERT (lb0->lb_n_buckets > 0); - ASSERT (is_pow2 (lb0->lb_n_buckets)); - ASSERT (lb1->lb_n_buckets > 0); - ASSERT (is_pow2 (lb1->lb_n_buckets)); - ASSERT (lb2->lb_n_buckets > 0); - ASSERT (is_pow2 (lb2->lb_n_buckets)); - ASSERT (lb3->lb_n_buckets > 0); - ASSERT (is_pow2 (lb3->lb_n_buckets)); - - dpo0 = load_balance_get_bucket_i (lb0, - (hash_c0 & - (lb0->lb_n_buckets_minus_1))); - dpo1 = load_balance_get_bucket_i (lb1, - (hash_c1 & - (lb1->lb_n_buckets_minus_1))); - dpo2 = load_balance_get_bucket_i (lb2, - (hash_c2 & - (lb2->lb_n_buckets_minus_1))); - dpo3 = load_balance_get_bucket_i (lb3, - (hash_c3 & - (lb3->lb_n_buckets_minus_1))); next0 = dpo0->dpoi_next_node; vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; @@ -293,20 +307,16 @@ ip4_lookup_inline (vlib_main_t * vm, vlib_increment_combined_counter (cm, thread_index, lb_index0, 1, - vlib_buffer_length_in_chain (vm, p0) - + sizeof (ethernet_header_t)); + vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter (cm, thread_index, lb_index1, 1, - vlib_buffer_length_in_chain (vm, p1) - + sizeof (ethernet_header_t)); + vlib_buffer_length_in_chain (vm, p1)); vlib_increment_combined_counter (cm, thread_index, lb_index2, 1, - vlib_buffer_length_in_chain (vm, p2) - + sizeof (ethernet_header_t)); + vlib_buffer_length_in_chain (vm, p2)); vlib_increment_combined_counter (cm, thread_index, lb_index3, 1, - vlib_buffer_length_in_chain (vm, p3) - + sizeof (ethernet_header_t)); + vlib_buffer_length_in_chain (vm, p3)); vlib_validate_buffer_enqueue_x4 (vm, node, next, to_next, n_left_to_next, @@ -318,7 +328,6 @@ ip4_lookup_inline (vlib_main_t * vm, { vlib_buffer_t *p0; ip4_header_t *ip0; - __attribute__ ((unused)) tcp_header_t *tcp0; ip_lookup_next_t next0; const load_balance_t *lb0; ip4_fib_mtrie_t *mtrie0; @@ -352,8 +361,6 @@ ip4_lookup_inline (vlib_main_t * vm, leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, dst_addr0); } - tcp0 = (void *) (ip0 + 1); - if (!lookup_for_responses_to_locally_received_packets) leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2); @@ -371,6 +378,9 @@ ip4_lookup_inline (vlib_main_t * 
vm, ASSERT (lbi0); lb0 = load_balance_get (lbi0); + ASSERT (lb0->lb_n_buckets > 0); + ASSERT (is_pow2 (lb0->lb_n_buckets)); + /* Use flow hash to compute multipath adjacency. */ hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0; if (PREDICT_FALSE (lb0->lb_n_buckets > 1)) @@ -379,20 +389,22 @@ ip4_lookup_inline (vlib_main_t * vm, hash_c0 = vnet_buffer (p0)->ip.flow_hash = ip4_compute_flow_hash (ip0, flow_hash_config0); + dpo0 = + load_balance_get_fwd_bucket (lb0, + (hash_c0 & + (lb0->lb_n_buckets_minus_1))); + } + else + { + dpo0 = load_balance_get_bucket_i (lb0, 0); } - - ASSERT (lb0->lb_n_buckets > 0); - ASSERT (is_pow2 (lb0->lb_n_buckets)); - - dpo0 = load_balance_get_bucket_i (lb0, - (hash_c0 & - (lb0->lb_n_buckets_minus_1))); next0 = dpo0->dpoi_next_node; vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; - vlib_increment_combined_counter - (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + vlib_increment_combined_counter (cm, thread_index, lbi0, 1, + vlib_buffer_length_in_chain (vm, + p0)); from += 1; to_next += 1; @@ -555,6 +567,12 @@ ip4_load_balance (vlib_main_t * vm, hc0 = vnet_buffer (p0)->ip.flow_hash = ip4_compute_flow_hash (ip0, lb0->lb_hash_config); } + dpo0 = load_balance_get_fwd_bucket + (lb0, (hc0 & (lb0->lb_n_buckets_minus_1))); + } + else + { + dpo0 = load_balance_get_bucket_i (lb0, 0); } if (PREDICT_FALSE (lb1->lb_n_buckets > 1)) { @@ -568,14 +586,13 @@ ip4_load_balance (vlib_main_t * vm, hc1 = vnet_buffer (p1)->ip.flow_hash = ip4_compute_flow_hash (ip1, lb1->lb_hash_config); } + dpo1 = load_balance_get_fwd_bucket + (lb1, (hc1 & (lb1->lb_n_buckets_minus_1))); + } + else + { + dpo1 = load_balance_get_bucket_i (lb1, 0); } - - dpo0 = - load_balance_get_bucket_i (lb0, - hc0 & (lb0->lb_n_buckets_minus_1)); - dpo1 = - load_balance_get_bucket_i (lb1, - hc1 & (lb1->lb_n_buckets_minus_1)); next0 = dpo0->dpoi_next_node; next1 = dpo1->dpoi_next_node; @@ -629,11 +646,13 @@ ip4_load_balance (vlib_main_t * vm, hc0 = vnet_buffer (p0)->ip.flow_hash = ip4_compute_flow_hash (ip0, lb0->lb_hash_config); } + dpo0 = load_balance_get_fwd_bucket + (lb0, (hc0 & (lb0->lb_n_buckets_minus_1))); + } + else + { + dpo0 = load_balance_get_bucket_i (lb0, 0); } - - dpo0 = - load_balance_get_bucket_i (lb0, - hc0 & (lb0->lb_n_buckets_minus_1)); next0 = dpo0->dpoi_next_node; vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c index 98bfd4d1..3bc07d0e 100644 --- a/src/vnet/ip/ip6_forward.c +++ b/src/vnet/ip/ip6_forward.c @@ -45,7 +45,7 @@ #include /* for FIB uRPF check */ #include #include -#include +#include #include #include @@ -138,6 +138,10 @@ ip6_lookup_inline (vlib_main_t * vm, lb0 = load_balance_get (lbi0); lb1 = load_balance_get (lbi1); + ASSERT (lb0->lb_n_buckets > 0); + ASSERT (lb1->lb_n_buckets > 0); + ASSERT (is_pow2 (lb0->lb_n_buckets)); + ASSERT (is_pow2 (lb1->lb_n_buckets)); vnet_buffer (p0)->ip.flow_hash = vnet_buffer (p1)->ip.flow_hash = 0; @@ -146,25 +150,29 @@ ip6_lookup_inline (vlib_main_t * vm, flow_hash_config0 = lb0->lb_hash_config; vnet_buffer (p0)->ip.flow_hash = ip6_compute_flow_hash (ip0, flow_hash_config0); + dpo0 = + load_balance_get_fwd_bucket (lb0, + (vnet_buffer (p0)->ip.flow_hash & + (lb0->lb_n_buckets_minus_1))); + } + else + { + dpo0 = load_balance_get_bucket_i (lb0, 0); } if (PREDICT_FALSE (lb1->lb_n_buckets > 1)) { flow_hash_config1 = lb1->lb_hash_config; vnet_buffer (p1)->ip.flow_hash = ip6_compute_flow_hash (ip1, flow_hash_config1); + dpo1 = + load_balance_get_fwd_bucket 
(lb1, + (vnet_buffer (p1)->ip.flow_hash & + (lb1->lb_n_buckets_minus_1))); + } + else + { + dpo1 = load_balance_get_bucket_i (lb1, 0); } - - ASSERT (lb0->lb_n_buckets > 0); - ASSERT (lb1->lb_n_buckets > 0); - ASSERT (is_pow2 (lb0->lb_n_buckets)); - ASSERT (is_pow2 (lb1->lb_n_buckets)); - dpo0 = load_balance_get_bucket_i (lb0, - (vnet_buffer (p0)->ip.flow_hash & - lb0->lb_n_buckets_minus_1)); - dpo1 = load_balance_get_bucket_i (lb1, - (vnet_buffer (p1)->ip.flow_hash & - lb1->lb_n_buckets_minus_1)); - next0 = dpo0->dpoi_next_node; next1 = dpo1->dpoi_next_node; @@ -266,16 +274,24 @@ ip6_lookup_inline (vlib_main_t * vm, lb0 = load_balance_get (lbi0); vnet_buffer (p0)->ip.flow_hash = 0; + ASSERT (lb0->lb_n_buckets > 0); + ASSERT (is_pow2 (lb0->lb_n_buckets)); if (PREDICT_FALSE (lb0->lb_n_buckets > 1)) { flow_hash_config0 = lb0->lb_hash_config; vnet_buffer (p0)->ip.flow_hash = ip6_compute_flow_hash (ip0, flow_hash_config0); + dpo0 = + load_balance_get_fwd_bucket (lb0, + (vnet_buffer (p0)->ip.flow_hash & + (lb0->lb_n_buckets_minus_1))); + } + else + { + dpo0 = load_balance_get_bucket_i (lb0, 0); } - ASSERT (lb0->lb_n_buckets > 0); - ASSERT (is_pow2 (lb0->lb_n_buckets)); dpo0 = load_balance_get_bucket_i (lb0, (vnet_buffer (p0)->ip.flow_hash & lb0->lb_n_buckets_minus_1)); @@ -337,10 +353,18 @@ ip6_add_interface_routes (vnet_main_t * vnm, u32 sw_if_index, { fib_node_index_t fei; - fei = fib_table_entry_update_one_path (fib_index, &pfx, FIB_SOURCE_INTERFACE, (FIB_ENTRY_FLAG_CONNECTED | FIB_ENTRY_FLAG_ATTACHED), FIB_PROTOCOL_IP6, NULL, /* No next-hop address */ - sw_if_index, ~0, // invalid FIB index - 1, NULL, // no label stack - FIB_ROUTE_PATH_FLAG_NONE); + fei = fib_table_entry_update_one_path (fib_index, + &pfx, + FIB_SOURCE_INTERFACE, + (FIB_ENTRY_FLAG_CONNECTED | + FIB_ENTRY_FLAG_ATTACHED), + FIB_PROTOCOL_IP6, + /* No next-hop address */ + NULL, sw_if_index, + /* invalid FIB index */ + ~0, 1, + /* no label stack */ + NULL, FIB_ROUTE_PATH_FLAG_NONE); a->neighbor_probe_adj_index = fib_entry_get_adj (fei); } @@ -366,7 +390,13 @@ ip6_add_interface_routes (vnet_main_t * vnm, u32 sw_if_index, } } - fib_table_entry_update_one_path (fib_index, &pfx, FIB_SOURCE_INTERFACE, (FIB_ENTRY_FLAG_CONNECTED | FIB_ENTRY_FLAG_LOCAL), FIB_PROTOCOL_IP6, &pfx.fp_addr, sw_if_index, ~0, // invalid FIB index + fib_table_entry_update_one_path (fib_index, &pfx, + FIB_SOURCE_INTERFACE, + (FIB_ENTRY_FLAG_CONNECTED | + FIB_ENTRY_FLAG_LOCAL), + FIB_PROTOCOL_IP6, + &pfx.fp_addr, + sw_if_index, ~0, 1, NULL, FIB_ROUTE_PATH_FLAG_NONE); } @@ -780,6 +810,14 @@ ip6_load_balance (vlib_main_t * vm, hc0 = vnet_buffer (p0)->ip.flow_hash = ip6_compute_flow_hash (ip0, lb0->lb_hash_config); } + dpo0 = + load_balance_get_fwd_bucket (lb0, + (hc0 & + lb0->lb_n_buckets_minus_1)); + } + else + { + dpo0 = load_balance_get_bucket_i (lb0, 0); } if (PREDICT_FALSE (lb1->lb_n_buckets > 1)) { @@ -793,14 +831,15 @@ ip6_load_balance (vlib_main_t * vm, hc1 = vnet_buffer (p1)->ip.flow_hash = ip6_compute_flow_hash (ip1, lb1->lb_hash_config); } + dpo1 = + load_balance_get_fwd_bucket (lb1, + (hc1 & + lb1->lb_n_buckets_minus_1)); + } + else + { + dpo1 = load_balance_get_bucket_i (lb1, 0); } - - dpo0 = - load_balance_get_bucket_i (lb0, - hc0 & (lb0->lb_n_buckets_minus_1)); - dpo1 = - load_balance_get_bucket_i (lb1, - hc1 & (lb1->lb_n_buckets_minus_1)); next0 = dpo0->dpoi_next_node; next1 = dpo1->dpoi_next_node; @@ -869,10 +908,15 @@ ip6_load_balance (vlib_main_t * vm, hc0 = vnet_buffer (p0)->ip.flow_hash = ip6_compute_flow_hash (ip0, lb0->lb_hash_config); } + 
dpo0 = + load_balance_get_fwd_bucket (lb0, + (hc0 & + lb0->lb_n_buckets_minus_1)); + } + else + { + dpo0 = load_balance_get_bucket_i (lb0, 0); } - dpo0 = - load_balance_get_bucket_i (lb0, - hc0 & (lb0->lb_n_buckets_minus_1)); next0 = dpo0->dpoi_next_node; vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c index 31182770..ee80ee3d 100644 --- a/src/vnet/ip/ip6_neighbor.c +++ b/src/vnet/ip/ip6_neighbor.c @@ -630,7 +630,7 @@ vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm, n->fib_entry_index = fib_table_entry_update_one_path (fib_index, &pfx, FIB_SOURCE_ADJ, - FIB_ENTRY_FLAG_NONE, + FIB_ENTRY_FLAG_ATTACHED, FIB_PROTOCOL_IP6, &pfx.fp_addr, n->key.sw_if_index, ~0, 1, NULL, FIB_ROUTE_PATH_FLAG_NONE); diff --git a/src/vnet/mpls/mpls_lookup.c b/src/vnet/mpls/mpls_lookup.c index 3c6be7e8..4b8a3eef 100644 --- a/src/vnet/mpls/mpls_lookup.c +++ b/src/vnet/mpls/mpls_lookup.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include /** @@ -47,7 +47,7 @@ format_mpls_lookup_trace (u8 * s, va_list * args) CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); mpls_lookup_trace_t * t = va_arg (*args, mpls_lookup_trace_t *); - s = format (s, "MPLS: next [%d], lookup fib index %d, LB index %d hash %d" + s = format (s, "MPLS: next [%d], lookup fib index %d, LB index %d hash %#x " "label %d eos %d", t->next_index, t->lfib_index, t->lb_index, t->hash, vnet_mpls_uc_get_label( @@ -64,8 +64,15 @@ always_inline u32 mpls_compute_flow_hash (const mpls_unicast_header_t * hdr, flow_hash_config_t flow_hash_config) { - // FIXME - return (vnet_mpls_uc_get_label(hdr->label_exp_s_ttl)); + /* + * improve this to include: + * - all labels in the stack. + * - recognise entropy labels. + * + * We need to byte swap so we use the numerical value. i.e. an odd label + * leads to an odd bucket, as opposed to a label above and below value X.
+ */ + return (vnet_mpls_uc_get_label(clib_net_to_host_u32(hdr->label_exp_s_ttl))); } static inline uword @@ -179,17 +186,21 @@ mpls_lookup (vlib_main_t * vm, else { lb0 = load_balance_get(lbi0); + ASSERT (lb0->lb_n_buckets > 0); + ASSERT (is_pow2 (lb0->lb_n_buckets)); if (PREDICT_FALSE(lb0->lb_n_buckets > 1)) { hash_c0 = vnet_buffer (b0)->ip.flow_hash = mpls_compute_flow_hash(h0, lb0->lb_hash_config); + dpo0 = load_balance_get_fwd_bucket + (lb0, + (hash_c0 & (lb0->lb_n_buckets_minus_1))); + } + else + { + dpo0 = load_balance_get_bucket_i (lb0, 0); } - ASSERT (lb0->lb_n_buckets > 0); - ASSERT (is_pow2 (lb0->lb_n_buckets)); - dpo0 = load_balance_get_bucket_i(lb0, - (hash_c0 & - (lb0->lb_n_buckets_minus_1))); next0 = dpo0->dpoi_next_node; vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; @@ -207,17 +218,21 @@ mpls_lookup (vlib_main_t * vm, else { lb1 = load_balance_get(lbi1); + ASSERT (lb1->lb_n_buckets > 0); + ASSERT (is_pow2 (lb1->lb_n_buckets)); if (PREDICT_FALSE(lb1->lb_n_buckets > 1)) { hash_c1 = vnet_buffer (b1)->ip.flow_hash = mpls_compute_flow_hash(h1, lb1->lb_hash_config); + dpo1 = load_balance_get_fwd_bucket + (lb1, + (hash_c1 & (lb1->lb_n_buckets_minus_1))); + } + else + { + dpo1 = load_balance_get_bucket_i (lb1, 0); } - ASSERT (lb1->lb_n_buckets > 0); - ASSERT (is_pow2 (lb1->lb_n_buckets)); - dpo1 = load_balance_get_bucket_i(lb1, - (hash_c1 & - (lb1->lb_n_buckets_minus_1))); next1 = dpo1->dpoi_next_node; vnet_buffer (b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; @@ -235,17 +250,21 @@ mpls_lookup (vlib_main_t * vm, else { lb2 = load_balance_get(lbi2); + ASSERT (lb2->lb_n_buckets > 0); + ASSERT (is_pow2 (lb2->lb_n_buckets)); if (PREDICT_FALSE(lb2->lb_n_buckets > 1)) { hash_c2 = vnet_buffer (b2)->ip.flow_hash = mpls_compute_flow_hash(h2, lb2->lb_hash_config); + dpo2 = load_balance_get_fwd_bucket + (lb2, + (hash_c2 & (lb2->lb_n_buckets_minus_1))); + } + else + { + dpo2 = load_balance_get_bucket_i (lb2, 0); } - ASSERT (lb2->lb_n_buckets > 0); - ASSERT (is_pow2 (lb2->lb_n_buckets)); - dpo2 = load_balance_get_bucket_i(lb2, - (hash_c2 & - (lb2->lb_n_buckets_minus_1))); next2 = dpo2->dpoi_next_node; vnet_buffer (b2)->ip.adj_index[VLIB_TX] = dpo2->dpoi_index; @@ -263,17 +282,21 @@ mpls_lookup (vlib_main_t * vm, else { lb3 = load_balance_get(lbi3); + ASSERT (lb3->lb_n_buckets > 0); + ASSERT (is_pow2 (lb3->lb_n_buckets)); if (PREDICT_FALSE(lb3->lb_n_buckets > 1)) { hash_c3 = vnet_buffer (b3)->ip.flow_hash = mpls_compute_flow_hash(h3, lb3->lb_hash_config); + dpo3 = load_balance_get_fwd_bucket + (lb3, + (hash_c3 & (lb3->lb_n_buckets_minus_1))); + } + else + { + dpo3 = load_balance_get_bucket_i (lb3, 0); } - ASSERT (lb3->lb_n_buckets > 0); - ASSERT (is_pow2 (lb3->lb_n_buckets)); - dpo3 = load_balance_get_bucket_i(lb3, - (hash_c3 & - (lb3->lb_n_buckets_minus_1))); next3 = dpo3->dpoi_next_node; vnet_buffer (b3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index; @@ -393,20 +416,21 @@ mpls_lookup (vlib_main_t * vm, else { lb0 = load_balance_get(lbi0); + ASSERT (lb0->lb_n_buckets > 0); + ASSERT (is_pow2 (lb0->lb_n_buckets)); if (PREDICT_FALSE(lb0->lb_n_buckets > 1)) { hash_c0 = vnet_buffer (b0)->ip.flow_hash = mpls_compute_flow_hash(h0, lb0->lb_hash_config); + dpo0 = load_balance_get_fwd_bucket + (lb0, + (hash_c0 & (lb0->lb_n_buckets_minus_1))); + } + else + { + dpo0 = load_balance_get_bucket_i (lb0, 0); } - - ASSERT (lb0->lb_n_buckets > 0); - ASSERT (is_pow2 (lb0->lb_n_buckets)); - - dpo0 = load_balance_get_bucket_i(lb0, - (hash_c0 & - (lb0->lb_n_buckets_minus_1))); - next0 = 
dpo0->dpoi_next_node; vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; @@ -467,7 +491,7 @@ VLIB_REGISTER_NODE (mpls_lookup_node, static) = { .n_errors = MPLS_N_ERROR, .error_strings = mpls_error_strings, - .sibling_of = "ip4-lookup", + .sibling_of = "mpls-load-balance", .format_buffer = format_mpls_header, .format_trace = format_mpls_lookup_trace, @@ -574,6 +598,11 @@ mpls_load_balance (vlib_main_t * vm, { hc0 = vnet_buffer(p0)->ip.flow_hash = mpls_compute_flow_hash(mpls0, hc0); } + dpo0 = load_balance_get_fwd_bucket(lb0, (hc0 & lb0->lb_n_buckets_minus_1)); + } + else + { + dpo0 = load_balance_get_bucket_i (lb0, 0); } if (PREDICT_FALSE (lb1->lb_n_buckets > 1)) { @@ -585,10 +614,12 @@ mpls_load_balance (vlib_main_t * vm, { hc1 = vnet_buffer(p1)->ip.flow_hash = mpls_compute_flow_hash(mpls1, hc1); } + dpo1 = load_balance_get_fwd_bucket(lb1, (hc1 & lb1->lb_n_buckets_minus_1)); + } + else + { + dpo1 = load_balance_get_bucket_i (lb1, 0); } - - dpo0 = load_balance_get_bucket_i(lb0, hc0 & (lb0->lb_n_buckets_minus_1)); - dpo1 = load_balance_get_bucket_i(lb1, hc1 & (lb1->lb_n_buckets_minus_1)); next0 = dpo0->dpoi_next_node; next1 = dpo1->dpoi_next_node; @@ -650,9 +681,12 @@ mpls_load_balance (vlib_main_t * vm, { hc0 = vnet_buffer(p0)->ip.flow_hash = mpls_compute_flow_hash(mpls0, hc0); } + dpo0 = load_balance_get_fwd_bucket(lb0, (hc0 & lb0->lb_n_buckets_minus_1)); + } + else + { + dpo0 = load_balance_get_bucket_i (lb0, 0); } - - dpo0 = load_balance_get_bucket_i(lb0, hc0 & (lb0->lb_n_buckets_minus_1)); next0 = dpo0->dpoi_next_node; vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; @@ -676,9 +710,13 @@ VLIB_REGISTER_NODE (mpls_load_balance_node) = { .function = mpls_load_balance, .name = "mpls-load-balance", .vector_size = sizeof (u32), - .sibling_of = "mpls-lookup", - .format_trace = format_mpls_load_balance_trace, + .n_next_nodes = 1, + .next_nodes = + { + [0] = "mpls-drop", + }, + }; VLIB_NODE_FUNCTION_MULTIARCH (mpls_load_balance_node, mpls_load_balance) diff --git a/test/test_mpls.py b/test/test_mpls.py index 700b7091..0ad1ee69 100644 --- a/test/test_mpls.py +++ b/test/test_mpls.py @@ -1054,5 +1054,364 @@ class TestMPLSDisabled(VppTestCase): self.send_and_assert_no_replies(self.pg1, tx, "IPv6 disabled") +class TestMPLSPIC(VppTestCase): + """ MPLS PIC edge convergence """ + + def setUp(self): + super(TestMPLSPIC, self).setUp() + + # create 4 pg interfaces + self.create_pg_interfaces(range(4)) + + # core links + self.pg0.admin_up() + self.pg0.config_ip4() + self.pg0.resolve_arp() + self.pg0.enable_mpls() + self.pg1.admin_up() + self.pg1.config_ip4() + self.pg1.resolve_arp() + self.pg1.enable_mpls() + + # VRF (customer facing) links + self.pg2.admin_up() + self.pg2.set_table_ip4(1) + self.pg2.config_ip4() + self.pg2.resolve_arp() + self.pg2.set_table_ip6(1) + self.pg2.config_ip6() + self.pg2.resolve_ndp() + self.pg3.admin_up() + self.pg3.set_table_ip4(1) + self.pg3.config_ip4() + self.pg3.resolve_arp() + self.pg3.set_table_ip6(1) + self.pg3.config_ip6() + self.pg3.resolve_ndp() + + def tearDown(self): + super(TestMPLSPIC, self).tearDown() + self.pg0.disable_mpls() + for i in self.pg_interfaces: + i.unconfig_ip4() + i.unconfig_ip6() + i.set_table_ip4(0) + i.set_table_ip6(0) + i.admin_down() + + def test_mpls_ibgp_pic(self): + """ MPLS iBGP PIC edge convergence + + 1) set up many iBGP VPN routes via a pair of iBGP peers. + 2) Check ECMP forwarding to these peers + 3) withdraw the IGP route to one of these peers.
+ 4) check forwarding continues to the remaining peer + """ + + # + # IGP+LDP core routes + # + core_10_0_0_45 = VppIpRoute(self, "10.0.0.45", 32, + [VppRoutePath(self.pg0.remote_ip4, + self.pg0.sw_if_index, + labels=[45])]) + core_10_0_0_45.add_vpp_config() + + core_10_0_0_46 = VppIpRoute(self, "10.0.0.46", 32, + [VppRoutePath(self.pg1.remote_ip4, + self.pg1.sw_if_index, + labels=[46])]) + core_10_0_0_46.add_vpp_config() + + # + # Lots of VPN routes. We need more than 64 so VPP will build + # the fast convergence indirection + # + vpn_routes = [] + pkts = [] + for ii in range(64): + dst = "192.168.1.%d" % ii + vpn_routes.append(VppIpRoute(self, dst, 32, + [VppRoutePath("10.0.0.45", + 0xffffffff, + labels=[145], + is_resolve_host=1), + VppRoutePath("10.0.0.46", + 0xffffffff, + labels=[146], + is_resolve_host=1)], + table_id=1)) + vpn_routes[ii].add_vpp_config() + + pkts.append(Ether(dst=self.pg2.local_mac, + src=self.pg2.remote_mac) / + IP(src=self.pg2.remote_ip4, dst=dst) / + UDP(sport=1234, dport=1234) / + Raw('\xa5' * 100)) + + # + # Send the packet stream (one pkt to each VPN route) + # - expect a 50-50 split of the traffic + # + self.pg2.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx0 = self.pg0._get_capture(1) + rx1 = self.pg1._get_capture(1) + + # not testing the LB hashing algorithm so we're not concerned + # with the split ratio, just as long as neither is 0 + self.assertNotEqual(0, len(rx0)) + self.assertNotEqual(0, len(rx1)) + + # + # use a test CLI command to stop the FIB walk process; this + # will prevent the FIB converging the VPN routes and thus allow + # us to probe the interim (post-fail, pre-converge) state + # + self.vapi.ppcli("test fib-walk-process disable") + + # + # Withdraw one of the IGP routes + # + core_10_0_0_46.remove_vpp_config() + + # + # now all packets should be forwarded through the remaining peer + # + self.vapi.ppcli("clear trace") + self.pg2.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx0 = self.pg0.get_capture(len(pkts)) + + # + # enable the FIB walk process to converge the FIB + # + self.vapi.ppcli("test fib-walk-process enable") + + # + # packets should still be forwarded through the remaining peer + # + self.pg2.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx0 = self.pg0.get_capture(64) + + # + # Add the IGP route back and we return to load-balancing + # + core_10_0_0_46.add_vpp_config() + + self.pg2.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx0 = self.pg0._get_capture(1) + rx1 = self.pg1._get_capture(1) + self.assertNotEqual(0, len(rx0)) + self.assertNotEqual(0, len(rx1)) + + def test_mpls_ebgp_pic(self): + """ MPLS eBGP PIC edge convergence + + 1) set up many eBGP VPN routes via a pair of eBGP peers + 2) Check ECMP forwarding to these peers + 3) withdraw one eBGP path - expect LB across remaining eBGP + """ + + # + # Lots of VPN routes.
We need more than 64 so VPP will build + # the fast convergence indirection + # + vpn_routes = [] + vpn_bindings = [] + pkts = [] + for ii in range(64): + dst = "192.168.1.%d" % ii + local_label = 1600 + ii + vpn_routes.append(VppIpRoute(self, dst, 32, + [VppRoutePath(self.pg2.remote_ip4, + 0xffffffff, + nh_table_id=1, + is_resolve_attached=1), + VppRoutePath(self.pg3.remote_ip4, + 0xffffffff, + nh_table_id=1, + is_resolve_attached=1)], + table_id=1)) + vpn_routes[ii].add_vpp_config() + + vpn_bindings.append(VppMplsIpBind(self, local_label, dst, 32, + ip_table_id=1)) + vpn_bindings[ii].add_vpp_config() + + pkts.append(Ether(dst=self.pg0.local_mac, + src=self.pg0.remote_mac) / + MPLS(label=local_label, ttl=64) / + IP(src=self.pg0.remote_ip4, dst=dst) / + UDP(sport=1234, dport=1234) / + Raw('\xa5' * 100)) + + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx0 = self.pg2._get_capture(1) + rx1 = self.pg3._get_capture(1) + self.assertNotEqual(0, len(rx0)) + self.assertNotEqual(0, len(rx1)) + + # + # use a test CLI command to stop the FIB walk process; this + # will prevent the FIB converging the VPN routes and thus allow + # us to probe the interim (post-fail, pre-converge) state + # + self.vapi.ppcli("test fib-walk-process disable") + + # + # withdraw the connected prefix on the interface. + # + self.pg2.unconfig_ip4() + + # + # now all packets should be forwarded through the remaining peer + # + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx0 = self.pg3.get_capture(len(pkts)) + + # + # enable the FIB walk process to converge the FIB + # + self.vapi.ppcli("test fib-walk-process enable") + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx0 = self.pg3.get_capture(len(pkts)) + + # + # put the connecteds back + # + self.pg2.config_ip4() + + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx0 = self.pg2._get_capture(1) + rx1 = self.pg3._get_capture(1) + self.assertNotEqual(0, len(rx0)) + self.assertNotEqual(0, len(rx1)) + + def test_mpls_v6_ebgp_pic(self): + """ MPLSv6 eBGP PIC edge convergence + + 1) set up many eBGP VPNv6 routes via a pair of eBGP peers + 2) Check ECMP forwarding to these peers + 3) withdraw one eBGP path - expect LB across remaining eBGP + """ + + # + # Lots of VPN routes.
We need more than 64 so VPP will build + # the fast convergence indirection + # + vpn_routes = [] + vpn_bindings = [] + pkts = [] + for ii in range(64): + dst = "3000::%d" % ii + local_label = 1600 + ii + vpn_routes.append(VppIpRoute(self, dst, 128, + [VppRoutePath(self.pg2.remote_ip6, + 0xffffffff, + nh_table_id=1, + is_resolve_attached=1, + is_ip6=1), + VppRoutePath(self.pg3.remote_ip6, + 0xffffffff, + nh_table_id=1, + is_ip6=1, + is_resolve_attached=1)], + table_id=1, + is_ip6=1)) + vpn_routes[ii].add_vpp_config() + + vpn_bindings.append(VppMplsIpBind(self, local_label, dst, 128, + ip_table_id=1, + is_ip6=1)) + vpn_bindings[ii].add_vpp_config() + + pkts.append(Ether(dst=self.pg0.local_mac, + src=self.pg0.remote_mac) / + MPLS(label=local_label, ttl=64) / + IPv6(src=self.pg0.remote_ip6, dst=dst) / + UDP(sport=1234, dport=1234) / + Raw('\xa5' * 100)) + + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx0 = self.pg2._get_capture(1) + rx1 = self.pg3._get_capture(1) + self.assertNotEqual(0, len(rx0)) + self.assertNotEqual(0, len(rx1)) + + # + # use a test CLI command to stop the FIB walk process; this + # will prevent the FIB converging the VPN routes and thus allow + # us to probe the interim (post-fail, pre-converge) state + # + self.vapi.ppcli("test fib-walk-process disable") + + # + # withdraw the connected prefix on the interface. + # and shut down the interface so the ND cache is flushed. + # + self.pg2.unconfig_ip6() + self.pg2.admin_down() + + # + # now all packets should be forwarded through the remaining peer + # + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx0 = self.pg3.get_capture(len(pkts)) + + # + # enable the FIB walk process to converge the FIB + # + self.vapi.ppcli("test fib-walk-process enable") + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx0 = self.pg3.get_capture(len(pkts)) + + # + # put the connecteds back + # + self.pg2.admin_up() + self.pg2.config_ip6() + + self.pg0.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx0 = self.pg2._get_capture(1) + rx1 = self.pg3._get_capture(1) + self.assertNotEqual(0, len(rx0)) + self.assertNotEqual(0, len(rx1)) + + if __name__ == '__main__': unittest.main(testRunner=VppTestRunner) diff --git a/test/vpp_ip_route.py b/test/vpp_ip_route.py index d6146f28..b68e2105 100644 --- a/test/vpp_ip_route.py +++ b/test/vpp_ip_route.py @@ -57,7 +57,9 @@ class VppRoutePath(object): nh_via_label=MPLS_LABEL_INVALID, is_ip6=0, rpf_id=0, - is_interface_rx=0): + is_interface_rx=0, + is_resolve_host=0, + is_resolve_attached=0): self.nh_itf = nh_sw_if_index self.nh_table_id = nh_table_id self.nh_via_label = nh_via_label @@ -68,6 +70,8 @@ class VppRoutePath(object): self.nh_addr = inet_pton(AF_INET6, nh_addr) else: self.nh_addr = inet_pton(AF_INET, nh_addr) + self.is_resolve_host = is_resolve_host + self.is_resolve_attached = is_resolve_attached self.is_interface_rx = is_interface_rx self.is_rpf_id = 0 if rpf_id != 0: @@ -136,7 +140,11 @@ class VppIpRoute(VppObject): next_hop_n_out_labels=len( path.nh_labels), next_hop_via_label=path.nh_via_label, - is_ipv6=self.is_ip6) + next_hop_table_id=path.nh_table_id, + is_ipv6=self.is_ip6, + is_resolve_host=path.is_resolve_host, + is_resolve_attached=path.is_resolve_attached, + is_multipath=1 if len(self.paths) > 1 else 0) self._test.registry.register(self, self._test.logger) def remove_vpp_config(self): @@ -154,13 +162,16 @@ class
VppIpRoute(VppObject): is_ipv6=self.is_ip6) else: for path in self.paths: - self._test.vapi.ip_add_del_route(self.dest_addr, - self.dest_addr_len, - path.nh_addr, - path.nh_itf, - table_id=self.table_id, - is_add=0, - is_ipv6=self.is_ip6) + self._test.vapi.ip_add_del_route( + self.dest_addr, + self.dest_addr_len, + path.nh_addr, + path.nh_itf, + table_id=self.table_id, + next_hop_table_id=path.nh_table_id, + next_hop_via_label=path.nh_via_label, + is_add=0, + is_ipv6=self.is_ip6) def query_vpp_config(self): return find_route(self._test, @@ -318,33 +329,41 @@ class VppMplsIpBind(VppObject): """ def __init__(self, test, local_label, dest_addr, dest_addr_len, - table_id=0, ip_table_id=0): + table_id=0, ip_table_id=0, is_ip6=0): self._test = test - self.dest_addr = inet_pton(AF_INET, dest_addr) self.dest_addr_len = dest_addr_len + self.dest_addr = dest_addr self.local_label = local_label self.table_id = table_id self.ip_table_id = ip_table_id + self.is_ip6 = is_ip6 + if is_ip6: + self.dest_addrn = inet_pton(AF_INET6, dest_addr) + else: + self.dest_addrn = inet_pton(AF_INET, dest_addr) def add_vpp_config(self): self._test.vapi.mpls_ip_bind_unbind(self.local_label, - self.dest_addr, + self.dest_addrn, self.dest_addr_len, table_id=self.table_id, - ip_table_id=self.ip_table_id) + ip_table_id=self.ip_table_id, + is_ip4=(self.is_ip6 == 0)) self._test.registry.register(self, self._test.logger) def remove_vpp_config(self): self._test.vapi.mpls_ip_bind_unbind(self.local_label, - self.dest_addr, + self.dest_addrn, self.dest_addr_len, - is_bind=0) + table_id=self.table_id, + ip_table_id=self.ip_table_id, + is_bind=0, + is_ip4=(self.is_ip6 == 0)) def query_vpp_config(self): dump = self._test.vapi.mpls_fib_dump() for e in dump: if self.local_label == e.label \ - and self.eos_bit == e.eos_bit \ and self.table_id == e.table_id: return True return False @@ -357,7 +376,7 @@ class VppMplsIpBind(VppObject): % (self.table_id, self.local_label, self.ip_table_id, - inet_ntop(AF_INET, self.dest_addr), + self.dest_addr, self.dest_addr_len)) -- cgit 1.2.3-korg From 8c4611b39162da9753caaf654741faa115eaf612 Mon Sep 17 00:00:00 2001 From: Neale Ranns Date: Tue, 23 May 2017 03:43:47 -0700 Subject: Labelled attached paths via an MPLS tunnel Change-Id: Ic86617c9c3217122043656ce2ea70bb106df5b2d Signed-off-by: Neale Ranns --- src/vnet/adj/adj.c | 1 + src/vnet/dpo/dpo.c | 3 ++ src/vnet/fib/fib_path.c | 70 ++++++++++++++++++++++++++++----------------- src/vnet/fib/fib_walk.c | 31 +++++++++++--------- src/vnet/mpls/mpls_tunnel.c | 4 ++- test/test_mpls.py | 27 +++++++++++++++-- 6 files changed, 94 insertions(+), 42 deletions(-) (limited to 'src/vnet/fib/fib_walk.c') diff --git a/src/vnet/adj/adj.c b/src/vnet/adj/adj.c index bf44383f..f8496913 100644 --- a/src/vnet/adj/adj.c +++ b/src/vnet/adj/adj.c @@ -64,6 +64,7 @@ adj_alloc (fib_protocol_t proto) adj->ia_nh_proto = proto; adj->ia_flags = 0; adj->rewrite_header.sw_if_index = ~0; + adj->rewrite_header.flags = 0; adj->lookup_next_index = 0; adj->ia_delegates = NULL; diff --git a/src/vnet/dpo/dpo.c b/src/vnet/dpo/dpo.c index dfc2bd92..28aa0c23 100644 --- a/src/vnet/dpo/dpo.c +++ b/src/vnet/dpo/dpo.c @@ -189,6 +189,9 @@ dpo_set (dpo_id_t *dpo, break; case IP_LOOKUP_NEXT_MCAST: dpo->dpoi_type = DPO_ADJACENCY_MCAST; + break; + case IP_LOOKUP_NEXT_GLEAN: + dpo->dpoi_type = DPO_ADJACENCY_GLEAN; break; default: break; diff --git a/src/vnet/fib/fib_path.c b/src/vnet/fib/fib_path.c index 255f0dd1..274b0ef4 100644 --- a/src/vnet/fib/fib_path.c +++ b/src/vnet/fib/fib_path.c @@ -588,6 +588,30 
@@ fib_path_attached_next_hop_set (fib_path_t *path) } } +static const adj_index_t +fib_path_attached_get_adj (fib_path_t *path, + vnet_link_t link) +{ + if (vnet_sw_interface_is_p2p(vnet_get_main(), + path->attached.fp_interface)) + { + /* + * point-2-point interfaces do not require a glean, since + * there is nothing to ARP. Install a rewrite/nbr adj instead + */ + return (adj_nbr_add_or_lock(path->fp_nh_proto, + link, + &zero_addr, + path->attached.fp_interface)); + } + else + { + return (adj_glean_add_or_lock(path->fp_nh_proto, + path->attached.fp_interface, + NULL)); + } +} + /* * create or update the path's recursive adj */ @@ -1559,31 +1583,12 @@ fib_path_resolve (fib_node_index_t path_index) { path->fp_oper_flags &= ~FIB_PATH_OPER_FLAG_RESOLVED; } - if (vnet_sw_interface_is_p2p(vnet_get_main(), - path->attached.fp_interface)) - { - /* - * point-2-point interfaces do not require a glean, since - * there is nothing to ARP. Install a rewrite/nbr adj instead - */ - dpo_set(&path->fp_dpo, - DPO_ADJACENCY, - fib_proto_to_dpo(path->fp_nh_proto), - adj_nbr_add_or_lock( - path->fp_nh_proto, - fib_proto_to_link(path->fp_nh_proto), - &zero_addr, - path->attached.fp_interface)); - } - else - { - dpo_set(&path->fp_dpo, - DPO_ADJACENCY_GLEAN, - fib_proto_to_dpo(path->fp_nh_proto), - adj_glean_add_or_lock(path->fp_nh_proto, - path->attached.fp_interface, - NULL)); - } + dpo_set(&path->fp_dpo, + DPO_ADJACENCY, + fib_proto_to_dpo(path->fp_nh_proto), + fib_path_attached_get_adj(path, + fib_proto_to_link(path->fp_nh_proto))); + /* * become a child of the adjacency so we receive updates * when the interface state changes @@ -1969,7 +1974,20 @@ fib_path_contribute_forwarding (fib_node_index_t path_index, case FIB_FORW_CHAIN_TYPE_MPLS_EOS: case FIB_FORW_CHAIN_TYPE_ETHERNET: case FIB_FORW_CHAIN_TYPE_NSH: - break; + { + adj_index_t ai; + + /* + * get an appropriate link type adj. + */ + ai = fib_path_attached_get_adj( + path, + fib_forw_chain_type_to_link_type(fct)); + dpo_set(dpo, DPO_ADJACENCY, + fib_forw_chain_type_to_dpo_proto(fct), ai); + adj_unlock(ai); + break; + } case FIB_FORW_CHAIN_TYPE_MCAST_IP4: case FIB_FORW_CHAIN_TYPE_MCAST_IP6: { diff --git a/src/vnet/fib/fib_walk.c b/src/vnet/fib/fib_walk.c index c570476d..70180137 100644 --- a/src/vnet/fib/fib_walk.c +++ b/src/vnet/fib/fib_walk.c @@ -322,10 +322,10 @@ typedef enum fib_walk_advance_rc_t_ static fib_walk_advance_rc_t fib_walk_advance (fib_node_index_t fwi) { - fib_node_back_walk_ctx_t *ctx, *old; fib_node_back_walk_rc_t wrc; fib_node_ptr_t sibling; fib_walk_t *fwalk; + uint n_ctxs, ii; int more_elts; /* @@ -339,12 +339,20 @@ fib_walk_advance (fib_node_index_t fwi) if (more_elts) { - old = fwalk->fw_ctx; - vec_foreach(ctx, fwalk->fw_ctx) - { - wrc = fib_node_back_walk_one(&sibling, ctx); + /* + * loop through the backwalk contexts. This can grow in length + * as walks on the same object meet each other. Order is preserved so the + * most recently started walk is at the back of the vector. + */ + ii = 0; + n_ctxs = vec_len(fwalk->fw_ctx); + + while (ii < n_ctxs) + { + wrc = fib_node_back_walk_one(&sibling, &fwalk->fw_ctx[ii]); + ii++; fwalk = fib_walk_get(fwi); fwalk->fw_n_visits++; @@ -356,14 +364,11 @@ fib_walk_advance (fib_node_index_t fwi) */ return (FIB_WALK_ADVANCE_MERGE); } - if (old != fwalk->fw_ctx) - { - /* - * nasty re-entrant addition of a walk has realloc'd the vector - * break out - */ - return (FIB_WALK_ADVANCE_MERGE); - } + + /* + * re-evaluate the number of backwalk contexts we need to process.
+ */ + n_ctxs = vec_len(fwalk->fw_ctx); } /* * move forward to the next node to visit diff --git a/src/vnet/mpls/mpls_tunnel.c b/src/vnet/mpls/mpls_tunnel.c index d6e85e70..776b23ba 100644 --- a/src/vnet/mpls/mpls_tunnel.c +++ b/src/vnet/mpls/mpls_tunnel.c @@ -273,7 +273,9 @@ mpls_tunnel_stack (adj_index_t ai) mpls_tunnel_mk_lb(mt, adj->ia_link, - FIB_FORW_CHAIN_TYPE_MPLS_EOS, + (VNET_LINK_MPLS == adj_get_link_type(ai) ? + FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS: + FIB_FORW_CHAIN_TYPE_MPLS_EOS), &dpo); adj_nbr_midchain_stack(ai, &dpo); diff --git a/test/test_mpls.py b/test/test_mpls.py index 0ad1ee69..d0c9e249 100644 --- a/test/test_mpls.py +++ b/test/test_mpls.py @@ -203,7 +203,10 @@ class TestMPLS(VppTestCase): except: raise - def verify_capture_tunneled_ip4(self, src_if, capture, sent, mpls_labels): + def verify_capture_tunneled_ip4(self, src_if, capture, sent, mpls_labels, + ttl=255, top=None): + if top is None: + top = len(mpls_labels) - 1 try: capture = self.verify_filter(capture, sent) @@ -217,7 +220,7 @@ class TestMPLS(VppTestCase): # the MPLS TTL is 255 since it enters a new tunnel self.verify_mpls_stack( - rx, mpls_labels, 255, len(mpls_labels) - 1) + rx, mpls_labels, ttl, top) self.assertEqual(rx_ip.src, tx_ip.src) self.assertEqual(rx_ip.dst, tx_ip.dst) @@ -617,6 +620,26 @@ class TestMPLS(VppTestCase): rx = self.pg0.get_capture() self.verify_capture_tunneled_ip4(self.pg0, rx, tx, [44, 46]) + # + # add a labelled route through the new tunnel + # + route_10_0_0_4 = VppIpRoute(self, "10.0.0.4", 32, + [VppRoutePath("0.0.0.0", + mpls_tun._sw_if_index, + labels=[33])]) + route_10_0_0_4.add_vpp_config() + + self.vapi.cli("clear trace") + tx = self.create_stream_ip4(self.pg0, "10.0.0.4") + self.pg0.add_stream(tx) + + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx = self.pg0.get_capture() + self.verify_capture_tunneled_ip4(self.pg0, rx, tx, [44, 46, 33], + ttl=63, top=2) + def test_v4_exp_null(self): """ MPLS V4 Explicit NULL test """ -- cgit 1.2.3-korg From aba8be617c1c716b55b8b2919ec3d4b61e69dff1 Mon Sep 17 00:00:00 2001 From: Neale Ranns Date: Sat, 10 Jun 2017 01:43:44 -0700 Subject: FIB walk process - wake-up rate unnecessarily high Change-Id: I7dedf283c83c7f0e0b7642f095b68bc0b40898cf Signed-off-by: Neale Ranns --- src/vnet/fib/fib_walk.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'src/vnet/fib/fib_walk.c') diff --git a/src/vnet/fib/fib_walk.c b/src/vnet/fib/fib_walk.c index 70180137..94297442 100644 --- a/src/vnet/fib/fib_walk.c +++ b/src/vnet/fib/fib_walk.c @@ -399,7 +399,17 @@ typedef enum fib_walk_sleep_type_t_ * @brief Durations for the sleep types */ static f64 fib_walk_sleep_duration[] = { - [FIB_WALK_LONG_SLEEP] = 1e-3, + /** + * Long sleep when there is no more work, i.e. the queues are empty. + * This is a sleep (as opposed to a wait for event) just to be sure we + * are not missing events by sleeping forever. + */ + [FIB_WALK_LONG_SLEEP] = 2, + + /** + * Short sleep. There is work left in the queues. We are yielding the CPU + * momentarily. + */ [FIB_WALK_SHORT_SLEEP] = 1e-8, }; -- cgit 1.2.3-korg
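The ip4, ip6 and MPLS hunks above all converge on the same forwarding pattern: a load-balance object holds a power-of-two number of buckets, so when there is more than one bucket a per-packet flow hash is masked with (n_buckets - 1) to pick a path, and the single-bucket case takes bucket 0 directly and skips the hash altogether. The following is a minimal, self-contained C sketch of that scheme for illustration only; the names lb_demo_t, demo_flow_hash and demo_lb_select, and the toy hash itself, are invented here and are not VPP's load_balance_get_fwd_bucket/load_balance_get_bucket_i API.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct
{
    uint32_t n_buckets;          /* always a power of 2, as the is_pow2 ASSERTs require */
    uint32_t n_buckets_minus_1;  /* the mask that replaces a per-packet modulo */
    uint32_t buckets[8];         /* toy stand-in for the per-bucket DPOs */
} lb_demo_t;

/* toy hash over a 5-tuple-like key; VPP's ip4/ip6/mpls flow hashes are richer */
static uint32_t
demo_flow_hash (uint32_t src, uint32_t dst, uint16_t sport, uint16_t dport)
{
    uint32_t h = src ^ dst ^ (((uint32_t) sport << 16) | dport);
    h ^= h >> 16;                /* mix so the low (masked) bits see all inputs */
    return h;
}

static uint32_t
demo_lb_select (const lb_demo_t * lb,
                uint32_t src, uint32_t dst, uint16_t sport, uint16_t dport)
{
    assert (lb->n_buckets > 0);
    assert ((lb->n_buckets & (lb->n_buckets - 1)) == 0);  /* power of 2 */

    if (lb->n_buckets > 1)
    {
        /* multipath: hash the flow and mask down to a bucket index */
        return lb->buckets[demo_flow_hash (src, dst, sport, dport)
                           & lb->n_buckets_minus_1];
    }
    /* unipath: take bucket 0 and skip the hash, as the patched else-arms do */
    return lb->buckets[0];
}

int
main (void)
{
    lb_demo_t lb = { 2, 1, { 100, 200 } };

    /* two flows that differ only in source port may land in different buckets */
    printf ("%u\n", demo_lb_select (&lb, 0x0a000001, 0xc0a80101, 1234, 80));
    printf ("%u\n", demo_lb_select (&lb, 0x0a000001, 0xc0a80101, 1235, 80));
    return 0;
}

Two details of the pattern carry the weight: the power-of-two bucket count turns the modulo into a single AND, and moving the hash computation under the n_buckets > 1 test means single-path routes, the common case, pay no hashing cost at all.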
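The fib_walk_advance() change in the second patch is a re-entrancy fix with a reusable shape: the loop walks the context vector by index and re-reads its length after every visit, because fib_node_back_walk_one() can append a new context (and realloc the vector) mid-walk, which is exactly what the removed cached-pointer check was guarding against. The toy program below reproduces the hazard and the fix; the vec_demo_* helpers are invented for this sketch and merely approximate VPP's vec_* macros.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct
{
    int *data;
    size_t len, cap;
} vec_demo_t;

static void
vec_demo_push (vec_demo_t * v, int x)
{
    if (v->len == v->cap)
    {
        /* growth may move the elements, invalidating cached pointers */
        v->cap = v->cap ? 2 * v->cap : 2;
        v->data = realloc (v->data, v->cap * sizeof (int));
        assert (v->data != NULL);
    }
    v->data[v->len++] = x;
}

/* visiting an element may itself append to the vector being walked */
static void
visit (vec_demo_t * v, size_t ii)
{
    printf ("visit %d\n", v->data[ii]);
    if (v->data[ii] == 1)
        vec_demo_push (v, 10);   /* the "nasty re-entrant addition" */
}

int
main (void)
{
    vec_demo_t v = { 0, 0, 0 };
    size_t ii, n_ctxs;

    vec_demo_push (&v, 1);
    vec_demo_push (&v, 2);

    /*
     * walk by index and re-evaluate the length each time around, as the
     * patched fib_walk_advance() does; a vec_foreach-style cached pointer
     * could dangle after the realloc triggered inside visit().
     */
    ii = 0;
    n_ctxs = v.len;
    while (ii < n_ctxs)
    {
        visit (&v, ii);
        ii++;
        n_ctxs = v.len;          /* the visit may have grown the vector */
    }

    free (v.data);
    return 0;
}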