aboutsummaryrefslogtreecommitdiffstats
path: root/src/vnet/l2
AgeCommit message (Expand)AuthorFilesLines
2017-11-10Break up vpe.apiNeale Ranns2-5/+273
2017-11-10Further fix to SHG handling for ARP/ICMPv6 from BVI in a BDJohn Lo1-6/+25
2017-11-08Fix SHG handling for ARP/ICMPv6 received from BVI in a BDJohn Lo1-0/+8
2017-11-03silence l2fib_init() invalid read of size 2 found by address-sanitizerGabriel Ganne1-1/+6
2017-10-31Fix "l2fib add" CLI to allow adding of filter MAC entriesJohn Lo1-25/+17
2017-10-31l2fib: MAC: Fix uint64 to u8 byte arrayMohsin Kazmi5-49/+44
2017-10-24Update L2 header offset after VLAN tag rewriteNeale Ranns1-1/+3
2017-10-24Add extern to *_main global variable declarations in header files.Dave Wallace3-2/+4
2017-10-09vppapigen: support per-file (major,minor,patch) version stampsDave Barach1-0/+2
2017-10-03Update L2FIB entry timestamp only if BD aging enabled (VPP-1002)John Lo2-3/+5
2017-10-03Repair vlib API socket serverDave Barach1-5/+7
2017-10-02L2-FIB:add mac learn events testEyal Bari1-56/+49
2017-09-15Update of free text tag patch for BDJerome Tollet3-4/+7
2017-09-07Support for bridge domain free text tagJerome Tollet4-2/+66
2017-09-03Improve L2FIB PDR/NDR performance (VPP-963)John Lo4-18/+30
2017-08-22Increase default MAC learn limit and check it in learn-update pathJohn Lo5-4/+8
2017-08-03Add support for API client to receive L2 MAC eventsJohn Lo8-160/+494
2017-07-31CLI:add l2 input/outut to "sh int features"Eyal Bari5-1/+51
2017-07-24SPAN:add l2 mirrorEyal Bari3-10/+16
2017-07-23Improve L2 Input/Output Feature Infrastructure and UsageJohn Lo13-319/+154
2017-07-20L2FWD:move vec_validate out of access functionEyal Bari2-2/+9
2017-07-19Add a bihash prefetchable bucket-level cacheDave Barach1-4/+4
2017-07-14Introduce l{2,3,4}_hdr_offset fields in the buffer metadataDamjan Marion1-1/+1
2017-07-13Fix crash with worker threads on 4K VXLAN/BD setup (VPP-907)John Lo7-167/+49
2017-07-11L2INPUT:fix features mask cailculationEyal Bari1-6/+10
2017-06-27L2-LEARN:fix l2fib entry seq num not updated on hit (VPP-888)Eyal Bari5-87/+64
2017-06-19L2FWD:fix seq_num overwritten + validate l2fib entries when forwardingEyal Bari7-53/+87
2017-06-14ETH:fix l2_len/vlan count mismatch for > 2 tagsEyal Bari2-15/+3
2017-06-12L2FIB:fix crash in show with deleted subif entriesEyal Bari1-4/+7
2017-05-20API: Cleaning up message naming that does not follow the conventionsOle Troan2-52/+41
2017-05-19Enforce Bridge Domain ID range to match 24-bit VNI rangeJohn Lo3-7/+20
2017-05-15L2BD/API:fix bd dump to ignore unknown bd_idEyal Bari1-19/+20
2017-05-08L2FIB:CLI/API to flush all non-static entriesEyal Bari6-86/+156
2017-05-05Fix L2FIB learn counter and memory cleanup of mac_by_ip6 hash tableJohn Lo2-5/+16
2017-05-03L2FIB:flush interface learned macs on downEyal Bari4-39/+42
2017-05-02Prevent Bridge Domain operations on BD 0.Jon Loeliger2-2/+62
2017-04-28Fix memory leak on deletion of BD (bridge domain)John Lo1-3/+9
2017-04-25"autoreply" flag: autogenerate standard xxx_reply_t messagesDave Barach1-88/+8
2017-04-24BD:unify bridge domain creation codeEyal Bari3-106/+86
2017-04-06VPP-686: create bridge-domainChoonho Son1-2/+4
2017-04-06Use thread local storage for thread indexDamjan Marion3-11/+11
2017-04-05L2FIB:add l2fib_flush_bd l2fib_flush_int apisEyal Bari2-0/+82
2017-04-04BD/API:add bridge_domain_set_mac_age apiEyal Bari4-10/+55
2017-04-01CLI: create/delete bridge-domainChoonho Son2-1/+201
2017-03-28Implement MAC Flush for BD or Interface from the L2FIBJohn Lo8-46/+253
2017-03-16vlib: make runtime_data thread-localDamjan Marion2-0/+31
2017-03-07fix gcc 5.4 warning: argument to 'sizeof' in 'memcpy' call is the same expres...Gabriel Ganne2-10/+8
2017-03-03VPP-651: Ensure sw_if_index to node mapping for L2 output path is only done v...Andrew Yourtchenko3-3/+17
2017-03-02Clean up binary api message handler registration issuesDave Barach1-22/+0
2017-03-02bridge domain: fix members reordered when removingEyal Bari1-1/+1
f='#n957'>957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300
/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/ip/lookup.h>
#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/load_balance_map.h>
#include <vnet/dpo/drop_dpo.h>
#include <vppinfra/math.h>              /* for fabs */
#include <vnet/adj/adj.h>
#include <vnet/adj/adj_internal.h>
#include <vnet/fib/fib_urpf_list.h>
#include <vnet/bier/bier_fwd.h>
#include <vnet/fib/mpls_fib.h>

/*
 * distribution error tolerance for load-balancing
 */
const f64 multipath_next_hop_error_tolerance = 0.1;

static const char *load_balance_attr_names[] = LOAD_BALANCE_ATTR_NAMES;

/**
 * the logger
 */
vlib_log_class_t load_balance_logger;

#define LB_DBG(_lb, _fmt, _args...)                                     \
{                                                                       \
    vlib_log_debug(load_balance_logger,                                 \
                   "lb:[%U]:" _fmt,                                     \
                   format_load_balance, load_balance_get_index(_lb),    \
                   LOAD_BALANCE_FORMAT_NONE,                            \
                   ##_args);                                            \
}

/**
 * Pool of all DPOs. It's not static so the DP can have fast access
 */
load_balance_t *load_balance_pool;

/**
 * The one instance of load-balance main
 */
load_balance_main_t load_balance_main = {
    .lbm_to_counters = {
        .name = "route-to",
        .stat_segment_name = "/net/route/to",
    },
    .lbm_via_counters = {
        .name = "route-via",
        .stat_segment_name = "/net/route/via",
    }
};

f64
load_balance_get_multipath_tolerance (void)
{
    return (multipath_next_hop_error_tolerance);
}

static inline index_t
load_balance_get_index (const load_balance_t *lb)
{
    return (lb - load_balance_pool);
}

static inline dpo_id_t*
load_balance_get_buckets (load_balance_t *lb)
{
    if (LB_HAS_INLINE_BUCKETS(lb))
    {
        return (lb->lb_buckets_inline);
    }
    else
    {
        return (lb->lb_buckets);
    }
}

static load_balance_t *
load_balance_alloc_i (void)
{
    load_balance_t *lb;

    pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
    clib_memset(lb, 0, sizeof(*lb));

    lb->lb_map = INDEX_INVALID;
    lb->lb_urpf = INDEX_INVALID;
    vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
                                   load_balance_get_index(lb));
    vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
                                   load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
                               load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
                               load_balance_get_index(lb));

    return (lb);
}

static u8*
load_balance_format (index_t lbi,
                     load_balance_format_flags_t flags,
                     u32 indent,
                     u8 *s)
{
    vlib_counter_t to, via;
    load_balance_t *lb;
    dpo_id_t *buckets;
    u32 i;

    lb = load_balance_get(lbi);
    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
    buckets = load_balance_get_buckets(lb);

    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
    s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto);
    s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets);
    s = format(s, "uRPF:%d ", lb->lb_urpf);
    if (lb->lb_flags)
    {
        load_balance_attr_t attr;

        s = format(s, "flags:[");

        FOR_EACH_LOAD_BALANCE_ATTR(attr)
        {
            if (lb->lb_flags & (1 << attr))
            {
                s = format (s, "%s", load_balance_attr_names[attr]);
            }
        }
        s = format(s, "] ");
    }
    s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
    if (0 != via.packets)
    {
        s = format(s, " via:[%Ld:%Ld]",
                   via.packets, via.bytes);
    }
    s = format(s, "]");

    if (INDEX_INVALID != lb->lb_map)
    {
        s = format(s, "\n%U%U",
                   format_white_space, indent+4,
                   format_load_balance_map, lb->lb_map, indent+4);
    }
    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        s = format(s, "\n%U[%d] %U",
                   format_white_space, indent+2,
                   i,
                   format_dpo_id,
                   &buckets[i], indent+6);
    }
    return (s);
}

u8*
format_load_balance (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);

    return (load_balance_format(lbi, flags, 0, s));
}

static u8*
format_load_balance_dpo (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    u32 indent = va_arg(*args, u32);

    return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
}

flow_hash_config_t
load_balance_get_default_flow_hash (dpo_proto_t lb_proto)
{
    switch (lb_proto)
    {
    case DPO_PROTO_IP4:
    case DPO_PROTO_IP6:
        return (IP_FLOW_HASH_DEFAULT);

    case DPO_PROTO_MPLS:
        return (MPLS_FLOW_HASH_DEFAULT);

    case DPO_PROTO_ETHERNET:
    case DPO_PROTO_BIER:
    case DPO_PROTO_NSH:
        break;
    }

    return (0);
}

static load_balance_t *
load_balance_create_i (u32 num_buckets,
                       dpo_proto_t lb_proto,
                       flow_hash_config_t fhc)
{
    load_balance_t *lb;

    lb = load_balance_alloc_i();
    lb->lb_hash_config = fhc;
    lb->lb_n_buckets = num_buckets;
    lb->lb_n_buckets_minus_1 = num_buckets-1;
    lb->lb_proto = lb_proto;

    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_validate_aligned(lb->lb_buckets,
                             lb->lb_n_buckets - 1,
                             CLIB_CACHE_LINE_BYTES);
    }

    LB_DBG(lb, "create");

    return (lb);
}

index_t
load_balance_create (u32 n_buckets,
                     dpo_proto_t lb_proto,
                     flow_hash_config_t fhc)
{
    return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
}

static inline void
load_balance_set_bucket_i (load_balance_t *lb,
                           u32 bucket,
                           dpo_id_t *buckets,
                           const dpo_id_t *next)
{
    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
}

void
load_balance_set_bucket (index_t lbi,
                         u32 bucket,
                         const dpo_id_t *next)
{
    load_balance_t *lb;
    dpo_id_t *buckets;

    lb = load_balance_get(lbi);
    buckets = load_balance_get_buckets(lb);

    ASSERT(bucket < lb->lb_n_buckets);

    load_balance_set_bucket_i(lb, bucket, buckets, next);
}

int
load_balance_is_drop (const dpo_id_t *dpo)
{
    load_balance_t *lb;

    if (DPO_LOAD_BALANCE != dpo->dpoi_type)
        return (0);

    lb = load_balance_get(dpo->dpoi_index);

    if (1 == lb->lb_n_buckets)
    {
        return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
    }
    return (0);
}

u16
load_balance_n_buckets (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_n_buckets);
}

void
load_balance_set_fib_entry_flags (index_t lbi,
                                  fib_entry_flag_t flags)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);
    lb->lb_fib_entry_flags = flags;
}


void
load_balance_set_urpf (index_t lbi,
		       index_t urpf)
{
    load_balance_t *lb;
    index_t old;

    lb = load_balance_get(lbi);

    /*
     * packets in flight we see this change. but it's atomic, so :P
     */
    old = lb->lb_urpf;
    lb->lb_urpf = urpf;

    fib_urpf_list_unlock(old);
    fib_urpf_list_lock(urpf);
}

index_t
load_balance_get_urpf (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_urpf);
}

const dpo_id_t *
load_balance_get_bucket (index_t lbi,
                         u32 bucket)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (load_balance_get_bucket_i(lb, bucket));
}

static int
next_hop_sort_by_weight (const load_balance_path_t * n1,
                         const load_balance_path_t * n2)
{
    return ((int) n1->path_weight - (int) n2->path_weight);
}

/* Given next hop vector is over-written with normalized one with sorted weights and
   with weights corresponding to the number of adjacencies for each next hop.
   Returns number of adjacencies in block. */
u32
ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
                                  load_balance_path_t ** normalized_next_hops,
                                  u32 *sum_weight_in,
                                  f64 multipath_next_hop_error_tolerance)
{
    load_balance_path_t * nhs;
    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
    f64 norm, error;

    n_nhs = vec_len (raw_next_hops);
    ASSERT (n_nhs > 0);
    if (n_nhs == 0)
        return 0;

    /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
    nhs = *normalized_next_hops;
    vec_validate (nhs, 2*n_nhs - 1);

    /* Fast path: 1 next hop in block. */
    n_adj = n_nhs;
    if (n_nhs == 1)
    {
        nhs[0] = raw_next_hops[0];
        nhs[0].path_weight = 1;
        _vec_len (nhs) = 1;
        sum_weight = 1;
        goto done;
    }

    else if (n_nhs == 2)
    {
        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;

        /* Fast sort. */
        nhs[0] = raw_next_hops[cmp];
        nhs[1] = raw_next_hops[cmp ^ 1];

        /* Fast path: equal cost multipath with 2 next hops. */
        if (nhs[0].path_weight == nhs[1].path_weight)
        {
            nhs[0].path_weight = nhs[1].path_weight = 1;
            _vec_len (nhs) = 2;
            sum_weight = 2;
            goto done;
        }
    }
    else
    {
        clib_memcpy_fast (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
    }

    /* Find total weight to normalize weights. */
    sum_weight = 0;
    for (i = 0; i < n_nhs; i++)
        sum_weight += nhs[i].path_weight;

    /* In the unlikely case that all weights are given as 0, set them all to 1. */
    if (sum_weight == 0)
    {
        for (i = 0; i < n_nhs; i++)
            nhs[i].path_weight = 1;
        sum_weight = n_nhs;
    }

    /* Save copies of all next hop weights to avoid being overwritten in loop below. */
    for (i = 0; i < n_nhs; i++)
        nhs[n_nhs + i].path_weight = nhs[i].path_weight;

    /* Try larger and larger power of 2 sized adjacency blocks until we
       find one where traffic flows to within 1% of specified weights. */
    for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
    {
        error = 0;

        norm = n_adj / ((f64) sum_weight);
        n_adj_left = n_adj;
        for (i = 0; i < n_nhs; i++)
        {
            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
            word n = flt_round_nearest (nf);

            n = n > n_adj_left ? n_adj_left : n;
            n_adj_left -= n;
            error += fabs (nf - n);
            nhs[i].path_weight = n;

            if (0 == nhs[i].path_weight)
            {
                /*
                 * when the weight skew is high (norm is small) and n == nf.
                 * without this correction the path with a low weight would have
                 * no represenation in the load-balanace - don't want that.
                 * If the weight skew is high so the load-balance has many buckets
                 * to allow it. pays ya money takes ya choice.
                 */
                error = n_adj;
                break;
            }
        }

        nhs[0].path_weight += n_adj_left;

        /* Less than 5% average error per adjacency with this size adjacency block? */
        if (error <= multipath_next_hop_error_tolerance*n_adj)
        {
            /* Truncate any next hops with zero weight. */
            _vec_len (nhs) = i;
            break;
        }
    }

done:
    /* Save vector for next call. */
    *normalized_next_hops = nhs;
    *sum_weight_in = sum_weight;
    return n_adj;
}

static load_balance_path_t *
load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
                                       dpo_proto_t drop_proto)
{
    if (0 == vec_len(nhs))
    {
        load_balance_path_t *new_nhs = NULL, *nh;

        /*
         * we need something for the load-balance. so use the drop
         */
        vec_add2(new_nhs, nh, 1);

        nh->path_weight = 1;
        dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));

        return (new_nhs);
    }

    return (NULL);
}

/*
 * Fill in adjacencies in block based on corresponding
 * next hop adjacencies.
 */
static void
load_balance_fill_buckets_norm (load_balance_t *lb,
                                load_balance_path_t *nhs,
                                dpo_id_t *buckets,
                                u32 n_buckets)
{
    load_balance_path_t *nh;
    u16 ii, bucket;

    bucket = 0;

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
        }
    }
}
static void
load_balance_fill_buckets_sticky (load_balance_t *lb,
                                  load_balance_path_t *nhs,
                                  dpo_id_t *buckets,
                                  u32 n_buckets)
{
    load_balance_path_t *nh, *fwding_paths;
    u16 ii, bucket, fpath;

    fpath = bucket = 0;
    fwding_paths = NULL;

    vec_foreach (nh, nhs)
    {
        if (!dpo_is_drop(&nh->path_dpo))
        {
            vec_add1(fwding_paths, *nh);
        }
    }
    if (vec_len(fwding_paths) == 0)
        fwding_paths = vec_dup(nhs);

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            if (!dpo_is_drop(&nh->path_dpo))
            {
                load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
            }
            else
            {
                /* fill the bucks from the next up path */
                load_balance_set_bucket_i(lb, bucket++, buckets, &fwding_paths[fpath].path_dpo);
                fpath = (fpath + 1) % vec_len(fwding_paths);
            }
        }
    }

    vec_free(fwding_paths);
}

static void
load_balance_fill_buckets (load_balance_t *lb,
                           load_balance_path_t *nhs,
                           dpo_id_t *buckets,
                           u32 n_buckets,
                           load_balance_flags_t flags)
{
    if (flags & LOAD_BALANCE_FLAG_STICKY)
    {
        load_balance_fill_buckets_sticky(lb, nhs, buckets, n_buckets);
    }
    else
    {
        load_balance_fill_buckets_norm(lb, nhs, buckets, n_buckets);
    }
}

static inline void
load_balance_set_n_buckets (load_balance_t *lb,
                            u32 n_buckets)
{
    lb->lb_n_buckets = n_buckets;
    lb->lb_n_buckets_minus_1 = n_buckets-1;
}

void
load_balance_multipath_update (const dpo_id_t *dpo,
                               const load_balance_path_t * raw_nhs,
                               load_balance_flags_t flags)
{
    load_balance_path_t *nh, *nhs, *fixed_nhs;
    u32 sum_of_weights, n_buckets, ii;
    index_t lbmi, old_lbmi;
    load_balance_t *lb;
    dpo_id_t *tmp_dpo;

    nhs = NULL;

    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
    lb = load_balance_get(dpo->dpoi_index);
    lb->lb_flags = flags;
    fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
    n_buckets =
        ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
                                          raw_nhs :
                                          fixed_nhs),
                                         &nhs,
                                         &sum_of_weights,
                                         multipath_next_hop_error_tolerance);

    ASSERT (n_buckets >= vec_len (raw_nhs));

    /*
     * Save the old load-balance map used, and get a new one if required.
     */
    old_lbmi = lb->lb_map;
    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
    {
        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
    }
    else
    {
        lbmi = INDEX_INVALID;
    }

    if (0 == lb->lb_n_buckets)
    {
        /*
         * first time initialisation. no packets inflight, so we can write
         * at leisure.
         */
        load_balance_set_n_buckets(lb, n_buckets);

        if (!LB_HAS_INLINE_BUCKETS(lb))
            vec_validate_aligned(lb->lb_buckets,
                                 lb->lb_n_buckets - 1,
                                 CLIB_CACHE_LINE_BYTES);

        load_balance_fill_buckets(lb, nhs,
                                  load_balance_get_buckets(lb),
                                  n_buckets, flags);
        lb->lb_map = lbmi;
    }
    else
    {
        /*
         * This is a modification of an existing load-balance.
         * We need to ensure that packets inflight see a consistent state, that
         * is the number of reported buckets the LB has (read from
         * lb_n_buckets_minus_1) is not more than it actually has. So if the
         * number of buckets is increasing, we must update the bucket array first,
         * then the reported number. vice-versa if the number of buckets goes down.
         */
        if (n_buckets == lb->lb_n_buckets)
        {
            /*
             * no change in the number of buckets. we can simply fill what
             * is new over what is old.
             */
            load_balance_fill_buckets(lb, nhs,
                                      load_balance_get_buckets(lb),
                                      n_buckets, flags);
            lb->lb_map = lbmi;
        }
        else if (n_buckets > lb->lb_n_buckets)
        {
            /*
             * we have more buckets. the old load-balance map (if there is one)
             * will remain valid, i.e. mapping to indices within range, so we
             * update it last.
             */
            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new increased number of buckets is crossing the threshold
                 * from the inline storage to out-line. Alloc the outline buckets
                 * first, then fixup the number. then reset the inlines.
                 */
                ASSERT(NULL == lb->lb_buckets);
                vec_validate_aligned(lb->lb_buckets,
                                     n_buckets - 1,
                                     CLIB_CACHE_LINE_BYTES);

                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);

                CLIB_MEMORY_BARRIER();

                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
                {
                    dpo_reset(&lb->lb_buckets_inline[ii]);
                }
            }
            else
            {
                if (n_buckets <= LB_NUM_INLINE_BUCKETS)
                {
                    /*
                     * we are not crossing the threshold and it's still inline buckets.
                     * we can write the new on the old..
                     */
                    load_balance_fill_buckets(lb, nhs,
                                              load_balance_get_buckets(lb),
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);
                }
                else
                {
                    /*
                     * we are not crossing the threshold. We need a new bucket array to
                     * hold the increased number of choices.
                     */
                    dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;

                    new_buckets = NULL;
                    old_buckets = load_balance_get_buckets(lb);

                    vec_validate_aligned(new_buckets,
                                         n_buckets - 1,
                                         CLIB_CACHE_LINE_BYTES);

                    load_balance_fill_buckets(lb, nhs, new_buckets,
                                              n_buckets, flags);
                    CLIB_MEMORY_BARRIER();
                    lb->lb_buckets = new_buckets;
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);

                    vec_foreach(tmp_dpo, old_buckets)
                    {
                        dpo_reset(tmp_dpo);
                    }
                    vec_free(old_buckets);
                }
            }

            /*
             * buckets fixed. ready for the MAP update.
             */
            lb->lb_map = lbmi;
        }
        else
        {
            /*
             * bucket size shrinkage.
             * Any map we have will be based on the old
             * larger number of buckets, so will be translating to indices
             * out of range. So the new MAP must be installed first.
             */
            lb->lb_map = lbmi;
            CLIB_MEMORY_BARRIER();


            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new decreased number of buckets is crossing the threshold
                 * from out-line storage to inline:
                 *   1 - Fill the inline buckets,
                 *   2 - fixup the number (and this point the inline buckets are
                 *       used).
                 *   3 - free the outline buckets
                 */
                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets_inline,
                                          n_buckets, flags);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                vec_foreach(tmp_dpo, lb->lb_buckets)
                {
                    dpo_reset(tmp_dpo);
                }
                vec_free(lb->lb_buckets);
            }
            else
            {
                /*
                 * not crossing the threshold.
                 *  1 - update the number to the smaller size
                 *  2 - write the new buckets
                 *  3 - reset those no longer used.
                 */
                dpo_id_t *buckets;
                u32 old_n_buckets;

                old_n_buckets = lb->lb_n_buckets;
                buckets = load_balance_get_buckets(lb);

                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                load_balance_fill_buckets(lb, nhs, buckets,
                                          n_buckets, flags);

                for (ii = n_buckets; ii < old_n_buckets; ii++)
                {
                    dpo_reset(&buckets[ii]);
                }
            }
        }
    }

    vec_foreach (nh, nhs)
    {
        dpo_reset(&nh->path_dpo);
    }
    vec_free(nhs);
    vec_free(fixed_nhs);

    load_balance_map_unlock(old_lbmi);
}

static void
load_balance_lock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks++;
}

static void
load_balance_destroy (load_balance_t *lb)
{
    dpo_id_t *buckets;
    int i;

    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        dpo_reset(&buckets[i]);
    }

    LB_DBG(lb, "destroy");
    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_free(lb->lb_buckets);
    }

    fib_urpf_list_unlock(lb->lb_urpf);
    load_balance_map_unlock(lb->lb_map);

    pool_put(load_balance_pool, lb);
}

static void
load_balance_unlock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks--;

    if (0 == lb->lb_locks)
    {
        load_balance_destroy(lb);
    }
}

static void
load_balance_mem_show (void)
{
    fib_show_memory_usage("load-balance",
			  pool_elts(load_balance_pool),
			  pool_len(load_balance_pool),
			  sizeof(load_balance_t));
    load_balance_map_show_mem();
}

const static dpo_vft_t lb_vft = {
    .dv_lock = load_balance_lock,
    .dv_unlock = load_balance_unlock,
    .dv_format = format_load_balance_dpo,
    .dv_mem_show = load_balance_mem_show,
};

/**
 * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance
 *        object.
 *
 * this means that these graph nodes are ones from which a load-balance is the
 * parent object in the DPO-graph.
 *
 * We do not list all the load-balance nodes, such as the *-lookup. instead
 * we are relying on the correct use of the .sibling_of field when setting
 * up these sibling nodes.
 */
const static char* const load_balance_ip4_nodes[] =
{
    "ip4-load-balance",
    NULL,
};
const static char* const load_balance_ip6_nodes[] =
{
    "ip6-load-balance",
    NULL,
};
const static char* const load_balance_mpls_nodes[] =
{
    "mpls-load-balance",
    NULL,
};
const static char* const load_balance_l2_nodes[] =
{
    "l2-load-balance",
    NULL,
};
const static char* const load_balance_nsh_nodes[] =
{
    "nsh-load-balance",
    NULL
};
const static char* const load_balance_bier_nodes[] =
{
    "bier-load-balance",
    NULL,
};
const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
    [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
    [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
    [DPO_PROTO_NSH] = load_balance_nsh_nodes,
    [DPO_PROTO_BIER] = load_balance_bier_nodes,
};

void
load_balance_module_init (void)
{
    index_t lbi;

    dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);

    /*
     * Special LB with index zero. we need to define this since the v4 mtrie
     * assumes an index of 0 implies the ply is empty. therefore all 'real'
     * adjs need a non-zero index.
     * This should never be used, but just in case, stack it on a drop.
     */
    lbi = load_balance_create(1, DPO_PROTO_IP4, 0);
    load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));

    load_balance_logger =
        vlib_log_register_class("dpo", "load-balance");

    load_balance_map_module_init();
}

static clib_error_t *
load_balance_show (vlib_main_t * vm,
                   unformat_input_t * input,
                   vlib_cli_command_t * cmd)
{
    index_t lbi = INDEX_INVALID;

    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
        if (unformat (input, "%d", &lbi))
            ;
        else
            break;
    }

    if (INDEX_INVALID != lbi)
    {
        if (pool_is_free_index(load_balance_pool, lbi))
        {
            vlib_cli_output (vm, "no such load-balance:%d", lbi);
        }
        else
        {
            vlib_cli_output (vm, "%U", format_load_balance, lbi,
                         LOAD_BALANCE_FORMAT_DETAIL);
        }
    }
    else
    {
        load_balance_t *lb;

        pool_foreach(lb, load_balance_pool,
        ({
            vlib_cli_output (vm, "%U", format_load_balance,
                             load_balance_get_index(lb),
                             LOAD_BALANCE_FORMAT_NONE);
        }));
    }

    return 0;
}

VLIB_CLI_COMMAND (load_balance_show_command, static) = {
    .path = "show load-balance",
    .short_help = "show load-balance [<index>]",
    .function = load_balance_show,
};


always_inline u32
ip_flow_hash (void *data)
{
  ip4_header_t *iph = (ip4_header_t *) data;

  if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
    return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
  else
    return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
}

always_inline u64
mac_to_u64 (u8 * m)
{
  return (*((u64 *) m) & 0xffffffffffff);
}

always_inline u32
l2_flow_hash (vlib_buffer_t * b0)
{
  ethernet_header_t *eh;
  u64 a, b, c;
  uword is_ip, eh_size;
  u16 eh_type;

  eh = vlib_buffer_get_current (b0);
  eh_type = clib_net_to_host_u16 (eh->type);
  eh_size = ethernet_buffer_header_size (b0);

  is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);

  /* since we have 2 cache lines, use them */
  if (is_ip)
    a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
  else
    a = eh->type;

  b = mac_to_u64 ((u8 *) eh->dst_address);
  c = mac_to_u64 ((u8 *) eh->src_address);
  hash_mix64 (a, b, c);

  return (u32) c;
}

typedef struct load_balance_trace_t_
{
    index_t lb_index;
} load_balance_trace_t;

always_inline uword
load_balance_inline (vlib_main_t * vm,
		     vlib_node_runtime_t * node,
		     vlib_frame_t * frame,
		     int is_l2)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  vlib_buffer_t *b0;
	  u32 bi0, lbi0, next0;
	  const dpo_id_t *dpo0;
	  const load_balance_t *lb0;

	  bi0 = from[0];
	  to_next[0] = bi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  b0 = vlib_get_buffer (vm, bi0);

	  /* lookup dst + src mac */
	  lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
	  lb0 = load_balance_get(lbi0);

	  if (is_l2)
	  {
	      vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);
	  }
	  else
	  {
	      /* it's BIER */
	      const bier_hdr_t *bh0 = vlib_buffer_get_current(b0);
	      vnet_buffer(b0)->ip.flow_hash = bier_compute_flow_hash(bh0);
	  }

	  dpo0 = load_balance_get_bucket_i(lb0, 
					   vnet_buffer(b0)->ip.flow_hash &
					   (lb0->lb_n_buckets_minus_1));

	  next0 = dpo0->dpoi_next_node;
	  vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

	  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
	    {
	      load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
							 sizeof (*tr));
	      tr->lb_index = lbi0;
	    }
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
					   n_left_to_next, bi0, next0);
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static uword
l2_load_balance (vlib_main_t * vm,
		 vlib_node_runtime_t * node,
		 vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 1));
}

static u8 *
format_l2_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "L2-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (l2_load_balance_node) = {
  .function = l2_load_balance,
  .name = "l2-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_l2_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static uword
nsh_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0, *nsh0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          /* SPI + SI are the second word of the NSH header */
          nsh0 = vlib_buffer_get_current (b0);
          vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets;

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static u8 *
format_nsh_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "NSH-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (nsh_load_balance_node) = {
  .function = nsh_load_balance,
  .name = "nsh-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_nsh_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static u8 *
format_bier_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "BIER-load-balance: index %d", t->lb_index);
  return s;
}

static uword
bier_load_balance (vlib_main_t * vm,
		   vlib_node_runtime_t * node,
		   vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 0));
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (bier_load_balance_node) = {
  .function = bier_load_balance,
  .name = "bier-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_bier_load_balance_trace,
  .sibling_of = "mpls-load-balance",
};