aboutsummaryrefslogtreecommitdiffstats
path: root/src/plugins/marvell/pp2/output.c
blob: 18969a458e24664aeb5d67fde1618c6004a33171 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
/*
 *------------------------------------------------------------------
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

#include <stdint.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/uio.h>

#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
#include <vnet/ethernet/ethernet.h>
#include <vnet/devices/devices.h>

#include <marvell/pp2/pp2.h>

uword
mrvl_pp2_interface_tx (vlib_main_t * vm,
		       vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  mrvl_pp2_main_t *ppm = &mrvl_pp2_main;
  vnet_interface_output_runtime_t *rd = (void *) node->runtime_data;
  mrvl_pp2_if_t *ppif = pool_elt_at_index (ppm->interfaces, rd->dev_instance);
  u32 thread_index = vm->thread_index;
  mrvl_pp2_per_thread_data_t *ptd =
    vec_elt_at_index (ppm->per_thread_data, thread_index);
  u8 qid = thread_index;
  mrvl_pp2_outq_t *outq = vec_elt_at_index (ppif->outqs, qid);
  u32 *buffers = vlib_frame_vector_args (frame);
  u16 n_desc = frame->n_vectors, n_left = n_desc, n_sent = n_desc, n_done;
  struct pp2_ppio_desc *d;
  u16 mask = outq->size - 1;

  if (PREDICT_FALSE (pp2_ppio_get_num_outq_done (ppif->ppio, ptd->hif, qid,
						 &n_done)))
    {
      n_done = 0;
      vlib_error_count (vm, node->node_index,
			MRVL_PP2_TX_ERROR_PPIO_GET_NUM_OUTQ_DONE, 1);
    }

  if (n_done)
    {
      u16 n_free = clib_min (n_done, outq->size - (outq->tail & mask));
      vlib_buffer_free (vm, outq->buffers + (outq->tail & mask), n_free);
      if (PREDICT_FALSE (n_free < n_done))
	vlib_buffer_free (vm, outq->buffers, n_done - n_free);
      outq->tail += n_done;
    }

  vec_validate_aligned (ptd->descs, n_left, CLIB_CACHE_LINE_BYTES);
  d = ptd->descs;
  while (n_left)
    {
      u32 bi0 = buffers[0];
      vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
      u64 paddr = vlib_buffer_get_pa (vm, b0);

      pp2_ppio_outq_desc_reset (d);
      pp2_ppio_outq_desc_set_phys_addr (d, paddr + b0->current_data);
      pp2_ppio_outq_desc_set_pkt_offset (d, 0);
      pp2_ppio_outq_desc_set_pkt_len (d, b0->current_length);
      d++;
      buffers++;
      n_left--;
    }

  if (pp2_ppio_send (ppif->ppio, ptd->hif, qid, ptd->descs, &n_sent))
    {
      n_sent = 0;
      vlib_error_count (vm, node->node_index, MRVL_PP2_TX_ERROR_PPIO_SEND, 1);
    }

  /* free unsent buffers */
  if (PREDICT_FALSE (n_sent != n_desc))
    {
      vlib_buffer_free (vm, vlib_frame_vector_args (frame) + n_sent,
			frame->n_vectors - n_sent);
      vlib_error_count (vm, node->node_index, MRVL_PP2_TX_ERROR_NO_FREE_SLOTS,
			frame->n_vectors - n_sent);
    }

  /* store buffer index for each enqueued packet into the ring
     so we can know what to free after packet is sent */
  if (n_sent)
    {
      u16 slot = outq->head & mask;
      buffers = vlib_frame_vector_args (frame);
      u16 n_copy = clib_min (outq->size - slot, n_sent);

      clib_memcpy (outq->buffers + slot, buffers, n_copy * sizeof (u32));
      if (PREDICT_FALSE (n_copy < n_sent))
	clib_memcpy (outq->buffers, buffers + n_copy,
		     (n_sent - n_copy) * sizeof (u32));

      outq->head += n_sent;
    }

  return n_sent;
}

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */
n902'>902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110
/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/ip/lookup.h>
#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/load_balance_map.h>
#include <vnet/dpo/drop_dpo.h>
#include <vppinfra/math.h>              /* for fabs */
#include <vnet/adj/adj.h>
#include <vnet/adj/adj_internal.h>
#include <vnet/fib/fib_urpf_list.h>

/*
 * distribution error tolerance for load-balancing
 */
const f64 multipath_next_hop_error_tolerance = 0.1;

#undef LB_DEBUG

#ifdef LB_DEBUG
#define LB_DBG(_lb, _fmt, _args...)                                     \
{                                                                       \
    u8* _tmp =NULL;                                                     \
    clib_warning("lb:[%s]:" _fmt,                                       \
                 load_balance_format(load_balance_get_index((_lb)),     \
                                     0, _tmp),                          \
                 ##_args);                                              \
    vec_free(_tmp);                                                     \
}
#else
#define LB_DBG(_p, _fmt, _args...)
#endif


/**
 * Pool of all DPOs. It's not static so the DP can have fast access
 */
load_balance_t *load_balance_pool;

/**
 * The one instance of load-balance main
 */
load_balance_main_t load_balance_main;

f64
load_balance_get_multipath_tolerance (void)
{
    return (multipath_next_hop_error_tolerance);
}

static inline index_t
load_balance_get_index (const load_balance_t *lb)
{
    return (lb - load_balance_pool);
}

static inline dpo_id_t*
load_balance_get_buckets (load_balance_t *lb)
{
    if (LB_HAS_INLINE_BUCKETS(lb))
    {
        return (lb->lb_buckets_inline);
    }
    else
    {
        return (lb->lb_buckets);
    }
}

static load_balance_t *
load_balance_alloc_i (void)
{
    load_balance_t *lb;

    pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
    memset(lb, 0, sizeof(*lb));

    lb->lb_map = INDEX_INVALID;
    lb->lb_urpf = INDEX_INVALID;
    vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
                                   load_balance_get_index(lb));
    vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
                                   load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
                               load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
                               load_balance_get_index(lb));

    return (lb);
}

static u8*
load_balance_format (index_t lbi,
                     load_balance_format_flags_t flags,
                     u32 indent,
                     u8 *s)
{
    vlib_counter_t to, via;
    load_balance_t *lb;
    dpo_id_t *buckets;
    u32 i;

    lb = load_balance_get(lbi);
    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
    buckets = load_balance_get_buckets(lb);

    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
    s = format(s, "[index:%d buckets:%d ", lbi, lb->lb_n_buckets);
    s = format(s, "uRPF:%d ", lb->lb_urpf);
    s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
    if (0 != via.packets)
    {
        s = format(s, " via:[%Ld:%Ld]",
                   via.packets, via.bytes);
    }
    s = format(s, "]");

    if (INDEX_INVALID != lb->lb_map)
    {
        s = format(s, "\n%U%U",
                   format_white_space, indent+4,
                   format_load_balance_map, lb->lb_map, indent+4);
    }
    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        s = format(s, "\n%U[%d] %U",
                   format_white_space, indent+2,
                   i,
                   format_dpo_id,
                   &buckets[i], indent+6);
    }
    return (s);
}

u8*
format_load_balance (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);

    return (load_balance_format(lbi, flags, 0, s));
}
static u8*
format_load_balance_dpo (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    u32 indent = va_arg(*args, u32);

    return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
}


static load_balance_t *
load_balance_create_i (u32 num_buckets,
                       dpo_proto_t lb_proto,
                       flow_hash_config_t fhc)
{
    load_balance_t *lb;

    lb = load_balance_alloc_i();
    lb->lb_hash_config = fhc;
    lb->lb_n_buckets = num_buckets;
    lb->lb_n_buckets_minus_1 = num_buckets-1;
    lb->lb_proto = lb_proto;

    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_validate_aligned(lb->lb_buckets,
                             lb->lb_n_buckets - 1,
                             CLIB_CACHE_LINE_BYTES);
    }

    LB_DBG(lb, "create");

    return (lb);
}

index_t
load_balance_create (u32 n_buckets,
                     dpo_proto_t lb_proto,
                     flow_hash_config_t fhc)
{
    return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
}

static inline void
load_balance_set_bucket_i (load_balance_t *lb,
                           u32 bucket,
                           dpo_id_t *buckets,
                           const dpo_id_t *next)
{
    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
}

void
load_balance_set_bucket (index_t lbi,
                         u32 bucket,
                         const dpo_id_t *next)
{
    load_balance_t *lb;
    dpo_id_t *buckets;

    lb = load_balance_get(lbi);
    buckets = load_balance_get_buckets(lb);

    ASSERT(bucket < lb->lb_n_buckets);

    load_balance_set_bucket_i(lb, bucket, buckets, next);
}

int
load_balance_is_drop (const dpo_id_t *dpo)
{
    load_balance_t *lb;

    if (DPO_LOAD_BALANCE != dpo->dpoi_type)
        return (0);

    lb = load_balance_get(dpo->dpoi_index);

    if (1 == lb->lb_n_buckets)
    {
        return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
    }
    return (0);
}

void
load_balance_set_fib_entry_flags (index_t lbi,
                                  fib_entry_flag_t flags)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);
    lb->lb_fib_entry_flags = flags;
}


void
load_balance_set_urpf (index_t lbi,
		       index_t urpf)
{
    load_balance_t *lb;
    index_t old;

    lb = load_balance_get(lbi);

    /*
     * packets in flight we see this change. but it's atomic, so :P
     */
    old = lb->lb_urpf;
    lb->lb_urpf = urpf;

    fib_urpf_list_unlock(old);
    fib_urpf_list_lock(urpf);
}

index_t
load_balance_get_urpf (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_urpf);
}

const dpo_id_t *
load_balance_get_bucket (index_t lbi,
                         u32 bucket)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (load_balance_get_bucket_i(lb, bucket));
}

static int
next_hop_sort_by_weight (const load_balance_path_t * n1,
                         const load_balance_path_t * n2)
{
    return ((int) n1->path_weight - (int) n2->path_weight);
}

/* Given next hop vector is over-written with normalized one with sorted weights and
   with weights corresponding to the number of adjacencies for each next hop.
   Returns number of adjacencies in block. */
u32
ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
                                  load_balance_path_t ** normalized_next_hops,
                                  u32 *sum_weight_in,
                                  f64 multipath_next_hop_error_tolerance)
{
    load_balance_path_t * nhs;
    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
    f64 norm, error;

    n_nhs = vec_len (raw_next_hops);
    ASSERT (n_nhs > 0);
    if (n_nhs == 0)
        return 0;

    /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
    nhs = *normalized_next_hops;
    vec_validate (nhs, 2*n_nhs - 1);

    /* Fast path: 1 next hop in block. */
    n_adj = n_nhs;
    if (n_nhs == 1)
    {
        nhs[0] = raw_next_hops[0];
        nhs[0].path_weight = 1;
        _vec_len (nhs) = 1;
        sum_weight = 1;
        goto done;
    }

    else if (n_nhs == 2)
    {
        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;

        /* Fast sort. */
        nhs[0] = raw_next_hops[cmp];
        nhs[1] = raw_next_hops[cmp ^ 1];

        /* Fast path: equal cost multipath with 2 next hops. */
        if (nhs[0].path_weight == nhs[1].path_weight)
        {
            nhs[0].path_weight = nhs[1].path_weight = 1;
            _vec_len (nhs) = 2;
            sum_weight = 2;
            goto done;
        }
    }
    else
    {
        clib_memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
    }

    /* Find total weight to normalize weights. */
    sum_weight = 0;
    for (i = 0; i < n_nhs; i++)
        sum_weight += nhs[i].path_weight;

    /* In the unlikely case that all weights are given as 0, set them all to 1. */
    if (sum_weight == 0)
    {
        for (i = 0; i < n_nhs; i++)
            nhs[i].path_weight = 1;
        sum_weight = n_nhs;
    }

    /* Save copies of all next hop weights to avoid being overwritten in loop below. */
    for (i = 0; i < n_nhs; i++)
        nhs[n_nhs + i].path_weight = nhs[i].path_weight;

    /* Try larger and larger power of 2 sized adjacency blocks until we
       find one where traffic flows to within 1% of specified weights. */
    for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
    {
        error = 0;

        norm = n_adj / ((f64) sum_weight);
        n_adj_left = n_adj;
        for (i = 0; i < n_nhs; i++)
        {
            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
            word n = flt_round_nearest (nf);

            n = n > n_adj_left ? n_adj_left : n;
            n_adj_left -= n;
            error += fabs (nf - n);
            nhs[i].path_weight = n;

            if (0 == nhs[i].path_weight)
            {
                /*
                 * when the weight skew is high (norm is small) and n == nf.
                 * without this correction the path with a low weight would have
                 * no represenation in the load-balanace - don't want that.
                 * If the weight skew is high so the load-balance has many buckets
                 * to allow it. pays ya money takes ya choice.
                 */
                error = n_adj;
                break;
            }
        }

        nhs[0].path_weight += n_adj_left;

        /* Less than 5% average error per adjacency with this size adjacency block? */
        if (error <= multipath_next_hop_error_tolerance*n_adj)
        {
            /* Truncate any next hops with zero weight. */
            _vec_len (nhs) = i;
            break;
        }
    }

done:
    /* Save vector for next call. */
    *normalized_next_hops = nhs;
    *sum_weight_in = sum_weight;
    return n_adj;
}

static load_balance_path_t *
load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
                                       dpo_proto_t drop_proto)
{
    if (0 == vec_len(nhs))
    {
        load_balance_path_t *new_nhs = NULL, *nh;

        /*
         * we need something for the load-balance. so use the drop
         */
        vec_add2(new_nhs, nh, 1);

        nh->path_weight = 1;
        dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));

        return (new_nhs);
    }

    return (NULL);
}

/*
 * Fill in adjacencies in block based on corresponding
 * next hop adjacencies.
 */
static void
load_balance_fill_buckets (load_balance_t *lb,
                           load_balance_path_t *nhs,
                           dpo_id_t *buckets,
                           u32 n_buckets)
{
    load_balance_path_t * nh;
    u16 ii, bucket;

    bucket = 0;

    /*
     * the next-hops have normalised weights. that means their sum is the number
     * of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
        }
    }
}

static inline void
load_balance_set_n_buckets (load_balance_t *lb,
                            u32 n_buckets)
{
    lb->lb_n_buckets = n_buckets;
    lb->lb_n_buckets_minus_1 = n_buckets-1;
}

void
load_balance_multipath_update (const dpo_id_t *dpo,
                               const load_balance_path_t * raw_nhs,
                               load_balance_flags_t flags)
{
    load_balance_path_t *nh, *nhs, *fixed_nhs;
    u32 sum_of_weights, n_buckets, ii;
    index_t lbmi, old_lbmi;
    load_balance_t *lb;
    dpo_id_t *tmp_dpo;

    nhs = NULL;

    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
    lb = load_balance_get(dpo->dpoi_index);
    fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
    n_buckets =
        ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
                                          raw_nhs :
                                          fixed_nhs),
                                         &nhs,
                                         &sum_of_weights,
                                         multipath_next_hop_error_tolerance);

    ASSERT (n_buckets >= vec_len (raw_nhs));

    /*
     * Save the old load-balance map used, and get a new one if required.
     */
    old_lbmi = lb->lb_map;
    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
    {
        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
    }
    else
    {
        lbmi = INDEX_INVALID;
    }

    if (0 == lb->lb_n_buckets)
    {
        /*
         * first time initialisation. no packets inflight, so we can write
         * at leisure.
         */
        load_balance_set_n_buckets(lb, n_buckets);

        if (!LB_HAS_INLINE_BUCKETS(lb))
            vec_validate_aligned(lb->lb_buckets,
                                 lb->lb_n_buckets - 1,
                                 CLIB_CACHE_LINE_BYTES);

        load_balance_fill_buckets(lb, nhs,
                                  load_balance_get_buckets(lb),
                                  n_buckets);
        lb->lb_map = lbmi;
    }
    else
    {
        /*
         * This is a modification of an existing load-balance.
         * We need to ensure that packets inflight see a consistent state, that
         * is the number of reported buckets the LB has (read from
         * lb_n_buckets_minus_1) is not more than it actually has. So if the
         * number of buckets is increasing, we must update the bucket array first,
         * then the reported number. vice-versa if the number of buckets goes down.
         */
        if (n_buckets == lb->lb_n_buckets)
        {
            /*
             * no change in the number of buckets. we can simply fill what
             * is new over what is old.
             */
            load_balance_fill_buckets(lb, nhs,
                                      load_balance_get_buckets(lb),
                                      n_buckets);
            lb->lb_map = lbmi;
        }
        else if (n_buckets > lb->lb_n_buckets)
        {
            /*
             * we have more buckets. the old load-balance map (if there is one)
             * will remain valid, i.e. mapping to indices within range, so we
             * update it last.
             */
            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new increased number of buckets is crossing the threshold
                 * from the inline storage to out-line. Alloc the outline buckets
                 * first, then fixup the number. then reset the inlines.
                 */
                ASSERT(NULL == lb->lb_buckets);
                vec_validate_aligned(lb->lb_buckets,
                                     n_buckets - 1,
                                     CLIB_CACHE_LINE_BYTES);

                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets,
                                          n_buckets);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);

                CLIB_MEMORY_BARRIER();

                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
                {
                    dpo_reset(&lb->lb_buckets_inline[ii]);
                }
            }
            else
            {
                if (n_buckets <= LB_NUM_INLINE_BUCKETS)
                {
                    /*
                     * we are not crossing the threshold and it's still inline buckets.
                     * we can write the new on the old..
                     */
                    load_balance_fill_buckets(lb, nhs,
                                              load_balance_get_buckets(lb),
                                              n_buckets);
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);
                }
                else
                {
                    /*
                     * we are not crossing the threshold. We need a new bucket array to
                     * hold the increased number of choices.
                     */
                    dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;

                    new_buckets = NULL;
                    old_buckets = load_balance_get_buckets(lb);

                    vec_validate_aligned(new_buckets,
                                         n_buckets - 1,
                                         CLIB_CACHE_LINE_BYTES);

                    load_balance_fill_buckets(lb, nhs, new_buckets, n_buckets);
                    CLIB_MEMORY_BARRIER();
                    lb->lb_buckets = new_buckets;
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);

                    vec_foreach(tmp_dpo, old_buckets)
                    {
                        dpo_reset(tmp_dpo);
                    }
                    vec_free(old_buckets);
                }
            }

            /*
             * buckets fixed. ready for the MAP update.
             */
            lb->lb_map = lbmi;
        }
        else
        {
            /*
             * bucket size shrinkage.
             * Any map we have will be based on the old
             * larger number of buckets, so will be translating to indices
             * out of range. So the new MAP must be installed first.
             */
            lb->lb_map = lbmi;
            CLIB_MEMORY_BARRIER();


            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new decreased number of buckets is crossing the threshold
                 * from out-line storage to inline:
                 *   1 - Fill the inline buckets,
                 *   2 - fixup the number (and this point the inline buckets are
                 *       used).
                 *   3 - free the outline buckets
                 */
                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets_inline,
                                          n_buckets);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                vec_foreach(tmp_dpo, lb->lb_buckets)
                {
                    dpo_reset(tmp_dpo);
                }
                vec_free(lb->lb_buckets);
            }
            else
            {
                /*
                 * not crossing the threshold.
                 *  1 - update the number to the smaller size
                 *  2 - write the new buckets
                 *  3 - reset those no longer used.
                 */
                dpo_id_t *buckets;
                u32 old_n_buckets;

                old_n_buckets = lb->lb_n_buckets;
                buckets = load_balance_get_buckets(lb);

                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                load_balance_fill_buckets(lb, nhs,
                                          buckets,
                                          n_buckets);

                for (ii = n_buckets; ii < old_n_buckets; ii++)
                {
                    dpo_reset(&buckets[ii]);
                }
            }
        }
    }

    vec_foreach (nh, nhs)
    {
        dpo_reset(&nh->path_dpo);
    }
    vec_free(nhs);
    vec_free(fixed_nhs);

    load_balance_map_unlock(old_lbmi);
}

static void
load_balance_lock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks++;
}

static void
load_balance_destroy (load_balance_t *lb)
{
    dpo_id_t *buckets;
    int i;

    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        dpo_reset(&buckets[i]);
    }

    LB_DBG(lb, "destroy");
    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_free(lb->lb_buckets);
    }

    fib_urpf_list_unlock(lb->lb_urpf);
    load_balance_map_unlock(lb->lb_map);

    pool_put(load_balance_pool, lb);
}

static void
load_balance_unlock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks--;

    if (0 == lb->lb_locks)
    {
        load_balance_destroy(lb);
    }
}

static void
load_balance_mem_show (void)
{
    fib_show_memory_usage("load-balance",
			  pool_elts(load_balance_pool),
			  pool_len(load_balance_pool),
			  sizeof(load_balance_t));
    load_balance_map_show_mem();
}

const static dpo_vft_t lb_vft = {
    .dv_lock = load_balance_lock,
    .dv_unlock = load_balance_unlock,
    .dv_format = format_load_balance_dpo,
    .dv_mem_show = load_balance_mem_show,
};

/**
 * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance
 *        object.
 *
 * this means that these graph nodes are ones from which a load-balance is the
 * parent object in the DPO-graph.
 *
 * We do not list all the load-balance nodes, such as the *-lookup. instead
 * we are relying on the correct use of the .sibling_of field when setting
 * up these sibling nodes.
 */
const static char* const load_balance_ip4_nodes[] =
{
    "ip4-load-balance",
    NULL,
};
const static char* const load_balance_ip6_nodes[] =
{
    "ip6-load-balance",
    NULL,
};
const static char* const load_balance_mpls_nodes[] =
{
    "mpls-load-balance",
    NULL,
};
const static char* const load_balance_l2_nodes[] =
{
    "l2-load-balance",
    NULL,
};
const static char* const load_balance_nsh_nodes[] =
{
    "nsh-load-balance",
    NULL,
};
const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = load_balance_ip4_nodes,
    [DPO_PROTO_IP6]  = load_balance_ip6_nodes,
    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
    [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
    [DPO_PROTO_NSH] = load_balance_nsh_nodes,
};

void
load_balance_module_init (void)
{
    dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);

    /*
     * Special LB with index zero. we need to define this since the v4 mtrie
     * assumes an index of 0 implies the ply is empty. therefore all 'real'
     * adjs need a non-zero index.
     */
    load_balance_create(0, DPO_PROTO_IP4, 0);

    load_balance_map_module_init();
}

static clib_error_t *
load_balance_show (vlib_main_t * vm,
                   unformat_input_t * input,
                   vlib_cli_command_t * cmd)
{
    index_t lbi = INDEX_INVALID;

    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
        if (unformat (input, "%d", &lbi))
            ;
        else
            break;
    }

    if (INDEX_INVALID != lbi)
    {
        vlib_cli_output (vm, "%U", format_load_balance, lbi,
                         LOAD_BALANCE_FORMAT_DETAIL);
    }
    else
    {
        load_balance_t *lb;

        pool_foreach(lb, load_balance_pool,
        ({
            vlib_cli_output (vm, "%U", format_load_balance,
                             load_balance_get_index(lb),
                             LOAD_BALANCE_FORMAT_NONE);
        }));
    }

    return 0;
}

VLIB_CLI_COMMAND (load_balance_show_command, static) = {
    .path = "show load-balance",
    .short_help = "show load-balance [<index>]",
    .function = load_balance_show,
};


always_inline u32
ip_flow_hash (void *data)
{
  ip4_header_t *iph = (ip4_header_t *) data;

  if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
    return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
  else
    return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
}

always_inline u64
mac_to_u64 (u8 * m)
{
  return (*((u64 *) m) & 0xffffffffffff);
}

always_inline u32
l2_flow_hash (vlib_buffer_t * b0)
{
  ethernet_header_t *eh;
  u64 a, b, c;
  uword is_ip, eh_size;
  u16 eh_type;

  eh = vlib_buffer_get_current (b0);
  eh_type = clib_net_to_host_u16 (eh->type);
  eh_size = ethernet_buffer_header_size (b0);

  is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);

  /* since we have 2 cache lines, use them */
  if (is_ip)
    a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
  else
    a = eh->type;

  b = mac_to_u64 ((u8 *) eh->dst_address);
  c = mac_to_u64 ((u8 *) eh->src_address);
  hash_mix64 (a, b, c);

  return (u32) c;
}

typedef struct load_balance_trace_t_
{
    index_t lb_index;
} load_balance_trace_t;

static uword
l2_load_balance (vlib_main_t * vm,
		 vlib_node_runtime_t * node,
		 vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  vlib_buffer_t *b0;
	  u32 bi0, lbi0, next0;
	  const dpo_id_t *dpo0;
	  const load_balance_t *lb0;

	  bi0 = from[0];
	  to_next[0] = bi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  b0 = vlib_get_buffer (vm, bi0);

	  /* lookup dst + src mac */
	  lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
	  lb0 = load_balance_get(lbi0);

	  vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);

	  dpo0 = load_balance_get_bucket_i(lb0, 
					   vnet_buffer(b0)->ip.flow_hash &
					   (lb0->lb_n_buckets_minus_1));

	  next0 = dpo0->dpoi_next_node;
	  vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

	  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
	    {
	      load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
							 sizeof (*tr));
	      tr->lb_index = lbi0;
	    }
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
					   n_left_to_next, bi0, next0);
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static u8 *
format_l2_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "L2-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (l2_load_balance_node) = {
  .function = l2_load_balance,
  .name = "l2-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_l2_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};

static uword
nsh_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
  u32 n_left_from, next_index, *from, *to_next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *b0;
          u32 bi0, lbi0, next0, *nsh0;
          const dpo_id_t *dpo0;
          const load_balance_t *lb0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          lbi0 =  vnet_buffer (b0)->ip.adj_index[VLIB_TX];
          lb0 = load_balance_get(lbi0);

          /* SPI + SI are the second word of the NSH header */
          nsh0 = vlib_buffer_get_current (b0);
          vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets;

          dpo0 = load_balance_get_bucket_i(lb0,
                                           vnet_buffer(b0)->ip.flow_hash &
                                           (lb0->lb_n_buckets_minus_1));

          next0 = dpo0->dpoi_next_node;
          vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                         sizeof (*tr));
              tr->lb_index = lbi0;
            }
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                           n_left_to_next, bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static u8 *
format_nsh_load_balance_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

  s = format (s, "NSH-load-balance: index %d", t->lb_index);
  return s;
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (nsh_load_balance_node) = {
  .function = nsh_load_balance,
  .name = "nsh-load-balance",
  .vector_size = sizeof (u32),

  .format_trace = format_nsh_load_balance_trace,
  .n_next_nodes = 1,
  .next_nodes = {
      [0] = "error-drop",
  },
};