path: root/src/vnet/adj/adj_glean.c
/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/adj/adj.h>
#include <vnet/adj/adj_internal.h>
#include <vnet/fib/fib_walk.h>

/*
 * The 'DB' of all glean adjs.
 * There is only one glean adjacency per interface per protocol, so this is
 * a per-protocol set of vectors indexed by sw_if_index.
 */
static adj_index_t *adj_gleans[FIB_PROTOCOL_MAX];
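
/*
 * Illustrative sketch only (this helper is hypothetical and not part of
 * this file): a lookup in the DB is a bounds check followed by a direct
 * vector index, mirroring the checks performed by the walk functions below.
 *
 *   static adj_index_t
 *   adj_glean_db_lookup (fib_protocol_t proto, u32 sw_if_index)
 *   {
 *       if (sw_if_index >= vec_len(adj_gleans[proto]))
 *           return (ADJ_INDEX_INVALID);
 *       return (adj_gleans[proto][sw_if_index]);
 *   }
 */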

static inline vlib_node_registration_t*
adj_get_glean_node (fib_protocol_t proto)
{
    switch (proto) {
    case FIB_PROTOCOL_IP4:
	return (&ip4_glean_node);
    case FIB_PROTOCOL_IP6:
	return (&ip6_glean_node);
    case FIB_PROTOCOL_MPLS:
	break;
    }
    ASSERT(0);
    return (NULL);
}

/*
 * adj_glean_add_or_lock
 *
 * The next-hop address here is used for source address selection in the DP.
 * The glean adj is added to an interface's connected prefix; the next-hop
 * passed here is the local (receive) address on that same interface.
 */
adj_index_t
adj_glean_add_or_lock (fib_protocol_t proto,
		       u32 sw_if_index,
		       const ip46_address_t *nh_addr)
{
    ip_adjacency_t * adj;

    vec_validate_init_empty(adj_gleans[proto], sw_if_index, ADJ_INDEX_INVALID);

    if (ADJ_INDEX_INVALID == adj_gleans[proto][sw_if_index])
    {
	adj = adj_alloc(proto);

	adj->lookup_next_index = IP_LOOKUP_NEXT_GLEAN;
	adj->ia_nh_proto = proto;
	adj_gleans[proto][sw_if_index] = adj_get_index(adj);

	if (NULL != nh_addr)
	{
	    adj->sub_type.glean.receive_addr = *nh_addr;
	}

	adj->rewrite_header.data_bytes = 0;

	vnet_rewrite_for_sw_interface(vnet_get_main(),
				      adj_fib_proto_2_nd(proto),
				      sw_if_index,
				      adj_get_glean_node(proto)->index,
				      VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST,
				      &adj->rewrite_header,
				      sizeof (adj->rewrite_data));
    }
    else
    {
	adj = adj_get(adj_gleans[proto][sw_if_index]);
    }

    adj_lock(adj_get_index(adj));

    return (adj_get_index(adj));
}
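
/*
 * A minimal usage sketch (illustrative only; the interface index and the
 * address below are hypothetical). A caller adding a connected prefix would
 * lock a glean for the interface and release it when done:
 *
 *   ip46_address_t local = {
 *       .ip4.as_u32 = clib_host_to_net_u32(0x0a000001),  // 10.0.0.1, example
 *   };
 *   adj_index_t ai;
 *
 *   ai = adj_glean_add_or_lock(FIB_PROTOCOL_IP4, sw_if_index, &local);
 *   // ... use the adjacency, e.g. stack a DPO on it ...
 *   adj_unlock(ai);
 */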

void
adj_glean_remove (fib_protocol_t proto,
		  u32 sw_if_index)
{
    ASSERT(sw_if_index < vec_len(adj_gleans[proto]));

    adj_gleans[proto][sw_if_index] = ADJ_INDEX_INVALID;
}

static clib_error_t *
adj_glean_interface_state_change (vnet_main_t * vnm,
				  u32 sw_if_index,
				  u32 flags)
{
    /*
     * for each glean on the interface trigger a walk back to the children
     */
    fib_protocol_t proto;
    ip_adjacency_t *adj;


    for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
    {
	if (sw_if_index >= vec_len(adj_gleans[proto]) ||
	    ADJ_INDEX_INVALID == adj_gleans[proto][sw_if_index])
	    continue;

	adj = adj_get(adj_gleans[proto][sw_if_index]);

	fib_node_back_walk_ctx_t bw_ctx = {
	    .fnbw_reason = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP ?
			    FIB_NODE_BW_REASON_FLAG_INTERFACE_UP :
			    FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN),
	};

	fib_walk_sync(FIB_NODE_TYPE_ADJ, adj_get_index(adj), &bw_ctx);
    }

    return (NULL);
}

VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION(adj_glean_interface_state_change);

/**
 * @brief Invoked on each SW interface of a HW interface when the
 * HW interface state changes
 */
static void
adj_nbr_hw_sw_interface_state_change (vnet_main_t * vnm,
                                      u32 sw_if_index,
                                      void *arg)
{
    adj_glean_interface_state_change(vnm, sw_if_index, (uword) arg);
}

/**
 * @brief Registered callback for HW interface state changes
 */
static clib_error_t *
adj_glean_hw_interface_state_change (vnet_main_t * vnm,
                                     u32 hw_if_index,
                                     u32 flags)
{
    /*
     * walk SW interfaces on the HW
     */
    uword sw_flags;

    sw_flags = ((flags & VNET_HW_INTERFACE_FLAG_LINK_UP) ?
                VNET_SW_INTERFACE_FLAG_ADMIN_UP :
                0);

    vnet_hw_interface_walk_sw(vnm, hw_if_index,
                              adj_nbr_hw_sw_interface_state_change,
                              (void*) sw_flags);

    return (NULL);
}

VNET_HW_INTERFACE_LINK_UP_DOWN_FUNCTION(
    adj_glean_hw_interface_state_change);

static clib_error_t *
adj_glean_interface_delete (vnet_main_t * vnm,
			    u32 sw_if_index,
			    u32 is_add)
{
    /*
     * for each glean on the interface trigger a walk back to the children
     */
    fib_protocol_t proto;
    ip_adjacency_t *adj;

    if (is_add)
    {
	/*
	 * not interested in interface additions. we will not back walk
	 * to resolve paths through newly added interfaces. Why? The control
	 * plane should have the brains to add interfaces first, then routes.
	 * So the only case where a path matches an interface that has just
	 * been created is the case where the path resolved through an
	 * interface that was deleted but has not yet been removed. The fact
	 * that an interface with the same sw_if_index has now been added is
	 * NO GUARANTEE that it is the same interface the path needs. So
	 * tough!
	 * If the control plane wants these routes to resolve it needs to
	 * remove and add them again.
	 */
	return (NULL);
    }

    for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
    {
	if (sw_if_index >= vec_len(adj_gleans[proto]) ||
	    ADJ_INDEX_INVALID == adj_gleans[proto][sw_if_index])
	    continue;

	adj = adj_get(adj_gleans[proto][sw_if_index]);

	fib_node_back_walk_ctx_t bw_ctx = {
	    .fnbw_reason =  FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE,
	};

	fib_walk_sync(FIB_NODE_TYPE_ADJ, adj_get_index(adj), &bw_ctx);
    }

    return (NULL);
}

VNET_SW_INTERFACE_ADD_DEL_FUNCTION(adj_glean_interface_delete);

u8*
format_adj_glean (u8* s, va_list *ap)
{
    index_t index = va_arg(*ap, index_t);
    CLIB_UNUSED(u32 indent) = va_arg(*ap, u32);
    vnet_main_t * vnm = vnet_get_main();
    ip_adjacency_t * adj = adj_get(index);

    return (format(s, "%U-glean: %U",
		   format_fib_protocol, adj->ia_nh_proto,
                   format_vnet_sw_interface_name,
                   vnm,
                   vnet_get_sw_interface(vnm,
                                         adj->rewrite_header.sw_if_index)));
}


static void
adj_dpo_lock (dpo_id_t *dpo)
{
    adj_lock(dpo->dpoi_index);
}
static void
adj_dpo_unlock (dpo_id_t *dpo)
{
    adj_unlock(dpo->dpoi_index);
}

const static dpo_vft_t adj_glean_dpo_vft = {
    .dv_lock = adj_dpo_lock,
    .dv_unlock = adj_dpo_unlock,
    .dv_format = format_adj_glean,
};

/**
 * @brief The per-protocol VLIB graph nodes that are assigned to a glean
 *        object.
 *
 * this means that these graph nodes are ones from which a glean is the
 * parent object in the DPO-graph.
 */
const static char* const glean_ip4_nodes[] =
{
    "ip4-glean",
    NULL,
};
const static char* const glean_ip6_nodes[] =
{
    "ip6-glean",
    NULL,
};

const static char* const * const glean_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = glean_ip4_nodes,
    [DPO_PROTO_IP6]  = glean_ip6_nodes,
    [DPO_PROTO_MPLS] = NULL,
};

void
adj_glean_module_init (void)
{
    dpo_register(DPO_ADJACENCY_GLEAN, &adj_glean_dpo_vft, glean_nodes);
}
path: root/src/vppinfra/hash.c
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2001-2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#include <vppinfra/hash.h>
#include <vppinfra/error.h>
#include <vppinfra/mem.h>
#include <vppinfra/byte_order.h>	/* for clib_arch_is_big_endian */

always_inline void
zero_pair (hash_t * h, hash_pair_t * p)
{
  memset (p, 0, hash_pair_bytes (h));
}

always_inline void
init_pair (hash_t * h, hash_pair_t * p)
{
  memset (p->value, ~0, hash_value_bytes (h));
}

always_inline hash_pair_union_t *
get_pair (void *v, uword i)
{
  hash_t *h = hash_header (v);
  hash_pair_t *p;
  ASSERT (i < vec_len (v));
  p = v;
  p += i << h->log2_pair_size;
  return (hash_pair_union_t *) p;
}

always_inline void
set_is_user (void *v, uword i, uword is_user)
{
  hash_t *h = hash_header (v);
  uword i0 = i / BITS (h->is_user[0]);
  uword i1 = (uword) 1 << (i % BITS (h->is_user[0]));
  if (is_user)
    h->is_user[i0] |= i1;
  else
    h->is_user[i0] &= ~i1;
}

static u8 *hash_format_pair_default (u8 * s, va_list * args);

#if uword_bits == 64

static inline u64
zap64 (u64 x, word n)
{
#define _(n) (((u64) 1 << (u64) (8*(n))) - (u64) 1)
  static u64 masks_little_endian[] = {
    0, _(1), _(2), _(3), _(4), _(5), _(6), _(7),
  };
  static u64 masks_big_endian[] = {
    0, ~_(7), ~_(6), ~_(5), ~_(4), ~_(3), ~_(2), ~_(1),
  };
#undef _
  if (clib_arch_is_big_endian)
    return x & masks_big_endian[n];
  else
    return x & masks_little_endian[n];
}
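
/*
 * Worked example (little-endian): zap64 (0x1122334455667788, 3) applies
 * masks_little_endian[3] == 0x0000000000ffffff, keeping only the three
 * low-order bytes and yielding 0x0000000000667788; the bytes that lie
 * beyond the caller's buffer are discarded.
 */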

/**
 * make address-sanitizer skip this:
 * clib_mem_unaligned + zap64 casts its input as u64, computes a mask
 * according to the input length, and returns the masked value.
 * Therefore all 8 bytes of the u64 are systematically read, which
 * rightfully causes address-sanitizer to raise an error on smaller inputs.
 *
 * However, the invalid bytes are discarded within zap64(), which is why
 * this can be silenced safely.
 */
static inline u64 __attribute__ ((no_sanitize_address))
hash_memory64 (void *p, word n_bytes, u64 state)
{
  u64 *q = p;
  u64 a, b, c, n;

  a = b = 0x9e3779b97f4a7c13LL;
  c = state;
  n = n_bytes;

  while (n >= 3 * sizeof (u64))
    {
      a += clib_mem_unaligned (q + 0, u64);
      b += clib_mem_unaligned (q + 1, u64);
      c += clib_mem_unaligned (q + 2, u64);
      hash_mix64 (a, b, c);
      n -= 3 * sizeof (u64);
      q += 3;
    }

  c += n_bytes;
  switch (n / sizeof (u64))
    {
    case 2:
      a += clib_mem_unaligned (q + 0, u64);
      b += clib_mem_unaligned (q + 1, u64);
      if (n % sizeof (u64))
	c += zap64 (clib_mem_unaligned (q + 2, u64), n % sizeof (u64)) << 8;
      break;

    case 1:
      a += clib_mem_unaligned (q + 0, u64);
      if (n % sizeof (u64))
	b += zap64 (clib_mem_unaligned (q + 1, u64), n % sizeof (u64));
      break;

    case 0:
      if (n % sizeof (u64))
	a += zap64 (clib_mem_unaligned (q + 0, u64), n % sizeof (u64));
      break;
    }

  hash_mix64 (a, b, c);

  return c;
}

#else /* if uword_bits == 64 */

static inline u32
zap32 (u32 x, word n)
{
#define _(n) (((u32) 1 << (u32) (8*(n))) - (u32) 1)
  static u32 masks_little_endian[] = {
    0, _(1), _(2), _(3),
  };
  static u32 masks_big_endian[] = {
    0, ~_(3), ~_(2), ~_(1),
  };
#undef _
  if (clib_arch_is_big_endian)
    return x & masks_big_endian[n];
  else
    return x & masks_little_endian[n];
}

static inline u32
hash_memory32 (void *p, word n_bytes, u32 state)
{
  u32 *q = p;
  u32 a, b, c, n;

  a = b = 0x9e3779b9;
  c = state;
  n = n_bytes;

  while (n >= 3 * sizeof (u32))
    {
      a += clib_mem_unaligned (q + 0, u32);
      b += clib_mem_unaligned (q + 1, u32);
      c += clib_mem_unaligned (q + 2, u32);
      hash_mix32 (a, b, c);
      n -= 3 * sizeof (u32);
      q += 3;
    }

  c += n_bytes;
  switch (n / sizeof (u32))
    {
    case 2:
      a += clib_mem_unaligned (q + 0, u32);
      b += clib_mem_unaligned (q + 1, u32);
      if (n % sizeof (u32))
	c += zap32 (clib_mem_unaligned (q + 2, u32), n % sizeof (u32)) << 8;
      break;

    case 1:
      a += clib_mem_unaligned (q + 0, u32);
      if (n % sizeof (u32))
	b += zap32 (clib_mem_unaligned (q + 1, u32), n % sizeof (u32));
      break;

    case 0:
      if (n % sizeof (u32))
	a += zap32 (clib_mem_unaligned (q + 0, u32), n % sizeof (u32));
      break;
    }

  hash_mix32 (a, b, c);

  return c;
}
#endif

uword
hash_memory (void *p, word n_bytes, uword state)
{
  uword *q = p;

#if uword_bits == 64
  return hash_memory64 (q, n_bytes, state);
#else
  return hash_memory32 (q, n_bytes, state);
#endif
}

#if uword_bits == 64
always_inline uword
hash_uword (uword x)
{
  u64 a, b, c;

  a = b = 0x9e3779b97f4a7c13LL;
  c = 0;
  a += x;
  hash_mix64 (a, b, c);
  return c;
}
#else
always_inline uword
hash_uword (uword x)
{
  u32 a, b, c;

  a = b = 0x9e3779b9;
  c = 0;
  a += x;
  hash_mix32 (a, b, c);
  return c;
}
#endif

/* Call sum function.  Hash code will be sum function value
   modulo the (power of two) size of the hash table. */
always_inline uword
key_sum (hash_t * h, uword key)
{
  uword sum;
  switch (pointer_to_uword ((void *) h->key_sum))
    {
    case KEY_FUNC_NONE:
      sum = hash_uword (key);
      break;

    case KEY_FUNC_POINTER_UWORD:
      sum = hash_uword (*uword_to_pointer (key, uword *));
      break;

    case KEY_FUNC_POINTER_U32:
      sum = hash_uword (*uword_to_pointer (key, u32 *));
      break;

    case KEY_FUNC_STRING:
      sum = string_key_sum (h, key);
      break;

    default:
      sum = h->key_sum (h, key);
      break;
    }

  return sum;
}

always_inline uword
key_equal1 (hash_t * h, uword key1, uword key2, uword e)
{
  switch (pointer_to_uword ((void *) h->key_equal))
    {
    case KEY_FUNC_NONE:
      break;

    case KEY_FUNC_POINTER_UWORD:
      e =
	*uword_to_pointer (key1, uword *) == *uword_to_pointer (key2,
								uword *);
      break;

    case KEY_FUNC_POINTER_U32:
      e = *uword_to_pointer (key1, u32 *) == *uword_to_pointer (key2, u32 *);
      break;

    case KEY_FUNC_STRING:
      e = string_key_equal (h, key1, key2);
      break;

    default:
      e = h->key_equal (h, key1, key2);
      break;
    }
  return e;
}

/* Compares two keys: returns 1 if equal, 0 if not. */
always_inline uword
key_equal (hash_t * h, uword key1, uword key2)
{
  uword e = key1 == key2;
  if (CLIB_DEBUG > 0 && key1 == key2)
    ASSERT (key_equal1 (h, key1, key2, e));
  if (!e)
    e = key_equal1 (h, key1, key2, e);
  return e;
}

static hash_pair_union_t *
get_indirect (void *v, hash_pair_indirect_t * pi, uword key)
{
  hash_t *h = hash_header (v);
  hash_pair_t *p0, *p1;

  p0 = p1 = pi->pairs;
  if (h->log2_pair_size > 0)
    p1 = hash_forward (h, p0, indirect_pair_get_len (pi));
  else
    p1 += vec_len (p0);

  while (p0 < p1)
    {
      if (key_equal (h, p0->key, key))
	return (hash_pair_union_t *) p0;
      p0 = hash_forward1 (h, p0);
    }

  return (hash_pair_union_t *) 0;
}

static hash_pair_union_t *
set_indirect_is_user (void *v, uword i, hash_pair_union_t * p, uword key)
{
  hash_t *h = hash_header (v);
  hash_pair_t *q;
  hash_pair_indirect_t *pi = &p->indirect;
  uword log2_bytes = 0;

  if (h->log2_pair_size == 0)
    q = vec_new (hash_pair_t, 2);
  else
    {
      log2_bytes = 1 + hash_pair_log2_bytes (h);
      q = clib_mem_alloc (1ULL << log2_bytes);
    }
  clib_memcpy (q, &p->direct, hash_pair_bytes (h));

  pi->pairs = q;
  if (h->log2_pair_size > 0)
    indirect_pair_set (pi, log2_bytes, 2);

  set_is_user (v, i, 0);

  /* First element is used by existing pair, second will be used by caller. */
  q = hash_forward1 (h, q);
  q->key = key;
  init_pair (h, q);
  return (hash_pair_union_t *) q;
}

static hash_pair_union_t *
set_indirect (void *v, hash_pair_indirect_t * pi, uword key,
	      uword * found_key)
{
  hash_t *h = hash_header (v);
  hash_pair_t *new_pair;
  hash_pair_union_t *q;

  q = get_indirect (v, pi, key);
  if (q)
    {
      *found_key = 1;
      return q;
    }

  if (h->log2_pair_size == 0)
    vec_add2 (pi->pairs, new_pair, 1);
  else
    {
      uword len, new_len, log2_bytes;

      len = indirect_pair_get_len (pi);
      log2_bytes = indirect_pair_get_log2_bytes (pi);

      new_len = len + 1;
      if (new_len * hash_pair_bytes (h) > (1ULL << log2_bytes))
	{
	  pi->pairs = clib_mem_realloc (pi->pairs,
					1ULL << (log2_bytes + 1),
					1ULL << log2_bytes);
	  log2_bytes++;
	}

      indirect_pair_set (pi, log2_bytes, new_len);
      new_pair = pi->pairs + (len << h->log2_pair_size);
    }
  new_pair->key = key;
  init_pair (h, new_pair);
  *found_key = 0;
  return (hash_pair_union_t *) new_pair;
}

static void
unset_indirect (void *v, uword i, hash_pair_t * q)
{
  hash_t *h = hash_header (v);
  hash_pair_union_t *p = get_pair (v, i);
  hash_pair_t *e;
  hash_pair_indirect_t *pi = &p->indirect;
  uword len, is_vec;

  is_vec = h->log2_pair_size == 0;

  ASSERT (!hash_is_user (v, i));
  len = is_vec ? vec_len (pi->pairs) : indirect_pair_get_len (pi);
  e = hash_forward (h, pi->pairs, len - 1);
  ASSERT (q >= pi->pairs && q <= e);

  /* We have two or fewer pairs and we are deleting one pair.
     Make indirect pointer direct and free indirect memory. */
  if (len <= 2)
    {
      hash_pair_t *r = pi->pairs;

      if (len == 2)
	{
	  clib_memcpy (p, q == r ? hash_forward1 (h, r) : r,
		       hash_pair_bytes (h));
	  set_is_user (v, i, 1);
	}
      else
	zero_pair (h, &p->direct);

      if (is_vec)
	vec_free (r);
      else if (r)
	clib_mem_free (r);
    }
  else
    {
      /* If deleting a pair we need to keep non-null pairs together. */
      if (q < e)
	clib_memcpy (q, e, hash_pair_bytes (h));
      else
	zero_pair (h, q);
      if (is_vec)
	_vec_len (pi->pairs) -= 1;
      else
	indirect_pair_set (pi, indirect_pair_get_log2_bytes (pi), len - 1);
    }
}

enum lookup_opcode
{
  GET = 1,
  SET = 2,
  UNSET = 3,
};

static hash_pair_t *
lookup (void *v, uword key, enum lookup_opcode op,
	void *new_value, void *old_value)
{
  hash_t *h = hash_header (v);
  hash_pair_union_t *p = 0;
  uword found_key = 0;
  uword i;

  if (!v)
    return 0;

  i = key_sum (h, key) & (_vec_len (v) - 1);
  p = get_pair (v, i);

  if (hash_is_user (v, i))
    {
      found_key = key_equal (h, p->direct.key, key);
      if (found_key)
	{
	  if (op == UNSET)
	    {
	      set_is_user (v, i, 0);
	      if (old_value)
		clib_memcpy (old_value, p->direct.value,
			     hash_value_bytes (h));
	      zero_pair (h, &p->direct);
	    }
	}
      else
	{
	  if (op == SET)
	    p = set_indirect_is_user (v, i, p, key);
	  else
	    p = 0;
	}
    }
  else
    {
      hash_pair_indirect_t *pi = &p->indirect;

      if (op == SET)
	{
	  if (!pi->pairs)
	    {
	      p->direct.key = key;
	      set_is_user (v, i, 1);
	    }
	  else
	    p = set_indirect (v, pi, key, &found_key);
	}
      else
	{
	  p = get_indirect (v, pi, key);
	  found_key = p != 0;
	  if (found_key && op == UNSET)
	    {
	      if (old_value)
		clib_memcpy (old_value, &p->direct.value,
			     hash_value_bytes (h));

	      unset_indirect (v, i, &p->direct);

	      /* Nullify p (since it's just been deleted).
	         Otherwise we might be tempted to play with it. */
	      p = 0;
	    }
	}
    }

  if (op == SET && p != 0)
    {
      /* Save away old value for caller. */
      if (old_value && found_key)
	clib_memcpy (old_value, &p->direct.value, hash_value_bytes (h));
      clib_memcpy (&p->direct.value, new_value, hash_value_bytes (h));
    }

  if (op == SET)
    h->elts += !found_key;
  if (op == UNSET)
    h->elts -= found_key;

  return &p->direct;
}

/* Fetch value of key. */
uword *
_hash_get (void *v, uword key)
{
  hash_t *h = hash_header (v);
  hash_pair_t *p;

  /* Don't even search table if it's empty. */
  if (!v || h->elts == 0)
    return 0;

  p = lookup (v, key, GET, 0, 0);
  if (!p)
    return 0;
  if (h->log2_pair_size == 0)
    return &p->key;
  else
    return &p->value[0];
}

hash_pair_t *
_hash_get_pair (void *v, uword key)
{
  return lookup (v, key, GET, 0, 0);
}

hash_pair_t *
hash_next (void *v, hash_next_t * hn)
{
  hash_t *h = hash_header (v);
  hash_pair_t *p;

  while (1)
    {
      if (hn->i == 0 && hn->j == 0)
	{
	  /* Save flags. */
	  hn->f = h->flags;

	  /* Prevent others from re-sizing hash table. */
	  h->flags |=
	    (HASH_FLAG_NO_AUTO_GROW
	     | HASH_FLAG_NO_AUTO_SHRINK | HASH_FLAG_HASH_NEXT_IN_PROGRESS);
	}
      else if (hn->i >= hash_capacity (v))
	{
	  /* Restore flags. */
	  h->flags = hn->f;
	  memset (hn, 0, sizeof (hn[0]));
	  return 0;
	}

      p = hash_forward (h, v, hn->i);
      if (hash_is_user (v, hn->i))
	{
	  hn->i++;
	  return p;
	}
      else
	{
	  hash_pair_indirect_t *pi = (void *) p;
	  uword n;

	  if (h->log2_pair_size > 0)
	    n = indirect_pair_get_len (pi);
	  else
	    n = vec_len (pi->pairs);

	  if (hn->j >= n)
	    {
	      hn->i++;
	      hn->j = 0;
	    }
	  else
	    return hash_forward (h, pi->pairs, hn->j++);
	}
    }
}
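
/*
 * Illustrative sketch of driving the resumable iterator directly; the flag
 * handling above disables auto-resize for the duration of the walk:
 *
 *   hash_next_t hn = { 0 };
 *   hash_pair_t *p;
 *
 *   while ((p = hash_next (v, &hn)))
 *     {
 *       // p->key (and, if the table stores values, p->value[0]) is valid here
 *     }
 */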

/* Remove key from table. */
void *
_hash_unset (void *v, uword key, void *old_value)
{
  hash_t *h;

  if (!v)
    return v;

  (void) lookup (v, key, UNSET, 0, old_value);

  h = hash_header (v);
  if (!(h->flags & HASH_FLAG_NO_AUTO_SHRINK))
    {
      /* Resize when 1/4 full. */
      if (h->elts > 32 && 4 * (h->elts + 1) < vec_len (v))
	v = hash_resize (v, vec_len (v) / 2);
    }

  return v;
}
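
/*
 * Shrink-threshold arithmetic, for illustration: with vec_len (v) == 256
 * buckets and h->elts == 33 remaining elements, 4 * (33 + 1) == 136 < 256,
 * so the table is resized down to 128 buckets; with 64 elements
 * (4 * 65 == 260 >= 256) it is left alone.
 */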

void *
_hash_create (uword elts, hash_t * h_user)
{
  hash_t *h;
  uword log2_pair_size;
  void *v;

  /* Size of hash is power of 2 >= ELTS and at least as large as
     the number of bits in an is_user bitmap element. */
  elts = clib_max (elts, BITS (h->is_user[0]));
  elts = 1ULL << max_log2 (elts);

  log2_pair_size = 1;
  if (h_user)
    log2_pair_size = h_user->log2_pair_size;

  v = _vec_resize ((void *) 0,
		   /* vec len: */ elts,
		   /* data bytes: */
		   (elts << log2_pair_size) * sizeof (hash_pair_t),
		   /* header bytes: */
		   sizeof (h[0]) +
		   (elts / BITS (h->is_user[0])) * sizeof (h->is_user[0]),
		   /* alignment */ sizeof (hash_pair_t));
  h = hash_header (v);

  if (h_user)
    h[0] = h_user[0];

  h->log2_pair_size = log2_pair_size;
  h->elts = 0;

  /* Default flags to never shrinking hash tables.
     Shrinking tables can cause "jackpot" cases. */
  if (!h_user)
    h->flags = HASH_FLAG_NO_AUTO_SHRINK;

  if (!h->format_pair)
    {
      h->format_pair = hash_format_pair_default;
      h->format_pair_arg = 0;
    }

  return v;
}

void *
_hash_free (void *v)
{
  hash_t *h = hash_header (v);
  hash_pair_union_t *p;
  uword i;

  if (!v)
    return v;

  /* We zero all freed memory in case user would be tempted to use it. */
  for (i = 0; i < hash_capacity (v); i++)
    {
      if (hash_is_user (v, i))
	continue;
      p = get_pair (v, i);
      if (h->log2_pair_size == 0)
	vec_free (p->indirect.pairs);
      else if (p->indirect.pairs)
	clib_mem_free (p->indirect.pairs);
    }

  vec_free_header (h);

  return 0;
}

static void *
hash_resize_internal (void *old, uword new_size, uword free_old)
{
  void *new;
  hash_pair_t *p;

  new = 0;
  if (new_size > 0)
    {
      hash_t *h = old ? hash_header (old) : 0;
      new = _hash_create (new_size, h);
      /* *INDENT-OFF* */
      hash_foreach_pair (p, old, {
	new = _hash_set3 (new, p->key, &p->value[0], 0);
      });
      /* *INDENT-ON* */
    }

  if (free_old)
    hash_free (old);
  return new;
}

void *
hash_resize (void *old, uword new_size)
{
  return hash_resize_internal (old, new_size, 1);
}

void *
hash_dup (void *old)
{
  return hash_resize_internal (old, vec_len (old), 0);
}

void *
_hash_set3 (void *v, uword key, void *value, void *old_value)
{
  hash_t *h;

  if (!v)
    v = hash_create (0, sizeof (uword));

  h = hash_header (v);
  (void) lookup (v, key, SET, value, old_value);

  if (!(h->flags & HASH_FLAG_NO_AUTO_GROW))
    {
      /* Resize when 3/4 full. */
      if (4 * (h->elts + 1) > 3 * vec_len (v))
	v = hash_resize (v, 2 * vec_len (v));
    }

  return v;
}
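
/*
 * A minimal usage sketch of the word-keyed API, assuming the hash_set /
 * hash_get / hash_unset / hash_free wrapper macros from <vppinfra/hash.h>,
 * which expand to the _hash_* functions in this file:
 *
 *   uword *h = hash_create (0, sizeof (uword));  // uword keys and values
 *
 *   hash_set (h, 42, 100);
 *   uword *p = hash_get (h, 42);  // p != NULL, p[0] == 100
 *   hash_unset (h, 42);
 *   hash_free (h);
 *
 * The table grows automatically once it becomes 3/4 full, e.g. growing from
 * 64 to 128 buckets when the 48th element is added (4 * 49 > 3 * 64).
 */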

uword
vec_key_sum (hash_t * h, uword key)
{
  void *v = uword_to_pointer (key, void *);
  return hash_memory (v, vec_len (v) * h->user, 0);
}

uword
vec_key_equal (hash_t * h, uword key1, uword key2)
{
  void *v1 = uword_to_pointer (key1, void *);
  void *v2 = uword_to_pointer (key2, void *);
  uword l1 = vec_len (v1);
  uword l2 = vec_len (v2);
  return l1 == l2 && 0 == memcmp (v1, v2, l1 * h->user);
}

u8 *
vec_key_format_pair (u8 * s, va_list * args)
{
  void *CLIB_UNUSED (user_arg) = va_arg (*args, void *);
  void *v = va_arg (*args, void *);
  hash_pair_t *p = va_arg (*args, hash_pair_t *);
  hash_t *h = hash_header (v);
  void *u = uword_to_pointer (p->key, void *);
  int i;

  switch (h->user)
    {
    case 1:
      s = format (s, "%v", u);
      break;

    case 2:
      {
	u16 *w = u;
	for (i = 0; i < vec_len (w); i++)
	  s = format (s, "0x%x, ", w[i]);
	break;
      }

    case 4:
      {
	u32 *w = u;
	for (i = 0; i < vec_len (w); i++)
	  s = format (s, "0x%x, ", w[i]);
	break;
      }

    case 8:
      {
	u64 *w = u;
	for (i = 0; i < vec_len (w); i++)
	  s = format (s, "0x%Lx, ", w[i]);
	break;
      }

    default:
      s = format (s, "0x%U", format_hex_bytes, u, vec_len (u) * h->user);
      break;
    }

  if (hash_value_bytes (h) > 0)
    s = format (s, " -> 0x%wx", p->value[0]);

  return s;
}

uword
mem_key_sum (hash_t * h, uword key)
{
  uword *v = uword_to_pointer (key, void *);
  return hash_memory (v, h->user, 0);
}

uword
mem_key_equal (hash_t * h, uword key1, uword key2)
{
  void *v1 = uword_to_pointer (key1, void *);
  void *v2 = uword_to_pointer (key2, void *);
  return v1 && v2 && 0 == memcmp (v1, v2, h->user);
}

uword
string_key_sum (hash_t * h, uword key)
{
  char *v = uword_to_pointer (key, char *);
  return hash_memory (v, strlen (v), 0);
}

uword
string_key_equal (hash_t * h, uword key1, uword key2)
{
  void *v1 = uword_to_pointer (key1, void *);
  void *v2 = uword_to_pointer (key2, void *);
  return v1 && v2 && 0 == strcmp (v1, v2);
}
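
/*
 * String keys, for illustration: assuming the hash_create_string and
 * hash_set_mem wrappers from <vppinfra/hash.h> (hash_get_mem is also used by
 * unformat_hash_string_internal below). Only the key pointer is stored, so
 * the key string must outlive the table entry:
 *
 *   uword *h = hash_create_string (0, sizeof (uword));
 *
 *   hash_set_mem (h, "red", 1);
 *   uword *p = hash_get_mem (h, "red");  // p != NULL, p[0] == 1
 */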

u8 *
string_key_format_pair (u8 * s, va_list * args)
{
  void *CLIB_UNUSED (user_arg) = va_arg (*args, void *);
  void *v = va_arg (*args, void *);
  hash_pair_t *p = va_arg (*args, hash_pair_t *);
  hash_t *h = hash_header (v);
  void *u = uword_to_pointer (p->key, void *);

  s = format (s, "%s", u);

  if (hash_value_bytes (h) > 0)
    s =
      format (s, " -> 0x%8U", format_hex_bytes, &p->value[0],
	      hash_value_bytes (h));

  return s;
}

static u8 *
hash_format_pair_default (u8 * s, va_list * args)
{
  void *CLIB_UNUSED (user_arg) = va_arg (*args, void *);
  void *v = va_arg (*args, void *);
  hash_pair_t *p = va_arg (*args, hash_pair_t *);
  hash_t *h = hash_header (v);

  s = format (s, "0x%08x", p->key);
  if (hash_value_bytes (h) > 0)
    s =
      format (s, " -> 0x%8U", format_hex_bytes, &p->value[0],
	      hash_value_bytes (h));
  return s;
}

uword
hash_bytes (void *v)
{
  uword i, bytes;
  hash_t *h = hash_header (v);

  if (!v)
    return 0;

  bytes = vec_capacity (v, hash_header_bytes (v));

  for (i = 0; i < hash_capacity (v); i++)
    {
      if (!hash_is_user (v, i))
	{
	  hash_pair_union_t *p = get_pair (v, i);
	  if (h->log2_pair_size > 0)
	    bytes += 1 << indirect_pair_get_log2_bytes (&p->indirect);
	  else
	    bytes += vec_capacity (p->indirect.pairs, 0);
	}
    }
  return bytes;
}

u8 *
format_hash (u8 * s, va_list * va)
{
  void *v = va_arg (*va, void *);
  int verbose = va_arg (*va, int);
  hash_pair_t *p;
  hash_t *h = hash_header (v);
  uword i;

  s = format (s, "hash %p, %wd elts, capacity %wd, %wd bytes used,\n",
	      v, hash_elts (v), hash_capacity (v), hash_bytes (v));

  {
    uword *occupancy = 0;

    /* Count number of buckets with each occupancy. */
    for (i = 0; i < hash_capacity (v); i++)
      {
	uword j;

	if (hash_is_user (v, i))
	  {
	    j = 1;
	  }
	else
	  {
	    hash_pair_union_t *p = get_pair (v, i);
	    if (h->log2_pair_size > 0)
	      j = indirect_pair_get_len (&p->indirect);
	    else
	      j = vec_len (p->indirect.pairs);
	  }

	vec_validate (occupancy, j);
	occupancy[j]++;
      }

    s = format (s, "  profile ");
    for (i = 0; i < vec_len (occupancy); i++)
      s = format (s, "%wd%c", occupancy[i],
		  i + 1 == vec_len (occupancy) ? '\n' : ' ');

    s = format (s, "  lookup # of compares: ");
    for (i = 1; i < vec_len (occupancy); i++)
      s = format (s, "%wd: .%03d%c", i,
		  (1000 * i * occupancy[i]) / hash_elts (v),
		  i + 1 == vec_len (occupancy) ? '\n' : ' ');

    vec_free (occupancy);
  }

  if (verbose)
    {
      /* *INDENT-OFF* */
      hash_foreach_pair (p, v, {
	s = format (s, "  %U\n", h->format_pair, h->format_pair_arg, v, p);
      });
      /* *INDENT-ON* */
    }

  return s;
}
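
/*
 * Typical use with the format library, for illustration:
 *
 *   u8 *s = format (0, "%U", format_hash, h, 1);  // 1 => verbose
 *   vec_free (s);
 */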

static uword
unformat_hash_string_internal (unformat_input_t * input,
			       va_list * va, int is_vec)
{
  uword *hash = va_arg (*va, uword *);
  int *result = va_arg (*va, int *);
  u8 *string = 0;
  uword *p;

  if (!unformat (input, is_vec ? "%v%_" : "%s%_", &string))
    return 0;

  p = hash_get_mem (hash, string);
  if (p)
    *result = *p;

  vec_free (string);
  return p ? 1 : 0;
}

uword
unformat_hash_vec_string (unformat_input_t * input, va_list * va)
{
  return unformat_hash_string_internal (input, va, /* is_vec */ 1);
}

uword
unformat_hash_string (unformat_input_t * input, va_list * va)
{
  return unformat_hash_string_internal (input, va, /* is_vec */ 0);
}

clib_error_t *
hash_validate (void *v)
{
  hash_t *h = hash_header (v);
  uword i, j;
  uword *keys = 0;
  clib_error_t *error = 0;

#define CHECK(x) if ((error = ERROR_ASSERT (x))) goto done;

  for (i = 0; i < hash_capacity (v); i++)
    {
      hash_pair_union_t *pu = get_pair (v, i);

      if (hash_is_user (v, i))
	{
	  CHECK (pu->direct.key != 0);
	  vec_add1 (keys, pu->direct.key);
	}
      else
	{
	  hash_pair_t *p;
	  hash_pair_indirect_t *pi = &pu->indirect;
	  uword n;

	  n = h->log2_pair_size > 0
	    ? indirect_pair_get_len (pi) : vec_len (pi->pairs);

	  for (p = pi->pairs; n-- > 0; p = hash_forward1 (h, p))
	    {
	      /* Assert key uniqueness. */
	      for (j = 0; j < vec_len (keys); j++)
		CHECK (keys[j] != p->key);
	      vec_add1 (keys, p->key);
	    }
	}
    }

  CHECK (vec_len (keys) == h->elts);

  vec_free (keys);
done:
  return error;
}

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */