aboutsummaryrefslogtreecommitdiffstats
path: root/cicn-plugin/cicn/cicn_hashtb.h
blob: c7522dd1a2827dce765d2d6efa8742299c46ac85 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
/*
 * Copyright (c) 2017 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * cicn_hashtb.h: Fast-path, vpp-aware hashtable, the base for the PIT/CS and FIB
 * used in the cicn forwarder.
 *
 * - As is the case in other areas, we can't share headers between the vpp code
 *   and the cicn/cndn code: there are conflicting definitions.
 *
 */

#ifndef _CICN_HASHTB_H_
#define _CICN_HASHTB_H_ 1

#if !CICN_VPP_PLUGIN
#error "cicn-internal file included externally"
#endif

#include "cicn_std.h"
#include "cicn_params.h"

/* Handy abbreviations for success status, and for boolean values */
#ifndef TRUE
#define TRUE 1
#endif

#ifndef FALSE
#define FALSE 0
#endif

/*
 * Lookup is finding a hashtable record whose name matches the name
 * being looked up.  Most of the lookup work is based on the hash
 * value of the two names. Note that the intel cache line size is 64
 * bytes, and some platforms load in 2 cache lines together.
 * - first step is to match a record at the bucket/slot level
 *   (htab has an array of htbucket_t/htbc_elmt, where each bucket has
 *   7 slots to hold indices for entries.) Matching at this level implies
 *   - the hashes of the lookup name and the record map to the
 *     same bucket
 *   - the high 32 bits of the hashes (slot bce_hash_msb32s) match.
 *   Read cost (on the hash table size, i.e. ignoring reading the
 *   name being looked up):
 *   - First step normally requires 1 cache line load to pull in
 *     the 64-byte htbucket_t with the 7 element slot table holding the
 *     hash_msb32s.
 *   - In the event (hopefully rare for a hash table with
 *     appropriate number of buckets) that more than 7 elements
 *     hash to the same bucket, lookup may well need to look not
 *     only at the static htbc_elmt_t but at the chain of dynamically
 *     allocated htbc_elmt_t's linked to the static htbc_elmt_t, where
 *     each of these holds slot entries for additional elements.
 *   - Before reaching that point, it is initially required is to read in
 *     the hash table record fields (ht_bucket_buf, htnode buf, etc)
 *     holding pointers to the arrays, but these cache lines are common
 *     to all lookups so will likely already be in the cache.
 * - second step is to match at the record level (htnode/htkb level)
 *   once a slot-level match happens. Matching at this level implies
 *   the following match
 *   - the hash values (the full 64 bits vs. bucket+32 msb, above)
 *     With siphash, two names hashing to the same 64-bit value is
 *     quite rare.
 *   - the name which, on the hash table side, is stored as a list
 *     of htkb_t (key buffers). [In some cases, the full name is
 *     not compared, and a match is assumed based on hash value match.
 *   Read cost:
 *   - htnode_t, in one cache line, holds hash value and index for the
 *     htkb at the head of the key buffer list
 *   - each key buffer (htkb_t) is cache line aligned/sized, and holds
 *     60 bytes of the name and requires a cache line read.
 * Simplification is that a fib lookup requires 3 cache lines:
 * - bucket
 * - htnode
 * - single key buffer (for cases where a name comparision is done)
 *
 * Some hashtables (for which rare false positives are tolerable)
 * store hash values but no keys. (In ISM NDN forwarder, this was
 * used for dcm_dpf: data cache manager's dataplane filter, where
 * speed was critical and very rare false positives would be detected
 * in the full dcm check.)
 * - No key buffers are used (or even allocated at hash table creation).
 */

#define CICN_HASH_INVALID_IDX  ~0
/* for cicn_hashtb_next_node() iterator, this otherwise illegal context value
 * indicates first call of iteration.
 * Note: must not be 0, which is a legal context value.
 */
#define CICN_HASH_WALK_CTX_INITIAL (~((uint64_t)0))

/*
 * Key memory allocation scheme.
 *
 * The key is the bytestring that a hashtable entry is
 * storing, e.g. a fib prefix or packet name. The hash of the
 * name is used not just to pick the bucket, but also as a surrogate
 * for the actual key value.
 *
 * Client calls pass key/name as contiguous memory for lookup/add/delete
 * but hashable stores its copy of the key/name as a list of one or
 * more hash_key structs.
 * - key memory is managed as a list of keys (cache line
 *   sized/aligned buffers).
 *   - If (keysize < 128) then use key struct's full 128 bytes
 *   - If not, first key struct is head of a linked list of elements
 *     where the first bytes are used for the key and the last 4 bytes
 *     are the index of the next entry (or an end marker).
 * - key memory is generally the single largest use of memory
 *   in the hash table, especially for PIT, as names are bigger
 *   than node structs (which is also per name/entry).
 *
 */

#define CICN_HASH_KEY_BYTES   128
#define CICN_HASH_KEY_LIST_BYTES (CICN_HASH_KEY_BYTES - sizeof(uint32_t))
typedef struct
{
  union
  {
    struct
    {
      uint8_t key[CICN_HASH_KEY_BYTES];
    } ks;			/* Entire key in one block */
    struct
    {
      uint8_t key[CICN_HASH_KEY_LIST_BYTES];
      uint32_t idx_next;	/* Next keybuf idx */
    } kl;			/* Key in a list of blocks */
  };
} cicn_hash_key_t;

/* Ratio of extra key blocks to allocate, in case the embedded ones aren't
 * sufficient. This is the fraction of the number of entries allocated.
 */
#define CICN_HASHTB_KEY_RATIO 8

/*
 * hash node, used to store a hash table entry; indexed by an entry in a bucket.
 * the node contains an embedded key; long keys are stored as chains of keys.
 *
 * The memory block for a node includes space for client data,
 * additional memory located off the end of the htnode data structure.
 * Size of client-supplied data is fixed, so we can use vpp pools. The PIT
 * and FIB need to ensure that they fit within the available data area,
 * or change the size to accomodate their needs.
 *
 * NOTE: app_data_size currently applies to all apps, i.e. bigger FIB
 *       nodes means (leads to, requires) bigger PCS nodes
 */

/* Size this so that we can offer 64B aligned on 64-bits to the applications */
#define CICN_HASH_NODE_APP_DATA_SIZE 72	/* TODO -- big enough? */

typedef struct cicn_hash_node_s
{

  uint64_t hn_hash;		/* Complete hash value */

  /* Total size of the key (chained in several key structs if necessary) */
  uint16_t hn_keysize;

  /* 1 byte of flags for application use */
  uint8_t hn_flags;

  uint8_t _hn_reserved1;	/* TBD, to align what follows back to 32 */

  cicn_hash_key_t hn_key;	/* Key value embedded in the node, may
				 * chain to more key buffers if necessary
				 */
  /* TODO -- keep array of 'next keys' so we can prefetch better? */

  /* Followed by app-specific data (fib or pit or cs entry, e.g.) */
  uint8_t hn_data[CICN_HASH_NODE_APP_DATA_SIZE];

} cicn_hash_node_t;

#define CICN_HASH_NODE_FLAGS_DEFAULT   0x00


/*
 * cicn_hash_entry_t
 * Structure holding all or part of a hash value, a node index, and other
 * key pieces of info.
 *
 * - 128 bytes/bucket with 16 bytes/entry gives 8 entries,
 *    or 7 entries plus next bucket ptr if overflow
 */
typedef struct
{

  /* MSB of the hash value */
  uint64_t he_msb64;

  /* Index of node block */
  uint32_t he_node;

  /* Timeout value, units and scheme still TBD */
  uint16_t he_timeout;

  /* A few flags, including 'this points to a chain of buckets' */
  uint8_t he_flags;

  /* A byte for domain/application data (e.g. 'virtual fib entry' */
  uint8_t he_appval;

} cicn_hash_entry_t;

#define CICN_HASH_ENTRY_FLAGS_DEFAULT  0x00

/* This entry heads a chain of overflow buckets (we expect to see this
 * only in the last entry in a bucket.) In this case, the index is
 * to an overflow bucket rather than to a single node block.
 */
#define CICN_HASH_ENTRY_FLAG_OVERFLOW 0x01

/* This entry has been marked for deletion */
#define CICN_HASH_ENTRY_FLAG_DELETED  0x02

/* Use fast he_timeout units for expiration, slow if not */
#define CICN_HASH_ENTRY_FLAG_FAST_TIMEOUT 0x04

/*
 * hash bucket: Contains an array of entries.
 * Cache line sized/aligned, so no room for extra fields unless
 * bucket size is increased to 2 cache lines or the entry struct
 * shrinks.
 */

/*
 * Overflow bucket ratio as a fraction of the fixed/configured count;
 * a pool of hash buckets used if a row in the fixed table overflows.
 */
#define CICN_HASHTB_OVERFLOW_FRACTION 8

#define CICN_HASHTB_BUCKET_ENTRIES 8

typedef struct
{
  cicn_hash_entry_t hb_entries[CICN_HASHTB_BUCKET_ENTRIES];
} cicn_hash_bucket_t;

/* Overall target fill-factor for the hashtable */
#define CICN_HASHTB_FILL_FACTOR    4

#define CICN_HASHTB_MIN_ENTRIES  (1 << 4)	// includes dummy node 0 entry
#define CICN_HASHTB_MAX_ENTRIES  (1 << 24)

#define CICN_HASHTB_MIN_BUCKETS (1 << 10)

/*
 * htab_t
 *
 * Hash table main structure.
 *
 * Contains
 * - pointers to dynamically allocated arrays of cache-line
 *   sized/aligned structures (buckets, nodes, keys).
 * Put frequently accessed fields in the first cache line.
 */
typedef struct cicn_hashtb_s
{

  /* 8B - main array of hash buckets */
  cicn_hash_bucket_t *ht_buckets;

  /* 8B - just-in-case block of overflow buckets */
  cicn_hash_bucket_t *ht_overflow_buckets;

  /* 8B - block of nodes associated with entries in buckets */
  cicn_hash_node_t *ht_nodes;

  /*
   * 8B - just-in-case block of extra keys, used when a key is too
   *      large to fit in a node's embedded key area
   */
  cicn_hash_key_t *ht_extra_keys;

  /* Flags */
  uint32_t ht_flags;

  /* Count of buckets allocated in the main array */
  uint32_t ht_bucket_count;

  /* Count of overflow buckets allocated */
  uint32_t ht_overflow_bucket_count;
  uint32_t ht_overflow_buckets_used;

  /* Count of nodes allocated */
  uint32_t ht_node_count;
  uint32_t ht_nodes_used;

  /* Count of overflow key structs allocated */
  uint32_t ht_key_count;
  uint32_t ht_keys_used;

  /* TODO -- stats? */

} cicn_hashtb_t, *cicn_hashtb_h;

/* Offset to aligned start of additional data (PIT/CS, FIB)
 * embedded in each node.
 */
extern uint32_t ht_node_data_offset_aligned;

/* Flags for hashtable */

#define CICN_HASHTB_FLAGS_DEFAULT    0x00

/* Don't use the last/eighth entry in each bucket - only use it for overflow.
 * We use this for the FIB, currently, so that we can support in-place
 * FIB changes that would be difficult if there were hash entry copies
 * as part of overflow handling.
 */
#define CICN_HASHTB_FLAG_USE_SEVEN      0x01
#define CICN_HASHTB_FLAG_KEY_FMT_PFX    0x02
#define CICN_HASHTB_FLAG_KEY_FMT_NAME   0x04

/*
 * Max prefix name components we'll support in our incremental hashing;
 * currently used only for LPM in the FIB.
 */
#define CICN_HASHTB_MAX_NAME_COMPS CICN_PARAM_FIB_ENTRY_PFX_COMPS_MAX

/*
 * Info about an LPM hash computation on a prefix or name.
 */
typedef struct cicn_prefix_hashinf_s
{

  const uint8_t *pfx_ptr;
  uint16_t pfx_len;

  uint16_t pfx_count;		/* Number of prefix entries used */
  uint8_t pfx_overflow;		/* true if pfx has extra components (not hashed) */

  uint16_t pfx_lens[CICN_HASHTB_MAX_NAME_COMPS];
  uint64_t pfx_hashes[CICN_HASHTB_MAX_NAME_COMPS];

  uint64_t pfx_full_hash;

} cicn_prefix_hashinf_t;

/*
 * APIs and inlines
 */

/* Compute hash node index from node pointer */
static inline uint32_t
cicn_hashtb_node_idx_from_node (cicn_hashtb_h h, cicn_hash_node_t * p)
{
  return (p - h->ht_nodes);
}

/* Retrieve a hashtable node by node index */
static inline cicn_hash_node_t *
cicn_hashtb_node_from_idx (cicn_hashtb_h h, uint32_t idx)
{
  return (pool_elt_at_index (h->ht_nodes, idx));
}

/* Allocate a brand-new hashtable */
int cicn_hashtb_alloc (cicn_hashtb_h * ph, uint32_t max_elems,
		       size_t app_data_size);

/* Free a hashtable, including its embedded arrays */
int cicn_hashtb_free (cicn_hashtb_h * ph);

/* Hash a bytestring, currently using siphash64 (for UT) */
uint64_t cicn_hashtb_hash_bytestring (const uint8_t * key, uint32_t keylen);

/* Hash a name, currently using siphash64 */
uint64_t cicn_hashtb_hash_name (const uint8_t * key, uint32_t keylen);

/*
 * Hash a name, with incremental prefix hashing (for LPM, e.g.) If
 * 'limit' > 0, limit computation of incrementals.
 * if 'is_full_name', we expect that 'name' points to the beginning
 * of the entire name TLV; otherwise, 'name' just points to the first
 * name-comp sub-tlv, and we will _not_ compute the full-name hash.
 * Note that 'name' and 'namelen' are captured in the prefix-hash
 * context struct, to make the LPM/FIB apis a little cleaner.
 */
int cicn_hashtb_hash_prefixes (const uint8_t * name, uint16_t namelen,
			       int is_full_name, cicn_prefix_hashinf_t * pfx,
			       int limit);

/*
 * Prepare a hashtable node for insertion, supplying the key
 * and computed hash info. This sets up the node->key relationship, possibly
 * allocating overflow key buffers.
 */
int cicn_hashtb_init_node (cicn_hashtb_h h, cicn_hash_node_t * node,
			   uint64_t hashval,
			   const uint8_t * key, uint32_t keylen);

/*
 * Insert a node into the hashtable. We expect the caller has used the
 * init api to set the node key and hash info, and populated the extra
 * data area (if any) - or done the equivalent work itself.
 */
int cicn_hashtb_insert (cicn_hashtb_h h, cicn_hash_node_t * node);

/*
 * Basic api to lookup a specific hash+key tuple. This does the entire
 * lookup operation, retrieving node structs and comparing keys,
 * so it's not optimized for prefetching or high performance.
 *
 * Returns zero and mails back a node on success, errno otherwise.
 */
int cicn_hashtb_lookup_node (cicn_hashtb_h h, const uint8_t * key,
			     uint32_t keylen, uint64_t hashval,
			     cicn_hash_node_t ** nodep);

/*
 * Extended api to lookup a specific hash+key tuple. The implementation
 * allows the caller to locate nodes that are marked for deletion; this
 * is part of some hashtable applications, such as the FIB.
 *
 * This does the entire lookup operation, retrieving node structs
 * and comparing keys, so it's not optimized for prefetching or high performance.
 *
 * Returns zero and mails back a node on success, errno otherwise.
 */
int
cicn_hashtb_lookup_node_ex (cicn_hashtb_h h, const uint8_t * key,
			    uint32_t keylen, uint64_t hashval,
			    int include_deleted_p, cicn_hash_node_t ** nodep);

/*
 * Remove a node from a hashtable using the node itself. The internal
 * data structs are cleaned up, but the node struct itself is not: the caller
 * must free the node itself.
 */
int cicn_hashtb_remove_node (cicn_hashtb_h h, cicn_hash_node_t * node);

/*
 * Delete a node from a hashtable using the node itself, and delete/free
 * the node.  Caller's pointer is cleared on success.
 */
int cicn_hashtb_delete (cicn_hashtb_h h, cicn_hash_node_t ** pnode);

/*
 * Utility to init a new entry in a hashtable bucket/row. We use this
 * to add new a node+hash, and to clear out an entry during removal.
 */
void cicn_hashtb_init_entry (cicn_hash_entry_t * entry,
			     uint32_t nodeidx, uint64_t hashval);

/*
 * Return data area embedded in a hash node struct. We maintain an 'offset'
 * value in case the common node body struct doesn't leave the data area
 * aligned properly.
 */
static inline void *
cicn_hashtb_node_data (cicn_hash_node_t * node)
{
  return ((uint8_t *) (node) + ht_node_data_offset_aligned);
}

/*
 * Use some bits of the low half of the hash to locate a row/bucket in the table
 */
static inline uint32_t
cicn_hashtb_bucket_idx (cicn_hashtb_h h, uint64_t hashval)
{
  return ((uint32_t) (hashval & (h->ht_bucket_count - 1)));
}

/*
 * Return a hash node struct from the free list, or NULL.
 * Note that the returned struct is _not_ cleared/zeroed - init
 * is up to the caller.
 */
static inline cicn_hash_node_t *
cicn_hashtb_alloc_node (cicn_hashtb_h h)
{
  cicn_hash_node_t *p = NULL;

  if (h->ht_nodes_used < h->ht_node_count)
    {
      pool_get_aligned (h->ht_nodes, p, 8);
      h->ht_nodes_used++;
    }

  return (p);
}

/*
 * Release a hashtable node back to the free list when an entry is cleared
 */
void cicn_hashtb_free_node (cicn_hashtb_h h, cicn_hash_node_t * node);

/*
 * Walk a hashtable, iterating through the nodes, keeping context
 * in 'ctx' between calls.
 *
 * Set the context value to CICN_HASH_WALK_CTX_INITIAL to start an iteration.
 */
int cicn_hashtb_next_node (cicn_hashtb_h h, cicn_hash_node_t ** pnode,
			   uint64_t * ctx);

/*
 * Update the per-entry compression expiration value and type
 * for a hashtable node.
 */
int cicn_hashtb_entry_set_expiration (cicn_hashtb_h h,
				      cicn_hash_node_t * node,
				      uint16_t entry_timeout,
				      uint8_t entry_flags);

int cicn_hashtb_key_to_str (cicn_hashtb_h h, const cicn_hash_node_t * node,
			    char *buf, int bufsize, int must_fit);

#endif /* _CICN_HASHTB_H_ */