/* * Copyright (c) 2012 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * Author: Pierre Pfister <ppfister@cisco.com> * * DISCLAIMER ! * * This most likely is not the hash table you are looking for !! * * This structure targets a very specific and quite narrow set of use-cases * that are not covered by other hash tables. * * Read the following text carefully, or ask the author or one of VPP's * committers to make sure this is what you are looking for. * * * -- Abstract: * This hash table intends to provide a very fast lookup and insertion of * key-value pairs for flow tables (although it might be used for other * purposes), with additional support for lazy-timeouts. * In particular, it was designed to minimize blocking reads, register usage and * cache-lines accesses during a typical lookup. * This hash table therefore provides stateful packet processing * without performance degradation even when every single lookup has to fetch * memory from RAM. * This hash table is not thread-safe and requires executing a garbage * collection function to clean-up chained buckets. * * -- Overview: * * One first aspect of this hash table is that it is self-contained in a single * bulk of memory. Each entry contains a key, a value, and a 32 bits timeout * value; occupies a full and single cache line; and is identified by a unique * 32 bits index. The entry index zero is reserved and used when an entry * could not be found nor inserted. Which means it is not necessary to * immediately check whether an insertion or lookup was successful before * behaving accordingly. One can just keep doing business as usual and * check for the error later. * * Each entry is associated with a timeout value (which unit or clock is up to * the user of the hash table). An entry which timeout is strictly smaller * than the current time is considered empty, whereas an entry which timeout is * greater or equal to the current time contains a valid key-value pair. * * Hash table lookup and insertion are equivalent: * - An entry index is always returned (possibly index 0 if no entry could be * found nor created). * - The returned entry always has its key set to the provided key. * - Timeout value will be greater than the provided current time whenever a * valid entry was found, strictly smaller otherwise. In particular, one can * not differentiate between an entry which was just created, and an entry * which already existed in the past but got timeouted in between. * * As mentioned earlier, entry index zero is used as an invalid entry which may * be manipulated as a normal one. Entries which index go from 1 to * N (where N is a power of 2) are used as direct buckets, each containing a * single entry. In the absence of hash collision, a single entry which location * can deterministically be determined from the key-hash and the hash table * header is accessed (One single cache line, without indirection). This * allows for efficient pre-fetching of the key-value for more than 95% of * accesses. * * In order to handle hash collisions (i.e. when multiple keys * end-up in the same bucket), entries which index are greater than N are * grouped into M groups of 16 collision entries. Such groups are linked * with regular entries whenever a collision needs to be handled. * When looking up a key with a bucket where a collision occurred, unused bits * from the key hash are used to select two entries (from the collision bucket) * where the new entry might be inserted. * * Once an entry is inserted, it will never be moved as long as the entry * timeout value remains greater or equal to the provided current time value. * The entry index can therefore be stored in other data structure as a way * to bypass the hash lookup. But when doing so, one should check if the * present key is the actual looked-up key. * * -- Garbage Collection: * * Since there is no explicit element removal, a garbage collector mechanism * is required in order to remove buckets used for hash collisions. This * is done by calling the flowhash_gc function on a regular basis. Each call * to this function examines a single fixed entry. It shall therefore be called * as many times as there are fixed entries in the hash table in order to * ensure a full inspection. * * -- Time and timeout mechanism: * * The hash table makes use of a time value between in [1, 2^32 - 1]. * The provided time value shall keep increasing, and looping is not handled. * When seconds are used, the system should run for 136 years without any issue. * If milliseconds are used, a shift should be operated on all timeout values * on a regular basis (more than every 49 days). */ #ifndef __included_flowhash_template_h__ #define __included_flowhash_template_h__ #include <vppinfra/clib.h> #include <vppinfra/mem.h> #include <vppinfra/cache.h> #ifndef FLOWHASH_TYPE #error FLOWHASH_TYPE not defined #endif #define _fv(a,b) a##b #define __fv(a,b) _fv(a,b) #define FV(a) __fv(a,FLOWHASH_TYPE) #define _fvt(a,b) a##b##_t #define __fvt(a,b) _fvt(a,b) #define FVT(a) __fvt(a,FLOWHASH_TYPE) /* Same for all flowhash variants */ #ifndef __included_flowhash_common__ #define FLOWHASH_INVALID_ENTRY_INDEX 0 #define FLOWHASH_ENTRIES_PER_BUCKETS_LOG 4 #define FLOWHASH_ENTRIES_PER_BUCKETS (1 << FLOWHASH_ENTRIES_PER_BUCKETS_LOG) #endif /* ifndef __included_flowhash_common__ */ /** * @brief Compare a stored key with a lookup key. * * This function must be defined to use this template. It must return 0 * when the two keys are identical, and a different value otherwise. */ static_always_inline u8 FV(flowhash_cmp_key)(FVT(flowhash_skey) *a, FVT(flowhash_lkey) *b); /** * @brief Hash a lookup key into a 32 bit integer. * * This function must be defined to use this template. * It must provides close to 32 bits of entropy distributed amongst * all 32 bits of the provided value. * Keys that are equal must have the same hash. */ static_always_inline u32 FV(flowhash_hash)(FVT(flowhash_lkey) *k); /** * @brief Copy a lookup key into a destination stored key. * * This function must be defined to use this template. It must modify the dst * key such that a later call to flowhash_cmp_key with the same arguments * would return 0. */ static_always_inline void FV(flowhash_cpy_key)(FVT(flowhash_skey) *dst, FVT(flowhash_lkey) *src); /** * @brief One flow hash entry used for both direct buckets and collision * buckets. */ typedef struct { /* Each entry is cache-line aligned. */ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); /* Key is first to take advantage of alignment. */ FVT(flowhash_skey) key; /* Entry value. */ FVT(flowhash_value) value; /* Timeout value */ u32 timeout; /* Entry index to the chained bucket. */ u32 chained_entry_index; } FVT(flowhash_entry); typedef struct FVT(__flowhash_struct) { /* Cache aligned to simplify allocation. */ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); /* Array going downward containing free bucket indices */ u32 free_buckets_indices[0]; /* Negative index of the first free bucket */ i32 free_buckets_position; /* Number of fixed buckets minus one */ u32 fixed_entries_mask; /* Allocated pointer for this hash table */ void *mem; u32 collision_buckets_mask; u32 total_entries; u64 not_enough_buckets_counter; u64 collision_lookup_counter; u64 garbage_collection_counter; u32 gc_counter; /* Entry array containing: * - 1 Dummy entry for error return * - (buckets_mask + 1) Fixed buckets * - chained_buckets Chained Buckets */ FVT(flowhash_entry) entries[0]; } FVT(flowhash); /* Same for all flowhash variants */ #ifndef __included_flowhash_common__ #define __included_flowhash_common__ /** * @brief Test whether a returned entry index corresponds to an overflow event. */ #define flowhash_is_overflow(ei) \ ((ei) == FLOWHASH_INVALID_ENTRY_INDEX) /** * @brief Iterate over all entries in the hash table. * * Iterate over all entries in the hash table, not including the first invalid * entry (at index 0), but including all chained hash collision buckets. * */ #define flowhash_foreach_entry(h, ei) \ for (ei = 1; \ ei < (h)->total_entries; \ ei++) /** * @brief Iterate over all currently valid entries. * * Iterate over all entries in the hash table which timeout value is greater * or equal to the current time. */ #define flowhash_foreach_valid_entry(h, ei, now) \ flowhash_foreach_entry(h, ei) \ if (((now) <= (h)->entries[ei].timeout)) /** * @brief Timeout variable from a given entry. */ #define flowhash_timeout(h, ei) (h)->entries[ei].timeout /** * @brief Indicates whether the entry is being used. */ #define flowhash_is_timeouted(h, ei, time_now) \ ((time_now) > flowhash_timeout(h, ei)) /** * @brief Get the key from the entry index, casted to the provided type. */ #define flowhash_key(h, ei) (&(h)->entries[ei].key) /** * @brief Get the value from the entry index, casted to the provided type. */ #define flowhash_value(h, ei) (&(h)->entries[ei].value) /** * @brief Get the number of octets allocated to this structure. */ #define flowhash_memory_size(h) clib_mem_size((h)->mem) /** * @brief Test whether the entry index is in hash table boundaries. */ #define flowhash_is_valid_entry_index(h, ei) (ei < (h)->total_entries) /** * @brief Adjust, if necessary, provided parameters such as being valid flowhash * sizes. */ static void flowhash_validate_sizes(u32 *fixed_entries, u32 *collision_buckets) { /* Find power of two greater or equal to the provided value */ if (*fixed_entries < FLOWHASH_ENTRIES_PER_BUCKETS) *fixed_entries = FLOWHASH_ENTRIES_PER_BUCKETS; if (*fixed_entries > (1 << (32 - FLOWHASH_ENTRIES_PER_BUCKETS_LOG))) *fixed_entries = (1 << (32 - FLOWHASH_ENTRIES_PER_BUCKETS_LOG)); *fixed_entries -= 1; *fixed_entries |= *fixed_entries >> 16; *fixed_entries |= *fixed_entries >> 8; *fixed_entries |= *fixed_entries >> 4; *fixed_entries |= *fixed_entries >> 2; *fixed_entries |= *fixed_entries >> 1; *fixed_entries += 1; if (*collision_buckets != 0) { if (*collision_buckets < CLIB_CACHE_LINE_BYTES/sizeof(u32)) *collision_buckets = CLIB_CACHE_LINE_BYTES/sizeof(u32); *collision_buckets -= 1; *collision_buckets |= *collision_buckets >> 16; *collision_buckets |= *collision_buckets >> 8; *collision_buckets |= *collision_buckets >> 4; *collision_buckets |= *collision_buckets >> 2; *collision_buckets |= *collision_buckets >> 1; *collision_buckets += 1; } } /** * @brief Prefetch the the hash entry bucket. * * This should be performed approximately 200-300 cycles before lookup * if the table is located in RAM. Or 30-40 cycles before lookup * in case the table is located in L3. */ #define flowhash_prefetch(h, hash) \ CLIB_PREFETCH (&(h)->entries[((hash) & (h)->fixed_entries_mask) + 1], \ sizeof((h)->entries[0]), LOAD) #endif /* ifndef __included_flowhash_common__ */ /** * @brief Allocate a flowhash structure. * * @param[in] fixed_entries The number of fixed entries in the hash table. * @param[in] chained_buckets The number of chained buckets. * * fixed_entries and chained_buckets parameters may not be used as is but * modified in order to fit requirements. * * Since the flowhash does not support dynamic resizing, it is fairly * important to choose the parameters carefully. In particular the performance * gain from using this structure comes from an efficient lookup in the * absence of hash collision. * As a rule of thumbs, if the number of active entries (flows) is M, * there should be about 16*M fixed entries, and M/16 collision buckets. * Which represents 17*M allocated entries. * * For example: * M = 2^20 total_size ~= 1GiB collision ~= 3% * M = 2^18 total_size ~= 250MiB collision ~= 3% * M = 2^10 total_size ~= 1MiB collision ~= 6% * */ static_always_inline FVT(flowhash) *FV(flowhash_alloc)(u32 fixed_entries, u32 collision_buckets) { FVT(flowhash) *h; u32 size; void *mem; u32 entries; flowhash_validate_sizes(&fixed_entries, &collision_buckets); entries = 1 + fixed_entries + collision_buckets * FLOWHASH_ENTRIES_PER_BUCKETS; size = sizeof(*h) + sizeof(h->entries[0]) * entries + sizeof(h->free_buckets_indices[0]) * collision_buckets; mem = clib_mem_alloc_aligned(size, CLIB_CACHE_LINE_BYTES); h = mem + collision_buckets * sizeof(h->free_buckets_indices[0]); h->mem = mem; /* Fill free elements list */ int i; for (i = 1; i <= collision_buckets; i++) { h->free_buckets_indices[-i] = entries - i * FLOWHASH_ENTRIES_PER_BUCKETS; } /* Init buckets */ for (i=0; i < entries; i++) { h->entries[i].chained_entry_index = FLOWHASH_INVALID_ENTRY_INDEX; h->entries[i].timeout = 0; } h->free_buckets_position = -collision_buckets; h->fixed_entries_mask = fixed_entries - 1; h->collision_buckets_mask = collision_buckets - 1; h->total_entries = entries; h->not_enough_buckets_counter = 0; h->collision_lookup_counter = 0; h->garbage_collection_counter = 0; h->gc_counter = 0; return h; } /** * @brief Free the flow hash memory. */ static_always_inline void FV(flowhash_free)(FVT(flowhash) *h) { clib_mem_free(h->mem); } static void FV(__flowhash_get_chained) (FVT(flowhash) *h, FVT(flowhash_lkey) *k, u32 hash, u32 time_now, u32 *ei); /** * @brief Retrieves an entry index corresponding to a provided key and its hash. * * @param h The hash table pointer. * @param k[in] A pointer to the key value. * @param hash[in] The hash of the key. * @param time_now[in] The current time. * @param ei[out] A pointer set to the found entry index. * * This function always sets ei value to a valid entry index which can then be * used to access the stored value as well as get or set its associated timeout. * The key stored in the returned entry is always set to the provided key. * * In case the provided key is not found, and no entry could be created * (either because there is no hash collision bucket available or * the candidate entries in the collision bucket were already used), ei is * set to the special value FLOWHASH_INVALID_ENTRY_INDEX (which can be tested * with the flowhash_is_overflow macro). * * The timeout value is never modified during a lookup. * - Use the flowhash_is_timeouted macro to test whether the returned entry * was already valid, or is proposed for insertion. * - Use the flowhash_timeout macro to get and set the entry timeout value. * */ static_always_inline void FV(flowhash_get) (FVT(flowhash) *h, FVT(flowhash_lkey) *k, u32 hash, u32 time_now, u32 *ei) { *ei = (hash & h->fixed_entries_mask) + 1; if (PREDICT_FALSE(FV(flowhash_cmp_key)(&h->entries[*ei].key, k) != 0)) { if (PREDICT_TRUE(time_now > h->entries[*ei].timeout && (h->entries[*ei].chained_entry_index == FLOWHASH_INVALID_ENTRY_INDEX))) { FV(flowhash_cpy_key)(&h->entries[*ei].key, k); } else { FV(__flowhash_get_chained)(h, k, hash, time_now, ei); } } } static_always_inline void FV(__flowhash_get_chained) (FVT(flowhash) *h, FVT(flowhash_lkey) *k, u32 hash, u32 time_now, u32 *ei) { h->collision_lookup_counter++; if (h->entries[*ei].chained_entry_index == FLOWHASH_INVALID_ENTRY_INDEX) { /* No chained entry yet. Let's chain one. */ if (h->free_buckets_position == 0) { /* Oops. No more buckets available. */ h->not_enough_buckets_counter++; *ei = FLOWHASH_INVALID_ENTRY_INDEX; h->entries[FLOWHASH_INVALID_ENTRY_INDEX].timeout = time_now - 1; FV(flowhash_cpy_key)( &h->entries[FLOWHASH_INVALID_ENTRY_INDEX].key, k); return; } /* Forward link */ h->entries[*ei].chained_entry_index = h->free_buckets_indices[h->free_buckets_position]; /* Backward link (for garbage collection) */ h->entries[h->free_buckets_indices[h->free_buckets_position]]. chained_entry_index = *ei; /* Move pointer */ h->free_buckets_position++; } /* Get the two indexes where to look at. */ u32 bi0 = h->entries[*ei].chained_entry_index + (hash >> (32 - FLOWHASH_ENTRIES_PER_BUCKETS_LOG)); u32 bi1 = bi0 + 1; bi1 = (bi0 & (FLOWHASH_ENTRIES_PER_BUCKETS - 1)) ? bi1 : bi1 - FLOWHASH_ENTRIES_PER_BUCKETS; /* It is possible that we wait while comparing bi0 key. * It's better to prefetch bi1 so we don't wait twice. */ CLIB_PREFETCH(&h->entries[bi1], sizeof (h->entries[0]), READ); if (FV(flowhash_cmp_key)(&h->entries[bi0].key, k) == 0) { *ei = bi0; return; } if (FV(flowhash_cmp_key)(&h->entries[bi1].key, k) == 0) { *ei = bi1; return; } if (h->entries[*ei].timeout >= time_now) { *ei = FLOWHASH_INVALID_ENTRY_INDEX; *ei = (time_now > h->entries[bi0].timeout) ? bi0 : *ei; *ei = (time_now > h->entries[bi1].timeout) ? bi1 : *ei; } FV(flowhash_cpy_key)(&h->entries[*ei].key, k); } static_always_inline void FV(flowhash_gc)(FVT(flowhash) *h, u32 time_now) { u32 ei; if (PREDICT_FALSE(h->collision_buckets_mask == (((u32)0) - 1))) return; /* prefetch two rounds in advance */ ei = 2 + h->fixed_entries_mask + ((h->gc_counter + 2) & h->collision_buckets_mask) * FLOWHASH_ENTRIES_PER_BUCKETS; CLIB_PREFETCH(&h->entries[ei], sizeof (h->entries[0]), READ); /* prefetch one round in advance */ ei = 2 + h->fixed_entries_mask + ((h->gc_counter + 1) & h->collision_buckets_mask) * FLOWHASH_ENTRIES_PER_BUCKETS; if (h->entries[ei].chained_entry_index != FLOWHASH_INVALID_ENTRY_INDEX) { CLIB_PREFETCH(&h->entries[ei], 4 * CLIB_CACHE_LINE_BYTES, READ); } /* do GC */ ei = 2 + h->fixed_entries_mask + ((h->gc_counter) & h->collision_buckets_mask) * FLOWHASH_ENTRIES_PER_BUCKETS; if (h->entries[ei].chained_entry_index != FLOWHASH_INVALID_ENTRY_INDEX) { u8 found = 0; int i; for (i=0; i<FLOWHASH_ENTRIES_PER_BUCKETS; i++) { if (time_now <= h->entries[ei + i].timeout) { found = 1; break; } } if (!found) { /* The bucket is not used. Let's free it. */ h->free_buckets_position--; /* Reset forward link */ h->entries[h->entries[ei].chained_entry_index].chained_entry_index = FLOWHASH_INVALID_ENTRY_INDEX; /* Reset back link */ h->entries[ei].chained_entry_index = FLOWHASH_INVALID_ENTRY_INDEX; /* Free element */ h->free_buckets_indices[h->free_buckets_position] = ei; /* Count the garbage collection event */ h->garbage_collection_counter++; } } h->gc_counter++; } static_always_inline u32 FV(flowhash_elts)(FVT(flowhash) *h, u32 time_now) { u32 tot = 0; u32 ei; flowhash_foreach_valid_entry(h, ei, time_now) tot++; return tot; } #endif /* __included_flowhash_template_h__ */