diff options
Diffstat (limited to 'lib/librte_hash')
-rw-r--r-- | lib/librte_hash/rte_cuckoo_hash.c | 1061 | ||||
-rw-r--r-- | lib/librte_hash/rte_cuckoo_hash.h | 34 | ||||
-rw-r--r-- | lib/librte_hash/rte_hash.h | 85 | ||||
-rw-r--r-- | lib/librte_hash/rte_hash_version.map | 7 |
4 files changed, 903 insertions, 284 deletions
diff --git a/lib/librte_hash/rte_cuckoo_hash.c b/lib/librte_hash/rte_cuckoo_hash.c index f7b86c8c..5ddcccd8 100644 --- a/lib/librte_hash/rte_cuckoo_hash.c +++ b/lib/librte_hash/rte_cuckoo_hash.c @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright(c) 2010-2016 Intel Corporation + * Copyright(c) 2018 Arm Limited */ #include <string.h> @@ -26,11 +27,14 @@ #include <rte_spinlock.h> #include <rte_ring.h> #include <rte_compat.h> -#include <rte_pause.h> #include "rte_hash.h" #include "rte_cuckoo_hash.h" +#define FOR_EACH_BUCKET(CURRENT_BKT, START_BUCKET) \ + for (CURRENT_BKT = START_BUCKET; \ + CURRENT_BKT != NULL; \ + CURRENT_BKT = CURRENT_BKT->next) TAILQ_HEAD(rte_hash_list, rte_tailq_entry); @@ -63,6 +67,14 @@ rte_hash_find_existing(const char *name) return h; } +static inline struct rte_hash_bucket * +rte_hash_get_last_bkt(struct rte_hash_bucket *lst_bkt) +{ + while (lst_bkt->next != NULL) + lst_bkt = lst_bkt->next; + return lst_bkt; +} + void rte_hash_set_cmp_func(struct rte_hash *h, rte_hash_cmp_eq_t func) { h->cmp_jump_table_idx = KEY_CUSTOM; @@ -78,6 +90,36 @@ rte_hash_cmp_eq(const void *key1, const void *key2, const struct rte_hash *h) return cmp_jump_table[h->cmp_jump_table_idx](key1, key2, h->key_len); } +/* + * We use higher 16 bits of hash as the signature value stored in table. + * We use the lower bits for the primary bucket + * location. Then we XOR primary bucket location and the signature + * to get the secondary bucket location. This is same as + * proposed in Bin Fan, et al's paper + * "MemC3: Compact and Concurrent MemCache with Dumber Caching and + * Smarter Hashing". The benefit to use + * XOR is that one could derive the alternative bucket location + * by only using the current bucket location and the signature. + */ +static inline uint16_t +get_short_sig(const hash_sig_t hash) +{ + return hash >> 16; +} + +static inline uint32_t +get_prim_bucket_index(const struct rte_hash *h, const hash_sig_t hash) +{ + return hash & h->bucket_bitmask; +} + +static inline uint32_t +get_alt_bucket_index(const struct rte_hash *h, + uint32_t cur_bkt_idx, uint16_t sig) +{ + return (cur_bkt_idx ^ sig) & h->bucket_bitmask; +} + struct rte_hash * rte_hash_create(const struct rte_hash_parameters *params) { @@ -85,14 +127,22 @@ rte_hash_create(const struct rte_hash_parameters *params) struct rte_tailq_entry *te = NULL; struct rte_hash_list *hash_list; struct rte_ring *r = NULL; + struct rte_ring *r_ext = NULL; char hash_name[RTE_HASH_NAMESIZE]; void *k = NULL; void *buckets = NULL; + void *buckets_ext = NULL; char ring_name[RTE_RING_NAMESIZE]; + char ext_ring_name[RTE_RING_NAMESIZE]; unsigned num_key_slots; unsigned i; - unsigned int hw_trans_mem_support = 0, multi_writer_support = 0; + unsigned int hw_trans_mem_support = 0, use_local_cache = 0; + unsigned int ext_table_support = 0; unsigned int readwrite_concur_support = 0; + unsigned int writer_takes_lock = 0; + unsigned int no_free_on_del = 0; + uint32_t *tbl_chng_cnt = NULL; + unsigned int readwrite_concur_lf_support = 0; rte_hash_function default_hash_func = (rte_hash_function)rte_jhash; @@ -112,20 +162,52 @@ rte_hash_create(const struct rte_hash_parameters *params) return NULL; } + /* Validate correct usage of extra options */ + if ((params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY) && + (params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF)) { + rte_errno = EINVAL; + RTE_LOG(ERR, HASH, "rte_hash_create: choose rw concurrency or " + "rw concurrency lock free\n"); + return NULL; + } + + if ((params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF) && + (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)) { + rte_errno = EINVAL; + RTE_LOG(ERR, HASH, "rte_hash_create: extendable bucket " + "feature not supported with rw concurrency " + "lock free\n"); + return NULL; + } + /* Check extra flags field to check extra options. */ if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT) hw_trans_mem_support = 1; - if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD) - multi_writer_support = 1; + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD) { + use_local_cache = 1; + writer_takes_lock = 1; + } if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY) { readwrite_concur_support = 1; - multi_writer_support = 1; + writer_takes_lock = 1; + } + + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE) + ext_table_support = 1; + + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL) + no_free_on_del = 1; + + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF) { + readwrite_concur_lf_support = 1; + /* Enable not freeing internal memory/index on delete */ + no_free_on_del = 1; } /* Store all keys and leave the first entry as a dummy entry for lookup_bulk */ - if (multi_writer_support) + if (use_local_cache) /* * Increase number of slots by total number of indices * that can be stored in the lcore caches @@ -145,6 +227,24 @@ rte_hash_create(const struct rte_hash_parameters *params) goto err; } + const uint32_t num_buckets = rte_align32pow2(params->entries) / + RTE_HASH_BUCKET_ENTRIES; + + /* Create ring for extendable buckets. */ + if (ext_table_support) { + snprintf(ext_ring_name, sizeof(ext_ring_name), "HT_EXT_%s", + params->name); + r_ext = rte_ring_create(ext_ring_name, + rte_align32pow2(num_buckets + 1), + params->socket_id, 0); + + if (r_ext == NULL) { + RTE_LOG(ERR, HASH, "ext buckets memory allocation " + "failed\n"); + goto err; + } + } + snprintf(hash_name, sizeof(hash_name), "HT_%s", params->name); rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); @@ -177,19 +277,37 @@ rte_hash_create(const struct rte_hash_parameters *params) goto err_unlock; } - const uint32_t num_buckets = rte_align32pow2(params->entries) - / RTE_HASH_BUCKET_ENTRIES; - buckets = rte_zmalloc_socket(NULL, num_buckets * sizeof(struct rte_hash_bucket), RTE_CACHE_LINE_SIZE, params->socket_id); if (buckets == NULL) { - RTE_LOG(ERR, HASH, "memory allocation failed\n"); + RTE_LOG(ERR, HASH, "buckets memory allocation failed\n"); goto err_unlock; } - const uint32_t key_entry_size = sizeof(struct rte_hash_key) + params->key_len; + /* Allocate same number of extendable buckets */ + if (ext_table_support) { + buckets_ext = rte_zmalloc_socket(NULL, + num_buckets * sizeof(struct rte_hash_bucket), + RTE_CACHE_LINE_SIZE, params->socket_id); + if (buckets_ext == NULL) { + RTE_LOG(ERR, HASH, "ext buckets memory allocation " + "failed\n"); + goto err_unlock; + } + /* Populate ext bkt ring. We reserve 0 similar to the + * key-data slot, just in case in future we want to + * use bucket index for the linked list and 0 means NULL + * for next bucket + */ + for (i = 1; i <= num_buckets; i++) + rte_ring_sp_enqueue(r_ext, (void *)((uintptr_t) i)); + } + + const uint32_t key_entry_size = + RTE_ALIGN(sizeof(struct rte_hash_key) + params->key_len, + KEY_ALIGNMENT); const uint64_t key_tbl_size = (uint64_t) key_entry_size * num_key_slots; k = rte_zmalloc_socket(NULL, key_tbl_size, @@ -200,6 +318,14 @@ rte_hash_create(const struct rte_hash_parameters *params) goto err_unlock; } + tbl_chng_cnt = rte_zmalloc_socket(NULL, sizeof(uint32_t), + RTE_CACHE_LINE_SIZE, params->socket_id); + + if (tbl_chng_cnt == NULL) { + RTE_LOG(ERR, HASH, "memory allocation failed\n"); + goto err_unlock; + } + /* * If x86 architecture is used, select appropriate compare function, * which may use x86 intrinsics, otherwise use memcmp @@ -239,7 +365,7 @@ rte_hash_create(const struct rte_hash_parameters *params) h->cmp_jump_table_idx = KEY_OTHER_BYTES; #endif - if (multi_writer_support) { + if (use_local_cache) { h->local_free_slots = rte_zmalloc_socket(NULL, sizeof(struct lcore_cache) * RTE_MAX_LCORE, RTE_CACHE_LINE_SIZE, params->socket_id); @@ -262,27 +388,34 @@ rte_hash_create(const struct rte_hash_parameters *params) h->num_buckets = num_buckets; h->bucket_bitmask = h->num_buckets - 1; h->buckets = buckets; + h->buckets_ext = buckets_ext; + h->free_ext_bkts = r_ext; h->hash_func = (params->hash_func == NULL) ? default_hash_func : params->hash_func; h->key_store = k; h->free_slots = r; + h->tbl_chng_cnt = tbl_chng_cnt; + *h->tbl_chng_cnt = 0; h->hw_trans_mem_support = hw_trans_mem_support; - h->multi_writer_support = multi_writer_support; + h->use_local_cache = use_local_cache; h->readwrite_concur_support = readwrite_concur_support; + h->ext_table_support = ext_table_support; + h->writer_takes_lock = writer_takes_lock; + h->no_free_on_del = no_free_on_del; + h->readwrite_concur_lf_support = readwrite_concur_lf_support; #if defined(RTE_ARCH_X86) - if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) - h->sig_cmp_fn = RTE_HASH_COMPARE_AVX2; - else if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE2)) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE2)) h->sig_cmp_fn = RTE_HASH_COMPARE_SSE; else #endif h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR; - /* Turn on multi-writer only with explicit flag from user and TM - * support. + /* Writer threads need to take the lock when: + * 1) RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY is enabled OR + * 2) RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD is enabled */ - if (h->multi_writer_support) { + if (h->writer_takes_lock) { h->readwrite_lock = rte_malloc(NULL, sizeof(rte_rwlock_t), RTE_CACHE_LINE_SIZE); if (h->readwrite_lock == NULL) @@ -304,10 +437,13 @@ err_unlock: rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); err: rte_ring_free(r); + rte_ring_free(r_ext); rte_free(te); rte_free(h); rte_free(buckets); + rte_free(buckets_ext); rte_free(k); + rte_free(tbl_chng_cnt); return NULL; } @@ -339,13 +475,16 @@ rte_hash_free(struct rte_hash *h) rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); - if (h->multi_writer_support) { + if (h->use_local_cache) rte_free(h->local_free_slots); + if (h->writer_takes_lock) rte_free(h->readwrite_lock); - } rte_ring_free(h->free_slots); + rte_ring_free(h->free_ext_bkts); rte_free(h->key_store); rte_free(h->buckets); + rte_free(h->buckets_ext); + rte_free(h->tbl_chng_cnt); rte_free(h); rte_free(te); } @@ -357,18 +496,6 @@ rte_hash_hash(const struct rte_hash *h, const void *key) return h->hash_func(key, h->key_len, h->hash_func_init_val); } -/* Calc the secondary hash value from the primary hash value of a given key */ -static inline hash_sig_t -rte_hash_secondary_hash(const hash_sig_t primary_hash) -{ - static const unsigned all_bits_shift = 12; - static const unsigned alt_bits_xor = 0x5bd1e995; - - uint32_t tag = primary_hash >> all_bits_shift; - - return primary_hash ^ ((tag + 1) * alt_bits_xor); -} - int32_t rte_hash_count(const struct rte_hash *h) { @@ -378,7 +505,7 @@ rte_hash_count(const struct rte_hash *h) if (h == NULL) return -EINVAL; - if (h->multi_writer_support) { + if (h->use_local_cache) { tot_ring_cnt = h->entries + (RTE_MAX_LCORE - 1) * (LCORE_CACHE_SIZE - 1); for (i = 0; i < RTE_MAX_LCORE; i++) @@ -397,13 +524,12 @@ rte_hash_count(const struct rte_hash *h) static inline void __hash_rw_writer_lock(const struct rte_hash *h) { - if (h->multi_writer_support && h->hw_trans_mem_support) + if (h->writer_takes_lock && h->hw_trans_mem_support) rte_rwlock_write_lock_tm(h->readwrite_lock); - else if (h->multi_writer_support) + else if (h->writer_takes_lock) rte_rwlock_write_lock(h->readwrite_lock); } - static inline void __hash_rw_reader_lock(const struct rte_hash *h) { @@ -416,9 +542,9 @@ __hash_rw_reader_lock(const struct rte_hash *h) static inline void __hash_rw_writer_unlock(const struct rte_hash *h) { - if (h->multi_writer_support && h->hw_trans_mem_support) + if (h->writer_takes_lock && h->hw_trans_mem_support) rte_rwlock_write_unlock_tm(h->readwrite_lock); - else if (h->multi_writer_support) + else if (h->writer_takes_lock) rte_rwlock_write_unlock(h->readwrite_lock); } @@ -443,13 +569,22 @@ rte_hash_reset(struct rte_hash *h) __hash_rw_writer_lock(h); memset(h->buckets, 0, h->num_buckets * sizeof(struct rte_hash_bucket)); memset(h->key_store, 0, h->key_entry_size * (h->entries + 1)); + *h->tbl_chng_cnt = 0; /* clear the free ring */ while (rte_ring_dequeue(h->free_slots, &ptr) == 0) - rte_pause(); + continue; + + /* clear free extendable bucket ring and memory */ + if (h->ext_table_support) { + memset(h->buckets_ext, 0, h->num_buckets * + sizeof(struct rte_hash_bucket)); + while (rte_ring_dequeue(h->free_ext_bkts, &ptr) == 0) + continue; + } /* Repopulate the free slots ring. Entry zero is reserved for key misses */ - if (h->multi_writer_support) + if (h->use_local_cache) tot_ring_cnt = h->entries + (RTE_MAX_LCORE - 1) * (LCORE_CACHE_SIZE - 1); else @@ -458,7 +593,14 @@ rte_hash_reset(struct rte_hash *h) for (i = 1; i < tot_ring_cnt + 1; i++) rte_ring_sp_enqueue(h->free_slots, (void *)((uintptr_t) i)); - if (h->multi_writer_support) { + /* Repopulate the free ext bkt ring. */ + if (h->ext_table_support) { + for (i = 1; i <= h->num_buckets; i++) + rte_ring_sp_enqueue(h->free_ext_bkts, + (void *)((uintptr_t) i)); + } + + if (h->use_local_cache) { /* Reset local caches per lcore */ for (i = 0; i < RTE_MAX_LCORE; i++) h->local_free_slots[i].len = 0; @@ -476,29 +618,35 @@ enqueue_slot_back(const struct rte_hash *h, struct lcore_cache *cached_free_slots, void *slot_id) { - if (h->multi_writer_support) { + if (h->use_local_cache) { cached_free_slots->objs[cached_free_slots->len] = slot_id; cached_free_slots->len++; } else rte_ring_sp_enqueue(h->free_slots, slot_id); } -/* Search a key from bucket and update its data */ +/* Search a key from bucket and update its data. + * Writer holds the lock before calling this. + */ static inline int32_t search_and_update(const struct rte_hash *h, void *data, const void *key, - struct rte_hash_bucket *bkt, hash_sig_t sig, hash_sig_t alt_hash) + struct rte_hash_bucket *bkt, uint16_t sig) { int i; struct rte_hash_key *k, *keys = h->key_store; for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - if (bkt->sig_current[i] == sig && - bkt->sig_alt[i] == alt_hash) { + if (bkt->sig_current[i] == sig) { k = (struct rte_hash_key *) ((char *)keys + bkt->key_idx[i] * h->key_entry_size); if (rte_hash_cmp_eq(key, k->key, h) == 0) { - /* Update data */ - k->pdata = data; + /* 'pdata' acts as the synchronization point + * when an existing hash entry is updated. + * Key is not updated in this case. + */ + __atomic_store_n(&k->pdata, + data, + __ATOMIC_RELEASE); /* * Return index where key is stored, * subtracting the first dummy index @@ -520,28 +668,31 @@ rte_hash_cuckoo_insert_mw(const struct rte_hash *h, struct rte_hash_bucket *prim_bkt, struct rte_hash_bucket *sec_bkt, const struct rte_hash_key *key, void *data, - hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx, + uint16_t sig, uint32_t new_idx, int32_t *ret_val) { unsigned int i; - struct rte_hash_bucket *cur_bkt = prim_bkt; + struct rte_hash_bucket *cur_bkt; int32_t ret; __hash_rw_writer_lock(h); /* Check if key was inserted after last check but before this * protected region in case of inserting duplicated keys. */ - ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash); + ret = search_and_update(h, data, key, prim_bkt, sig); if (ret != -1) { __hash_rw_writer_unlock(h); *ret_val = ret; return 1; } - ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig); - if (ret != -1) { - __hash_rw_writer_unlock(h); - *ret_val = ret; - return 1; + + FOR_EACH_BUCKET(cur_bkt, sec_bkt) { + ret = search_and_update(h, data, key, cur_bkt, sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; + return 1; + } } /* Insert new entry if there is room in the primary @@ -551,8 +702,15 @@ rte_hash_cuckoo_insert_mw(const struct rte_hash *h, /* Check if slot is available */ if (likely(prim_bkt->key_idx[i] == EMPTY_SLOT)) { prim_bkt->sig_current[i] = sig; - prim_bkt->sig_alt[i] = alt_hash; - prim_bkt->key_idx[i] = new_idx; + /* Key can be of arbitrary length, so it is + * not possible to store it atomically. + * Hence the new key element's memory stores + * (key as well as data) should be complete + * before it is referenced. + */ + __atomic_store_n(&prim_bkt->key_idx[i], + new_idx, + __ATOMIC_RELEASE); break; } } @@ -576,11 +734,11 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h, struct rte_hash_bucket *alt_bkt, const struct rte_hash_key *key, void *data, struct queue_node *leaf, uint32_t leaf_slot, - hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx, + uint16_t sig, uint32_t new_idx, int32_t *ret_val) { uint32_t prev_alt_bkt_idx; - struct rte_hash_bucket *cur_bkt = bkt; + struct rte_hash_bucket *cur_bkt; struct queue_node *prev_node, *curr_node = leaf; struct rte_hash_bucket *prev_bkt, *curr_bkt = leaf->bkt; uint32_t prev_slot, curr_slot = leaf_slot; @@ -597,18 +755,20 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h, /* Check if key was inserted after last check but before this * protected region. */ - ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash); + ret = search_and_update(h, data, key, bkt, sig); if (ret != -1) { __hash_rw_writer_unlock(h); *ret_val = ret; return 1; } - ret = search_and_update(h, data, key, alt_bkt, alt_hash, sig); - if (ret != -1) { - __hash_rw_writer_unlock(h); - *ret_val = ret; - return 1; + FOR_EACH_BUCKET(cur_bkt, alt_bkt) { + ret = search_and_update(h, data, key, cur_bkt, sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; + return 1; + } } while (likely(curr_node->prev != NULL)) { @@ -616,36 +776,73 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h, prev_bkt = prev_node->bkt; prev_slot = curr_node->prev_slot; - prev_alt_bkt_idx = - prev_bkt->sig_alt[prev_slot] & h->bucket_bitmask; + prev_alt_bkt_idx = get_alt_bucket_index(h, + prev_node->cur_bkt_idx, + prev_bkt->sig_current[prev_slot]); if (unlikely(&h->buckets[prev_alt_bkt_idx] != curr_bkt)) { /* revert it to empty, otherwise duplicated keys */ - curr_bkt->key_idx[curr_slot] = EMPTY_SLOT; + __atomic_store_n(&curr_bkt->key_idx[curr_slot], + EMPTY_SLOT, + __ATOMIC_RELEASE); __hash_rw_writer_unlock(h); return -1; } + if (h->readwrite_concur_lf_support) { + /* Inform the previous move. The current move need + * not be informed now as the current bucket entry + * is present in both primary and secondary. + * Since there is one writer, load acquires on + * tbl_chng_cnt are not required. + */ + __atomic_store_n(h->tbl_chng_cnt, + *h->tbl_chng_cnt + 1, + __ATOMIC_RELEASE); + /* The stores to sig_alt and sig_current should not + * move above the store to tbl_chng_cnt. + */ + __atomic_thread_fence(__ATOMIC_RELEASE); + } + /* Need to swap current/alt sig to allow later * Cuckoo insert to move elements back to its * primary bucket if available */ - curr_bkt->sig_alt[curr_slot] = - prev_bkt->sig_current[prev_slot]; curr_bkt->sig_current[curr_slot] = - prev_bkt->sig_alt[prev_slot]; - curr_bkt->key_idx[curr_slot] = - prev_bkt->key_idx[prev_slot]; + prev_bkt->sig_current[prev_slot]; + /* Release the updated bucket entry */ + __atomic_store_n(&curr_bkt->key_idx[curr_slot], + prev_bkt->key_idx[prev_slot], + __ATOMIC_RELEASE); curr_slot = prev_slot; curr_node = prev_node; curr_bkt = curr_node->bkt; } + if (h->readwrite_concur_lf_support) { + /* Inform the previous move. The current move need + * not be informed now as the current bucket entry + * is present in both primary and secondary. + * Since there is one writer, load acquires on + * tbl_chng_cnt are not required. + */ + __atomic_store_n(h->tbl_chng_cnt, + *h->tbl_chng_cnt + 1, + __ATOMIC_RELEASE); + /* The stores to sig_alt and sig_current should not + * move above the store to tbl_chng_cnt. + */ + __atomic_thread_fence(__ATOMIC_RELEASE); + } + curr_bkt->sig_current[curr_slot] = sig; - curr_bkt->sig_alt[curr_slot] = alt_hash; - curr_bkt->key_idx[curr_slot] = new_idx; + /* Release the new bucket entry */ + __atomic_store_n(&curr_bkt->key_idx[curr_slot], + new_idx, + __ATOMIC_RELEASE); __hash_rw_writer_unlock(h); @@ -662,39 +859,44 @@ rte_hash_cuckoo_make_space_mw(const struct rte_hash *h, struct rte_hash_bucket *bkt, struct rte_hash_bucket *sec_bkt, const struct rte_hash_key *key, void *data, - hash_sig_t sig, hash_sig_t alt_hash, + uint16_t sig, uint32_t bucket_idx, uint32_t new_idx, int32_t *ret_val) { unsigned int i; struct queue_node queue[RTE_HASH_BFS_QUEUE_MAX_LEN]; struct queue_node *tail, *head; struct rte_hash_bucket *curr_bkt, *alt_bkt; + uint32_t cur_idx, alt_idx; tail = queue; head = queue + 1; tail->bkt = bkt; tail->prev = NULL; tail->prev_slot = -1; + tail->cur_bkt_idx = bucket_idx; /* Cuckoo bfs Search */ while (likely(tail != head && head < queue + RTE_HASH_BFS_QUEUE_MAX_LEN - RTE_HASH_BUCKET_ENTRIES)) { curr_bkt = tail->bkt; + cur_idx = tail->cur_bkt_idx; for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { if (curr_bkt->key_idx[i] == EMPTY_SLOT) { int32_t ret = rte_hash_cuckoo_move_insert_mw(h, bkt, sec_bkt, key, data, - tail, i, sig, alt_hash, + tail, i, sig, new_idx, ret_val); if (likely(ret != -1)) return ret; } /* Enqueue new node and keep prev node info */ - alt_bkt = &(h->buckets[curr_bkt->sig_alt[i] - & h->bucket_bitmask]); + alt_idx = get_alt_bucket_index(h, cur_idx, + curr_bkt->sig_current[i]); + alt_bkt = &(h->buckets[alt_idx]); head->bkt = alt_bkt; + head->cur_bkt_idx = alt_idx; head->prev = tail; head->prev_slot = i; head++; @@ -709,45 +911,50 @@ static inline int32_t __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, hash_sig_t sig, void *data) { - hash_sig_t alt_hash; + uint16_t short_sig; uint32_t prim_bucket_idx, sec_bucket_idx; - struct rte_hash_bucket *prim_bkt, *sec_bkt; + struct rte_hash_bucket *prim_bkt, *sec_bkt, *cur_bkt; struct rte_hash_key *new_k, *keys = h->key_store; void *slot_id = NULL; - uint32_t new_idx; + void *ext_bkt_id = NULL; + uint32_t new_idx, bkt_id; int ret; unsigned n_slots; unsigned lcore_id; + unsigned int i; struct lcore_cache *cached_free_slots = NULL; int32_t ret_val; + struct rte_hash_bucket *last; - prim_bucket_idx = sig & h->bucket_bitmask; + short_sig = get_short_sig(sig); + prim_bucket_idx = get_prim_bucket_index(h, sig); + sec_bucket_idx = get_alt_bucket_index(h, prim_bucket_idx, short_sig); prim_bkt = &h->buckets[prim_bucket_idx]; - rte_prefetch0(prim_bkt); - - alt_hash = rte_hash_secondary_hash(sig); - sec_bucket_idx = alt_hash & h->bucket_bitmask; sec_bkt = &h->buckets[sec_bucket_idx]; + rte_prefetch0(prim_bkt); rte_prefetch0(sec_bkt); /* Check if key is already inserted in primary location */ __hash_rw_writer_lock(h); - ret = search_and_update(h, data, key, prim_bkt, sig, alt_hash); + ret = search_and_update(h, data, key, prim_bkt, short_sig); if (ret != -1) { __hash_rw_writer_unlock(h); return ret; } /* Check if key is already inserted in secondary location */ - ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig); - if (ret != -1) { - __hash_rw_writer_unlock(h); - return ret; + FOR_EACH_BUCKET(cur_bkt, sec_bkt) { + ret = search_and_update(h, data, key, cur_bkt, short_sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + return ret; + } } + __hash_rw_writer_unlock(h); /* Did not find a match, so get a new slot for storing the new key */ - if (h->multi_writer_support) { + if (h->use_local_cache) { lcore_id = rte_lcore_id(); cached_free_slots = &h->local_free_slots[lcore_id]; /* Try to get a free slot from the local cache */ @@ -776,12 +983,19 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, new_idx = (uint32_t)((uintptr_t) slot_id); /* Copy key */ rte_memcpy(new_k->key, key, h->key_len); - new_k->pdata = data; - + /* Key can be of arbitrary length, so it is not possible to store + * it atomically. Hence the new key element's memory stores + * (key as well as data) should be complete before it is referenced. + * 'pdata' acts as the synchronization point when an existing hash + * entry is updated. + */ + __atomic_store_n(&new_k->pdata, + data, + __ATOMIC_RELEASE); /* Find an empty slot and insert */ ret = rte_hash_cuckoo_insert_mw(h, prim_bkt, sec_bkt, key, data, - sig, alt_hash, new_idx, &ret_val); + short_sig, new_idx, &ret_val); if (ret == 0) return new_idx - 1; else if (ret == 1) { @@ -791,7 +1005,7 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, /* Primary bucket full, need to make space for new entry */ ret = rte_hash_cuckoo_make_space_mw(h, prim_bkt, sec_bkt, key, data, - sig, alt_hash, new_idx, &ret_val); + short_sig, prim_bucket_idx, new_idx, &ret_val); if (ret == 0) return new_idx - 1; else if (ret == 1) { @@ -801,17 +1015,75 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, /* Also search secondary bucket to get better occupancy */ ret = rte_hash_cuckoo_make_space_mw(h, sec_bkt, prim_bkt, key, data, - alt_hash, sig, new_idx, &ret_val); + short_sig, sec_bucket_idx, new_idx, &ret_val); if (ret == 0) return new_idx - 1; else if (ret == 1) { enqueue_slot_back(h, cached_free_slots, slot_id); return ret_val; - } else { + } + + /* if ext table not enabled, we failed the insertion */ + if (!h->ext_table_support) { enqueue_slot_back(h, cached_free_slots, slot_id); return ret; } + + /* Now we need to go through the extendable bucket. Protection is needed + * to protect all extendable bucket processes. + */ + __hash_rw_writer_lock(h); + /* We check for duplicates again since could be inserted before the lock */ + ret = search_and_update(h, data, key, prim_bkt, short_sig); + if (ret != -1) { + enqueue_slot_back(h, cached_free_slots, slot_id); + goto failure; + } + + FOR_EACH_BUCKET(cur_bkt, sec_bkt) { + ret = search_and_update(h, data, key, cur_bkt, short_sig); + if (ret != -1) { + enqueue_slot_back(h, cached_free_slots, slot_id); + goto failure; + } + } + + /* Search sec and ext buckets to find an empty entry to insert. */ + FOR_EACH_BUCKET(cur_bkt, sec_bkt) { + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + /* Check if slot is available */ + if (likely(cur_bkt->key_idx[i] == EMPTY_SLOT)) { + cur_bkt->sig_current[i] = short_sig; + cur_bkt->key_idx[i] = new_idx; + __hash_rw_writer_unlock(h); + return new_idx - 1; + } + } + } + + /* Failed to get an empty entry from extendable buckets. Link a new + * extendable bucket. We first get a free bucket from ring. + */ + if (rte_ring_sc_dequeue(h->free_ext_bkts, &ext_bkt_id) != 0) { + ret = -ENOSPC; + goto failure; + } + + bkt_id = (uint32_t)((uintptr_t)ext_bkt_id) - 1; + /* Use the first location of the new bucket */ + (h->buckets_ext[bkt_id]).sig_current[0] = short_sig; + (h->buckets_ext[bkt_id]).key_idx[0] = new_idx; + /* Link the new bucket to sec bucket linked list */ + last = rte_hash_get_last_bkt(sec_bkt); + last->next = &h->buckets_ext[bkt_id]; + __hash_rw_writer_unlock(h); + return new_idx - 1; + +failure: + __hash_rw_writer_unlock(h); + return ret; + } int32_t @@ -859,25 +1131,31 @@ rte_hash_add_key_data(const struct rte_hash *h, const void *key, void *data) /* Search one bucket to find the match key */ static inline int32_t -search_one_bucket(const struct rte_hash *h, const void *key, hash_sig_t sig, +search_one_bucket(const struct rte_hash *h, const void *key, uint16_t sig, void **data, const struct rte_hash_bucket *bkt) { int i; + uint32_t key_idx; + void *pdata; struct rte_hash_key *k, *keys = h->key_store; for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - if (bkt->sig_current[i] == sig && - bkt->key_idx[i] != EMPTY_SLOT) { + key_idx = __atomic_load_n(&bkt->key_idx[i], + __ATOMIC_ACQUIRE); + if (bkt->sig_current[i] == sig && key_idx != EMPTY_SLOT) { k = (struct rte_hash_key *) ((char *)keys + - bkt->key_idx[i] * h->key_entry_size); + key_idx * h->key_entry_size); + pdata = __atomic_load_n(&k->pdata, + __ATOMIC_ACQUIRE); + if (rte_hash_cmp_eq(key, k->key, h) == 0) { if (data != NULL) - *data = k->pdata; + *data = pdata; /* * Return index where key is stored, * subtracting the first dummy index */ - return bkt->key_idx[i] - 1; + return key_idx - 1; } } } @@ -888,34 +1166,64 @@ static inline int32_t __rte_hash_lookup_with_hash(const struct rte_hash *h, const void *key, hash_sig_t sig, void **data) { - uint32_t bucket_idx; - hash_sig_t alt_hash; - struct rte_hash_bucket *bkt; + uint32_t prim_bucket_idx, sec_bucket_idx; + struct rte_hash_bucket *bkt, *cur_bkt; + uint32_t cnt_b, cnt_a; int ret; + uint16_t short_sig; - bucket_idx = sig & h->bucket_bitmask; - bkt = &h->buckets[bucket_idx]; + short_sig = get_short_sig(sig); + prim_bucket_idx = get_prim_bucket_index(h, sig); + sec_bucket_idx = get_alt_bucket_index(h, prim_bucket_idx, short_sig); __hash_rw_reader_lock(h); - /* Check if key is in primary location */ - ret = search_one_bucket(h, key, sig, data, bkt); - if (ret != -1) { - __hash_rw_reader_unlock(h); - return ret; - } - /* Calculate secondary hash */ - alt_hash = rte_hash_secondary_hash(sig); - bucket_idx = alt_hash & h->bucket_bitmask; - bkt = &h->buckets[bucket_idx]; + do { + /* Load the table change counter before the lookup + * starts. Acquire semantics will make sure that + * loads in search_one_bucket are not hoisted. + */ + cnt_b = __atomic_load_n(h->tbl_chng_cnt, + __ATOMIC_ACQUIRE); + + /* Check if key is in primary location */ + bkt = &h->buckets[prim_bucket_idx]; + ret = search_one_bucket(h, key, short_sig, data, bkt); + if (ret != -1) { + __hash_rw_reader_unlock(h); + return ret; + } + /* Calculate secondary hash */ + bkt = &h->buckets[sec_bucket_idx]; + + /* Check if key is in secondary location */ + FOR_EACH_BUCKET(cur_bkt, bkt) { + ret = search_one_bucket(h, key, short_sig, + data, cur_bkt); + if (ret != -1) { + __hash_rw_reader_unlock(h); + return ret; + } + } + + /* The loads of sig_current in search_one_bucket + * should not move below the load from tbl_chng_cnt. + */ + __atomic_thread_fence(__ATOMIC_ACQUIRE); + /* Re-read the table change counter to check if the + * table has changed during search. If yes, re-do + * the search. + * This load should not get hoisted. The load + * acquires on cnt_b, key index in primary bucket + * and key index in secondary bucket will make sure + * that it does not get hoisted. + */ + cnt_a = __atomic_load_n(h->tbl_chng_cnt, + __ATOMIC_ACQUIRE); + } while (cnt_b != cnt_a); - /* Check if key is in secondary location */ - ret = search_one_bucket(h, key, alt_hash, data, bkt); - if (ret != -1) { - __hash_rw_reader_unlock(h); - return ret; - } __hash_rw_reader_unlock(h); + return -ENOENT; } @@ -955,9 +1263,7 @@ remove_entry(const struct rte_hash *h, struct rte_hash_bucket *bkt, unsigned i) unsigned lcore_id, n_slots; struct lcore_cache *cached_free_slots; - bkt->sig_current[i] = NULL_SIGNATURE; - bkt->sig_alt[i] = NULL_SIGNATURE; - if (h->multi_writer_support) { + if (h->use_local_cache) { lcore_id = rte_lcore_id(); cached_free_slots = &h->local_free_slots[lcore_id]; /* Cache full, need to free it. */ @@ -978,31 +1284,67 @@ remove_entry(const struct rte_hash *h, struct rte_hash_bucket *bkt, unsigned i) } } -/* Search one bucket and remove the matched key */ +/* Compact the linked list by moving key from last entry in linked list to the + * empty slot. + */ +static inline void +__rte_hash_compact_ll(struct rte_hash_bucket *cur_bkt, int pos) { + int i; + struct rte_hash_bucket *last_bkt; + + if (!cur_bkt->next) + return; + + last_bkt = rte_hash_get_last_bkt(cur_bkt); + + for (i = RTE_HASH_BUCKET_ENTRIES - 1; i >= 0; i--) { + if (last_bkt->key_idx[i] != EMPTY_SLOT) { + cur_bkt->key_idx[pos] = last_bkt->key_idx[i]; + cur_bkt->sig_current[pos] = last_bkt->sig_current[i]; + last_bkt->sig_current[i] = NULL_SIGNATURE; + last_bkt->key_idx[i] = EMPTY_SLOT; + return; + } + } +} + +/* Search one bucket and remove the matched key. + * Writer is expected to hold the lock while calling this + * function. + */ static inline int32_t search_and_remove(const struct rte_hash *h, const void *key, - struct rte_hash_bucket *bkt, hash_sig_t sig) + struct rte_hash_bucket *bkt, uint16_t sig, int *pos) { struct rte_hash_key *k, *keys = h->key_store; unsigned int i; - int32_t ret; + uint32_t key_idx; - /* Check if key is in primary location */ + /* Check if key is in bucket */ for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - if (bkt->sig_current[i] == sig && - bkt->key_idx[i] != EMPTY_SLOT) { + key_idx = __atomic_load_n(&bkt->key_idx[i], + __ATOMIC_ACQUIRE); + if (bkt->sig_current[i] == sig && key_idx != EMPTY_SLOT) { k = (struct rte_hash_key *) ((char *)keys + - bkt->key_idx[i] * h->key_entry_size); + key_idx * h->key_entry_size); if (rte_hash_cmp_eq(key, k->key, h) == 0) { - remove_entry(h, bkt, i); + bkt->sig_current[i] = NULL_SIGNATURE; + /* Free the key store index if + * no_free_on_del is disabled. + */ + if (!h->no_free_on_del) + remove_entry(h, bkt, i); + + __atomic_store_n(&bkt->key_idx[i], + EMPTY_SLOT, + __ATOMIC_RELEASE); + *pos = i; /* * Return index where key is stored, * subtracting the first dummy index */ - ret = bkt->key_idx[i] - 1; - bkt->key_idx[i] = EMPTY_SLOT; - return ret; + return key_idx - 1; } } } @@ -1013,36 +1355,68 @@ static inline int32_t __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key, hash_sig_t sig) { - uint32_t bucket_idx; - hash_sig_t alt_hash; - struct rte_hash_bucket *bkt; - int32_t ret; - - bucket_idx = sig & h->bucket_bitmask; - bkt = &h->buckets[bucket_idx]; + uint32_t prim_bucket_idx, sec_bucket_idx; + struct rte_hash_bucket *prim_bkt, *sec_bkt, *prev_bkt, *last_bkt; + struct rte_hash_bucket *cur_bkt; + int pos; + int32_t ret, i; + uint16_t short_sig; + + short_sig = get_short_sig(sig); + prim_bucket_idx = get_prim_bucket_index(h, sig); + sec_bucket_idx = get_alt_bucket_index(h, prim_bucket_idx, short_sig); + prim_bkt = &h->buckets[prim_bucket_idx]; __hash_rw_writer_lock(h); /* look for key in primary bucket */ - ret = search_and_remove(h, key, bkt, sig); + ret = search_and_remove(h, key, prim_bkt, short_sig, &pos); if (ret != -1) { - __hash_rw_writer_unlock(h); - return ret; + __rte_hash_compact_ll(prim_bkt, pos); + last_bkt = prim_bkt->next; + prev_bkt = prim_bkt; + goto return_bkt; } /* Calculate secondary hash */ - alt_hash = rte_hash_secondary_hash(sig); - bucket_idx = alt_hash & h->bucket_bitmask; - bkt = &h->buckets[bucket_idx]; + sec_bkt = &h->buckets[sec_bucket_idx]; - /* look for key in secondary bucket */ - ret = search_and_remove(h, key, bkt, alt_hash); - if (ret != -1) { + FOR_EACH_BUCKET(cur_bkt, sec_bkt) { + ret = search_and_remove(h, key, cur_bkt, short_sig, &pos); + if (ret != -1) { + __rte_hash_compact_ll(cur_bkt, pos); + last_bkt = sec_bkt->next; + prev_bkt = sec_bkt; + goto return_bkt; + } + } + + __hash_rw_writer_unlock(h); + return -ENOENT; + +/* Search last bucket to see if empty to be recycled */ +return_bkt: + if (!last_bkt) { __hash_rw_writer_unlock(h); return ret; } + while (last_bkt->next) { + prev_bkt = last_bkt; + last_bkt = last_bkt->next; + } + + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + if (last_bkt->key_idx[i] != EMPTY_SLOT) + break; + } + /* found empty bucket and recycle */ + if (i == RTE_HASH_BUCKET_ENTRIES) { + prev_bkt->next = last_bkt->next = NULL; + uint32_t index = last_bkt - h->buckets_ext + 1; + rte_ring_sp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index); + } __hash_rw_writer_unlock(h); - return -ENOENT; + return ret; } int32_t @@ -1080,59 +1454,76 @@ rte_hash_get_key_with_position(const struct rte_hash *h, const int32_t position, return 0; } +int __rte_experimental +rte_hash_free_key_with_position(const struct rte_hash *h, + const int32_t position) +{ + RETURN_IF_TRUE(((h == NULL) || (position == EMPTY_SLOT)), -EINVAL); + + unsigned int lcore_id, n_slots; + struct lcore_cache *cached_free_slots; + const int32_t total_entries = h->num_buckets * RTE_HASH_BUCKET_ENTRIES; + + /* Out of bounds */ + if (position >= total_entries) + return -EINVAL; + + if (h->use_local_cache) { + lcore_id = rte_lcore_id(); + cached_free_slots = &h->local_free_slots[lcore_id]; + /* Cache full, need to free it. */ + if (cached_free_slots->len == LCORE_CACHE_SIZE) { + /* Need to enqueue the free slots in global ring. */ + n_slots = rte_ring_mp_enqueue_burst(h->free_slots, + cached_free_slots->objs, + LCORE_CACHE_SIZE, NULL); + cached_free_slots->len -= n_slots; + } + /* Put index of new free slot in cache. */ + cached_free_slots->objs[cached_free_slots->len] = + (void *)((uintptr_t)position); + cached_free_slots->len++; + } else { + rte_ring_sp_enqueue(h->free_slots, + (void *)((uintptr_t)position)); + } + + return 0; +} + static inline void compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, const struct rte_hash_bucket *prim_bkt, const struct rte_hash_bucket *sec_bkt, - hash_sig_t prim_hash, hash_sig_t sec_hash, + uint16_t sig, enum rte_hash_sig_compare_function sig_cmp_fn) { unsigned int i; + /* For match mask the first bit of every two bits indicates the match */ switch (sig_cmp_fn) { -#ifdef RTE_MACHINE_CPUFLAG_AVX2 - case RTE_HASH_COMPARE_AVX2: - *prim_hash_matches = _mm256_movemask_ps((__m256)_mm256_cmpeq_epi32( - _mm256_load_si256( - (__m256i const *)prim_bkt->sig_current), - _mm256_set1_epi32(prim_hash))); - *sec_hash_matches = _mm256_movemask_ps((__m256)_mm256_cmpeq_epi32( - _mm256_load_si256( - (__m256i const *)sec_bkt->sig_current), - _mm256_set1_epi32(sec_hash))); - break; -#endif #ifdef RTE_MACHINE_CPUFLAG_SSE2 case RTE_HASH_COMPARE_SSE: - /* Compare the first 4 signatures in the bucket */ - *prim_hash_matches = _mm_movemask_ps((__m128)_mm_cmpeq_epi16( + /* Compare all signatures in the bucket */ + *prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16( _mm_load_si128( (__m128i const *)prim_bkt->sig_current), - _mm_set1_epi32(prim_hash))); - *prim_hash_matches |= (_mm_movemask_ps((__m128)_mm_cmpeq_epi16( - _mm_load_si128( - (__m128i const *)&prim_bkt->sig_current[4]), - _mm_set1_epi32(prim_hash)))) << 4; - /* Compare the first 4 signatures in the bucket */ - *sec_hash_matches = _mm_movemask_ps((__m128)_mm_cmpeq_epi16( + _mm_set1_epi16(sig))); + /* Compare all signatures in the bucket */ + *sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16( _mm_load_si128( (__m128i const *)sec_bkt->sig_current), - _mm_set1_epi32(sec_hash))); - *sec_hash_matches |= (_mm_movemask_ps((__m128)_mm_cmpeq_epi16( - _mm_load_si128( - (__m128i const *)&sec_bkt->sig_current[4]), - _mm_set1_epi32(sec_hash)))) << 4; + _mm_set1_epi16(sig))); break; #endif default: for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { *prim_hash_matches |= - ((prim_hash == prim_bkt->sig_current[i]) << i); + ((sig == prim_bkt->sig_current[i]) << (i << 1)); *sec_hash_matches |= - ((sec_hash == sec_bkt->sig_current[i]) << i); + ((sig == sec_bkt->sig_current[i]) << (i << 1)); } } - } #define PREFETCH_OFFSET 4 @@ -1143,12 +1534,18 @@ __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys, { uint64_t hits = 0; int32_t i; + int32_t ret; uint32_t prim_hash[RTE_HASH_LOOKUP_BULK_MAX]; - uint32_t sec_hash[RTE_HASH_LOOKUP_BULK_MAX]; + uint32_t prim_index[RTE_HASH_LOOKUP_BULK_MAX]; + uint32_t sec_index[RTE_HASH_LOOKUP_BULK_MAX]; + uint16_t sig[RTE_HASH_LOOKUP_BULK_MAX]; const struct rte_hash_bucket *primary_bkt[RTE_HASH_LOOKUP_BULK_MAX]; const struct rte_hash_bucket *secondary_bkt[RTE_HASH_LOOKUP_BULK_MAX]; uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0}; uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0}; + struct rte_hash_bucket *cur_bkt, *next_bkt; + void *pdata[RTE_HASH_LOOKUP_BULK_MAX]; + uint32_t cnt_b, cnt_a; /* Prefetch first keys */ for (i = 0; i < PREFETCH_OFFSET && i < num_keys; i++) @@ -1162,10 +1559,13 @@ __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys, rte_prefetch0(keys[i + PREFETCH_OFFSET]); prim_hash[i] = rte_hash_hash(h, keys[i]); - sec_hash[i] = rte_hash_secondary_hash(prim_hash[i]); - primary_bkt[i] = &h->buckets[prim_hash[i] & h->bucket_bitmask]; - secondary_bkt[i] = &h->buckets[sec_hash[i] & h->bucket_bitmask]; + sig[i] = get_short_sig(prim_hash[i]); + prim_index[i] = get_prim_bucket_index(h, prim_hash[i]); + sec_index[i] = get_alt_bucket_index(h, prim_index[i], sig[i]); + + primary_bkt[i] = &h->buckets[prim_index[i]]; + secondary_bkt[i] = &h->buckets[sec_index[i]]; rte_prefetch0(primary_bkt[i]); rte_prefetch0(secondary_bkt[i]); @@ -1174,96 +1574,178 @@ __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys, /* Calculate and prefetch rest of the buckets */ for (; i < num_keys; i++) { prim_hash[i] = rte_hash_hash(h, keys[i]); - sec_hash[i] = rte_hash_secondary_hash(prim_hash[i]); - primary_bkt[i] = &h->buckets[prim_hash[i] & h->bucket_bitmask]; - secondary_bkt[i] = &h->buckets[sec_hash[i] & h->bucket_bitmask]; + sig[i] = get_short_sig(prim_hash[i]); + prim_index[i] = get_prim_bucket_index(h, prim_hash[i]); + sec_index[i] = get_alt_bucket_index(h, prim_index[i], sig[i]); + + primary_bkt[i] = &h->buckets[prim_index[i]]; + secondary_bkt[i] = &h->buckets[sec_index[i]]; rte_prefetch0(primary_bkt[i]); rte_prefetch0(secondary_bkt[i]); } __hash_rw_reader_lock(h); - /* Compare signatures and prefetch key slot of first hit */ - for (i = 0; i < num_keys; i++) { - compare_signatures(&prim_hitmask[i], &sec_hitmask[i], + do { + /* Load the table change counter before the lookup + * starts. Acquire semantics will make sure that + * loads in compare_signatures are not hoisted. + */ + cnt_b = __atomic_load_n(h->tbl_chng_cnt, + __ATOMIC_ACQUIRE); + + /* Compare signatures and prefetch key slot of first hit */ + for (i = 0; i < num_keys; i++) { + compare_signatures(&prim_hitmask[i], &sec_hitmask[i], primary_bkt[i], secondary_bkt[i], - prim_hash[i], sec_hash[i], h->sig_cmp_fn); - - if (prim_hitmask[i]) { - uint32_t first_hit = __builtin_ctzl(prim_hitmask[i]); - uint32_t key_idx = primary_bkt[i]->key_idx[first_hit]; - const struct rte_hash_key *key_slot = - (const struct rte_hash_key *)( - (const char *)h->key_store + - key_idx * h->key_entry_size); - rte_prefetch0(key_slot); - continue; - } + sig[i], h->sig_cmp_fn); + + if (prim_hitmask[i]) { + uint32_t first_hit = + __builtin_ctzl(prim_hitmask[i]) + >> 1; + uint32_t key_idx = + primary_bkt[i]->key_idx[first_hit]; + const struct rte_hash_key *key_slot = + (const struct rte_hash_key *)( + (const char *)h->key_store + + key_idx * h->key_entry_size); + rte_prefetch0(key_slot); + continue; + } - if (sec_hitmask[i]) { - uint32_t first_hit = __builtin_ctzl(sec_hitmask[i]); - uint32_t key_idx = secondary_bkt[i]->key_idx[first_hit]; - const struct rte_hash_key *key_slot = - (const struct rte_hash_key *)( - (const char *)h->key_store + - key_idx * h->key_entry_size); - rte_prefetch0(key_slot); + if (sec_hitmask[i]) { + uint32_t first_hit = + __builtin_ctzl(sec_hitmask[i]) + >> 1; + uint32_t key_idx = + secondary_bkt[i]->key_idx[first_hit]; + const struct rte_hash_key *key_slot = + (const struct rte_hash_key *)( + (const char *)h->key_store + + key_idx * h->key_entry_size); + rte_prefetch0(key_slot); + } } - } - /* Compare keys, first hits in primary first */ - for (i = 0; i < num_keys; i++) { - positions[i] = -ENOENT; - while (prim_hitmask[i]) { - uint32_t hit_index = __builtin_ctzl(prim_hitmask[i]); - - uint32_t key_idx = primary_bkt[i]->key_idx[hit_index]; - const struct rte_hash_key *key_slot = - (const struct rte_hash_key *)( - (const char *)h->key_store + - key_idx * h->key_entry_size); - /* - * If key index is 0, do not compare key, - * as it is checking the dummy slot - */ - if (!!key_idx & !rte_hash_cmp_eq(key_slot->key, keys[i], h)) { - if (data != NULL) - data[i] = key_slot->pdata; + /* Compare keys, first hits in primary first */ + for (i = 0; i < num_keys; i++) { + positions[i] = -ENOENT; + while (prim_hitmask[i]) { + uint32_t hit_index = + __builtin_ctzl(prim_hitmask[i]) + >> 1; + uint32_t key_idx = + __atomic_load_n( + &primary_bkt[i]->key_idx[hit_index], + __ATOMIC_ACQUIRE); + const struct rte_hash_key *key_slot = + (const struct rte_hash_key *)( + (const char *)h->key_store + + key_idx * h->key_entry_size); + + if (key_idx != EMPTY_SLOT) + pdata[i] = __atomic_load_n( + &key_slot->pdata, + __ATOMIC_ACQUIRE); + /* + * If key index is 0, do not compare key, + * as it is checking the dummy slot + */ + if (!!key_idx & + !rte_hash_cmp_eq( + key_slot->key, keys[i], h)) { + if (data != NULL) + data[i] = pdata[i]; + + hits |= 1ULL << i; + positions[i] = key_idx - 1; + goto next_key; + } + prim_hitmask[i] &= ~(3ULL << (hit_index << 1)); + } - hits |= 1ULL << i; - positions[i] = key_idx - 1; - goto next_key; + while (sec_hitmask[i]) { + uint32_t hit_index = + __builtin_ctzl(sec_hitmask[i]) + >> 1; + uint32_t key_idx = + __atomic_load_n( + &secondary_bkt[i]->key_idx[hit_index], + __ATOMIC_ACQUIRE); + const struct rte_hash_key *key_slot = + (const struct rte_hash_key *)( + (const char *)h->key_store + + key_idx * h->key_entry_size); + + if (key_idx != EMPTY_SLOT) + pdata[i] = __atomic_load_n( + &key_slot->pdata, + __ATOMIC_ACQUIRE); + /* + * If key index is 0, do not compare key, + * as it is checking the dummy slot + */ + + if (!!key_idx & + !rte_hash_cmp_eq( + key_slot->key, keys[i], h)) { + if (data != NULL) + data[i] = pdata[i]; + + hits |= 1ULL << i; + positions[i] = key_idx - 1; + goto next_key; + } + sec_hitmask[i] &= ~(3ULL << (hit_index << 1)); } - prim_hitmask[i] &= ~(1 << (hit_index)); +next_key: + continue; } - while (sec_hitmask[i]) { - uint32_t hit_index = __builtin_ctzl(sec_hitmask[i]); - - uint32_t key_idx = secondary_bkt[i]->key_idx[hit_index]; - const struct rte_hash_key *key_slot = - (const struct rte_hash_key *)( - (const char *)h->key_store + - key_idx * h->key_entry_size); - /* - * If key index is 0, do not compare key, - * as it is checking the dummy slot - */ - - if (!!key_idx & !rte_hash_cmp_eq(key_slot->key, keys[i], h)) { - if (data != NULL) - data[i] = key_slot->pdata; + /* The loads of sig_current in compare_signatures + * should not move below the load from tbl_chng_cnt. + */ + __atomic_thread_fence(__ATOMIC_ACQUIRE); + /* Re-read the table change counter to check if the + * table has changed during search. If yes, re-do + * the search. + * This load should not get hoisted. The load + * acquires on cnt_b, primary key index and secondary + * key index will make sure that it does not get + * hoisted. + */ + cnt_a = __atomic_load_n(h->tbl_chng_cnt, + __ATOMIC_ACQUIRE); + } while (cnt_b != cnt_a); + + /* all found, do not need to go through ext bkt */ + if ((hits == ((1ULL << num_keys) - 1)) || !h->ext_table_support) { + if (hit_mask != NULL) + *hit_mask = hits; + __hash_rw_reader_unlock(h); + return; + } + /* need to check ext buckets for match */ + for (i = 0; i < num_keys; i++) { + if ((hits & (1ULL << i)) != 0) + continue; + next_bkt = secondary_bkt[i]->next; + FOR_EACH_BUCKET(cur_bkt, next_bkt) { + if (data != NULL) + ret = search_one_bucket(h, keys[i], + sig[i], &data[i], cur_bkt); + else + ret = search_one_bucket(h, keys[i], + sig[i], NULL, cur_bkt); + if (ret != -1) { + positions[i] = ret; hits |= 1ULL << i; - positions[i] = key_idx - 1; - goto next_key; + break; } - sec_hitmask[i] &= ~(1 << (hit_index)); } - -next_key: - continue; } __hash_rw_reader_unlock(h); @@ -1308,27 +1790,30 @@ rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32 RETURN_IF_TRUE(((h == NULL) || (next == NULL)), -EINVAL); - const uint32_t total_entries = h->num_buckets * RTE_HASH_BUCKET_ENTRIES; - /* Out of bounds */ - if (*next >= total_entries) - return -ENOENT; + const uint32_t total_entries_main = h->num_buckets * + RTE_HASH_BUCKET_ENTRIES; + const uint32_t total_entries = total_entries_main << 1; + + /* Out of bounds of all buckets (both main table and ext table) */ + if (*next >= total_entries_main) + goto extend_table; /* Calculate bucket and index of current iterator */ bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES; idx = *next % RTE_HASH_BUCKET_ENTRIES; /* If current position is empty, go to the next one */ - while (h->buckets[bucket_idx].key_idx[idx] == EMPTY_SLOT) { + while ((position = __atomic_load_n(&h->buckets[bucket_idx].key_idx[idx], + __ATOMIC_ACQUIRE)) == EMPTY_SLOT) { (*next)++; /* End of table */ - if (*next == total_entries) - return -ENOENT; + if (*next == total_entries_main) + goto extend_table; bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES; idx = *next % RTE_HASH_BUCKET_ENTRIES; } + __hash_rw_reader_lock(h); - /* Get position of entry in key table */ - position = h->buckets[bucket_idx].key_idx[idx]; next_key = (struct rte_hash_key *) ((char *)h->key_store + position * h->key_entry_size); /* Return key and data */ @@ -1341,4 +1826,34 @@ rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32 (*next)++; return position - 1; + +/* Begin to iterate extendable buckets */ +extend_table: + /* Out of total bound or if ext bucket feature is not enabled */ + if (*next >= total_entries || !h->ext_table_support) + return -ENOENT; + + bucket_idx = (*next - total_entries_main) / RTE_HASH_BUCKET_ENTRIES; + idx = (*next - total_entries_main) % RTE_HASH_BUCKET_ENTRIES; + + while ((position = h->buckets_ext[bucket_idx].key_idx[idx]) == EMPTY_SLOT) { + (*next)++; + if (*next == total_entries) + return -ENOENT; + bucket_idx = (*next - total_entries_main) / + RTE_HASH_BUCKET_ENTRIES; + idx = (*next - total_entries_main) % RTE_HASH_BUCKET_ENTRIES; + } + __hash_rw_reader_lock(h); + next_key = (struct rte_hash_key *) ((char *)h->key_store + + position * h->key_entry_size); + /* Return key and data */ + *key = next_key->key; + *data = next_key->pdata; + + __hash_rw_reader_unlock(h); + + /* Increment iterator */ + (*next)++; + return position - 1; } diff --git a/lib/librte_hash/rte_cuckoo_hash.h b/lib/librte_hash/rte_cuckoo_hash.h index b43f467d..5dfbbc48 100644 --- a/lib/librte_hash/rte_cuckoo_hash.h +++ b/lib/librte_hash/rte_cuckoo_hash.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright(c) 2016 Intel Corporation + * Copyright(c) 2018 Arm Limited */ /* rte_cuckoo_hash.h @@ -104,8 +105,6 @@ const rte_hash_cmp_eq_t cmp_jump_table[NUM_KEY_CMP_CASES] = { #define LCORE_CACHE_SIZE 64 -#define RTE_HASH_MAX_PUSHES 100 - #define RTE_HASH_BFS_QUEUE_MAX_LEN 1000 #define RTE_XABORT_CUCKOO_PATH_INVALIDED 0x4 @@ -125,25 +124,24 @@ struct rte_hash_key { }; /* Variable key size */ char key[0]; -} __attribute__((aligned(KEY_ALIGNMENT))); +}; /* All different signature compare functions */ enum rte_hash_sig_compare_function { RTE_HASH_COMPARE_SCALAR = 0, RTE_HASH_COMPARE_SSE, - RTE_HASH_COMPARE_AVX2, RTE_HASH_COMPARE_NUM }; /** Bucket structure */ struct rte_hash_bucket { - hash_sig_t sig_current[RTE_HASH_BUCKET_ENTRIES]; + uint16_t sig_current[RTE_HASH_BUCKET_ENTRIES]; uint32_t key_idx[RTE_HASH_BUCKET_ENTRIES]; - hash_sig_t sig_alt[RTE_HASH_BUCKET_ENTRIES]; - uint8_t flag[RTE_HASH_BUCKET_ENTRIES]; + + void *next; } __rte_cache_aligned; /** A hash table structure. */ @@ -164,10 +162,23 @@ struct rte_hash { /**< Length of hash key. */ uint8_t hw_trans_mem_support; /**< If hardware transactional memory is used. */ - uint8_t multi_writer_support; - /**< If multi-writer support is enabled. */ + uint8_t use_local_cache; + /**< If multi-writer support is enabled, use local cache + * to allocate key-store slots. + */ uint8_t readwrite_concur_support; /**< If read-write concurrency support is enabled */ + uint8_t ext_table_support; /**< Enable extendable bucket table */ + uint8_t no_free_on_del; + /**< If key index should be freed on calling rte_hash_del_xxx APIs. + * If this is set, rte_hash_free_key_with_position must be called to + * free the key index associated with the deleted entry. + * This flag is enabled by default. + */ + uint8_t readwrite_concur_lf_support; + /**< If read-write concurrency lock free support is enabled */ + uint8_t writer_takes_lock; + /**< Indicates if the writer threads need to take lock */ rte_hash_function hash_func; /**< Function used to calculate hash. */ uint32_t hash_func_init_val; /**< Init value used by hash_func. */ rte_hash_cmp_eq_t rte_hash_custom_cmp_eq; @@ -186,10 +197,15 @@ struct rte_hash { * to the key table. */ rte_rwlock_t *readwrite_lock; /**< Read-write lock thread-safety. */ + struct rte_hash_bucket *buckets_ext; /**< Extra buckets array */ + struct rte_ring *free_ext_bkts; /**< Ring of indexes of free buckets */ + uint32_t *tbl_chng_cnt; + /**< Indicates if the hash table changed from last read. */ } __rte_cache_aligned; struct queue_node { struct rte_hash_bucket *bkt; /* Current bucket on the bfs search */ + uint32_t cur_bkt_idx; struct queue_node *prev; /* Parent(bucket) in search path */ int prev_slot; /* Parent(slot) in search path */ diff --git a/lib/librte_hash/rte_hash.h b/lib/librte_hash/rte_hash.h index 9e7d9315..c93d1a13 100644 --- a/lib/librte_hash/rte_hash.h +++ b/lib/librte_hash/rte_hash.h @@ -14,6 +14,8 @@ #include <stdint.h> #include <stddef.h> +#include <rte_compat.h> + #ifdef __cplusplus extern "C" { #endif @@ -37,7 +39,27 @@ extern "C" { /** Flag to support reader writer concurrency */ #define RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY 0x04 -/** Signature of key that is stored internally. */ +/** Flag to indicate the extendabe bucket table feature should be used */ +#define RTE_HASH_EXTRA_FLAGS_EXT_TABLE 0x08 + +/** Flag to disable freeing of key index on hash delete. + * Refer to rte_hash_del_xxx APIs for more details. + * This is enabled by default when RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF + * is enabled. + */ +#define RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL 0x10 + +/** Flag to support lock free reader writer concurrency. Both single writer + * and multi writer use cases are supported. + * Currently, extendable bucket table feature is not supported with + * this feature. + */ +#define RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF 0x20 + +/** + * The type of hash value of a key. + * It should be a value of at least 32bit with fully random pattern. + */ typedef uint32_t hash_sig_t; /** Type of function that can be used for calculating the hash value. */ @@ -119,7 +141,12 @@ void rte_hash_free(struct rte_hash *h); /** - * Reset all hash structure, by zeroing all entries + * Reset all hash structure, by zeroing all entries. + * When RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF is enabled, + * it is application's responsibility to make sure that + * none of the readers are referencing the hash table + * while calling this API. + * * @param h * Hash table to reset */ @@ -143,6 +170,11 @@ rte_hash_count(const struct rte_hash *h); * and should only be called from one thread by default. * Thread safety can be enabled by setting flag during * table creation. + * If the key exists already in the table, this API updates its value + * with 'data' passed in this API. It is the responsibility of + * the application to manage any memory associated with the old value. + * The readers might still be using the old value even after this API + * has returned. * * @param h * Hash table to add the key to. @@ -165,6 +197,11 @@ rte_hash_add_key_data(const struct rte_hash *h, const void *key, void *data); * and should only be called from one thread by default. * Thread safety can be enabled by setting flag during * table creation. + * If the key exists already in the table, this API updates its value + * with 'data' passed in this API. It is the responsibility of + * the application to manage any memory associated with the old value. + * The readers might still be using the old value even after this API + * has returned. * * @param h * Hash table to add the key to. @@ -230,6 +267,14 @@ rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, hash_sig_t * and should only be called from one thread by default. * Thread safety can be enabled by setting flag during * table creation. + * If RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL or + * RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF is enabled, + * the key index returned by rte_hash_add_key_xxx APIs will not be + * freed by this API. rte_hash_free_key_with_position API must be called + * additionally to free the index associated with the key. + * rte_hash_free_key_with_position API should be called after all + * the readers have stopped referencing the entry corresponding to + * this key. RCU mechanisms could be used to determine such a state. * * @param h * Hash table to remove the key from. @@ -251,6 +296,14 @@ rte_hash_del_key(const struct rte_hash *h, const void *key); * and should only be called from one thread by default. * Thread safety can be enabled by setting flag during * table creation. + * If RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL or + * RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF is enabled, + * the key index returned by rte_hash_add_key_xxx APIs will not be + * freed by this API. rte_hash_free_key_with_position API must be called + * additionally to free the index associated with the key. + * rte_hash_free_key_with_position API should be called after all + * the readers have stopped referencing the entry corresponding to + * this key. RCU mechanisms could be used to determine such a state. * * @param h * Hash table to remove the key from. @@ -290,6 +343,34 @@ rte_hash_get_key_with_position(const struct rte_hash *h, const int32_t position, void **key); /** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Free a hash key in the hash table given the position + * of the key. This operation is not multi-thread safe and should + * only be called from one thread by default. Thread safety + * can be enabled by setting flag during table creation. + * If RTE_HASH_EXTRA_FLAGS_NO_FREE_ON_DEL or + * RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF is enabled, + * the key index returned by rte_hash_del_key_xxx APIs must be freed + * using this API. This API should be called after all the readers + * have stopped referencing the entry corresponding to this key. + * RCU mechanisms could be used to determine such a state. + * This API does not validate if the key is already freed. + * + * @param h + * Hash table to free the key from. + * @param position + * Position returned when the key was deleted. + * @return + * - 0 if freed successfully + * - -EINVAL if the parameters are invalid. + */ +int __rte_experimental +rte_hash_free_key_with_position(const struct rte_hash *h, + const int32_t position); + +/** * Find a key-value pair in the hash table. * This operation is multi-thread safe with regarding to other lookup threads. * Read-write concurrency can be enabled by setting flag during diff --git a/lib/librte_hash/rte_hash_version.map b/lib/librte_hash/rte_hash_version.map index e216ac8e..734ae28b 100644 --- a/lib/librte_hash/rte_hash_version.map +++ b/lib/librte_hash/rte_hash_version.map @@ -53,3 +53,10 @@ DPDK_18.08 { rte_hash_count; } DPDK_16.07; + +EXPERIMENTAL { + global: + + rte_hash_free_key_with_position; + +}; |