From fb1ccc7c369d879a20567df609d04cfdcec5f8c1 Mon Sep 17 00:00:00 2001 From: jaszha03 Date: Wed, 26 Jun 2019 17:06:59 -0500 Subject: vppinfra: refactor clib_spinlock_t to use compare and swap Tested performance of a CAS implementation (using __atomic_compare_exchange) against a TAS implementation (using __atomic_exchange) using test_spinlock.c and found some performance improvement. Generated assembly for CAS and TAS implementations show that TAS always executes with a load-store dependency, but CAS moves a branch condition between the load and store so that only a load occurs when the lock is free. Benchmarking: The results below are the cycle counts from test_spinlock.c, configured so that for 10000 iterations, 12 threads on separate cores are spawned, each of which increments a global counter 10000 times in each iteration. For A72, 8 threads are spawned in each test. x86 Xeon TAS: 7.333e8, 7.605e8, 7.535e8, 7.485e8, 7.321e8 x86 Xeon CAS: 5.842e8, 5.433e8, 5.389e8, 5.983e8, 5.552e8 Aarch64 ThX2* TAS: 9.852e7, 10.209e7, 9.190e7, 9.600e7, 9.224e7 Aarch64 ThX2* CAS: 7.640e7, 7.486e7, 7.425e7, 7.269e7, 7.534e7 A72 TAS: 7.289e6, 6.963e6, 7.208e6, 6.976e6, 7.200e6 A72 CAS: 1.695e6, 1.608e6, 1.600e6, 1.634e6, 1.746e6 *ThunderX2 used additional gcc options "-march=armv8.1-a+crc+crypto+lse" Type: refactor Change-Id: Ic5cd97991804f6b012707fad1a5d1a6edb96cd3d Signed-off-by: Jason Zhang Reviewed-by: Honnappa Nagarahalli Reviewed-by: Lijian Zhang --- src/vppinfra/lock.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'src/vppinfra/lock.h') diff --git a/src/vppinfra/lock.h b/src/vppinfra/lock.h index cc6a7f086b1..c1f0b87dfc6 100644 --- a/src/vppinfra/lock.h +++ b/src/vppinfra/lock.h @@ -17,6 +17,7 @@ #define included_clib_lock_h #include +#include #if __x86_64__ #define CLIB_PAUSE() __builtin_ia32_pause () @@ -76,8 +77,15 @@ clib_spinlock_free (clib_spinlock_t * p) static_always_inline void clib_spinlock_lock (clib_spinlock_t * p) { - while (clib_atomic_test_and_set (&(*p)->lock)) - CLIB_PAUSE (); + u32 free = 0; + while (!clib_atomic_cmp_and_swap_acq_relax_n (&(*p)->lock, &free, 1, 0)) + { + /* atomic load limits number of compare_exchange executions */ + while (clib_atomic_load_relax_n (&(*p)->lock)) + CLIB_PAUSE (); + /* on failure, compare_exchange writes (*p)->lock into free */ + free = 0; + } CLIB_LOCK_DBG (p); } -- cgit 1.2.3-korg