From adeaf16960f8895eb246b388553a49d2ade80dc4 Mon Sep 17 00:00:00 2001
From: Damjan Marion
Date: Tue, 14 Mar 2023 18:04:45 +0000
Subject: crypto-native: 256-bit AES CBC support

Used on Intel client CPUs which support the VAES instruction set
without AVX512

Type: improvement
Change-Id: I5f816a1ea9f89a8d298d2c0f38d8d7c06f414ba0
Signed-off-by: Damjan Marion
---
 src/plugins/crypto_native/CMakeLists.txt  |   3 +
 src/plugins/crypto_native/aes.h           |  28 ++-
 src/plugins/crypto_native/aes_cbc.c       | 357 +++++++++++++++++++++++++-----
 src/plugins/crypto_native/aes_gcm.c       |  42 ++--
 src/plugins/crypto_native/crypto_native.h |   3 +-
 src/plugins/crypto_native/ghash.h         |   2 +-
 src/plugins/crypto_native/main.c          |   4 +
 src/vppinfra/vector_avx2.h                |  16 ++
 8 files changed, 369 insertions(+), 86 deletions(-)

diff --git a/src/plugins/crypto_native/CMakeLists.txt b/src/plugins/crypto_native/CMakeLists.txt
index 688a8c95baf..ba6f6cbcc28 100644
--- a/src/plugins/crypto_native/CMakeLists.txt
+++ b/src/plugins/crypto_native/CMakeLists.txt
@@ -20,6 +20,9 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
   if(compiler_flag_march_icelake_client AND compiler_flag_mprefer_vector_width_512)
     list(APPEND VARIANTS "icl\;-march=icelake-client -mprefer-vector-width=512")
   endif()
+  if(compiler_flag_march_alderlake)
+    list(APPEND VARIANTS "adl\;-march=alderlake -mprefer-vector-width=256")
+  endif()
   set (COMPILE_FILES aes_cbc.c aes_gcm.c)
   set (COMPILE_OPTS -Wall -fno-common -maes)
 endif()
diff --git a/src/plugins/crypto_native/aes.h b/src/plugins/crypto_native/aes.h
index e0d832276e0..40fe681e2b7 100644
--- a/src/plugins/crypto_native/aes.h
+++ b/src/plugins/crypto_native/aes.h
@@ -48,7 +48,7 @@ aes_enc_round (u8x16 a, u8x16 k)
 #endif
 }
 
-#if defined (__VAES__)
+#if defined(__VAES__) && defined(__AVX512F__)
 static_always_inline u8x64
 aes_enc_round_x4 (u8x64 a, u8x64 k)
 {
@@ -74,6 +74,32 @@ aes_dec_last_round_x4 (u8x64 a, u8x64 k)
 }
 #endif
 
+#ifdef __VAES__
+static_always_inline u8x32
+aes_enc_round_x2 (u8x32 a, u8x32 k)
+{
+  return (u8x32) _mm256_aesenc_epi128 ((__m256i) a, (__m256i) k);
+}
+
+static_always_inline u8x32
+aes_enc_last_round_x2 (u8x32 a, u8x32 k)
+{
+  return (u8x32) _mm256_aesenclast_epi128 ((__m256i) a, (__m256i) k);
+}
+
+static_always_inline u8x32
+aes_dec_round_x2 (u8x32 a, u8x32 k)
+{
+  return (u8x32) _mm256_aesdec_epi128 ((__m256i) a, (__m256i) k);
+}
+
+static_always_inline u8x32
+aes_dec_last_round_x2 (u8x32 a, u8x32 k)
+{
+  return (u8x32) _mm256_aesdeclast_epi128 ((__m256i) a, (__m256i) k);
+}
+#endif
+
 static_always_inline u8x16
 aes_enc_last_round (u8x16 a, u8x16 k)
 {
diff --git a/src/plugins/crypto_native/aes_cbc.c b/src/plugins/crypto_native/aes_cbc.c
index 7896c8814b1..02d96b31c79 100644
--- a/src/plugins/crypto_native/aes_cbc.c
+++ b/src/plugins/crypto_native/aes_cbc.c
@@ -25,17 +25,35 @@
 #pragma GCC optimize ("O3")
 #endif
 
+#if defined(__VAES__) && defined(__AVX512F__)
+#define N 16
+#define u8xN u8x64
+#define u32xN u32x16
+#define u32xN_min_scalar u32x16_min_scalar
+#define u32xN_is_all_zero u32x16_is_all_zero
+#define u32xN_splat u32x16_splat
+#elif defined(__VAES__)
+#define N 8
+#define u8xN u8x32
+#define u32xN u32x8
+#define u32xN_min_scalar u32x8_min_scalar
+#define u32xN_is_all_zero u32x8_is_all_zero
+#define u32xN_splat u32x8_splat
+#else
+#define N 4
+#define u8xN u8x16
+#define u32xN u32x4
+#define u32xN_min_scalar u32x4_min_scalar
+#define u32xN_is_all_zero u32x4_is_all_zero
+#define u32xN_splat u32x4_splat
+#endif
+
 typedef struct
 {
   u8x16 encrypt_key[15];
-#if __VAES__
- 
u8x64 decrypt_key[15]; -#else - u8x16 decrypt_key[15]; -#endif + u8xN decrypt_key[15]; } aes_cbc_key_data_t; - static_always_inline void __clib_unused aes_cbc_dec (u8x16 * k, u8x16u * src, u8x16u * dst, u8x16u * iv, int count, int rounds) @@ -119,7 +137,7 @@ aes_cbc_dec (u8x16 * k, u8x16u * src, u8x16u * dst, u8x16u * iv, int count, } #if __x86_64__ -#ifdef __VAES__ +#if defined(__VAES__) && defined(__AVX512F__) static_always_inline u8x64 aes_block_load_x4 (u8 * src[], int i) @@ -142,14 +160,13 @@ aes_block_store_x4 (u8 * dst[], int i, u8x64 r) } static_always_inline u8x64 -aes_cbc_dec_permute (u8x64 a, u8x64 b) +aes4_cbc_dec_permute (u8x64 a, u8x64 b) { - __m512i perm = { 6, 7, 8, 9, 10, 11, 12, 13 }; - return (u8x64) _mm512_permutex2var_epi64 ((__m512i) a, perm, (__m512i) b); + return (u8x64) u64x8_shuffle2 (a, b, 6, 7, 8, 9, 10, 11, 12, 13); } static_always_inline void -vaes_cbc_dec (u8x64 *k, u8x64u *src, u8x64u *dst, u8x16u *iv, int count, +aes4_cbc_dec (u8x64 *k, u8x64u *src, u8x64u *dst, u8x16u *iv, int count, aes_key_size_t rounds) { u8x64 f, r[4], c[4] = { }; @@ -184,10 +201,10 @@ vaes_cbc_dec (u8x64 *k, u8x64u *src, u8x64u *dst, u8x16u *iv, int count, r[2] = aes_dec_last_round_x4 (r[2], k[i]); r[3] = aes_dec_last_round_x4 (r[3], k[i]); - dst[0] = r[0] ^= aes_cbc_dec_permute (f, c[0]); - dst[1] = r[1] ^= aes_cbc_dec_permute (c[0], c[1]); - dst[2] = r[2] ^= aes_cbc_dec_permute (c[1], c[2]); - dst[3] = r[3] ^= aes_cbc_dec_permute (c[2], c[3]); + dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]); + dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]); + dst[3] = r[3] ^= aes4_cbc_dec_permute (c[2], c[3]); f = c[3]; n_blocks -= 16; @@ -195,40 +212,248 @@ vaes_cbc_dec (u8x64 *k, u8x64u *src, u8x64u *dst, u8x16u *iv, int count, dst += 4; } - while (n_blocks > 0) + if (n_blocks >= 12) + { + c[0] = src[0]; + c[1] = src[1]; + c[2] = src[2]; + + r[0] = c[0] ^ k[0]; + r[1] = c[1] ^ k[0]; + r[2] = c[2] ^ k[0]; + + for (i = 1; i < rounds; i++) + { + r[0] = aes_dec_round_x4 (r[0], k[i]); + r[1] = aes_dec_round_x4 (r[1], k[i]); + r[2] = aes_dec_round_x4 (r[2], k[i]); + } + + r[0] = aes_dec_last_round_x4 (r[0], k[i]); + r[1] = aes_dec_last_round_x4 (r[1], k[i]); + r[2] = aes_dec_last_round_x4 (r[2], k[i]); + + dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]); + dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]); + f = c[2]; + + n_blocks -= 12; + src += 3; + dst += 3; + } + else if (n_blocks >= 8) + { + c[0] = src[0]; + c[1] = src[1]; + + r[0] = c[0] ^ k[0]; + r[1] = c[1] ^ k[0]; + + for (i = 1; i < rounds; i++) + { + r[0] = aes_dec_round_x4 (r[0], k[i]); + r[1] = aes_dec_round_x4 (r[1], k[i]); + } + + r[0] = aes_dec_last_round_x4 (r[0], k[i]); + r[1] = aes_dec_last_round_x4 (r[1], k[i]); + + dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]); + f = c[1]; + + n_blocks -= 8; + src += 2; + dst += 2; + } + else if (n_blocks >= 4) + { + c[0] = src[0]; + + r[0] = c[0] ^ k[0]; + + for (i = 1; i < rounds; i++) + { + r[0] = aes_dec_round_x4 (r[0], k[i]); + } + + r[0] = aes_dec_last_round_x4 (r[0], k[i]); + + dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); + f = c[0]; + + n_blocks -= 4; + src += 1; + dst += 1; + } + + if (n_blocks > 0) { m = (1 << (n_blocks * 2)) - 1; c[0] = (u8x64) _mm512_mask_loadu_epi64 ((__m512i) c[0], m, (__m512i *) src); - f = aes_cbc_dec_permute (f, c[0]); + f = aes4_cbc_dec_permute (f, c[0]); r[0] = c[0] ^ k[0]; for (i = 1; i < 
rounds; i++) r[0] = aes_dec_round_x4 (r[0], k[i]); r[0] = aes_dec_last_round_x4 (r[0], k[i]); _mm512_mask_storeu_epi64 ((__m512i *) dst, m, (__m512i) (r[0] ^ f)); - f = c[0]; + } +} +#elif defined(__VAES__) + +static_always_inline u8x32 +aes_block_load_x2 (u8 *src[], int i) +{ + u8x32 r = {}; + r = u8x32_insert_lo (r, aes_block_load (src[0] + i)); + r = u8x32_insert_hi (r, aes_block_load (src[1] + i)); + return r; +} + +static_always_inline void +aes_block_store_x2 (u8 *dst[], int i, u8x32 r) +{ + aes_block_store (dst[0] + i, u8x32_extract_lo (r)); + aes_block_store (dst[1] + i, u8x32_extract_hi (r)); +} + +static_always_inline u8x32 +aes2_cbc_dec_permute (u8x32 a, u8x32 b) +{ + return (u8x32) u64x4_shuffle2 ((u64x4) a, (u64x4) b, 2, 3, 4, 5); +} + +static_always_inline void +aes2_cbc_dec (u8x32 *k, u8x32u *src, u8x32u *dst, u8x16u *iv, int count, + aes_key_size_t rounds) +{ + u8x32 f = {}, r[4], c[4] = {}; + int i, n_blocks = count >> 4; + + f = u8x32_insert_hi (f, *iv); + + while (n_blocks >= 8) + { + c[0] = src[0]; + c[1] = src[1]; + c[2] = src[2]; + c[3] = src[3]; + + r[0] = c[0] ^ k[0]; + r[1] = c[1] ^ k[0]; + r[2] = c[2] ^ k[0]; + r[3] = c[3] ^ k[0]; + + for (i = 1; i < rounds; i++) + { + r[0] = aes_dec_round_x2 (r[0], k[i]); + r[1] = aes_dec_round_x2 (r[1], k[i]); + r[2] = aes_dec_round_x2 (r[2], k[i]); + r[3] = aes_dec_round_x2 (r[3], k[i]); + } + + r[0] = aes_dec_last_round_x2 (r[0], k[i]); + r[1] = aes_dec_last_round_x2 (r[1], k[i]); + r[2] = aes_dec_last_round_x2 (r[2], k[i]); + r[3] = aes_dec_last_round_x2 (r[3], k[i]); + + dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]); + dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]); + dst[3] = r[3] ^= aes2_cbc_dec_permute (c[2], c[3]); + f = c[3]; + + n_blocks -= 8; + src += 4; + dst += 4; + } + + if (n_blocks >= 6) + { + c[0] = src[0]; + c[1] = src[1]; + c[2] = src[2]; + + r[0] = c[0] ^ k[0]; + r[1] = c[1] ^ k[0]; + r[2] = c[2] ^ k[0]; + + for (i = 1; i < rounds; i++) + { + r[0] = aes_dec_round_x2 (r[0], k[i]); + r[1] = aes_dec_round_x2 (r[1], k[i]); + r[2] = aes_dec_round_x2 (r[2], k[i]); + } + + r[0] = aes_dec_last_round_x2 (r[0], k[i]); + r[1] = aes_dec_last_round_x2 (r[1], k[i]); + r[2] = aes_dec_last_round_x2 (r[2], k[i]); + + dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]); + dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]); + f = c[2]; + + n_blocks -= 6; + src += 3; + dst += 3; + } + else if (n_blocks >= 4) + { + c[0] = src[0]; + c[1] = src[1]; + + r[0] = c[0] ^ k[0]; + r[1] = c[1] ^ k[0]; + + for (i = 1; i < rounds; i++) + { + r[0] = aes_dec_round_x2 (r[0], k[i]); + r[1] = aes_dec_round_x2 (r[1], k[i]); + } + + r[0] = aes_dec_last_round_x2 (r[0], k[i]); + r[1] = aes_dec_last_round_x2 (r[1], k[i]); + + dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]); + f = c[1]; + n_blocks -= 4; + src += 2; + dst += 2; + } + else if (n_blocks >= 2) + { + c[0] = src[0]; + r[0] = c[0] ^ k[0]; + + for (i = 1; i < rounds; i++) + r[0] = aes_dec_round_x2 (r[0], k[i]); + + r[0] = aes_dec_last_round_x2 (r[0], k[i]); + dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); + f = c[0]; + + n_blocks -= 2; src += 1; dst += 1; } + + if (n_blocks > 0) + { + u8x16 rl = *(u8x16u *) src ^ u8x32_extract_lo (k[0]); + for (i = 1; i < rounds; i++) + rl = aes_dec_round (rl, u8x32_extract_lo (k[i])); + rl = aes_dec_last_round (rl, u8x32_extract_lo (k[i])); + *(u8x16 *) dst = rl ^ u8x32_extract_hi (f); + } } 
#endif #endif -#ifdef __VAES__ -#define N 16 -#define u32xN u32x16 -#define u32xN_min_scalar u32x16_min_scalar -#define u32xN_is_all_zero u32x16_is_all_zero -#define u32xN_splat u32x16_splat -#else -#define N 4 -#define u32xN u32x4 -#define u32xN_min_scalar u32x4_min_scalar -#define u32xN_is_all_zero u32x4_is_all_zero -#define u32xN_splat u32x4_splat -#endif - static_always_inline u32 aes_ops_enc_aes_cbc (vlib_main_t * vm, vnet_crypto_op_t * ops[], u32 n_ops, aes_key_size_t ks) @@ -242,14 +467,8 @@ aes_ops_enc_aes_cbc (vlib_main_t * vm, vnet_crypto_op_t * ops[], vnet_crypto_key_index_t key_index[N]; u8 *src[N] = { }; u8 *dst[N] = { }; -#if __VAES__ - u8x64 r[N / 4] = { }; - u8x64 k[15][N / 4] = { }; - u8x16 *kq, *rq = (u8x16 *) r; -#else - u8x16 r[N] = { }; - u8x16 k[15][N] = { }; -#endif + u8xN r[4] = {}; + u8xN k[15][4] = {}; for (i = 0; i < N; i++) key_index[i] = ~0; @@ -268,11 +487,7 @@ more: else { u8x16 t = aes_block_load (ops[0]->iv); -#if __VAES__ - rq[i] = t; -#else - r[i] = t; -#endif + ((u8x16 *) r)[i] = t; src[i] = ops[0]->src; dst[i] = ops[0]->dst; @@ -284,14 +499,7 @@ more: key_index[i] = ops[0]->key_index; kd = (aes_cbc_key_data_t *) cm->key_data[key_index[i]]; for (j = 0; j < rounds + 1; j++) - { -#if __VAES__ - kq = (u8x16 *) k[j]; - kq[i] = kd->encrypt_key[j]; -#else - k[j][i] = kd->encrypt_key[j]; -#endif - } + ((u8x16 *) k[j])[i] = kd->encrypt_key[j]; } ops[0]->status = VNET_CRYPTO_OP_STATUS_COMPLETED; n_left--; @@ -305,7 +513,7 @@ more: for (i = 0; i < count; i += 16) { -#ifdef __VAES__ +#if defined(__VAES__) && defined(__AVX512F__) r[0] = u8x64_xor3 (r[0], aes_block_load_x4 (src, i), k[0][0]); r[1] = u8x64_xor3 (r[1], aes_block_load_x4 (src + 4, i), k[0][1]); r[2] = u8x64_xor3 (r[2], aes_block_load_x4 (src + 8, i), k[0][2]); @@ -327,6 +535,28 @@ more: aes_block_store_x4 (dst + 4, i, r[1]); aes_block_store_x4 (dst + 8, i, r[2]); aes_block_store_x4 (dst + 12, i, r[3]); +#elif defined(__VAES__) + r[0] = u8x32_xor3 (r[0], aes_block_load_x2 (src, i), k[0][0]); + r[1] = u8x32_xor3 (r[1], aes_block_load_x2 (src + 2, i), k[0][1]); + r[2] = u8x32_xor3 (r[2], aes_block_load_x2 (src + 4, i), k[0][2]); + r[3] = u8x32_xor3 (r[3], aes_block_load_x2 (src + 6, i), k[0][3]); + + for (j = 1; j < rounds; j++) + { + r[0] = aes_enc_round_x2 (r[0], k[j][0]); + r[1] = aes_enc_round_x2 (r[1], k[j][1]); + r[2] = aes_enc_round_x2 (r[2], k[j][2]); + r[3] = aes_enc_round_x2 (r[3], k[j][3]); + } + r[0] = aes_enc_last_round_x2 (r[0], k[j][0]); + r[1] = aes_enc_last_round_x2 (r[1], k[j][1]); + r[2] = aes_enc_last_round_x2 (r[2], k[j][2]); + r[3] = aes_enc_last_round_x2 (r[3], k[j][3]); + + aes_block_store_x2 (dst, i, r[0]); + aes_block_store_x2 (dst + 2, i, r[1]); + aes_block_store_x2 (dst + 4, i, r[2]); + aes_block_store_x2 (dst + 6, i, r[3]); #else #if __x86_64__ r[0] = u8x16_xor3 (r[0], aes_block_load (src[0] + i), k[0][0]); @@ -406,8 +636,11 @@ aes_ops_dec_aes_cbc (vlib_main_t * vm, vnet_crypto_op_t * ops[], ASSERT (n_ops >= 1); decrypt: -#ifdef __VAES__ - vaes_cbc_dec (kd->decrypt_key, (u8x64u *) op->src, (u8x64u *) op->dst, +#if defined(__VAES__) && defined(__AVX512F__) + aes4_cbc_dec (kd->decrypt_key, (u8x64u *) op->src, (u8x64u *) op->dst, + (u8x16u *) op->iv, op->len, rounds); +#elif defined(__VAES__) + aes2_cbc_dec (kd->decrypt_key, (u8x32u *) op->src, (u8x32u *) op->dst, (u8x16u *) op->iv, op->len, rounds); #else aes_cbc_dec (kd->decrypt_key, (u8x16u *) op->src, (u8x16u *) op->dst, @@ -435,8 +668,10 @@ aes_cbc_key_exp (vnet_crypto_key_t * key, aes_key_size_t ks) aes_key_enc_to_dec (e, d, 
ks); for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++) { -#if __VAES__ - kd->decrypt_key[i] = (u8x64) _mm512_broadcast_i64x2 ((__m128i) d[i]); +#if defined(__VAES__) && defined(__AVX512F__) + kd->decrypt_key[i] = u8x64_splat_u8x16 (d[i]); +#elif defined(__VAES__) + kd->decrypt_key[i] = u8x32_splat_u8x16 (d[i]); #else kd->decrypt_key[i] = d[i]; #endif @@ -463,8 +698,10 @@ foreach_aes_cbc_handler_type; #include clib_error_t * -#ifdef __VAES__ -crypto_native_aes_cbc_init_icl (vlib_main_t * vm) +#if defined(__VAES__) && defined(__AVX512F__) +crypto_native_aes_cbc_init_icl (vlib_main_t *vm) +#elif defined(__VAES__) +crypto_native_aes_cbc_init_adl (vlib_main_t *vm) #elif __AVX512F__ crypto_native_aes_cbc_init_skx (vlib_main_t * vm) #elif __aarch64__ diff --git a/src/plugins/crypto_native/aes_gcm.c b/src/plugins/crypto_native/aes_gcm.c index dde8ab34ee6..c13665e3fb1 100644 --- a/src/plugins/crypto_native/aes_gcm.c +++ b/src/plugins/crypto_native/aes_gcm.c @@ -26,7 +26,7 @@ #pragma GCC optimize ("O3") #endif -#ifdef __VAES__ +#if defined(__VAES__) && defined(__AVX512F__) #define NUM_HI 32 #else #define NUM_HI 8 @@ -38,7 +38,7 @@ typedef struct const u8x16 Hi[NUM_HI]; /* extracted AES key */ const u8x16 Ke[15]; -#ifdef __VAES__ +#if defined(__VAES__) && defined(__AVX512F__) const u8x64 Ke4[15]; #endif } aes_gcm_key_data_t; @@ -63,7 +63,6 @@ typedef enum static const u32x4 ctr_inv_1 = { 0, 0, 0, 1 << 24 }; -#ifndef __VAES__ static_always_inline void aes_gcm_enc_first_round (u8x16 * r, aes_gcm_counter_t * ctr, u8x16 k, int n_blocks) @@ -107,7 +106,6 @@ aes_gcm_enc_last_round (u8x16 * r, u8x16 * d, u8x16 const *k, for (int i = 0; i < n_blocks; i++) d[i] ^= aes_enc_last_round (r[i], k[rounds]); } -#endif static_always_inline u8x16 aes_gcm_ghash_blocks (u8x16 T, aes_gcm_key_data_t * kd, @@ -163,11 +161,10 @@ aes_gcm_ghash (u8x16 T, aes_gcm_key_data_t * kd, u8x16u * in, u32 n_left) return T; } -#ifndef __VAES__ -static_always_inline u8x16 -aes_gcm_calc (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d, - aes_gcm_counter_t * ctr, u8x16u * inv, u8x16u * outv, - int rounds, int n, int last_block_bytes, aes_gcm_flags_t f) +static_always_inline __clib_unused u8x16 +aes_gcm_calc (u8x16 T, aes_gcm_key_data_t *kd, u8x16 *d, + aes_gcm_counter_t *ctr, u8x16u *inv, u8x16u *outv, int rounds, + int n, int last_block_bytes, aes_gcm_flags_t f) { u8x16 r[n]; ghash_data_t _gd = { }, *gd = &_gd; @@ -258,9 +255,9 @@ aes_gcm_calc (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d, return T; } -static_always_inline u8x16 -aes_gcm_calc_double (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d, - aes_gcm_counter_t * ctr, u8x16u * inv, u8x16u * outv, +static_always_inline __clib_unused u8x16 +aes_gcm_calc_double (u8x16 T, aes_gcm_key_data_t *kd, u8x16 *d, + aes_gcm_counter_t *ctr, u8x16u *inv, u8x16u *outv, int rounds, aes_gcm_flags_t f) { u8x16 r[4]; @@ -396,9 +393,9 @@ aes_gcm_calc_double (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d, return ghash_final (gd); } -static_always_inline u8x16 -aes_gcm_ghash_last (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d, - int n_blocks, int n_bytes) +static_always_inline __clib_unused u8x16 +aes_gcm_ghash_last (u8x16 T, aes_gcm_key_data_t *kd, u8x16 *d, int n_blocks, + int n_bytes) { ghash_data_t _gd, *gd = &_gd; u8x16 *Hi = (u8x16 *) kd->Hi + NUM_HI - n_blocks; @@ -417,9 +414,8 @@ aes_gcm_ghash_last (u8x16 T, aes_gcm_key_data_t * kd, u8x16 * d, ghash_reduce2 (gd); return ghash_final (gd); } -#endif -#ifdef __VAES__ +#if defined(__VAES__) && defined(__AVX512F__) static const u32x16 ctr_inv_1234 = { 0, 0, 0, 1 
<< 24, 0, 0, 0, 2 << 24, 0, 0, 0, 3 << 24, 0, 0, 0, 4 << 24, }; @@ -757,7 +753,7 @@ aes_gcm_enc (u8x16 T, aes_gcm_key_data_t * kd, aes_gcm_counter_t * ctr, if (n_left == 0) return T; -#if __VAES__ +#if defined(__VAES__) && defined(__AVX512F__) u8x64 d4[4]; if (n_left < 256) { @@ -939,7 +935,7 @@ aes_gcm_dec (u8x16 T, aes_gcm_key_data_t * kd, aes_gcm_counter_t * ctr, u8x16u * inv, u8x16u * outv, u32 n_left, int rounds) { aes_gcm_flags_t f = AES_GCM_F_WITH_GHASH | AES_GCM_F_DECRYPT; -#ifdef __VAES__ +#if defined(__VAES__) && defined(__AVX512F__) u8x64 d4[4] = { }; while (n_left >= 512) @@ -1045,7 +1041,7 @@ aes_gcm (u8x16u *in, u8x16u *out, u8x16u *addt, u8 *ivp, u8x16u *tag, Y0.as_u64x2[0] = *(u64u *) ivp; Y0.as_u32x4[2] = *(u32u *) (ivp + 8); Y0.as_u32x4 += ctr_inv_1; -#ifdef __VAES__ +#if defined(__VAES__) && defined(__AVX512F__) ctr->Y4 = u32x16_splat_u32x4 (Y0.as_u32x4) + ctr_inv_1234; #else ctr->Y = Y0.as_u32x4 + ctr_inv_1; @@ -1177,7 +1173,7 @@ aes_gcm_key_exp (vnet_crypto_key_t * key, aes_key_size_t ks) H = aes_encrypt_block (u8x16_splat (0), kd->Ke, ks); H = u8x16_reflect (H); ghash_precompute (H, (u8x16 *) kd->Hi, NUM_HI); -#ifdef __VAES__ +#if defined(__VAES__) && defined(__AVX512F__) u8x64 *Ke4 = (u8x64 *) kd->Ke4; for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++) Ke4[i] = u8x64_splat_u8x16 (kd->Ke[i]); @@ -1201,8 +1197,8 @@ foreach_aes_gcm_handler_type; #undef _ clib_error_t * -#ifdef __VAES__ -crypto_native_aes_gcm_init_icl (vlib_main_t * vm) +#if defined(__VAES__) && defined(__AVX512F__) +crypto_native_aes_gcm_init_icl (vlib_main_t *vm) #elif __AVX512F__ crypto_native_aes_gcm_init_skx (vlib_main_t * vm) #elif __AVX2__ diff --git a/src/plugins/crypto_native/crypto_native.h b/src/plugins/crypto_native/crypto_native.h index 3bad14ea2df..623070c19b7 100644 --- a/src/plugins/crypto_native/crypto_native.h +++ b/src/plugins/crypto_native/crypto_native.h @@ -29,7 +29,8 @@ typedef struct extern crypto_native_main_t crypto_native_main; -#define foreach_crypto_native_march_variant _(slm) _(hsw) _(skx) _(icl) _(neon) +#define foreach_crypto_native_march_variant \ + _ (slm) _ (hsw) _ (skx) _ (icl) _ (adl) _ (neon) #define _(v) \ clib_error_t __clib_weak *crypto_native_aes_cbc_init_##v (vlib_main_t * vm); \ diff --git a/src/plugins/crypto_native/ghash.h b/src/plugins/crypto_native/ghash.h index f389d11cfe7..5f619cfa129 100644 --- a/src/plugins/crypto_native/ghash.h +++ b/src/plugins/crypto_native/ghash.h @@ -257,7 +257,7 @@ ghash_mul (u8x16 a, u8x16 b) return ghash_final (gd); } -#ifdef __VPCLMULQDQ__ +#if defined(__VPCLMULQDQ__) && defined(__AVX512F__) static const u8x64 ghash4_poly2 = { 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00, diff --git a/src/plugins/crypto_native/main.c b/src/plugins/crypto_native/main.c index 2eedcd7357a..718356c745f 100644 --- a/src/plugins/crypto_native/main.c +++ b/src/plugins/crypto_native/main.c @@ -78,6 +78,8 @@ crypto_native_init (vlib_main_t * vm) else if (crypto_native_aes_cbc_init_icl && clib_cpu_supports_vaes () && clib_cpu_supports_avx512f ()) error = crypto_native_aes_cbc_init_icl (vm); + else if (crypto_native_aes_cbc_init_adl && clib_cpu_supports_vaes ()) + error = crypto_native_aes_cbc_init_adl (vm); else if (crypto_native_aes_cbc_init_skx && clib_cpu_supports_avx512f ()) error = crypto_native_aes_cbc_init_skx (vm); else if (crypto_native_aes_cbc_init_hsw && clib_cpu_supports_avx2 ()) @@ -101,6 +103,8 @@ crypto_native_init (vlib_main_t * vm) if (crypto_native_aes_gcm_init_icl && clib_cpu_supports_vaes () && clib_cpu_supports_avx512f ()) error = 
crypto_native_aes_gcm_init_icl (vm);
+  else if (crypto_native_aes_gcm_init_adl && clib_cpu_supports_vaes ())
+    error = crypto_native_aes_gcm_init_adl (vm);
   else if (crypto_native_aes_gcm_init_skx && clib_cpu_supports_avx512f ())
     error = crypto_native_aes_gcm_init_skx (vm);
   else if (crypto_native_aes_gcm_init_hsw && clib_cpu_supports_avx2 ())
diff --git a/src/vppinfra/vector_avx2.h b/src/vppinfra/vector_avx2.h
index f5c09a53f96..80c2e39bdfc 100644
--- a/src/vppinfra/vector_avx2.h
+++ b/src/vppinfra/vector_avx2.h
@@ -213,6 +213,16 @@ u32x8_hxor (u32x8 v)
   return v4[0];
 }
 
+static_always_inline u8x32
+u8x32_xor3 (u8x32 a, u8x32 b, u8x32 c)
+{
+#if __AVX512F__
+  return (u8x32) _mm256_ternarylogic_epi32 ((__m256i) a, (__m256i) b,
+					    (__m256i) c, 0x96);
+#endif
+  return a ^ b ^ c;
+}
+
 static_always_inline u16x16
 u16x16_mask_last (u16x16 v, u8 n_last)
 {
@@ -391,6 +401,12 @@ u64x4_transpose (u64x4 a[8])
   a[3] = u64x4_permute_lanes (r[1], r[3], 0x31);
 }
 
+static_always_inline u8x32
+u8x32_splat_u8x16 (u8x16 a)
+{
+  return (u8x32) _mm256_broadcastsi128_si256 ((__m128i) a);
+}
+
 #endif /* included_vector_avx2_h */
 
 /*
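
A note on the 256-bit variant added by this change: Alder Lake class client
CPUs expose VAES on 256-bit YMM registers but not AVX-512, which is why the
new "adl" variant is built with -march=alderlake -mprefer-vector-width=256
and why aes_cbc.c now parametrizes the batch size and vector type (N / u8xN)
per register width. In the 256-bit CBC decrypt path each YMM register carries
two 16-byte blocks, so the chaining value needed after the AES rounds is the
previous ciphertext block per lane: the high lane of the preceding register
for lane 0 and the low lane of the same register for lane 1. That is what
aes2_cbc_dec_permute builds with u64x4_shuffle2 (a, b, 2, 3, 4, 5). The
sketch below illustrates only that lane selection, using raw AVX2 intrinsics
rather than the vppinfra wrappers; it is a standalone illustration, not code
from the tree, and the file name and block tags are invented.

/* cbc_permute_demo.c (hypothetical) - compile with: gcc -O2 -mavx2 cbc_permute_demo.c */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  /* four fake 16-byte ciphertext blocks, every byte set to the block index */
  uint8_t blocks[4][16];
  for (int i = 0; i < 4; i++)
    memset (blocks[i], i, sizeof (blocks[i]));

  __m256i c0 = _mm256_loadu_si256 ((const __m256i *) blocks[0]); /* blocks 0,1 */
  __m256i c1 = _mm256_loadu_si256 ((const __m256i *) blocks[2]); /* blocks 2,3 */

  /* previous-ciphertext vector for c1 is { last block of c0, first block of c1 },
     i.e. blocks 1 and 2; imm8 0x21 puts a's high 128-bit lane into the low lane
     of the result and b's low lane into the high lane, which is the same
     selection as u64x4_shuffle2 (a, b, 2, 3, 4, 5) in vppinfra */
  __m256i prev = _mm256_permute2x128_si256 (c0, c1, 0x21);

  uint8_t out[32];
  _mm256_storeu_si256 ((__m256i *) out, prev);
  /* expected output: low lane holds block 1, high lane holds block 2 */
  printf ("low lane holds block %u, high lane holds block %u\n",
	  (unsigned) out[0], (unsigned) out[16]);
  return 0;
}

In the actual aes2_cbc_dec loop the permuted vector is XORed into the result
of the last AES round and the current ciphertext register then becomes the
chaining input (f) for the next iteration, so the serial CBC dependency stays
on the ciphertext rather than on the decrypted output.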