/* SPDX-License-Identifier: Apache-2.0 * Copyright(c) 2023 Cisco Systems, Inc. */ #ifndef __crypto_aes_cbc_h__ #define __crypto_aes_cbc_h__ #include #include #include typedef struct { const u8x16 encrypt_key[15]; const u8x16 decrypt_key[15]; } aes_cbc_key_data_t; static_always_inline void clib_aes_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *src, uword len, const u8 *iv, aes_key_size_t ks, u8 *dst) { int rounds = AES_KEY_ROUNDS (ks); u8x16 r, *k = (u8x16 *) kd->encrypt_key; r = *(u8x16u *) iv; for (int i = 0; i < len; i += 16) { int j; r = u8x16_xor3 (r, *(u8x16u *) (src + i), k[0]); for (j = 1; j < rounds; j++) r = aes_enc_round_x1 (r, k[j]); r = aes_enc_last_round_x1 (r, k[rounds]); *(u8x16u *) (dst + i) = r; } } static_always_inline void clib_aes128_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext, uword len, const u8 *iv, u8 *ciphertext) { clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_128, ciphertext); } static_always_inline void clib_aes192_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext, uword len, const u8 *iv, u8 *ciphertext) { clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_192, ciphertext); } static_always_inline void clib_aes256_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext, uword len, const u8 *iv, u8 *ciphertext) { clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_256, ciphertext); } static_always_inline void __clib_unused aes_cbc_dec (const u8x16 *k, u8x16u *src, u8x16u *dst, u8x16u *iv, int count, int rounds) { u8x16 r[4], c[4], f; f = iv[0]; while (count >= 64) { c[0] = r[0] = src[0]; c[1] = r[1] = src[1]; c[2] = r[2] = src[2]; c[3] = r[3] = src[3]; #if __x86_64__ r[0] ^= k[0]; r[1] ^= k[0]; r[2] ^= k[0]; r[3] ^= k[0]; for (int i = 1; i < rounds; i++) { r[0] = aes_dec_round_x1 (r[0], k[i]); r[1] = aes_dec_round_x1 (r[1], k[i]); r[2] = aes_dec_round_x1 (r[2], k[i]); r[3] = aes_dec_round_x1 (r[3], k[i]); } r[0] = aes_dec_last_round_x1 (r[0], k[rounds]); r[1] = aes_dec_last_round_x1 (r[1], k[rounds]); r[2] = aes_dec_last_round_x1 (r[2], k[rounds]); r[3] = aes_dec_last_round_x1 (r[3], k[rounds]); #else for (int i = 0; i < rounds - 1; i++) { r[0] = vaesimcq_u8 (vaesdq_u8 (r[0], k[i])); r[1] = vaesimcq_u8 (vaesdq_u8 (r[1], k[i])); r[2] = vaesimcq_u8 (vaesdq_u8 (r[2], k[i])); r[3] = vaesimcq_u8 (vaesdq_u8 (r[3], k[i])); } r[0] = vaesdq_u8 (r[0], k[rounds - 1]) ^ k[rounds]; r[1] = vaesdq_u8 (r[1], k[rounds - 1]) ^ k[rounds]; r[2] = vaesdq_u8 (r[2], k[rounds - 1]) ^ k[rounds]; r[3] = vaesdq_u8 (r[3], k[rounds - 1]) ^ k[rounds]; #endif dst[0] = r[0] ^ f; dst[1] = r[1] ^ c[0]; dst[2] = r[2] ^ c[1]; dst[3] = r[3] ^ c[2]; f = c[3]; count -= 64; src += 4; dst += 4; } while (count > 0) { c[0] = r[0] = src[0]; #if __x86_64__ r[0] ^= k[0]; for (int i = 1; i < rounds; i++) r[0] = aes_dec_round_x1 (r[0], k[i]); r[0] = aes_dec_last_round_x1 (r[0], k[rounds]); #else c[0] = r[0] = src[0]; for (int i = 0; i < rounds - 1; i++) r[0] = vaesimcq_u8 (vaesdq_u8 (r[0], k[i])); r[0] = vaesdq_u8 (r[0], k[rounds - 1]) ^ k[rounds]; #endif dst[0] = r[0] ^ f; f = c[0]; count -= 16; src += 1; dst += 1; } } #if __x86_64__ #if defined(__VAES__) && defined(__AVX512F__) static_always_inline u8x64 aes_block_load_x4 (u8 *src[], int i) { u8x64 r = {}; r = u8x64_insert_u8x16 (r, aes_block_load (src[0] + i), 0); r = u8x64_insert_u8x16 (r, aes_block_load (src[1] + i), 1); r = u8x64_insert_u8x16 (r, aes_block_load (src[2] + i), 2); r = u8x64_insert_u8x16 (r, aes_block_load (src[3] + i), 3); return r; } static_always_inline void aes_block_store_x4 (u8 *dst[], int i, u8x64 r) { aes_block_store (dst[0] + i, u8x64_extract_u8x16 (r, 0)); aes_block_store (dst[1] + i, u8x64_extract_u8x16 (r, 1)); aes_block_store (dst[2] + i, u8x64_extract_u8x16 (r, 2)); aes_block_store (dst[3] + i, u8x64_extract_u8x16 (r, 3)); } static_always_inline u8x64 aes4_cbc_dec_permute (u8x64 a, u8x64 b) { return (u8x64) u64x8_shuffle2 (a, b, 6, 7, 8, 9, 10, 11, 12, 13); } static_always_inline void aes4_cbc_dec (const u8x16 *k, u8x64u *src, u8x64u *dst, u8x16u *iv, int count, aes_key_size_t rounds) { u8x64 f, k4, r[4], c[4] = {}; __mmask8 m; int i, n_blocks = count >> 4; f = u8x64_insert_u8x16 (u8x64_zero (), *iv, 3); while (n_blocks >= 16) { k4 = u8x64_splat_u8x16 (k[0]); c[0] = src[0]; c[1] = src[1]; c[2] = src[2]; c[3] = src[3]; r[0] = c[0] ^ k4; r[1] = c[1] ^ k4; r[2] = c[2] ^ k4; r[3] = c[3] ^ k4; for (i = 1; i < rounds; i++) { k4 = u8x64_splat_u8x16 (k[i]); r[0] = aes_dec_round_x4 (r[0], k4); r[1] = aes_dec_round_x4 (r[1], k4); r[2] = aes_dec_round_x4 (r[2], k4); r[3] = aes_dec_round_x4 (r[3], k4); } k4 = u8x64_splat_u8x16 (k[i]); r[0] = aes_dec_last_round_x4 (r[0], k4); r[1] = aes_dec_last_round_x4 (r[1], k4); r[2] = aes_dec_last_round_x4 (r[2], k4); r[3] = aes_dec_last_round_x4 (r[3], k4); dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]); dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]); dst[3] = r[3] ^= aes4_cbc_dec_permute (c[2], c[3]); f = c[3]; n_blocks -= 16; src += 4; dst += 4; } if (n_blocks >= 12) { k4 = u8x64_splat_u8x16 (k[0]); c[0] = src[0]; c[1] = src[1]; c[2] = src[2]; r[0] = c[0] ^ k4; r[1] = c[1] ^ k4; r[2] = c[2] ^ k4; for (i = 1; i < rounds; i++) { k4 = u8x64_splat_u8x16 (k[i]); r[0] = aes_dec_round_x4 (r[0], k4); r[1] = aes_dec_round_x4 (r[1], k4); r[2] = aes_dec_round_x4 (r[2], k4); } k4 = u8x64_splat_u8x16 (k[i]); r[0] = aes_dec_last_round_x4 (r[0], k4); r[1] = aes_dec_last_round_x4 (r[1], k4); r[2] = aes_dec_last_round_x4 (r[2], k4); dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]); dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]); f = c[2]; n_blocks -= 12; src += 3; dst += 3; } else if (n_blocks >= 8) { k4 = u8x64_splat_u8x16 (k[0]); c[0] = src[0]; c[1] = src[1]; r[0] = c[0] ^ k4; r[1] = c[1] ^ k4; for (i = 1; i < rounds; i++) { k4 = u8x64_splat_u8x16 (k[i]); r[0] = aes_dec_round_x4 (r[0], k4); r[1] = aes_dec_round_x4 (r[1], k4); } k4 = u8x64_splat_u8x16 (k[i]); r[0] = aes_dec_last_round_x4 (r[0], k4); r[1] = aes_dec_last_round_x4 (r[1], k4); dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]); f = c[1]; n_blocks -= 8; src += 2; dst += 2; } else if (n_blocks >= 4) { c[0] = src[0]; r[0] = c[0] ^ u8x64_splat_u8x16 (k[0]); for (i = 1; i < rounds; i++) r[0] = aes_dec_round_x4 (r[0], u8x64_splat_u8x16 (k[i])); r[0] = aes_dec_last_round_x4 (r[0], u8x64_splat_u8x16 (k[i])); dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); f = c[0]; n_blocks -= 4; src += 1; dst += 1; } if (n_blocks > 0) { k4 = u8x64_splat_u8x16 (k[0]); m = (1 << (n_blocks * 2)) - 1; c[0] = (u8x64) _mm512_mask_loadu_epi64 ((__m512i) c[0], m, (__m512i *) src); f = aes4_cbc_dec_permute (f, c[0]); r[0] = c[0] ^ k4; for (i = 1; i < rounds; i++) r[0] = aes_dec_round_x4 (r[0], u8x64_splat_u8x16 (k[i])); r[0] = aes_dec_last_round_x4 (r[0], u8x64_splat_u8x16 (k[i])); _mm512_mask_storeu_epi64 ((__m512i *) dst, m, (__m512i) (r[0] ^ f)); } } #elif defined(__VAES__) static_always_inline u8x32 aes_block_load_x2 (u8 *src[], int i) { u8x32 r = {}; r = u8x32_insert_lo (r, aes_block_load (src[0] + i)); r = u8x32_insert_hi (r, aes_block_load (src[1] + i)); return r; } static_always_inline void aes_block_store_x2 (u8 *dst[], int i, u8x32 r) { aes_block_store (dst[0] + i, u8x32_extract_lo (r)); aes_block_store (dst[1] + i, u8x32_extract_hi (r)); } static_always_inline u8x32 aes2_cbc_dec_permute (u8x32 a, u8x32 b) { return (u8x32) u64x4_shuffle2 ((u64x4) a, (u64x4) b, 2, 3, 4, 5); } static_always_inline void aes2_cbc_dec (const u8x16 *k, u8x32u *src, u8x32u *dst, u8x16u *iv, int count, aes_key_size_t rounds) { u8x32 k2, f = {}, r[4], c[4] = {}; int i, n_blocks = count >> 4; f = u8x32_insert_hi (f, *iv); while (n_blocks >= 8) { k2 = u8x32_splat_u8x16 (k[0]); c[0] = src[0]; c[1] = src[1]; c[2] = src[2]; c[3] = src[3]; r[0] = c[0] ^ k2; r[1] = c[1] ^ k2; r[2] = c[2] ^ k2; r[3] = c[3] ^ k2; for (i = 1; i < rounds; i++) { k2 = u8x32_splat_u8x16 (k[i]); r[0] = aes_dec_round_x2 (r[0], k2); r[1] = aes_dec_round_x2 (r[1], k2); r[2] = aes_dec_round_x2 (r[2], k2); r[3] = aes_dec_round_x2 (r[3], k2); } k2 = u8x32_splat_u8x16 (k[i]); r[0] = aes_dec_last_round_x2 (r[0], k2); r[1] = aes_dec_last_round_x2 (r[1], k2); r[2] = aes_dec_last_round_x2 (r[2], k2); r[3] = aes_dec_last_round_x2 (r[3], k2); dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]); dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]); dst[3] = r[3] ^= aes2_cbc_dec_permute (c[2], c[3]); f = c[3]; n_blocks -= 8; src += 4; dst += 4; } if (n_blocks >= 6) { k2 = u8x32_splat_u8x16 (k[0]); c[0] = src[0]; c[1] = src[1]; c[2] = src[2]; r[0] = c[0] ^ k2; r[1] = c[1] ^ k2; r[2] = c[2] ^ k2; for (i = 1; i < rounds; i++) { k2 = u8x32_splat_u8x16 (k[i]); r[0] = aes_dec_round_x2 (r[0], k2); r[1] = aes_dec_round_x2 (r[1], k2); r[2] = aes_dec_round_x2 (r[2], k2); } k2 = u8x32_splat_u8x16 (k[i]); r[0] = aes_dec_last_round_x2 (r[0], k2); r[1] = aes_dec_last_round_x2 (r[1], k2); r[2] = aes_dec_last_round_x2 (r[2], k2); dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]); dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]); f = c[2]; n_blocks -= 6; src += 3; dst += 3; } else if (n_blocks >= 4) { k2 = u8x32_splat_u8x16 (k[0]); c[0] = src[0]; c[1] = src[1]; r[0] = c[0] ^ k2; r[1] = c[1] ^ k2; for (i = 1; i < rounds; i++) { k2 = u8x32_splat_u8x16 (k[i]); r[0] = aes_dec_round_x2 (r[0], k2); r[1] = aes_dec_round_x2 (r[1], k2); } k2 = u8x32_splat_u8x16 (k[i]); r[0] = aes_dec_last_round_x2 (r[0], k2); r[1] = aes_dec_last_round_x2 (r[1], k2); dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]); f = c[1]; n_blocks -= 4; src += 2; dst += 2; } else if (n_blocks >= 2) { k2 = u8x32_splat_u8x16 (k[0]); c[0] = src[0]; r[0] = c[0] ^ k2; for (i = 1; i < rounds; i++) r[0] = aes_dec_round_x2 (r[0], u8x32_splat_u8x16 (k[i])); r[0] = aes_dec_last_round_x2 (r[0], u8x32_splat_u8x16 (k[i])); dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); f = c[0]; n_blocks -= 2; src += 1; dst += 1; } if (n_blocks > 0) { u8x16 rl = *(u8x16u *) src ^ k[0]; for (i = 1; i < rounds; i++) rl = aes_dec_round_x1 (rl, k[i]); rl = aes_dec_last_round_x1 (rl, k[i]); *(u8x16u *) dst = rl ^ u8x32_extract_hi (f); } } #endif #endif static_always_inline void clib_aes_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key, aes_key_size_t ks) { u8x16 e[15], d[15]; aes_key_expand (e, key, ks); aes_key_enc_to_dec (e, d, ks); for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++) { ((u8x16 *) kd->decrypt_key)[i] = d[i]; ((u8x16 *) kd->encrypt_key)[i] = e[i]; } } static_always_inline void clib_aes128_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key) { clib_aes_cbc_key_expand (kd, key, AES_KEY_128); } static_always_inline void clib_aes192_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key) { clib_aes_cbc_key_expand (kd, key, AES_KEY_192); } static_always_inline void clib_aes256_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key) { clib_aes_cbc_key_expand (kd, key, AES_KEY_256); } static_always_inline void clib_aes_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext, uword len, const u8 *iv, aes_key_size_t ks, u8 *plaintext) { int rounds = AES_KEY_ROUNDS (ks); #if defined(__VAES__) && defined(__AVX512F__) aes4_cbc_dec (kd->decrypt_key, (u8x64u *) ciphertext, (u8x64u *) plaintext, (u8x16u *) iv, (int) len, rounds); #elif defined(__VAES__) aes2_cbc_dec (kd->decrypt_key, (u8x32u *) ciphertext, (u8x32u *) plaintext, (u8x16u *) iv, (int) len, rounds); #else aes_cbc_dec (kd->decrypt_key, (u8x16u *) ciphertext, (u8x16u *) plaintext, (u8x16u *) iv, (int) len, rounds); #endif } static_always_inline void clib_aes128_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext, uword len, const u8 *iv, u8 *plaintext) { clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_128, plaintext); } static_always_inline void clib_aes192_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext, uword len, const u8 *iv, u8 *plaintext) { clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_192, plaintext); } static_always_inline void clib_aes256_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext, uword len, const u8 *iv, u8 *plaintext) { clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_256, plaintext); } #if __GNUC__ > 4 && !__clang__ && CLIB_DEBUG == 0 #pragma GCC optimize("O3") #endif #if defined(__VAES__) && defined(__AVX512F__) #define u8xN u8x64 #define u32xN u32x16 #define u32xN_min_scalar u32x16_min_scalar #define u32xN_is_all_zero u32x16_is_all_zero #define u32xN_splat u32x16_splat #elif defined(__VAES__) #define u8xN u8x32 #define u32xN u32x8 #define u32xN_min_scalar u32x8_min_scalar #define u32xN_is_all_zero u32x8_is_all_zero #define u32xN_splat u32x8_splat #else #define u8xN u8x16 #define u32xN u32x4 #define u32xN_min_scalar u32x4_min_scalar #define u32xN_is_all_zero u32x4_is_all_zero #define u32xN_splat u32x4_splat #endif static_always_inline u32 clib_aes_cbc_encrypt_multi (aes_cbc_key_data_t **key_data, const uword *key_indices, u8 **plaintext, const uword *oplen, u8 **iv, aes_key_size_t ks, u8 **ciphertext, uword n_ops) { int rounds = AES_KEY_ROUNDS (ks); u8 placeholder[8192]; u32 i, j, count, n_left = n_ops; u32xN placeholder_mask = {}; u32xN len = {}; u32 key_index[4 * N_AES_LANES]; u8 *src[4 * N_AES_LANES] = {}; u8 *dst[4 * N_AES_LANES] = {}; u8xN r[4] = {}; u8xN k[15][4] = {}; for (i = 0; i < 4 * N_AES_LANES; i++) key_index[i] = ~0; more: for (i = 0; i < 4 * N_AES_LANES; i++) if (len[i] == 0) { if (n_left == 0) { /* no more work to enqueue, so we are enqueueing placeholder buffer */ src[i] = dst[i] = placeholder; len[i] = sizeof (placeholder); placeholder_mask[i] = 0; } else { u8x16 t = aes_block_load (iv[0]); ((u8x16 *) r)[i] = t; src[i] = plaintext[0]; dst[i] = ciphertext[0]; len[i] = oplen[0]; placeholder_mask[i] = ~0; if (key_index[i] != key_indices[0]) { aes_cbc_key_data_t *kd; key_index[i] = key_indices[0]; kd = key_data[key_index[i]]; for (j = 0; j < rounds + 1; j++) ((u8x16 *) k[j])[i] = kd->encrypt_key[j]; } n_left--; iv++; ciphertext++; plaintext++; key_indices++; oplen++; } } count = u32xN_min_scalar (len); ASSERT (count % 16 == 0); for (i = 0; i < count; i += 16) { #if defined(__VAES__) && defined(__AVX512F__) r[0] = u8x64_xor3 (r[0], aes_block_load_x4 (src, i), k[0][0]); r[1] = u8x64_xor3 (r[1], aes_block_load_x4 (src + 4, i), k[0][1]); r[2] = u8x64_xor3 (r[2], aes_block_load_x4 (src + 8, i), k[0][2]); r[3] = u8x64_xor3 (r[3], aes_block_load_x4 (src + 12, i), k[0][3]); for (j = 1; j < rounds; j++) { r[0] = aes_enc_round_x4 (r[0], k[j][0]); r[1] = aes_enc_round_x4 (r[1], k[j][1]); r[2] = aes_enc_round_x4 (r[2], k[j][2]); r[3] = aes_enc_round_x4 (r[3], k[j][3]); } r[0] = aes_enc_last_round_x4 (r[0], k[j][0]); r[1] = aes_enc_last_round_x4 (r[1], k[j][1]); r[2] = aes_enc_last_round_x4 (r[2], k[j][2]); r[3] = aes_enc_last_round_x4 (r[3], k[j][3]); aes_block_store_x4 (dst, i, r[0]); aes_block_store_x4 (dst + 4, i, r[1]); aes_block_store_x4 (dst + 8, i, r[2]); aes_block_store_x4 (dst + 12, i, r[3]); #elif defined(__VAES__) r[0] = u8x32_xor3 (r[0], aes_block_load_x2 (src, i), k[0][0]); r[1] = u8x32_xor3 (r[1], aes_block_load_x2 (src + 2, i), k[0][1]); r[2] = u8x32_xor3 (r[2], aes_block_load_x2 (src + 4, i), k[0][2]); r[3] = u8x32_xor3 (r[3], aes_block_load_x2 (src + 6, i), k[0][3]); for (j = 1; j < rounds; j++) { r[0] = aes_enc_round_x2 (r[0], k[j][0]); r[1] = aes_enc_round_x2 (r[1], k[j][1]); r[2] = aes_enc_round_x2 (r[2], k[j][2]); r[3] = aes_enc_round_x2 (r[3], k[j][3]); } r[0] = aes_enc_last_round_x2 (r[0], k[j][0]); r[1] = aes_enc_last_round_x2 (r[1], k[j][1]); r[2] = aes_enc_last_round_x2 (r[2], k[j][2]); r[3] = aes_enc_last_round_x2 (r[3], k[j][3]); aes_block_store_x2 (dst, i, r[0]); aes_block_store_x2 (dst + 2, i, r[1]); aes_block_store_x2 (dst + 4, i, r[2]); aes_block_store_x2 (dst + 6, i, r[3]); #else #if __x86_64__ r[0] = u8x16_xor3 (r[0], aes_block_load (src[0] + i), k[0][0]); r[1] = u8x16_xor3 (r[1], aes_block_load (src[1] + i), k[0][1]); r[2] = u8x16_xor3 (r[2], aes_block_load (src[2] + i), k[0][2]); r[3] = u8x16_xor3 (r[3], aes_block_load (src[3] + i), k[0][3]); for (j = 1; j < rounds; j++) { r[0] = aes_enc_round_x1 (r[0], k[j][0]); r[1] = aes_enc_round_x1 (r[1], k[j][1]); r[2] = aes_enc_round_x1 (r[2], k[j][2]); r[3] = aes_enc_round_x1 (r[3], k[j][3]); } r[0] = aes_enc_last_round_x1 (r[0], k[j][0]); r[1] = aes_enc_last_round_x1 (r[1], k[j][1]); r[2] = aes_enc_last_round_x1 (r[2], k[j][2]); r[3] = aes_enc_last_round_x1 (r[3], k[j][3]); aes_block_store (dst[0] + i, r[0]); aes_block_store (dst[1] + i, r[1]); aes_block_store (dst[2] + i, r[2]); aes_block_store (dst[3] + i, r[3]); #else r[0] ^= aes_block_load (src[0] + i); r[1] ^= aes_block_load (src[1] + i); r[2] ^= aes_block_load (src[2] + i); r[3] ^= aes_block_load (src[3] + i); for (j = 0; j < rounds - 1; j++) { r[0] = vaesmcq_u8 (vaeseq_u8 (r[0], k[j][0])); r[1] = vaesmcq_u8 (vaeseq_u8 (r[1], k[j][1])); r[2] = vaesmcq_u8 (vaeseq_u8 (r[2], k[j][2])); r[3] = vaesmcq_u8 (vaeseq_u8 (r[3], k[j][3])); } r[0] = vaeseq_u8 (r[0], k[j][0]) ^ k[rounds][0]; r[1] = vaeseq_u8 (r[1], k[j][1]) ^ k[rounds][1]; r[2] = vaeseq_u8 (r[2], k[j][2]) ^ k[rounds][2]; r[3] = vaeseq_u8 (r[3], k[j][3]) ^ k[rounds][3]; aes_block_store (dst[0] + i, r[0]); aes_block_store (dst[1] + i, r[1]); aes_block_store (dst[2] + i, r[2]); aes_block_store (dst[3] + i, r[3]); #endif #endif } len -= u32xN_splat (count); for (i = 0; i < 4 * N_AES_LANES; i++) { src[i] += count; dst[i] += count; } if (n_left > 0) goto more; if (!u32xN_is_all_zero (len & placeholder_mask)) goto more; return n_ops; } #undef u8xN #undef u32xN #undef u32xN_min_scalar #undef u32xN_is_all_zero #undef u32xN_splat #endif /* __crypto_aes_cbc_h__ */