aboutsummaryrefslogtreecommitdiffstats
path: root/src/vppinfra/crypto
diff options
context:
space:
mode:
authorDamjan Marion <damarion@cisco.com>2023-03-15 11:42:06 +0000
committerOle Trøan <otroan@employees.org>2023-03-23 12:04:46 +0000
commitb47376f0b404d2ba5526fba52b171d79b0f352f8 (patch)
tree5ffcdb47d48f4db0b87483fe6f99a7249831118c /src/vppinfra/crypto
parent5527a78ed96043d2c26e3271066c50b44dd7fc0b (diff)
vppinfra: AES-CBC and AES-GCM refactor and optimizations
- crypto code moved to vppinfra for better testing and reuse - added 256-bit VAES support (Intel Client CPUs) - added AES_GMAC functions Change-Id: I960c8e14ca0a0126703e8f1589d86f32e2a98361 Type: improvement Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'src/vppinfra/crypto')
-rw-r--r--src/vppinfra/crypto/aes.h439
-rw-r--r--src/vppinfra/crypto/aes_cbc.h549
-rw-r--r--src/vppinfra/crypto/aes_gcm.h975
-rw-r--r--src/vppinfra/crypto/ghash.h515
4 files changed, 2478 insertions, 0 deletions
diff --git a/src/vppinfra/crypto/aes.h b/src/vppinfra/crypto/aes.h
new file mode 100644
index 00000000000..a5e286e4c6e
--- /dev/null
+++ b/src/vppinfra/crypto/aes.h
@@ -0,0 +1,439 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2020 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __aesni_h__
+#define __aesni_h__
+
+typedef enum
+{ /* AES key length selector.  The numeric value is the argument of
+     AES_KEY_ROUNDS() and AES_KEY_BYTES(). */
+ AES_KEY_128 = 0, /* AES_KEY_ROUNDS = 10, AES_KEY_BYTES = 16 */
+ AES_KEY_192 = 1, /* AES_KEY_ROUNDS = 12, AES_KEY_BYTES = 24 */
+ AES_KEY_256 = 2, /* AES_KEY_ROUNDS = 14, AES_KEY_BYTES = 32 */
+} aes_key_size_t;
+
+/* Round count and key length in bytes for an aes_key_size_t value.  The
+ * argument is parenthesized so an expression argument is evaluated as a
+ * unit (e.g. AES_KEY_ROUNDS (a + 1)). */
+#define AES_KEY_ROUNDS(x) (10 + (x) * 2)
+#define AES_KEY_BYTES(x)  (16 + (x) * 8)
+
+/* Read the 16 bytes at p through the u8x16u vector type and return them. */
+static_always_inline u8x16
+aes_block_load (u8 *p)
+{
+  u8x16u *blk = (u8x16u *) p;
+  return blk[0];
+}
+
+/* One non-final AES encryption round of state a with round key k, using
+ * AES-NI when __AES__ is defined or the ARM crypto extension when
+ * __ARM_FEATURE_CRYPTO is defined. */
+static_always_inline u8x16
+aes_enc_round (u8x16 a, u8x16 k)
+{
+#if defined(__AES__)
+  __m128i state = (__m128i) a;
+  __m128i key = (__m128i) k;
+  return (u8x16) _mm_aesenc_si128 (state, key);
+#elif defined(__ARM_FEATURE_CRYPTO)
+  /* AESE is given an all-zero key; k is XORed in after MixColumns. */
+  u8x16 zero = u8x16_splat (0);
+  u8x16 sub = vaeseq_u8 (a, zero);
+  return vaesmcq_u8 (sub) ^ k;
+#endif
+}
+
+#if defined(__VAES__) && defined(__AVX512F__)
+/* 512-bit VAES wrappers.  Each one applies the named AES round to the
+ * 64-byte vector a using the 64-byte round-key vector k. */
+
+/* Non-final encryption round. */
+static_always_inline u8x64
+aes_enc_round_x4 (u8x64 a, u8x64 k)
+{
+  __m512i out = _mm512_aesenc_epi128 ((__m512i) a, (__m512i) k);
+  return (u8x64) out;
+}
+
+/* Final encryption round. */
+static_always_inline u8x64
+aes_enc_last_round_x4 (u8x64 a, u8x64 k)
+{
+  __m512i out = _mm512_aesenclast_epi128 ((__m512i) a, (__m512i) k);
+  return (u8x64) out;
+}
+
+/* Non-final decryption round. */
+static_always_inline u8x64
+aes_dec_round_x4 (u8x64 a, u8x64 k)
+{
+  __m512i out = _mm512_aesdec_epi128 ((__m512i) a, (__m512i) k);
+  return (u8x64) out;
+}
+
+/* Final decryption round. */
+static_always_inline u8x64
+aes_dec_last_round_x4 (u8x64 a, u8x64 k)
+{
+  __m512i out = _mm512_aesdeclast_epi128 ((__m512i) a, (__m512i) k);
+  return (u8x64) out;
+}
+#endif
+
+#ifdef __VAES__
+/* 256-bit VAES wrappers.  Each one applies the named AES round to the
+ * 32-byte vector a using the 32-byte round-key vector k. */
+
+/* Non-final encryption round. */
+static_always_inline u8x32
+aes_enc_round_x2 (u8x32 a, u8x32 k)
+{
+  __m256i out = _mm256_aesenc_epi128 ((__m256i) a, (__m256i) k);
+  return (u8x32) out;
+}
+
+/* Final encryption round. */
+static_always_inline u8x32
+aes_enc_last_round_x2 (u8x32 a, u8x32 k)
+{
+  __m256i out = _mm256_aesenclast_epi128 ((__m256i) a, (__m256i) k);
+  return (u8x32) out;
+}
+
+/* Non-final decryption round. */
+static_always_inline u8x32
+aes_dec_round_x2 (u8x32 a, u8x32 k)
+{
+  __m256i out = _mm256_aesdec_epi128 ((__m256i) a, (__m256i) k);
+  return (u8x32) out;
+}
+
+/* Final decryption round. */
+static_always_inline u8x32
+aes_dec_last_round_x2 (u8x32 a, u8x32 k)
+{
+  __m256i out = _mm256_aesdeclast_epi128 ((__m256i) a, (__m256i) k);
+  return (u8x32) out;
+}
+#endif
+
+/* Final AES encryption round of state a with round key k (no MixColumns
+ * step on the ARM path, unlike aes_enc_round). */
+static_always_inline u8x16
+aes_enc_last_round (u8x16 a, u8x16 k)
+{
+#if defined(__AES__)
+  __m128i state = (__m128i) a;
+  __m128i key = (__m128i) k;
+  return (u8x16) _mm_aesenclast_si128 (state, key);
+#elif defined(__ARM_FEATURE_CRYPTO)
+  u8x16 zero = u8x16_splat (0);
+  return vaeseq_u8 (a, zero) ^ k;
+#endif
+}
+
+#ifdef __x86_64__
+
+/* Non-final AES decryption round of state a with round key k (AES-NI). */
+static_always_inline u8x16
+aes_dec_round (u8x16 a, u8x16 k)
+{
+  __m128i out = _mm_aesdec_si128 ((__m128i) a, (__m128i) k);
+  return (u8x16) out;
+}
+
+/* Final AES decryption round of state a with round key k (AES-NI). */
+static_always_inline u8x16
+aes_dec_last_round (u8x16 a, u8x16 k)
+{
+  __m128i out = _mm_aesdeclast_si128 ((__m128i) a, (__m128i) k);
+  return (u8x16) out;
+}
+#endif
+
+/* Write the 16-byte vector r to p through the u8x16u vector type. */
+static_always_inline void
+aes_block_store (u8 *p, u8x16 r)
+{
+  u8x16u *out = (u8x16u *) p;
+  out[0] = r;
+}
+
+/* Encrypt a single 16-byte block.
+ *
+ * block       input block
+ * round_keys  expanded key; entries 0 .. AES_KEY_ROUNDS (ks) are read
+ * ks          key size selector
+ *
+ * Returns the encrypted block. */
+static_always_inline u8x16
+aes_encrypt_block (u8x16 block, const u8x16 *round_keys, aes_key_size_t ks)
+{
+  const int n_rounds = AES_KEY_ROUNDS (ks);
+  u8x16 state = block ^ round_keys[0];
+
+  for (int round = 1; round < n_rounds; round++)
+    state = aes_enc_round (state, round_keys[round]);
+
+  return aes_enc_last_round (state, round_keys[n_rounds]);
+}
+
+/* Apply the AES InvMixColumns transform to a (used by aes_key_enc_to_dec
+ * to turn encryption round keys into decryption round keys). */
+static_always_inline u8x16
+aes_inv_mix_column (u8x16 a)
+{
+#if defined(__AES__)
+  __m128i out = _mm_aesimc_si128 ((__m128i) a);
+  return (u8x16) out;
+#elif defined(__ARM_FEATURE_CRYPTO)
+  return vaesimcq_u8 (a);
+#endif
+}
+
+#ifdef __x86_64__
+/* Wrapper around AESKEYGENASSIST; b must be a compile-time constant.
+ * Arguments and the whole expansion are parenthesized so the macro is
+ * safe inside larger expressions. */
+#define aes_keygen_assist(a, b)                                               \
+  ((u8x16) _mm_aeskeygenassist_si128 ((__m128i) (a), (b)))
+
+/* AES-NI based AES key expansion based on code samples from
+ Intel(r) Advanced Encryption Standard (AES) New Instructions White Paper
+ (323641-001) */
+
+/* Compute round key rk[0] from the previous round key rk[-1] and the
+ * key-generation-assist value r (32-bit element 3 of r is broadcast). */
+static_always_inline void
+aes128_key_assist (u8x16 *rk, u8x16 r)
+{
+  u8x16 acc = rk[-1];
+
+  /* three shift-and-XOR steps, each feeding on the running value */
+  for (int step = 0; step < 3; step++)
+    acc ^= u8x16_word_shift_left (acc, 4);
+
+  rk[0] = acc ^ (u8x16) u32x4_shuffle ((u32x4) r, 3, 3, 3, 3);
+}
+
+static_always_inline void
+aes128_key_expand (u8x16 *rk, u8x16u const *k)
+{ /* Expand the 16-byte key at k into the 11 round keys rk[0..10].
+     rk[0] is the key itself; each following entry is derived from the
+     previous one.  The second argument of aes_keygen_assist() is the
+     per-round constant and has to be a literal. */
+ rk[0] = k[0];
+ aes128_key_assist (rk + 1, aes_keygen_assist (rk[0], 0x01));
+ aes128_key_assist (rk + 2, aes_keygen_assist (rk[1], 0x02));
+ aes128_key_assist (rk + 3, aes_keygen_assist (rk[2], 0x04));
+ aes128_key_assist (rk + 4, aes_keygen_assist (rk[3], 0x08));
+ aes128_key_assist (rk + 5, aes_keygen_assist (rk[4], 0x10));
+ aes128_key_assist (rk + 6, aes_keygen_assist (rk[5], 0x20));
+ aes128_key_assist (rk + 7, aes_keygen_assist (rk[6], 0x40));
+ aes128_key_assist (rk + 8, aes_keygen_assist (rk[7], 0x80));
+ aes128_key_assist (rk + 9, aes_keygen_assist (rk[8], 0x1b));
+ aes128_key_assist (rk + 10, aes_keygen_assist (rk[9], 0x36));
+}
+
+/* Advance the AES-192 key schedule state held in r1[0] / r2[0] by one
+ * step, using the key-generation-assist value key_assist.  Both r1[0]
+ * and r2[0] are updated in place; the new r2[0] depends on the new r1[0]. */
+static_always_inline void
+aes192_key_assist (u8x16 *r1, u8x16 *r2, u8x16 key_assist)
+{
+  u8x16 s1 = u8x16_word_shift_left (r1[0], 4);
+  u8x16 s2 = u8x16_word_shift_left (s1, 4);
+  u8x16 s3 = u8x16_word_shift_left (s2, 4);
+  u8x16 ka = (u8x16) _mm_shuffle_epi32 ((__m128i) key_assist, 0x55);
+
+  r1[0] ^= s1 ^ s2 ^ s3 ^ ka;
+
+  r2[0] ^= u8x16_word_shift_left (r2[0], 4);
+  r2[0] ^= (u8x16) _mm_shuffle_epi32 ((__m128i) r1[0], 0xff);
+}
+
+static_always_inline void
+aes192_key_expand (u8x16 * rk, u8x16u const *k)
+{ /* Expand the 24-byte key at k (16 bytes from k[0] plus 8 bytes read at
+     k + 1) into the 13 round keys rk[0..12].  r1/r2 hold the rolling
+     schedule state advanced by aes192_key_assist(); _mm_shuffle_pd is
+     used to combine 64-bit halves of that state into whole round keys. */
+ u8x16 r1, r2;
+
+ rk[0] = r1 = k[0];
+ /* *INDENT-OFF* */
+ rk[1] = r2 = (u8x16) (u64x2) { *(u64 *) (k + 1), 0 }; /* low half = key bytes 16..23, high half = 0 */
+ /* *INDENT-ON* */
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x1));
+ rk[1] = (u8x16) _mm_shuffle_pd ((__m128d) rk[1], (__m128d) r1, 0); /* low half of rk[1], low half of r1 */
+ rk[2] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1); /* high half of r1, low half of r2 */
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x2));
+ rk[3] = r1;
+ rk[4] = r2; /* only the low half is final; the high half is replaced below */
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x4));
+ rk[4] = (u8x16) _mm_shuffle_pd ((__m128d) rk[4], (__m128d) r1, 0);
+ rk[5] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1);
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x8));
+ rk[6] = r1;
+ rk[7] = r2;
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x10));
+ rk[7] = (u8x16) _mm_shuffle_pd ((__m128d) rk[7], (__m128d) r1, 0);
+ rk[8] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1);
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x20));
+ rk[9] = r1;
+ rk[10] = r2;
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x40));
+ rk[10] = (u8x16) _mm_shuffle_pd ((__m128d) rk[10], (__m128d) r1, 0);
+ rk[11] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1);
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x80));
+ rk[12] = r1; /* last round key; r2 is not needed any more */
+}
+
+/* Compute round key rk[i] from rk[i - 2] and key_assist and, when i < 14,
+ * also rk[i + 1] from rk[i - 1] and the freshly written rk[i]. */
+static_always_inline void
+aes256_key_assist (u8x16 *rk, int i, u8x16 key_assist)
+{
+  u8x16 *out = rk + i;
+  u8x16 base, s1, s2, s3;
+
+  base = out[-2];
+  s1 = u8x16_word_shift_left (base, 4);
+  s2 = u8x16_word_shift_left (s1, 4);
+  s3 = u8x16_word_shift_left (s2, 4);
+  out[0] = base ^ s1 ^ s2 ^ s3 ^
+	   (u8x16) u32x4_shuffle ((u32x4) key_assist, 3, 3, 3, 3);
+
+  /* the schedule ends at rk[14]; there is no odd key after it */
+  if (i >= 14)
+    return;
+
+  key_assist = aes_keygen_assist (out[0], 0x0);
+  base = out[-1];
+  s1 = u8x16_word_shift_left (base, 4);
+  s2 = u8x16_word_shift_left (s1, 4);
+  s3 = u8x16_word_shift_left (s2, 4);
+  out[1] = base ^ s1 ^ s2 ^ s3 ^
+	   (u8x16) u32x4_shuffle ((u32x4) key_assist, 2, 2, 2, 2);
+}
+
+static_always_inline void
+aes256_key_expand (u8x16 * rk, u8x16u const *k)
+{ /* Expand the 32-byte key at k into the 15 round keys rk[0..14].
+     rk[0] and rk[1] are the key itself; every aes256_key_assist() call
+     writes rk[i] and, except for i == 14, rk[i + 1]. */
+ rk[0] = k[0];
+ rk[1] = k[1];
+ aes256_key_assist (rk, 2, aes_keygen_assist (rk[1], 0x01));
+ aes256_key_assist (rk, 4, aes_keygen_assist (rk[3], 0x02));
+ aes256_key_assist (rk, 6, aes_keygen_assist (rk[5], 0x04));
+ aes256_key_assist (rk, 8, aes_keygen_assist (rk[7], 0x08));
+ aes256_key_assist (rk, 10, aes_keygen_assist (rk[9], 0x10));
+ aes256_key_assist (rk, 12, aes_keygen_assist (rk[11], 0x20));
+ aes256_key_assist (rk, 14, aes_keygen_assist (rk[13], 0x40));
+}
+#endif
+
+#ifdef __aarch64__
+
+static const u8x16 aese_prep_mask1 = /* vqtbl1q_u8 index vector: source bytes 13, 14, 15, 12 repeated in all four 32-bit words */
+ { 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12 };
+static const u8x16 aese_prep_mask2 = /* vqtbl1q_u8 index vector: source bytes 12, 13, 14, 15 repeated in all four 32-bit words; used when rcon == 0 */
+ { 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 };
+
+/* Compute AES-128 round key rk[0] from the previous round key rk[-1] and
+ * the round constant rcon. */
+static_always_inline void
+aes128_key_expand_round_neon (u8x16 *rk, u32 rcon)
+{
+  u8x16 zero = {};
+  u8x16 prev = rk[-1];
+  u8x16 sub, s1, s2, s3;
+
+  /* permute the last word of prev, run it through AESE with a zero key,
+     then add the round constant */
+  sub = vqtbl1q_u8 (prev, aese_prep_mask1);
+  sub = vaeseq_u8 (sub, zero);
+  sub ^= (u8x16) vdupq_n_u32 (rcon);
+
+  /* prev shifted up by 4, 8 and 12 bytes with zero fill */
+  s1 = vextq_u8 (zero, prev, 12);
+  s2 = vextq_u8 (zero, s1, 12);
+  s3 = vextq_u8 (zero, s2, 12);
+
+  rk[0] = sub ^ prev ^ s1 ^ s2 ^ s3;
+}
+
+/* Expand the 16-byte key at k into the 11 round keys rk[0..10]. */
+static_always_inline void
+aes128_key_expand (u8x16 *rk, u8x16u const *k)
+{
+  const u32 rcon[10] = { 0x01, 0x02, 0x04, 0x08, 0x10,
+			 0x20, 0x40, 0x80, 0x1b, 0x36 };
+
+  rk[0] = k[0];
+  for (int round = 1; round <= 10; round++)
+    aes128_key_expand_round_neon (rk + round, rcon[round - 1]);
+}
+
+static_always_inline void
+aes192_key_expand_round_neon (u8x8 * rk, u32 rcon)
+{ /* One AES-192 key schedule step on a schedule addressed in 8-byte
+     (u8x8) units.  Reads rk[-3], rk[-2] and rk[-1]; writes rk[0], rk[1]
+     and, unless rcon == 0x80, rk[2]. */
+ u8x8 r, last_round = rk[-1], z = { };
+ u8x16 r2, z2 = { };
+
+ r2 = (u8x16) vdupq_lane_u64 ((uint64x1_t) last_round, 0); /* widen the last 8 bytes to 16 so the 128-bit AESE can be used */
+ r2 = vqtbl1q_u8 (r2, aese_prep_mask1);
+ r2 = vaeseq_u8 (r2, z2);
+ r2 ^= (u8x16) vdupq_n_u32 (rcon);
+
+ r = (u8x8) vdup_laneq_u64 ((u64x2) r2, 0); /* keep the low 8 bytes of the result */
+ r ^= rk[-3];
+ r ^= vext_u8 (z, rk[-3], 4);
+ rk[0] = r;
+
+ r = rk[-2] ^ vext_u8 (r, z, 4);
+ r ^= vext_u8 (z, r, 4);
+ rk[1] = r;
+
+ if (rcon == 0x80) /* final step as called from aes192_key_expand(): rk[2] is not written */
+ return;
+
+ r = rk[-1] ^ vext_u8 (r, z, 4);
+ r ^= vext_u8 (z, r, 4);
+ rk[2] = r;
+}
+
+/* Expand the 24-byte key at k into the round keys at ek.  The schedule is
+ * built in 8-byte units: units 0..2 hold the key, and every round step
+ * appends up to three further units. */
+static_always_inline void
+aes192_key_expand (u8x16 *ek, const u8x16u *k)
+{
+  const u32 rcon[8] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+  u8x8 *rk = (u8x8 *) ek;
+
+  ek[0] = k[0];
+  rk[2] = *(u8x8u *) (k + 1);
+
+  for (int step = 0; step < 8; step++)
+    aes192_key_expand_round_neon (rk + 3 + 3 * step, rcon[step]);
+}
+
+
+/* Compute AES-256 round key rk[0] from rk[-1] and rk[-2].  A non-zero
+ * rcon selects aese_prep_mask1 and is XORed in; rcon == 0 selects
+ * aese_prep_mask2 and adds no constant. */
+static_always_inline void
+aes256_key_expand_round_neon (u8x16 *rk, u32 rcon)
+{
+  u8x16 zero = {};
+  u8x16 base = rk[-2];
+  u8x16 sub, s1, s2, s3;
+
+  sub = vqtbl1q_u8 (rk[-1], rcon ? aese_prep_mask1 : aese_prep_mask2);
+  sub = vaeseq_u8 (sub, zero);
+  if (rcon)
+    sub ^= (u8x16) vdupq_n_u32 (rcon);
+
+  /* base shifted up by 4, 8 and 12 bytes with zero fill */
+  s1 = vextq_u8 (zero, base, 12);
+  s2 = vextq_u8 (zero, s1, 12);
+  s3 = vextq_u8 (zero, s2, 12);
+
+  rk[0] = sub ^ base ^ s1 ^ s2 ^ s3;
+}
+
+/* Expand the 32-byte key at k into the 15 round keys rk[0..14].  Even
+ * indices use round constants 0x01, 0x02, ... 0x40; odd indices use 0. */
+static_always_inline void
+aes256_key_expand (u8x16 *rk, u8x16u const *k)
+{
+  rk[0] = k[0];
+  rk[1] = k[1];
+
+  for (int idx = 2; idx <= 14; idx++)
+    {
+      u32 rcon = (idx & 1) ? 0 : (u32) 1 << (idx / 2 - 1);
+      aes256_key_expand_round_neon (rk + idx, rcon);
+    }
+}
+
+#endif
+
+/* Expand key into key_schedule using the expansion routine that matches
+ * ks.  Any other ks value leaves key_schedule untouched. */
+static_always_inline void
+aes_key_expand (u8x16 *key_schedule, u8 const *key, aes_key_size_t ks)
+{
+  u8x16u const *k = (u8x16u const *) key;
+
+  if (ks == AES_KEY_128)
+    aes128_key_expand (key_schedule, k);
+  else if (ks == AES_KEY_192)
+    aes192_key_expand (key_schedule, k);
+  else if (ks == AES_KEY_256)
+    aes256_key_expand (key_schedule, k);
+}
+
+/* Derive the decryption key schedule kd from the encryption schedule ke:
+ * the order of the round keys is reversed, and every key except the first
+ * and last goes through aes_inv_mix_column(). */
+static_always_inline void
+aes_key_enc_to_dec (u8x16 *ke, u8x16 *kd, aes_key_size_t ks)
+{
+  const int last = AES_KEY_ROUNDS (ks);
+  const int mid = last / 2;
+
+  kd[last] = ke[0];
+  kd[0] = ke[last];
+
+  for (int lo = 1, hi = last - 1; lo < mid; lo++, hi--)
+    {
+      kd[hi] = aes_inv_mix_column (ke[lo]);
+      kd[lo] = aes_inv_mix_column (ke[hi]);
+    }
+
+  kd[mid] = aes_inv_mix_column (ke[mid]);
+}
+
+#endif /* __aesni_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vppinfra/crypto/aes_cbc.h b/src/vppinfra/crypto/aes_cbc.h
new file mode 100644
index 00000000000..5c3054f4a93
--- /dev/null
+++ b/src/vppinfra/crypto/aes_cbc.h
@@ -0,0 +1,549 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2023 Cisco Systems, Inc.
+ */
+
+#ifndef __crypto_aes_cbc_h__
+#define __crypto_aes_cbc_h__
+
+#include <vppinfra/clib.h>
+#include <vppinfra/vector.h>
+#include <vppinfra/crypto/aes.h>
+
+typedef struct
+{ /* Expanded AES-CBC key material.  15 entries fit the largest schedule
+     (AES_KEY_ROUNDS (AES_KEY_256) + 1).  The members are const-qualified;
+     clib_aes_cbc_key_expand() fills them through a cast. */
+ const u8x16 encrypt_key[15]; /* round keys read by clib_aes_cbc_encrypt() */
+ const u8x16 decrypt_key[15]; /* round keys read by clib_aes_cbc_decrypt() */
+} aes_cbc_key_data_t;
+
+/* AES-CBC encrypt len bytes from src into dst.
+ *
+ * kd   expanded key data (kd->encrypt_key is used)
+ * src  plaintext
+ * len  number of bytes; processed in 16-byte steps, a trailing partial
+ *      block is read and written as a full 16 bytes, so callers should
+ *      pass a multiple of 16
+ * iv   16-byte initialization vector
+ * ks   key size selector
+ * dst  ciphertext output
+ */
+static_always_inline void
+clib_aes_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *src, uword len,
+		      const u8 *iv, aes_key_size_t ks, u8 *dst)
+{
+  int rounds = AES_KEY_ROUNDS (ks);
+  u8x16 r, *k = (u8x16 *) kd->encrypt_key;
+
+  r = *(u8x16u *) iv;
+
+  /* index has the same type as len: no signed/unsigned comparison and no
+     signed overflow for very large buffers */
+  for (uword i = 0; i < len; i += 16)
+    {
+      int j;
+#if __x86_64__
+      r = u8x16_xor3 (r, *(u8x16u *) (src + i), k[0]);
+      for (j = 1; j < rounds; j++)
+	r = aes_enc_round (r, k[j]);
+      r = aes_enc_last_round (r, k[rounds]);
+#else
+      r ^= *(u8x16u *) (src + i);
+      /* AESE XORs its key operand into the state before SubBytes and
+	 ShiftRows, so the round keys passed to it are k[0] .. k[rounds - 1]
+	 and k[rounds] is added at the very end.  The loop therefore has
+	 to start at 0; starting at 1 would skip k[0]. */
+      for (j = 0; j < rounds - 1; j++)
+	r = vaesmcq_u8 (vaeseq_u8 (r, k[j]));
+      r = vaeseq_u8 (r, k[j]) ^ k[rounds];
+#endif
+      *(u8x16u *) (dst + i) = r;
+    }
+}
+
+static_always_inline void
+clib_aes128_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext,
+ uword len, const u8 *iv, u8 *ciphertext)
+{ /* AES-128 front end: forwards to clib_aes_cbc_encrypt() with AES_KEY_128. */
+ clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_128, ciphertext);
+}
+
+static_always_inline void
+clib_aes192_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext,
+ uword len, const u8 *iv, u8 *ciphertext)
+{ /* AES-192 front end: forwards to clib_aes_cbc_encrypt() with AES_KEY_192. */
+ clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_192, ciphertext);
+}
+
+static_always_inline void
+clib_aes256_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext,
+ uword len, const u8 *iv, u8 *ciphertext)
+{ /* AES-256 front end: forwards to clib_aes_cbc_encrypt() with AES_KEY_256. */
+ clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_256, ciphertext);
+}
+
+/* AES-CBC decrypt count bytes from src into dst using 128-bit operations:
+ * four blocks per iteration while at least 64 bytes remain, then one block
+ * at a time.
+ *
+ * k       decryption round keys, k[0] .. k[rounds]
+ * src     ciphertext blocks
+ * dst     plaintext blocks
+ * iv      initialization vector, XORed into the first decrypted block
+ * count   number of bytes; the single-block loop runs while count > 0, so
+ *         a trailing partial block is handled as a full 16 bytes
+ * rounds  number of AES rounds
+ *
+ * Every ciphertext block is copied to c[] before the matching dst block is
+ * written, because it is needed as the chaining value of the next block.
+ */
+static_always_inline void __clib_unused
+aes_cbc_dec (const u8x16 *k, u8x16u *src, u8x16u *dst, u8x16u *iv, int count,
+	     int rounds)
+{
+  u8x16 r[4], c[4], f;
+
+  /* f is the chaining value: the IV first, later the last ciphertext
+     block of the previous iteration */
+  f = iv[0];
+  while (count >= 64)
+    {
+      c[0] = r[0] = src[0];
+      c[1] = r[1] = src[1];
+      c[2] = r[2] = src[2];
+      c[3] = r[3] = src[3];
+
+#if __x86_64__
+      r[0] ^= k[0];
+      r[1] ^= k[0];
+      r[2] ^= k[0];
+      r[3] ^= k[0];
+
+      for (int i = 1; i < rounds; i++)
+	{
+	  r[0] = aes_dec_round (r[0], k[i]);
+	  r[1] = aes_dec_round (r[1], k[i]);
+	  r[2] = aes_dec_round (r[2], k[i]);
+	  r[3] = aes_dec_round (r[3], k[i]);
+	}
+
+      r[0] = aes_dec_last_round (r[0], k[rounds]);
+      r[1] = aes_dec_last_round (r[1], k[rounds]);
+      r[2] = aes_dec_last_round (r[2], k[rounds]);
+      r[3] = aes_dec_last_round (r[3], k[rounds]);
+#else
+      /* AESD takes the round key as an operand, so the keys start at
+	 k[0]; k[rounds] is added after the last AESD */
+      for (int i = 0; i < rounds - 1; i++)
+	{
+	  r[0] = vaesimcq_u8 (vaesdq_u8 (r[0], k[i]));
+	  r[1] = vaesimcq_u8 (vaesdq_u8 (r[1], k[i]));
+	  r[2] = vaesimcq_u8 (vaesdq_u8 (r[2], k[i]));
+	  r[3] = vaesimcq_u8 (vaesdq_u8 (r[3], k[i]));
+	}
+      r[0] = vaesdq_u8 (r[0], k[rounds - 1]) ^ k[rounds];
+      r[1] = vaesdq_u8 (r[1], k[rounds - 1]) ^ k[rounds];
+      r[2] = vaesdq_u8 (r[2], k[rounds - 1]) ^ k[rounds];
+      r[3] = vaesdq_u8 (r[3], k[rounds - 1]) ^ k[rounds];
+#endif
+      dst[0] = r[0] ^ f;
+      dst[1] = r[1] ^ c[0];
+      dst[2] = r[2] ^ c[1];
+      dst[3] = r[3] ^ c[2];
+      f = c[3];
+
+      count -= 64;
+      src += 4;
+      dst += 4;
+    }
+
+  while (count > 0)
+    {
+      c[0] = r[0] = src[0];
+#if __x86_64__
+      r[0] ^= k[0];
+      for (int i = 1; i < rounds; i++)
+	r[0] = aes_dec_round (r[0], k[i]);
+      r[0] = aes_dec_last_round (r[0], k[rounds]);
+#else
+      /* the block was already loaded above; no second load is needed */
+      for (int i = 0; i < rounds - 1; i++)
+	r[0] = vaesimcq_u8 (vaesdq_u8 (r[0], k[i]));
+      r[0] = vaesdq_u8 (r[0], k[rounds - 1]) ^ k[rounds];
+#endif
+      dst[0] = r[0] ^ f;
+      f = c[0];
+
+      count -= 16;
+      src += 1;
+      dst += 1;
+    }
+}
+
+#if __x86_64__
+#if defined(__VAES__) && defined(__AVX512F__)
+
+/* Gather one 16-byte block at byte offset i from each of src[0..3] into
+ * positions 0..3 of a 64-byte vector. */
+static_always_inline u8x64
+aes_block_load_x4 (u8 *src[], int i)
+{
+  u8x16 b0 = aes_block_load (src[0] + i);
+  u8x16 b1 = aes_block_load (src[1] + i);
+  u8x16 b2 = aes_block_load (src[2] + i);
+  u8x16 b3 = aes_block_load (src[3] + i);
+  u8x64 v = {};
+
+  v = u8x64_insert_u8x16 (v, b0, 0);
+  v = u8x64_insert_u8x16 (v, b1, 1);
+  v = u8x64_insert_u8x16 (v, b2, 2);
+  v = u8x64_insert_u8x16 (v, b3, 3);
+  return v;
+}
+
+/* Scatter the four 16-byte parts of r to dst[0..3], each at byte offset
+ * i.  Stores are issued in order dst[0] .. dst[3]. */
+static_always_inline void
+aes_block_store_x4 (u8 *dst[], int i, u8x64 r)
+{
+  u8x16 b0 = u8x64_extract_u8x16 (r, 0);
+  u8x16 b1 = u8x64_extract_u8x16 (r, 1);
+  u8x16 b2 = u8x64_extract_u8x16 (r, 2);
+  u8x16 b3 = u8x64_extract_u8x16 (r, 3);
+
+  aes_block_store (dst[0] + i, b0);
+  aes_block_store (dst[1] + i, b1);
+  aes_block_store (dst[2] + i, b2);
+  aes_block_store (dst[3] + i, b3);
+}
+
+/* Build the CBC chaining vector for four consecutive blocks: the last
+ * 16 bytes of a (64-bit elements 6 and 7) followed by the first 48 bytes
+ * of b (elements 0..5, indices 8..13 of the concatenation).
+ *
+ * The operands are cast to u64x8 explicitly, as aes2_cbc_dec_permute does
+ * with u64x4, so the shuffle indices are applied to 64-bit elements. */
+static_always_inline u8x64
+aes4_cbc_dec_permute (u8x64 a, u8x64 b)
+{
+  return (u8x64) u64x8_shuffle2 ((u64x8) a, (u64x8) b, 6, 7, 8, 9, 10, 11, 12,
+				 13);
+}
+
+/* AES-CBC decrypt count bytes from src into dst using 512-bit VAES, i.e.
+ * four blocks per vector.
+ *
+ * k       decryption round keys, k[0] .. k[rounds]
+ * src     ciphertext
+ * dst     plaintext
+ * iv      initialization vector
+ * count   number of bytes; count >> 4 whole blocks are processed
+ * rounds  number of AES rounds.  This is a round count, not an
+ *         aes_key_size_t selector, hence the int type (as in aes_cbc_dec).
+ *
+ * Structure: 16 blocks per iteration while possible, then at most one
+ * 12/8/4 block step, then a masked load/store for the last 1..3 blocks.
+ * f always carries the previous ciphertext vector; only its top 16 bytes
+ * are consumed by aes4_cbc_dec_permute().
+ */
+static_always_inline void
+aes4_cbc_dec (const u8x16 *k, u8x64u *src, u8x64u *dst, u8x16u *iv, int count,
+	      int rounds)
+{
+  u8x64 f, k4, r[4], c[4] = {};
+  __mmask8 m;
+  int i, n_blocks = count >> 4;
+
+  /* place the IV where the "previous ciphertext block" is expected */
+  f = u8x64_insert_u8x16 (u8x64_zero (), *iv, 3);
+
+  while (n_blocks >= 16)
+    {
+      k4 = u8x64_splat_u8x16 (k[0]);
+      c[0] = src[0];
+      c[1] = src[1];
+      c[2] = src[2];
+      c[3] = src[3];
+
+      r[0] = c[0] ^ k4;
+      r[1] = c[1] ^ k4;
+      r[2] = c[2] ^ k4;
+      r[3] = c[3] ^ k4;
+
+      for (i = 1; i < rounds; i++)
+	{
+	  k4 = u8x64_splat_u8x16 (k[i]);
+	  r[0] = aes_dec_round_x4 (r[0], k4);
+	  r[1] = aes_dec_round_x4 (r[1], k4);
+	  r[2] = aes_dec_round_x4 (r[2], k4);
+	  r[3] = aes_dec_round_x4 (r[3], k4);
+	}
+
+      /* i == rounds here */
+      k4 = u8x64_splat_u8x16 (k[i]);
+      r[0] = aes_dec_last_round_x4 (r[0], k4);
+      r[1] = aes_dec_last_round_x4 (r[1], k4);
+      r[2] = aes_dec_last_round_x4 (r[2], k4);
+      r[3] = aes_dec_last_round_x4 (r[3], k4);
+
+      dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]);
+      dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]);
+      dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]);
+      dst[3] = r[3] ^= aes4_cbc_dec_permute (c[2], c[3]);
+      f = c[3];
+
+      n_blocks -= 16;
+      src += 4;
+      dst += 4;
+    }
+
+  if (n_blocks >= 12)
+    {
+      k4 = u8x64_splat_u8x16 (k[0]);
+      c[0] = src[0];
+      c[1] = src[1];
+      c[2] = src[2];
+
+      r[0] = c[0] ^ k4;
+      r[1] = c[1] ^ k4;
+      r[2] = c[2] ^ k4;
+
+      for (i = 1; i < rounds; i++)
+	{
+	  k4 = u8x64_splat_u8x16 (k[i]);
+	  r[0] = aes_dec_round_x4 (r[0], k4);
+	  r[1] = aes_dec_round_x4 (r[1], k4);
+	  r[2] = aes_dec_round_x4 (r[2], k4);
+	}
+
+      k4 = u8x64_splat_u8x16 (k[i]);
+      r[0] = aes_dec_last_round_x4 (r[0], k4);
+      r[1] = aes_dec_last_round_x4 (r[1], k4);
+      r[2] = aes_dec_last_round_x4 (r[2], k4);
+
+      dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]);
+      dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]);
+      dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]);
+      f = c[2];
+
+      n_blocks -= 12;
+      src += 3;
+      dst += 3;
+    }
+  else if (n_blocks >= 8)
+    {
+      k4 = u8x64_splat_u8x16 (k[0]);
+      c[0] = src[0];
+      c[1] = src[1];
+
+      r[0] = c[0] ^ k4;
+      r[1] = c[1] ^ k4;
+
+      for (i = 1; i < rounds; i++)
+	{
+	  k4 = u8x64_splat_u8x16 (k[i]);
+	  r[0] = aes_dec_round_x4 (r[0], k4);
+	  r[1] = aes_dec_round_x4 (r[1], k4);
+	}
+
+      k4 = u8x64_splat_u8x16 (k[i]);
+      r[0] = aes_dec_last_round_x4 (r[0], k4);
+      r[1] = aes_dec_last_round_x4 (r[1], k4);
+
+      dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]);
+      dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]);
+      f = c[1];
+
+      n_blocks -= 8;
+      src += 2;
+      dst += 2;
+    }
+  else if (n_blocks >= 4)
+    {
+      c[0] = src[0];
+
+      r[0] = c[0] ^ u8x64_splat_u8x16 (k[0]);
+
+      for (i = 1; i < rounds; i++)
+	r[0] = aes_dec_round_x4 (r[0], u8x64_splat_u8x16 (k[i]));
+
+      r[0] = aes_dec_last_round_x4 (r[0], u8x64_splat_u8x16 (k[i]));
+
+      dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]);
+      f = c[0];
+
+      n_blocks -= 4;
+      src += 1;
+      dst += 1;
+    }
+
+  if (n_blocks > 0)
+    {
+      k4 = u8x64_splat_u8x16 (k[0]);
+      /* 1..3 blocks left: two 64-bit mask bits per 16-byte block */
+      m = (1 << (n_blocks * 2)) - 1;
+      c[0] =
+	(u8x64) _mm512_mask_loadu_epi64 ((__m512i) c[0], m, (__m512i *) src);
+      f = aes4_cbc_dec_permute (f, c[0]);
+      r[0] = c[0] ^ k4;
+      for (i = 1; i < rounds; i++)
+	r[0] = aes_dec_round_x4 (r[0], u8x64_splat_u8x16 (k[i]));
+      r[0] = aes_dec_last_round_x4 (r[0], u8x64_splat_u8x16 (k[i]));
+      /* only the lanes selected by m are written back */
+      _mm512_mask_storeu_epi64 ((__m512i *) dst, m, (__m512i) (r[0] ^ f));
+    }
+}
+#elif defined(__VAES__)
+
+/* Gather one 16-byte block at byte offset i from src[0] (low half) and
+ * src[1] (high half) into a 32-byte vector. */
+static_always_inline u8x32
+aes_block_load_x2 (u8 *src[], int i)
+{
+  u8x16 lo = aes_block_load (src[0] + i);
+  u8x16 hi = aes_block_load (src[1] + i);
+  u8x32 v = {};
+
+  v = u8x32_insert_lo (v, lo);
+  v = u8x32_insert_hi (v, hi);
+  return v;
+}
+
+/* Scatter the low half of r to dst[0] and the high half to dst[1], each
+ * at byte offset i.  The dst[0] store is issued first. */
+static_always_inline void
+aes_block_store_x2 (u8 *dst[], int i, u8x32 r)
+{
+  u8x16 lo = u8x32_extract_lo (r);
+  u8x16 hi = u8x32_extract_hi (r);
+
+  aes_block_store (dst[0] + i, lo);
+  aes_block_store (dst[1] + i, hi);
+}
+
+/* Build the CBC chaining vector for two consecutive blocks: the last
+ * 16 bytes of a (64-bit elements 2 and 3) followed by the first 16 bytes
+ * of b (indices 4 and 5 of the concatenation). */
+static_always_inline u8x32
+aes2_cbc_dec_permute (u8x32 a, u8x32 b)
+{
+  u64x4 prev = (u64x4) a;
+  u64x4 cur = (u64x4) b;
+  return (u8x32) u64x4_shuffle2 (prev, cur, 2, 3, 4, 5);
+}
+
+/* AES-CBC decrypt count bytes from src into dst using 256-bit VAES, i.e.
+ * two blocks per vector.
+ *
+ * k       decryption round keys, k[0] .. k[rounds]
+ * src     ciphertext
+ * dst     plaintext
+ * iv      initialization vector
+ * count   number of bytes; count >> 4 whole blocks are processed
+ * rounds  number of AES rounds.  This is a round count, not an
+ *         aes_key_size_t selector, hence the int type (as in aes_cbc_dec).
+ *
+ * Structure: 8 blocks per iteration while possible, then at most one
+ * 6/4/2 block step, then a single 128-bit block if one is left.
+ * f always carries the previous ciphertext vector; only its high 16 bytes
+ * are consumed as the chaining value.
+ */
+static_always_inline void
+aes2_cbc_dec (const u8x16 *k, u8x32u *src, u8x32u *dst, u8x16u *iv, int count,
+	      int rounds)
+{
+  u8x32 k2, f = {}, r[4], c[4] = {};
+  int i, n_blocks = count >> 4;
+
+  /* place the IV where the "previous ciphertext block" is expected */
+  f = u8x32_insert_hi (f, *iv);
+
+  while (n_blocks >= 8)
+    {
+      k2 = u8x32_splat_u8x16 (k[0]);
+      c[0] = src[0];
+      c[1] = src[1];
+      c[2] = src[2];
+      c[3] = src[3];
+
+      r[0] = c[0] ^ k2;
+      r[1] = c[1] ^ k2;
+      r[2] = c[2] ^ k2;
+      r[3] = c[3] ^ k2;
+
+      for (i = 1; i < rounds; i++)
+	{
+	  k2 = u8x32_splat_u8x16 (k[i]);
+	  r[0] = aes_dec_round_x2 (r[0], k2);
+	  r[1] = aes_dec_round_x2 (r[1], k2);
+	  r[2] = aes_dec_round_x2 (r[2], k2);
+	  r[3] = aes_dec_round_x2 (r[3], k2);
+	}
+
+      /* i == rounds here */
+      k2 = u8x32_splat_u8x16 (k[i]);
+      r[0] = aes_dec_last_round_x2 (r[0], k2);
+      r[1] = aes_dec_last_round_x2 (r[1], k2);
+      r[2] = aes_dec_last_round_x2 (r[2], k2);
+      r[3] = aes_dec_last_round_x2 (r[3], k2);
+
+      dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
+      dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]);
+      dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]);
+      dst[3] = r[3] ^= aes2_cbc_dec_permute (c[2], c[3]);
+      f = c[3];
+
+      n_blocks -= 8;
+      src += 4;
+      dst += 4;
+    }
+
+  if (n_blocks >= 6)
+    {
+      k2 = u8x32_splat_u8x16 (k[0]);
+      c[0] = src[0];
+      c[1] = src[1];
+      c[2] = src[2];
+
+      r[0] = c[0] ^ k2;
+      r[1] = c[1] ^ k2;
+      r[2] = c[2] ^ k2;
+
+      for (i = 1; i < rounds; i++)
+	{
+	  k2 = u8x32_splat_u8x16 (k[i]);
+	  r[0] = aes_dec_round_x2 (r[0], k2);
+	  r[1] = aes_dec_round_x2 (r[1], k2);
+	  r[2] = aes_dec_round_x2 (r[2], k2);
+	}
+
+      k2 = u8x32_splat_u8x16 (k[i]);
+      r[0] = aes_dec_last_round_x2 (r[0], k2);
+      r[1] = aes_dec_last_round_x2 (r[1], k2);
+      r[2] = aes_dec_last_round_x2 (r[2], k2);
+
+      dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
+      dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]);
+      dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]);
+      f = c[2];
+
+      n_blocks -= 6;
+      src += 3;
+      dst += 3;
+    }
+  else if (n_blocks >= 4)
+    {
+      k2 = u8x32_splat_u8x16 (k[0]);
+      c[0] = src[0];
+      c[1] = src[1];
+
+      r[0] = c[0] ^ k2;
+      r[1] = c[1] ^ k2;
+
+      for (i = 1; i < rounds; i++)
+	{
+	  k2 = u8x32_splat_u8x16 (k[i]);
+	  r[0] = aes_dec_round_x2 (r[0], k2);
+	  r[1] = aes_dec_round_x2 (r[1], k2);
+	}
+
+      k2 = u8x32_splat_u8x16 (k[i]);
+      r[0] = aes_dec_last_round_x2 (r[0], k2);
+      r[1] = aes_dec_last_round_x2 (r[1], k2);
+
+      dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
+      dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]);
+      f = c[1];
+
+      n_blocks -= 4;
+      src += 2;
+      dst += 2;
+    }
+  else if (n_blocks >= 2)
+    {
+      k2 = u8x32_splat_u8x16 (k[0]);
+      c[0] = src[0];
+      r[0] = c[0] ^ k2;
+
+      for (i = 1; i < rounds; i++)
+	r[0] = aes_dec_round_x2 (r[0], u8x32_splat_u8x16 (k[i]));
+
+      r[0] = aes_dec_last_round_x2 (r[0], u8x32_splat_u8x16 (k[i]));
+      dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
+      f = c[0];
+
+      n_blocks -= 2;
+      src += 1;
+      dst += 1;
+    }
+
+  if (n_blocks > 0)
+    {
+      u8x16 rl = *(u8x16u *) src ^ k[0];
+      for (i = 1; i < rounds; i++)
+	rl = aes_dec_round (rl, k[i]);
+      rl = aes_dec_last_round (rl, k[i]);
+      /* dst carries no alignment guarantee (it is a u8x32u pointer), so
+	 the store goes through the same u8x16u type as the load above;
+	 the plain u8x16 type would allow an aligned store instruction */
+      *(u8x16u *) dst = rl ^ u8x32_extract_hi (f);
+    }
+}
+#endif
+#endif
+
+/* Expand key for key size ks and store both the encryption and the
+ * decryption schedule (AES_KEY_ROUNDS (ks) + 1 entries each) in kd. */
+static_always_inline void
+clib_aes_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key,
+			 aes_key_size_t ks)
+{
+  const int n_keys = AES_KEY_ROUNDS (ks) + 1;
+  u8x16 enc[15], dec[15];
+  /* the struct members are const-qualified, so write through a cast */
+  u8x16 *dec_out = (u8x16 *) kd->decrypt_key;
+  u8x16 *enc_out = (u8x16 *) kd->encrypt_key;
+
+  aes_key_expand (enc, key, ks);
+  aes_key_enc_to_dec (enc, dec, ks);
+
+  for (int idx = 0; idx < n_keys; idx++)
+    {
+      dec_out[idx] = dec[idx];
+      enc_out[idx] = enc[idx];
+    }
+}
+
+static_always_inline void
+clib_aes128_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key)
+{ /* AES-128 front end: forwards to clib_aes_cbc_key_expand() with AES_KEY_128. */
+ clib_aes_cbc_key_expand (kd, key, AES_KEY_128);
+}
+static_always_inline void
+clib_aes192_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key)
+{ /* AES-192 front end: forwards to clib_aes_cbc_key_expand() with AES_KEY_192. */
+ clib_aes_cbc_key_expand (kd, key, AES_KEY_192);
+}
+static_always_inline void
+clib_aes256_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key)
+{ /* AES-256 front end: forwards to clib_aes_cbc_key_expand() with AES_KEY_256. */
+ clib_aes_cbc_key_expand (kd, key, AES_KEY_256);
+}
+
+/* AES-CBC decrypt len bytes of ciphertext into plaintext, dispatching to
+ * the widest implementation enabled at compile time (512-bit VAES,
+ * 256-bit VAES or the 128-bit version).  len is narrowed to int. */
+static_always_inline void
+clib_aes_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext,
+		      uword len, const u8 *iv, aes_key_size_t ks,
+		      u8 *plaintext)
+{
+  int rounds = AES_KEY_ROUNDS (ks);
+  int n_bytes = (int) len;
+  u8x16u *ivp = (u8x16u *) iv;
+
+#if defined(__VAES__) && defined(__AVX512F__)
+  aes4_cbc_dec (kd->decrypt_key, (u8x64u *) ciphertext, (u8x64u *) plaintext,
+		ivp, n_bytes, rounds);
+#elif defined(__VAES__)
+  aes2_cbc_dec (kd->decrypt_key, (u8x32u *) ciphertext, (u8x32u *) plaintext,
+		ivp, n_bytes, rounds);
+#else
+  aes_cbc_dec (kd->decrypt_key, (u8x16u *) ciphertext, (u8x16u *) plaintext,
+	       ivp, n_bytes, rounds);
+#endif
+}
+
+static_always_inline void
+clib_aes128_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext,
+ uword len, const u8 *iv, u8 *plaintext)
+{ /* AES-128 front end: forwards to clib_aes_cbc_decrypt() with AES_KEY_128. */
+ clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_128, plaintext);
+}
+
+static_always_inline void
+clib_aes192_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext,
+ uword len, const u8 *iv, u8 *plaintext)
+{ /* AES-192 front end: forwards to clib_aes_cbc_decrypt() with AES_KEY_192. */
+ clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_192, plaintext);
+}
+
+static_always_inline void
+clib_aes256_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext,
+ uword len, const u8 *iv, u8 *plaintext)
+{ /* AES-256 front end: forwards to clib_aes_cbc_decrypt() with AES_KEY_256. */
+ clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_256, plaintext);
+}
+
+#endif /* __crypto_aes_cbc_h__ */
diff --git a/src/vppinfra/crypto/aes_gcm.h b/src/vppinfra/crypto/aes_gcm.h
new file mode 100644
index 00000000000..8a5f76c3b33
--- /dev/null
+++ b/src/vppinfra/crypto/aes_gcm.h
@@ -0,0 +1,975 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2023 Cisco Systems, Inc.
+ */
+
+#ifndef __crypto_aes_gcm_h__
+#define __crypto_aes_gcm_h__
+
+#include <vppinfra/clib.h>
+#include <vppinfra/vector.h>
+#include <vppinfra/cache.h>
+#include <vppinfra/string.h>
+#include <vppinfra/crypto/aes.h>
+#include <vppinfra/crypto/ghash.h>
+
+/* Number of pre-calculated hash key entries kept in aes_gcm_key_data_t.Hi */
+#define NUM_HI 36
+/* Compile-time selection of the vector width used by the GCM code below:
+ *  - VAES + AVX512F: 64-byte vectors (N = 64)
+ *  - VAES only:      32-byte vectors (N = 32)
+ *  - otherwise:      16-byte vectors (N = 16)
+ * Each branch defines the data / hash-key / memory / counter vector types and
+ * maps the aes_gcm_* helper macros onto the matching-width primitives. */
+#if defined(__VAES__) && defined(__AVX512F__)
+typedef u8x64 aes_data_t;
+typedef u8x64u aes_ghash_t;
+typedef u8x64u aes_mem_t;
+typedef u32x16 aes_gcm_counter_t;
+#define N 64
+#define aes_gcm_load_partial(p, n) u8x64_load_partial ((u8 *) (p), n)
+#define aes_gcm_store_partial(v, p, n) u8x64_store_partial (v, (u8 *) (p), n)
+#define aes_gcm_splat(v) u8x64_splat (v)
+#define aes_gcm_reflect(r) u8x64_reflect_u8x16 (r)
+#define aes_gcm_ghash_reduce(c) ghash4_reduce (&(c)->gd)
+#define aes_gcm_ghash_reduce2(c) ghash4_reduce2 (&(c)->gd)
+#define aes_gcm_ghash_final(c) (c)->T = ghash4_final (&(c)->gd)
+#elif defined(__VAES__)
+typedef u8x32 aes_data_t;
+typedef u8x32u aes_ghash_t;
+typedef u8x32u aes_mem_t;
+typedef u32x8 aes_gcm_counter_t;
+#define N 32
+#define aes_gcm_load_partial(p, n) u8x32_load_partial ((u8 *) (p), n)
+#define aes_gcm_store_partial(v, p, n) u8x32_store_partial (v, (u8 *) (p), n)
+#define aes_gcm_splat(v) u8x32_splat (v)
+#define aes_gcm_reflect(r) u8x32_reflect_u8x16 (r)
+#define aes_gcm_ghash_reduce(c) ghash2_reduce (&(c)->gd)
+#define aes_gcm_ghash_reduce2(c) ghash2_reduce2 (&(c)->gd)
+#define aes_gcm_ghash_final(c) (c)->T = ghash2_final (&(c)->gd)
+#else
+typedef u8x16 aes_data_t;
+typedef u8x16 aes_ghash_t;
+typedef u8x16u aes_mem_t;
+typedef u32x4 aes_gcm_counter_t;
+#define N 16
+#define aes_gcm_load_partial(p, n) u8x16_load_partial ((u8 *) (p), n)
+#define aes_gcm_store_partial(v, p, n) u8x16_store_partial (v, (u8 *) (p), n)
+#define aes_gcm_splat(v) u8x16_splat (v)
+#define aes_gcm_reflect(r) u8x16_reflect (r)
+#define aes_gcm_ghash_reduce(c) ghash_reduce (&(c)->gd)
+#define aes_gcm_ghash_reduce2(c) ghash_reduce2 (&(c)->gd)
+#define aes_gcm_ghash_final(c) (c)->T = ghash_final (&(c)->gd)
+#endif
+/* Number of 16-byte lanes in one aes_data_t vector (4, 2 or 1) */
+#define N_LANES (N / 16)
+
+/* Operation selector passed to aes_gcm () and stored in
+ * aes_gcm_ctx_t.operation. */
+typedef enum
+{
+ /* zero value (identifier spelling is part of the existing interface) */
+ AES_GCM_OP_UNKNONW = 0,
+ /* encrypt the data and produce a tag */
+ AES_GCM_OP_ENCRYPT,
+ /* decrypt the data and check the supplied tag */
+ AES_GCM_OP_DECRYPT,
+ /* hash the AAD only and produce a tag; no data is processed */
+ AES_GCM_OP_GMAC
+} aes_gcm_op_t;
+
+/* One expanded AES round key, 64-byte aligned, viewable as a 16-, 32- or
+ * 64-byte vector or as four 16-byte lanes. clib_aes_gcm_key_expand () stores
+ * the same round key in all four lanes, so every view holds the key
+ * replicated across its width. */
+typedef union
+{
+ u8x16 x1;
+ u8x32 x2;
+ u8x64 x4;
+ u8x16 lanes[4];
+} __clib_aligned (64)
+aes_gcm_expaned_key_t;
+
+/* Per-key data filled in by clib_aes_gcm_key_expand () and consumed
+ * read-only by aes_gcm (). Both members are declared const; the key expand
+ * routine writes them through a cast. */
+typedef struct
+{
+ /* pre-calculated hash key values */
+ const u8x16 Hi[NUM_HI];
+ /* extracted AES key (sized for the largest key, AES-256) */
+ const aes_gcm_expaned_key_t Ke[AES_KEY_ROUNDS (AES_KEY_256) + 1];
+} aes_gcm_key_data_t;
+
+/* Working state of a single aes_gcm () invocation. */
+typedef struct
+{
+ /* requested operation (encrypt / decrypt / GMAC) */
+ aes_gcm_op_t operation;
+ /* set to 1 by aes_gcm_enc () / aes_gcm_dec () before the final,
+    possibly partial, group of blocks is processed */
+ int last;
+ /* number of AES rounds for the key size in use */
+ u8 rounds;
+ /* total data and AAD lengths in bytes, used for the GHASH length block */
+ uword data_bytes;
+ uword aad_bytes;
+
+ /* running GHASH value; written by aes_gcm_ghash_final () */
+ u8x16 T;
+
+ /* hash */
+ const u8x16 *Hi;
+ /* next hash key vector to be consumed by aes_gcm_ghash_mul_next () */
+ const aes_ghash_t *next_Hi;
+
+ /* expanded keys */
+ const aes_gcm_expaned_key_t *Ke;
+
+ /* counter */
+ /* host byte order copy of the block counter */
+ u32 counter;
+ /* counter-0 block, encrypted one round at a time by
+    aes_gcm_enc_ctr0_round () */
+ u8x16 EY0;
+ /* counter blocks for the next vector of keystream */
+ aes_gcm_counter_t Y;
+
+ /* ghash */
+ ghash_data_t gd;
+} aes_gcm_ctx_t;
+
+/* Start a new GHASH batch with the first data vector.
+ *
+ * `n_lanes` is the total number of 16-byte lanes that will be multiplied in
+ * this batch; it selects the starting hash key entry Hi[NUM_HI - n_lanes].
+ * ctx->next_Hi is left pointing at the hash key vector for the following
+ * aes_gcm_ghash_mul_next () call. The byte-reflected data is XORed with the
+ * running tag ctx->T (placed in the lowest 16-byte lane for wide vectors)
+ * before the multiply.
+ *
+ * NOTE(review): presumably Hi[] holds descending powers of H so that the
+ * last entry is H^1 - verify against ghash_precompute (). */
+static_always_inline void
+aes_gcm_ghash_mul_first (aes_gcm_ctx_t *ctx, aes_data_t data, u32 n_lanes)
+{
+  const uword first_hi = NUM_HI - n_lanes;
+  const aes_ghash_t *hi = (aes_ghash_t *) (ctx->Hi + first_hi);
+  const aes_data_t refl = aes_gcm_reflect (data);
+
+  /* the vector at `hi` is consumed below, the next call starts after it */
+  ctx->next_Hi = hi + 1;
+
+#if N_LANES == 4
+  u8x64 tag4 = {};
+  tag4 = u8x64_insert_u8x16 (tag4, ctx->T, 0);
+  ghash4_mul_first (&ctx->gd, refl ^ tag4, *hi);
+#elif N_LANES == 2
+  u8x32 tag2 = {};
+  tag2 = u8x32_insert_lo (tag2, ctx->T);
+  ghash2_mul_first (&ctx->gd, refl ^ tag2, *hi);
+#else
+  ghash_mul_first (&ctx->gd, refl ^ ctx->T, *hi);
+#endif
+}
+
+/* Continue the current GHASH batch: multiply the byte-reflected `data`
+ * vector by the hash key vector at ctx->next_Hi and advance ctx->next_Hi by
+ * one vector. */
+static_always_inline void
+aes_gcm_ghash_mul_next (aes_gcm_ctx_t *ctx, aes_data_t data)
+{
+  const aes_ghash_t *hi = ctx->next_Hi;
+  const aes_data_t refl = aes_gcm_reflect (data);
+
+  ctx->next_Hi = hi + 1;
+
+#if N_LANES == 4
+  ghash4_mul_next (&ctx->gd, refl, *hi);
+#elif N_LANES == 2
+  ghash2_mul_next (&ctx->gd, refl, *hi);
+#else
+  ghash_mul_next (&ctx->gd, refl, *hi);
+#endif
+}
+
+/* Add the length block to the current GHASH batch.
+ *
+ * The block holds ctx->data_bytes and ctx->aad_bytes converted to bit counts
+ * (<< 3) and is multiplied by the last hash key entry Hi[NUM_HI - 1]. For
+ * wide vectors both operands occupy the lowest 16-byte lane, all other lanes
+ * are zero. */
+static_always_inline void
+aes_gcm_ghash_mul_bit_len (aes_gcm_ctx_t *ctx)
+{
+  const u64x2 n_bits = (u64x2){ ctx->data_bytes, ctx->aad_bytes } << 3;
+  const u8x16 len_block = (u8x16) n_bits;
+  const u8x16 last_hi = ctx->Hi[NUM_HI - 1];
+
+#if N_LANES == 4
+  u8x64 h4 = u8x64_insert_u8x16 (u8x64_zero (), last_hi, 0);
+  u8x64 len4 = u8x64_insert_u8x16 (u8x64_zero (), len_block, 0);
+  ghash4_mul_next (&ctx->gd, len4, h4);
+#elif N_LANES == 2
+  u8x32 h2 = u8x32_insert_lo (u8x32_zero (), last_hi);
+  u8x32 len2 = u8x32_insert_lo (u8x32_zero (), len_block);
+  ghash2_mul_next (&ctx->gd, len2, h2);
+#else
+  ghash_mul_next (&ctx->gd, len_block, last_hi);
+#endif
+}
+
+/* Apply one AES round to the counter-0 block ctx->EY0.
+ *
+ * Round 0 is the initial round-key XOR, round ctx->rounds is the final AES
+ * round, everything in between is a regular round. Callers invoke this for
+ * rounds 0 .. ctx->rounds in order, often interleaved with GHASH work. */
+static_always_inline void
+aes_gcm_enc_ctr0_round (aes_gcm_ctx_t *ctx, int aes_round)
+{
+  const u8x16 round_key = ctx->Ke[aes_round].x1;
+
+  if (aes_round == 0)
+    {
+      ctx->EY0 ^= round_key;
+      return;
+    }
+
+  if (aes_round == ctx->rounds)
+    ctx->EY0 = aes_enc_last_round (ctx->EY0, round_key);
+  else
+    ctx->EY0 = aes_enc_round (ctx->EY0, round_key);
+}
+
+/* GHASH `n_left` bytes starting at `data` (the AAD) into ctx->T.
+ *
+ * Input is consumed in batches of up to 8 vectors of N bytes; a trailing
+ * partial vector is zero padded via a local copy.
+ *
+ * When ctx->operation is AES_GCM_OP_GMAC there is no data phase, so this
+ * function also
+ *  - folds the length block into the final batch (or multiplies it directly
+ *    when there is no input at all), and
+ *  - runs all AES rounds on the counter-0 block ctx->EY0.
+ *
+ * `data` is only read. */
+static_always_inline void
+aes_gcm_ghash (aes_gcm_ctx_t *ctx, const u8 *data, u32 n_left)
+{
+  uword i;
+  aes_data_t r = {};
+  const aes_mem_t *d = (const aes_mem_t *) data;
+
+  for (; n_left >= 8 * N; n_left -= 8 * N, d += 8)
+    {
+      /* GMAC and exactly 8 vectors left: this is the final batch, so the
+	 length block takes one extra lane */
+      if (ctx->operation == AES_GCM_OP_GMAC && n_left == N * 8)
+	{
+	  aes_gcm_ghash_mul_first (ctx, d[0], 8 * N_LANES + 1);
+	  for (i = 1; i < 8; i++)
+	    aes_gcm_ghash_mul_next (ctx, d[i]);
+	  aes_gcm_ghash_mul_bit_len (ctx);
+	  aes_gcm_ghash_reduce (ctx);
+	  aes_gcm_ghash_reduce2 (ctx);
+	  aes_gcm_ghash_final (ctx);
+	  goto done;
+	}
+
+      aes_gcm_ghash_mul_first (ctx, d[0], 8 * N_LANES);
+      for (i = 1; i < 8; i++)
+	aes_gcm_ghash_mul_next (ctx, d[i]);
+      aes_gcm_ghash_reduce (ctx);
+      aes_gcm_ghash_reduce2 (ctx);
+      aes_gcm_ghash_final (ctx);
+    }
+
+  if (n_left > 0)
+    {
+      /* number of 16-byte lanes left, rounded up */
+      int n_lanes = (n_left + 15) / 16;
+
+      /* one more lane for the length block */
+      if (ctx->operation == AES_GCM_OP_GMAC)
+	n_lanes++;
+
+      if (n_left < N)
+	{
+	  /* less than one vector: hash a zero padded copy */
+	  clib_memcpy_fast (&r, d, n_left);
+	  aes_gcm_ghash_mul_first (ctx, r, n_lanes);
+	}
+      else
+	{
+	  aes_gcm_ghash_mul_first (ctx, d[0], n_lanes);
+	  n_left -= N;
+	  i = 1;
+
+	  if (n_left >= 4 * N)
+	    {
+	      aes_gcm_ghash_mul_next (ctx, d[i]);
+	      aes_gcm_ghash_mul_next (ctx, d[i + 1]);
+	      aes_gcm_ghash_mul_next (ctx, d[i + 2]);
+	      aes_gcm_ghash_mul_next (ctx, d[i + 3]);
+	      n_left -= 4 * N;
+	      i += 4;
+	    }
+	  if (n_left >= 2 * N)
+	    {
+	      aes_gcm_ghash_mul_next (ctx, d[i]);
+	      aes_gcm_ghash_mul_next (ctx, d[i + 1]);
+	      n_left -= 2 * N;
+	      i += 2;
+	    }
+
+	  if (n_left >= N)
+	    {
+	      aes_gcm_ghash_mul_next (ctx, d[i]);
+	      n_left -= N;
+	      i += 1;
+	    }
+
+	  if (n_left)
+	    {
+	      /* trailing partial vector: hash a zero padded copy */
+	      clib_memcpy_fast (&r, d + i, n_left);
+	      aes_gcm_ghash_mul_next (ctx, r);
+	    }
+	}
+
+      if (ctx->operation == AES_GCM_OP_GMAC)
+	aes_gcm_ghash_mul_bit_len (ctx);
+      aes_gcm_ghash_reduce (ctx);
+      aes_gcm_ghash_reduce2 (ctx);
+      aes_gcm_ghash_final (ctx);
+    }
+  else if (ctx->operation == AES_GCM_OP_GMAC)
+    {
+      /* nothing left to hash: only the length block remains */
+      u8x16 len_block =
+	(u8x16) ((u64x2){ ctx->data_bytes, ctx->aad_bytes } << 3);
+      ctx->T = ghash_mul (len_block ^ ctx->T, ctx->Hi[NUM_HI - 1]);
+    }
+
+done:
+  /* encrypt counter 0 E(Y0, k) */
+  if (ctx->operation == AES_GCM_OP_GMAC)
+    for (int round = 0; round < ctx->rounds + 1; round += 1)
+      aes_gcm_enc_ctr0_round (ctx, round);
+}
+
+/* AES round 0 for a batch of `n_blocks` (1..4) counter vectors.
+ *
+ * Each r[i] is set to round key 0 XORed with the current counter vector
+ * ctx->Y, then ctx->Y and the host-order ctx->counter are advanced by the
+ * number of 16-byte blocks consumed. The cheap path adds to the least
+ * significant counter byte only; the slower paths handle the cases where
+ * that byte would wrap. */
+static_always_inline void
+aes_gcm_enc_first_round (aes_gcm_ctx_t *ctx, aes_data_t *r, uword n_blocks)
+{
+ const aes_gcm_expaned_key_t Ke0 = ctx->Ke[0];
+ uword i = 0;
+
+#if N_LANES == 4
+ const u32x16 ctr_inv_4444 = { 0, 0, 0, 4 << 24, 0, 0, 0, 4 << 24,
+ 0, 0, 0, 4 << 24, 0, 0, 0, 4 << 24 };
+
+ const u32x16 ctr_4444 = {
+ 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0,
+ };
+
+ /* As counter is stored in network byte order for performance reasons we
+ are incrementing least significant byte only except in case where we
+ overflow. As we are processing four 512-bit blocks in parallel except the
+ last round, overflow can happen only when n == 4 */
+
+ if (n_blocks == 4)
+ for (; i < 2; i++)
+ {
+ r[i] = Ke0.x4 ^ (u8x64) ctx->Y;
+ ctx->Y += ctr_inv_4444;
+ }
+
+ /* low counter byte 242: the remaining two vectors of this batch cross the
+    byte boundary, so increment in reflected form to get a full carry */
+ if (n_blocks == 4 && PREDICT_FALSE ((u8) ctx->counter == 242))
+ {
+ u32x16 Yr = (u32x16) aes_gcm_reflect ((u8x64) ctx->Y);
+
+ for (; i < n_blocks; i++)
+ {
+ r[i] = Ke0.x4 ^ (u8x64) ctx->Y;
+ Yr += ctr_4444;
+ ctx->Y = (u32x16) aes_gcm_reflect ((u8x64) Yr);
+ }
+ }
+ else
+ {
+ for (; i < n_blocks; i++)
+ {
+ r[i] = Ke0.x4 ^ (u8x64) ctx->Y;
+ ctx->Y += ctr_inv_4444;
+ }
+ }
+ ctx->counter += n_blocks * 4;
+#elif N_LANES == 2
+ const u32x8 ctr_inv_22 = { 0, 0, 0, 2 << 24, 0, 0, 0, 2 << 24 };
+ const u32x8 ctr_22 = { 2, 0, 0, 0, 2, 0, 0, 0 };
+
+ /* As counter is stored in network byte order for performance reasons we
+ are incrementing least significant byte only except in case where we
+ overflow. As we are processing four 256-bit blocks in parallel except the
+ last round, overflow can happen only when n == 4 */
+
+ if (n_blocks == 4)
+ for (; i < 2; i++)
+ {
+ r[i] = Ke0.x2 ^ (u8x32) ctx->Y;
+ ctx->Y += ctr_inv_22;
+ }
+
+ /* low counter byte 250: the remaining two vectors of this batch cross the
+    byte boundary, so increment in reflected form to get a full carry */
+ if (n_blocks == 4 && PREDICT_FALSE ((u8) ctx->counter == 250))
+ {
+ u32x8 Yr = (u32x8) aes_gcm_reflect ((u8x32) ctx->Y);
+
+ for (; i < n_blocks; i++)
+ {
+ r[i] = Ke0.x2 ^ (u8x32) ctx->Y;
+ Yr += ctr_22;
+ ctx->Y = (u32x8) aes_gcm_reflect ((u8x32) Yr);
+ }
+ }
+ else
+ {
+ for (; i < n_blocks; i++)
+ {
+ r[i] = Ke0.x2 ^ (u8x32) ctx->Y;
+ ctx->Y += ctr_inv_22;
+ }
+ }
+ ctx->counter += n_blocks * 2;
+#else
+ const u32x4 ctr_inv_1 = { 0, 0, 0, 1 << 24 };
+
+ /* byte-wise increment is used unless the low counter byte is 0xfe or 0xff
+    and three or more blocks are requested */
+ if (PREDICT_TRUE ((u8) ctx->counter < 0xfe) || n_blocks < 3)
+ {
+ for (; i < n_blocks; i++)
+ {
+ r[i] = Ke0.x1 ^ (u8x16) ctx->Y;
+ ctx->Y += ctr_inv_1;
+ }
+ ctx->counter += n_blocks;
+ }
+ else
+ {
+ /* first block still uses the byte-wise increment */
+ r[i++] = Ke0.x1 ^ (u8x16) ctx->Y;
+ ctx->Y += ctr_inv_1;
+ ctx->counter += 1;
+
+ /* remaining blocks rebuild the counter word from the host-order copy */
+ for (; i < n_blocks; i++)
+ {
+ r[i] = Ke0.x1 ^ (u8x16) ctx->Y;
+ ctx->counter++;
+ ctx->Y[3] = clib_host_to_net_u32 (ctx->counter);
+ }
+ }
+#endif
+}
+
+/* Apply one regular AES round with round key `Ke` to each of the first
+ * `n_blocks` vectors in `r`, using the primitive that matches the compiled
+ * vector width. */
+static_always_inline void
+aes_gcm_enc_round (aes_data_t *r, const aes_gcm_expaned_key_t *Ke,
+		   uword n_blocks)
+{
+  uword blk;
+
+#if N_LANES == 4
+  for (blk = 0; blk < n_blocks; blk++)
+    r[blk] = aes_enc_round_x4 (r[blk], Ke->x4);
+#elif N_LANES == 2
+  for (blk = 0; blk < n_blocks; blk++)
+    r[blk] = aes_enc_round_x2 (r[blk], Ke->x2);
+#else
+  for (blk = 0; blk < n_blocks; blk++)
+    r[blk] = aes_enc_round (r[blk], Ke->x1);
+#endif
+}
+
+/* Finish keystream generation and apply it to the data.
+ *
+ * Runs AES rounds 10 .. ctx->rounds - 1 (only present for AES-192 and
+ * AES-256), then the final AES round with round key Ke[ctx->rounds], and
+ * XORs the resulting keystream into d[0 .. n_blocks - 1]. `Ke` points at
+ * round key 0. */
+static_always_inline void
+aes_gcm_enc_last_round (aes_gcm_ctx_t *ctx, aes_data_t *r, aes_data_t *d,
+			const aes_gcm_expaned_key_t *Ke, uword n_blocks)
+{
+  const aes_gcm_expaned_key_t *last_key = Ke + ctx->rounds;
+  int extra;
+  uword blk;
+
+  /* additional rounds for AES-192 and AES-256 */
+  for (extra = 10; extra < ctx->rounds; extra++)
+    aes_gcm_enc_round (r, Ke + extra, n_blocks);
+
+  for (blk = 0; blk < n_blocks; blk++)
+    {
+#if N_LANES == 4
+      d[blk] ^= aes_enc_last_round_x4 (r[blk], last_key->x4);
+#elif N_LANES == 2
+      d[blk] ^= aes_enc_last_round_x2 (r[blk], last_key->x2);
+#else
+      d[blk] ^= aes_enc_last_round (r[blk], last_key->x1);
+#endif
+    }
+}
+
+/* Encrypt or decrypt one group of `n` (1..4) vectors, with the GHASH
+ * multiplies interleaved between pairs of AES rounds.
+ *
+ * ctx        - GCM state; ctx->last != 0 marks the last vector as partial
+ * d          - scratch of 4 vectors. On decrypt it is loaded from `src`
+ *              before hashing. On encrypt the GHASH input is the content
+ *              left in d[] by the previous call (always 4 vectors), and d[]
+ *              is reloaded from `src` only afterwards.
+ * src, dst   - input and output buffers
+ * n          - number of vectors processed in this call
+ * n_bytes    - number of bytes processed in this call
+ * with_ghash - when 0 the GHASH multiplies and the final steps are skipped
+ *              (used for the first encrypt group, when d[] holds nothing to
+ *              hash yet) */
+static_always_inline void
+aes_gcm_calc (aes_gcm_ctx_t *ctx, aes_data_t *d, const u8 *src, u8 *dst, u32 n,
+ u32 n_bytes, int with_ghash)
+{
+ const aes_gcm_expaned_key_t *k = ctx->Ke;
+ const aes_mem_t *sv = (aes_mem_t *) src;
+ aes_mem_t *dv = (aes_mem_t *) dst;
+ uword ghash_blocks, gc = 1;
+ aes_data_t r[4];
+ u32 i, n_lanes;
+
+ if (ctx->operation == AES_GCM_OP_ENCRYPT)
+ {
+ /* hashing the previous group, which is always 4 full vectors */
+ ghash_blocks = 4;
+ n_lanes = N_LANES * 4;
+ }
+ else
+ {
+ ghash_blocks = n;
+ n_lanes = n * N_LANES;
+#if N_LANES != 1
+ /* partial last vector: count only the 16-byte lanes that hold data */
+ if (ctx->last)
+ n_lanes = (n_bytes + 15) / 16;
+#endif
+ }
+
+ /* from here on n_bytes is the number of bytes in the last vector */
+ n_bytes -= (n - 1) * N;
+
+ /* AES rounds 0 and 1 */
+ aes_gcm_enc_first_round (ctx, r, n);
+ aes_gcm_enc_round (r, k + 1, n);
+
+ /* load data - decrypt round */
+ if (ctx->operation == AES_GCM_OP_DECRYPT)
+ {
+ for (i = 0; i < n - ctx->last; i++)
+ d[i] = sv[i];
+
+ if (ctx->last)
+ d[n - 1] = aes_gcm_load_partial ((u8 *) (sv + n - 1), n_bytes);
+ }
+
+ /* GHASH multiply block 0 */
+ if (with_ghash)
+ aes_gcm_ghash_mul_first (ctx, d[0], n_lanes);
+
+ /* AES rounds 2 and 3 */
+ aes_gcm_enc_round (r, k + 2, n);
+ aes_gcm_enc_round (r, k + 3, n);
+
+ /* GHASH multiply block 1 */
+ if (with_ghash && gc++ < ghash_blocks)
+ aes_gcm_ghash_mul_next (ctx, (d[1]));
+
+ /* AES rounds 4 and 5 */
+ aes_gcm_enc_round (r, k + 4, n);
+ aes_gcm_enc_round (r, k + 5, n);
+
+ /* GHASH multiply block 2 */
+ if (with_ghash && gc++ < ghash_blocks)
+ aes_gcm_ghash_mul_next (ctx, (d[2]));
+
+ /* AES rounds 6 and 7 */
+ aes_gcm_enc_round (r, k + 6, n);
+ aes_gcm_enc_round (r, k + 7, n);
+
+ /* GHASH multiply block 3 */
+ if (with_ghash && gc++ < ghash_blocks)
+ aes_gcm_ghash_mul_next (ctx, (d[3]));
+
+ /* load data - encrypt round */
+ if (ctx->operation == AES_GCM_OP_ENCRYPT)
+ {
+ for (i = 0; i < n - ctx->last; i++)
+ d[i] = sv[i];
+
+ if (ctx->last)
+ d[n - 1] = aes_gcm_load_partial (sv + n - 1, n_bytes);
+ }
+
+ /* AES rounds 8 and 9 */
+ aes_gcm_enc_round (r, k + 8, n);
+ aes_gcm_enc_round (r, k + 9, n);
+
+ /* AES last round(s) */
+ aes_gcm_enc_last_round (ctx, r, d, k, n);
+
+ /* store data */
+ for (i = 0; i < n - ctx->last; i++)
+ dv[i] = d[i];
+
+ if (ctx->last)
+ aes_gcm_store_partial (d[n - 1], dv + n - 1, n_bytes);
+
+ /* GHASH reduce 1st step */
+ /* NOTE(review): unlike the two steps below this one is not guarded by
+    with_ghash - confirm whether that is intentional */
+ aes_gcm_ghash_reduce (ctx);
+
+ /* GHASH reduce 2nd step */
+ if (with_ghash)
+ aes_gcm_ghash_reduce2 (ctx);
+
+ /* GHASH final step */
+ if (with_ghash)
+ aes_gcm_ghash_final (ctx);
+}
+
+/* Encrypt or decrypt two consecutive groups of 4 full vectors (8 * N bytes)
+ * and hash 8 vectors as a single GHASH batch.
+ *
+ * As in aes_gcm_calc (), on decrypt each group is loaded into d[] before it
+ * is hashed, while on encrypt the content d[] held before the load is hashed
+ * (the previous group's output) and the new input is loaded afterwards.
+ *
+ * NOTE(review): `with_ghash` is accepted but never read here; GHASH is
+ * always performed. */
+static_always_inline void
+aes_gcm_calc_double (aes_gcm_ctx_t *ctx, aes_data_t *d, const u8 *src, u8 *dst,
+ int with_ghash)
+{
+ const aes_gcm_expaned_key_t *k = ctx->Ke;
+ const aes_mem_t *sv = (aes_mem_t *) src;
+ aes_mem_t *dv = (aes_mem_t *) dst;
+ aes_data_t r[4];
+
+ /* AES rounds 0 and 1 */
+ aes_gcm_enc_first_round (ctx, r, 4);
+ aes_gcm_enc_round (r, k + 1, 4);
+
+ /* load 4 blocks of data - decrypt round */
+ if (ctx->operation == AES_GCM_OP_DECRYPT)
+ for (int i = 0; i < 4; i++)
+ d[i] = sv[i];
+
+ /* GHASH multiply block 0 */
+ aes_gcm_ghash_mul_first (ctx, d[0], N_LANES * 8);
+
+ /* AES rounds 2 and 3 */
+ aes_gcm_enc_round (r, k + 2, 4);
+ aes_gcm_enc_round (r, k + 3, 4);
+
+ /* GHASH multiply block 1 */
+ aes_gcm_ghash_mul_next (ctx, (d[1]));
+
+ /* AES rounds 4 and 5 */
+ aes_gcm_enc_round (r, k + 4, 4);
+ aes_gcm_enc_round (r, k + 5, 4);
+
+ /* GHASH multiply block 2 */
+ aes_gcm_ghash_mul_next (ctx, (d[2]));
+
+ /* AES rounds 6 and 7 */
+ aes_gcm_enc_round (r, k + 6, 4);
+ aes_gcm_enc_round (r, k + 7, 4);
+
+ /* GHASH multiply block 3 */
+ aes_gcm_ghash_mul_next (ctx, (d[3]));
+
+ /* AES rounds 8 and 9 */
+ aes_gcm_enc_round (r, k + 8, 4);
+ aes_gcm_enc_round (r, k + 9, 4);
+
+ /* load 4 blocks of data - encrypt round */
+ if (ctx->operation == AES_GCM_OP_ENCRYPT)
+ for (int i = 0; i < 4; i++)
+ d[i] = sv[i];
+
+ /* AES last round(s) */
+ aes_gcm_enc_last_round (ctx, r, d, k, 4);
+
+ /* store 4 blocks of data */
+ for (int i = 0; i < 4; i++)
+ dv[i] = d[i];
+
+ /* load next 4 blocks of data - decrypt round */
+ if (ctx->operation == AES_GCM_OP_DECRYPT)
+ for (int i = 0; i < 4; i++)
+ d[i] = sv[i + 4];
+
+ /* GHASH multiply block 4 */
+ aes_gcm_ghash_mul_next (ctx, (d[0]));
+
+ /* AES rounds 0 and 1 */
+ aes_gcm_enc_first_round (ctx, r, 4);
+ aes_gcm_enc_round (r, k + 1, 4);
+
+ /* GHASH multiply block 5 */
+ aes_gcm_ghash_mul_next (ctx, (d[1]));
+
+ /* AES rounds 2 and 3 */
+ aes_gcm_enc_round (r, k + 2, 4);
+ aes_gcm_enc_round (r, k + 3, 4);
+
+ /* GHASH multiply block 6 */
+ aes_gcm_ghash_mul_next (ctx, (d[2]));
+
+ /* AES rounds 4 and 5 */
+ aes_gcm_enc_round (r, k + 4, 4);
+ aes_gcm_enc_round (r, k + 5, 4);
+
+ /* GHASH multiply block 7 */
+ aes_gcm_ghash_mul_next (ctx, (d[3]));
+
+ /* AES rounds 6 and 7 */
+ aes_gcm_enc_round (r, k + 6, 4);
+ aes_gcm_enc_round (r, k + 7, 4);
+
+ /* GHASH reduce 1st step */
+ aes_gcm_ghash_reduce (ctx);
+
+ /* AES rounds 8 and 9 */
+ aes_gcm_enc_round (r, k + 8, 4);
+ aes_gcm_enc_round (r, k + 9, 4);
+
+ /* GHASH reduce 2nd step */
+ aes_gcm_ghash_reduce2 (ctx);
+
+ /* load 4 blocks of data - encrypt round */
+ if (ctx->operation == AES_GCM_OP_ENCRYPT)
+ for (int i = 0; i < 4; i++)
+ d[i] = sv[i + 4];
+
+ /* AES last round(s) */
+ aes_gcm_enc_last_round (ctx, r, d, k, 4);
+
+ /* store data */
+ for (int i = 0; i < 4; i++)
+ dv[i + 4] = d[i];
+
+ /* GHASH final step */
+ aes_gcm_ghash_final (ctx);
+}
+
+/* Zero every byte of d[0] whose index is >= n_bytes, keeping the first
+ * n_bytes bytes unchanged.
+ *
+ * The mask is built by comparing a splat of n_bytes against a vector of byte
+ * indices. The index table has 64 entries; the union exposes its first
+ * sizeof (aes_data_t) entries as a vector. */
+static_always_inline void
+aes_gcm_mask_bytes (aes_data_t *d, uword n_bytes)
+{
+ const union
+ {
+ u8 b[64];
+ aes_data_t r;
+ } scale = {
+ .b = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
+ };
+
+ /* lanes where n_bytes > index compare true and are kept */
+ d[0] &= (aes_gcm_splat (n_bytes) > scale.r);
+}
+
+/* Final step of encryption: hash the last `n_blocks` vectors held in d[]
+ * together with the length block, interleaved with the AES rounds that
+ * encrypt the counter-0 block ctx->EY0.
+ *
+ * n_bytes is the number of valid bytes in d[0 .. n_blocks - 1]; bytes past
+ * the end of a partial last vector are cleared before hashing. */
+static_always_inline void
+aes_gcm_calc_last (aes_gcm_ctx_t *ctx, aes_data_t *d, int n_blocks,
+ u32 n_bytes)
+{
+ /* lanes holding data, plus one for the length block */
+ int n_lanes = (N_LANES == 1 ? n_blocks : (n_bytes + 15) / 16) + 1;
+ /* from here on n_bytes is the number of bytes in the last vector */
+ n_bytes -= (n_blocks - 1) * N;
+ int i;
+
+ aes_gcm_enc_ctr0_round (ctx, 0);
+ aes_gcm_enc_ctr0_round (ctx, 1);
+
+ /* partial last vector: clear the bytes beyond n_bytes */
+ if (n_bytes != N)
+ aes_gcm_mask_bytes (d + n_blocks - 1, n_bytes);
+
+ aes_gcm_ghash_mul_first (ctx, d[0], n_lanes);
+
+ aes_gcm_enc_ctr0_round (ctx, 2);
+ aes_gcm_enc_ctr0_round (ctx, 3);
+
+ if (n_blocks > 1)
+ aes_gcm_ghash_mul_next (ctx, d[1]);
+
+ aes_gcm_enc_ctr0_round (ctx, 4);
+ aes_gcm_enc_ctr0_round (ctx, 5);
+
+ if (n_blocks > 2)
+ aes_gcm_ghash_mul_next (ctx, d[2]);
+
+ aes_gcm_enc_ctr0_round (ctx, 6);
+ aes_gcm_enc_ctr0_round (ctx, 7);
+
+ if (n_blocks > 3)
+ aes_gcm_ghash_mul_next (ctx, d[3]);
+
+ aes_gcm_enc_ctr0_round (ctx, 8);
+ aes_gcm_enc_ctr0_round (ctx, 9);
+
+ aes_gcm_ghash_mul_bit_len (ctx);
+ aes_gcm_ghash_reduce (ctx);
+
+ /* additional rounds for AES-192 and AES-256 */
+ for (i = 10; i < ctx->rounds; i++)
+ aes_gcm_enc_ctr0_round (ctx, i);
+
+ aes_gcm_ghash_reduce2 (ctx);
+
+ aes_gcm_ghash_final (ctx);
+
+ /* the loop above leaves i at ctx->rounds when rounds >= 10: final AES
+    round */
+ aes_gcm_enc_ctr0_round (ctx, i);
+}
+
+/* Encrypt `n_left` bytes from `src` to `dst` and finish the GHASH.
+ *
+ * Ciphertext is hashed one group behind: the first group is encrypted
+ * without GHASH, each later call hashes the previous group's output while
+ * encrypting the current one, and aes_gcm_calc_last () hashes the final
+ * group together with the length block and encrypts the counter-0 block.
+ *
+ * Block counts are passed to aes_gcm_calc () / aes_gcm_calc_last () as
+ * literal constants in each branch. */
+static_always_inline void
+aes_gcm_enc (aes_gcm_ctx_t *ctx, const u8 *src, u8 *dst, u32 n_left)
+{
+ aes_data_t d[4];
+
+ /* no data: only the counter-0 block needs to be encrypted */
+ if (PREDICT_FALSE (n_left == 0))
+ {
+ int i;
+ for (i = 0; i < ctx->rounds + 1; i++)
+ aes_gcm_enc_ctr0_round (ctx, i);
+ return;
+ }
+
+ /* everything fits in a single, possibly partial, group */
+ if (n_left < 4 * N)
+ {
+ ctx->last = 1;
+ if (n_left > 3 * N)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 4, n_left, /* with_ghash */ 0);
+ aes_gcm_calc_last (ctx, d, 4, n_left);
+ }
+ else if (n_left > 2 * N)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 3, n_left, /* with_ghash */ 0);
+ aes_gcm_calc_last (ctx, d, 3, n_left);
+ }
+ else if (n_left > N)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 2, n_left, /* with_ghash */ 0);
+ aes_gcm_calc_last (ctx, d, 2, n_left);
+ }
+ else
+ {
+ aes_gcm_calc (ctx, d, src, dst, 1, n_left, /* with_ghash */ 0);
+ aes_gcm_calc_last (ctx, d, 1, n_left);
+ }
+ return;
+ }
+ /* first full group: nothing to hash yet */
+ aes_gcm_calc (ctx, d, src, dst, 4, 4 * N, /* with_ghash */ 0);
+
+ /* next */
+ n_left -= 4 * N;
+ dst += 4 * N;
+ src += 4 * N;
+
+ for (; n_left >= 8 * N; n_left -= 8 * N, src += 8 * N, dst += 8 * N)
+ aes_gcm_calc_double (ctx, d, src, dst, /* with_ghash */ 1);
+
+ if (n_left >= 4 * N)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 4, 4 * N, /* with_ghash */ 1);
+
+ /* next */
+ n_left -= 4 * N;
+ dst += 4 * N;
+ src += 4 * N;
+ }
+
+ /* input ended on a group boundary: d[] holds 4 full vectors to hash */
+ if (n_left == 0)
+ {
+ aes_gcm_calc_last (ctx, d, 4, 4 * N);
+ return;
+ }
+
+ ctx->last = 1;
+
+ if (n_left > 3 * N)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 4, n_left, /* with_ghash */ 1);
+ aes_gcm_calc_last (ctx, d, 4, n_left);
+ }
+ else if (n_left > 2 * N)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 3, n_left, /* with_ghash */ 1);
+ aes_gcm_calc_last (ctx, d, 3, n_left);
+ }
+ else if (n_left > N)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 2, n_left, /* with_ghash */ 1);
+ aes_gcm_calc_last (ctx, d, 2, n_left);
+ }
+ else
+ {
+ aes_gcm_calc (ctx, d, src, dst, 1, n_left, /* with_ghash */ 1);
+ aes_gcm_calc_last (ctx, d, 1, n_left);
+ }
+}
+
+/* Decrypt `n_left` bytes from `src` to `dst` and finish the GHASH.
+ *
+ * Ciphertext is hashed in the same call that decrypts it, so every
+ * aes_gcm_calc () / aes_gcm_calc_double () call runs with GHASH enabled.
+ * Afterwards the length block is folded into ctx->T and the counter-0 block
+ * ctx->EY0 is encrypted. */
+static_always_inline void
+aes_gcm_dec (aes_gcm_ctx_t *ctx, const u8 *src, u8 *dst, uword n_left)
+{
+  aes_data_t blocks[4] = {};
+  u8x16 len_block;
+  int round;
+
+  /* bulk: two groups of 4 vectors per iteration */
+  while (n_left >= 8 * N)
+    {
+      aes_gcm_calc_double (ctx, blocks, src, dst, /* with_ghash */ 1);
+      n_left -= 8 * N;
+      dst += 8 * N;
+      src += 8 * N;
+    }
+
+  /* one more full group of 4 vectors */
+  if (n_left >= 4 * N)
+    {
+      aes_gcm_calc (ctx, blocks, src, dst, 4, 4 * N, /* with_ghash */ 1);
+      n_left -= 4 * N;
+      dst += N * 4;
+      src += N * 4;
+    }
+
+  /* remainder: 1 to 4 vectors, the last of which may be partial */
+  if (n_left != 0)
+    {
+      ctx->last = 1;
+
+      if (n_left > 3 * N)
+	aes_gcm_calc (ctx, blocks, src, dst, 4, n_left, /* with_ghash */ 1);
+      else if (n_left > 2 * N)
+	aes_gcm_calc (ctx, blocks, src, dst, 3, n_left, /* with_ghash */ 1);
+      else if (n_left > N)
+	aes_gcm_calc (ctx, blocks, src, dst, 2, n_left, /* with_ghash */ 1);
+      else
+	aes_gcm_calc (ctx, blocks, src, dst, 1, n_left, /* with_ghash */ 1);
+    }
+
+  /* fold the bit lengths of data and AAD into the hash */
+  len_block = (u8x16) ((u64x2){ ctx->data_bytes, ctx->aad_bytes } << 3);
+  ctx->T = ghash_mul (len_block ^ ctx->T, ctx->Hi[NUM_HI - 1]);
+
+  /* encrypt counter 0 E(Y0, k) */
+  for (round = 0; round <= ctx->rounds; round++)
+    aes_gcm_enc_ctr0_round (ctx, round);
+}
+
+/* Common AES-GCM / GMAC entry point used by the clib_aes*_gcm_* and
+ * clib_aes*_gmac wrappers.
+ *
+ * src, dst    - input and output data buffers (not used for GMAC)
+ * aad         - additional authenticated data, aad_bytes long
+ * ivp         - initialization vector; 12 bytes are read
+ * tag         - tag buffer: written for ENCRYPT and GMAC, read and compared
+ *               for DECRYPT
+ * data_bytes  - length of src / dst
+ * tag_len     - tag length in bytes; only the low 4 bits are used and a
+ *               value of 16 selects the full 16-byte tag
+ * kd          - key data prepared by clib_aes_gcm_key_expand ()
+ * aes_rounds  - number of AES rounds for the key size
+ * op          - operation to perform
+ *
+ * Returns 1 only when op is not ENCRYPT / GMAC and the computed tag matches
+ * `tag`; returns 0 in every other case, including all ENCRYPT and GMAC
+ * calls. */
+static_always_inline int
+aes_gcm (const u8 *src, u8 *dst, const u8 *aad, u8 *ivp, u8 *tag,
+ u32 data_bytes, u32 aad_bytes, u8 tag_len,
+ const aes_gcm_key_data_t *kd, int aes_rounds, aes_gcm_op_t op)
+{
+ u8 *addt = (u8 *) aad;
+ u32x4 Y0;
+
+ /* counter starts at 2: value 1 is used for the counter-0 block below */
+ aes_gcm_ctx_t _ctx = { .counter = 2,
+ .rounds = aes_rounds,
+ .operation = op,
+ .data_bytes = data_bytes,
+ .aad_bytes = aad_bytes,
+ .Hi = kd->Hi },
+ *ctx = &_ctx;
+
+ /* initialize counter: 8 + 4 bytes of IV followed by the 32-bit block
+    counter kept in network byte order */
+ Y0 = (u32x4) (u64x2){ *(u64u *) ivp, 0 };
+ Y0[2] = *(u32u *) (ivp + 8);
+ Y0[3] = 1 << 24;
+ ctx->EY0 = (u8x16) Y0;
+ ctx->Ke = kd->Ke;
+ /* each 16-byte lane of Y gets its own consecutive counter value */
+#if N_LANES == 4
+ ctx->Y = u32x16_splat_u32x4 (Y0) + (u32x16){
+ 0, 0, 0, 1 << 24, 0, 0, 0, 2 << 24, 0, 0, 0, 3 << 24, 0, 0, 0, 4 << 24,
+ };
+#elif N_LANES == 2
+ ctx->Y =
+ u32x8_splat_u32x4 (Y0) + (u32x8){ 0, 0, 0, 1 << 24, 0, 0, 0, 2 << 24 };
+#else
+ ctx->Y = Y0 + (u32x4){ 0, 0, 0, 1 << 24 };
+#endif
+
+ /* calculate ghash for AAD */
+ aes_gcm_ghash (ctx, addt, aad_bytes);
+
+ clib_prefetch_load (tag);
+
+ /* ghash and encrypt/decrypt */
+ if (op == AES_GCM_OP_ENCRYPT)
+ aes_gcm_enc (ctx, src, dst, data_bytes);
+ else if (op == AES_GCM_OP_DECRYPT)
+ aes_gcm_dec (ctx, src, dst, data_bytes);
+
+ /* final tag is the byte-reflected hash XORed with the encrypted
+    counter-0 block */
+ ctx->T = u8x16_reflect (ctx->T) ^ ctx->EY0;
+
+ /* tag_len 16 -> 0 */
+ tag_len &= 0xf;
+
+ if (op == AES_GCM_OP_ENCRYPT || op == AES_GCM_OP_GMAC)
+ {
+ /* store tag */
+ if (tag_len)
+ u8x16_store_partial (ctx->T, tag, tag_len);
+ else
+ ((u8x16u *) tag)[0] = ctx->T;
+ }
+ else
+ {
+ /* check tag */
+ if (tag_len)
+ {
+ /* truncated tag: only the first tag_len bytes have to match */
+ u16 mask = pow2_mask (tag_len);
+ u8x16 expected = u8x16_load_partial (tag, tag_len);
+ if ((u8x16_msb_mask (expected == ctx->T) & mask) == mask)
+ return 1;
+ }
+ else
+ {
+ if (u8x16_is_equal (ctx->T, *(u8x16u *) tag))
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/* Fill `kd` from a raw AES key of size `ks`.
+ *
+ * The expanded round keys are stored with each round key replicated into all
+ * four 16-byte lanes of its aes_gcm_expaned_key_t entry. The hash key H is
+ * the encryption of an all-zero block, byte-reflected, and is handed to
+ * ghash_precompute () to fill kd->Hi. The members of `kd` are declared
+ * const, hence the casts. */
+static_always_inline void
+clib_aes_gcm_key_expand (aes_gcm_key_data_t *kd, const u8 *key,
+			 aes_key_size_t ks)
+{
+  aes_gcm_expaned_key_t *expanded = (aes_gcm_expaned_key_t *) kd->Ke;
+  u8x16 round_keys[AES_KEY_ROUNDS (AES_KEY_256) + 1];
+  u8x16 hash_key;
+  int n_keys = AES_KEY_ROUNDS (ks) + 1;
+
+  /* expand AES key and replicate every round key into all lanes */
+  aes_key_expand (round_keys, key, ks);
+  for (int rk = 0; rk < n_keys; rk++)
+    for (int lane = 0; lane < 4; lane++)
+      expanded[rk].lanes[lane] = round_keys[rk];
+
+  /* pre-calculate H */
+  hash_key = aes_encrypt_block (u8x16_zero (), round_keys, ks);
+  hash_key = u8x16_reflect (hash_key);
+  ghash_precompute (hash_key, (u8x16 *) kd->Hi, ARRAY_LEN (kd->Hi));
+}
+
+/* AES-128-GCM encrypt `plaintext` into `cyphertext` and write `tag_bytes`
+ * bytes of authentication tag to `tag`. Forwards to aes_gcm () with the
+ * AES-128 round count and AES_GCM_OP_ENCRYPT; the return value of aes_gcm ()
+ * is not used. */
+static_always_inline void
+clib_aes128_gcm_enc (const aes_gcm_key_data_t *kd, const u8 *plaintext,
+		     u32 data_bytes, const u8 *aad, u32 aad_bytes,
+		     const u8 *iv, u32 tag_bytes, u8 *cyphertext, u8 *tag)
+{
+  const int n_rounds = AES_KEY_ROUNDS (AES_KEY_128);
+  u8 *ivp = (u8 *) iv;
+
+  aes_gcm (plaintext, cyphertext, aad, ivp, tag, data_bytes, aad_bytes,
+	   tag_bytes, kd, n_rounds, AES_GCM_OP_ENCRYPT);
+}
+
+/* AES-256-GCM encrypt `plaintext` into `cyphertext` and write `tag_bytes`
+ * bytes of authentication tag to `tag`. Forwards to aes_gcm () with the
+ * AES-256 round count and AES_GCM_OP_ENCRYPT; the return value of aes_gcm ()
+ * is not used. */
+static_always_inline void
+clib_aes256_gcm_enc (const aes_gcm_key_data_t *kd, const u8 *plaintext,
+		     u32 data_bytes, const u8 *aad, u32 aad_bytes,
+		     const u8 *iv, u32 tag_bytes, u8 *cyphertext, u8 *tag)
+{
+  const int n_rounds = AES_KEY_ROUNDS (AES_KEY_256);
+  u8 *ivp = (u8 *) iv;
+
+  aes_gcm (plaintext, cyphertext, aad, ivp, tag, data_bytes, aad_bytes,
+	   tag_bytes, kd, n_rounds, AES_GCM_OP_ENCRYPT);
+}
+
+/* AES-128-GCM decrypt `cyphertext` into `plaintext` and check the first
+ * `tag_bytes` bytes of `tag`. Forwards to aes_gcm () with the AES-128 round
+ * count and AES_GCM_OP_DECRYPT.
+ *
+ * Returns 1 when the tag matches, 0 otherwise. */
+static_always_inline int
+clib_aes128_gcm_dec (const aes_gcm_key_data_t *kd, const u8 *cyphertext,
+		     u32 data_bytes, const u8 *aad, u32 aad_bytes,
+		     const u8 *iv, const u8 *tag, u32 tag_bytes, u8 *plaintext)
+{
+  const int n_rounds = AES_KEY_ROUNDS (AES_KEY_128);
+  u8 *ivp = (u8 *) iv;
+  u8 *tagp = (u8 *) tag;
+
+  return aes_gcm (cyphertext, plaintext, aad, ivp, tagp, data_bytes,
+		  aad_bytes, tag_bytes, kd, n_rounds, AES_GCM_OP_DECRYPT);
+}
+
+/* AES-256-GCM decrypt `cyphertext` into `plaintext` and check the first
+ * `tag_bytes` bytes of `tag`. Forwards to aes_gcm () with the AES-256 round
+ * count and AES_GCM_OP_DECRYPT.
+ *
+ * Returns 1 when the tag matches, 0 otherwise. */
+static_always_inline int
+clib_aes256_gcm_dec (const aes_gcm_key_data_t *kd, const u8 *cyphertext,
+		     u32 data_bytes, const u8 *aad, u32 aad_bytes,
+		     const u8 *iv, const u8 *tag, u32 tag_bytes, u8 *plaintext)
+{
+  const int n_rounds = AES_KEY_ROUNDS (AES_KEY_256);
+  u8 *ivp = (u8 *) iv;
+  u8 *tagp = (u8 *) tag;
+
+  return aes_gcm (cyphertext, plaintext, aad, ivp, tagp, data_bytes,
+		  aad_bytes, tag_bytes, kd, n_rounds, AES_GCM_OP_DECRYPT);
+}
+
+/* AES-128 GMAC: authenticate `data_bytes` bytes at `data` and write
+ * `tag_bytes` bytes of tag to `tag`. Forwards to aes_gcm () with no
+ * source / destination buffers, a data length of zero and `data` passed as
+ * the AAD. */
+static_always_inline void
+clib_aes128_gmac (const aes_gcm_key_data_t *kd, const u8 *data, u32 data_bytes,
+		  const u8 *iv, u32 tag_bytes, u8 *tag)
+{
+  const int n_rounds = AES_KEY_ROUNDS (AES_KEY_128);
+  u8 *ivp = (u8 *) iv;
+
+  aes_gcm (0, 0, data, ivp, tag, 0, data_bytes, tag_bytes, kd, n_rounds,
+	   AES_GCM_OP_GMAC);
+}
+
+/* AES-256 GMAC: authenticate `data_bytes` bytes at `data` and write
+ * `tag_bytes` bytes of tag to `tag`. Forwards to aes_gcm () with no
+ * source / destination buffers, a data length of zero and `data` passed as
+ * the AAD. */
+static_always_inline void
+clib_aes256_gmac (const aes_gcm_key_data_t *kd, const u8 *data, u32 data_bytes,
+		  const u8 *iv, u32 tag_bytes, u8 *tag)
+{
+  const int n_rounds = AES_KEY_ROUNDS (AES_KEY_256);
+  u8 *ivp = (u8 *) iv;
+
+  aes_gcm (0, 0, data, ivp, tag, 0, data_bytes, tag_bytes, kd, n_rounds,
+	   AES_GCM_OP_GMAC);
+}
+
+#endif /* __crypto_aes_gcm_h__ */
diff --git a/src/vppinfra/crypto/ghash.h b/src/vppinfra/crypto/ghash.h
new file mode 100644
index 00000000000..bae8badb5fc
--- /dev/null
+++ b/src/vppinfra/crypto/ghash.h
@@ -0,0 +1,515 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2019 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+/*
+ *------------------------------------------------------------------
+ * Copyright(c) 2018, Intel Corporation All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *------------------------------------------------------------------
+ */
+
+/*
+ * Based on work by: Shay Gueron, Michael E. Kounavis, Erdinc Ozturk,
+ * Vinodh Gopal, James Guilford, Tomasz Kantecki
+ *
+ * References:
+ * [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on
+ * Intel Architecture Processors. August, 2010
+ * [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on
+ * Intel Architecture Processors. October, 2012.
+ * [3] intel-ipsec-mb library, https://github.com/01org/intel-ipsec-mb.git
+ *
+ * Definitions:
+ * GF Galois Extension Field GF(2^128) - finite field where elements are
+ * represented as polynomials with coefficients in GF(2) with the
+ * highest degree of 127. Polynomials are represented as 128-bit binary
+ * numbers where each bit represents one coefficient.
+ * e.g. polynomial x^5 + x^3 + x + 1 is represented in binary 101011.
+ * H hash key (128 bit)
+ * POLY irreducible polynomial x^128 + x^7 + x^2 + x + 1
+ * RPOLY irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1
+ * + addition in GF, which equals to XOR operation
+ * * multiplication in GF
+ *
+ * GF multiplication consists of 2 steps:
+ * - carry-less multiplication of two 128-bit operands into 256-bit result
+ * - reduction of 256-bit result into 128-bit with modulo POLY
+ *
+ * GHash is calculated on 128-bit blocks of data according to the following
+ * formula:
+ * GH = (GH + data) * hash_key
+ *
+ * To avoid bit-reflection of data, this code uses GF multiplication
+ * with reversed polynomial:
+ * a * b * x^-127 mod RPOLY
+ *
+ * To improve computation speed table Hi is precomputed with powers of H',
+ * where H' is calculated as H<<1 mod RPOLY. The table is filled in descending
+ * order of power: Hi[n - 1] holds H' and Hi[0] holds the n-th power of H'.
+ * This allows us to improve performance by deferring reduction. For example
+ * to calculate ghash of 4 128-bit blocks of data (b0, b1, b2, b3), we can do:
+ *
+ * u8x16 Hi[4];
+ * ghash_precompute (H, Hi, 4);
+ *
+ * ghash_data_t _gd, *gd = &_gd;
+ * ghash_mul_first (gd, GH ^ b0, Hi[0]);
+ * ghash_mul_next (gd, b1, Hi[1]);
+ * ghash_mul_next (gd, b2, Hi[2]);
+ * ghash_mul_next (gd, b3, Hi[3]);
+ * ghash_reduce (gd);
+ * ghash_reduce2 (gd);
+ * GH = ghash_final (gd);
+ *
+ * Reduction step is split into 3 functions so it can be better interleaved
+ * with other code, (i.e. with AES computation).
+ */
+
+#ifndef __ghash_h__
+#define __ghash_h__
+
+/* Carry-less multiply of the low 64-bit half of a with the low 64-bit half
+   of b (a0 * b0). Implemented with PCLMULQDQ on x86 or PMULL on ARM; no
+   implementation is provided when neither feature macro is defined. */
+static_always_inline u8x16
+gmul_lo_lo (u8x16 a, u8x16 b)
+{
+#if defined (__PCLMUL__)
+  __m128i va = (__m128i) a;
+  __m128i vb = (__m128i) b;
+  return (u8x16) _mm_clmulepi64_si128 (va, vb, 0x00);
+#elif defined (__ARM_FEATURE_CRYPTO)
+  poly64_t a_lo = (poly64_t) vget_low_p64 ((poly64x2_t) a);
+  poly64_t b_lo = (poly64_t) vget_low_p64 ((poly64x2_t) b);
+  return (u8x16) vmull_p64 (a_lo, b_lo);
+#endif
+}
+
+/* Carry-less multiply of the high 64-bit half of a with the low 64-bit half
+   of b (a1 * b0). Implemented with PCLMULQDQ on x86 or PMULL on ARM; no
+   implementation is provided when neither feature macro is defined. */
+static_always_inline u8x16
+gmul_hi_lo (u8x16 a, u8x16 b)
+{
+#if defined (__PCLMUL__)
+  __m128i va = (__m128i) a;
+  __m128i vb = (__m128i) b;
+  return (u8x16) _mm_clmulepi64_si128 (va, vb, 0x01);
+#elif defined (__ARM_FEATURE_CRYPTO)
+  poly64_t a_hi = (poly64_t) vget_high_p64 ((poly64x2_t) a);
+  poly64_t b_lo = (poly64_t) vget_low_p64 ((poly64x2_t) b);
+  return (u8x16) vmull_p64 (a_hi, b_lo);
+#endif
+}
+
+/* Carry-less multiply of the low 64-bit half of a with the high 64-bit half
+   of b (a0 * b1). Implemented with PCLMULQDQ on x86 or PMULL on ARM; no
+   implementation is provided when neither feature macro is defined. */
+static_always_inline u8x16
+gmul_lo_hi (u8x16 a, u8x16 b)
+{
+#if defined (__PCLMUL__)
+  __m128i va = (__m128i) a;
+  __m128i vb = (__m128i) b;
+  return (u8x16) _mm_clmulepi64_si128 (va, vb, 0x10);
+#elif defined (__ARM_FEATURE_CRYPTO)
+  poly64_t a_lo = (poly64_t) vget_low_p64 ((poly64x2_t) a);
+  poly64_t b_hi = (poly64_t) vget_high_p64 ((poly64x2_t) b);
+  return (u8x16) vmull_p64 (a_lo, b_hi);
+#endif
+}
+
+/* Carry-less multiply of the high 64-bit half of a with the high 64-bit half
+   of b (a1 * b1). Implemented with PCLMULQDQ on x86 or PMULL2 on ARM; no
+   implementation is provided when neither feature macro is defined. */
+static_always_inline u8x16
+gmul_hi_hi (u8x16 a, u8x16 b)
+{
+#if defined (__PCLMUL__)
+  __m128i va = (__m128i) a;
+  __m128i vb = (__m128i) b;
+  return (u8x16) _mm_clmulepi64_si128 (va, vb, 0x11);
+#elif defined (__ARM_FEATURE_CRYPTO)
+  poly64x2_t pa = (poly64x2_t) a;
+  poly64x2_t pb = (poly64x2_t) b;
+  return (u8x16) vmull_high_p64 (pa, pb);
+#endif
+}
+
+/* Working state shared by the GHASH multiply / reduce helpers.
+   hi, lo and mid accumulate the partial products; tmp_hi and tmp_lo hold a
+   product whose XOR into hi / lo has been postponed (see pending) and are
+   reused as scratch by the reduce2 / final steps. */
+typedef struct
+{
+  /* 128-bit state, used by the ghash_* functions */
+  u8x16 mid;
+  u8x16 hi;
+  u8x16 lo;
+  u8x16 tmp_lo;
+  u8x16 tmp_hi;
+
+  /* 256-bit state, used by the ghash2_* functions */
+  u8x32 hi2;
+  u8x32 lo2;
+  u8x32 mid2;
+  u8x32 tmp_lo2;
+  u8x32 tmp_hi2;
+
+  /* 512-bit state, used by the ghash4_* functions */
+  u8x64 hi4;
+  u8x64 lo4;
+  u8x64 mid4;
+  u8x64 tmp_lo4;
+  u8x64 tmp_hi4;
+
+  /* non-zero while tmp_hi* / tmp_lo* hold a product not yet XORed in */
+  int pending;
+} ghash_data_t;
+
+/* Constant XORed (under a mask) into the shifted hash key by
+   ghash_precompute (). Only byte 0 (0x01) and byte 15 (0xc2) are non-zero;
+   assuming byte 0 is the least significant, that sets bits 0, 121, 126 and
+   127, i.e. the terms of RPOLY below x^128. */
+static const u8x16 ghash_poly = {
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
+};
+
+/* Constant used as the multiplier in the reduction steps: ghash_reduce ()
+   multiplies its high 64-bit half with gd->lo, ghash_reduce2 () multiplies
+   its low 64-bit half. The high half (bytes 8-15) matches ghash_poly; the
+   low half carries 0xc2 in byte 3 and 0x01 in byte 4. */
+static const u8x16 ghash_poly2 = {
+ 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
+};
+
+/* Start a new multiply/accumulate sequence: store the partial products of
+   a * b in gd, overwriting whatever hi / lo / mid held before.
+     hi  = a1 * b1
+     lo  = a0 * b0
+     mid = a1 * b0 ^ a0 * b1
+   Reduction is deferred to ghash_reduce () / ghash_reduce2 () /
+   ghash_final (). */
+static_always_inline void
+ghash_mul_first (ghash_data_t * gd, u8x16 a, u8x16 b)
+{
+  /* the two cross terms that make up the middle part of the product */
+  u8x16 cross_hl = gmul_hi_lo (a, b);
+  u8x16 cross_lh = gmul_lo_hi (a, b);
+
+  gd->lo = gmul_lo_lo (a, b);
+  gd->hi = gmul_hi_hi (a, b);
+  gd->mid = cross_hl ^ cross_lh;
+
+  /* nothing is parked in tmp_lo / tmp_hi yet, so the next call to
+     ghash_mul_next () must not try to fold them in */
+  gd->pending = 0;
+}
+
+/* Accumulate the partial products of a * b into the sequence started by
+   ghash_mul_first (). The middle part is always XORed in immediately. The
+   hi / lo products are handled in pairs: the first of each pair is parked in
+   tmp_hi / tmp_lo, the second is combined with the parked one and the
+   accumulator in a single three-way XOR. */
+static_always_inline void
+ghash_mul_next (ghash_data_t * gd, u8x16 a, u8x16 b)
+{
+  u8x16 prod_hi = gmul_hi_hi (a, b); /* a1 * b1 */
+  u8x16 prod_lo = gmul_lo_lo (a, b); /* a0 * b0 */
+  u8x16 cross_hl = gmul_hi_lo (a, b); /* a1 * b0 */
+  u8x16 cross_lh = gmul_lo_hi (a, b); /* a0 * b1 */
+
+  /* gd->mid ^= a0 * b1 ^ a1 * b0 */
+  gd->mid = u8x16_xor3 (gd->mid, cross_hl, cross_lh);
+
+  /* this branch is expected to be resolved by the compiler; alternating
+     between the two arms lets us use ternary XOR and so fewer XOR
+     operations overall */
+  if (!gd->pending)
+    {
+      /* nothing is pending from the previous invocation: postpone the XOR */
+      gd->tmp_hi = prod_hi;
+      gd->tmp_lo = prod_lo;
+      gd->pending = 1;
+    }
+  else
+    {
+      /* a product is pending from the previous invocation: fold both in */
+      gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, prod_hi);
+      gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, prod_lo);
+      gd->pending = 0;
+    }
+}
+
+/* First reduction step. Merges the middle part (and any product still parked
+   in tmp_lo / tmp_hi) into lo and hi:
+     lo ^= mid << 64
+     hi ^= mid >> 64
+   and then folds lo once with the high half of ghash_poly2. */
+static_always_inline void
+ghash_reduce (ghash_data_t * gd)
+{
+  u8x16 mid_to_lo = u8x16_word_shift_left (gd->mid, 8);
+  u8x16 mid_to_hi = u8x16_word_shift_right (gd->mid, 8);
+  u8x16 lo = gd->lo;
+  u8x16 hi = gd->hi;
+  u8x16 fold;
+
+  if (gd->pending)
+    {
+      /* a postponed product is waiting in tmp_lo / tmp_hi: include it */
+      lo = u8x16_xor3 (lo, gd->tmp_lo, mid_to_lo);
+      hi = u8x16_xor3 (hi, gd->tmp_hi, mid_to_hi);
+    }
+  else
+    {
+      lo ^= mid_to_lo;
+      hi ^= mid_to_hi;
+    }
+
+  fold = gmul_hi_lo (ghash_poly2, lo);
+  gd->lo = lo ^ u8x16_word_shift_left (fold, 8);
+  gd->hi = hi;
+}
+
+/* Second reduction step: multiply the low half of ghash_poly2 with both
+   halves of gd->lo and leave the two products in tmp_lo / tmp_hi for
+   ghash_final () to combine. */
+static_always_inline void
+ghash_reduce2 (ghash_data_t * gd)
+{
+  u8x16 lo = gd->lo;
+
+  gd->tmp_lo = gmul_lo_lo (ghash_poly2, lo);
+  gd->tmp_hi = gmul_lo_hi (ghash_poly2, lo);
+}
+
+/* Last reduction step: XOR gd->hi with the two products prepared by
+   ghash_reduce2 () (tmp_lo shifted right and tmp_hi shifted left by 4) and
+   return the resulting 128-bit value. gd itself is not modified. */
+static_always_inline u8x16
+ghash_final (ghash_data_t * gd)
+{
+  u8x16 from_lo = u8x16_word_shift_right (gd->tmp_lo, 4);
+  u8x16 from_hi = u8x16_word_shift_left (gd->tmp_hi, 4);
+
+  return u8x16_xor3 (gd->hi, from_lo, from_hi);
+}
+
+/* Complete multiplication of a and b: one ghash_mul_first () followed
+   directly by the three reduction steps, using a local state object. */
+static_always_inline u8x16
+ghash_mul (u8x16 a, u8x16 b)
+{
+  ghash_data_t state;
+
+  ghash_mul_first (&state, a, b);
+  ghash_reduce (&state);
+  ghash_reduce2 (&state);
+  return ghash_final (&state);
+}
+
+#if defined(__VPCLMULQDQ__) && defined(__AVX512F__)
+
+/* 512-bit reduction constant for ghash4_reduce () / ghash4_reduce2 ():
+   the 16-byte ghash_poly2 pattern repeated four times, once per 128-bit
+   lane. */
+static const u8x64 ghash4_poly2 = {
+ 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+ 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+ 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+ 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+};
+
+/* 512-bit variant of gmul_lo_lo (): VPCLMULQDQ with selector 0x00
+   (low half of a times low half of b) applied to the whole vector. */
+static_always_inline u8x64
+gmul4_lo_lo (u8x64 a, u8x64 b)
+{
+  __m512i va = (__m512i) a;
+  __m512i vb = (__m512i) b;
+
+  return (u8x64) _mm512_clmulepi64_epi128 (va, vb, 0x00);
+}
+
+/* 512-bit variant of gmul_hi_lo (): VPCLMULQDQ with selector 0x01
+   (high half of a times low half of b) applied to the whole vector. */
+static_always_inline u8x64
+gmul4_hi_lo (u8x64 a, u8x64 b)
+{
+  __m512i va = (__m512i) a;
+  __m512i vb = (__m512i) b;
+
+  return (u8x64) _mm512_clmulepi64_epi128 (va, vb, 0x01);
+}
+
+/* 512-bit variant of gmul_lo_hi (): VPCLMULQDQ with selector 0x10
+   (low half of a times high half of b) applied to the whole vector. */
+static_always_inline u8x64
+gmul4_lo_hi (u8x64 a, u8x64 b)
+{
+  __m512i va = (__m512i) a;
+  __m512i vb = (__m512i) b;
+
+  return (u8x64) _mm512_clmulepi64_epi128 (va, vb, 0x10);
+}
+
+/* 512-bit variant of gmul_hi_hi (): VPCLMULQDQ with selector 0x11
+   (high half of a times high half of b) applied to the whole vector. */
+static_always_inline u8x64
+gmul4_hi_hi (u8x64 a, u8x64 b)
+{
+  __m512i va = (__m512i) a;
+  __m512i vb = (__m512i) b;
+
+  return (u8x64) _mm512_clmulepi64_epi128 (va, vb, 0x11);
+}
+
+/* 512-bit counterpart of ghash_mul_first (): start a new sequence by storing
+   the partial products of a * b in hi4 / lo4 / mid4 and clearing pending. */
+static_always_inline void
+ghash4_mul_first (ghash_data_t *gd, u8x64 a, u8x64 b)
+{
+  /* the two cross terms that make up the middle part of the product */
+  u8x64 cross_hl = gmul4_hi_lo (a, b);
+  u8x64 cross_lh = gmul4_lo_hi (a, b);
+
+  gd->lo4 = gmul4_lo_lo (a, b);
+  gd->hi4 = gmul4_hi_hi (a, b);
+  gd->mid4 = cross_hl ^ cross_lh;
+
+  /* nothing is parked in tmp_lo4 / tmp_hi4 yet */
+  gd->pending = 0;
+}
+
+/* 512-bit counterpart of ghash_mul_next (): accumulate the partial products
+   of a * b. mid4 is updated immediately; hi / lo products are parked in
+   tmp_hi4 / tmp_lo4 on one call and folded in with a three-way XOR on the
+   next. */
+static_always_inline void
+ghash4_mul_next (ghash_data_t *gd, u8x64 a, u8x64 b)
+{
+  u8x64 prod_hi = gmul4_hi_hi (a, b);
+  u8x64 prod_lo = gmul4_lo_lo (a, b);
+  u8x64 cross_hl = gmul4_hi_lo (a, b);
+  u8x64 cross_lh = gmul4_lo_hi (a, b);
+
+  gd->mid4 = u8x64_xor3 (gd->mid4, cross_hl, cross_lh);
+
+  if (!gd->pending)
+    {
+      /* nothing is pending from the previous invocation: postpone the XOR */
+      gd->tmp_hi4 = prod_hi;
+      gd->tmp_lo4 = prod_lo;
+      gd->pending = 1;
+    }
+  else
+    {
+      /* a product is pending from the previous invocation: fold both in */
+      gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, prod_hi);
+      gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, prod_lo);
+      gd->pending = 0;
+    }
+}
+
+/* 512-bit counterpart of ghash_reduce (). Merges mid4 (and any product still
+   parked in tmp_lo4 / tmp_hi4) into lo4 and hi4:
+     lo4 ^= mid4 << 64
+     hi4 ^= mid4 >> 64
+   and then folds lo4 once with the high halves of ghash4_poly2. */
+static_always_inline void
+ghash4_reduce (ghash_data_t *gd)
+{
+  u8x64 mid_to_lo = u8x64_word_shift_left (gd->mid4, 8);
+  u8x64 mid_to_hi = u8x64_word_shift_right (gd->mid4, 8);
+  u8x64 lo = gd->lo4;
+  u8x64 hi = gd->hi4;
+  u8x64 fold;
+
+  if (gd->pending)
+    {
+      /* a postponed product is waiting in tmp_lo4 / tmp_hi4: include it */
+      lo = u8x64_xor3 (lo, gd->tmp_lo4, mid_to_lo);
+      hi = u8x64_xor3 (hi, gd->tmp_hi4, mid_to_hi);
+    }
+  else
+    {
+      lo ^= mid_to_lo;
+      hi ^= mid_to_hi;
+    }
+
+  fold = gmul4_hi_lo (ghash4_poly2, lo);
+  gd->lo4 = lo ^ u8x64_word_shift_left (fold, 8);
+  gd->hi4 = hi;
+}
+
+/* 512-bit counterpart of ghash_reduce2 (): multiply the low halves of
+   ghash4_poly2 with both halves of lo4 and leave the products in
+   tmp_lo4 / tmp_hi4 for ghash4_final (). */
+static_always_inline void
+ghash4_reduce2 (ghash_data_t *gd)
+{
+  u8x64 lo = gd->lo4;
+
+  gd->tmp_lo4 = gmul4_lo_lo (ghash4_poly2, lo);
+  gd->tmp_hi4 = gmul4_lo_hi (ghash4_poly2, lo);
+}
+
+/* 512-bit counterpart of ghash_final (): finish the reduction across the
+   whole vector, then XOR the four 128-bit lanes together into a single
+   128-bit result. gd itself is not modified. */
+static_always_inline u8x16
+ghash4_final (ghash_data_t *gd)
+{
+  u8x64 from_lo = u8x64_word_shift_right (gd->tmp_lo4, 4);
+  u8x64 from_hi = u8x64_word_shift_left (gd->tmp_hi4, 4);
+  u8x64 lanes = u8x64_xor3 (gd->hi4, from_lo, from_hi);
+
+  /* horizontal XOR: 4 lanes -> 2 lanes -> 1 lane */
+  u8x32 half = u8x64_extract_lo (lanes) ^ u8x64_extract_hi (lanes);
+
+  return u8x32_extract_hi (half) ^ u8x32_extract_lo (half);
+}
+#endif
+
+#if defined(__VPCLMULQDQ__)
+
+/* 256-bit reduction constant for ghash2_reduce () / ghash2_reduce2 ():
+   the 16-byte ghash_poly2 pattern repeated twice, once per 128-bit lane. */
+static const u8x32 ghash2_poly2 = {
+  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+};
+
+/* 256-bit variant of gmul_lo_lo (): VPCLMULQDQ with selector 0x00
+   (low half of a times low half of b) applied to the whole vector. */
+static_always_inline u8x32
+gmul2_lo_lo (u8x32 a, u8x32 b)
+{
+  __m256i va = (__m256i) a;
+  __m256i vb = (__m256i) b;
+
+  return (u8x32) _mm256_clmulepi64_epi128 (va, vb, 0x00);
+}
+
+/* 256-bit variant of gmul_hi_lo (): VPCLMULQDQ with selector 0x01
+   (high half of a times low half of b) applied to the whole vector. */
+static_always_inline u8x32
+gmul2_hi_lo (u8x32 a, u8x32 b)
+{
+  __m256i va = (__m256i) a;
+  __m256i vb = (__m256i) b;
+
+  return (u8x32) _mm256_clmulepi64_epi128 (va, vb, 0x01);
+}
+
+/* 256-bit variant of gmul_lo_hi (): VPCLMULQDQ with selector 0x10
+   (low half of a times high half of b) applied to the whole vector. */
+static_always_inline u8x32
+gmul2_lo_hi (u8x32 a, u8x32 b)
+{
+  __m256i va = (__m256i) a;
+  __m256i vb = (__m256i) b;
+
+  return (u8x32) _mm256_clmulepi64_epi128 (va, vb, 0x10);
+}
+
+/* 256-bit variant of gmul_hi_hi (): VPCLMULQDQ with selector 0x11
+   (high half of a times high half of b) applied to the whole vector. */
+static_always_inline u8x32
+gmul2_hi_hi (u8x32 a, u8x32 b)
+{
+  __m256i va = (__m256i) a;
+  __m256i vb = (__m256i) b;
+
+  return (u8x32) _mm256_clmulepi64_epi128 (va, vb, 0x11);
+}
+
+/* 256-bit counterpart of ghash_mul_first (): start a new sequence by storing
+   the partial products of a * b in hi2 / lo2 / mid2 and clearing pending. */
+static_always_inline void
+ghash2_mul_first (ghash_data_t *gd, u8x32 a, u8x32 b)
+{
+  /* the two cross terms that make up the middle part of the product */
+  u8x32 cross_hl = gmul2_hi_lo (a, b);
+  u8x32 cross_lh = gmul2_lo_hi (a, b);
+
+  gd->lo2 = gmul2_lo_lo (a, b);
+  gd->hi2 = gmul2_hi_hi (a, b);
+  gd->mid2 = cross_hl ^ cross_lh;
+
+  /* nothing is parked in tmp_lo2 / tmp_hi2 yet */
+  gd->pending = 0;
+}
+
+/* 256-bit counterpart of ghash_mul_next (): accumulate the partial products
+   of a * b. mid2 is updated immediately; hi / lo products are parked in
+   tmp_hi2 / tmp_lo2 on one call and folded in with a three-way XOR on the
+   next. */
+static_always_inline void
+ghash2_mul_next (ghash_data_t *gd, u8x32 a, u8x32 b)
+{
+  u8x32 prod_hi = gmul2_hi_hi (a, b);
+  u8x32 prod_lo = gmul2_lo_lo (a, b);
+  u8x32 cross_hl = gmul2_hi_lo (a, b);
+  u8x32 cross_lh = gmul2_lo_hi (a, b);
+
+  gd->mid2 = u8x32_xor3 (gd->mid2, cross_hl, cross_lh);
+
+  if (!gd->pending)
+    {
+      /* nothing is pending from the previous invocation: postpone the XOR */
+      gd->tmp_hi2 = prod_hi;
+      gd->tmp_lo2 = prod_lo;
+      gd->pending = 1;
+    }
+  else
+    {
+      /* a product is pending from the previous invocation: fold both in */
+      gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, prod_hi);
+      gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, prod_lo);
+      gd->pending = 0;
+    }
+}
+
+/* 256-bit counterpart of ghash_reduce (). Merges mid2 (and any product still
+   parked in tmp_lo2 / tmp_hi2) into lo2 and hi2:
+     lo2 ^= mid2 << 64
+     hi2 ^= mid2 >> 64
+   and then folds lo2 once with the high halves of ghash2_poly2. */
+static_always_inline void
+ghash2_reduce (ghash_data_t *gd)
+{
+  u8x32 mid_to_lo = u8x32_word_shift_left (gd->mid2, 8);
+  u8x32 mid_to_hi = u8x32_word_shift_right (gd->mid2, 8);
+  u8x32 lo = gd->lo2;
+  u8x32 hi = gd->hi2;
+  u8x32 fold;
+
+  if (gd->pending)
+    {
+      /* a postponed product is waiting in tmp_lo2 / tmp_hi2: include it */
+      lo = u8x32_xor3 (lo, gd->tmp_lo2, mid_to_lo);
+      hi = u8x32_xor3 (hi, gd->tmp_hi2, mid_to_hi);
+    }
+  else
+    {
+      lo ^= mid_to_lo;
+      hi ^= mid_to_hi;
+    }
+
+  fold = gmul2_hi_lo (ghash2_poly2, lo);
+  gd->lo2 = lo ^ u8x32_word_shift_left (fold, 8);
+  gd->hi2 = hi;
+}
+
+/* 256-bit counterpart of ghash_reduce2 (): multiply the low halves of
+   ghash2_poly2 with both halves of lo2 and leave the products in
+   tmp_lo2 / tmp_hi2 for ghash2_final (). */
+static_always_inline void
+ghash2_reduce2 (ghash_data_t *gd)
+{
+  u8x32 lo = gd->lo2;
+
+  gd->tmp_lo2 = gmul2_lo_lo (ghash2_poly2, lo);
+  gd->tmp_hi2 = gmul2_lo_hi (ghash2_poly2, lo);
+}
+
+/* 256-bit counterpart of ghash_final (): finish the reduction across the
+   whole vector, then XOR the two 128-bit lanes together into a single
+   128-bit result. gd itself is not modified. */
+static_always_inline u8x16
+ghash2_final (ghash_data_t *gd)
+{
+  u8x32 from_lo = u8x32_word_shift_right (gd->tmp_lo2, 4);
+  u8x32 from_hi = u8x32_word_shift_left (gd->tmp_hi2, 4);
+  u8x32 lanes = u8x32_xor3 (gd->hi2, from_lo, from_hi);
+
+  /* horizontal XOR of 2 128-bit lanes */
+  return u8x32_extract_hi (lanes) ^ u8x32_extract_lo (lanes);
+}
+#endif
+
+/* Fill the table Hi[0 .. n - 1] from the hash key H.
+   First H' = H<<1 mod RPOLY is derived and stored in the LAST slot,
+   Hi[n - 1]. Every earlier slot is then computed as
+   Hi[i] = ghash_mul (H', Hi[i + 1]), so the table runs from the highest power
+   at Hi[0] down to H' itself at Hi[n - 1].
+   n is not validated here; the first store goes to Hi[n - 1], so callers are
+   expected to pass n >= 1. */
+static_always_inline void
+ghash_precompute (u8x16 H, u8x16 * Hi, int n)
+{
+ u8x16 r8;
+ u32x4 r32;
+ /* calculate H<<1 mod poly from the hash key:
+    r8 keeps the top bit of each 64-bit half, i.e. the bits that the per-half
+    shift below drops */
+ r8 = (u8x16) ((u64x2) H >> 63);
+ H = (u8x16) ((u64x2) H << 1);
+ /* carry the bit shifted out of the low half into the high half */
+ H |= u8x16_word_shift_left (r8, 8);
+ /* move the bit shifted out of the high half (the overall carry-out) down
+    into 32-bit element 0; elements 1-3 become zero */
+ r32 = (u32x4) u8x16_word_shift_right (r8, 8);
+ /* duplicate element 0 into element 3 so both non-zero ends of ghash_poly
+    can be masked by the same carry */
+#ifdef __SSE2__
+ r32 = u32x4_shuffle (r32, 0, 1, 2, 0);
+#else
+ r32[3] = r32[0];
+#endif
+ /* turn the carry into a mask: elements 0 and 3 become all-ones only when
+    the carry was 1. Elements 1 and 2 also compare equal, but ghash_poly is
+    zero in those positions so they contribute nothing below. */
+ r32 = r32 == (u32x4) {1, 0, 0, 1};
+ /* conditionally XOR in the polynomial and store H' in the last slot */
+ Hi[n - 1] = H = H ^ ((u8x16) r32 & ghash_poly);
+
+ /* fill the remaining slots from the back: Hi[i] = H' * Hi[i + 1] */
+ for (int i = n - 2; i >= 0; i--)
+ Hi[i] = ghash_mul (H, Hi[i + 1]);
+}
+
+#endif /* __ghash_h__ */
+