From b47376f0b404d2ba5526fba52b171d79b0f352f8 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 15 Mar 2023 11:42:06 +0000 Subject: vppinfra: AES-CBC and AES-GCM refactor and optimizations - crypto code moved to vppinfra for better testing and reuse - added 256-bit VAES support (Intel Client CPUs) - added AES_GMAC functions Change-Id: I960c8e14ca0a0126703e8f1589d86f32e2a98361 Type: improvement Signed-off-by: Damjan Marion --- src/vppinfra/crypto/aes.h | 439 +++++++++++++++++++ src/vppinfra/crypto/aes_cbc.h | 549 ++++++++++++++++++++++++ src/vppinfra/crypto/aes_gcm.h | 975 ++++++++++++++++++++++++++++++++++++++++++ src/vppinfra/crypto/ghash.h | 515 ++++++++++++++++++++++ 4 files changed, 2478 insertions(+) create mode 100644 src/vppinfra/crypto/aes.h create mode 100644 src/vppinfra/crypto/aes_cbc.h create mode 100644 src/vppinfra/crypto/aes_gcm.h create mode 100644 src/vppinfra/crypto/ghash.h (limited to 'src/vppinfra/crypto') diff --git a/src/vppinfra/crypto/aes.h b/src/vppinfra/crypto/aes.h new file mode 100644 index 00000000000..a5e286e4c6e --- /dev/null +++ b/src/vppinfra/crypto/aes.h @@ -0,0 +1,439 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2020 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#ifndef __aesni_h__ +#define __aesni_h__ + +typedef enum +{ + AES_KEY_128 = 0, + AES_KEY_192 = 1, + AES_KEY_256 = 2, +} aes_key_size_t; + +#define AES_KEY_ROUNDS(x) (10 + x * 2) +#define AES_KEY_BYTES(x) (16 + x * 8) + +static_always_inline u8x16 +aes_block_load (u8 * p) +{ + return *(u8x16u *) p; +} + +static_always_inline u8x16 +aes_enc_round (u8x16 a, u8x16 k) +{ +#if defined (__AES__) + return (u8x16) _mm_aesenc_si128 ((__m128i) a, (__m128i) k); +#elif defined (__ARM_FEATURE_CRYPTO) + return vaesmcq_u8 (vaeseq_u8 (a, u8x16_splat (0))) ^ k; +#endif +} + +#if defined(__VAES__) && defined(__AVX512F__) +static_always_inline u8x64 +aes_enc_round_x4 (u8x64 a, u8x64 k) +{ + return (u8x64) _mm512_aesenc_epi128 ((__m512i) a, (__m512i) k); +} + +static_always_inline u8x64 +aes_enc_last_round_x4 (u8x64 a, u8x64 k) +{ + return (u8x64) _mm512_aesenclast_epi128 ((__m512i) a, (__m512i) k); +} + +static_always_inline u8x64 +aes_dec_round_x4 (u8x64 a, u8x64 k) +{ + return (u8x64) _mm512_aesdec_epi128 ((__m512i) a, (__m512i) k); +} + +static_always_inline u8x64 +aes_dec_last_round_x4 (u8x64 a, u8x64 k) +{ + return (u8x64) _mm512_aesdeclast_epi128 ((__m512i) a, (__m512i) k); +} +#endif + +#ifdef __VAES__ +static_always_inline u8x32 +aes_enc_round_x2 (u8x32 a, u8x32 k) +{ + return (u8x32) _mm256_aesenc_epi128 ((__m256i) a, (__m256i) k); +} + +static_always_inline u8x32 +aes_enc_last_round_x2 (u8x32 a, u8x32 k) +{ + return (u8x32) _mm256_aesenclast_epi128 ((__m256i) a, (__m256i) k); +} + +static_always_inline u8x32 +aes_dec_round_x2 (u8x32 a, u8x32 k) +{ + return (u8x32) _mm256_aesdec_epi128 ((__m256i) a, (__m256i) k); +} + +static_always_inline u8x32 +aes_dec_last_round_x2 (u8x32 a, u8x32 k) +{ + return (u8x32) _mm256_aesdeclast_epi128 ((__m256i) a, (__m256i) k); +} +#endif + +static_always_inline u8x16 +aes_enc_last_round (u8x16 a, u8x16 k) +{ +#if defined (__AES__) + return (u8x16) _mm_aesenclast_si128 ((__m128i) a, (__m128i) k); +#elif defined (__ARM_FEATURE_CRYPTO) + return vaeseq_u8 (a, u8x16_splat (0)) ^ k; +#endif +} + +#ifdef __x86_64__ + +static_always_inline u8x16 +aes_dec_round (u8x16 a, u8x16 k) +{ + return (u8x16) _mm_aesdec_si128 ((__m128i) a, (__m128i) k); +} + +static_always_inline u8x16 +aes_dec_last_round (u8x16 a, u8x16 k) +{ + return (u8x16) _mm_aesdeclast_si128 ((__m128i) a, (__m128i) k); +} +#endif + +static_always_inline void +aes_block_store (u8 * p, u8x16 r) +{ + *(u8x16u *) p = r; +} + +static_always_inline u8x16 +aes_encrypt_block (u8x16 block, const u8x16 * round_keys, aes_key_size_t ks) +{ + int rounds = AES_KEY_ROUNDS (ks); + block ^= round_keys[0]; + for (int i = 1; i < rounds; i += 1) + block = aes_enc_round (block, round_keys[i]); + return aes_enc_last_round (block, round_keys[rounds]); +} + +static_always_inline u8x16 +aes_inv_mix_column (u8x16 a) +{ +#if defined (__AES__) + return (u8x16) _mm_aesimc_si128 ((__m128i) a); +#elif defined (__ARM_FEATURE_CRYPTO) + return vaesimcq_u8 (a); +#endif +} + +#ifdef __x86_64__ +#define aes_keygen_assist(a, b) \ + (u8x16) _mm_aeskeygenassist_si128((__m128i) a, b) + +/* AES-NI based AES key expansion based on code samples from + Intel(r) Advanced Encryption Standard (AES) New Instructions White Paper + (323641-001) */ + +static_always_inline void +aes128_key_assist (u8x16 * rk, u8x16 r) +{ + u8x16 t = rk[-1]; + t ^= u8x16_word_shift_left (t, 4); + t ^= u8x16_word_shift_left (t, 4); + t ^= u8x16_word_shift_left (t, 4); + rk[0] = t ^ (u8x16) u32x4_shuffle 
((u32x4) r, 3, 3, 3, 3); +} + +static_always_inline void +aes128_key_expand (u8x16 *rk, u8x16u const *k) +{ + rk[0] = k[0]; + aes128_key_assist (rk + 1, aes_keygen_assist (rk[0], 0x01)); + aes128_key_assist (rk + 2, aes_keygen_assist (rk[1], 0x02)); + aes128_key_assist (rk + 3, aes_keygen_assist (rk[2], 0x04)); + aes128_key_assist (rk + 4, aes_keygen_assist (rk[3], 0x08)); + aes128_key_assist (rk + 5, aes_keygen_assist (rk[4], 0x10)); + aes128_key_assist (rk + 6, aes_keygen_assist (rk[5], 0x20)); + aes128_key_assist (rk + 7, aes_keygen_assist (rk[6], 0x40)); + aes128_key_assist (rk + 8, aes_keygen_assist (rk[7], 0x80)); + aes128_key_assist (rk + 9, aes_keygen_assist (rk[8], 0x1b)); + aes128_key_assist (rk + 10, aes_keygen_assist (rk[9], 0x36)); +} + +static_always_inline void +aes192_key_assist (u8x16 * r1, u8x16 * r2, u8x16 key_assist) +{ + u8x16 t; + r1[0] ^= t = u8x16_word_shift_left (r1[0], 4); + r1[0] ^= t = u8x16_word_shift_left (t, 4); + r1[0] ^= u8x16_word_shift_left (t, 4); + r1[0] ^= (u8x16) _mm_shuffle_epi32 ((__m128i) key_assist, 0x55); + r2[0] ^= u8x16_word_shift_left (r2[0], 4); + r2[0] ^= (u8x16) _mm_shuffle_epi32 ((__m128i) r1[0], 0xff); +} + +static_always_inline void +aes192_key_expand (u8x16 * rk, u8x16u const *k) +{ + u8x16 r1, r2; + + rk[0] = r1 = k[0]; + /* *INDENT-OFF* */ + rk[1] = r2 = (u8x16) (u64x2) { *(u64 *) (k + 1), 0 }; + /* *INDENT-ON* */ + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x1)); + rk[1] = (u8x16) _mm_shuffle_pd ((__m128d) rk[1], (__m128d) r1, 0); + rk[2] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1); + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x2)); + rk[3] = r1; + rk[4] = r2; + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x4)); + rk[4] = (u8x16) _mm_shuffle_pd ((__m128d) rk[4], (__m128d) r1, 0); + rk[5] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1); + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x8)); + rk[6] = r1; + rk[7] = r2; + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x10)); + rk[7] = (u8x16) _mm_shuffle_pd ((__m128d) rk[7], (__m128d) r1, 0); + rk[8] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1); + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x20)); + rk[9] = r1; + rk[10] = r2; + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x40)); + rk[10] = (u8x16) _mm_shuffle_pd ((__m128d) rk[10], (__m128d) r1, 0); + rk[11] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1); + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x80)); + rk[12] = r1; +} + +static_always_inline void +aes256_key_assist (u8x16 * rk, int i, u8x16 key_assist) +{ + u8x16 r, t; + rk += i; + r = rk[-2]; + r ^= t = u8x16_word_shift_left (r, 4); + r ^= t = u8x16_word_shift_left (t, 4); + r ^= u8x16_word_shift_left (t, 4); + r ^= (u8x16) u32x4_shuffle ((u32x4) key_assist, 3, 3, 3, 3); + rk[0] = r; + + if (i >= 14) + return; + + key_assist = aes_keygen_assist (rk[0], 0x0); + r = rk[-1]; + r ^= t = u8x16_word_shift_left (r, 4); + r ^= t = u8x16_word_shift_left (t, 4); + r ^= u8x16_word_shift_left (t, 4); + r ^= (u8x16) u32x4_shuffle ((u32x4) key_assist, 2, 2, 2, 2); + rk[1] = r; +} + +static_always_inline void +aes256_key_expand (u8x16 * rk, u8x16u const *k) +{ + rk[0] = k[0]; + rk[1] = k[1]; + aes256_key_assist (rk, 2, aes_keygen_assist (rk[1], 0x01)); + aes256_key_assist (rk, 4, aes_keygen_assist (rk[3], 0x02)); + aes256_key_assist (rk, 6, aes_keygen_assist (rk[5], 0x04)); + aes256_key_assist (rk, 8, aes_keygen_assist (rk[7], 0x08)); + aes256_key_assist (rk, 10, 
aes_keygen_assist (rk[9], 0x10)); + aes256_key_assist (rk, 12, aes_keygen_assist (rk[11], 0x20)); + aes256_key_assist (rk, 14, aes_keygen_assist (rk[13], 0x40)); +} +#endif + +#ifdef __aarch64__ + +static const u8x16 aese_prep_mask1 = + { 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12 }; +static const u8x16 aese_prep_mask2 = + { 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 }; + +static_always_inline void +aes128_key_expand_round_neon (u8x16 * rk, u32 rcon) +{ + u8x16 r, t, last_round = rk[-1], z = { }; + r = vqtbl1q_u8 (last_round, aese_prep_mask1); + r = vaeseq_u8 (r, z); + r ^= (u8x16) vdupq_n_u32 (rcon); + r ^= last_round; + r ^= t = vextq_u8 (z, last_round, 12); + r ^= t = vextq_u8 (z, t, 12); + r ^= vextq_u8 (z, t, 12); + rk[0] = r; +} + +static_always_inline void +aes128_key_expand (u8x16 *rk, u8x16u const *k) +{ + rk[0] = k[0]; + aes128_key_expand_round_neon (rk + 1, 0x01); + aes128_key_expand_round_neon (rk + 2, 0x02); + aes128_key_expand_round_neon (rk + 3, 0x04); + aes128_key_expand_round_neon (rk + 4, 0x08); + aes128_key_expand_round_neon (rk + 5, 0x10); + aes128_key_expand_round_neon (rk + 6, 0x20); + aes128_key_expand_round_neon (rk + 7, 0x40); + aes128_key_expand_round_neon (rk + 8, 0x80); + aes128_key_expand_round_neon (rk + 9, 0x1b); + aes128_key_expand_round_neon (rk + 10, 0x36); +} + +static_always_inline void +aes192_key_expand_round_neon (u8x8 * rk, u32 rcon) +{ + u8x8 r, last_round = rk[-1], z = { }; + u8x16 r2, z2 = { }; + + r2 = (u8x16) vdupq_lane_u64 ((uint64x1_t) last_round, 0); + r2 = vqtbl1q_u8 (r2, aese_prep_mask1); + r2 = vaeseq_u8 (r2, z2); + r2 ^= (u8x16) vdupq_n_u32 (rcon); + + r = (u8x8) vdup_laneq_u64 ((u64x2) r2, 0); + r ^= rk[-3]; + r ^= vext_u8 (z, rk[-3], 4); + rk[0] = r; + + r = rk[-2] ^ vext_u8 (r, z, 4); + r ^= vext_u8 (z, r, 4); + rk[1] = r; + + if (rcon == 0x80) + return; + + r = rk[-1] ^ vext_u8 (r, z, 4); + r ^= vext_u8 (z, r, 4); + rk[2] = r; +} + +static_always_inline void +aes192_key_expand (u8x16 * ek, const u8x16u * k) +{ + u8x8 *rk = (u8x8 *) ek; + ek[0] = k[0]; + rk[2] = *(u8x8u *) (k + 1); + aes192_key_expand_round_neon (rk + 3, 0x01); + aes192_key_expand_round_neon (rk + 6, 0x02); + aes192_key_expand_round_neon (rk + 9, 0x04); + aes192_key_expand_round_neon (rk + 12, 0x08); + aes192_key_expand_round_neon (rk + 15, 0x10); + aes192_key_expand_round_neon (rk + 18, 0x20); + aes192_key_expand_round_neon (rk + 21, 0x40); + aes192_key_expand_round_neon (rk + 24, 0x80); +} + + +static_always_inline void +aes256_key_expand_round_neon (u8x16 * rk, u32 rcon) +{ + u8x16 r, t, z = { }; + + r = vqtbl1q_u8 (rk[-1], rcon ? 
aese_prep_mask1 : aese_prep_mask2); + r = vaeseq_u8 (r, z); + if (rcon) + r ^= (u8x16) vdupq_n_u32 (rcon); + r ^= rk[-2]; + r ^= t = vextq_u8 (z, rk[-2], 12); + r ^= t = vextq_u8 (z, t, 12); + r ^= vextq_u8 (z, t, 12); + rk[0] = r; +} + +static_always_inline void +aes256_key_expand (u8x16 *rk, u8x16u const *k) +{ + rk[0] = k[0]; + rk[1] = k[1]; + aes256_key_expand_round_neon (rk + 2, 0x01); + aes256_key_expand_round_neon (rk + 3, 0); + aes256_key_expand_round_neon (rk + 4, 0x02); + aes256_key_expand_round_neon (rk + 5, 0); + aes256_key_expand_round_neon (rk + 6, 0x04); + aes256_key_expand_round_neon (rk + 7, 0); + aes256_key_expand_round_neon (rk + 8, 0x08); + aes256_key_expand_round_neon (rk + 9, 0); + aes256_key_expand_round_neon (rk + 10, 0x10); + aes256_key_expand_round_neon (rk + 11, 0); + aes256_key_expand_round_neon (rk + 12, 0x20); + aes256_key_expand_round_neon (rk + 13, 0); + aes256_key_expand_round_neon (rk + 14, 0x40); +} + +#endif + +static_always_inline void +aes_key_expand (u8x16 * key_schedule, u8 const *key, aes_key_size_t ks) +{ + switch (ks) + { + case AES_KEY_128: + aes128_key_expand (key_schedule, (u8x16u const *) key); + break; + case AES_KEY_192: + aes192_key_expand (key_schedule, (u8x16u const *) key); + break; + case AES_KEY_256: + aes256_key_expand (key_schedule, (u8x16u const *) key); + break; + } +} + +static_always_inline void +aes_key_enc_to_dec (u8x16 * ke, u8x16 * kd, aes_key_size_t ks) +{ + int rounds = AES_KEY_ROUNDS (ks); + + kd[rounds] = ke[0]; + kd[0] = ke[rounds]; + + for (int i = 1; i < (rounds / 2); i++) + { + kd[rounds - i] = aes_inv_mix_column (ke[i]); + kd[i] = aes_inv_mix_column (ke[rounds - i]); + } + + kd[rounds / 2] = aes_inv_mix_column (ke[rounds / 2]); +} + +#endif /* __aesni_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/crypto/aes_cbc.h b/src/vppinfra/crypto/aes_cbc.h new file mode 100644 index 00000000000..5c3054f4a93 --- /dev/null +++ b/src/vppinfra/crypto/aes_cbc.h @@ -0,0 +1,549 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2023 Cisco Systems, Inc. 
+ */ + +#ifndef __crypto_aes_cbc_h__ +#define __crypto_aes_cbc_h__ + +#include +#include +#include + +typedef struct +{ + const u8x16 encrypt_key[15]; + const u8x16 decrypt_key[15]; +} aes_cbc_key_data_t; + +static_always_inline void +clib_aes_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *src, uword len, + const u8 *iv, aes_key_size_t ks, u8 *dst) +{ + int rounds = AES_KEY_ROUNDS (ks); + u8x16 r, *k = (u8x16 *) kd->encrypt_key; + + r = *(u8x16u *) iv; + + for (int i = 0; i < len; i += 16) + { + int j; +#if __x86_64__ + r = u8x16_xor3 (r, *(u8x16u *) (src + i), k[0]); + for (j = 1; j < rounds; j++) + r = aes_enc_round (r, k[j]); + r = aes_enc_last_round (r, k[rounds]); +#else + r ^= *(u8x16u *) (src + i); + for (j = 1; j < rounds - 1; j++) + r = vaesmcq_u8 (vaeseq_u8 (r, k[j])); + r = vaeseq_u8 (r, k[j]) ^ k[rounds]; +#endif + *(u8x16u *) (dst + i) = r; + } +} + +static_always_inline void +clib_aes128_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext, + uword len, const u8 *iv, u8 *ciphertext) +{ + clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_128, ciphertext); +} + +static_always_inline void +clib_aes192_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext, + uword len, const u8 *iv, u8 *ciphertext) +{ + clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_192, ciphertext); +} + +static_always_inline void +clib_aes256_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext, + uword len, const u8 *iv, u8 *ciphertext) +{ + clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_256, ciphertext); +} + +static_always_inline void __clib_unused +aes_cbc_dec (const u8x16 *k, u8x16u *src, u8x16u *dst, u8x16u *iv, int count, + int rounds) +{ + u8x16 r[4], c[4], f; + + f = iv[0]; + while (count >= 64) + { + c[0] = r[0] = src[0]; + c[1] = r[1] = src[1]; + c[2] = r[2] = src[2]; + c[3] = r[3] = src[3]; + +#if __x86_64__ + r[0] ^= k[0]; + r[1] ^= k[0]; + r[2] ^= k[0]; + r[3] ^= k[0]; + + for (int i = 1; i < rounds; i++) + { + r[0] = aes_dec_round (r[0], k[i]); + r[1] = aes_dec_round (r[1], k[i]); + r[2] = aes_dec_round (r[2], k[i]); + r[3] = aes_dec_round (r[3], k[i]); + } + + r[0] = aes_dec_last_round (r[0], k[rounds]); + r[1] = aes_dec_last_round (r[1], k[rounds]); + r[2] = aes_dec_last_round (r[2], k[rounds]); + r[3] = aes_dec_last_round (r[3], k[rounds]); +#else + for (int i = 0; i < rounds - 1; i++) + { + r[0] = vaesimcq_u8 (vaesdq_u8 (r[0], k[i])); + r[1] = vaesimcq_u8 (vaesdq_u8 (r[1], k[i])); + r[2] = vaesimcq_u8 (vaesdq_u8 (r[2], k[i])); + r[3] = vaesimcq_u8 (vaesdq_u8 (r[3], k[i])); + } + r[0] = vaesdq_u8 (r[0], k[rounds - 1]) ^ k[rounds]; + r[1] = vaesdq_u8 (r[1], k[rounds - 1]) ^ k[rounds]; + r[2] = vaesdq_u8 (r[2], k[rounds - 1]) ^ k[rounds]; + r[3] = vaesdq_u8 (r[3], k[rounds - 1]) ^ k[rounds]; +#endif + dst[0] = r[0] ^ f; + dst[1] = r[1] ^ c[0]; + dst[2] = r[2] ^ c[1]; + dst[3] = r[3] ^ c[2]; + f = c[3]; + + count -= 64; + src += 4; + dst += 4; + } + + while (count > 0) + { + c[0] = r[0] = src[0]; +#if __x86_64__ + r[0] ^= k[0]; + for (int i = 1; i < rounds; i++) + r[0] = aes_dec_round (r[0], k[i]); + r[0] = aes_dec_last_round (r[0], k[rounds]); +#else + c[0] = r[0] = src[0]; + for (int i = 0; i < rounds - 1; i++) + r[0] = vaesimcq_u8 (vaesdq_u8 (r[0], k[i])); + r[0] = vaesdq_u8 (r[0], k[rounds - 1]) ^ k[rounds]; +#endif + dst[0] = r[0] ^ f; + f = c[0]; + + count -= 16; + src += 1; + dst += 1; + } +} + +#if __x86_64__ +#if defined(__VAES__) && defined(__AVX512F__) + +static_always_inline u8x64 +aes_block_load_x4 (u8 *src[], int i) +{ + u8x64 r = 
{}; + r = u8x64_insert_u8x16 (r, aes_block_load (src[0] + i), 0); + r = u8x64_insert_u8x16 (r, aes_block_load (src[1] + i), 1); + r = u8x64_insert_u8x16 (r, aes_block_load (src[2] + i), 2); + r = u8x64_insert_u8x16 (r, aes_block_load (src[3] + i), 3); + return r; +} + +static_always_inline void +aes_block_store_x4 (u8 *dst[], int i, u8x64 r) +{ + aes_block_store (dst[0] + i, u8x64_extract_u8x16 (r, 0)); + aes_block_store (dst[1] + i, u8x64_extract_u8x16 (r, 1)); + aes_block_store (dst[2] + i, u8x64_extract_u8x16 (r, 2)); + aes_block_store (dst[3] + i, u8x64_extract_u8x16 (r, 3)); +} + +static_always_inline u8x64 +aes4_cbc_dec_permute (u8x64 a, u8x64 b) +{ + return (u8x64) u64x8_shuffle2 (a, b, 6, 7, 8, 9, 10, 11, 12, 13); +} + +static_always_inline void +aes4_cbc_dec (const u8x16 *k, u8x64u *src, u8x64u *dst, u8x16u *iv, int count, + aes_key_size_t rounds) +{ + u8x64 f, k4, r[4], c[4] = {}; + __mmask8 m; + int i, n_blocks = count >> 4; + + f = u8x64_insert_u8x16 (u8x64_zero (), *iv, 3); + + while (n_blocks >= 16) + { + k4 = u8x64_splat_u8x16 (k[0]); + c[0] = src[0]; + c[1] = src[1]; + c[2] = src[2]; + c[3] = src[3]; + + r[0] = c[0] ^ k4; + r[1] = c[1] ^ k4; + r[2] = c[2] ^ k4; + r[3] = c[3] ^ k4; + + for (i = 1; i < rounds; i++) + { + k4 = u8x64_splat_u8x16 (k[i]); + r[0] = aes_dec_round_x4 (r[0], k4); + r[1] = aes_dec_round_x4 (r[1], k4); + r[2] = aes_dec_round_x4 (r[2], k4); + r[3] = aes_dec_round_x4 (r[3], k4); + } + + k4 = u8x64_splat_u8x16 (k[i]); + r[0] = aes_dec_last_round_x4 (r[0], k4); + r[1] = aes_dec_last_round_x4 (r[1], k4); + r[2] = aes_dec_last_round_x4 (r[2], k4); + r[3] = aes_dec_last_round_x4 (r[3], k4); + + dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]); + dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]); + dst[3] = r[3] ^= aes4_cbc_dec_permute (c[2], c[3]); + f = c[3]; + + n_blocks -= 16; + src += 4; + dst += 4; + } + + if (n_blocks >= 12) + { + k4 = u8x64_splat_u8x16 (k[0]); + c[0] = src[0]; + c[1] = src[1]; + c[2] = src[2]; + + r[0] = c[0] ^ k4; + r[1] = c[1] ^ k4; + r[2] = c[2] ^ k4; + + for (i = 1; i < rounds; i++) + { + k4 = u8x64_splat_u8x16 (k[i]); + r[0] = aes_dec_round_x4 (r[0], k4); + r[1] = aes_dec_round_x4 (r[1], k4); + r[2] = aes_dec_round_x4 (r[2], k4); + } + + k4 = u8x64_splat_u8x16 (k[i]); + r[0] = aes_dec_last_round_x4 (r[0], k4); + r[1] = aes_dec_last_round_x4 (r[1], k4); + r[2] = aes_dec_last_round_x4 (r[2], k4); + + dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]); + dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]); + f = c[2]; + + n_blocks -= 12; + src += 3; + dst += 3; + } + else if (n_blocks >= 8) + { + k4 = u8x64_splat_u8x16 (k[0]); + c[0] = src[0]; + c[1] = src[1]; + + r[0] = c[0] ^ k4; + r[1] = c[1] ^ k4; + + for (i = 1; i < rounds; i++) + { + k4 = u8x64_splat_u8x16 (k[i]); + r[0] = aes_dec_round_x4 (r[0], k4); + r[1] = aes_dec_round_x4 (r[1], k4); + } + + k4 = u8x64_splat_u8x16 (k[i]); + r[0] = aes_dec_last_round_x4 (r[0], k4); + r[1] = aes_dec_last_round_x4 (r[1], k4); + + dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]); + f = c[1]; + + n_blocks -= 8; + src += 2; + dst += 2; + } + else if (n_blocks >= 4) + { + c[0] = src[0]; + + r[0] = c[0] ^ u8x64_splat_u8x16 (k[0]); + + for (i = 1; i < rounds; i++) + r[0] = aes_dec_round_x4 (r[0], u8x64_splat_u8x16 (k[i])); + + r[0] = aes_dec_last_round_x4 (r[0], u8x64_splat_u8x16 (k[i])); + + dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); + f 
= c[0]; + + n_blocks -= 4; + src += 1; + dst += 1; + } + + if (n_blocks > 0) + { + k4 = u8x64_splat_u8x16 (k[0]); + m = (1 << (n_blocks * 2)) - 1; + c[0] = + (u8x64) _mm512_mask_loadu_epi64 ((__m512i) c[0], m, (__m512i *) src); + f = aes4_cbc_dec_permute (f, c[0]); + r[0] = c[0] ^ k4; + for (i = 1; i < rounds; i++) + r[0] = aes_dec_round_x4 (r[0], u8x64_splat_u8x16 (k[i])); + r[0] = aes_dec_last_round_x4 (r[0], u8x64_splat_u8x16 (k[i])); + _mm512_mask_storeu_epi64 ((__m512i *) dst, m, (__m512i) (r[0] ^ f)); + } +} +#elif defined(__VAES__) + +static_always_inline u8x32 +aes_block_load_x2 (u8 *src[], int i) +{ + u8x32 r = {}; + r = u8x32_insert_lo (r, aes_block_load (src[0] + i)); + r = u8x32_insert_hi (r, aes_block_load (src[1] + i)); + return r; +} + +static_always_inline void +aes_block_store_x2 (u8 *dst[], int i, u8x32 r) +{ + aes_block_store (dst[0] + i, u8x32_extract_lo (r)); + aes_block_store (dst[1] + i, u8x32_extract_hi (r)); +} + +static_always_inline u8x32 +aes2_cbc_dec_permute (u8x32 a, u8x32 b) +{ + return (u8x32) u64x4_shuffle2 ((u64x4) a, (u64x4) b, 2, 3, 4, 5); +} + +static_always_inline void +aes2_cbc_dec (const u8x16 *k, u8x32u *src, u8x32u *dst, u8x16u *iv, int count, + aes_key_size_t rounds) +{ + u8x32 k2, f = {}, r[4], c[4] = {}; + int i, n_blocks = count >> 4; + + f = u8x32_insert_hi (f, *iv); + + while (n_blocks >= 8) + { + k2 = u8x32_splat_u8x16 (k[0]); + c[0] = src[0]; + c[1] = src[1]; + c[2] = src[2]; + c[3] = src[3]; + + r[0] = c[0] ^ k2; + r[1] = c[1] ^ k2; + r[2] = c[2] ^ k2; + r[3] = c[3] ^ k2; + + for (i = 1; i < rounds; i++) + { + k2 = u8x32_splat_u8x16 (k[i]); + r[0] = aes_dec_round_x2 (r[0], k2); + r[1] = aes_dec_round_x2 (r[1], k2); + r[2] = aes_dec_round_x2 (r[2], k2); + r[3] = aes_dec_round_x2 (r[3], k2); + } + + k2 = u8x32_splat_u8x16 (k[i]); + r[0] = aes_dec_last_round_x2 (r[0], k2); + r[1] = aes_dec_last_round_x2 (r[1], k2); + r[2] = aes_dec_last_round_x2 (r[2], k2); + r[3] = aes_dec_last_round_x2 (r[3], k2); + + dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]); + dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]); + dst[3] = r[3] ^= aes2_cbc_dec_permute (c[2], c[3]); + f = c[3]; + + n_blocks -= 8; + src += 4; + dst += 4; + } + + if (n_blocks >= 6) + { + k2 = u8x32_splat_u8x16 (k[0]); + c[0] = src[0]; + c[1] = src[1]; + c[2] = src[2]; + + r[0] = c[0] ^ k2; + r[1] = c[1] ^ k2; + r[2] = c[2] ^ k2; + + for (i = 1; i < rounds; i++) + { + k2 = u8x32_splat_u8x16 (k[i]); + r[0] = aes_dec_round_x2 (r[0], k2); + r[1] = aes_dec_round_x2 (r[1], k2); + r[2] = aes_dec_round_x2 (r[2], k2); + } + + k2 = u8x32_splat_u8x16 (k[i]); + r[0] = aes_dec_last_round_x2 (r[0], k2); + r[1] = aes_dec_last_round_x2 (r[1], k2); + r[2] = aes_dec_last_round_x2 (r[2], k2); + + dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]); + dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]); + f = c[2]; + + n_blocks -= 6; + src += 3; + dst += 3; + } + else if (n_blocks >= 4) + { + k2 = u8x32_splat_u8x16 (k[0]); + c[0] = src[0]; + c[1] = src[1]; + + r[0] = c[0] ^ k2; + r[1] = c[1] ^ k2; + + for (i = 1; i < rounds; i++) + { + k2 = u8x32_splat_u8x16 (k[i]); + r[0] = aes_dec_round_x2 (r[0], k2); + r[1] = aes_dec_round_x2 (r[1], k2); + } + + k2 = u8x32_splat_u8x16 (k[i]); + r[0] = aes_dec_last_round_x2 (r[0], k2); + r[1] = aes_dec_last_round_x2 (r[1], k2); + + dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]); + f = c[1]; + + n_blocks -= 4; + src 
+= 2; + dst += 2; + } + else if (n_blocks >= 2) + { + k2 = u8x32_splat_u8x16 (k[0]); + c[0] = src[0]; + r[0] = c[0] ^ k2; + + for (i = 1; i < rounds; i++) + r[0] = aes_dec_round_x2 (r[0], u8x32_splat_u8x16 (k[i])); + + r[0] = aes_dec_last_round_x2 (r[0], u8x32_splat_u8x16 (k[i])); + dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); + f = c[0]; + + n_blocks -= 2; + src += 1; + dst += 1; + } + + if (n_blocks > 0) + { + u8x16 rl = *(u8x16u *) src ^ k[0]; + for (i = 1; i < rounds; i++) + rl = aes_dec_round (rl, k[i]); + rl = aes_dec_last_round (rl, k[i]); + *(u8x16 *) dst = rl ^ u8x32_extract_hi (f); + } +} +#endif +#endif + +static_always_inline void +clib_aes_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key, + aes_key_size_t ks) +{ + u8x16 e[15], d[15]; + aes_key_expand (e, key, ks); + aes_key_enc_to_dec (e, d, ks); + for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++) + { + ((u8x16 *) kd->decrypt_key)[i] = d[i]; + ((u8x16 *) kd->encrypt_key)[i] = e[i]; + } +} + +static_always_inline void +clib_aes128_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key) +{ + clib_aes_cbc_key_expand (kd, key, AES_KEY_128); +} +static_always_inline void +clib_aes192_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key) +{ + clib_aes_cbc_key_expand (kd, key, AES_KEY_192); +} +static_always_inline void +clib_aes256_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key) +{ + clib_aes_cbc_key_expand (kd, key, AES_KEY_256); +} + +static_always_inline void +clib_aes_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext, + uword len, const u8 *iv, aes_key_size_t ks, + u8 *plaintext) +{ + int rounds = AES_KEY_ROUNDS (ks); +#if defined(__VAES__) && defined(__AVX512F__) + aes4_cbc_dec (kd->decrypt_key, (u8x64u *) ciphertext, (u8x64u *) plaintext, + (u8x16u *) iv, (int) len, rounds); +#elif defined(__VAES__) + aes2_cbc_dec (kd->decrypt_key, (u8x32u *) ciphertext, (u8x32u *) plaintext, + (u8x16u *) iv, (int) len, rounds); +#else + aes_cbc_dec (kd->decrypt_key, (u8x16u *) ciphertext, (u8x16u *) plaintext, + (u8x16u *) iv, (int) len, rounds); +#endif +} + +static_always_inline void +clib_aes128_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext, + uword len, const u8 *iv, u8 *plaintext) +{ + clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_128, plaintext); +} + +static_always_inline void +clib_aes192_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext, + uword len, const u8 *iv, u8 *plaintext) +{ + clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_192, plaintext); +} + +static_always_inline void +clib_aes256_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext, + uword len, const u8 *iv, u8 *plaintext) +{ + clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_256, plaintext); +} + +#endif /* __crypto_aes_cbc_h__ */ diff --git a/src/vppinfra/crypto/aes_gcm.h b/src/vppinfra/crypto/aes_gcm.h new file mode 100644 index 00000000000..8a5f76c3b33 --- /dev/null +++ b/src/vppinfra/crypto/aes_gcm.h @@ -0,0 +1,975 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2023 Cisco Systems, Inc. 
+ */ + +#ifndef __crypto_aes_gcm_h__ +#define __crypto_aes_gcm_h__ + +#include +#include +#include +#include +#include +#include + +#define NUM_HI 36 +#if defined(__VAES__) && defined(__AVX512F__) +typedef u8x64 aes_data_t; +typedef u8x64u aes_ghash_t; +typedef u8x64u aes_mem_t; +typedef u32x16 aes_gcm_counter_t; +#define N 64 +#define aes_gcm_load_partial(p, n) u8x64_load_partial ((u8 *) (p), n) +#define aes_gcm_store_partial(v, p, n) u8x64_store_partial (v, (u8 *) (p), n) +#define aes_gcm_splat(v) u8x64_splat (v) +#define aes_gcm_reflect(r) u8x64_reflect_u8x16 (r) +#define aes_gcm_ghash_reduce(c) ghash4_reduce (&(c)->gd) +#define aes_gcm_ghash_reduce2(c) ghash4_reduce2 (&(c)->gd) +#define aes_gcm_ghash_final(c) (c)->T = ghash4_final (&(c)->gd) +#elif defined(__VAES__) +typedef u8x32 aes_data_t; +typedef u8x32u aes_ghash_t; +typedef u8x32u aes_mem_t; +typedef u32x8 aes_gcm_counter_t; +#define N 32 +#define aes_gcm_load_partial(p, n) u8x32_load_partial ((u8 *) (p), n) +#define aes_gcm_store_partial(v, p, n) u8x32_store_partial (v, (u8 *) (p), n) +#define aes_gcm_splat(v) u8x32_splat (v) +#define aes_gcm_reflect(r) u8x32_reflect_u8x16 (r) +#define aes_gcm_ghash_reduce(c) ghash2_reduce (&(c)->gd) +#define aes_gcm_ghash_reduce2(c) ghash2_reduce2 (&(c)->gd) +#define aes_gcm_ghash_final(c) (c)->T = ghash2_final (&(c)->gd) +#else +typedef u8x16 aes_data_t; +typedef u8x16 aes_ghash_t; +typedef u8x16u aes_mem_t; +typedef u32x4 aes_gcm_counter_t; +#define N 16 +#define aes_gcm_load_partial(p, n) u8x16_load_partial ((u8 *) (p), n) +#define aes_gcm_store_partial(v, p, n) u8x16_store_partial (v, (u8 *) (p), n) +#define aes_gcm_splat(v) u8x16_splat (v) +#define aes_gcm_reflect(r) u8x16_reflect (r) +#define aes_gcm_ghash_reduce(c) ghash_reduce (&(c)->gd) +#define aes_gcm_ghash_reduce2(c) ghash_reduce2 (&(c)->gd) +#define aes_gcm_ghash_final(c) (c)->T = ghash_final (&(c)->gd) +#endif +#define N_LANES (N / 16) + +typedef enum +{ + AES_GCM_OP_UNKNONW = 0, + AES_GCM_OP_ENCRYPT, + AES_GCM_OP_DECRYPT, + AES_GCM_OP_GMAC +} aes_gcm_op_t; + +typedef union +{ + u8x16 x1; + u8x32 x2; + u8x64 x4; + u8x16 lanes[4]; +} __clib_aligned (64) +aes_gcm_expaned_key_t; + +typedef struct +{ + /* pre-calculated hash key values */ + const u8x16 Hi[NUM_HI]; + /* extracted AES key */ + const aes_gcm_expaned_key_t Ke[AES_KEY_ROUNDS (AES_KEY_256) + 1]; +} aes_gcm_key_data_t; + +typedef struct +{ + aes_gcm_op_t operation; + int last; + u8 rounds; + uword data_bytes; + uword aad_bytes; + + u8x16 T; + + /* hash */ + const u8x16 *Hi; + const aes_ghash_t *next_Hi; + + /* expaded keys */ + const aes_gcm_expaned_key_t *Ke; + + /* counter */ + u32 counter; + u8x16 EY0; + aes_gcm_counter_t Y; + + /* ghash */ + ghash_data_t gd; +} aes_gcm_ctx_t; + +static_always_inline void +aes_gcm_ghash_mul_first (aes_gcm_ctx_t *ctx, aes_data_t data, u32 n_lanes) +{ + uword hash_offset = NUM_HI - n_lanes; + ctx->next_Hi = (aes_ghash_t *) (ctx->Hi + hash_offset); +#if N_LANES == 4 + u8x64 tag4 = {}; + tag4 = u8x64_insert_u8x16 (tag4, ctx->T, 0); + ghash4_mul_first (&ctx->gd, aes_gcm_reflect (data) ^ tag4, *ctx->next_Hi++); +#elif N_LANES == 2 + u8x32 tag2 = {}; + tag2 = u8x32_insert_lo (tag2, ctx->T); + ghash2_mul_first (&ctx->gd, aes_gcm_reflect (data) ^ tag2, *ctx->next_Hi++); +#else + ghash_mul_first (&ctx->gd, aes_gcm_reflect (data) ^ ctx->T, *ctx->next_Hi++); +#endif +} + +static_always_inline void +aes_gcm_ghash_mul_next (aes_gcm_ctx_t *ctx, aes_data_t data) +{ +#if N_LANES == 4 + ghash4_mul_next (&ctx->gd, aes_gcm_reflect (data), *ctx->next_Hi++); 
+#elif N_LANES == 2 + ghash2_mul_next (&ctx->gd, aes_gcm_reflect (data), *ctx->next_Hi++); +#else + ghash_mul_next (&ctx->gd, aes_gcm_reflect (data), *ctx->next_Hi++); +#endif +} + +static_always_inline void +aes_gcm_ghash_mul_bit_len (aes_gcm_ctx_t *ctx) +{ + u8x16 r = (u8x16) ((u64x2){ ctx->data_bytes, ctx->aad_bytes } << 3); +#if N_LANES == 4 + u8x64 h = u8x64_insert_u8x16 (u8x64_zero (), ctx->Hi[NUM_HI - 1], 0); + u8x64 r4 = u8x64_insert_u8x16 (u8x64_zero (), r, 0); + ghash4_mul_next (&ctx->gd, r4, h); +#elif N_LANES == 2 + u8x32 h = u8x32_insert_lo (u8x32_zero (), ctx->Hi[NUM_HI - 1]); + u8x32 r2 = u8x32_insert_lo (u8x32_zero (), r); + ghash2_mul_next (&ctx->gd, r2, h); +#else + ghash_mul_next (&ctx->gd, r, ctx->Hi[NUM_HI - 1]); +#endif +} + +static_always_inline void +aes_gcm_enc_ctr0_round (aes_gcm_ctx_t *ctx, int aes_round) +{ + if (aes_round == 0) + ctx->EY0 ^= ctx->Ke[0].x1; + else if (aes_round == ctx->rounds) + ctx->EY0 = aes_enc_last_round (ctx->EY0, ctx->Ke[aes_round].x1); + else + ctx->EY0 = aes_enc_round (ctx->EY0, ctx->Ke[aes_round].x1); +} + +static_always_inline void +aes_gcm_ghash (aes_gcm_ctx_t *ctx, u8 *data, u32 n_left) +{ + uword i; + aes_data_t r = {}; + const aes_mem_t *d = (aes_mem_t *) data; + + for (; n_left >= 8 * N; n_left -= 8 * N, d += 8) + { + if (ctx->operation == AES_GCM_OP_GMAC && n_left == N * 8) + { + aes_gcm_ghash_mul_first (ctx, d[0], 8 * N_LANES + 1); + for (i = 1; i < 8; i++) + aes_gcm_ghash_mul_next (ctx, d[i]); + aes_gcm_ghash_mul_bit_len (ctx); + aes_gcm_ghash_reduce (ctx); + aes_gcm_ghash_reduce2 (ctx); + aes_gcm_ghash_final (ctx); + goto done; + } + + aes_gcm_ghash_mul_first (ctx, d[0], 8 * N_LANES); + for (i = 1; i < 8; i++) + aes_gcm_ghash_mul_next (ctx, d[i]); + aes_gcm_ghash_reduce (ctx); + aes_gcm_ghash_reduce2 (ctx); + aes_gcm_ghash_final (ctx); + } + + if (n_left > 0) + { + int n_lanes = (n_left + 15) / 16; + + if (ctx->operation == AES_GCM_OP_GMAC) + n_lanes++; + + if (n_left < N) + { + clib_memcpy_fast (&r, d, n_left); + aes_gcm_ghash_mul_first (ctx, r, n_lanes); + } + else + { + aes_gcm_ghash_mul_first (ctx, d[0], n_lanes); + n_left -= N; + i = 1; + + if (n_left >= 4 * N) + { + aes_gcm_ghash_mul_next (ctx, d[i]); + aes_gcm_ghash_mul_next (ctx, d[i + 1]); + aes_gcm_ghash_mul_next (ctx, d[i + 2]); + aes_gcm_ghash_mul_next (ctx, d[i + 3]); + n_left -= 4 * N; + i += 4; + } + if (n_left >= 2 * N) + { + aes_gcm_ghash_mul_next (ctx, d[i]); + aes_gcm_ghash_mul_next (ctx, d[i + 1]); + n_left -= 2 * N; + i += 2; + } + + if (n_left >= N) + { + aes_gcm_ghash_mul_next (ctx, d[i]); + n_left -= N; + i += 1; + } + + if (n_left) + { + clib_memcpy_fast (&r, d + i, n_left); + aes_gcm_ghash_mul_next (ctx, r); + } + } + + if (ctx->operation == AES_GCM_OP_GMAC) + aes_gcm_ghash_mul_bit_len (ctx); + aes_gcm_ghash_reduce (ctx); + aes_gcm_ghash_reduce2 (ctx); + aes_gcm_ghash_final (ctx); + } + else if (ctx->operation == AES_GCM_OP_GMAC) + { + u8x16 r = (u8x16) ((u64x2){ ctx->data_bytes, ctx->aad_bytes } << 3); + ctx->T = ghash_mul (r ^ ctx->T, ctx->Hi[NUM_HI - 1]); + } + +done: + /* encrypt counter 0 E(Y0, k) */ + if (ctx->operation == AES_GCM_OP_GMAC) + for (int i = 0; i < ctx->rounds + 1; i += 1) + aes_gcm_enc_ctr0_round (ctx, i); +} + +static_always_inline void +aes_gcm_enc_first_round (aes_gcm_ctx_t *ctx, aes_data_t *r, uword n_blocks) +{ + const aes_gcm_expaned_key_t Ke0 = ctx->Ke[0]; + uword i = 0; + +#if N_LANES == 4 + const u32x16 ctr_inv_4444 = { 0, 0, 0, 4 << 24, 0, 0, 0, 4 << 24, + 0, 0, 0, 4 << 24, 0, 0, 0, 4 << 24 }; + + const u32x16 ctr_4444 = 
{ + 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, + }; + + /* As counter is stored in network byte order for performance reasons we + are incrementing least significant byte only except in case where we + overlow. As we are processing four 512-blocks in parallel except the + last round, overflow can happen only when n == 4 */ + + if (n_blocks == 4) + for (; i < 2; i++) + { + r[i] = Ke0.x4 ^ (u8x64) ctx->Y; + ctx->Y += ctr_inv_4444; + } + + if (n_blocks == 4 && PREDICT_FALSE ((u8) ctx->counter == 242)) + { + u32x16 Yr = (u32x16) aes_gcm_reflect ((u8x64) ctx->Y); + + for (; i < n_blocks; i++) + { + r[i] = Ke0.x4 ^ (u8x64) ctx->Y; + Yr += ctr_4444; + ctx->Y = (u32x16) aes_gcm_reflect ((u8x64) Yr); + } + } + else + { + for (; i < n_blocks; i++) + { + r[i] = Ke0.x4 ^ (u8x64) ctx->Y; + ctx->Y += ctr_inv_4444; + } + } + ctx->counter += n_blocks * 4; +#elif N_LANES == 2 + const u32x8 ctr_inv_22 = { 0, 0, 0, 2 << 24, 0, 0, 0, 2 << 24 }; + const u32x8 ctr_22 = { 2, 0, 0, 0, 2, 0, 0, 0 }; + + /* As counter is stored in network byte order for performance reasons we + are incrementing least significant byte only except in case where we + overlow. As we are processing four 512-blocks in parallel except the + last round, overflow can happen only when n == 4 */ + + if (n_blocks == 4) + for (; i < 2; i++) + { + r[i] = Ke0.x2 ^ (u8x32) ctx->Y; + ctx->Y += ctr_inv_22; + } + + if (n_blocks == 4 && PREDICT_FALSE ((u8) ctx->counter == 250)) + { + u32x8 Yr = (u32x8) aes_gcm_reflect ((u8x32) ctx->Y); + + for (; i < n_blocks; i++) + { + r[i] = Ke0.x2 ^ (u8x32) ctx->Y; + Yr += ctr_22; + ctx->Y = (u32x8) aes_gcm_reflect ((u8x32) Yr); + } + } + else + { + for (; i < n_blocks; i++) + { + r[i] = Ke0.x2 ^ (u8x32) ctx->Y; + ctx->Y += ctr_inv_22; + } + } + ctx->counter += n_blocks * 2; +#else + const u32x4 ctr_inv_1 = { 0, 0, 0, 1 << 24 }; + + if (PREDICT_TRUE ((u8) ctx->counter < 0xfe) || n_blocks < 3) + { + for (; i < n_blocks; i++) + { + r[i] = Ke0.x1 ^ (u8x16) ctx->Y; + ctx->Y += ctr_inv_1; + } + ctx->counter += n_blocks; + } + else + { + r[i++] = Ke0.x1 ^ (u8x16) ctx->Y; + ctx->Y += ctr_inv_1; + ctx->counter += 1; + + for (; i < n_blocks; i++) + { + r[i] = Ke0.x1 ^ (u8x16) ctx->Y; + ctx->counter++; + ctx->Y[3] = clib_host_to_net_u32 (ctx->counter); + } + } +#endif +} + +static_always_inline void +aes_gcm_enc_round (aes_data_t *r, const aes_gcm_expaned_key_t *Ke, + uword n_blocks) +{ + for (int i = 0; i < n_blocks; i++) +#if N_LANES == 4 + r[i] = aes_enc_round_x4 (r[i], Ke->x4); +#elif N_LANES == 2 + r[i] = aes_enc_round_x2 (r[i], Ke->x2); +#else + r[i] = aes_enc_round (r[i], Ke->x1); +#endif +} + +static_always_inline void +aes_gcm_enc_last_round (aes_gcm_ctx_t *ctx, aes_data_t *r, aes_data_t *d, + const aes_gcm_expaned_key_t *Ke, uword n_blocks) +{ + /* additional ronuds for AES-192 and AES-256 */ + for (int i = 10; i < ctx->rounds; i++) + aes_gcm_enc_round (r, Ke + i, n_blocks); + + for (int i = 0; i < n_blocks; i++) +#if N_LANES == 4 + d[i] ^= aes_enc_last_round_x4 (r[i], Ke[ctx->rounds].x4); +#elif N_LANES == 2 + d[i] ^= aes_enc_last_round_x2 (r[i], Ke[ctx->rounds].x2); +#else + d[i] ^= aes_enc_last_round (r[i], Ke[ctx->rounds].x1); +#endif +} + +static_always_inline void +aes_gcm_calc (aes_gcm_ctx_t *ctx, aes_data_t *d, const u8 *src, u8 *dst, u32 n, + u32 n_bytes, int with_ghash) +{ + const aes_gcm_expaned_key_t *k = ctx->Ke; + const aes_mem_t *sv = (aes_mem_t *) src; + aes_mem_t *dv = (aes_mem_t *) dst; + uword ghash_blocks, gc = 1; + aes_data_t r[4]; + u32 i, n_lanes; + + if (ctx->operation == 
AES_GCM_OP_ENCRYPT) + { + ghash_blocks = 4; + n_lanes = N_LANES * 4; + } + else + { + ghash_blocks = n; + n_lanes = n * N_LANES; +#if N_LANES != 1 + if (ctx->last) + n_lanes = (n_bytes + 15) / 16; +#endif + } + + n_bytes -= (n - 1) * N; + + /* AES rounds 0 and 1 */ + aes_gcm_enc_first_round (ctx, r, n); + aes_gcm_enc_round (r, k + 1, n); + + /* load data - decrypt round */ + if (ctx->operation == AES_GCM_OP_DECRYPT) + { + for (i = 0; i < n - ctx->last; i++) + d[i] = sv[i]; + + if (ctx->last) + d[n - 1] = aes_gcm_load_partial ((u8 *) (sv + n - 1), n_bytes); + } + + /* GHASH multiply block 0 */ + if (with_ghash) + aes_gcm_ghash_mul_first (ctx, d[0], n_lanes); + + /* AES rounds 2 and 3 */ + aes_gcm_enc_round (r, k + 2, n); + aes_gcm_enc_round (r, k + 3, n); + + /* GHASH multiply block 1 */ + if (with_ghash && gc++ < ghash_blocks) + aes_gcm_ghash_mul_next (ctx, (d[1])); + + /* AES rounds 4 and 5 */ + aes_gcm_enc_round (r, k + 4, n); + aes_gcm_enc_round (r, k + 5, n); + + /* GHASH multiply block 2 */ + if (with_ghash && gc++ < ghash_blocks) + aes_gcm_ghash_mul_next (ctx, (d[2])); + + /* AES rounds 6 and 7 */ + aes_gcm_enc_round (r, k + 6, n); + aes_gcm_enc_round (r, k + 7, n); + + /* GHASH multiply block 3 */ + if (with_ghash && gc++ < ghash_blocks) + aes_gcm_ghash_mul_next (ctx, (d[3])); + + /* load 4 blocks of data - decrypt round */ + if (ctx->operation == AES_GCM_OP_ENCRYPT) + { + for (i = 0; i < n - ctx->last; i++) + d[i] = sv[i]; + + if (ctx->last) + d[n - 1] = aes_gcm_load_partial (sv + n - 1, n_bytes); + } + + /* AES rounds 8 and 9 */ + aes_gcm_enc_round (r, k + 8, n); + aes_gcm_enc_round (r, k + 9, n); + + /* AES last round(s) */ + aes_gcm_enc_last_round (ctx, r, d, k, n); + + /* store data */ + for (i = 0; i < n - ctx->last; i++) + dv[i] = d[i]; + + if (ctx->last) + aes_gcm_store_partial (d[n - 1], dv + n - 1, n_bytes); + + /* GHASH reduce 1st step */ + aes_gcm_ghash_reduce (ctx); + + /* GHASH reduce 2nd step */ + if (with_ghash) + aes_gcm_ghash_reduce2 (ctx); + + /* GHASH final step */ + if (with_ghash) + aes_gcm_ghash_final (ctx); +} + +static_always_inline void +aes_gcm_calc_double (aes_gcm_ctx_t *ctx, aes_data_t *d, const u8 *src, u8 *dst, + int with_ghash) +{ + const aes_gcm_expaned_key_t *k = ctx->Ke; + const aes_mem_t *sv = (aes_mem_t *) src; + aes_mem_t *dv = (aes_mem_t *) dst; + aes_data_t r[4]; + + /* AES rounds 0 and 1 */ + aes_gcm_enc_first_round (ctx, r, 4); + aes_gcm_enc_round (r, k + 1, 4); + + /* load 4 blocks of data - decrypt round */ + if (ctx->operation == AES_GCM_OP_DECRYPT) + for (int i = 0; i < 4; i++) + d[i] = sv[i]; + + /* GHASH multiply block 0 */ + aes_gcm_ghash_mul_first (ctx, d[0], N_LANES * 8); + + /* AES rounds 2 and 3 */ + aes_gcm_enc_round (r, k + 2, 4); + aes_gcm_enc_round (r, k + 3, 4); + + /* GHASH multiply block 1 */ + aes_gcm_ghash_mul_next (ctx, (d[1])); + + /* AES rounds 4 and 5 */ + aes_gcm_enc_round (r, k + 4, 4); + aes_gcm_enc_round (r, k + 5, 4); + + /* GHASH multiply block 2 */ + aes_gcm_ghash_mul_next (ctx, (d[2])); + + /* AES rounds 6 and 7 */ + aes_gcm_enc_round (r, k + 6, 4); + aes_gcm_enc_round (r, k + 7, 4); + + /* GHASH multiply block 3 */ + aes_gcm_ghash_mul_next (ctx, (d[3])); + + /* AES rounds 8 and 9 */ + aes_gcm_enc_round (r, k + 8, 4); + aes_gcm_enc_round (r, k + 9, 4); + + /* load 4 blocks of data - encrypt round */ + if (ctx->operation == AES_GCM_OP_ENCRYPT) + for (int i = 0; i < 4; i++) + d[i] = sv[i]; + + /* AES last round(s) */ + aes_gcm_enc_last_round (ctx, r, d, k, 4); + + /* store 4 blocks of data */ + for (int i = 0; i 
< 4; i++) + dv[i] = d[i]; + + /* load next 4 blocks of data data - decrypt round */ + if (ctx->operation == AES_GCM_OP_DECRYPT) + for (int i = 0; i < 4; i++) + d[i] = sv[i + 4]; + + /* GHASH multiply block 4 */ + aes_gcm_ghash_mul_next (ctx, (d[0])); + + /* AES rounds 0 and 1 */ + aes_gcm_enc_first_round (ctx, r, 4); + aes_gcm_enc_round (r, k + 1, 4); + + /* GHASH multiply block 5 */ + aes_gcm_ghash_mul_next (ctx, (d[1])); + + /* AES rounds 2 and 3 */ + aes_gcm_enc_round (r, k + 2, 4); + aes_gcm_enc_round (r, k + 3, 4); + + /* GHASH multiply block 6 */ + aes_gcm_ghash_mul_next (ctx, (d[2])); + + /* AES rounds 4 and 5 */ + aes_gcm_enc_round (r, k + 4, 4); + aes_gcm_enc_round (r, k + 5, 4); + + /* GHASH multiply block 7 */ + aes_gcm_ghash_mul_next (ctx, (d[3])); + + /* AES rounds 6 and 7 */ + aes_gcm_enc_round (r, k + 6, 4); + aes_gcm_enc_round (r, k + 7, 4); + + /* GHASH reduce 1st step */ + aes_gcm_ghash_reduce (ctx); + + /* AES rounds 8 and 9 */ + aes_gcm_enc_round (r, k + 8, 4); + aes_gcm_enc_round (r, k + 9, 4); + + /* GHASH reduce 2nd step */ + aes_gcm_ghash_reduce2 (ctx); + + /* load 4 blocks of data - encrypt round */ + if (ctx->operation == AES_GCM_OP_ENCRYPT) + for (int i = 0; i < 4; i++) + d[i] = sv[i + 4]; + + /* AES last round(s) */ + aes_gcm_enc_last_round (ctx, r, d, k, 4); + + /* store data */ + for (int i = 0; i < 4; i++) + dv[i + 4] = d[i]; + + /* GHASH final step */ + aes_gcm_ghash_final (ctx); +} + +static_always_inline void +aes_gcm_mask_bytes (aes_data_t *d, uword n_bytes) +{ + const union + { + u8 b[64]; + aes_data_t r; + } scale = { + .b = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 }, + }; + + d[0] &= (aes_gcm_splat (n_bytes) > scale.r); +} + +static_always_inline void +aes_gcm_calc_last (aes_gcm_ctx_t *ctx, aes_data_t *d, int n_blocks, + u32 n_bytes) +{ + int n_lanes = (N_LANES == 1 ? 
n_blocks : (n_bytes + 15) / 16) + 1; + n_bytes -= (n_blocks - 1) * N; + int i; + + aes_gcm_enc_ctr0_round (ctx, 0); + aes_gcm_enc_ctr0_round (ctx, 1); + + if (n_bytes != N) + aes_gcm_mask_bytes (d + n_blocks - 1, n_bytes); + + aes_gcm_ghash_mul_first (ctx, d[0], n_lanes); + + aes_gcm_enc_ctr0_round (ctx, 2); + aes_gcm_enc_ctr0_round (ctx, 3); + + if (n_blocks > 1) + aes_gcm_ghash_mul_next (ctx, d[1]); + + aes_gcm_enc_ctr0_round (ctx, 4); + aes_gcm_enc_ctr0_round (ctx, 5); + + if (n_blocks > 2) + aes_gcm_ghash_mul_next (ctx, d[2]); + + aes_gcm_enc_ctr0_round (ctx, 6); + aes_gcm_enc_ctr0_round (ctx, 7); + + if (n_blocks > 3) + aes_gcm_ghash_mul_next (ctx, d[3]); + + aes_gcm_enc_ctr0_round (ctx, 8); + aes_gcm_enc_ctr0_round (ctx, 9); + + aes_gcm_ghash_mul_bit_len (ctx); + aes_gcm_ghash_reduce (ctx); + + for (i = 10; i < ctx->rounds; i++) + aes_gcm_enc_ctr0_round (ctx, i); + + aes_gcm_ghash_reduce2 (ctx); + + aes_gcm_ghash_final (ctx); + + aes_gcm_enc_ctr0_round (ctx, i); +} + +static_always_inline void +aes_gcm_enc (aes_gcm_ctx_t *ctx, const u8 *src, u8 *dst, u32 n_left) +{ + aes_data_t d[4]; + + if (PREDICT_FALSE (n_left == 0)) + { + int i; + for (i = 0; i < ctx->rounds + 1; i++) + aes_gcm_enc_ctr0_round (ctx, i); + return; + } + + if (n_left < 4 * N) + { + ctx->last = 1; + if (n_left > 3 * N) + { + aes_gcm_calc (ctx, d, src, dst, 4, n_left, /* with_ghash */ 0); + aes_gcm_calc_last (ctx, d, 4, n_left); + } + else if (n_left > 2 * N) + { + aes_gcm_calc (ctx, d, src, dst, 3, n_left, /* with_ghash */ 0); + aes_gcm_calc_last (ctx, d, 3, n_left); + } + else if (n_left > N) + { + aes_gcm_calc (ctx, d, src, dst, 2, n_left, /* with_ghash */ 0); + aes_gcm_calc_last (ctx, d, 2, n_left); + } + else + { + aes_gcm_calc (ctx, d, src, dst, 1, n_left, /* with_ghash */ 0); + aes_gcm_calc_last (ctx, d, 1, n_left); + } + return; + } + aes_gcm_calc (ctx, d, src, dst, 4, 4 * N, /* with_ghash */ 0); + + /* next */ + n_left -= 4 * N; + dst += 4 * N; + src += 4 * N; + + for (; n_left >= 8 * N; n_left -= 8 * N, src += 8 * N, dst += 8 * N) + aes_gcm_calc_double (ctx, d, src, dst, /* with_ghash */ 1); + + if (n_left >= 4 * N) + { + aes_gcm_calc (ctx, d, src, dst, 4, 4 * N, /* with_ghash */ 1); + + /* next */ + n_left -= 4 * N; + dst += 4 * N; + src += 4 * N; + } + + if (n_left == 0) + { + aes_gcm_calc_last (ctx, d, 4, 4 * N); + return; + } + + ctx->last = 1; + + if (n_left > 3 * N) + { + aes_gcm_calc (ctx, d, src, dst, 4, n_left, /* with_ghash */ 1); + aes_gcm_calc_last (ctx, d, 4, n_left); + } + else if (n_left > 2 * N) + { + aes_gcm_calc (ctx, d, src, dst, 3, n_left, /* with_ghash */ 1); + aes_gcm_calc_last (ctx, d, 3, n_left); + } + else if (n_left > N) + { + aes_gcm_calc (ctx, d, src, dst, 2, n_left, /* with_ghash */ 1); + aes_gcm_calc_last (ctx, d, 2, n_left); + } + else + { + aes_gcm_calc (ctx, d, src, dst, 1, n_left, /* with_ghash */ 1); + aes_gcm_calc_last (ctx, d, 1, n_left); + } +} + +static_always_inline void +aes_gcm_dec (aes_gcm_ctx_t *ctx, const u8 *src, u8 *dst, uword n_left) +{ + aes_data_t d[4] = {}; + for (; n_left >= 8 * N; n_left -= 8 * N, dst += 8 * N, src += 8 * N) + aes_gcm_calc_double (ctx, d, src, dst, /* with_ghash */ 1); + + if (n_left >= 4 * N) + { + aes_gcm_calc (ctx, d, src, dst, 4, 4 * N, /* with_ghash */ 1); + + /* next */ + n_left -= 4 * N; + dst += N * 4; + src += N * 4; + } + + if (n_left == 0) + goto done; + + ctx->last = 1; + + if (n_left > 3 * N) + aes_gcm_calc (ctx, d, src, dst, 4, n_left, /* with_ghash */ 1); + else if (n_left > 2 * N) + aes_gcm_calc (ctx, d, src, dst, 3, 
n_left, /* with_ghash */ 1); + else if (n_left > N) + aes_gcm_calc (ctx, d, src, dst, 2, n_left, /* with_ghash */ 1); + else + aes_gcm_calc (ctx, d, src, dst, 1, n_left, /* with_ghash */ 1); + + u8x16 r; +done: + r = (u8x16) ((u64x2){ ctx->data_bytes, ctx->aad_bytes } << 3); + ctx->T = ghash_mul (r ^ ctx->T, ctx->Hi[NUM_HI - 1]); + + /* encrypt counter 0 E(Y0, k) */ + for (int i = 0; i < ctx->rounds + 1; i += 1) + aes_gcm_enc_ctr0_round (ctx, i); +} + +static_always_inline int +aes_gcm (const u8 *src, u8 *dst, const u8 *aad, u8 *ivp, u8 *tag, + u32 data_bytes, u32 aad_bytes, u8 tag_len, + const aes_gcm_key_data_t *kd, int aes_rounds, aes_gcm_op_t op) +{ + u8 *addt = (u8 *) aad; + u32x4 Y0; + + aes_gcm_ctx_t _ctx = { .counter = 2, + .rounds = aes_rounds, + .operation = op, + .data_bytes = data_bytes, + .aad_bytes = aad_bytes, + .Hi = kd->Hi }, + *ctx = &_ctx; + + /* initalize counter */ + Y0 = (u32x4) (u64x2){ *(u64u *) ivp, 0 }; + Y0[2] = *(u32u *) (ivp + 8); + Y0[3] = 1 << 24; + ctx->EY0 = (u8x16) Y0; + ctx->Ke = kd->Ke; +#if N_LANES == 4 + ctx->Y = u32x16_splat_u32x4 (Y0) + (u32x16){ + 0, 0, 0, 1 << 24, 0, 0, 0, 2 << 24, 0, 0, 0, 3 << 24, 0, 0, 0, 4 << 24, + }; +#elif N_LANES == 2 + ctx->Y = + u32x8_splat_u32x4 (Y0) + (u32x8){ 0, 0, 0, 1 << 24, 0, 0, 0, 2 << 24 }; +#else + ctx->Y = Y0 + (u32x4){ 0, 0, 0, 1 << 24 }; +#endif + + /* calculate ghash for AAD */ + aes_gcm_ghash (ctx, addt, aad_bytes); + + clib_prefetch_load (tag); + + /* ghash and encrypt/edcrypt */ + if (op == AES_GCM_OP_ENCRYPT) + aes_gcm_enc (ctx, src, dst, data_bytes); + else if (op == AES_GCM_OP_DECRYPT) + aes_gcm_dec (ctx, src, dst, data_bytes); + + /* final tag is */ + ctx->T = u8x16_reflect (ctx->T) ^ ctx->EY0; + + /* tag_len 16 -> 0 */ + tag_len &= 0xf; + + if (op == AES_GCM_OP_ENCRYPT || op == AES_GCM_OP_GMAC) + { + /* store tag */ + if (tag_len) + u8x16_store_partial (ctx->T, tag, tag_len); + else + ((u8x16u *) tag)[0] = ctx->T; + } + else + { + /* check tag */ + if (tag_len) + { + u16 mask = pow2_mask (tag_len); + u8x16 expected = u8x16_load_partial (tag, tag_len); + if ((u8x16_msb_mask (expected == ctx->T) & mask) == mask) + return 1; + } + else + { + if (u8x16_is_equal (ctx->T, *(u8x16u *) tag)) + return 1; + } + } + return 0; +} + +static_always_inline void +clib_aes_gcm_key_expand (aes_gcm_key_data_t *kd, const u8 *key, + aes_key_size_t ks) +{ + u8x16 H; + u8x16 ek[AES_KEY_ROUNDS (AES_KEY_256) + 1]; + aes_gcm_expaned_key_t *Ke = (aes_gcm_expaned_key_t *) kd->Ke; + + /* expand AES key */ + aes_key_expand (ek, key, ks); + for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++) + Ke[i].lanes[0] = Ke[i].lanes[1] = Ke[i].lanes[2] = Ke[i].lanes[3] = ek[i]; + + /* pre-calculate H */ + H = aes_encrypt_block (u8x16_zero (), ek, ks); + H = u8x16_reflect (H); + ghash_precompute (H, (u8x16 *) kd->Hi, ARRAY_LEN (kd->Hi)); +} + +static_always_inline void +clib_aes128_gcm_enc (const aes_gcm_key_data_t *kd, const u8 *plaintext, + u32 data_bytes, const u8 *aad, u32 aad_bytes, + const u8 *iv, u32 tag_bytes, u8 *cyphertext, u8 *tag) +{ + aes_gcm (plaintext, cyphertext, aad, (u8 *) iv, tag, data_bytes, aad_bytes, + tag_bytes, kd, AES_KEY_ROUNDS (AES_KEY_128), AES_GCM_OP_ENCRYPT); +} + +static_always_inline void +clib_aes256_gcm_enc (const aes_gcm_key_data_t *kd, const u8 *plaintext, + u32 data_bytes, const u8 *aad, u32 aad_bytes, + const u8 *iv, u32 tag_bytes, u8 *cyphertext, u8 *tag) +{ + aes_gcm (plaintext, cyphertext, aad, (u8 *) iv, tag, data_bytes, aad_bytes, + tag_bytes, kd, AES_KEY_ROUNDS (AES_KEY_256), AES_GCM_OP_ENCRYPT); +} + 
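+/*
+ * Example usage (an illustrative sketch only, not part of the API added by
+ * this patch; buffer names and sizes are hypothetical and key/iv/pt are
+ * assumed to be filled in by the caller):
+ *
+ *   aes_gcm_key_data_t kd;
+ *   u8 key[16], iv[12], tag[16], pt[256], ct[256];
+ *
+ *   clib_aes_gcm_key_expand (&kd, key, AES_KEY_128);
+ *   clib_aes128_gcm_enc (&kd, pt, sizeof (pt), 0, 0, iv, sizeof (tag), ct,
+ *                        tag);
+ *
+ * Decryption uses clib_aes128_gcm_dec() (defined just below) with the same
+ * arguments plus the received tag; it returns 1 only when the computed tag
+ * matches.
+ */
+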
+static_always_inline int +clib_aes128_gcm_dec (const aes_gcm_key_data_t *kd, const u8 *cyphertext, + u32 data_bytes, const u8 *aad, u32 aad_bytes, + const u8 *iv, const u8 *tag, u32 tag_bytes, u8 *plaintext) +{ + return aes_gcm (cyphertext, plaintext, aad, (u8 *) iv, (u8 *) tag, + data_bytes, aad_bytes, tag_bytes, kd, + AES_KEY_ROUNDS (AES_KEY_128), AES_GCM_OP_DECRYPT); +} + +static_always_inline int +clib_aes256_gcm_dec (const aes_gcm_key_data_t *kd, const u8 *cyphertext, + u32 data_bytes, const u8 *aad, u32 aad_bytes, + const u8 *iv, const u8 *tag, u32 tag_bytes, u8 *plaintext) +{ + return aes_gcm (cyphertext, plaintext, aad, (u8 *) iv, (u8 *) tag, + data_bytes, aad_bytes, tag_bytes, kd, + AES_KEY_ROUNDS (AES_KEY_256), AES_GCM_OP_DECRYPT); +} + +static_always_inline void +clib_aes128_gmac (const aes_gcm_key_data_t *kd, const u8 *data, u32 data_bytes, + const u8 *iv, u32 tag_bytes, u8 *tag) +{ + aes_gcm (0, 0, data, (u8 *) iv, tag, 0, data_bytes, tag_bytes, kd, + AES_KEY_ROUNDS (AES_KEY_128), AES_GCM_OP_GMAC); +} + +static_always_inline void +clib_aes256_gmac (const aes_gcm_key_data_t *kd, const u8 *data, u32 data_bytes, + const u8 *iv, u32 tag_bytes, u8 *tag) +{ + aes_gcm (0, 0, data, (u8 *) iv, tag, 0, data_bytes, tag_bytes, kd, + AES_KEY_ROUNDS (AES_KEY_256), AES_GCM_OP_GMAC); +} + +#endif /* __crypto_aes_gcm_h__ */ diff --git a/src/vppinfra/crypto/ghash.h b/src/vppinfra/crypto/ghash.h new file mode 100644 index 00000000000..bae8badb5fc --- /dev/null +++ b/src/vppinfra/crypto/ghash.h @@ -0,0 +1,515 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2019 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +/* + *------------------------------------------------------------------ + * Copyright(c) 2018, Intel Corporation All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *------------------------------------------------------------------
+ */
+
+/*
+ * Based on work by: Shay Gueron, Michael E. Kounavis, Erdinc Ozturk,
+ * Vinodh Gopal, James Guilford, Tomasz Kantecki
+ *
+ * References:
+ * [1] Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on
+ *     Intel Architecture Processors. August 2010.
+ * [2] Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on
+ *     Intel Architecture Processors. October 2012.
+ * [3] intel-ipsec-mb library, https://github.com/01org/intel-ipsec-mb.git
+ *
+ * Definitions:
+ * GF     Galois Extension Field GF(2^128) - finite field where elements are
+ *        represented as polynomials with coefficients in GF(2) with the
+ *        highest degree of 127. Polynomials are represented as 128-bit
+ *        binary numbers where each bit represents one coefficient.
+ *        e.g. polynomial x^5 + x^3 + x + 1 is represented in binary 101011.
+ * H      hash key (128 bit)
+ * POLY   irreducible polynomial x^127 + x^7 + x^2 + x + 1
+ * RPOLY  irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1
+ * +      addition in GF, which equals the XOR operation
+ * *      multiplication in GF
+ *
+ * GF multiplication consists of 2 steps:
+ * - carry-less multiplication of two 128-bit operands into a 256-bit result
+ * - reduction of the 256-bit result to 128 bits modulo POLY
+ *
+ * GHASH is calculated on 128-bit blocks of data according to the following
+ * formula:
+ *    GH = (GH + data) * hash_key
+ *
+ * To avoid bit-reflection of data, this code uses GF multiplication
+ * with the reversed polynomial:
+ *    a * b * x^-127 mod RPOLY
+ *
+ * To improve computation speed, table Hi is precomputed with powers of H',
+ * where H' is calculated as H<<1 mod RPOLY.
+ * This allows us to improve performance by deferring reduction. For example,
+ * to calculate the ghash of 4 128-bit blocks of data (b0, b1, b2, b3), we
+ * can do:
+ *
+ * u8x16 Hi[4];
+ * ghash_precompute (H, Hi, 4);
+ *
+ * ghash_data_t _gd, *gd = &_gd;
+ * ghash_mul_first (gd, GH ^ b0, Hi[3]);
+ * ghash_mul_next (gd, b1, Hi[2]);
+ * ghash_mul_next (gd, b2, Hi[1]);
+ * ghash_mul_next (gd, b3, Hi[0]);
+ * ghash_reduce (gd);
+ * ghash_reduce2 (gd);
+ * GH = ghash_final (gd);
+ *
+ * The reduction step is split into 3 functions so it can be better
+ * interleaved with other code (i.e. with AES computation).
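+ *
+ * A sketch of that interleaving (illustrative only; aes_enc_round and
+ * aes_enc_last_round come from vppinfra/crypto/aes.h added in this patch,
+ * r is the AES state, k[] is the expanded AES-128 key and b0..b3 are the
+ * data blocks):
+ *
+ * ghash_mul_first (gd, GH ^ b0, Hi[3]);
+ * r = aes_enc_round (r, k[1]);
+ * ghash_mul_next (gd, b1, Hi[2]);
+ * r = aes_enc_round (r, k[2]);
+ * ...
+ * ghash_mul_next (gd, b3, Hi[0]);
+ * r = aes_enc_round (r, k[7]);
+ * ghash_reduce (gd);
+ * r = aes_enc_round (r, k[8]);
+ * ghash_reduce2 (gd);
+ * r = aes_enc_round (r, k[9]);
+ * GH = ghash_final (gd);
+ * r = aes_enc_last_round (r, k[10]);
+ *
+ * The independent AES rounds can execute while the carry-less multiply
+ * results are still in flight; aes_gcm_calc() and aes_gcm_calc_double() in
+ * aes_gcm.h interleave these functions in a similar fashion.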
+ */ + +#ifndef __ghash_h__ +#define __ghash_h__ + +static_always_inline u8x16 +gmul_lo_lo (u8x16 a, u8x16 b) +{ +#if defined (__PCLMUL__) + return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x00); +#elif defined (__ARM_FEATURE_CRYPTO) + return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a), + (poly64_t) vget_low_p64 ((poly64x2_t) b)); +#endif +} + +static_always_inline u8x16 +gmul_hi_lo (u8x16 a, u8x16 b) +{ +#if defined (__PCLMUL__) + return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x01); +#elif defined (__ARM_FEATURE_CRYPTO) + return (u8x16) vmull_p64 ((poly64_t) vget_high_p64 ((poly64x2_t) a), + (poly64_t) vget_low_p64 ((poly64x2_t) b)); +#endif +} + +static_always_inline u8x16 +gmul_lo_hi (u8x16 a, u8x16 b) +{ +#if defined (__PCLMUL__) + return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x10); +#elif defined (__ARM_FEATURE_CRYPTO) + return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a), + (poly64_t) vget_high_p64 ((poly64x2_t) b)); +#endif +} + +static_always_inline u8x16 +gmul_hi_hi (u8x16 a, u8x16 b) +{ +#if defined (__PCLMUL__) + return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x11); +#elif defined (__ARM_FEATURE_CRYPTO) + return (u8x16) vmull_high_p64 ((poly64x2_t) a, (poly64x2_t) b); +#endif +} + +typedef struct +{ + u8x16 mid, hi, lo, tmp_lo, tmp_hi; + u8x32 hi2, lo2, mid2, tmp_lo2, tmp_hi2; + u8x64 hi4, lo4, mid4, tmp_lo4, tmp_hi4; + int pending; +} ghash_data_t; + +static const u8x16 ghash_poly = { + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2 +}; + +static const u8x16 ghash_poly2 = { + 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2 +}; + +static_always_inline void +ghash_mul_first (ghash_data_t * gd, u8x16 a, u8x16 b) +{ + /* a1 * b1 */ + gd->hi = gmul_hi_hi (a, b); + /* a0 * b0 */ + gd->lo = gmul_lo_lo (a, b); + /* a0 * b1 ^ a1 * b0 */ + gd->mid = gmul_hi_lo (a, b) ^ gmul_lo_hi (a, b); + + /* set gd->pending to 0 so next invocation of ghash_mul_next(...) 
knows that
+     there is no pending data in tmp_lo and tmp_hi */
+  gd->pending = 0;
+}
+
+static_always_inline void
+ghash_mul_next (ghash_data_t * gd, u8x16 a, u8x16 b)
+{
+  /* a1 * b1 */
+  u8x16 hi = gmul_hi_hi (a, b);
+  /* a0 * b0 */
+  u8x16 lo = gmul_lo_lo (a, b);
+
+  /* this branch will be optimized out by the compiler, and it allows us to
+     reduce the number of XOR operations by using ternary logic */
+  if (gd->pending)
+    {
+      /* there is pending data from the previous invocation so we can XOR */
+      gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, hi);
+      gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, lo);
+      gd->pending = 0;
+    }
+  else
+    {
+      /* there is no pending data from the previous invocation so we postpone
+         the XOR */
+      gd->tmp_hi = hi;
+      gd->tmp_lo = lo;
+      gd->pending = 1;
+    }
+
+  /* gd->mid ^= a0 * b1 ^ a1 * b0 */
+  gd->mid = u8x16_xor3 (gd->mid, gmul_hi_lo (a, b), gmul_lo_hi (a, b));
+}
+
+static_always_inline void
+ghash_reduce (ghash_data_t * gd)
+{
+  u8x16 r;
+
+  /* Final combination:
+     gd->lo ^= gd->mid << 64
+     gd->hi ^= gd->mid >> 64 */
+  u8x16 midl = u8x16_word_shift_left (gd->mid, 8);
+  u8x16 midr = u8x16_word_shift_right (gd->mid, 8);
+
+  if (gd->pending)
+    {
+      gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, midl);
+      gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, midr);
+    }
+  else
+    {
+      gd->lo ^= midl;
+      gd->hi ^= midr;
+    }
+  r = gmul_hi_lo (ghash_poly2, gd->lo);
+  gd->lo ^= u8x16_word_shift_left (r, 8);
+}
+
+static_always_inline void
+ghash_reduce2 (ghash_data_t * gd)
+{
+  gd->tmp_lo = gmul_lo_lo (ghash_poly2, gd->lo);
+  gd->tmp_hi = gmul_lo_hi (ghash_poly2, gd->lo);
+}
+
+static_always_inline u8x16
+ghash_final (ghash_data_t * gd)
+{
+  return u8x16_xor3 (gd->hi, u8x16_word_shift_right (gd->tmp_lo, 4),
+                     u8x16_word_shift_left (gd->tmp_hi, 4));
+}
+
+static_always_inline u8x16
+ghash_mul (u8x16 a, u8x16 b)
+{
+  ghash_data_t _gd, *gd = &_gd;
+  ghash_mul_first (gd, a, b);
+  ghash_reduce (gd);
+  ghash_reduce2 (gd);
+  return ghash_final (gd);
+}
+
+#if defined(__VPCLMULQDQ__) && defined(__AVX512F__)
+
+static const u8x64 ghash4_poly2 = {
+  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+};
+
+static_always_inline u8x64
+gmul4_lo_lo (u8x64 a, u8x64 b)
+{
+  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x00);
+}
+
+static_always_inline u8x64
+gmul4_hi_lo (u8x64 a, u8x64 b)
+{
+  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x01);
+}
+
+static_always_inline u8x64
+gmul4_lo_hi (u8x64 a, u8x64 b)
+{
+  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x10);
+}
+
+static_always_inline u8x64
+gmul4_hi_hi (u8x64 a, u8x64 b)
+{
+  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x11);
+}
+
+static_always_inline void
+ghash4_mul_first (ghash_data_t *gd, u8x64 a, u8x64 b)
+{
+  gd->hi4 = gmul4_hi_hi (a, b);
+  gd->lo4 = gmul4_lo_lo (a, b);
+  gd->mid4 = gmul4_hi_lo (a, b) ^ gmul4_lo_hi (a, b);
+  gd->pending = 0;
+}
+
+static_always_inline void
+ghash4_mul_next (ghash_data_t *gd, u8x64 a, u8x64 b)
+{
+  u8x64 hi = gmul4_hi_hi (a, b);
+  u8x64 lo = gmul4_lo_lo (a, b);
+
+  if (gd->pending)
+    {
+      /* there is pending data from the previous invocation so we can XOR */
+      gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, hi);
+      gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, lo);
+      gd->pending = 0;
+    }
+  else
+    {
+      /* there is no pending data from the previous invocation so we postpone
+         the XOR */
+      gd->tmp_hi4 = hi;
+      gd->tmp_lo4 = lo;
+      gd->pending = 1;
+    }
+  gd->mid4 = u8x64_xor3 (gd->mid4, gmul4_hi_lo (a, b), gmul4_lo_hi (a, b));
+}
+
+static_always_inline void
+ghash4_reduce (ghash_data_t *gd)
+{
+  u8x64 r;
+
+  /* Final combination:
+     gd->lo4 ^= gd->mid4 << 64
+     gd->hi4 ^= gd->mid4 >> 64 */
+
+  u8x64 midl = u8x64_word_shift_left (gd->mid4, 8);
+  u8x64 midr = u8x64_word_shift_right (gd->mid4, 8);
+
+  if (gd->pending)
+    {
+      gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, midl);
+      gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, midr);
+    }
+  else
+    {
+      gd->lo4 ^= midl;
+      gd->hi4 ^= midr;
+    }
+
+  r = gmul4_hi_lo (ghash4_poly2, gd->lo4);
+  gd->lo4 ^= u8x64_word_shift_left (r, 8);
+}
+
+static_always_inline void
+ghash4_reduce2 (ghash_data_t *gd)
+{
+  gd->tmp_lo4 = gmul4_lo_lo (ghash4_poly2, gd->lo4);
+  gd->tmp_hi4 = gmul4_lo_hi (ghash4_poly2, gd->lo4);
+}
+
+static_always_inline u8x16
+ghash4_final (ghash_data_t *gd)
+{
+  u8x64 r;
+  u8x32 t;
+
+  r = u8x64_xor3 (gd->hi4, u8x64_word_shift_right (gd->tmp_lo4, 4),
+                  u8x64_word_shift_left (gd->tmp_hi4, 4));
+
+  /* horizontal XOR of 4 128-bit lanes */
+  t = u8x64_extract_lo (r) ^ u8x64_extract_hi (r);
+  return u8x32_extract_hi (t) ^ u8x32_extract_lo (t);
+}
+#endif
+
+#if defined(__VPCLMULQDQ__)
+
+static const u8x32 ghash2_poly2 = {
+  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+};
+
+static_always_inline u8x32
+gmul2_lo_lo (u8x32 a, u8x32 b)
+{
+  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x00);
+}
+
+static_always_inline u8x32
+gmul2_hi_lo (u8x32 a, u8x32 b)
+{
+  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x01);
+}
+
+static_always_inline u8x32
+gmul2_lo_hi (u8x32 a, u8x32 b)
+{
+  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x10);
+}
+
+static_always_inline u8x32
+gmul2_hi_hi (u8x32 a, u8x32 b)
+{
+  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x11);
+}
+
+static_always_inline void
+ghash2_mul_first (ghash_data_t *gd, u8x32 a, u8x32 b)
+{
+  gd->hi2 = gmul2_hi_hi (a, b);
+  gd->lo2 = gmul2_lo_lo (a, b);
+  gd->mid2 = gmul2_hi_lo (a, b) ^ gmul2_lo_hi (a, b);
+  gd->pending = 0;
+}
+
+static_always_inline void
+ghash2_mul_next (ghash_data_t *gd, u8x32 a, u8x32 b)
+{
+  u8x32 hi = gmul2_hi_hi (a, b);
+  u8x32 lo = gmul2_lo_lo (a, b);
+
+  if (gd->pending)
+    {
+      /* there is pending data from the previous invocation so we can XOR */
+      gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, hi);
+      gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, lo);
+      gd->pending = 0;
+    }
+  else
+    {
+      /* there is no pending data from the previous invocation so we postpone
+         the XOR */
+      gd->tmp_hi2 = hi;
+      gd->tmp_lo2 = lo;
+      gd->pending = 1;
+    }
+  gd->mid2 = u8x32_xor3 (gd->mid2, gmul2_hi_lo (a, b), gmul2_lo_hi (a, b));
+}
+
+static_always_inline void
+ghash2_reduce (ghash_data_t *gd)
+{
+  u8x32 r;
+
+  /* Final combination:
+     gd->lo2 ^= gd->mid2 << 64
+     gd->hi2 ^= gd->mid2 >> 64 */
+
+  u8x32 midl = u8x32_word_shift_left (gd->mid2, 8);
+  u8x32 midr = u8x32_word_shift_right (gd->mid2, 8);
+
+  if (gd->pending)
+    {
+      gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, midl);
+      gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, midr);
+    }
+  else
+    {
+      gd->lo2 ^= midl;
+      gd->hi2 ^= midr;
+    }
+
+  r = gmul2_hi_lo (ghash2_poly2, gd->lo2);
+  gd->lo2 ^= u8x32_word_shift_left (r, 8);
+}
+
+static_always_inline void
+ghash2_reduce2 (ghash_data_t *gd)
+{
+  gd->tmp_lo2 = gmul2_lo_lo (ghash2_poly2, gd->lo2);
+  gd->tmp_hi2 = gmul2_lo_hi (ghash2_poly2, gd->lo2);
+}
+
+static_always_inline u8x16
+ghash2_final (ghash_data_t *gd)
+{
+  u8x32 r;
+
+  r = u8x32_xor3 (gd->hi2, u8x32_word_shift_right (gd->tmp_lo2, 4),
+                  u8x32_word_shift_left (gd->tmp_hi2, 4));
+
+  /* horizontal XOR of 2 128-bit lanes */
+  return u8x32_extract_hi (r) ^ u8x32_extract_lo (r);
+}
+#endif
+
+static_always_inline void
+ghash_precompute (u8x16 H, u8x16 * Hi, int n)
+{
+  u8x16 r8;
+  u32x4 r32;
+  /* calculate H<<1 mod poly from the hash key */
+  r8 = (u8x16) ((u64x2) H >> 63);
+  H = (u8x16) ((u64x2) H << 1);
+  H |= u8x16_word_shift_left (r8, 8);
+  r32 = (u32x4) u8x16_word_shift_right (r8, 8);
+#ifdef __SSE2__
+  r32 = u32x4_shuffle (r32, 0, 1, 2, 0);
+#else
+  r32[3] = r32[0];
+#endif
+  r32 = r32 == (u32x4) {1, 0, 0, 1};
+  Hi[n - 1] = H = H ^ ((u8x16) r32 & ghash_poly);
+
+  /* calculate the remaining powers: Hi[i] = H' * Hi[i + 1] */
+  for (int i = n - 2; i >= 0; i--)
+    Hi[i] = ghash_mul (H, Hi[i + 1]);
+}
+
+#endif /* __ghash_h__ */
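Usage note: a minimal sketch of how the single-buffer AES-GCM and AES-GMAC
wrappers defined in aes_gcm.h above might be called. The function and buffer
names below are illustrative only; the sketch assumes kd already holds the
expanded key material and precomputed hash subkeys, and that the decrypt
wrapper returns non-zero when the 16-byte authentication tag verifies.

#include <vppinfra/crypto/aes_gcm.h>

/* Illustrative sketch: AES-128-GCM decrypt-and-verify, then a standalone
   AES-128-GMAC over the AAD. Assumes kd was initialized beforehand. */
static int
example_aes128_gcm_dec_and_gmac (const aes_gcm_key_data_t *kd, const u8 *ct,
                                 u32 ct_len, const u8 *aad, u32 aad_len,
                                 const u8 *iv, const u8 *rx_tag, u8 *pt)
{
  u8 gmac_tag[16];

  /* a non-zero return is taken to mean the received 16-byte tag verified */
  if (!clib_aes128_gcm_dec (kd, ct, ct_len, aad, aad_len, iv, rx_tag, 16, pt))
    return -1;

  /* authenticate-only: no data is encrypted, only a tag is produced */
  clib_aes128_gmac (kd, aad, aad_len, iv, sizeof (gmac_tag), gmac_tag);
  return 0;
}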