Diffstat (limited to 'src/vppinfra/crypto')
-rw-r--r-- | src/vppinfra/crypto/aes.h      | 491
-rw-r--r-- | src/vppinfra/crypto/aes_cbc.h  | 542
-rw-r--r-- | src/vppinfra/crypto/aes_ctr.h  | 190
-rw-r--r-- | src/vppinfra/crypto/aes_gcm.h  | 944
-rw-r--r-- | src/vppinfra/crypto/ghash.h    | 515
-rw-r--r-- | src/vppinfra/crypto/poly1305.h | 234
-rw-r--r-- | src/vppinfra/crypto/sha2.h     | 715
7 files changed, 3631 insertions, 0 deletions
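
The new headers layer one-shot helpers on top of the vectorized primitives. A minimal usage sketch based on the signatures added below (buffer names such as key, iv, gcm_iv, aad, plaintext are hypothetical caller-provided arrays; iv is a 16-byte IV/counter block, gcm_iv a 96-bit GCM IV, and CBC operates on whole 16-byte blocks):

#include <vppinfra/crypto/aes_cbc.h>
#include <vppinfra/crypto/aes_ctr.h>
#include <vppinfra/crypto/aes_gcm.h>

static void
example_aes128 (const u8 *key, const u8 *iv, const u8 *gcm_iv, const u8 *aad,
		u32 aad_bytes, const u8 *plaintext, u32 n_bytes,
		u8 *ciphertext, u8 *plaintext_out, u8 *tag)
{
  /* CBC: expand the key once, then encrypt/decrypt (n_bytes % 16 == 0) */
  aes_cbc_key_data_t cbc_kd;
  clib_aes128_cbc_key_expand (&cbc_kd, key);
  clib_aes128_cbc_encrypt (&cbc_kd, plaintext, n_bytes, iv, ciphertext);
  clib_aes128_cbc_decrypt (&cbc_kd, ciphertext, n_bytes, iv, plaintext_out);

  /* CTR: the same call performs encryption and decryption */
  aes_ctr_key_data_t ctr_kd;
  clib_aes_ctr_key_expand (&ctr_kd, key, AES_KEY_128);
  clib_aes128_ctr (&ctr_kd, plaintext, n_bytes, iv, ciphertext);

  /* GCM: 96-bit IV, 16-byte tag; decrypt returns 1 when the tag verifies */
  aes_gcm_key_data_t gcm_kd;
  clib_aes_gcm_key_expand (&gcm_kd, key, AES_KEY_128);
  clib_aes128_gcm_enc (&gcm_kd, plaintext, n_bytes, aad, aad_bytes, gcm_iv,
		       16, ciphertext, tag);
  int ok = clib_aes128_gcm_dec (&gcm_kd, ciphertext, n_bytes, aad, aad_bytes,
				gcm_iv, tag, 16, plaintext_out);
  (void) ok;
}
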
diff --git a/src/vppinfra/crypto/aes.h b/src/vppinfra/crypto/aes.h new file mode 100644 index 00000000000..9e80e3b0318 --- /dev/null +++ b/src/vppinfra/crypto/aes.h @@ -0,0 +1,491 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2020 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#ifndef __aes_h__ +#define __aes_h__ + +typedef enum +{ + AES_KEY_128 = 0, + AES_KEY_192 = 1, + AES_KEY_256 = 2, +} aes_key_size_t; + +#define AES_KEY_ROUNDS(x) (10 + x * 2) +#define AES_KEY_BYTES(x) (16 + x * 8) + +static_always_inline u8x16 +aes_block_load (u8 * p) +{ + return *(u8x16u *) p; +} + +static_always_inline u8x16 +aes_enc_round_x1 (u8x16 a, u8x16 k) +{ +#if defined (__AES__) + return (u8x16) _mm_aesenc_si128 ((__m128i) a, (__m128i) k); +#elif defined (__ARM_FEATURE_CRYPTO) + return vaesmcq_u8 (vaeseq_u8 (a, u8x16_splat (0))) ^ k; +#endif +} + +#if defined(__VAES__) && defined(__AVX512F__) +static_always_inline u8x64 +aes_enc_round_x4 (u8x64 a, u8x64 k) +{ + return (u8x64) _mm512_aesenc_epi128 ((__m512i) a, (__m512i) k); +} + +static_always_inline u8x64 +aes_enc_last_round_x4 (u8x64 a, u8x64 k) +{ + return (u8x64) _mm512_aesenclast_epi128 ((__m512i) a, (__m512i) k); +} + +static_always_inline u8x64 +aes_dec_round_x4 (u8x64 a, u8x64 k) +{ + return (u8x64) _mm512_aesdec_epi128 ((__m512i) a, (__m512i) k); +} + +static_always_inline u8x64 +aes_dec_last_round_x4 (u8x64 a, u8x64 k) +{ + return (u8x64) _mm512_aesdeclast_epi128 ((__m512i) a, (__m512i) k); +} +#endif + +#ifdef __VAES__ +static_always_inline u8x32 +aes_enc_round_x2 (u8x32 a, u8x32 k) +{ + return (u8x32) _mm256_aesenc_epi128 ((__m256i) a, (__m256i) k); +} + +static_always_inline u8x32 +aes_enc_last_round_x2 (u8x32 a, u8x32 k) +{ + return (u8x32) _mm256_aesenclast_epi128 ((__m256i) a, (__m256i) k); +} + +static_always_inline u8x32 +aes_dec_round_x2 (u8x32 a, u8x32 k) +{ + return (u8x32) _mm256_aesdec_epi128 ((__m256i) a, (__m256i) k); +} + +static_always_inline u8x32 +aes_dec_last_round_x2 (u8x32 a, u8x32 k) +{ + return (u8x32) _mm256_aesdeclast_epi128 ((__m256i) a, (__m256i) k); +} +#endif + +static_always_inline u8x16 +aes_enc_last_round_x1 (u8x16 a, u8x16 k) +{ +#if defined (__AES__) + return (u8x16) _mm_aesenclast_si128 ((__m128i) a, (__m128i) k); +#elif defined (__ARM_FEATURE_CRYPTO) + return vaeseq_u8 (a, u8x16_splat (0)) ^ k; +#endif +} + +#ifdef __x86_64__ + +static_always_inline u8x16 +aes_dec_round_x1 (u8x16 a, u8x16 k) +{ + return (u8x16) _mm_aesdec_si128 ((__m128i) a, (__m128i) k); +} + +static_always_inline u8x16 +aes_dec_last_round_x1 (u8x16 a, u8x16 k) +{ + return (u8x16) _mm_aesdeclast_si128 ((__m128i) a, (__m128i) k); +} +#endif + +static_always_inline void +aes_block_store (u8 * p, u8x16 r) +{ + *(u8x16u *) p = r; +} + +static_always_inline u8x16 +aes_encrypt_block (u8x16 block, const u8x16 * round_keys, aes_key_size_t ks) +{ + int rounds = AES_KEY_ROUNDS (ks); + block ^= 
round_keys[0]; + for (int i = 1; i < rounds; i += 1) + block = aes_enc_round_x1 (block, round_keys[i]); + return aes_enc_last_round_x1 (block, round_keys[rounds]); +} + +static_always_inline u8x16 +aes_inv_mix_column (u8x16 a) +{ +#if defined (__AES__) + return (u8x16) _mm_aesimc_si128 ((__m128i) a); +#elif defined (__ARM_FEATURE_CRYPTO) + return vaesimcq_u8 (a); +#endif +} + +#ifdef __x86_64__ +#define aes_keygen_assist(a, b) \ + (u8x16) _mm_aeskeygenassist_si128((__m128i) a, b) + +/* AES-NI based AES key expansion based on code samples from + Intel(r) Advanced Encryption Standard (AES) New Instructions White Paper + (323641-001) */ + +static_always_inline void +aes128_key_assist (u8x16 * rk, u8x16 r) +{ + u8x16 t = rk[-1]; + t ^= u8x16_word_shift_left (t, 4); + t ^= u8x16_word_shift_left (t, 4); + t ^= u8x16_word_shift_left (t, 4); + rk[0] = t ^ (u8x16) u32x4_shuffle ((u32x4) r, 3, 3, 3, 3); +} + +static_always_inline void +aes128_key_expand (u8x16 *rk, u8x16u const *k) +{ + rk[0] = k[0]; + aes128_key_assist (rk + 1, aes_keygen_assist (rk[0], 0x01)); + aes128_key_assist (rk + 2, aes_keygen_assist (rk[1], 0x02)); + aes128_key_assist (rk + 3, aes_keygen_assist (rk[2], 0x04)); + aes128_key_assist (rk + 4, aes_keygen_assist (rk[3], 0x08)); + aes128_key_assist (rk + 5, aes_keygen_assist (rk[4], 0x10)); + aes128_key_assist (rk + 6, aes_keygen_assist (rk[5], 0x20)); + aes128_key_assist (rk + 7, aes_keygen_assist (rk[6], 0x40)); + aes128_key_assist (rk + 8, aes_keygen_assist (rk[7], 0x80)); + aes128_key_assist (rk + 9, aes_keygen_assist (rk[8], 0x1b)); + aes128_key_assist (rk + 10, aes_keygen_assist (rk[9], 0x36)); +} + +static_always_inline void +aes192_key_assist (u8x16 * r1, u8x16 * r2, u8x16 key_assist) +{ + u8x16 t; + r1[0] ^= t = u8x16_word_shift_left (r1[0], 4); + r1[0] ^= t = u8x16_word_shift_left (t, 4); + r1[0] ^= u8x16_word_shift_left (t, 4); + r1[0] ^= (u8x16) _mm_shuffle_epi32 ((__m128i) key_assist, 0x55); + r2[0] ^= u8x16_word_shift_left (r2[0], 4); + r2[0] ^= (u8x16) _mm_shuffle_epi32 ((__m128i) r1[0], 0xff); +} + +static_always_inline void +aes192_key_expand (u8x16 * rk, u8x16u const *k) +{ + u8x16 r1, r2; + + rk[0] = r1 = k[0]; + rk[1] = r2 = (u8x16) (u64x2) { *(u64 *) (k + 1), 0 }; + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x1)); + rk[1] = (u8x16) _mm_shuffle_pd ((__m128d) rk[1], (__m128d) r1, 0); + rk[2] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1); + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x2)); + rk[3] = r1; + rk[4] = r2; + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x4)); + rk[4] = (u8x16) _mm_shuffle_pd ((__m128d) rk[4], (__m128d) r1, 0); + rk[5] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1); + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x8)); + rk[6] = r1; + rk[7] = r2; + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x10)); + rk[7] = (u8x16) _mm_shuffle_pd ((__m128d) rk[7], (__m128d) r1, 0); + rk[8] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1); + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x20)); + rk[9] = r1; + rk[10] = r2; + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x40)); + rk[10] = (u8x16) _mm_shuffle_pd ((__m128d) rk[10], (__m128d) r1, 0); + rk[11] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1); + + aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x80)); + rk[12] = r1; +} + +static_always_inline void +aes256_key_assist (u8x16 * rk, int i, u8x16 key_assist) +{ + u8x16 r, t; + rk += i; + r = rk[-2]; + r ^= t = u8x16_word_shift_left (r, 
4); + r ^= t = u8x16_word_shift_left (t, 4); + r ^= u8x16_word_shift_left (t, 4); + r ^= (u8x16) u32x4_shuffle ((u32x4) key_assist, 3, 3, 3, 3); + rk[0] = r; + + if (i >= 14) + return; + + key_assist = aes_keygen_assist (rk[0], 0x0); + r = rk[-1]; + r ^= t = u8x16_word_shift_left (r, 4); + r ^= t = u8x16_word_shift_left (t, 4); + r ^= u8x16_word_shift_left (t, 4); + r ^= (u8x16) u32x4_shuffle ((u32x4) key_assist, 2, 2, 2, 2); + rk[1] = r; +} + +static_always_inline void +aes256_key_expand (u8x16 * rk, u8x16u const *k) +{ + rk[0] = k[0]; + rk[1] = k[1]; + aes256_key_assist (rk, 2, aes_keygen_assist (rk[1], 0x01)); + aes256_key_assist (rk, 4, aes_keygen_assist (rk[3], 0x02)); + aes256_key_assist (rk, 6, aes_keygen_assist (rk[5], 0x04)); + aes256_key_assist (rk, 8, aes_keygen_assist (rk[7], 0x08)); + aes256_key_assist (rk, 10, aes_keygen_assist (rk[9], 0x10)); + aes256_key_assist (rk, 12, aes_keygen_assist (rk[11], 0x20)); + aes256_key_assist (rk, 14, aes_keygen_assist (rk[13], 0x40)); +} +#endif + +#ifdef __aarch64__ + +static const u8x16 aese_prep_mask1 = + { 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12 }; +static const u8x16 aese_prep_mask2 = + { 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 }; + +static_always_inline void +aes128_key_expand_round_neon (u8x16 * rk, u32 rcon) +{ + u8x16 r, t, last_round = rk[-1], z = { }; + r = vqtbl1q_u8 (last_round, aese_prep_mask1); + r = vaeseq_u8 (r, z); + r ^= (u8x16) vdupq_n_u32 (rcon); + r ^= last_round; + r ^= t = vextq_u8 (z, last_round, 12); + r ^= t = vextq_u8 (z, t, 12); + r ^= vextq_u8 (z, t, 12); + rk[0] = r; +} + +static_always_inline void +aes128_key_expand (u8x16 *rk, u8x16u const *k) +{ + rk[0] = k[0]; + aes128_key_expand_round_neon (rk + 1, 0x01); + aes128_key_expand_round_neon (rk + 2, 0x02); + aes128_key_expand_round_neon (rk + 3, 0x04); + aes128_key_expand_round_neon (rk + 4, 0x08); + aes128_key_expand_round_neon (rk + 5, 0x10); + aes128_key_expand_round_neon (rk + 6, 0x20); + aes128_key_expand_round_neon (rk + 7, 0x40); + aes128_key_expand_round_neon (rk + 8, 0x80); + aes128_key_expand_round_neon (rk + 9, 0x1b); + aes128_key_expand_round_neon (rk + 10, 0x36); +} + +static_always_inline void +aes192_key_expand_round_neon (u8x8 * rk, u32 rcon) +{ + u8x8 r, last_round = rk[-1], z = { }; + u8x16 r2, z2 = { }; + + r2 = (u8x16) vdupq_lane_u64 ((uint64x1_t) last_round, 0); + r2 = vqtbl1q_u8 (r2, aese_prep_mask1); + r2 = vaeseq_u8 (r2, z2); + r2 ^= (u8x16) vdupq_n_u32 (rcon); + + r = (u8x8) vdup_laneq_u64 ((u64x2) r2, 0); + r ^= rk[-3]; + r ^= vext_u8 (z, rk[-3], 4); + rk[0] = r; + + r = rk[-2] ^ vext_u8 (r, z, 4); + r ^= vext_u8 (z, r, 4); + rk[1] = r; + + if (rcon == 0x80) + return; + + r = rk[-1] ^ vext_u8 (r, z, 4); + r ^= vext_u8 (z, r, 4); + rk[2] = r; +} + +static_always_inline void +aes192_key_expand (u8x16 * ek, const u8x16u * k) +{ + u8x8 *rk = (u8x8 *) ek; + ek[0] = k[0]; + rk[2] = *(u8x8u *) (k + 1); + aes192_key_expand_round_neon (rk + 3, 0x01); + aes192_key_expand_round_neon (rk + 6, 0x02); + aes192_key_expand_round_neon (rk + 9, 0x04); + aes192_key_expand_round_neon (rk + 12, 0x08); + aes192_key_expand_round_neon (rk + 15, 0x10); + aes192_key_expand_round_neon (rk + 18, 0x20); + aes192_key_expand_round_neon (rk + 21, 0x40); + aes192_key_expand_round_neon (rk + 24, 0x80); +} + + +static_always_inline void +aes256_key_expand_round_neon (u8x16 * rk, u32 rcon) +{ + u8x16 r, t, z = { }; + + r = vqtbl1q_u8 (rk[-1], rcon ? 
aese_prep_mask1 : aese_prep_mask2); + r = vaeseq_u8 (r, z); + if (rcon) + r ^= (u8x16) vdupq_n_u32 (rcon); + r ^= rk[-2]; + r ^= t = vextq_u8 (z, rk[-2], 12); + r ^= t = vextq_u8 (z, t, 12); + r ^= vextq_u8 (z, t, 12); + rk[0] = r; +} + +static_always_inline void +aes256_key_expand (u8x16 *rk, u8x16u const *k) +{ + rk[0] = k[0]; + rk[1] = k[1]; + aes256_key_expand_round_neon (rk + 2, 0x01); + aes256_key_expand_round_neon (rk + 3, 0); + aes256_key_expand_round_neon (rk + 4, 0x02); + aes256_key_expand_round_neon (rk + 5, 0); + aes256_key_expand_round_neon (rk + 6, 0x04); + aes256_key_expand_round_neon (rk + 7, 0); + aes256_key_expand_round_neon (rk + 8, 0x08); + aes256_key_expand_round_neon (rk + 9, 0); + aes256_key_expand_round_neon (rk + 10, 0x10); + aes256_key_expand_round_neon (rk + 11, 0); + aes256_key_expand_round_neon (rk + 12, 0x20); + aes256_key_expand_round_neon (rk + 13, 0); + aes256_key_expand_round_neon (rk + 14, 0x40); +} + +#endif + +static_always_inline void +aes_key_expand (u8x16 * key_schedule, u8 const *key, aes_key_size_t ks) +{ + switch (ks) + { + case AES_KEY_128: + aes128_key_expand (key_schedule, (u8x16u const *) key); + break; + case AES_KEY_192: + aes192_key_expand (key_schedule, (u8x16u const *) key); + break; + case AES_KEY_256: + aes256_key_expand (key_schedule, (u8x16u const *) key); + break; + } +} + +static_always_inline void +aes_key_enc_to_dec (u8x16 * ke, u8x16 * kd, aes_key_size_t ks) +{ + int rounds = AES_KEY_ROUNDS (ks); + + kd[rounds] = ke[0]; + kd[0] = ke[rounds]; + + for (int i = 1; i < (rounds / 2); i++) + { + kd[rounds - i] = aes_inv_mix_column (ke[i]); + kd[i] = aes_inv_mix_column (ke[rounds - i]); + } + + kd[rounds / 2] = aes_inv_mix_column (ke[rounds / 2]); +} +#if defined(__VAES__) && defined(__AVX512F__) +#define N_AES_LANES 4 +#define aes_load_partial(p, n) u8x64_load_partial ((u8 *) (p), n) +#define aes_store_partial(v, p, n) u8x64_store_partial (v, (u8 *) (p), n) +#define aes_reflect(r) u8x64_reflect_u8x16 (r) +typedef u8x64 aes_data_t; +typedef u8x64u aes_mem_t; +typedef u32x16 aes_counter_t; +#elif defined(__VAES__) +#define N_AES_LANES 2 +#define aes_load_partial(p, n) u8x32_load_partial ((u8 *) (p), n) +#define aes_store_partial(v, p, n) u8x32_store_partial (v, (u8 *) (p), n) +#define aes_reflect(r) u8x32_reflect_u8x16 (r) +typedef u8x32 aes_data_t; +typedef u8x32u aes_mem_t; +typedef u32x8 aes_counter_t; +#else +#define N_AES_LANES 1 +#define aes_load_partial(p, n) u8x16_load_partial ((u8 *) (p), n) +#define aes_store_partial(v, p, n) u8x16_store_partial (v, (u8 *) (p), n) +#define aes_reflect(r) u8x16_reflect (r) +typedef u8x16 aes_data_t; +typedef u8x16u aes_mem_t; +typedef u32x4 aes_counter_t; +#endif + +#define N_AES_BYTES (N_AES_LANES * 16) + +typedef union +{ + u8x16 x1; + u8x32 x2; + u8x64 x4; + u8x16 lanes[4]; +} aes_expaned_key_t; + +static_always_inline void +aes_enc_round (aes_data_t *r, const aes_expaned_key_t *ek, uword n_blocks) +{ + for (int i = 0; i < n_blocks; i++) +#if N_AES_LANES == 4 + r[i] = aes_enc_round_x4 (r[i], ek->x4); +#elif N_AES_LANES == 2 + r[i] = aes_enc_round_x2 (r[i], ek->x2); +#else + r[i] = aes_enc_round_x1 (r[i], ek->x1); +#endif +} + +static_always_inline void +aes_enc_last_round (aes_data_t *r, aes_data_t *d, const aes_expaned_key_t *ek, + uword n_blocks) +{ + for (int i = 0; i < n_blocks; i++) +#if N_AES_LANES == 4 + d[i] ^= r[i] = aes_enc_last_round_x4 (r[i], ek->x4); +#elif N_AES_LANES == 2 + d[i] ^= r[i] = aes_enc_last_round_x2 (r[i], ek->x2); +#else + d[i] ^= r[i] = aes_enc_last_round_x1 (r[i], 
ek->x1); +#endif +} + +#endif /* __aes_h__ */ diff --git a/src/vppinfra/crypto/aes_cbc.h b/src/vppinfra/crypto/aes_cbc.h new file mode 100644 index 00000000000..cb3d0784051 --- /dev/null +++ b/src/vppinfra/crypto/aes_cbc.h @@ -0,0 +1,542 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2023 Cisco Systems, Inc. + */ + +#ifndef __crypto_aes_cbc_h__ +#define __crypto_aes_cbc_h__ + +#include <vppinfra/clib.h> +#include <vppinfra/vector.h> +#include <vppinfra/crypto/aes.h> + +typedef struct +{ + const u8x16 encrypt_key[15]; + const u8x16 decrypt_key[15]; +} aes_cbc_key_data_t; + +static_always_inline void +clib_aes_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *src, uword len, + const u8 *iv, aes_key_size_t ks, u8 *dst) +{ + int rounds = AES_KEY_ROUNDS (ks); + u8x16 r, *k = (u8x16 *) kd->encrypt_key; + + r = *(u8x16u *) iv; + + for (int i = 0; i < len; i += 16) + { + int j; + r = u8x16_xor3 (r, *(u8x16u *) (src + i), k[0]); + for (j = 1; j < rounds; j++) + r = aes_enc_round_x1 (r, k[j]); + r = aes_enc_last_round_x1 (r, k[rounds]); + *(u8x16u *) (dst + i) = r; + } +} + +static_always_inline void +clib_aes128_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext, + uword len, const u8 *iv, u8 *ciphertext) +{ + clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_128, ciphertext); +} + +static_always_inline void +clib_aes192_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext, + uword len, const u8 *iv, u8 *ciphertext) +{ + clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_192, ciphertext); +} + +static_always_inline void +clib_aes256_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext, + uword len, const u8 *iv, u8 *ciphertext) +{ + clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_256, ciphertext); +} + +static_always_inline void __clib_unused +aes_cbc_dec (const u8x16 *k, u8x16u *src, u8x16u *dst, u8x16u *iv, int count, + int rounds) +{ + u8x16 r[4], c[4], f; + + f = iv[0]; + while (count >= 64) + { + c[0] = r[0] = src[0]; + c[1] = r[1] = src[1]; + c[2] = r[2] = src[2]; + c[3] = r[3] = src[3]; + +#if __x86_64__ + r[0] ^= k[0]; + r[1] ^= k[0]; + r[2] ^= k[0]; + r[3] ^= k[0]; + + for (int i = 1; i < rounds; i++) + { + r[0] = aes_dec_round_x1 (r[0], k[i]); + r[1] = aes_dec_round_x1 (r[1], k[i]); + r[2] = aes_dec_round_x1 (r[2], k[i]); + r[3] = aes_dec_round_x1 (r[3], k[i]); + } + + r[0] = aes_dec_last_round_x1 (r[0], k[rounds]); + r[1] = aes_dec_last_round_x1 (r[1], k[rounds]); + r[2] = aes_dec_last_round_x1 (r[2], k[rounds]); + r[3] = aes_dec_last_round_x1 (r[3], k[rounds]); +#else + for (int i = 0; i < rounds - 1; i++) + { + r[0] = vaesimcq_u8 (vaesdq_u8 (r[0], k[i])); + r[1] = vaesimcq_u8 (vaesdq_u8 (r[1], k[i])); + r[2] = vaesimcq_u8 (vaesdq_u8 (r[2], k[i])); + r[3] = vaesimcq_u8 (vaesdq_u8 (r[3], k[i])); + } + r[0] = vaesdq_u8 (r[0], k[rounds - 1]) ^ k[rounds]; + r[1] = vaesdq_u8 (r[1], k[rounds - 1]) ^ k[rounds]; + r[2] = vaesdq_u8 (r[2], k[rounds - 1]) ^ k[rounds]; + r[3] = vaesdq_u8 (r[3], k[rounds - 1]) ^ k[rounds]; +#endif + dst[0] = r[0] ^ f; + dst[1] = r[1] ^ c[0]; + dst[2] = r[2] ^ c[1]; + dst[3] = r[3] ^ c[2]; + f = c[3]; + + count -= 64; + src += 4; + dst += 4; + } + + while (count > 0) + { + c[0] = r[0] = src[0]; +#if __x86_64__ + r[0] ^= k[0]; + for (int i = 1; i < rounds; i++) + r[0] = aes_dec_round_x1 (r[0], k[i]); + r[0] = aes_dec_last_round_x1 (r[0], k[rounds]); +#else + c[0] = r[0] = src[0]; + for (int i = 0; i < rounds - 1; i++) + r[0] = vaesimcq_u8 (vaesdq_u8 (r[0], k[i])); + r[0] = vaesdq_u8 (r[0], k[rounds - 1]) ^ 
k[rounds]; +#endif + dst[0] = r[0] ^ f; + f = c[0]; + + count -= 16; + src += 1; + dst += 1; + } +} + +#if __x86_64__ +#if defined(__VAES__) && defined(__AVX512F__) + +static_always_inline u8x64 +aes_block_load_x4 (u8 *src[], int i) +{ + u8x64 r = {}; + r = u8x64_insert_u8x16 (r, aes_block_load (src[0] + i), 0); + r = u8x64_insert_u8x16 (r, aes_block_load (src[1] + i), 1); + r = u8x64_insert_u8x16 (r, aes_block_load (src[2] + i), 2); + r = u8x64_insert_u8x16 (r, aes_block_load (src[3] + i), 3); + return r; +} + +static_always_inline void +aes_block_store_x4 (u8 *dst[], int i, u8x64 r) +{ + aes_block_store (dst[0] + i, u8x64_extract_u8x16 (r, 0)); + aes_block_store (dst[1] + i, u8x64_extract_u8x16 (r, 1)); + aes_block_store (dst[2] + i, u8x64_extract_u8x16 (r, 2)); + aes_block_store (dst[3] + i, u8x64_extract_u8x16 (r, 3)); +} + +static_always_inline u8x64 +aes4_cbc_dec_permute (u8x64 a, u8x64 b) +{ + return (u8x64) u64x8_shuffle2 (a, b, 6, 7, 8, 9, 10, 11, 12, 13); +} + +static_always_inline void +aes4_cbc_dec (const u8x16 *k, u8x64u *src, u8x64u *dst, u8x16u *iv, int count, + aes_key_size_t rounds) +{ + u8x64 f, k4, r[4], c[4] = {}; + __mmask8 m; + int i, n_blocks = count >> 4; + + f = u8x64_insert_u8x16 (u8x64_zero (), *iv, 3); + + while (n_blocks >= 16) + { + k4 = u8x64_splat_u8x16 (k[0]); + c[0] = src[0]; + c[1] = src[1]; + c[2] = src[2]; + c[3] = src[3]; + + r[0] = c[0] ^ k4; + r[1] = c[1] ^ k4; + r[2] = c[2] ^ k4; + r[3] = c[3] ^ k4; + + for (i = 1; i < rounds; i++) + { + k4 = u8x64_splat_u8x16 (k[i]); + r[0] = aes_dec_round_x4 (r[0], k4); + r[1] = aes_dec_round_x4 (r[1], k4); + r[2] = aes_dec_round_x4 (r[2], k4); + r[3] = aes_dec_round_x4 (r[3], k4); + } + + k4 = u8x64_splat_u8x16 (k[i]); + r[0] = aes_dec_last_round_x4 (r[0], k4); + r[1] = aes_dec_last_round_x4 (r[1], k4); + r[2] = aes_dec_last_round_x4 (r[2], k4); + r[3] = aes_dec_last_round_x4 (r[3], k4); + + dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]); + dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]); + dst[3] = r[3] ^= aes4_cbc_dec_permute (c[2], c[3]); + f = c[3]; + + n_blocks -= 16; + src += 4; + dst += 4; + } + + if (n_blocks >= 12) + { + k4 = u8x64_splat_u8x16 (k[0]); + c[0] = src[0]; + c[1] = src[1]; + c[2] = src[2]; + + r[0] = c[0] ^ k4; + r[1] = c[1] ^ k4; + r[2] = c[2] ^ k4; + + for (i = 1; i < rounds; i++) + { + k4 = u8x64_splat_u8x16 (k[i]); + r[0] = aes_dec_round_x4 (r[0], k4); + r[1] = aes_dec_round_x4 (r[1], k4); + r[2] = aes_dec_round_x4 (r[2], k4); + } + + k4 = u8x64_splat_u8x16 (k[i]); + r[0] = aes_dec_last_round_x4 (r[0], k4); + r[1] = aes_dec_last_round_x4 (r[1], k4); + r[2] = aes_dec_last_round_x4 (r[2], k4); + + dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]); + dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]); + f = c[2]; + + n_blocks -= 12; + src += 3; + dst += 3; + } + else if (n_blocks >= 8) + { + k4 = u8x64_splat_u8x16 (k[0]); + c[0] = src[0]; + c[1] = src[1]; + + r[0] = c[0] ^ k4; + r[1] = c[1] ^ k4; + + for (i = 1; i < rounds; i++) + { + k4 = u8x64_splat_u8x16 (k[i]); + r[0] = aes_dec_round_x4 (r[0], k4); + r[1] = aes_dec_round_x4 (r[1], k4); + } + + k4 = u8x64_splat_u8x16 (k[i]); + r[0] = aes_dec_last_round_x4 (r[0], k4); + r[1] = aes_dec_last_round_x4 (r[1], k4); + + dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]); + f = c[1]; + + n_blocks -= 8; + src += 2; + dst += 2; + } + else if (n_blocks >= 4) + { + c[0] = src[0]; + + r[0] = 
c[0] ^ u8x64_splat_u8x16 (k[0]); + + for (i = 1; i < rounds; i++) + r[0] = aes_dec_round_x4 (r[0], u8x64_splat_u8x16 (k[i])); + + r[0] = aes_dec_last_round_x4 (r[0], u8x64_splat_u8x16 (k[i])); + + dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]); + f = c[0]; + + n_blocks -= 4; + src += 1; + dst += 1; + } + + if (n_blocks > 0) + { + k4 = u8x64_splat_u8x16 (k[0]); + m = (1 << (n_blocks * 2)) - 1; + c[0] = + (u8x64) _mm512_mask_loadu_epi64 ((__m512i) c[0], m, (__m512i *) src); + f = aes4_cbc_dec_permute (f, c[0]); + r[0] = c[0] ^ k4; + for (i = 1; i < rounds; i++) + r[0] = aes_dec_round_x4 (r[0], u8x64_splat_u8x16 (k[i])); + r[0] = aes_dec_last_round_x4 (r[0], u8x64_splat_u8x16 (k[i])); + _mm512_mask_storeu_epi64 ((__m512i *) dst, m, (__m512i) (r[0] ^ f)); + } +} +#elif defined(__VAES__) + +static_always_inline u8x32 +aes_block_load_x2 (u8 *src[], int i) +{ + u8x32 r = {}; + r = u8x32_insert_lo (r, aes_block_load (src[0] + i)); + r = u8x32_insert_hi (r, aes_block_load (src[1] + i)); + return r; +} + +static_always_inline void +aes_block_store_x2 (u8 *dst[], int i, u8x32 r) +{ + aes_block_store (dst[0] + i, u8x32_extract_lo (r)); + aes_block_store (dst[1] + i, u8x32_extract_hi (r)); +} + +static_always_inline u8x32 +aes2_cbc_dec_permute (u8x32 a, u8x32 b) +{ + return (u8x32) u64x4_shuffle2 ((u64x4) a, (u64x4) b, 2, 3, 4, 5); +} + +static_always_inline void +aes2_cbc_dec (const u8x16 *k, u8x32u *src, u8x32u *dst, u8x16u *iv, int count, + aes_key_size_t rounds) +{ + u8x32 k2, f = {}, r[4], c[4] = {}; + int i, n_blocks = count >> 4; + + f = u8x32_insert_hi (f, *iv); + + while (n_blocks >= 8) + { + k2 = u8x32_splat_u8x16 (k[0]); + c[0] = src[0]; + c[1] = src[1]; + c[2] = src[2]; + c[3] = src[3]; + + r[0] = c[0] ^ k2; + r[1] = c[1] ^ k2; + r[2] = c[2] ^ k2; + r[3] = c[3] ^ k2; + + for (i = 1; i < rounds; i++) + { + k2 = u8x32_splat_u8x16 (k[i]); + r[0] = aes_dec_round_x2 (r[0], k2); + r[1] = aes_dec_round_x2 (r[1], k2); + r[2] = aes_dec_round_x2 (r[2], k2); + r[3] = aes_dec_round_x2 (r[3], k2); + } + + k2 = u8x32_splat_u8x16 (k[i]); + r[0] = aes_dec_last_round_x2 (r[0], k2); + r[1] = aes_dec_last_round_x2 (r[1], k2); + r[2] = aes_dec_last_round_x2 (r[2], k2); + r[3] = aes_dec_last_round_x2 (r[3], k2); + + dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]); + dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]); + dst[3] = r[3] ^= aes2_cbc_dec_permute (c[2], c[3]); + f = c[3]; + + n_blocks -= 8; + src += 4; + dst += 4; + } + + if (n_blocks >= 6) + { + k2 = u8x32_splat_u8x16 (k[0]); + c[0] = src[0]; + c[1] = src[1]; + c[2] = src[2]; + + r[0] = c[0] ^ k2; + r[1] = c[1] ^ k2; + r[2] = c[2] ^ k2; + + for (i = 1; i < rounds; i++) + { + k2 = u8x32_splat_u8x16 (k[i]); + r[0] = aes_dec_round_x2 (r[0], k2); + r[1] = aes_dec_round_x2 (r[1], k2); + r[2] = aes_dec_round_x2 (r[2], k2); + } + + k2 = u8x32_splat_u8x16 (k[i]); + r[0] = aes_dec_last_round_x2 (r[0], k2); + r[1] = aes_dec_last_round_x2 (r[1], k2); + r[2] = aes_dec_last_round_x2 (r[2], k2); + + dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]); + dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]); + f = c[2]; + + n_blocks -= 6; + src += 3; + dst += 3; + } + else if (n_blocks >= 4) + { + k2 = u8x32_splat_u8x16 (k[0]); + c[0] = src[0]; + c[1] = src[1]; + + r[0] = c[0] ^ k2; + r[1] = c[1] ^ k2; + + for (i = 1; i < rounds; i++) + { + k2 = u8x32_splat_u8x16 (k[i]); + r[0] = aes_dec_round_x2 (r[0], k2); + r[1] = aes_dec_round_x2 (r[1], k2); + } + + k2 = 
u8x32_splat_u8x16 (k[i]); + r[0] = aes_dec_last_round_x2 (r[0], k2); + r[1] = aes_dec_last_round_x2 (r[1], k2); + + dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); + dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]); + f = c[1]; + + n_blocks -= 4; + src += 2; + dst += 2; + } + else if (n_blocks >= 2) + { + k2 = u8x32_splat_u8x16 (k[0]); + c[0] = src[0]; + r[0] = c[0] ^ k2; + + for (i = 1; i < rounds; i++) + r[0] = aes_dec_round_x2 (r[0], u8x32_splat_u8x16 (k[i])); + + r[0] = aes_dec_last_round_x2 (r[0], u8x32_splat_u8x16 (k[i])); + dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]); + f = c[0]; + + n_blocks -= 2; + src += 1; + dst += 1; + } + + if (n_blocks > 0) + { + u8x16 rl = *(u8x16u *) src ^ k[0]; + for (i = 1; i < rounds; i++) + rl = aes_dec_round_x1 (rl, k[i]); + rl = aes_dec_last_round_x1 (rl, k[i]); + *(u8x16u *) dst = rl ^ u8x32_extract_hi (f); + } +} +#endif +#endif + +static_always_inline void +clib_aes_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key, + aes_key_size_t ks) +{ + u8x16 e[15], d[15]; + aes_key_expand (e, key, ks); + aes_key_enc_to_dec (e, d, ks); + for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++) + { + ((u8x16 *) kd->decrypt_key)[i] = d[i]; + ((u8x16 *) kd->encrypt_key)[i] = e[i]; + } +} + +static_always_inline void +clib_aes128_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key) +{ + clib_aes_cbc_key_expand (kd, key, AES_KEY_128); +} +static_always_inline void +clib_aes192_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key) +{ + clib_aes_cbc_key_expand (kd, key, AES_KEY_192); +} +static_always_inline void +clib_aes256_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key) +{ + clib_aes_cbc_key_expand (kd, key, AES_KEY_256); +} + +static_always_inline void +clib_aes_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext, + uword len, const u8 *iv, aes_key_size_t ks, + u8 *plaintext) +{ + int rounds = AES_KEY_ROUNDS (ks); +#if defined(__VAES__) && defined(__AVX512F__) + aes4_cbc_dec (kd->decrypt_key, (u8x64u *) ciphertext, (u8x64u *) plaintext, + (u8x16u *) iv, (int) len, rounds); +#elif defined(__VAES__) + aes2_cbc_dec (kd->decrypt_key, (u8x32u *) ciphertext, (u8x32u *) plaintext, + (u8x16u *) iv, (int) len, rounds); +#else + aes_cbc_dec (kd->decrypt_key, (u8x16u *) ciphertext, (u8x16u *) plaintext, + (u8x16u *) iv, (int) len, rounds); +#endif +} + +static_always_inline void +clib_aes128_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext, + uword len, const u8 *iv, u8 *plaintext) +{ + clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_128, plaintext); +} + +static_always_inline void +clib_aes192_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext, + uword len, const u8 *iv, u8 *plaintext) +{ + clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_192, plaintext); +} + +static_always_inline void +clib_aes256_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext, + uword len, const u8 *iv, u8 *plaintext) +{ + clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_256, plaintext); +} + +#endif /* __crypto_aes_cbc_h__ */ diff --git a/src/vppinfra/crypto/aes_ctr.h b/src/vppinfra/crypto/aes_ctr.h new file mode 100644 index 00000000000..74a9f96d90d --- /dev/null +++ b/src/vppinfra/crypto/aes_ctr.h @@ -0,0 +1,190 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2024 Cisco Systems, Inc. 
+ */ + +#ifndef __crypto_aes_ctr_h__ +#define __crypto_aes_ctr_h__ + +#include <vppinfra/clib.h> +#include <vppinfra/vector.h> +#include <vppinfra/cache.h> +#include <vppinfra/string.h> +#include <vppinfra/crypto/aes.h> + +typedef struct +{ + const aes_expaned_key_t exp_key[AES_KEY_ROUNDS (AES_KEY_256) + 1]; +} aes_ctr_key_data_t; + +typedef struct +{ + const aes_expaned_key_t exp_key[AES_KEY_ROUNDS (AES_KEY_256) + 1]; + aes_counter_t ctr; /* counter (reflected) */ + u8 keystream_bytes[N_AES_BYTES]; /* keystream leftovers */ + u32 n_keystream_bytes; /* number of keystream leftovers */ +} aes_ctr_ctx_t; + +static_always_inline aes_counter_t +aes_ctr_one_block (aes_ctr_ctx_t *ctx, aes_counter_t ctr, const u8 *src, + u8 *dst, u32 n_parallel, u32 n_bytes, int rounds, int last) +{ + u32 __clib_aligned (N_AES_BYTES) + inc[] = { N_AES_LANES, 0, 0, 0, N_AES_LANES, 0, 0, 0, + N_AES_LANES, 0, 0, 0, N_AES_LANES, 0, 0, 0 }; + const aes_expaned_key_t *k = ctx->exp_key; + const aes_mem_t *sv = (aes_mem_t *) src; + aes_mem_t *dv = (aes_mem_t *) dst; + aes_data_t d[4], t[4]; + u32 r; + + n_bytes -= (n_parallel - 1) * N_AES_BYTES; + + /* AES First Round */ + for (int i = 0; i < n_parallel; i++) + { +#if N_AES_LANES == 4 + t[i] = k[0].x4 ^ (u8x64) aes_reflect ((u8x64) ctr); +#elif N_AES_LANES == 2 + t[i] = k[0].x2 ^ (u8x32) aes_reflect ((u8x32) ctr); +#else + t[i] = k[0].x1 ^ (u8x16) aes_reflect ((u8x16) ctr); +#endif + ctr += *(aes_counter_t *) inc; + } + + /* Load Data */ + for (int i = 0; i < n_parallel - last; i++) + d[i] = sv[i]; + + if (last) + d[n_parallel - 1] = + aes_load_partial ((u8 *) (sv + n_parallel - 1), n_bytes); + + /* AES Intermediate Rounds */ + for (r = 1; r < rounds; r++) + aes_enc_round (t, k + r, n_parallel); + + /* AES Last Round */ + aes_enc_last_round (t, d, k + r, n_parallel); + + /* Store Data */ + for (int i = 0; i < n_parallel - last; i++) + dv[i] = d[i]; + + if (last) + { + aes_store_partial (d[n_parallel - 1], dv + n_parallel - 1, n_bytes); + *(aes_data_t *) ctx->keystream_bytes = t[n_parallel - 1]; + ctx->n_keystream_bytes = N_AES_BYTES - n_bytes; + } + + return ctr; +} + +static_always_inline void +clib_aes_ctr_init (aes_ctr_ctx_t *ctx, const aes_ctr_key_data_t *kd, + const u8 *iv, aes_key_size_t ks) +{ + u32x4 ctr = (u32x4) u8x16_reflect (*(u8x16u *) iv); +#if N_AES_LANES == 4 + ctx->ctr = (aes_counter_t) u32x16_splat_u32x4 (ctr) + + (u32x16){ 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 }; +#elif N_AES_LANES == 2 + ctx->ctr = (aes_counter_t) u32x8_splat_u32x4 (ctr) + + (u32x8){ 0, 0, 0, 0, 1, 0, 0, 0 }; +#else + ctx->ctr = ctr; +#endif + for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++) + ((aes_expaned_key_t *) ctx->exp_key)[i] = kd->exp_key[i]; + ctx->n_keystream_bytes = 0; +} + +static_always_inline void +clib_aes_ctr_transform (aes_ctr_ctx_t *ctx, const u8 *src, u8 *dst, + u32 n_bytes, aes_key_size_t ks) +{ + int r = AES_KEY_ROUNDS (ks); + aes_counter_t ctr = ctx->ctr; + + if (ctx->n_keystream_bytes) + { + u8 *ks = ctx->keystream_bytes + N_AES_BYTES - ctx->n_keystream_bytes; + + if (ctx->n_keystream_bytes >= n_bytes) + { + for (int i = 0; i < n_bytes; i++) + dst[i] = src[i] ^ ks[i]; + ctx->n_keystream_bytes -= n_bytes; + return; + } + + for (int i = 0; i < ctx->n_keystream_bytes; i++) + dst++[0] = src++[0] ^ ks[i]; + + n_bytes -= ctx->n_keystream_bytes; + ctx->n_keystream_bytes = 0; + } + + /* main loop */ + for (int n = 4 * N_AES_BYTES; n_bytes >= n; n_bytes -= n, dst += n, src += n) + ctr = aes_ctr_one_block (ctx, ctr, src, dst, 4, n, r, 0); + + if (n_bytes) + { + 
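      /* tail: fewer than four full vector blocks remain - run one last pass
	 with just enough parallel lanes to cover n_bytes; last=1 makes
	 aes_ctr_one_block() save the unused keystream bytes so a subsequent
	 clib_aes_ctr_transform() call can continue mid-block */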
if (n_bytes > 3 * N_AES_BYTES) + ctr = aes_ctr_one_block (ctx, ctr, src, dst, 4, n_bytes, r, 1); + else if (n_bytes > 2 * N_AES_BYTES) + ctr = aes_ctr_one_block (ctx, ctr, src, dst, 3, n_bytes, r, 1); + else if (n_bytes > N_AES_BYTES) + ctr = aes_ctr_one_block (ctx, ctr, src, dst, 2, n_bytes, r, 1); + else + ctr = aes_ctr_one_block (ctx, ctr, src, dst, 1, n_bytes, r, 1); + } + else + ctx->n_keystream_bytes = 0; + + ctx->ctr = ctr; +} + +static_always_inline void +clib_aes_ctr_key_expand (aes_ctr_key_data_t *kd, const u8 *key, + aes_key_size_t ks) +{ + u8x16 ek[AES_KEY_ROUNDS (AES_KEY_256) + 1]; + aes_expaned_key_t *k = (aes_expaned_key_t *) kd->exp_key; + + /* expand AES key */ + aes_key_expand (ek, key, ks); + for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++) + k[i].lanes[0] = k[i].lanes[1] = k[i].lanes[2] = k[i].lanes[3] = ek[i]; +} + +static_always_inline void +clib_aes128_ctr (const aes_ctr_key_data_t *kd, const u8 *src, u32 n_bytes, + const u8 *iv, u8 *dst) +{ + aes_ctr_ctx_t ctx; + clib_aes_ctr_init (&ctx, kd, iv, AES_KEY_128); + clib_aes_ctr_transform (&ctx, src, dst, n_bytes, AES_KEY_128); +} + +static_always_inline void +clib_aes192_ctr (const aes_ctr_key_data_t *kd, const u8 *src, u32 n_bytes, + const u8 *iv, u8 *dst) +{ + aes_ctr_ctx_t ctx; + clib_aes_ctr_init (&ctx, kd, iv, AES_KEY_192); + clib_aes_ctr_transform (&ctx, src, dst, n_bytes, AES_KEY_192); +} + +static_always_inline void +clib_aes256_ctr (const aes_ctr_key_data_t *kd, const u8 *src, u32 n_bytes, + const u8 *iv, u8 *dst) +{ + aes_ctr_ctx_t ctx; + clib_aes_ctr_init (&ctx, kd, iv, AES_KEY_256); + clib_aes_ctr_transform (&ctx, src, dst, n_bytes, AES_KEY_256); +} + +#endif /* __crypto_aes_ctr_h__ */ diff --git a/src/vppinfra/crypto/aes_gcm.h b/src/vppinfra/crypto/aes_gcm.h new file mode 100644 index 00000000000..5b628c87745 --- /dev/null +++ b/src/vppinfra/crypto/aes_gcm.h @@ -0,0 +1,944 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2023 Cisco Systems, Inc. 
+ */ + +#ifndef __crypto_aes_gcm_h__ +#define __crypto_aes_gcm_h__ + +#include <vppinfra/clib.h> +#include <vppinfra/vector.h> +#include <vppinfra/cache.h> +#include <vppinfra/string.h> +#include <vppinfra/crypto/aes.h> +#include <vppinfra/crypto/ghash.h> + +#define NUM_HI 36 +#if N_AES_LANES == 4 +typedef u8x64u aes_ghash_t; +#define aes_gcm_splat(v) u8x64_splat (v) +#define aes_gcm_ghash_reduce(c) ghash4_reduce (&(c)->gd) +#define aes_gcm_ghash_reduce2(c) ghash4_reduce2 (&(c)->gd) +#define aes_gcm_ghash_final(c) (c)->T = ghash4_final (&(c)->gd) +#elif N_AES_LANES == 2 +typedef u8x32u aes_ghash_t; +#define aes_gcm_splat(v) u8x32_splat (v) +#define aes_gcm_ghash_reduce(c) ghash2_reduce (&(c)->gd) +#define aes_gcm_ghash_reduce2(c) ghash2_reduce2 (&(c)->gd) +#define aes_gcm_ghash_final(c) (c)->T = ghash2_final (&(c)->gd) +#else +typedef u8x16 aes_ghash_t; +#define aes_gcm_splat(v) u8x16_splat (v) +#define aes_gcm_ghash_reduce(c) ghash_reduce (&(c)->gd) +#define aes_gcm_ghash_reduce2(c) ghash_reduce2 (&(c)->gd) +#define aes_gcm_ghash_final(c) (c)->T = ghash_final (&(c)->gd) +#endif + +typedef enum +{ + AES_GCM_OP_UNKNONW = 0, + AES_GCM_OP_ENCRYPT, + AES_GCM_OP_DECRYPT, + AES_GCM_OP_GMAC +} aes_gcm_op_t; + +typedef struct +{ + /* pre-calculated hash key values */ + const u8x16 Hi[NUM_HI]; + /* extracted AES key */ + const aes_expaned_key_t Ke[AES_KEY_ROUNDS (AES_KEY_256) + 1]; +} aes_gcm_key_data_t; + +typedef struct +{ + aes_gcm_op_t operation; + int last; + u8 rounds; + uword data_bytes; + uword aad_bytes; + + u8x16 T; + + /* hash */ + const u8x16 *Hi; + const aes_ghash_t *next_Hi; + + /* expaded keys */ + const aes_expaned_key_t *Ke; + + /* counter */ + u32 counter; + u8x16 EY0; + aes_counter_t Y; + + /* ghash */ + ghash_ctx_t gd; +} aes_gcm_ctx_t; + +static_always_inline u8x16 +aes_gcm_final_block (aes_gcm_ctx_t *ctx) +{ + return (u8x16) ((u64x2){ ctx->data_bytes, ctx->aad_bytes } << 3); +} + +static_always_inline void +aes_gcm_ghash_mul_first (aes_gcm_ctx_t *ctx, aes_data_t data, u32 n_lanes) +{ + uword hash_offset = NUM_HI - n_lanes; + ctx->next_Hi = (aes_ghash_t *) (ctx->Hi + hash_offset); +#if N_AES_LANES == 4 + u8x64 tag4 = {}; + tag4 = u8x64_insert_u8x16 (tag4, ctx->T, 0); + ghash4_mul_first (&ctx->gd, aes_reflect (data) ^ tag4, *ctx->next_Hi++); +#elif N_AES_LANES == 2 + u8x32 tag2 = {}; + tag2 = u8x32_insert_lo (tag2, ctx->T); + ghash2_mul_first (&ctx->gd, aes_reflect (data) ^ tag2, *ctx->next_Hi++); +#else + ghash_mul_first (&ctx->gd, aes_reflect (data) ^ ctx->T, *ctx->next_Hi++); +#endif +} + +static_always_inline void +aes_gcm_ghash_mul_next (aes_gcm_ctx_t *ctx, aes_data_t data) +{ +#if N_AES_LANES == 4 + ghash4_mul_next (&ctx->gd, aes_reflect (data), *ctx->next_Hi++); +#elif N_AES_LANES == 2 + ghash2_mul_next (&ctx->gd, aes_reflect (data), *ctx->next_Hi++); +#else + ghash_mul_next (&ctx->gd, aes_reflect (data), *ctx->next_Hi++); +#endif +} + +static_always_inline void +aes_gcm_ghash_mul_final_block (aes_gcm_ctx_t *ctx) +{ +#if N_AES_LANES == 4 + u8x64 h = u8x64_insert_u8x16 (u8x64_zero (), ctx->Hi[NUM_HI - 1], 0); + u8x64 r4 = u8x64_insert_u8x16 (u8x64_zero (), aes_gcm_final_block (ctx), 0); + ghash4_mul_next (&ctx->gd, r4, h); +#elif N_AES_LANES == 2 + u8x32 h = u8x32_insert_lo (u8x32_zero (), ctx->Hi[NUM_HI - 1]); + u8x32 r2 = u8x32_insert_lo (u8x32_zero (), aes_gcm_final_block (ctx)); + ghash2_mul_next (&ctx->gd, r2, h); +#else + ghash_mul_next (&ctx->gd, aes_gcm_final_block (ctx), ctx->Hi[NUM_HI - 1]); +#endif +} + +static_always_inline void +aes_gcm_enc_ctr0_round 
(aes_gcm_ctx_t *ctx, int aes_round) +{ + if (aes_round == 0) + ctx->EY0 ^= ctx->Ke[0].x1; + else if (aes_round == ctx->rounds) + ctx->EY0 = aes_enc_last_round_x1 (ctx->EY0, ctx->Ke[aes_round].x1); + else + ctx->EY0 = aes_enc_round_x1 (ctx->EY0, ctx->Ke[aes_round].x1); +} + +static_always_inline void +aes_gcm_ghash (aes_gcm_ctx_t *ctx, u8 *data, u32 n_left) +{ + uword i; + aes_data_t r = {}; + const aes_mem_t *d = (aes_mem_t *) data; + + for (int n = 8 * N_AES_BYTES; n_left >= n; n_left -= n, d += 8) + { + if (ctx->operation == AES_GCM_OP_GMAC && n_left == n) + { + aes_gcm_ghash_mul_first (ctx, d[0], 8 * N_AES_LANES + 1); + for (i = 1; i < 8; i++) + aes_gcm_ghash_mul_next (ctx, d[i]); + aes_gcm_ghash_mul_final_block (ctx); + aes_gcm_ghash_reduce (ctx); + aes_gcm_ghash_reduce2 (ctx); + aes_gcm_ghash_final (ctx); + goto done; + } + + aes_gcm_ghash_mul_first (ctx, d[0], 8 * N_AES_LANES); + for (i = 1; i < 8; i++) + aes_gcm_ghash_mul_next (ctx, d[i]); + aes_gcm_ghash_reduce (ctx); + aes_gcm_ghash_reduce2 (ctx); + aes_gcm_ghash_final (ctx); + } + + if (n_left > 0) + { + int n_lanes = (n_left + 15) / 16; + + if (ctx->operation == AES_GCM_OP_GMAC) + n_lanes++; + + if (n_left < N_AES_BYTES) + { + clib_memcpy_fast (&r, d, n_left); + aes_gcm_ghash_mul_first (ctx, r, n_lanes); + } + else + { + aes_gcm_ghash_mul_first (ctx, d[0], n_lanes); + n_left -= N_AES_BYTES; + i = 1; + + if (n_left >= 4 * N_AES_BYTES) + { + aes_gcm_ghash_mul_next (ctx, d[i]); + aes_gcm_ghash_mul_next (ctx, d[i + 1]); + aes_gcm_ghash_mul_next (ctx, d[i + 2]); + aes_gcm_ghash_mul_next (ctx, d[i + 3]); + n_left -= 4 * N_AES_BYTES; + i += 4; + } + if (n_left >= 2 * N_AES_BYTES) + { + aes_gcm_ghash_mul_next (ctx, d[i]); + aes_gcm_ghash_mul_next (ctx, d[i + 1]); + n_left -= 2 * N_AES_BYTES; + i += 2; + } + + if (n_left >= N_AES_BYTES) + { + aes_gcm_ghash_mul_next (ctx, d[i]); + n_left -= N_AES_BYTES; + i += 1; + } + + if (n_left) + { + clib_memcpy_fast (&r, d + i, n_left); + aes_gcm_ghash_mul_next (ctx, r); + } + } + + if (ctx->operation == AES_GCM_OP_GMAC) + aes_gcm_ghash_mul_final_block (ctx); + aes_gcm_ghash_reduce (ctx); + aes_gcm_ghash_reduce2 (ctx); + aes_gcm_ghash_final (ctx); + } + else if (ctx->operation == AES_GCM_OP_GMAC) + ctx->T = + ghash_mul (aes_gcm_final_block (ctx) ^ ctx->T, ctx->Hi[NUM_HI - 1]); + +done: + /* encrypt counter 0 E(Y0, k) */ + if (ctx->operation == AES_GCM_OP_GMAC) + for (int i = 0; i < ctx->rounds + 1; i += 1) + aes_gcm_enc_ctr0_round (ctx, i); +} + +static_always_inline void +aes_gcm_enc_first_round (aes_gcm_ctx_t *ctx, aes_data_t *r, uword n_blocks) +{ + const aes_expaned_key_t Ke0 = ctx->Ke[0]; + uword i = 0; + + /* As counter is stored in network byte order for performance reasons we + are incrementing least significant byte only except in case where we + overlow. 
As we are processing four 128, 256 or 512-blocks in parallel + except the last round, overflow can happen only when n_blocks == 4 */ + +#if N_AES_LANES == 4 + const u32x16 ctr_inv_4444 = { 0, 0, 0, 4 << 24, 0, 0, 0, 4 << 24, + 0, 0, 0, 4 << 24, 0, 0, 0, 4 << 24 }; + + const u32x16 ctr_4444 = { + 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, + }; + + if (n_blocks == 4) + for (; i < 2; i++) + { + r[i] = Ke0.x4 ^ (u8x64) ctx->Y; /* Initial AES round */ + ctx->Y += ctr_inv_4444; + } + + if (n_blocks == 4 && PREDICT_FALSE ((u8) ctx->counter == 242)) + { + u32x16 Yr = (u32x16) aes_reflect ((u8x64) ctx->Y); + + for (; i < n_blocks; i++) + { + r[i] = Ke0.x4 ^ (u8x64) ctx->Y; /* Initial AES round */ + Yr += ctr_4444; + ctx->Y = (u32x16) aes_reflect ((u8x64) Yr); + } + } + else + { + for (; i < n_blocks; i++) + { + r[i] = Ke0.x4 ^ (u8x64) ctx->Y; /* Initial AES round */ + ctx->Y += ctr_inv_4444; + } + } + ctx->counter += n_blocks * 4; +#elif N_AES_LANES == 2 + const u32x8 ctr_inv_22 = { 0, 0, 0, 2 << 24, 0, 0, 0, 2 << 24 }; + const u32x8 ctr_22 = { 2, 0, 0, 0, 2, 0, 0, 0 }; + + if (n_blocks == 4) + for (; i < 2; i++) + { + r[i] = Ke0.x2 ^ (u8x32) ctx->Y; /* Initial AES round */ + ctx->Y += ctr_inv_22; + } + + if (n_blocks == 4 && PREDICT_FALSE ((u8) ctx->counter == 250)) + { + u32x8 Yr = (u32x8) aes_reflect ((u8x32) ctx->Y); + + for (; i < n_blocks; i++) + { + r[i] = Ke0.x2 ^ (u8x32) ctx->Y; /* Initial AES round */ + Yr += ctr_22; + ctx->Y = (u32x8) aes_reflect ((u8x32) Yr); + } + } + else + { + for (; i < n_blocks; i++) + { + r[i] = Ke0.x2 ^ (u8x32) ctx->Y; /* Initial AES round */ + ctx->Y += ctr_inv_22; + } + } + ctx->counter += n_blocks * 2; +#else + const u32x4 ctr_inv_1 = { 0, 0, 0, 1 << 24 }; + + if (PREDICT_TRUE ((u8) ctx->counter < 0xfe) || n_blocks < 3) + { + for (; i < n_blocks; i++) + { + r[i] = Ke0.x1 ^ (u8x16) ctx->Y; /* Initial AES round */ + ctx->Y += ctr_inv_1; + } + ctx->counter += n_blocks; + } + else + { + r[i++] = Ke0.x1 ^ (u8x16) ctx->Y; /* Initial AES round */ + ctx->Y += ctr_inv_1; + ctx->counter += 1; + + for (; i < n_blocks; i++) + { + r[i] = Ke0.x1 ^ (u8x16) ctx->Y; /* Initial AES round */ + ctx->counter++; + ctx->Y[3] = clib_host_to_net_u32 (ctx->counter); + } + } +#endif +} + +static_always_inline void +aes_gcm_enc_last_round (aes_gcm_ctx_t *ctx, aes_data_t *r, aes_data_t *d, + const aes_expaned_key_t *Ke, uword n_blocks) +{ + /* additional ronuds for AES-192 and AES-256 */ + for (int i = 10; i < ctx->rounds; i++) + aes_enc_round (r, Ke + i, n_blocks); + + aes_enc_last_round (r, d, Ke + ctx->rounds, n_blocks); +} + +static_always_inline void +aes_gcm_calc (aes_gcm_ctx_t *ctx, aes_data_t *d, const u8 *src, u8 *dst, u32 n, + u32 n_bytes, int with_ghash) +{ + const aes_expaned_key_t *k = ctx->Ke; + const aes_mem_t *sv = (aes_mem_t *) src; + aes_mem_t *dv = (aes_mem_t *) dst; + uword ghash_blocks, gc = 1; + aes_data_t r[4]; + u32 i, n_lanes; + + if (ctx->operation == AES_GCM_OP_ENCRYPT) + { + ghash_blocks = 4; + n_lanes = N_AES_LANES * 4; + } + else + { + ghash_blocks = n; + n_lanes = n * N_AES_LANES; +#if N_AES_LANES != 1 + if (ctx->last) + n_lanes = (n_bytes + 15) / 16; +#endif + } + + n_bytes -= (n - 1) * N_AES_BYTES; + + /* AES rounds 0 and 1 */ + aes_gcm_enc_first_round (ctx, r, n); + aes_enc_round (r, k + 1, n); + + /* load data - decrypt round */ + if (ctx->operation == AES_GCM_OP_DECRYPT) + { + for (i = 0; i < n - ctx->last; i++) + d[i] = sv[i]; + + if (ctx->last) + d[n - 1] = aes_load_partial ((u8 *) (sv + n - 1), n_bytes); + } + + /* GHASH multiply block 0 */ + if 
(with_ghash) + aes_gcm_ghash_mul_first (ctx, d[0], n_lanes); + + /* AES rounds 2 and 3 */ + aes_enc_round (r, k + 2, n); + aes_enc_round (r, k + 3, n); + + /* GHASH multiply block 1 */ + if (with_ghash && gc++ < ghash_blocks) + aes_gcm_ghash_mul_next (ctx, (d[1])); + + /* AES rounds 4 and 5 */ + aes_enc_round (r, k + 4, n); + aes_enc_round (r, k + 5, n); + + /* GHASH multiply block 2 */ + if (with_ghash && gc++ < ghash_blocks) + aes_gcm_ghash_mul_next (ctx, (d[2])); + + /* AES rounds 6 and 7 */ + aes_enc_round (r, k + 6, n); + aes_enc_round (r, k + 7, n); + + /* GHASH multiply block 3 */ + if (with_ghash && gc++ < ghash_blocks) + aes_gcm_ghash_mul_next (ctx, (d[3])); + + /* load 4 blocks of data - decrypt round */ + if (ctx->operation == AES_GCM_OP_ENCRYPT) + { + for (i = 0; i < n - ctx->last; i++) + d[i] = sv[i]; + + if (ctx->last) + d[n - 1] = aes_load_partial (sv + n - 1, n_bytes); + } + + /* AES rounds 8 and 9 */ + aes_enc_round (r, k + 8, n); + aes_enc_round (r, k + 9, n); + + /* AES last round(s) */ + aes_gcm_enc_last_round (ctx, r, d, k, n); + + /* store data */ + for (i = 0; i < n - ctx->last; i++) + dv[i] = d[i]; + + if (ctx->last) + aes_store_partial (d[n - 1], dv + n - 1, n_bytes); + + /* GHASH reduce 1st step */ + aes_gcm_ghash_reduce (ctx); + + /* GHASH reduce 2nd step */ + if (with_ghash) + aes_gcm_ghash_reduce2 (ctx); + + /* GHASH final step */ + if (with_ghash) + aes_gcm_ghash_final (ctx); +} + +static_always_inline void +aes_gcm_calc_double (aes_gcm_ctx_t *ctx, aes_data_t *d, const u8 *src, u8 *dst) +{ + const aes_expaned_key_t *k = ctx->Ke; + const aes_mem_t *sv = (aes_mem_t *) src; + aes_mem_t *dv = (aes_mem_t *) dst; + aes_data_t r[4]; + + /* AES rounds 0 and 1 */ + aes_gcm_enc_first_round (ctx, r, 4); + aes_enc_round (r, k + 1, 4); + + /* load 4 blocks of data - decrypt round */ + if (ctx->operation == AES_GCM_OP_DECRYPT) + for (int i = 0; i < 4; i++) + d[i] = sv[i]; + + /* GHASH multiply block 0 */ + aes_gcm_ghash_mul_first (ctx, d[0], N_AES_LANES * 8); + + /* AES rounds 2 and 3 */ + aes_enc_round (r, k + 2, 4); + aes_enc_round (r, k + 3, 4); + + /* GHASH multiply block 1 */ + aes_gcm_ghash_mul_next (ctx, (d[1])); + + /* AES rounds 4 and 5 */ + aes_enc_round (r, k + 4, 4); + aes_enc_round (r, k + 5, 4); + + /* GHASH multiply block 2 */ + aes_gcm_ghash_mul_next (ctx, (d[2])); + + /* AES rounds 6 and 7 */ + aes_enc_round (r, k + 6, 4); + aes_enc_round (r, k + 7, 4); + + /* GHASH multiply block 3 */ + aes_gcm_ghash_mul_next (ctx, (d[3])); + + /* AES rounds 8 and 9 */ + aes_enc_round (r, k + 8, 4); + aes_enc_round (r, k + 9, 4); + + /* load 4 blocks of data - encrypt round */ + if (ctx->operation == AES_GCM_OP_ENCRYPT) + for (int i = 0; i < 4; i++) + d[i] = sv[i]; + + /* AES last round(s) */ + aes_gcm_enc_last_round (ctx, r, d, k, 4); + + /* store 4 blocks of data */ + for (int i = 0; i < 4; i++) + dv[i] = d[i]; + + /* load next 4 blocks of data data - decrypt round */ + if (ctx->operation == AES_GCM_OP_DECRYPT) + for (int i = 0; i < 4; i++) + d[i] = sv[i + 4]; + + /* GHASH multiply block 4 */ + aes_gcm_ghash_mul_next (ctx, (d[0])); + + /* AES rounds 0 and 1 */ + aes_gcm_enc_first_round (ctx, r, 4); + aes_enc_round (r, k + 1, 4); + + /* GHASH multiply block 5 */ + aes_gcm_ghash_mul_next (ctx, (d[1])); + + /* AES rounds 2 and 3 */ + aes_enc_round (r, k + 2, 4); + aes_enc_round (r, k + 3, 4); + + /* GHASH multiply block 6 */ + aes_gcm_ghash_mul_next (ctx, (d[2])); + + /* AES rounds 4 and 5 */ + aes_enc_round (r, k + 4, 4); + aes_enc_round (r, k + 5, 4); + + /* GHASH 
multiply block 7 */ + aes_gcm_ghash_mul_next (ctx, (d[3])); + + /* AES rounds 6 and 7 */ + aes_enc_round (r, k + 6, 4); + aes_enc_round (r, k + 7, 4); + + /* GHASH reduce 1st step */ + aes_gcm_ghash_reduce (ctx); + + /* AES rounds 8 and 9 */ + aes_enc_round (r, k + 8, 4); + aes_enc_round (r, k + 9, 4); + + /* GHASH reduce 2nd step */ + aes_gcm_ghash_reduce2 (ctx); + + /* load 4 blocks of data - encrypt round */ + if (ctx->operation == AES_GCM_OP_ENCRYPT) + for (int i = 0; i < 4; i++) + d[i] = sv[i + 4]; + + /* AES last round(s) */ + aes_gcm_enc_last_round (ctx, r, d, k, 4); + + /* store data */ + for (int i = 0; i < 4; i++) + dv[i + 4] = d[i]; + + /* GHASH final step */ + aes_gcm_ghash_final (ctx); +} + +static_always_inline void +aes_gcm_mask_bytes (aes_data_t *d, uword n_bytes) +{ + const union + { + u8 b[64]; + aes_data_t r; + } scale = { + .b = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 }, + }; + + d[0] &= (aes_gcm_splat (n_bytes) > scale.r); +} + +static_always_inline void +aes_gcm_calc_last (aes_gcm_ctx_t *ctx, aes_data_t *d, int n_blocks, + u32 n_bytes) +{ + int n_lanes = (N_AES_LANES == 1 ? n_blocks : (n_bytes + 15) / 16) + 1; + n_bytes -= (n_blocks - 1) * N_AES_BYTES; + int i; + + aes_gcm_enc_ctr0_round (ctx, 0); + aes_gcm_enc_ctr0_round (ctx, 1); + + if (n_bytes != N_AES_BYTES) + aes_gcm_mask_bytes (d + n_blocks - 1, n_bytes); + + aes_gcm_ghash_mul_first (ctx, d[0], n_lanes); + + aes_gcm_enc_ctr0_round (ctx, 2); + aes_gcm_enc_ctr0_round (ctx, 3); + + if (n_blocks > 1) + aes_gcm_ghash_mul_next (ctx, d[1]); + + aes_gcm_enc_ctr0_round (ctx, 4); + aes_gcm_enc_ctr0_round (ctx, 5); + + if (n_blocks > 2) + aes_gcm_ghash_mul_next (ctx, d[2]); + + aes_gcm_enc_ctr0_round (ctx, 6); + aes_gcm_enc_ctr0_round (ctx, 7); + + if (n_blocks > 3) + aes_gcm_ghash_mul_next (ctx, d[3]); + + aes_gcm_enc_ctr0_round (ctx, 8); + aes_gcm_enc_ctr0_round (ctx, 9); + + aes_gcm_ghash_mul_final_block (ctx); + aes_gcm_ghash_reduce (ctx); + + for (i = 10; i < ctx->rounds; i++) + aes_gcm_enc_ctr0_round (ctx, i); + + aes_gcm_ghash_reduce2 (ctx); + + aes_gcm_ghash_final (ctx); + + aes_gcm_enc_ctr0_round (ctx, i); +} + +static_always_inline void +aes_gcm_enc (aes_gcm_ctx_t *ctx, const u8 *src, u8 *dst, u32 n_left) +{ + aes_data_t d[4]; + + if (PREDICT_FALSE (n_left == 0)) + { + int i; + for (i = 0; i < ctx->rounds + 1; i++) + aes_gcm_enc_ctr0_round (ctx, i); + return; + } + + if (n_left < 4 * N_AES_BYTES) + { + ctx->last = 1; + if (n_left > 3 * N_AES_BYTES) + { + aes_gcm_calc (ctx, d, src, dst, 4, n_left, /* with_ghash */ 0); + aes_gcm_calc_last (ctx, d, 4, n_left); + } + else if (n_left > 2 * N_AES_BYTES) + { + aes_gcm_calc (ctx, d, src, dst, 3, n_left, /* with_ghash */ 0); + aes_gcm_calc_last (ctx, d, 3, n_left); + } + else if (n_left > N_AES_BYTES) + { + aes_gcm_calc (ctx, d, src, dst, 2, n_left, /* with_ghash */ 0); + aes_gcm_calc_last (ctx, d, 2, n_left); + } + else + { + aes_gcm_calc (ctx, d, src, dst, 1, n_left, /* with_ghash */ 0); + aes_gcm_calc_last (ctx, d, 1, n_left); + } + return; + } + + aes_gcm_calc (ctx, d, src, dst, 4, 4 * N_AES_BYTES, /* with_ghash */ 0); + + /* next */ + n_left -= 4 * N_AES_BYTES; + dst += 4 * N_AES_BYTES; + src += 4 * N_AES_BYTES; + + for (int n = 8 * N_AES_BYTES; n_left >= n; n_left -= n, src += n, dst += n) + aes_gcm_calc_double (ctx, d, src, dst); + + if (n_left >= 4 * 
N_AES_BYTES) + { + aes_gcm_calc (ctx, d, src, dst, 4, 4 * N_AES_BYTES, /* with_ghash */ 1); + + /* next */ + n_left -= 4 * N_AES_BYTES; + dst += 4 * N_AES_BYTES; + src += 4 * N_AES_BYTES; + } + + if (n_left == 0) + { + aes_gcm_calc_last (ctx, d, 4, 4 * N_AES_BYTES); + return; + } + + ctx->last = 1; + + if (n_left > 3 * N_AES_BYTES) + { + aes_gcm_calc (ctx, d, src, dst, 4, n_left, /* with_ghash */ 1); + aes_gcm_calc_last (ctx, d, 4, n_left); + } + else if (n_left > 2 * N_AES_BYTES) + { + aes_gcm_calc (ctx, d, src, dst, 3, n_left, /* with_ghash */ 1); + aes_gcm_calc_last (ctx, d, 3, n_left); + } + else if (n_left > N_AES_BYTES) + { + aes_gcm_calc (ctx, d, src, dst, 2, n_left, /* with_ghash */ 1); + aes_gcm_calc_last (ctx, d, 2, n_left); + } + else + { + aes_gcm_calc (ctx, d, src, dst, 1, n_left, /* with_ghash */ 1); + aes_gcm_calc_last (ctx, d, 1, n_left); + } +} + +static_always_inline void +aes_gcm_dec (aes_gcm_ctx_t *ctx, const u8 *src, u8 *dst, uword n_left) +{ + aes_data_t d[4] = {}; + ghash_ctx_t gd; + + /* main encryption loop */ + for (int n = 8 * N_AES_BYTES; n_left >= n; n_left -= n, dst += n, src += n) + aes_gcm_calc_double (ctx, d, src, dst); + + if (n_left >= 4 * N_AES_BYTES) + { + aes_gcm_calc (ctx, d, src, dst, 4, 4 * N_AES_BYTES, /* with_ghash */ 1); + + /* next */ + n_left -= 4 * N_AES_BYTES; + dst += N_AES_BYTES * 4; + src += N_AES_BYTES * 4; + } + + if (n_left) + { + ctx->last = 1; + + if (n_left > 3 * N_AES_BYTES) + aes_gcm_calc (ctx, d, src, dst, 4, n_left, /* with_ghash */ 1); + else if (n_left > 2 * N_AES_BYTES) + aes_gcm_calc (ctx, d, src, dst, 3, n_left, /* with_ghash */ 1); + else if (n_left > N_AES_BYTES) + aes_gcm_calc (ctx, d, src, dst, 2, n_left, /* with_ghash */ 1); + else + aes_gcm_calc (ctx, d, src, dst, 1, n_left, /* with_ghash */ 1); + } + + /* interleaved counter 0 encryption E(Y0, k) and ghash of final GCM + * (bit length) block */ + + aes_gcm_enc_ctr0_round (ctx, 0); + aes_gcm_enc_ctr0_round (ctx, 1); + + ghash_mul_first (&gd, aes_gcm_final_block (ctx) ^ ctx->T, + ctx->Hi[NUM_HI - 1]); + + aes_gcm_enc_ctr0_round (ctx, 2); + aes_gcm_enc_ctr0_round (ctx, 3); + + ghash_reduce (&gd); + + aes_gcm_enc_ctr0_round (ctx, 4); + aes_gcm_enc_ctr0_round (ctx, 5); + + ghash_reduce2 (&gd); + + aes_gcm_enc_ctr0_round (ctx, 6); + aes_gcm_enc_ctr0_round (ctx, 7); + + ctx->T = ghash_final (&gd); + + aes_gcm_enc_ctr0_round (ctx, 8); + aes_gcm_enc_ctr0_round (ctx, 9); + + for (int i = 10; i < ctx->rounds + 1; i += 1) + aes_gcm_enc_ctr0_round (ctx, i); +} + +static_always_inline int +aes_gcm (const u8 *src, u8 *dst, const u8 *aad, u8 *ivp, u8 *tag, + u32 data_bytes, u32 aad_bytes, u8 tag_len, + const aes_gcm_key_data_t *kd, int aes_rounds, aes_gcm_op_t op) +{ + u8 *addt = (u8 *) aad; + u32x4 Y0; + + aes_gcm_ctx_t _ctx = { .counter = 2, + .rounds = aes_rounds, + .operation = op, + .data_bytes = data_bytes, + .aad_bytes = aad_bytes, + .Ke = kd->Ke, + .Hi = kd->Hi }, + *ctx = &_ctx; + + /* initalize counter */ + Y0 = (u32x4) (u64x2){ *(u64u *) ivp, 0 }; + Y0[2] = *(u32u *) (ivp + 8); + Y0[3] = 1 << 24; + ctx->EY0 = (u8x16) Y0; + +#if N_AES_LANES == 4 + ctx->Y = u32x16_splat_u32x4 (Y0) + (u32x16){ + 0, 0, 0, 1 << 24, 0, 0, 0, 2 << 24, 0, 0, 0, 3 << 24, 0, 0, 0, 4 << 24, + }; +#elif N_AES_LANES == 2 + ctx->Y = + u32x8_splat_u32x4 (Y0) + (u32x8){ 0, 0, 0, 1 << 24, 0, 0, 0, 2 << 24 }; +#else + ctx->Y = Y0 + (u32x4){ 0, 0, 0, 1 << 24 }; +#endif + + /* calculate ghash for AAD */ + aes_gcm_ghash (ctx, addt, aad_bytes); + + /* ghash and encrypt/edcrypt */ + if (op == AES_GCM_OP_ENCRYPT) 
+ aes_gcm_enc (ctx, src, dst, data_bytes); + else if (op == AES_GCM_OP_DECRYPT) + aes_gcm_dec (ctx, src, dst, data_bytes); + + /* final tag is */ + ctx->T = u8x16_reflect (ctx->T) ^ ctx->EY0; + + /* tag_len 16 -> 0 */ + tag_len &= 0xf; + + if (op == AES_GCM_OP_ENCRYPT || op == AES_GCM_OP_GMAC) + { + /* store tag */ + if (tag_len) + u8x16_store_partial (ctx->T, tag, tag_len); + else + ((u8x16u *) tag)[0] = ctx->T; + } + else + { + /* check tag */ + if (tag_len) + { + u16 mask = pow2_mask (tag_len); + u8x16 expected = u8x16_load_partial (tag, tag_len); + if ((u8x16_msb_mask (expected == ctx->T) & mask) == mask) + return 1; + } + else + { + if (u8x16_is_equal (ctx->T, *(u8x16u *) tag)) + return 1; + } + } + return 0; +} + +static_always_inline void +clib_aes_gcm_key_expand (aes_gcm_key_data_t *kd, const u8 *key, + aes_key_size_t ks) +{ + u8x16 H; + u8x16 ek[AES_KEY_ROUNDS (AES_KEY_256) + 1]; + aes_expaned_key_t *Ke = (aes_expaned_key_t *) kd->Ke; + + /* expand AES key */ + aes_key_expand (ek, key, ks); + for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++) + Ke[i].lanes[0] = Ke[i].lanes[1] = Ke[i].lanes[2] = Ke[i].lanes[3] = ek[i]; + + /* pre-calculate H */ + H = aes_encrypt_block (u8x16_zero (), ek, ks); + H = u8x16_reflect (H); + ghash_precompute (H, (u8x16 *) kd->Hi, ARRAY_LEN (kd->Hi)); +} + +static_always_inline void +clib_aes128_gcm_enc (const aes_gcm_key_data_t *kd, const u8 *plaintext, + u32 data_bytes, const u8 *aad, u32 aad_bytes, + const u8 *iv, u32 tag_bytes, u8 *cyphertext, u8 *tag) +{ + aes_gcm (plaintext, cyphertext, aad, (u8 *) iv, tag, data_bytes, aad_bytes, + tag_bytes, kd, AES_KEY_ROUNDS (AES_KEY_128), AES_GCM_OP_ENCRYPT); +} + +static_always_inline void +clib_aes256_gcm_enc (const aes_gcm_key_data_t *kd, const u8 *plaintext, + u32 data_bytes, const u8 *aad, u32 aad_bytes, + const u8 *iv, u32 tag_bytes, u8 *cyphertext, u8 *tag) +{ + aes_gcm (plaintext, cyphertext, aad, (u8 *) iv, tag, data_bytes, aad_bytes, + tag_bytes, kd, AES_KEY_ROUNDS (AES_KEY_256), AES_GCM_OP_ENCRYPT); +} + +static_always_inline int +clib_aes128_gcm_dec (const aes_gcm_key_data_t *kd, const u8 *cyphertext, + u32 data_bytes, const u8 *aad, u32 aad_bytes, + const u8 *iv, const u8 *tag, u32 tag_bytes, u8 *plaintext) +{ + return aes_gcm (cyphertext, plaintext, aad, (u8 *) iv, (u8 *) tag, + data_bytes, aad_bytes, tag_bytes, kd, + AES_KEY_ROUNDS (AES_KEY_128), AES_GCM_OP_DECRYPT); +} + +static_always_inline int +clib_aes256_gcm_dec (const aes_gcm_key_data_t *kd, const u8 *cyphertext, + u32 data_bytes, const u8 *aad, u32 aad_bytes, + const u8 *iv, const u8 *tag, u32 tag_bytes, u8 *plaintext) +{ + return aes_gcm (cyphertext, plaintext, aad, (u8 *) iv, (u8 *) tag, + data_bytes, aad_bytes, tag_bytes, kd, + AES_KEY_ROUNDS (AES_KEY_256), AES_GCM_OP_DECRYPT); +} + +static_always_inline void +clib_aes128_gmac (const aes_gcm_key_data_t *kd, const u8 *data, u32 data_bytes, + const u8 *iv, u32 tag_bytes, u8 *tag) +{ + aes_gcm (0, 0, data, (u8 *) iv, tag, 0, data_bytes, tag_bytes, kd, + AES_KEY_ROUNDS (AES_KEY_128), AES_GCM_OP_GMAC); +} + +static_always_inline void +clib_aes256_gmac (const aes_gcm_key_data_t *kd, const u8 *data, u32 data_bytes, + const u8 *iv, u32 tag_bytes, u8 *tag) +{ + aes_gcm (0, 0, data, (u8 *) iv, tag, 0, data_bytes, tag_bytes, kd, + AES_KEY_ROUNDS (AES_KEY_256), AES_GCM_OP_GMAC); +} + +#endif /* __crypto_aes_gcm_h__ */ diff --git a/src/vppinfra/crypto/ghash.h b/src/vppinfra/crypto/ghash.h new file mode 100644 index 00000000000..66e3f6a673a --- /dev/null +++ b/src/vppinfra/crypto/ghash.h @@ -0,0 
+1,515 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2019 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +/* + *------------------------------------------------------------------ + * Copyright(c) 2018, Intel Corporation All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES * LOSS OF USE, + * DATA, OR PROFITS * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *------------------------------------------------------------------ + */ + +/* + * Based on work by: Shay Gueron, Michael E. Kounavis, Erdinc Ozturk, + * Vinodh Gopal, James Guilford, Tomasz Kantecki + * + * References: + * [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on + * Intel Architecture Processors. August, 2010 + * [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on + * Intel Architecture Processors. October, 2012. + * [3] intel-ipsec-mb library, https://github.com/01org/intel-ipsec-mb.git + * + * Definitions: + * GF Galois Extension Field GF(2^128) - finite field where elements are + * represented as polynomials with coefficients in GF(2) with the + * highest degree of 127. Polynomials are represented as 128-bit binary + * numbers where each bit represents one coefficient. + * e.g. polynomial x^5 + x^3 + x + 1 is represented in binary 101011. 
+ * H hash key (128 bit) + * POLY irreducible polynomial x^128 + x^7 + x^2 + x + 1 + * RPOLY irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1 + * + addition in GF, which equals the XOR operation + * * multiplication in GF + * + * GF multiplication consists of 2 steps: + * - carry-less multiplication of two 128-bit operands into 256-bit result + * - reduction of 256-bit result into 128-bit with modulo POLY + * + * GHash is calculated on 128-bit blocks of data according to the following + * formula: + * GH = (GH + data) * hash_key + * + * To avoid bit-reflection of data, this code uses GF multiplication + * with reversed polynomial: + * a * b * x^-127 mod RPOLY + * + * To improve computation speed, table Hi is precomputed with powers of H', + * where H' is calculated as H<<1 mod RPOLY. + * This allows us to improve performance by deferring reduction. For example + * to calculate ghash of 4 128-bit blocks of data (b0, b1, b2, b3), we can do: + * + * u8x16 Hi[4]; + * ghash_precompute (H, Hi, 4); + * + * ghash_ctx_t _gd, *gd = &_gd; + * ghash_mul_first (gd, GH ^ b0, Hi[3]); + * ghash_mul_next (gd, b1, Hi[2]); + * ghash_mul_next (gd, b2, Hi[1]); + * ghash_mul_next (gd, b3, Hi[0]); + * ghash_reduce (gd); + * ghash_reduce2 (gd); + * GH = ghash_final (gd); + * + * The reduction step is split into 3 functions so it can be better interleaved + * with other code (i.e. with AES computation). + */ + +#ifndef __ghash_h__ +#define __ghash_h__ + +static_always_inline u8x16 +gmul_lo_lo (u8x16 a, u8x16 b) +{ +#if defined (__PCLMUL__) + return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x00); +#elif defined (__ARM_FEATURE_CRYPTO) + return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a), + (poly64_t) vget_low_p64 ((poly64x2_t) b)); +#endif +} + +static_always_inline u8x16 +gmul_hi_lo (u8x16 a, u8x16 b) +{ +#if defined (__PCLMUL__) + return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x01); +#elif defined (__ARM_FEATURE_CRYPTO) + return (u8x16) vmull_p64 ((poly64_t) vget_high_p64 ((poly64x2_t) a), + (poly64_t) vget_low_p64 ((poly64x2_t) b)); +#endif +} + +static_always_inline u8x16 +gmul_lo_hi (u8x16 a, u8x16 b) +{ +#if defined (__PCLMUL__) + return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x10); +#elif defined (__ARM_FEATURE_CRYPTO) + return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a), + (poly64_t) vget_high_p64 ((poly64x2_t) b)); +#endif +} + +static_always_inline u8x16 +gmul_hi_hi (u8x16 a, u8x16 b) +{ +#if defined (__PCLMUL__) + return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x11); +#elif defined (__ARM_FEATURE_CRYPTO) + return (u8x16) vmull_high_p64 ((poly64x2_t) a, (poly64x2_t) b); +#endif +} + +typedef struct +{ + u8x16 mid, hi, lo, tmp_lo, tmp_hi; + u8x32 hi2, lo2, mid2, tmp_lo2, tmp_hi2; + u8x64 hi4, lo4, mid4, tmp_lo4, tmp_hi4; + int pending; +} ghash_ctx_t; + +static const u8x16 ghash_poly = { + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2 +}; + +static const u8x16 ghash_poly2 = { + 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2 +}; + +static_always_inline void +ghash_mul_first (ghash_ctx_t *gd, u8x16 a, u8x16 b) +{ + /* a1 * b1 */ + gd->hi = gmul_hi_hi (a, b); + /* a0 * b0 */ + gd->lo = gmul_lo_lo (a, b); + /* a0 * b1 ^ a1 * b0 */ + gd->mid = gmul_hi_lo (a, b) ^ gmul_lo_hi (a, b); + + /* set gd->pending to 0 so next invocation of ghash_mul_next(...)
knows that + there is no pending data in tmp_lo and tmp_hi */ + gd->pending = 0; +} + +static_always_inline void +ghash_mul_next (ghash_ctx_t *gd, u8x16 a, u8x16 b) +{ + /* a1 * b1 */ + u8x16 hi = gmul_hi_hi (a, b); + /* a0 * b0 */ + u8x16 lo = gmul_lo_lo (a, b); + + /* this branch will be optimized out by the compiler, and it allows us to + reduce number of XOR operations by using ternary logic */ + if (gd->pending) + { + /* there is peding data from previous invocation so we can XOR */ + gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, hi); + gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, lo); + gd->pending = 0; + } + else + { + /* there is no peding data from previous invocation so we postpone XOR */ + gd->tmp_hi = hi; + gd->tmp_lo = lo; + gd->pending = 1; + } + + /* gd->mid ^= a0 * b1 ^ a1 * b0 */ + gd->mid = u8x16_xor3 (gd->mid, gmul_hi_lo (a, b), gmul_lo_hi (a, b)); +} + +static_always_inline void +ghash_reduce (ghash_ctx_t *gd) +{ + u8x16 r; + + /* Final combination: + gd->lo ^= gd->mid << 64 + gd->hi ^= gd->mid >> 64 */ + u8x16 midl = u8x16_word_shift_left (gd->mid, 8); + u8x16 midr = u8x16_word_shift_right (gd->mid, 8); + + if (gd->pending) + { + gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, midl); + gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, midr); + } + else + { + gd->lo ^= midl; + gd->hi ^= midr; + } + r = gmul_hi_lo (ghash_poly2, gd->lo); + gd->lo ^= u8x16_word_shift_left (r, 8); +} + +static_always_inline void +ghash_reduce2 (ghash_ctx_t *gd) +{ + gd->tmp_lo = gmul_lo_lo (ghash_poly2, gd->lo); + gd->tmp_hi = gmul_lo_hi (ghash_poly2, gd->lo); +} + +static_always_inline u8x16 +ghash_final (ghash_ctx_t *gd) +{ + return u8x16_xor3 (gd->hi, u8x16_word_shift_right (gd->tmp_lo, 4), + u8x16_word_shift_left (gd->tmp_hi, 4)); +} + +static_always_inline u8x16 +ghash_mul (u8x16 a, u8x16 b) +{ + ghash_ctx_t _gd, *gd = &_gd; + ghash_mul_first (gd, a, b); + ghash_reduce (gd); + ghash_reduce2 (gd); + return ghash_final (gd); +} + +#if defined(__VPCLMULQDQ__) && defined(__AVX512F__) + +static const u8x64 ghash4_poly2 = { + 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2, + 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2, + 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2, + 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2, +}; + +static_always_inline u8x64 +gmul4_lo_lo (u8x64 a, u8x64 b) +{ + return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x00); +} + +static_always_inline u8x64 +gmul4_hi_lo (u8x64 a, u8x64 b) +{ + return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x01); +} + +static_always_inline u8x64 +gmul4_lo_hi (u8x64 a, u8x64 b) +{ + return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x10); +} + +static_always_inline u8x64 +gmul4_hi_hi (u8x64 a, u8x64 b) +{ + return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x11); +} + +static_always_inline void +ghash4_mul_first (ghash_ctx_t *gd, u8x64 a, u8x64 b) +{ + gd->hi4 = gmul4_hi_hi (a, b); + gd->lo4 = gmul4_lo_lo (a, b); + gd->mid4 = gmul4_hi_lo (a, b) ^ gmul4_lo_hi (a, b); + gd->pending = 0; +} + +static_always_inline void +ghash4_mul_next (ghash_ctx_t *gd, u8x64 a, u8x64 b) +{ + u8x64 hi = gmul4_hi_hi (a, b); + u8x64 lo = gmul4_lo_lo (a, b); + + if (gd->pending) + { + /* there is peding data from previous invocation so we can XOR */ + gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, hi); + gd->lo4 = 
u8x64_xor3 (gd->lo4, gd->tmp_lo4, lo); + gd->pending = 0; + } + else + { + /* there is no peding data from previous invocation so we postpone XOR */ + gd->tmp_hi4 = hi; + gd->tmp_lo4 = lo; + gd->pending = 1; + } + gd->mid4 = u8x64_xor3 (gd->mid4, gmul4_hi_lo (a, b), gmul4_lo_hi (a, b)); +} + +static_always_inline void +ghash4_reduce (ghash_ctx_t *gd) +{ + u8x64 r; + + /* Final combination: + gd->lo4 ^= gd->mid4 << 64 + gd->hi4 ^= gd->mid4 >> 64 */ + + u8x64 midl = u8x64_word_shift_left (gd->mid4, 8); + u8x64 midr = u8x64_word_shift_right (gd->mid4, 8); + + if (gd->pending) + { + gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, midl); + gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, midr); + } + else + { + gd->lo4 ^= midl; + gd->hi4 ^= midr; + } + + r = gmul4_hi_lo (ghash4_poly2, gd->lo4); + gd->lo4 ^= u8x64_word_shift_left (r, 8); +} + +static_always_inline void +ghash4_reduce2 (ghash_ctx_t *gd) +{ + gd->tmp_lo4 = gmul4_lo_lo (ghash4_poly2, gd->lo4); + gd->tmp_hi4 = gmul4_lo_hi (ghash4_poly2, gd->lo4); +} + +static_always_inline u8x16 +ghash4_final (ghash_ctx_t *gd) +{ + u8x64 r; + u8x32 t; + + r = u8x64_xor3 (gd->hi4, u8x64_word_shift_right (gd->tmp_lo4, 4), + u8x64_word_shift_left (gd->tmp_hi4, 4)); + + /* horizontal XOR of 4 128-bit lanes */ + t = u8x64_extract_lo (r) ^ u8x64_extract_hi (r); + return u8x32_extract_hi (t) ^ u8x32_extract_lo (t); +} +#endif + +#if defined(__VPCLMULQDQ__) + +static const u8x32 ghash2_poly2 = { + 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2, +}; + +static_always_inline u8x32 +gmul2_lo_lo (u8x32 a, u8x32 b) +{ + return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x00); +} + +static_always_inline u8x32 +gmul2_hi_lo (u8x32 a, u8x32 b) +{ + return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x01); +} + +static_always_inline u8x32 +gmul2_lo_hi (u8x32 a, u8x32 b) +{ + return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x10); +} + +static_always_inline u8x32 +gmul2_hi_hi (u8x32 a, u8x32 b) +{ + return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x11); +} + +static_always_inline void +ghash2_mul_first (ghash_ctx_t *gd, u8x32 a, u8x32 b) +{ + gd->hi2 = gmul2_hi_hi (a, b); + gd->lo2 = gmul2_lo_lo (a, b); + gd->mid2 = gmul2_hi_lo (a, b) ^ gmul2_lo_hi (a, b); + gd->pending = 0; +} + +static_always_inline void +ghash2_mul_next (ghash_ctx_t *gd, u8x32 a, u8x32 b) +{ + u8x32 hi = gmul2_hi_hi (a, b); + u8x32 lo = gmul2_lo_lo (a, b); + + if (gd->pending) + { + /* there is peding data from previous invocation so we can XOR */ + gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, hi); + gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, lo); + gd->pending = 0; + } + else + { + /* there is no peding data from previous invocation so we postpone XOR */ + gd->tmp_hi2 = hi; + gd->tmp_lo2 = lo; + gd->pending = 1; + } + gd->mid2 = u8x32_xor3 (gd->mid2, gmul2_hi_lo (a, b), gmul2_lo_hi (a, b)); +} + +static_always_inline void +ghash2_reduce (ghash_ctx_t *gd) +{ + u8x32 r; + + /* Final combination: + gd->lo2 ^= gd->mid2 << 64 + gd->hi2 ^= gd->mid2 >> 64 */ + + u8x32 midl = u8x32_word_shift_left (gd->mid2, 8); + u8x32 midr = u8x32_word_shift_right (gd->mid2, 8); + + if (gd->pending) + { + gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, midl); + gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, midr); + } + else + { + gd->lo2 ^= midl; + gd->hi2 ^= midr; + } + + r = gmul2_hi_lo (ghash2_poly2, gd->lo2); + 
gd->lo2 ^= u8x32_word_shift_left (r, 8); +} + +static_always_inline void +ghash2_reduce2 (ghash_ctx_t *gd) +{ + gd->tmp_lo2 = gmul2_lo_lo (ghash2_poly2, gd->lo2); + gd->tmp_hi2 = gmul2_lo_hi (ghash2_poly2, gd->lo2); +} + +static_always_inline u8x16 +ghash2_final (ghash_ctx_t *gd) +{ + u8x32 r; + + r = u8x32_xor3 (gd->hi2, u8x32_word_shift_right (gd->tmp_lo2, 4), + u8x32_word_shift_left (gd->tmp_hi2, 4)); + + /* horizontal XOR of 2 128-bit lanes */ + return u8x32_extract_hi (r) ^ u8x32_extract_lo (r); +} +#endif + +static_always_inline void +ghash_precompute (u8x16 H, u8x16 * Hi, int n) +{ + u8x16 r8; + u32x4 r32; + /* calculate H<<1 mod poly from the hash key */ + r8 = (u8x16) ((u64x2) H >> 63); + H = (u8x16) ((u64x2) H << 1); + H |= u8x16_word_shift_left (r8, 8); + r32 = (u32x4) u8x16_word_shift_right (r8, 8); +#ifdef __SSE2__ + r32 = u32x4_shuffle (r32, 0, 1, 2, 0); +#else + r32[3] = r32[0]; +#endif + r32 = r32 == (u32x4) {1, 0, 0, 1}; + Hi[n - 1] = H = H ^ ((u8x16) r32 & ghash_poly); + + /* calculate H^(i + 1) */ + for (int i = n - 2; i >= 0; i--) + Hi[i] = ghash_mul (H, Hi[i + 1]); +} + +#endif /* __ghash_h__ */ + diff --git a/src/vppinfra/crypto/poly1305.h b/src/vppinfra/crypto/poly1305.h new file mode 100644 index 00000000000..cd6ea60cdf7 --- /dev/null +++ b/src/vppinfra/crypto/poly1305.h @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2023 Cisco Systems, Inc. + */ + +#ifndef __clib_poly1305_h__ +#define __clib_poly1305_h__ + +#include <vppinfra/clib.h> +#include <vppinfra/vector.h> +#include <vppinfra/cache.h> +#include <vppinfra/string.h> + +/* implementation of DJB's poly1305 using 64-bit arithmetic */ + +typedef struct +{ + const u64 r[3], s[2]; + u64 h[3]; + + /* partial data */ + union + { + u8 as_u8[16]; + u64 as_u64[2]; + } partial; + + size_t n_partial_bytes; +} clib_poly1305_ctx; + +static_always_inline void +clib_poly1305_init (clib_poly1305_ctx *ctx, const u8 key[32]) +{ + u64u *k = (u64u *) key; + u64 *h = (u64 *) ctx->h; + u64 *r = (u64 *) ctx->r; + u64 *s = (u64 *) ctx->s; + + /* initialize accumulator */ + h[0] = h[1] = h[2] = 0; + + /* clamp 1st half of the key and store it into r[] */ + r[0] = k[0] & 0x0ffffffc0fffffff; + r[1] = k[1] & 0x0ffffffc0ffffffc; + s[0] = k[2]; + s[1] = k[3]; + + /* precompute (r[1] >> 2) * 5 */ + r[2] = r[1] + (r[1] >> 2); + + ctx->n_partial_bytes = 0; +} + +static_always_inline void +_clib_poly1305_multiply_and_reduce (u64 h[3], const u64 r[3]) +{ + union + { + struct + { + u64 lo, hi; + }; + u128 n; + } l0, l1, l2; + u64 c; + + /* + h2 h1 h0 + x r1 r0 + --------------------------------------- + r0 x h2 r0 x h1 r0 x h0 + + r1 x h2 r1 x h1 r1 x h0 + --------------------------------------- + + for p = 2^130-5, the following applies: + (r * 2^130) mod p == (r * 5) mod p + + bits above 130 can be shifted right (divided by 2^130) + and multiplied by 5 per equation above + + h2 h1 h0 + x r1 r0 + ---------------------------------------------- + r0 x h2 r0 x h1 r0 x h0 + + r1 x h0 + + 5x (r1 >>2) x h2 5x (r1 >>2) x h1 + ---------------------------------------------- + [0:l2.lo] [l1.hi:l1.lo] [l0.hi:l0.lo] + */ + + l0.n = l1.n = l2.n = 0; + /* u64 x u64 = u128 multiplications */ + l0.n += (u128) h[0] * r[0]; + l0.n += (u128) h[1] * r[2]; /* r[2] holds precomputed (r[1] >> 2) * 5 */ + l1.n += (u128) h[0] * r[1]; + l1.n += (u128) h[1] * r[0]; + + /* u64 x u64 = u64 multiplications, as h[2] may have only lower 2 bits set + * and r[1] has clamped bits 60-63 */ + l1.n += (u128) (h[2] * r[2]); + l2.n += (u128) (h[2] * r[0]); +
+ /* propagate upper 64 bits to higher limb */ + c = 0; + l1.lo = u64_add_with_carry (&c, l1.lo, l0.hi); + l2.lo = u64_add_with_carry (&c, l2.lo, l1.hi); + + l2.hi = l2.lo; + /* keep bits [128:129] */ + l2.lo &= 3; + + /* multiply bits 130 and above by 5 and store to l2.hi */ + l2.hi -= l2.lo; + l2.hi += l2.hi >> 2; + + /* add l2.hi to l0.lo with carry propagation and store result to h2:h1:h0 */ + c = 0; + h[0] = u64_add_with_carry (&c, l0.lo, l2.hi); + h[1] = u64_add_with_carry (&c, l1.lo, 0); + h[2] = u64_add_with_carry (&c, l2.lo, 0); +} + +static_always_inline u32 +_clib_poly1305_add_blocks (clib_poly1305_ctx *ctx, const u8 *msg, + uword n_bytes, const u32 bit17) +{ + u64 r[3], h[3]; + + for (int i = 0; i < 3; i++) + { + h[i] = ctx->h[i]; + r[i] = ctx->r[i]; + } + + for (const u64u *m = (u64u *) msg; n_bytes >= 16; n_bytes -= 16, m += 2) + { + u64 c = 0; + + /* h += m */ + h[0] = u64_add_with_carry (&c, h[0], m[0]); + h[1] = u64_add_with_carry (&c, h[1], m[1]); + h[2] = u64_add_with_carry (&c, h[2], bit17 ? 1 : 0); + + /* h = (h * r) mod p */ + _clib_poly1305_multiply_and_reduce (h, r); + } + + for (int i = 0; i < 3; i++) + ctx->h[i] = h[i]; + + return n_bytes; +} + +static_always_inline void +clib_poly1305_update (clib_poly1305_ctx *ctx, const u8 *msg, uword len) +{ + uword n_left = len; + + if (n_left == 0) + return; + + if (ctx->n_partial_bytes) + { + u16 missing_bytes = 16 - ctx->n_partial_bytes; + if (PREDICT_FALSE (n_left < missing_bytes)) + { + clib_memcpy_fast (ctx->partial.as_u8 + ctx->n_partial_bytes, msg, + n_left); + ctx->n_partial_bytes += n_left; + return; + } + + clib_memcpy_fast (ctx->partial.as_u8 + ctx->n_partial_bytes, msg, + missing_bytes); + _clib_poly1305_add_blocks (ctx, ctx->partial.as_u8, 16, 1); + ctx->n_partial_bytes = 0; + n_left -= missing_bytes; + msg += missing_bytes; + } + + n_left = _clib_poly1305_add_blocks (ctx, msg, n_left, 1); + + if (n_left) + { + ctx->partial.as_u64[0] = ctx->partial.as_u64[1] = 0; + clib_memcpy_fast (ctx->partial.as_u8, msg + len - n_left, n_left); + ctx->n_partial_bytes = n_left; + } +} + +static_always_inline void +clib_poly1305_final (clib_poly1305_ctx *ctx, u8 *out) +{ + const u64 p[] = { 0xFFFFFFFFFFFFFFFB, 0xFFFFFFFFFFFFFFFF, 3 }; /* 2^130-5 */ + const u64 *s = ctx->s; + u64u *t = (u64u *) out; + u64 h0, h1, t0, t1; + u64 c; + + if (ctx->n_partial_bytes) + { + ctx->partial.as_u8[ctx->n_partial_bytes] = 1; + _clib_poly1305_add_blocks (ctx, ctx->partial.as_u8, 16, 0); + } + + h0 = ctx->h[0]; + h1 = ctx->h[1]; + + /* h may not be fully reduced, try to subtract 2^130-5 */ + c = 0; + t0 = u64_sub_with_borrow (&c, h0, p[0]); + t1 = u64_sub_with_borrow (&c, h1, p[1]); + u64_sub_with_borrow (&c, ctx->h[2], p[2]); + + if (!c) + { + h0 = t0; + h1 = t1; + } + + c = 0; + t[0] = u64_add_with_carry (&c, h0, s[0]); + t[1] = u64_add_with_carry (&c, h1, s[1]); +} + +static_always_inline void +clib_poly1305 (const u8 *key, const u8 *msg, uword len, u8 *out) +{ + clib_poly1305_ctx ctx; + clib_poly1305_init (&ctx, key); + clib_poly1305_update (&ctx, msg, len); + clib_poly1305_final (&ctx, out); +} + +#endif /* __clib_poly1305_h__ */ diff --git a/src/vppinfra/crypto/sha2.h b/src/vppinfra/crypto/sha2.h new file mode 100644 index 00000000000..69a24a2d087 --- /dev/null +++ b/src/vppinfra/crypto/sha2.h @@ -0,0 +1,715 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2024 Cisco Systems, Inc.
+ */ + +#ifndef included_sha2_h +#define included_sha2_h + +#include <vppinfra/clib.h> +#include <vppinfra/vector.h> +#include <vppinfra/string.h> + +#define SHA256_ROTR(x, y) ((x >> y) | (x << (32 - y))) +#define SHA256_CH(a, b, c) ((a & b) ^ (~a & c)) +#define SHA256_MAJ(a, b, c) ((a & b) ^ (a & c) ^ (b & c)) +#define SHA256_CSIGMA0(x) \ + (SHA256_ROTR (x, 2) ^ SHA256_ROTR (x, 13) ^ SHA256_ROTR (x, 22)); +#define SHA256_CSIGMA1(x) \ + (SHA256_ROTR (x, 6) ^ SHA256_ROTR (x, 11) ^ SHA256_ROTR (x, 25)); +#define SHA256_SSIGMA0(x) (SHA256_ROTR (x, 7) ^ SHA256_ROTR (x, 18) ^ (x >> 3)) +#define SHA256_SSIGMA1(x) \ + (SHA256_ROTR (x, 17) ^ SHA256_ROTR (x, 19) ^ (x >> 10)) + +#define SHA256_MSG_SCHED(w, j) \ + { \ + w[j] = w[j - 7] + w[j - 16]; \ + w[j] += SHA256_SSIGMA0 (w[j - 15]); \ + w[j] += SHA256_SSIGMA1 (w[j - 2]); \ + } + +#define SHA256_TRANSFORM(s, w, i, k) \ + { \ + __typeof__ (s[0]) t1, t2; \ + t1 = k + w[i] + s[7]; \ + t1 += SHA256_CSIGMA1 (s[4]); \ + t1 += SHA256_CH (s[4], s[5], s[6]); \ + t2 = SHA256_CSIGMA0 (s[0]); \ + t2 += SHA256_MAJ (s[0], s[1], s[2]); \ + s[7] = s[6]; \ + s[6] = s[5]; \ + s[5] = s[4]; \ + s[4] = s[3] + t1; \ + s[3] = s[2]; \ + s[2] = s[1]; \ + s[1] = s[0]; \ + s[0] = t1 + t2; \ + } + +#define SHA512_ROTR(x, y) ((x >> y) | (x << (64 - y))) +#define SHA512_CH(a, b, c) ((a & b) ^ (~a & c)) +#define SHA512_MAJ(a, b, c) ((a & b) ^ (a & c) ^ (b & c)) +#define SHA512_CSIGMA0(x) \ + (SHA512_ROTR (x, 28) ^ SHA512_ROTR (x, 34) ^ SHA512_ROTR (x, 39)) +#define SHA512_CSIGMA1(x) \ + (SHA512_ROTR (x, 14) ^ SHA512_ROTR (x, 18) ^ SHA512_ROTR (x, 41)) +#define SHA512_SSIGMA0(x) (SHA512_ROTR (x, 1) ^ SHA512_ROTR (x, 8) ^ (x >> 7)) +#define SHA512_SSIGMA1(x) \ + (SHA512_ROTR (x, 19) ^ SHA512_ROTR (x, 61) ^ (x >> 6)) + +#define SHA512_MSG_SCHED(w, j) \ + { \ + w[j] = w[j - 7] + w[j - 16]; \ + w[j] += SHA512_SSIGMA0 (w[j - 15]); \ + w[j] += SHA512_SSIGMA1 (w[j - 2]); \ + } + +#define SHA512_TRANSFORM(s, w, i, k) \ + { \ + __typeof__ (s[0]) t1, t2; \ + t1 = k + w[i] + s[7]; \ + t1 += SHA512_CSIGMA1 (s[4]); \ + t1 += SHA512_CH (s[4], s[5], s[6]); \ + t2 = SHA512_CSIGMA0 (s[0]); \ + t2 += SHA512_MAJ (s[0], s[1], s[2]); \ + s[7] = s[6]; \ + s[6] = s[5]; \ + s[5] = s[4]; \ + s[4] = s[3] + t1; \ + s[3] = s[2]; \ + s[2] = s[1]; \ + s[1] = s[0]; \ + s[0] = t1 + t2; \ + } + +#if defined(__SHA__) && defined(__x86_64__) +#define CLIB_SHA256_ISA_INTEL +#define CLIB_SHA256_ISA +#endif + +#ifdef __ARM_FEATURE_SHA2 +#define CLIB_SHA256_ISA_ARM +#define CLIB_SHA256_ISA +#endif + +static const u32 sha224_h[8] = { 0xc1059ed8, 0x367cd507, 0x3070dd17, + 0xf70e5939, 0xffc00b31, 0x68581511, + 0x64f98fa7, 0xbefa4fa4 }; + +static const u32 sha256_h[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, + 0xa54ff53a, 0x510e527f, 0x9b05688c, + 0x1f83d9ab, 0x5be0cd19 }; + +static const u32 clib_sha2_256_k[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, + 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, + 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, + 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, + 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, + 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 
0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +static const u64 sha384_h[8] = { 0xcbbb9d5dc1059ed8, 0x629a292a367cd507, + 0x9159015a3070dd17, 0x152fecd8f70e5939, + 0x67332667ffc00b31, 0x8eb44a8768581511, + 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4 }; + +static const u64 sha512_h[8] = { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 }; + +static const u64 sha512_224_h[8] = { 0x8c3d37c819544da2, 0x73e1996689dcd4d6, + 0x1dfab7ae32ff9c82, 0x679dd514582f9fcf, + 0x0f6d2b697bd44da8, 0x77e36f7304c48942, + 0x3f9d85a86a1d36c8, 0x1112e6ad91d692a1 }; + +static const u64 sha512_256_h[8] = { 0x22312194fc2bf72c, 0x9f555fa3c84c64c2, + 0x2393b86b6f53b151, 0x963877195940eabd, + 0x96283ee2a88effe3, 0xbe5e1e2553863992, + 0x2b0199fc2c85b8aa, 0x0eb72ddc81c52ca2 }; + +static const u64 clib_sha2_512_k[80] = { + 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, + 0xe9b5dba58189dbbc, 0x3956c25bf348b538, 0x59f111f1b605d019, + 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242, + 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, + 0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, + 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 0x2de92c6f592b0275, + 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, + 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, + 0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725, + 0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc, + 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, + 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, + 0x92722c851482353b, 0xa2bfe8a14cf10364, 0xa81a664bbc423001, + 0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218, + 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, + 0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, + 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc, + 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec, + 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, + 0xc67178f2e372532b, 0xca273eceea26619c, 0xd186b8c721c0c207, + 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba, + 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b, + 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, + 0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, + 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 +}; + +typedef enum +{ + CLIB_SHA2_224, + CLIB_SHA2_256, + CLIB_SHA2_384, + CLIB_SHA2_512, + CLIB_SHA2_512_224, + CLIB_SHA2_512_256, +} clib_sha2_type_t; + +#define CLIB_SHA2_256_BLOCK_SIZE 64 +#define CLIB_SHA2_512_BLOCK_SIZE 128 +#define SHA2_MAX_BLOCK_SIZE CLIB_SHA2_512_BLOCK_SIZE +#define SHA2_MAX_DIGEST_SIZE 64 + +static const struct +{ + u8 block_size; + u8 digest_size; + const u32 *h32; + const u64 *h64; +} clib_sha2_variants[] = { + [CLIB_SHA2_224] = { + .block_size = CLIB_SHA2_256_BLOCK_SIZE, + .digest_size = 28, + .h32 = sha224_h, + }, + [CLIB_SHA2_256] = { + .block_size = CLIB_SHA2_256_BLOCK_SIZE, + .digest_size = 32, + .h32 = sha256_h, + }, + [CLIB_SHA2_384] = { + .block_size = CLIB_SHA2_512_BLOCK_SIZE, + .digest_size = 48, + .h64 = sha384_h, + }, + [CLIB_SHA2_512] = { + .block_size = CLIB_SHA2_512_BLOCK_SIZE, + .digest_size = 64, + .h64 = sha512_h, + }, + [CLIB_SHA2_512_224] = { + .block_size = CLIB_SHA2_512_BLOCK_SIZE, + 
.digest_size = 28, + .h64 = sha512_224_h, + }, + [CLIB_SHA2_512_256] = { + .block_size = CLIB_SHA2_512_BLOCK_SIZE, + .digest_size = 32, + .h64 = sha512_256_h, + }, +}; + +typedef union +{ + u32 h32[8]; + u64 h64[8]; +#ifdef CLIB_SHA256_ISA + u32x4 h32x4[2]; +#endif +} clib_sha2_h_t; + +typedef struct +{ + u64 total_bytes; + u16 n_pending; + clib_sha2_h_t h; + union + { + u8 as_u8[SHA2_MAX_BLOCK_SIZE]; + u64 as_u64[SHA2_MAX_BLOCK_SIZE / sizeof (u64)]; + uword as_uword[SHA2_MAX_BLOCK_SIZE / sizeof (uword)]; + } pending; +} clib_sha2_state_t; + +typedef struct +{ + clib_sha2_type_t type; + u8 block_size; + u8 digest_size; + clib_sha2_state_t state; +} clib_sha2_ctx_t; + +static_always_inline void +clib_sha2_state_init (clib_sha2_state_t *state, clib_sha2_type_t type) +{ + clib_sha2_state_t st = {}; + + if (clib_sha2_variants[type].block_size == CLIB_SHA2_256_BLOCK_SIZE) + for (int i = 0; i < 8; i++) + st.h.h32[i] = clib_sha2_variants[type].h32[i]; + else + for (int i = 0; i < 8; i++) + st.h.h64[i] = clib_sha2_variants[type].h64[i]; + + *state = st; +} + +static_always_inline void +clib_sha2_init (clib_sha2_ctx_t *ctx, clib_sha2_type_t type) +{ + clib_sha2_state_init (&ctx->state, type); + ctx->block_size = clib_sha2_variants[type].block_size; + ctx->digest_size = clib_sha2_variants[type].digest_size; + ctx->type = type; +} + +#ifdef CLIB_SHA256_ISA +static inline void +clib_sha256_vec_cycle_w (u32x4 w[], u8 i) +{ + u8 j = (i + 1) % 4; + u8 k = (i + 2) % 4; + u8 l = (i + 3) % 4; +#ifdef CLIB_SHA256_ISA_INTEL + w[i] = (u32x4) _mm_sha256msg1_epu32 ((__m128i) w[i], (__m128i) w[j]); + w[i] += (u32x4) _mm_alignr_epi8 ((__m128i) w[l], (__m128i) w[k], 4); + w[i] = (u32x4) _mm_sha256msg2_epu32 ((__m128i) w[i], (__m128i) w[l]); +#elif defined(CLIB_SHA256_ISA_ARM) + w[i] = vsha256su1q_u32 (vsha256su0q_u32 (w[i], w[j]), w[k], w[l]); +#endif +} + +static inline void +clib_sha256_vec_4_rounds (u32x4 w, u8 n, u32x4 s[]) +{ +#ifdef CLIB_SHA256_ISA_INTEL + u32x4 r = *(u32x4 *) (clib_sha2_256_k + 4 * n) + w; + s[0] = (u32x4) _mm_sha256rnds2_epu32 ((__m128i) s[0], (__m128i) s[1], + (__m128i) r); + r = (u32x4) u64x2_interleave_hi ((u64x2) r, (u64x2) r); + s[1] = (u32x4) _mm_sha256rnds2_epu32 ((__m128i) s[1], (__m128i) s[0], + (__m128i) r); +#elif defined(CLIB_SHA256_ISA_ARM) + u32x4 r0, s0; + const u32x4u *k = (u32x4u *) clib_sha2_256_k; + + r0 = w + k[n]; + s0 = s[0]; + s[0] = vsha256hq_u32 (s[0], s[1], r0); + s[1] = vsha256h2q_u32 (s[1], s0, r0); +#endif +} +#endif + +#if defined(CLIB_SHA256_ISA) +static inline u32x4 +clib_sha256_vec_load (u32x4 r) +{ +#if defined(CLIB_SHA256_ISA_INTEL) + return u32x4_byte_swap (r); +#elif defined(CLIB_SHA256_ISA_ARM) + return vreinterpretq_u32_u8 (vrev32q_u8 (vreinterpretq_u8_u32 (r))); +#endif +} + +static inline void +clib_sha256_vec_shuffle (u32x4 d[2]) +{ +#if defined(CLIB_SHA256_ISA_INTEL) + /* {0, 1, 2, 3}, {4, 5, 6, 7} -> {7, 6, 3, 2}, {5, 4, 1, 0} */ + u32x4 r; + r = (u32x4) _mm_shuffle_ps ((__m128) d[1], (__m128) d[0], 0xbb); + d[1] = (u32x4) _mm_shuffle_ps ((__m128) d[1], (__m128) d[0], 0x11); + d[0] = r; +#endif +} +#endif + +static inline void +clib_sha256_block (clib_sha2_state_t *st, const u8 *msg, uword n_blocks) +{ +#if defined(CLIB_SHA256_ISA) + u32x4 h[2]; + u32x4u *m = (u32x4u *) msg; + + h[0] = st->h.h32x4[0]; + h[1] = st->h.h32x4[1]; + + clib_sha256_vec_shuffle (h); + + for (; n_blocks; m += 4, n_blocks--) + { + u32x4 s[2], w[4]; + + s[0] = h[0]; + s[1] = h[1]; + + w[0] = clib_sha256_vec_load (m[0]); + w[1] = clib_sha256_vec_load (m[1]); + w[2] = 
clib_sha256_vec_load (m[2]); + w[3] = clib_sha256_vec_load (m[3]); + + clib_sha256_vec_4_rounds (w[0], 0, s); + clib_sha256_vec_4_rounds (w[1], 1, s); + clib_sha256_vec_4_rounds (w[2], 2, s); + clib_sha256_vec_4_rounds (w[3], 3, s); + + clib_sha256_vec_cycle_w (w, 0); + clib_sha256_vec_4_rounds (w[0], 4, s); + clib_sha256_vec_cycle_w (w, 1); + clib_sha256_vec_4_rounds (w[1], 5, s); + clib_sha256_vec_cycle_w (w, 2); + clib_sha256_vec_4_rounds (w[2], 6, s); + clib_sha256_vec_cycle_w (w, 3); + clib_sha256_vec_4_rounds (w[3], 7, s); + + clib_sha256_vec_cycle_w (w, 0); + clib_sha256_vec_4_rounds (w[0], 8, s); + clib_sha256_vec_cycle_w (w, 1); + clib_sha256_vec_4_rounds (w[1], 9, s); + clib_sha256_vec_cycle_w (w, 2); + clib_sha256_vec_4_rounds (w[2], 10, s); + clib_sha256_vec_cycle_w (w, 3); + clib_sha256_vec_4_rounds (w[3], 11, s); + + clib_sha256_vec_cycle_w (w, 0); + clib_sha256_vec_4_rounds (w[0], 12, s); + clib_sha256_vec_cycle_w (w, 1); + clib_sha256_vec_4_rounds (w[1], 13, s); + clib_sha256_vec_cycle_w (w, 2); + clib_sha256_vec_4_rounds (w[2], 14, s); + clib_sha256_vec_cycle_w (w, 3); + clib_sha256_vec_4_rounds (w[3], 15, s); + + h[0] += s[0]; + h[1] += s[1]; + } + + clib_sha256_vec_shuffle (h); + + st->h.h32x4[0] = h[0]; + st->h.h32x4[1] = h[1]; +#else + u32 w[64], s[8], i; + clib_sha2_h_t h; + + h = st->h; + + for (; n_blocks; msg += CLIB_SHA2_256_BLOCK_SIZE, n_blocks--) + { + for (i = 0; i < 8; i++) + s[i] = h.h32[i]; + + for (i = 0; i < 16; i++) + { + w[i] = clib_net_to_host_u32 ((((u32u *) msg)[i])); + SHA256_TRANSFORM (s, w, i, clib_sha2_256_k[i]); + } + + for (i = 16; i < 64; i++) + { + SHA256_MSG_SCHED (w, i); + SHA256_TRANSFORM (s, w, i, clib_sha2_256_k[i]); + } + + for (i = 0; i < 8; i++) + h.h32[i] += s[i]; + } + + st->h = h; +#endif +} + +static_always_inline void +clib_sha512_block (clib_sha2_state_t *st, const u8 *msg, uword n_blocks) +{ + u64 w[80], s[8], i; + clib_sha2_h_t h; + + h = st->h; + + for (; n_blocks; msg += CLIB_SHA2_512_BLOCK_SIZE, n_blocks--) + { + for (i = 0; i < 8; i++) + s[i] = h.h64[i]; + + for (i = 0; i < 16; i++) + { + w[i] = clib_net_to_host_u64 ((((u64u *) msg)[i])); + SHA512_TRANSFORM (s, w, i, clib_sha2_512_k[i]); + } + + for (i = 16; i < 80; i++) + { + SHA512_MSG_SCHED (w, i); + SHA512_TRANSFORM (s, w, i, clib_sha2_512_k[i]); + } + + for (i = 0; i < 8; i++) + h.h64[i] += s[i]; + } + + st->h = h; +} + +static_always_inline void +clib_sha2_update_internal (clib_sha2_state_t *st, u8 block_size, const u8 *msg, + uword n_bytes) +{ + uword n_blocks; + if (st->n_pending) + { + uword n_left = block_size - st->n_pending; + if (n_bytes < n_left) + { + clib_memcpy_fast (st->pending.as_u8 + st->n_pending, msg, n_bytes); + st->n_pending += n_bytes; + return; + } + else + { + clib_memcpy_fast (st->pending.as_u8 + st->n_pending, msg, n_left); + if (block_size == CLIB_SHA2_512_BLOCK_SIZE) + clib_sha512_block (st, st->pending.as_u8, 1); + else + clib_sha256_block (st, st->pending.as_u8, 1); + st->n_pending = 0; + st->total_bytes += block_size; + n_bytes -= n_left; + msg += n_left; + } + } + + if ((n_blocks = n_bytes / block_size)) + { + if (block_size == CLIB_SHA2_512_BLOCK_SIZE) + clib_sha512_block (st, msg, n_blocks); + else + clib_sha256_block (st, msg, n_blocks); + n_bytes -= n_blocks * block_size; + msg += n_blocks * block_size; + st->total_bytes += n_blocks * block_size; + } + + if (n_bytes) + { + clib_memset_u8 (st->pending.as_u8, 0, block_size); + clib_memcpy_fast (st->pending.as_u8, msg, n_bytes); + st->n_pending = n_bytes; + } + else + st->n_pending = 0; 
+} + +static_always_inline void +clib_sha2_update (clib_sha2_ctx_t *ctx, const u8 *msg, uword n_bytes) +{ + clib_sha2_update_internal (&ctx->state, ctx->block_size, msg, n_bytes); +} + +static_always_inline void +clib_sha2_final_internal (clib_sha2_state_t *st, u8 block_size, u8 digest_size, + u8 *digest) +{ + int i; + + st->total_bytes += st->n_pending; + if (st->n_pending == 0) + { + clib_memset (st->pending.as_u8, 0, block_size); + st->pending.as_u8[0] = 0x80; + } + else if (st->n_pending + sizeof (u64) + sizeof (u8) > block_size) + { + st->pending.as_u8[st->n_pending] = 0x80; + if (block_size == CLIB_SHA2_512_BLOCK_SIZE) + clib_sha512_block (st, st->pending.as_u8, 1); + else + clib_sha256_block (st, st->pending.as_u8, 1); + clib_memset (st->pending.as_u8, 0, block_size); + } + else + st->pending.as_u8[st->n_pending] = 0x80; + + st->pending.as_u64[block_size / 8 - 1] = + clib_net_to_host_u64 (st->total_bytes * 8); + + if (block_size == CLIB_SHA2_512_BLOCK_SIZE) + { + clib_sha512_block (st, st->pending.as_u8, 1); + for (i = 0; i < digest_size / sizeof (u64); i++) + ((u64 *) digest)[i] = clib_net_to_host_u64 (st->h.h64[i]); + + /* sha512-224 case - write half of u64 */ + if (i * sizeof (u64) < digest_size) + ((u32 *) digest)[2 * i] = clib_net_to_host_u32 (st->h.h64[i] >> 32); + } + else + { + clib_sha256_block (st, st->pending.as_u8, 1); + for (i = 0; i < digest_size / sizeof (u32); i++) + *((u32 *) digest + i) = clib_net_to_host_u32 (st->h.h32[i]); + } +} + +static_always_inline void +clib_sha2_final (clib_sha2_ctx_t *ctx, u8 *digest) +{ + clib_sha2_final_internal (&ctx->state, ctx->block_size, ctx->digest_size, + digest); +} + +static_always_inline void +clib_sha2 (clib_sha2_type_t type, const u8 *msg, uword len, u8 *digest) +{ + clib_sha2_ctx_t ctx; + clib_sha2_init (&ctx, type); + clib_sha2_update (&ctx, msg, len); + clib_sha2_final (&ctx, digest); +} + +#define clib_sha224(...) clib_sha2 (CLIB_SHA2_224, __VA_ARGS__) +#define clib_sha256(...) clib_sha2 (CLIB_SHA2_256, __VA_ARGS__) +#define clib_sha384(...) clib_sha2 (CLIB_SHA2_384, __VA_ARGS__) +#define clib_sha512(...) clib_sha2 (CLIB_SHA2_512, __VA_ARGS__) +#define clib_sha512_224(...) clib_sha2 (CLIB_SHA2_512_224, __VA_ARGS__) +#define clib_sha512_256(...) 
clib_sha2 (CLIB_SHA2_512_256, __VA_ARGS__) + +/* + * HMAC + */ + +typedef struct +{ + clib_sha2_h_t ipad_h; + clib_sha2_h_t opad_h; +} clib_sha2_hmac_key_data_t; + +typedef struct +{ + clib_sha2_type_t type; + u8 block_size; + u8 digest_size; + clib_sha2_state_t ipad_state; + clib_sha2_state_t opad_state; +} clib_sha2_hmac_ctx_t; + +static_always_inline void +clib_sha2_hmac_key_data (clib_sha2_type_t type, const u8 *key, uword key_len, + clib_sha2_hmac_key_data_t *kd) +{ + u8 block_size = clib_sha2_variants[type].block_size; + u8 data[SHA2_MAX_BLOCK_SIZE] = {}; + u8 ikey[SHA2_MAX_BLOCK_SIZE]; + u8 okey[SHA2_MAX_BLOCK_SIZE]; + clib_sha2_state_t ipad_state; + clib_sha2_state_t opad_state; + + /* key */ + if (key_len > block_size) + { + /* key is longer than block, calculate hash of key */ + clib_sha2_ctx_t ctx; + clib_sha2_init (&ctx, type); + clib_sha2_update (&ctx, key, key_len); + clib_sha2_final (&ctx, (u8 *) data); + } + else + clib_memcpy_fast (data, key, key_len); + + for (int i = 0, w = 0; w < block_size; w += sizeof (uword), i++) + { + ((uwordu *) ikey)[i] = ((uwordu *) data)[i] ^ 0x3636363636363636UL; + ((uwordu *) okey)[i] = ((uwordu *) data)[i] ^ 0x5c5c5c5c5c5c5c5cUL; + } + + clib_sha2_state_init (&ipad_state, type); + clib_sha2_state_init (&opad_state, type); + + if (block_size == CLIB_SHA2_512_BLOCK_SIZE) + { + clib_sha512_block (&ipad_state, ikey, 1); + clib_sha512_block (&opad_state, okey, 1); + } + else + { + clib_sha256_block (&ipad_state, ikey, 1); + clib_sha256_block (&opad_state, okey, 1); + } + + kd->ipad_h = ipad_state.h; + kd->opad_h = opad_state.h; +} + +static_always_inline void +clib_sha2_hmac_init (clib_sha2_hmac_ctx_t *ctx, clib_sha2_type_t type, + clib_sha2_hmac_key_data_t *kd) +{ + u8 block_size = clib_sha2_variants[type].block_size; + u8 digest_size = clib_sha2_variants[type].digest_size; + + *ctx = (clib_sha2_hmac_ctx_t) { + .type = type, + .block_size = block_size, + .digest_size = digest_size, + .ipad_state = { + .h = kd->ipad_h, + .total_bytes = block_size, + }, + .opad_state = { + .h = kd->opad_h, + .total_bytes = block_size, + }, + }; +} + +static_always_inline void +clib_sha2_hmac_update (clib_sha2_hmac_ctx_t *ctx, const u8 *msg, uword len) +{ + clib_sha2_update_internal (&ctx->ipad_state, ctx->block_size, msg, len); +} + +static_always_inline void +clib_sha2_hmac_final (clib_sha2_hmac_ctx_t *ctx, u8 *digest) +{ + u8 i_digest[SHA2_MAX_DIGEST_SIZE]; + + clib_sha2_final_internal (&ctx->ipad_state, ctx->block_size, + ctx->digest_size, i_digest); + clib_sha2_update_internal (&ctx->opad_state, ctx->block_size, i_digest, + ctx->digest_size); + clib_sha2_final_internal (&ctx->opad_state, ctx->block_size, + ctx->digest_size, digest); +} + +static_always_inline void +clib_sha2_hmac (clib_sha2_type_t type, const u8 *key, uword key_len, + const u8 *msg, uword len, u8 *digest) +{ + clib_sha2_hmac_ctx_t _ctx, *ctx = &_ctx; + clib_sha2_hmac_key_data_t kd; + + clib_sha2_hmac_key_data (type, key, key_len, &kd); + clib_sha2_hmac_init (ctx, type, &kd); + clib_sha2_hmac_update (ctx, msg, len); + clib_sha2_hmac_final (ctx, digest); +} + +#define clib_hmac_sha224(...) clib_sha2_hmac (CLIB_SHA2_224, __VA_ARGS__) +#define clib_hmac_sha256(...) clib_sha2_hmac (CLIB_SHA2_256, __VA_ARGS__) +#define clib_hmac_sha384(...) clib_sha2_hmac (CLIB_SHA2_384, __VA_ARGS__) +#define clib_hmac_sha512(...) clib_sha2_hmac (CLIB_SHA2_512, __VA_ARGS__) +#define clib_hmac_sha512_224(...) \ + clib_sha2_hmac (CLIB_SHA2_512_224, __VA_ARGS__) +#define clib_hmac_sha512_256(...) 
\ + clib_sha2_hmac (CLIB_SHA2_512_256, __VA_ARGS__) + +#endif /* included_sha2_h */
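
For illustration only (not part of the patch): a minimal usage sketch of the one-shot helpers defined in these headers. The include paths, the function name crypto_selftest_sketch, and the sample buffers/lengths are assumptions made for the example; only the called APIs (clib_sha256, clib_hmac_sha256, clib_aes_gcm_key_expand, clib_aes128_gcm_enc/dec) come from the patch above.

/* usage sketch, assuming the headers are installed under <vppinfra/crypto/> */
#include <vppinfra/crypto/aes_gcm.h>
#include <vppinfra/crypto/sha2.h>

static int
crypto_selftest_sketch (void)
{
  /* SHA-256 digest of a short message */
  u8 msg[] = "abc";
  u8 digest[32];
  clib_sha256 (msg, 3, digest);

  /* HMAC-SHA-256 with a 16-byte all-zero key over the same message */
  u8 key[16] = { 0 };
  u8 mac[32];
  clib_hmac_sha256 (key, sizeof (key), msg, 3, mac);

  /* AES-128-GCM: expand the key once, then encrypt and authenticate */
  aes_gcm_key_data_t kd;
  u8 gcm_key[16] = { 0 }, iv[12] = { 0 }, aad[8] = { 0 };
  u8 pt[64] = { 0 }, ct[64], tag[16], out[64];
  clib_aes_gcm_key_expand (&kd, gcm_key, AES_KEY_128);
  clib_aes128_gcm_enc (&kd, pt, sizeof (pt), aad, sizeof (aad), iv,
		       sizeof (tag), ct, tag);

  /* decrypt returns 1 when the authentication tag verifies */
  return clib_aes128_gcm_dec (&kd, ct, sizeof (ct), aad, sizeof (aad), iv,
			      tag, sizeof (tag), out);
}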