aboutsummaryrefslogtreecommitdiffstats
path: root/src/vppinfra/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'src/vppinfra/crypto')
-rw-r--r--src/vppinfra/crypto/aes.h491
-rw-r--r--src/vppinfra/crypto/aes_cbc.h542
-rw-r--r--src/vppinfra/crypto/aes_ctr.h190
-rw-r--r--src/vppinfra/crypto/aes_gcm.h944
-rw-r--r--src/vppinfra/crypto/ghash.h515
-rw-r--r--src/vppinfra/crypto/poly1305.h234
-rw-r--r--src/vppinfra/crypto/sha2.h715
7 files changed, 3631 insertions, 0 deletions
diff --git a/src/vppinfra/crypto/aes.h b/src/vppinfra/crypto/aes.h
new file mode 100644
index 00000000000..9e80e3b0318
--- /dev/null
+++ b/src/vppinfra/crypto/aes.h
@@ -0,0 +1,491 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2020 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __aes_h__
+#define __aes_h__
+
+typedef enum
+{
+ AES_KEY_128 = 0,
+ AES_KEY_192 = 1,
+ AES_KEY_256 = 2,
+} aes_key_size_t;
+
+#define AES_KEY_ROUNDS(x) (10 + x * 2)
+#define AES_KEY_BYTES(x) (16 + x * 8)
+
+static_always_inline u8x16
+aes_block_load (u8 * p)
+{
+ return *(u8x16u *) p;
+}
+
+static_always_inline u8x16
+aes_enc_round_x1 (u8x16 a, u8x16 k)
+{
+#if defined (__AES__)
+ return (u8x16) _mm_aesenc_si128 ((__m128i) a, (__m128i) k);
+#elif defined (__ARM_FEATURE_CRYPTO)
+ return vaesmcq_u8 (vaeseq_u8 (a, u8x16_splat (0))) ^ k;
+#endif
+}
+
+#if defined(__VAES__) && defined(__AVX512F__)
+static_always_inline u8x64
+aes_enc_round_x4 (u8x64 a, u8x64 k)
+{
+ return (u8x64) _mm512_aesenc_epi128 ((__m512i) a, (__m512i) k);
+}
+
+static_always_inline u8x64
+aes_enc_last_round_x4 (u8x64 a, u8x64 k)
+{
+ return (u8x64) _mm512_aesenclast_epi128 ((__m512i) a, (__m512i) k);
+}
+
+static_always_inline u8x64
+aes_dec_round_x4 (u8x64 a, u8x64 k)
+{
+ return (u8x64) _mm512_aesdec_epi128 ((__m512i) a, (__m512i) k);
+}
+
+static_always_inline u8x64
+aes_dec_last_round_x4 (u8x64 a, u8x64 k)
+{
+ return (u8x64) _mm512_aesdeclast_epi128 ((__m512i) a, (__m512i) k);
+}
+#endif
+
+#ifdef __VAES__
+static_always_inline u8x32
+aes_enc_round_x2 (u8x32 a, u8x32 k)
+{
+ return (u8x32) _mm256_aesenc_epi128 ((__m256i) a, (__m256i) k);
+}
+
+static_always_inline u8x32
+aes_enc_last_round_x2 (u8x32 a, u8x32 k)
+{
+ return (u8x32) _mm256_aesenclast_epi128 ((__m256i) a, (__m256i) k);
+}
+
+static_always_inline u8x32
+aes_dec_round_x2 (u8x32 a, u8x32 k)
+{
+ return (u8x32) _mm256_aesdec_epi128 ((__m256i) a, (__m256i) k);
+}
+
+static_always_inline u8x32
+aes_dec_last_round_x2 (u8x32 a, u8x32 k)
+{
+ return (u8x32) _mm256_aesdeclast_epi128 ((__m256i) a, (__m256i) k);
+}
+#endif
+
+static_always_inline u8x16
+aes_enc_last_round_x1 (u8x16 a, u8x16 k)
+{
+#if defined (__AES__)
+ return (u8x16) _mm_aesenclast_si128 ((__m128i) a, (__m128i) k);
+#elif defined (__ARM_FEATURE_CRYPTO)
+ return vaeseq_u8 (a, u8x16_splat (0)) ^ k;
+#endif
+}
+
+#ifdef __x86_64__
+
+static_always_inline u8x16
+aes_dec_round_x1 (u8x16 a, u8x16 k)
+{
+ return (u8x16) _mm_aesdec_si128 ((__m128i) a, (__m128i) k);
+}
+
+static_always_inline u8x16
+aes_dec_last_round_x1 (u8x16 a, u8x16 k)
+{
+ return (u8x16) _mm_aesdeclast_si128 ((__m128i) a, (__m128i) k);
+}
+#endif
+
+static_always_inline void
+aes_block_store (u8 * p, u8x16 r)
+{
+ *(u8x16u *) p = r;
+}
+
+static_always_inline u8x16
+aes_encrypt_block (u8x16 block, const u8x16 * round_keys, aes_key_size_t ks)
+{
+ int rounds = AES_KEY_ROUNDS (ks);
+ block ^= round_keys[0];
+ for (int i = 1; i < rounds; i += 1)
+ block = aes_enc_round_x1 (block, round_keys[i]);
+ return aes_enc_last_round_x1 (block, round_keys[rounds]);
+}
+
+static_always_inline u8x16
+aes_inv_mix_column (u8x16 a)
+{
+#if defined (__AES__)
+ return (u8x16) _mm_aesimc_si128 ((__m128i) a);
+#elif defined (__ARM_FEATURE_CRYPTO)
+ return vaesimcq_u8 (a);
+#endif
+}
+
+#ifdef __x86_64__
+#define aes_keygen_assist(a, b) \
+ (u8x16) _mm_aeskeygenassist_si128((__m128i) a, b)
+
+/* AES-NI based AES key expansion based on code samples from
+ Intel(r) Advanced Encryption Standard (AES) New Instructions White Paper
+ (323641-001) */
+
+static_always_inline void
+aes128_key_assist (u8x16 * rk, u8x16 r)
+{
+ u8x16 t = rk[-1];
+ t ^= u8x16_word_shift_left (t, 4);
+ t ^= u8x16_word_shift_left (t, 4);
+ t ^= u8x16_word_shift_left (t, 4);
+ rk[0] = t ^ (u8x16) u32x4_shuffle ((u32x4) r, 3, 3, 3, 3);
+}
+
+static_always_inline void
+aes128_key_expand (u8x16 *rk, u8x16u const *k)
+{
+ rk[0] = k[0];
+ aes128_key_assist (rk + 1, aes_keygen_assist (rk[0], 0x01));
+ aes128_key_assist (rk + 2, aes_keygen_assist (rk[1], 0x02));
+ aes128_key_assist (rk + 3, aes_keygen_assist (rk[2], 0x04));
+ aes128_key_assist (rk + 4, aes_keygen_assist (rk[3], 0x08));
+ aes128_key_assist (rk + 5, aes_keygen_assist (rk[4], 0x10));
+ aes128_key_assist (rk + 6, aes_keygen_assist (rk[5], 0x20));
+ aes128_key_assist (rk + 7, aes_keygen_assist (rk[6], 0x40));
+ aes128_key_assist (rk + 8, aes_keygen_assist (rk[7], 0x80));
+ aes128_key_assist (rk + 9, aes_keygen_assist (rk[8], 0x1b));
+ aes128_key_assist (rk + 10, aes_keygen_assist (rk[9], 0x36));
+}
+
+static_always_inline void
+aes192_key_assist (u8x16 * r1, u8x16 * r2, u8x16 key_assist)
+{
+ u8x16 t;
+ r1[0] ^= t = u8x16_word_shift_left (r1[0], 4);
+ r1[0] ^= t = u8x16_word_shift_left (t, 4);
+ r1[0] ^= u8x16_word_shift_left (t, 4);
+ r1[0] ^= (u8x16) _mm_shuffle_epi32 ((__m128i) key_assist, 0x55);
+ r2[0] ^= u8x16_word_shift_left (r2[0], 4);
+ r2[0] ^= (u8x16) _mm_shuffle_epi32 ((__m128i) r1[0], 0xff);
+}
+
+static_always_inline void
+aes192_key_expand (u8x16 * rk, u8x16u const *k)
+{
+ u8x16 r1, r2;
+
+ rk[0] = r1 = k[0];
+ rk[1] = r2 = (u8x16) (u64x2) { *(u64 *) (k + 1), 0 };
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x1));
+ rk[1] = (u8x16) _mm_shuffle_pd ((__m128d) rk[1], (__m128d) r1, 0);
+ rk[2] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1);
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x2));
+ rk[3] = r1;
+ rk[4] = r2;
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x4));
+ rk[4] = (u8x16) _mm_shuffle_pd ((__m128d) rk[4], (__m128d) r1, 0);
+ rk[5] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1);
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x8));
+ rk[6] = r1;
+ rk[7] = r2;
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x10));
+ rk[7] = (u8x16) _mm_shuffle_pd ((__m128d) rk[7], (__m128d) r1, 0);
+ rk[8] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1);
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x20));
+ rk[9] = r1;
+ rk[10] = r2;
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x40));
+ rk[10] = (u8x16) _mm_shuffle_pd ((__m128d) rk[10], (__m128d) r1, 0);
+ rk[11] = (u8x16) _mm_shuffle_pd ((__m128d) r1, (__m128d) r2, 1);
+
+ aes192_key_assist (&r1, &r2, aes_keygen_assist (r2, 0x80));
+ rk[12] = r1;
+}
+
+static_always_inline void
+aes256_key_assist (u8x16 * rk, int i, u8x16 key_assist)
+{
+ u8x16 r, t;
+ rk += i;
+ r = rk[-2];
+ r ^= t = u8x16_word_shift_left (r, 4);
+ r ^= t = u8x16_word_shift_left (t, 4);
+ r ^= u8x16_word_shift_left (t, 4);
+ r ^= (u8x16) u32x4_shuffle ((u32x4) key_assist, 3, 3, 3, 3);
+ rk[0] = r;
+
+ if (i >= 14)
+ return;
+
+ key_assist = aes_keygen_assist (rk[0], 0x0);
+ r = rk[-1];
+ r ^= t = u8x16_word_shift_left (r, 4);
+ r ^= t = u8x16_word_shift_left (t, 4);
+ r ^= u8x16_word_shift_left (t, 4);
+ r ^= (u8x16) u32x4_shuffle ((u32x4) key_assist, 2, 2, 2, 2);
+ rk[1] = r;
+}
+
+static_always_inline void
+aes256_key_expand (u8x16 * rk, u8x16u const *k)
+{
+ rk[0] = k[0];
+ rk[1] = k[1];
+ aes256_key_assist (rk, 2, aes_keygen_assist (rk[1], 0x01));
+ aes256_key_assist (rk, 4, aes_keygen_assist (rk[3], 0x02));
+ aes256_key_assist (rk, 6, aes_keygen_assist (rk[5], 0x04));
+ aes256_key_assist (rk, 8, aes_keygen_assist (rk[7], 0x08));
+ aes256_key_assist (rk, 10, aes_keygen_assist (rk[9], 0x10));
+ aes256_key_assist (rk, 12, aes_keygen_assist (rk[11], 0x20));
+ aes256_key_assist (rk, 14, aes_keygen_assist (rk[13], 0x40));
+}
+#endif
+
+#ifdef __aarch64__
+
+static const u8x16 aese_prep_mask1 =
+ { 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12 };
+static const u8x16 aese_prep_mask2 =
+ { 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 };
+
+static_always_inline void
+aes128_key_expand_round_neon (u8x16 * rk, u32 rcon)
+{
+ u8x16 r, t, last_round = rk[-1], z = { };
+ r = vqtbl1q_u8 (last_round, aese_prep_mask1);
+ r = vaeseq_u8 (r, z);
+ r ^= (u8x16) vdupq_n_u32 (rcon);
+ r ^= last_round;
+ r ^= t = vextq_u8 (z, last_round, 12);
+ r ^= t = vextq_u8 (z, t, 12);
+ r ^= vextq_u8 (z, t, 12);
+ rk[0] = r;
+}
+
+static_always_inline void
+aes128_key_expand (u8x16 *rk, u8x16u const *k)
+{
+ rk[0] = k[0];
+ aes128_key_expand_round_neon (rk + 1, 0x01);
+ aes128_key_expand_round_neon (rk + 2, 0x02);
+ aes128_key_expand_round_neon (rk + 3, 0x04);
+ aes128_key_expand_round_neon (rk + 4, 0x08);
+ aes128_key_expand_round_neon (rk + 5, 0x10);
+ aes128_key_expand_round_neon (rk + 6, 0x20);
+ aes128_key_expand_round_neon (rk + 7, 0x40);
+ aes128_key_expand_round_neon (rk + 8, 0x80);
+ aes128_key_expand_round_neon (rk + 9, 0x1b);
+ aes128_key_expand_round_neon (rk + 10, 0x36);
+}
+
+static_always_inline void
+aes192_key_expand_round_neon (u8x8 * rk, u32 rcon)
+{
+ u8x8 r, last_round = rk[-1], z = { };
+ u8x16 r2, z2 = { };
+
+ r2 = (u8x16) vdupq_lane_u64 ((uint64x1_t) last_round, 0);
+ r2 = vqtbl1q_u8 (r2, aese_prep_mask1);
+ r2 = vaeseq_u8 (r2, z2);
+ r2 ^= (u8x16) vdupq_n_u32 (rcon);
+
+ r = (u8x8) vdup_laneq_u64 ((u64x2) r2, 0);
+ r ^= rk[-3];
+ r ^= vext_u8 (z, rk[-3], 4);
+ rk[0] = r;
+
+ r = rk[-2] ^ vext_u8 (r, z, 4);
+ r ^= vext_u8 (z, r, 4);
+ rk[1] = r;
+
+ if (rcon == 0x80)
+ return;
+
+ r = rk[-1] ^ vext_u8 (r, z, 4);
+ r ^= vext_u8 (z, r, 4);
+ rk[2] = r;
+}
+
+static_always_inline void
+aes192_key_expand (u8x16 * ek, const u8x16u * k)
+{
+ u8x8 *rk = (u8x8 *) ek;
+ ek[0] = k[0];
+ rk[2] = *(u8x8u *) (k + 1);
+ aes192_key_expand_round_neon (rk + 3, 0x01);
+ aes192_key_expand_round_neon (rk + 6, 0x02);
+ aes192_key_expand_round_neon (rk + 9, 0x04);
+ aes192_key_expand_round_neon (rk + 12, 0x08);
+ aes192_key_expand_round_neon (rk + 15, 0x10);
+ aes192_key_expand_round_neon (rk + 18, 0x20);
+ aes192_key_expand_round_neon (rk + 21, 0x40);
+ aes192_key_expand_round_neon (rk + 24, 0x80);
+}
+
+
+static_always_inline void
+aes256_key_expand_round_neon (u8x16 * rk, u32 rcon)
+{
+ u8x16 r, t, z = { };
+
+ r = vqtbl1q_u8 (rk[-1], rcon ? aese_prep_mask1 : aese_prep_mask2);
+ r = vaeseq_u8 (r, z);
+ if (rcon)
+ r ^= (u8x16) vdupq_n_u32 (rcon);
+ r ^= rk[-2];
+ r ^= t = vextq_u8 (z, rk[-2], 12);
+ r ^= t = vextq_u8 (z, t, 12);
+ r ^= vextq_u8 (z, t, 12);
+ rk[0] = r;
+}
+
+static_always_inline void
+aes256_key_expand (u8x16 *rk, u8x16u const *k)
+{
+ rk[0] = k[0];
+ rk[1] = k[1];
+ aes256_key_expand_round_neon (rk + 2, 0x01);
+ aes256_key_expand_round_neon (rk + 3, 0);
+ aes256_key_expand_round_neon (rk + 4, 0x02);
+ aes256_key_expand_round_neon (rk + 5, 0);
+ aes256_key_expand_round_neon (rk + 6, 0x04);
+ aes256_key_expand_round_neon (rk + 7, 0);
+ aes256_key_expand_round_neon (rk + 8, 0x08);
+ aes256_key_expand_round_neon (rk + 9, 0);
+ aes256_key_expand_round_neon (rk + 10, 0x10);
+ aes256_key_expand_round_neon (rk + 11, 0);
+ aes256_key_expand_round_neon (rk + 12, 0x20);
+ aes256_key_expand_round_neon (rk + 13, 0);
+ aes256_key_expand_round_neon (rk + 14, 0x40);
+}
+
+#endif
+
+static_always_inline void
+aes_key_expand (u8x16 * key_schedule, u8 const *key, aes_key_size_t ks)
+{
+ switch (ks)
+ {
+ case AES_KEY_128:
+ aes128_key_expand (key_schedule, (u8x16u const *) key);
+ break;
+ case AES_KEY_192:
+ aes192_key_expand (key_schedule, (u8x16u const *) key);
+ break;
+ case AES_KEY_256:
+ aes256_key_expand (key_schedule, (u8x16u const *) key);
+ break;
+ }
+}
+
+static_always_inline void
+aes_key_enc_to_dec (u8x16 * ke, u8x16 * kd, aes_key_size_t ks)
+{
+ int rounds = AES_KEY_ROUNDS (ks);
+
+ kd[rounds] = ke[0];
+ kd[0] = ke[rounds];
+
+ for (int i = 1; i < (rounds / 2); i++)
+ {
+ kd[rounds - i] = aes_inv_mix_column (ke[i]);
+ kd[i] = aes_inv_mix_column (ke[rounds - i]);
+ }
+
+ kd[rounds / 2] = aes_inv_mix_column (ke[rounds / 2]);
+}
+#if defined(__VAES__) && defined(__AVX512F__)
+#define N_AES_LANES 4
+#define aes_load_partial(p, n) u8x64_load_partial ((u8 *) (p), n)
+#define aes_store_partial(v, p, n) u8x64_store_partial (v, (u8 *) (p), n)
+#define aes_reflect(r) u8x64_reflect_u8x16 (r)
+typedef u8x64 aes_data_t;
+typedef u8x64u aes_mem_t;
+typedef u32x16 aes_counter_t;
+#elif defined(__VAES__)
+#define N_AES_LANES 2
+#define aes_load_partial(p, n) u8x32_load_partial ((u8 *) (p), n)
+#define aes_store_partial(v, p, n) u8x32_store_partial (v, (u8 *) (p), n)
+#define aes_reflect(r) u8x32_reflect_u8x16 (r)
+typedef u8x32 aes_data_t;
+typedef u8x32u aes_mem_t;
+typedef u32x8 aes_counter_t;
+#else
+#define N_AES_LANES 1
+#define aes_load_partial(p, n) u8x16_load_partial ((u8 *) (p), n)
+#define aes_store_partial(v, p, n) u8x16_store_partial (v, (u8 *) (p), n)
+#define aes_reflect(r) u8x16_reflect (r)
+typedef u8x16 aes_data_t;
+typedef u8x16u aes_mem_t;
+typedef u32x4 aes_counter_t;
+#endif
+
+#define N_AES_BYTES (N_AES_LANES * 16)
+
+typedef union
+{
+ u8x16 x1;
+ u8x32 x2;
+ u8x64 x4;
+ u8x16 lanes[4];
+} aes_expaned_key_t;
+
+static_always_inline void
+aes_enc_round (aes_data_t *r, const aes_expaned_key_t *ek, uword n_blocks)
+{
+ for (int i = 0; i < n_blocks; i++)
+#if N_AES_LANES == 4
+ r[i] = aes_enc_round_x4 (r[i], ek->x4);
+#elif N_AES_LANES == 2
+ r[i] = aes_enc_round_x2 (r[i], ek->x2);
+#else
+ r[i] = aes_enc_round_x1 (r[i], ek->x1);
+#endif
+}
+
+static_always_inline void
+aes_enc_last_round (aes_data_t *r, aes_data_t *d, const aes_expaned_key_t *ek,
+ uword n_blocks)
+{
+ for (int i = 0; i < n_blocks; i++)
+#if N_AES_LANES == 4
+ d[i] ^= r[i] = aes_enc_last_round_x4 (r[i], ek->x4);
+#elif N_AES_LANES == 2
+ d[i] ^= r[i] = aes_enc_last_round_x2 (r[i], ek->x2);
+#else
+ d[i] ^= r[i] = aes_enc_last_round_x1 (r[i], ek->x1);
+#endif
+}
+
+#endif /* __aes_h__ */
diff --git a/src/vppinfra/crypto/aes_cbc.h b/src/vppinfra/crypto/aes_cbc.h
new file mode 100644
index 00000000000..cb3d0784051
--- /dev/null
+++ b/src/vppinfra/crypto/aes_cbc.h
@@ -0,0 +1,542 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2023 Cisco Systems, Inc.
+ */
+
+#ifndef __crypto_aes_cbc_h__
+#define __crypto_aes_cbc_h__
+
+#include <vppinfra/clib.h>
+#include <vppinfra/vector.h>
+#include <vppinfra/crypto/aes.h>
+
+typedef struct
+{
+ const u8x16 encrypt_key[15];
+ const u8x16 decrypt_key[15];
+} aes_cbc_key_data_t;
+
+static_always_inline void
+clib_aes_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *src, uword len,
+ const u8 *iv, aes_key_size_t ks, u8 *dst)
+{
+ int rounds = AES_KEY_ROUNDS (ks);
+ u8x16 r, *k = (u8x16 *) kd->encrypt_key;
+
+ r = *(u8x16u *) iv;
+
+ for (int i = 0; i < len; i += 16)
+ {
+ int j;
+ r = u8x16_xor3 (r, *(u8x16u *) (src + i), k[0]);
+ for (j = 1; j < rounds; j++)
+ r = aes_enc_round_x1 (r, k[j]);
+ r = aes_enc_last_round_x1 (r, k[rounds]);
+ *(u8x16u *) (dst + i) = r;
+ }
+}
+
+static_always_inline void
+clib_aes128_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext,
+ uword len, const u8 *iv, u8 *ciphertext)
+{
+ clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_128, ciphertext);
+}
+
+static_always_inline void
+clib_aes192_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext,
+ uword len, const u8 *iv, u8 *ciphertext)
+{
+ clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_192, ciphertext);
+}
+
+static_always_inline void
+clib_aes256_cbc_encrypt (const aes_cbc_key_data_t *kd, const u8 *plaintext,
+ uword len, const u8 *iv, u8 *ciphertext)
+{
+ clib_aes_cbc_encrypt (kd, plaintext, len, iv, AES_KEY_256, ciphertext);
+}
+
+static_always_inline void __clib_unused
+aes_cbc_dec (const u8x16 *k, u8x16u *src, u8x16u *dst, u8x16u *iv, int count,
+ int rounds)
+{
+ u8x16 r[4], c[4], f;
+
+ f = iv[0];
+ while (count >= 64)
+ {
+ c[0] = r[0] = src[0];
+ c[1] = r[1] = src[1];
+ c[2] = r[2] = src[2];
+ c[3] = r[3] = src[3];
+
+#if __x86_64__
+ r[0] ^= k[0];
+ r[1] ^= k[0];
+ r[2] ^= k[0];
+ r[3] ^= k[0];
+
+ for (int i = 1; i < rounds; i++)
+ {
+ r[0] = aes_dec_round_x1 (r[0], k[i]);
+ r[1] = aes_dec_round_x1 (r[1], k[i]);
+ r[2] = aes_dec_round_x1 (r[2], k[i]);
+ r[3] = aes_dec_round_x1 (r[3], k[i]);
+ }
+
+ r[0] = aes_dec_last_round_x1 (r[0], k[rounds]);
+ r[1] = aes_dec_last_round_x1 (r[1], k[rounds]);
+ r[2] = aes_dec_last_round_x1 (r[2], k[rounds]);
+ r[3] = aes_dec_last_round_x1 (r[3], k[rounds]);
+#else
+ for (int i = 0; i < rounds - 1; i++)
+ {
+ r[0] = vaesimcq_u8 (vaesdq_u8 (r[0], k[i]));
+ r[1] = vaesimcq_u8 (vaesdq_u8 (r[1], k[i]));
+ r[2] = vaesimcq_u8 (vaesdq_u8 (r[2], k[i]));
+ r[3] = vaesimcq_u8 (vaesdq_u8 (r[3], k[i]));
+ }
+ r[0] = vaesdq_u8 (r[0], k[rounds - 1]) ^ k[rounds];
+ r[1] = vaesdq_u8 (r[1], k[rounds - 1]) ^ k[rounds];
+ r[2] = vaesdq_u8 (r[2], k[rounds - 1]) ^ k[rounds];
+ r[3] = vaesdq_u8 (r[3], k[rounds - 1]) ^ k[rounds];
+#endif
+ dst[0] = r[0] ^ f;
+ dst[1] = r[1] ^ c[0];
+ dst[2] = r[2] ^ c[1];
+ dst[3] = r[3] ^ c[2];
+ f = c[3];
+
+ count -= 64;
+ src += 4;
+ dst += 4;
+ }
+
+ while (count > 0)
+ {
+ c[0] = r[0] = src[0];
+#if __x86_64__
+ r[0] ^= k[0];
+ for (int i = 1; i < rounds; i++)
+ r[0] = aes_dec_round_x1 (r[0], k[i]);
+ r[0] = aes_dec_last_round_x1 (r[0], k[rounds]);
+#else
+ c[0] = r[0] = src[0];
+ for (int i = 0; i < rounds - 1; i++)
+ r[0] = vaesimcq_u8 (vaesdq_u8 (r[0], k[i]));
+ r[0] = vaesdq_u8 (r[0], k[rounds - 1]) ^ k[rounds];
+#endif
+ dst[0] = r[0] ^ f;
+ f = c[0];
+
+ count -= 16;
+ src += 1;
+ dst += 1;
+ }
+}
+
+#if __x86_64__
+#if defined(__VAES__) && defined(__AVX512F__)
+
+static_always_inline u8x64
+aes_block_load_x4 (u8 *src[], int i)
+{
+ u8x64 r = {};
+ r = u8x64_insert_u8x16 (r, aes_block_load (src[0] + i), 0);
+ r = u8x64_insert_u8x16 (r, aes_block_load (src[1] + i), 1);
+ r = u8x64_insert_u8x16 (r, aes_block_load (src[2] + i), 2);
+ r = u8x64_insert_u8x16 (r, aes_block_load (src[3] + i), 3);
+ return r;
+}
+
+static_always_inline void
+aes_block_store_x4 (u8 *dst[], int i, u8x64 r)
+{
+ aes_block_store (dst[0] + i, u8x64_extract_u8x16 (r, 0));
+ aes_block_store (dst[1] + i, u8x64_extract_u8x16 (r, 1));
+ aes_block_store (dst[2] + i, u8x64_extract_u8x16 (r, 2));
+ aes_block_store (dst[3] + i, u8x64_extract_u8x16 (r, 3));
+}
+
+static_always_inline u8x64
+aes4_cbc_dec_permute (u8x64 a, u8x64 b)
+{
+ return (u8x64) u64x8_shuffle2 (a, b, 6, 7, 8, 9, 10, 11, 12, 13);
+}
+
+static_always_inline void
+aes4_cbc_dec (const u8x16 *k, u8x64u *src, u8x64u *dst, u8x16u *iv, int count,
+ aes_key_size_t rounds)
+{
+ u8x64 f, k4, r[4], c[4] = {};
+ __mmask8 m;
+ int i, n_blocks = count >> 4;
+
+ f = u8x64_insert_u8x16 (u8x64_zero (), *iv, 3);
+
+ while (n_blocks >= 16)
+ {
+ k4 = u8x64_splat_u8x16 (k[0]);
+ c[0] = src[0];
+ c[1] = src[1];
+ c[2] = src[2];
+ c[3] = src[3];
+
+ r[0] = c[0] ^ k4;
+ r[1] = c[1] ^ k4;
+ r[2] = c[2] ^ k4;
+ r[3] = c[3] ^ k4;
+
+ for (i = 1; i < rounds; i++)
+ {
+ k4 = u8x64_splat_u8x16 (k[i]);
+ r[0] = aes_dec_round_x4 (r[0], k4);
+ r[1] = aes_dec_round_x4 (r[1], k4);
+ r[2] = aes_dec_round_x4 (r[2], k4);
+ r[3] = aes_dec_round_x4 (r[3], k4);
+ }
+
+ k4 = u8x64_splat_u8x16 (k[i]);
+ r[0] = aes_dec_last_round_x4 (r[0], k4);
+ r[1] = aes_dec_last_round_x4 (r[1], k4);
+ r[2] = aes_dec_last_round_x4 (r[2], k4);
+ r[3] = aes_dec_last_round_x4 (r[3], k4);
+
+ dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]);
+ dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]);
+ dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]);
+ dst[3] = r[3] ^= aes4_cbc_dec_permute (c[2], c[3]);
+ f = c[3];
+
+ n_blocks -= 16;
+ src += 4;
+ dst += 4;
+ }
+
+ if (n_blocks >= 12)
+ {
+ k4 = u8x64_splat_u8x16 (k[0]);
+ c[0] = src[0];
+ c[1] = src[1];
+ c[2] = src[2];
+
+ r[0] = c[0] ^ k4;
+ r[1] = c[1] ^ k4;
+ r[2] = c[2] ^ k4;
+
+ for (i = 1; i < rounds; i++)
+ {
+ k4 = u8x64_splat_u8x16 (k[i]);
+ r[0] = aes_dec_round_x4 (r[0], k4);
+ r[1] = aes_dec_round_x4 (r[1], k4);
+ r[2] = aes_dec_round_x4 (r[2], k4);
+ }
+
+ k4 = u8x64_splat_u8x16 (k[i]);
+ r[0] = aes_dec_last_round_x4 (r[0], k4);
+ r[1] = aes_dec_last_round_x4 (r[1], k4);
+ r[2] = aes_dec_last_round_x4 (r[2], k4);
+
+ dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]);
+ dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]);
+ dst[2] = r[2] ^= aes4_cbc_dec_permute (c[1], c[2]);
+ f = c[2];
+
+ n_blocks -= 12;
+ src += 3;
+ dst += 3;
+ }
+ else if (n_blocks >= 8)
+ {
+ k4 = u8x64_splat_u8x16 (k[0]);
+ c[0] = src[0];
+ c[1] = src[1];
+
+ r[0] = c[0] ^ k4;
+ r[1] = c[1] ^ k4;
+
+ for (i = 1; i < rounds; i++)
+ {
+ k4 = u8x64_splat_u8x16 (k[i]);
+ r[0] = aes_dec_round_x4 (r[0], k4);
+ r[1] = aes_dec_round_x4 (r[1], k4);
+ }
+
+ k4 = u8x64_splat_u8x16 (k[i]);
+ r[0] = aes_dec_last_round_x4 (r[0], k4);
+ r[1] = aes_dec_last_round_x4 (r[1], k4);
+
+ dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]);
+ dst[1] = r[1] ^= aes4_cbc_dec_permute (c[0], c[1]);
+ f = c[1];
+
+ n_blocks -= 8;
+ src += 2;
+ dst += 2;
+ }
+ else if (n_blocks >= 4)
+ {
+ c[0] = src[0];
+
+ r[0] = c[0] ^ u8x64_splat_u8x16 (k[0]);
+
+ for (i = 1; i < rounds; i++)
+ r[0] = aes_dec_round_x4 (r[0], u8x64_splat_u8x16 (k[i]));
+
+ r[0] = aes_dec_last_round_x4 (r[0], u8x64_splat_u8x16 (k[i]));
+
+ dst[0] = r[0] ^= aes4_cbc_dec_permute (f, c[0]);
+ f = c[0];
+
+ n_blocks -= 4;
+ src += 1;
+ dst += 1;
+ }
+
+ if (n_blocks > 0)
+ {
+ k4 = u8x64_splat_u8x16 (k[0]);
+ m = (1 << (n_blocks * 2)) - 1;
+ c[0] =
+ (u8x64) _mm512_mask_loadu_epi64 ((__m512i) c[0], m, (__m512i *) src);
+ f = aes4_cbc_dec_permute (f, c[0]);
+ r[0] = c[0] ^ k4;
+ for (i = 1; i < rounds; i++)
+ r[0] = aes_dec_round_x4 (r[0], u8x64_splat_u8x16 (k[i]));
+ r[0] = aes_dec_last_round_x4 (r[0], u8x64_splat_u8x16 (k[i]));
+ _mm512_mask_storeu_epi64 ((__m512i *) dst, m, (__m512i) (r[0] ^ f));
+ }
+}
+#elif defined(__VAES__)
+
+static_always_inline u8x32
+aes_block_load_x2 (u8 *src[], int i)
+{
+ u8x32 r = {};
+ r = u8x32_insert_lo (r, aes_block_load (src[0] + i));
+ r = u8x32_insert_hi (r, aes_block_load (src[1] + i));
+ return r;
+}
+
+static_always_inline void
+aes_block_store_x2 (u8 *dst[], int i, u8x32 r)
+{
+ aes_block_store (dst[0] + i, u8x32_extract_lo (r));
+ aes_block_store (dst[1] + i, u8x32_extract_hi (r));
+}
+
+static_always_inline u8x32
+aes2_cbc_dec_permute (u8x32 a, u8x32 b)
+{
+ return (u8x32) u64x4_shuffle2 ((u64x4) a, (u64x4) b, 2, 3, 4, 5);
+}
+
+static_always_inline void
+aes2_cbc_dec (const u8x16 *k, u8x32u *src, u8x32u *dst, u8x16u *iv, int count,
+ aes_key_size_t rounds)
+{
+ u8x32 k2, f = {}, r[4], c[4] = {};
+ int i, n_blocks = count >> 4;
+
+ f = u8x32_insert_hi (f, *iv);
+
+ while (n_blocks >= 8)
+ {
+ k2 = u8x32_splat_u8x16 (k[0]);
+ c[0] = src[0];
+ c[1] = src[1];
+ c[2] = src[2];
+ c[3] = src[3];
+
+ r[0] = c[0] ^ k2;
+ r[1] = c[1] ^ k2;
+ r[2] = c[2] ^ k2;
+ r[3] = c[3] ^ k2;
+
+ for (i = 1; i < rounds; i++)
+ {
+ k2 = u8x32_splat_u8x16 (k[i]);
+ r[0] = aes_dec_round_x2 (r[0], k2);
+ r[1] = aes_dec_round_x2 (r[1], k2);
+ r[2] = aes_dec_round_x2 (r[2], k2);
+ r[3] = aes_dec_round_x2 (r[3], k2);
+ }
+
+ k2 = u8x32_splat_u8x16 (k[i]);
+ r[0] = aes_dec_last_round_x2 (r[0], k2);
+ r[1] = aes_dec_last_round_x2 (r[1], k2);
+ r[2] = aes_dec_last_round_x2 (r[2], k2);
+ r[3] = aes_dec_last_round_x2 (r[3], k2);
+
+ dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
+ dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]);
+ dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]);
+ dst[3] = r[3] ^= aes2_cbc_dec_permute (c[2], c[3]);
+ f = c[3];
+
+ n_blocks -= 8;
+ src += 4;
+ dst += 4;
+ }
+
+ if (n_blocks >= 6)
+ {
+ k2 = u8x32_splat_u8x16 (k[0]);
+ c[0] = src[0];
+ c[1] = src[1];
+ c[2] = src[2];
+
+ r[0] = c[0] ^ k2;
+ r[1] = c[1] ^ k2;
+ r[2] = c[2] ^ k2;
+
+ for (i = 1; i < rounds; i++)
+ {
+ k2 = u8x32_splat_u8x16 (k[i]);
+ r[0] = aes_dec_round_x2 (r[0], k2);
+ r[1] = aes_dec_round_x2 (r[1], k2);
+ r[2] = aes_dec_round_x2 (r[2], k2);
+ }
+
+ k2 = u8x32_splat_u8x16 (k[i]);
+ r[0] = aes_dec_last_round_x2 (r[0], k2);
+ r[1] = aes_dec_last_round_x2 (r[1], k2);
+ r[2] = aes_dec_last_round_x2 (r[2], k2);
+
+ dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
+ dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]);
+ dst[2] = r[2] ^= aes2_cbc_dec_permute (c[1], c[2]);
+ f = c[2];
+
+ n_blocks -= 6;
+ src += 3;
+ dst += 3;
+ }
+ else if (n_blocks >= 4)
+ {
+ k2 = u8x32_splat_u8x16 (k[0]);
+ c[0] = src[0];
+ c[1] = src[1];
+
+ r[0] = c[0] ^ k2;
+ r[1] = c[1] ^ k2;
+
+ for (i = 1; i < rounds; i++)
+ {
+ k2 = u8x32_splat_u8x16 (k[i]);
+ r[0] = aes_dec_round_x2 (r[0], k2);
+ r[1] = aes_dec_round_x2 (r[1], k2);
+ }
+
+ k2 = u8x32_splat_u8x16 (k[i]);
+ r[0] = aes_dec_last_round_x2 (r[0], k2);
+ r[1] = aes_dec_last_round_x2 (r[1], k2);
+
+ dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
+ dst[1] = r[1] ^= aes2_cbc_dec_permute (c[0], c[1]);
+ f = c[1];
+
+ n_blocks -= 4;
+ src += 2;
+ dst += 2;
+ }
+ else if (n_blocks >= 2)
+ {
+ k2 = u8x32_splat_u8x16 (k[0]);
+ c[0] = src[0];
+ r[0] = c[0] ^ k2;
+
+ for (i = 1; i < rounds; i++)
+ r[0] = aes_dec_round_x2 (r[0], u8x32_splat_u8x16 (k[i]));
+
+ r[0] = aes_dec_last_round_x2 (r[0], u8x32_splat_u8x16 (k[i]));
+ dst[0] = r[0] ^= aes2_cbc_dec_permute (f, c[0]);
+ f = c[0];
+
+ n_blocks -= 2;
+ src += 1;
+ dst += 1;
+ }
+
+ if (n_blocks > 0)
+ {
+ u8x16 rl = *(u8x16u *) src ^ k[0];
+ for (i = 1; i < rounds; i++)
+ rl = aes_dec_round_x1 (rl, k[i]);
+ rl = aes_dec_last_round_x1 (rl, k[i]);
+ *(u8x16u *) dst = rl ^ u8x32_extract_hi (f);
+ }
+}
+#endif
+#endif
+
+static_always_inline void
+clib_aes_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key,
+ aes_key_size_t ks)
+{
+ u8x16 e[15], d[15];
+ aes_key_expand (e, key, ks);
+ aes_key_enc_to_dec (e, d, ks);
+ for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++)
+ {
+ ((u8x16 *) kd->decrypt_key)[i] = d[i];
+ ((u8x16 *) kd->encrypt_key)[i] = e[i];
+ }
+}
+
+static_always_inline void
+clib_aes128_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key)
+{
+ clib_aes_cbc_key_expand (kd, key, AES_KEY_128);
+}
+static_always_inline void
+clib_aes192_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key)
+{
+ clib_aes_cbc_key_expand (kd, key, AES_KEY_192);
+}
+static_always_inline void
+clib_aes256_cbc_key_expand (aes_cbc_key_data_t *kd, const u8 *key)
+{
+ clib_aes_cbc_key_expand (kd, key, AES_KEY_256);
+}
+
+static_always_inline void
+clib_aes_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext,
+ uword len, const u8 *iv, aes_key_size_t ks,
+ u8 *plaintext)
+{
+ int rounds = AES_KEY_ROUNDS (ks);
+#if defined(__VAES__) && defined(__AVX512F__)
+ aes4_cbc_dec (kd->decrypt_key, (u8x64u *) ciphertext, (u8x64u *) plaintext,
+ (u8x16u *) iv, (int) len, rounds);
+#elif defined(__VAES__)
+ aes2_cbc_dec (kd->decrypt_key, (u8x32u *) ciphertext, (u8x32u *) plaintext,
+ (u8x16u *) iv, (int) len, rounds);
+#else
+ aes_cbc_dec (kd->decrypt_key, (u8x16u *) ciphertext, (u8x16u *) plaintext,
+ (u8x16u *) iv, (int) len, rounds);
+#endif
+}
+
+static_always_inline void
+clib_aes128_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext,
+ uword len, const u8 *iv, u8 *plaintext)
+{
+ clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_128, plaintext);
+}
+
+static_always_inline void
+clib_aes192_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext,
+ uword len, const u8 *iv, u8 *plaintext)
+{
+ clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_192, plaintext);
+}
+
+static_always_inline void
+clib_aes256_cbc_decrypt (const aes_cbc_key_data_t *kd, const u8 *ciphertext,
+ uword len, const u8 *iv, u8 *plaintext)
+{
+ clib_aes_cbc_decrypt (kd, ciphertext, len, iv, AES_KEY_256, plaintext);
+}
+
+#endif /* __crypto_aes_cbc_h__ */
diff --git a/src/vppinfra/crypto/aes_ctr.h b/src/vppinfra/crypto/aes_ctr.h
new file mode 100644
index 00000000000..74a9f96d90d
--- /dev/null
+++ b/src/vppinfra/crypto/aes_ctr.h
@@ -0,0 +1,190 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2024 Cisco Systems, Inc.
+ */
+
+#ifndef __crypto_aes_ctr_h__
+#define __crypto_aes_ctr_h__
+
+#include <vppinfra/clib.h>
+#include <vppinfra/vector.h>
+#include <vppinfra/cache.h>
+#include <vppinfra/string.h>
+#include <vppinfra/crypto/aes.h>
+
+typedef struct
+{
+ const aes_expaned_key_t exp_key[AES_KEY_ROUNDS (AES_KEY_256) + 1];
+} aes_ctr_key_data_t;
+
+typedef struct
+{
+ const aes_expaned_key_t exp_key[AES_KEY_ROUNDS (AES_KEY_256) + 1];
+ aes_counter_t ctr; /* counter (reflected) */
+ u8 keystream_bytes[N_AES_BYTES]; /* keystream leftovers */
+ u32 n_keystream_bytes; /* number of keystream leftovers */
+} aes_ctr_ctx_t;
+
+static_always_inline aes_counter_t
+aes_ctr_one_block (aes_ctr_ctx_t *ctx, aes_counter_t ctr, const u8 *src,
+ u8 *dst, u32 n_parallel, u32 n_bytes, int rounds, int last)
+{
+ u32 __clib_aligned (N_AES_BYTES)
+ inc[] = { N_AES_LANES, 0, 0, 0, N_AES_LANES, 0, 0, 0,
+ N_AES_LANES, 0, 0, 0, N_AES_LANES, 0, 0, 0 };
+ const aes_expaned_key_t *k = ctx->exp_key;
+ const aes_mem_t *sv = (aes_mem_t *) src;
+ aes_mem_t *dv = (aes_mem_t *) dst;
+ aes_data_t d[4], t[4];
+ u32 r;
+
+ n_bytes -= (n_parallel - 1) * N_AES_BYTES;
+
+ /* AES First Round */
+ for (int i = 0; i < n_parallel; i++)
+ {
+#if N_AES_LANES == 4
+ t[i] = k[0].x4 ^ (u8x64) aes_reflect ((u8x64) ctr);
+#elif N_AES_LANES == 2
+ t[i] = k[0].x2 ^ (u8x32) aes_reflect ((u8x32) ctr);
+#else
+ t[i] = k[0].x1 ^ (u8x16) aes_reflect ((u8x16) ctr);
+#endif
+ ctr += *(aes_counter_t *) inc;
+ }
+
+ /* Load Data */
+ for (int i = 0; i < n_parallel - last; i++)
+ d[i] = sv[i];
+
+ if (last)
+ d[n_parallel - 1] =
+ aes_load_partial ((u8 *) (sv + n_parallel - 1), n_bytes);
+
+ /* AES Intermediate Rounds */
+ for (r = 1; r < rounds; r++)
+ aes_enc_round (t, k + r, n_parallel);
+
+ /* AES Last Round */
+ aes_enc_last_round (t, d, k + r, n_parallel);
+
+ /* Store Data */
+ for (int i = 0; i < n_parallel - last; i++)
+ dv[i] = d[i];
+
+ if (last)
+ {
+ aes_store_partial (d[n_parallel - 1], dv + n_parallel - 1, n_bytes);
+ *(aes_data_t *) ctx->keystream_bytes = t[n_parallel - 1];
+ ctx->n_keystream_bytes = N_AES_BYTES - n_bytes;
+ }
+
+ return ctr;
+}
+
+static_always_inline void
+clib_aes_ctr_init (aes_ctr_ctx_t *ctx, const aes_ctr_key_data_t *kd,
+ const u8 *iv, aes_key_size_t ks)
+{
+ u32x4 ctr = (u32x4) u8x16_reflect (*(u8x16u *) iv);
+#if N_AES_LANES == 4
+ ctx->ctr = (aes_counter_t) u32x16_splat_u32x4 (ctr) +
+ (u32x16){ 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 };
+#elif N_AES_LANES == 2
+ ctx->ctr = (aes_counter_t) u32x8_splat_u32x4 (ctr) +
+ (u32x8){ 0, 0, 0, 0, 1, 0, 0, 0 };
+#else
+ ctx->ctr = ctr;
+#endif
+ for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++)
+ ((aes_expaned_key_t *) ctx->exp_key)[i] = kd->exp_key[i];
+ ctx->n_keystream_bytes = 0;
+}
+
+static_always_inline void
+clib_aes_ctr_transform (aes_ctr_ctx_t *ctx, const u8 *src, u8 *dst,
+ u32 n_bytes, aes_key_size_t ks)
+{
+ int r = AES_KEY_ROUNDS (ks);
+ aes_counter_t ctr = ctx->ctr;
+
+ if (ctx->n_keystream_bytes)
+ {
+ u8 *ks = ctx->keystream_bytes + N_AES_BYTES - ctx->n_keystream_bytes;
+
+ if (ctx->n_keystream_bytes >= n_bytes)
+ {
+ for (int i = 0; i < n_bytes; i++)
+ dst[i] = src[i] ^ ks[i];
+ ctx->n_keystream_bytes -= n_bytes;
+ return;
+ }
+
+ for (int i = 0; i < ctx->n_keystream_bytes; i++)
+ dst++[0] = src++[0] ^ ks[i];
+
+ n_bytes -= ctx->n_keystream_bytes;
+ ctx->n_keystream_bytes = 0;
+ }
+
+ /* main loop */
+ for (int n = 4 * N_AES_BYTES; n_bytes >= n; n_bytes -= n, dst += n, src += n)
+ ctr = aes_ctr_one_block (ctx, ctr, src, dst, 4, n, r, 0);
+
+ if (n_bytes)
+ {
+ if (n_bytes > 3 * N_AES_BYTES)
+ ctr = aes_ctr_one_block (ctx, ctr, src, dst, 4, n_bytes, r, 1);
+ else if (n_bytes > 2 * N_AES_BYTES)
+ ctr = aes_ctr_one_block (ctx, ctr, src, dst, 3, n_bytes, r, 1);
+ else if (n_bytes > N_AES_BYTES)
+ ctr = aes_ctr_one_block (ctx, ctr, src, dst, 2, n_bytes, r, 1);
+ else
+ ctr = aes_ctr_one_block (ctx, ctr, src, dst, 1, n_bytes, r, 1);
+ }
+ else
+ ctx->n_keystream_bytes = 0;
+
+ ctx->ctr = ctr;
+}
+
+static_always_inline void
+clib_aes_ctr_key_expand (aes_ctr_key_data_t *kd, const u8 *key,
+ aes_key_size_t ks)
+{
+ u8x16 ek[AES_KEY_ROUNDS (AES_KEY_256) + 1];
+ aes_expaned_key_t *k = (aes_expaned_key_t *) kd->exp_key;
+
+ /* expand AES key */
+ aes_key_expand (ek, key, ks);
+ for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++)
+ k[i].lanes[0] = k[i].lanes[1] = k[i].lanes[2] = k[i].lanes[3] = ek[i];
+}
+
+static_always_inline void
+clib_aes128_ctr (const aes_ctr_key_data_t *kd, const u8 *src, u32 n_bytes,
+ const u8 *iv, u8 *dst)
+{
+ aes_ctr_ctx_t ctx;
+ clib_aes_ctr_init (&ctx, kd, iv, AES_KEY_128);
+ clib_aes_ctr_transform (&ctx, src, dst, n_bytes, AES_KEY_128);
+}
+
+static_always_inline void
+clib_aes192_ctr (const aes_ctr_key_data_t *kd, const u8 *src, u32 n_bytes,
+ const u8 *iv, u8 *dst)
+{
+ aes_ctr_ctx_t ctx;
+ clib_aes_ctr_init (&ctx, kd, iv, AES_KEY_192);
+ clib_aes_ctr_transform (&ctx, src, dst, n_bytes, AES_KEY_192);
+}
+
+static_always_inline void
+clib_aes256_ctr (const aes_ctr_key_data_t *kd, const u8 *src, u32 n_bytes,
+ const u8 *iv, u8 *dst)
+{
+ aes_ctr_ctx_t ctx;
+ clib_aes_ctr_init (&ctx, kd, iv, AES_KEY_256);
+ clib_aes_ctr_transform (&ctx, src, dst, n_bytes, AES_KEY_256);
+}
+
+#endif /* __crypto_aes_ctr_h__ */
diff --git a/src/vppinfra/crypto/aes_gcm.h b/src/vppinfra/crypto/aes_gcm.h
new file mode 100644
index 00000000000..5b628c87745
--- /dev/null
+++ b/src/vppinfra/crypto/aes_gcm.h
@@ -0,0 +1,944 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2023 Cisco Systems, Inc.
+ */
+
+#ifndef __crypto_aes_gcm_h__
+#define __crypto_aes_gcm_h__
+
+#include <vppinfra/clib.h>
+#include <vppinfra/vector.h>
+#include <vppinfra/cache.h>
+#include <vppinfra/string.h>
+#include <vppinfra/crypto/aes.h>
+#include <vppinfra/crypto/ghash.h>
+
+#define NUM_HI 36
+#if N_AES_LANES == 4
+typedef u8x64u aes_ghash_t;
+#define aes_gcm_splat(v) u8x64_splat (v)
+#define aes_gcm_ghash_reduce(c) ghash4_reduce (&(c)->gd)
+#define aes_gcm_ghash_reduce2(c) ghash4_reduce2 (&(c)->gd)
+#define aes_gcm_ghash_final(c) (c)->T = ghash4_final (&(c)->gd)
+#elif N_AES_LANES == 2
+typedef u8x32u aes_ghash_t;
+#define aes_gcm_splat(v) u8x32_splat (v)
+#define aes_gcm_ghash_reduce(c) ghash2_reduce (&(c)->gd)
+#define aes_gcm_ghash_reduce2(c) ghash2_reduce2 (&(c)->gd)
+#define aes_gcm_ghash_final(c) (c)->T = ghash2_final (&(c)->gd)
+#else
+typedef u8x16 aes_ghash_t;
+#define aes_gcm_splat(v) u8x16_splat (v)
+#define aes_gcm_ghash_reduce(c) ghash_reduce (&(c)->gd)
+#define aes_gcm_ghash_reduce2(c) ghash_reduce2 (&(c)->gd)
+#define aes_gcm_ghash_final(c) (c)->T = ghash_final (&(c)->gd)
+#endif
+
+typedef enum
+{
+ AES_GCM_OP_UNKNONW = 0,
+ AES_GCM_OP_ENCRYPT,
+ AES_GCM_OP_DECRYPT,
+ AES_GCM_OP_GMAC
+} aes_gcm_op_t;
+
+typedef struct
+{
+ /* pre-calculated hash key values */
+ const u8x16 Hi[NUM_HI];
+ /* extracted AES key */
+ const aes_expaned_key_t Ke[AES_KEY_ROUNDS (AES_KEY_256) + 1];
+} aes_gcm_key_data_t;
+
+typedef struct
+{
+ aes_gcm_op_t operation;
+ int last;
+ u8 rounds;
+ uword data_bytes;
+ uword aad_bytes;
+
+ u8x16 T;
+
+ /* hash */
+ const u8x16 *Hi;
+ const aes_ghash_t *next_Hi;
+
+ /* expaded keys */
+ const aes_expaned_key_t *Ke;
+
+ /* counter */
+ u32 counter;
+ u8x16 EY0;
+ aes_counter_t Y;
+
+ /* ghash */
+ ghash_ctx_t gd;
+} aes_gcm_ctx_t;
+
+static_always_inline u8x16
+aes_gcm_final_block (aes_gcm_ctx_t *ctx)
+{
+ return (u8x16) ((u64x2){ ctx->data_bytes, ctx->aad_bytes } << 3);
+}
+
+static_always_inline void
+aes_gcm_ghash_mul_first (aes_gcm_ctx_t *ctx, aes_data_t data, u32 n_lanes)
+{
+ uword hash_offset = NUM_HI - n_lanes;
+ ctx->next_Hi = (aes_ghash_t *) (ctx->Hi + hash_offset);
+#if N_AES_LANES == 4
+ u8x64 tag4 = {};
+ tag4 = u8x64_insert_u8x16 (tag4, ctx->T, 0);
+ ghash4_mul_first (&ctx->gd, aes_reflect (data) ^ tag4, *ctx->next_Hi++);
+#elif N_AES_LANES == 2
+ u8x32 tag2 = {};
+ tag2 = u8x32_insert_lo (tag2, ctx->T);
+ ghash2_mul_first (&ctx->gd, aes_reflect (data) ^ tag2, *ctx->next_Hi++);
+#else
+ ghash_mul_first (&ctx->gd, aes_reflect (data) ^ ctx->T, *ctx->next_Hi++);
+#endif
+}
+
+static_always_inline void
+aes_gcm_ghash_mul_next (aes_gcm_ctx_t *ctx, aes_data_t data)
+{
+#if N_AES_LANES == 4
+ ghash4_mul_next (&ctx->gd, aes_reflect (data), *ctx->next_Hi++);
+#elif N_AES_LANES == 2
+ ghash2_mul_next (&ctx->gd, aes_reflect (data), *ctx->next_Hi++);
+#else
+ ghash_mul_next (&ctx->gd, aes_reflect (data), *ctx->next_Hi++);
+#endif
+}
+
+static_always_inline void
+aes_gcm_ghash_mul_final_block (aes_gcm_ctx_t *ctx)
+{
+#if N_AES_LANES == 4
+ u8x64 h = u8x64_insert_u8x16 (u8x64_zero (), ctx->Hi[NUM_HI - 1], 0);
+ u8x64 r4 = u8x64_insert_u8x16 (u8x64_zero (), aes_gcm_final_block (ctx), 0);
+ ghash4_mul_next (&ctx->gd, r4, h);
+#elif N_AES_LANES == 2
+ u8x32 h = u8x32_insert_lo (u8x32_zero (), ctx->Hi[NUM_HI - 1]);
+ u8x32 r2 = u8x32_insert_lo (u8x32_zero (), aes_gcm_final_block (ctx));
+ ghash2_mul_next (&ctx->gd, r2, h);
+#else
+ ghash_mul_next (&ctx->gd, aes_gcm_final_block (ctx), ctx->Hi[NUM_HI - 1]);
+#endif
+}
+
+static_always_inline void
+aes_gcm_enc_ctr0_round (aes_gcm_ctx_t *ctx, int aes_round)
+{
+ if (aes_round == 0)
+ ctx->EY0 ^= ctx->Ke[0].x1;
+ else if (aes_round == ctx->rounds)
+ ctx->EY0 = aes_enc_last_round_x1 (ctx->EY0, ctx->Ke[aes_round].x1);
+ else
+ ctx->EY0 = aes_enc_round_x1 (ctx->EY0, ctx->Ke[aes_round].x1);
+}
+
+static_always_inline void
+aes_gcm_ghash (aes_gcm_ctx_t *ctx, u8 *data, u32 n_left)
+{
+ uword i;
+ aes_data_t r = {};
+ const aes_mem_t *d = (aes_mem_t *) data;
+
+ for (int n = 8 * N_AES_BYTES; n_left >= n; n_left -= n, d += 8)
+ {
+ if (ctx->operation == AES_GCM_OP_GMAC && n_left == n)
+ {
+ aes_gcm_ghash_mul_first (ctx, d[0], 8 * N_AES_LANES + 1);
+ for (i = 1; i < 8; i++)
+ aes_gcm_ghash_mul_next (ctx, d[i]);
+ aes_gcm_ghash_mul_final_block (ctx);
+ aes_gcm_ghash_reduce (ctx);
+ aes_gcm_ghash_reduce2 (ctx);
+ aes_gcm_ghash_final (ctx);
+ goto done;
+ }
+
+ aes_gcm_ghash_mul_first (ctx, d[0], 8 * N_AES_LANES);
+ for (i = 1; i < 8; i++)
+ aes_gcm_ghash_mul_next (ctx, d[i]);
+ aes_gcm_ghash_reduce (ctx);
+ aes_gcm_ghash_reduce2 (ctx);
+ aes_gcm_ghash_final (ctx);
+ }
+
+ if (n_left > 0)
+ {
+ int n_lanes = (n_left + 15) / 16;
+
+ if (ctx->operation == AES_GCM_OP_GMAC)
+ n_lanes++;
+
+ if (n_left < N_AES_BYTES)
+ {
+ clib_memcpy_fast (&r, d, n_left);
+ aes_gcm_ghash_mul_first (ctx, r, n_lanes);
+ }
+ else
+ {
+ aes_gcm_ghash_mul_first (ctx, d[0], n_lanes);
+ n_left -= N_AES_BYTES;
+ i = 1;
+
+ if (n_left >= 4 * N_AES_BYTES)
+ {
+ aes_gcm_ghash_mul_next (ctx, d[i]);
+ aes_gcm_ghash_mul_next (ctx, d[i + 1]);
+ aes_gcm_ghash_mul_next (ctx, d[i + 2]);
+ aes_gcm_ghash_mul_next (ctx, d[i + 3]);
+ n_left -= 4 * N_AES_BYTES;
+ i += 4;
+ }
+ if (n_left >= 2 * N_AES_BYTES)
+ {
+ aes_gcm_ghash_mul_next (ctx, d[i]);
+ aes_gcm_ghash_mul_next (ctx, d[i + 1]);
+ n_left -= 2 * N_AES_BYTES;
+ i += 2;
+ }
+
+ if (n_left >= N_AES_BYTES)
+ {
+ aes_gcm_ghash_mul_next (ctx, d[i]);
+ n_left -= N_AES_BYTES;
+ i += 1;
+ }
+
+ if (n_left)
+ {
+ clib_memcpy_fast (&r, d + i, n_left);
+ aes_gcm_ghash_mul_next (ctx, r);
+ }
+ }
+
+ if (ctx->operation == AES_GCM_OP_GMAC)
+ aes_gcm_ghash_mul_final_block (ctx);
+ aes_gcm_ghash_reduce (ctx);
+ aes_gcm_ghash_reduce2 (ctx);
+ aes_gcm_ghash_final (ctx);
+ }
+ else if (ctx->operation == AES_GCM_OP_GMAC)
+ ctx->T =
+ ghash_mul (aes_gcm_final_block (ctx) ^ ctx->T, ctx->Hi[NUM_HI - 1]);
+
+done:
+ /* encrypt counter 0 E(Y0, k) */
+ if (ctx->operation == AES_GCM_OP_GMAC)
+ for (int i = 0; i < ctx->rounds + 1; i += 1)
+ aes_gcm_enc_ctr0_round (ctx, i);
+}
+
+static_always_inline void
+aes_gcm_enc_first_round (aes_gcm_ctx_t *ctx, aes_data_t *r, uword n_blocks)
+{
+ const aes_expaned_key_t Ke0 = ctx->Ke[0];
+ uword i = 0;
+
+ /* As counter is stored in network byte order for performance reasons we
+ are incrementing least significant byte only except in case where we
+ overlow. As we are processing four 128, 256 or 512-blocks in parallel
+ except the last round, overflow can happen only when n_blocks == 4 */
+
+#if N_AES_LANES == 4
+ const u32x16 ctr_inv_4444 = { 0, 0, 0, 4 << 24, 0, 0, 0, 4 << 24,
+ 0, 0, 0, 4 << 24, 0, 0, 0, 4 << 24 };
+
+ const u32x16 ctr_4444 = {
+ 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0,
+ };
+
+ if (n_blocks == 4)
+ for (; i < 2; i++)
+ {
+ r[i] = Ke0.x4 ^ (u8x64) ctx->Y; /* Initial AES round */
+ ctx->Y += ctr_inv_4444;
+ }
+
+ if (n_blocks == 4 && PREDICT_FALSE ((u8) ctx->counter == 242))
+ {
+ u32x16 Yr = (u32x16) aes_reflect ((u8x64) ctx->Y);
+
+ for (; i < n_blocks; i++)
+ {
+ r[i] = Ke0.x4 ^ (u8x64) ctx->Y; /* Initial AES round */
+ Yr += ctr_4444;
+ ctx->Y = (u32x16) aes_reflect ((u8x64) Yr);
+ }
+ }
+ else
+ {
+ for (; i < n_blocks; i++)
+ {
+ r[i] = Ke0.x4 ^ (u8x64) ctx->Y; /* Initial AES round */
+ ctx->Y += ctr_inv_4444;
+ }
+ }
+ ctx->counter += n_blocks * 4;
+#elif N_AES_LANES == 2
+ const u32x8 ctr_inv_22 = { 0, 0, 0, 2 << 24, 0, 0, 0, 2 << 24 };
+ const u32x8 ctr_22 = { 2, 0, 0, 0, 2, 0, 0, 0 };
+
+ if (n_blocks == 4)
+ for (; i < 2; i++)
+ {
+ r[i] = Ke0.x2 ^ (u8x32) ctx->Y; /* Initial AES round */
+ ctx->Y += ctr_inv_22;
+ }
+
+ if (n_blocks == 4 && PREDICT_FALSE ((u8) ctx->counter == 250))
+ {
+ u32x8 Yr = (u32x8) aes_reflect ((u8x32) ctx->Y);
+
+ for (; i < n_blocks; i++)
+ {
+ r[i] = Ke0.x2 ^ (u8x32) ctx->Y; /* Initial AES round */
+ Yr += ctr_22;
+ ctx->Y = (u32x8) aes_reflect ((u8x32) Yr);
+ }
+ }
+ else
+ {
+ for (; i < n_blocks; i++)
+ {
+ r[i] = Ke0.x2 ^ (u8x32) ctx->Y; /* Initial AES round */
+ ctx->Y += ctr_inv_22;
+ }
+ }
+ ctx->counter += n_blocks * 2;
+#else
+ const u32x4 ctr_inv_1 = { 0, 0, 0, 1 << 24 };
+
+ if (PREDICT_TRUE ((u8) ctx->counter < 0xfe) || n_blocks < 3)
+ {
+ for (; i < n_blocks; i++)
+ {
+ r[i] = Ke0.x1 ^ (u8x16) ctx->Y; /* Initial AES round */
+ ctx->Y += ctr_inv_1;
+ }
+ ctx->counter += n_blocks;
+ }
+ else
+ {
+ r[i++] = Ke0.x1 ^ (u8x16) ctx->Y; /* Initial AES round */
+ ctx->Y += ctr_inv_1;
+ ctx->counter += 1;
+
+ for (; i < n_blocks; i++)
+ {
+ r[i] = Ke0.x1 ^ (u8x16) ctx->Y; /* Initial AES round */
+ ctx->counter++;
+ ctx->Y[3] = clib_host_to_net_u32 (ctx->counter);
+ }
+ }
+#endif
+}
+
+static_always_inline void
+aes_gcm_enc_last_round (aes_gcm_ctx_t *ctx, aes_data_t *r, aes_data_t *d,
+ const aes_expaned_key_t *Ke, uword n_blocks)
+{
+ /* additional ronuds for AES-192 and AES-256 */
+ for (int i = 10; i < ctx->rounds; i++)
+ aes_enc_round (r, Ke + i, n_blocks);
+
+ aes_enc_last_round (r, d, Ke + ctx->rounds, n_blocks);
+}
+
+static_always_inline void
+aes_gcm_calc (aes_gcm_ctx_t *ctx, aes_data_t *d, const u8 *src, u8 *dst, u32 n,
+ u32 n_bytes, int with_ghash)
+{
+ const aes_expaned_key_t *k = ctx->Ke;
+ const aes_mem_t *sv = (aes_mem_t *) src;
+ aes_mem_t *dv = (aes_mem_t *) dst;
+ uword ghash_blocks, gc = 1;
+ aes_data_t r[4];
+ u32 i, n_lanes;
+
+ if (ctx->operation == AES_GCM_OP_ENCRYPT)
+ {
+ ghash_blocks = 4;
+ n_lanes = N_AES_LANES * 4;
+ }
+ else
+ {
+ ghash_blocks = n;
+ n_lanes = n * N_AES_LANES;
+#if N_AES_LANES != 1
+ if (ctx->last)
+ n_lanes = (n_bytes + 15) / 16;
+#endif
+ }
+
+ n_bytes -= (n - 1) * N_AES_BYTES;
+
+ /* AES rounds 0 and 1 */
+ aes_gcm_enc_first_round (ctx, r, n);
+ aes_enc_round (r, k + 1, n);
+
+ /* load data - decrypt round */
+ if (ctx->operation == AES_GCM_OP_DECRYPT)
+ {
+ for (i = 0; i < n - ctx->last; i++)
+ d[i] = sv[i];
+
+ if (ctx->last)
+ d[n - 1] = aes_load_partial ((u8 *) (sv + n - 1), n_bytes);
+ }
+
+ /* GHASH multiply block 0 */
+ if (with_ghash)
+ aes_gcm_ghash_mul_first (ctx, d[0], n_lanes);
+
+ /* AES rounds 2 and 3 */
+ aes_enc_round (r, k + 2, n);
+ aes_enc_round (r, k + 3, n);
+
+ /* GHASH multiply block 1 */
+ if (with_ghash && gc++ < ghash_blocks)
+ aes_gcm_ghash_mul_next (ctx, (d[1]));
+
+ /* AES rounds 4 and 5 */
+ aes_enc_round (r, k + 4, n);
+ aes_enc_round (r, k + 5, n);
+
+ /* GHASH multiply block 2 */
+ if (with_ghash && gc++ < ghash_blocks)
+ aes_gcm_ghash_mul_next (ctx, (d[2]));
+
+ /* AES rounds 6 and 7 */
+ aes_enc_round (r, k + 6, n);
+ aes_enc_round (r, k + 7, n);
+
+ /* GHASH multiply block 3 */
+ if (with_ghash && gc++ < ghash_blocks)
+ aes_gcm_ghash_mul_next (ctx, (d[3]));
+
+ /* load 4 blocks of data - decrypt round */
+ if (ctx->operation == AES_GCM_OP_ENCRYPT)
+ {
+ for (i = 0; i < n - ctx->last; i++)
+ d[i] = sv[i];
+
+ if (ctx->last)
+ d[n - 1] = aes_load_partial (sv + n - 1, n_bytes);
+ }
+
+ /* AES rounds 8 and 9 */
+ aes_enc_round (r, k + 8, n);
+ aes_enc_round (r, k + 9, n);
+
+ /* AES last round(s) */
+ aes_gcm_enc_last_round (ctx, r, d, k, n);
+
+ /* store data */
+ for (i = 0; i < n - ctx->last; i++)
+ dv[i] = d[i];
+
+ if (ctx->last)
+ aes_store_partial (d[n - 1], dv + n - 1, n_bytes);
+
+ /* GHASH reduce 1st step */
+ aes_gcm_ghash_reduce (ctx);
+
+ /* GHASH reduce 2nd step */
+ if (with_ghash)
+ aes_gcm_ghash_reduce2 (ctx);
+
+ /* GHASH final step */
+ if (with_ghash)
+ aes_gcm_ghash_final (ctx);
+}
+
+static_always_inline void
+aes_gcm_calc_double (aes_gcm_ctx_t *ctx, aes_data_t *d, const u8 *src, u8 *dst)
+{
+ const aes_expaned_key_t *k = ctx->Ke;
+ const aes_mem_t *sv = (aes_mem_t *) src;
+ aes_mem_t *dv = (aes_mem_t *) dst;
+ aes_data_t r[4];
+
+ /* AES rounds 0 and 1 */
+ aes_gcm_enc_first_round (ctx, r, 4);
+ aes_enc_round (r, k + 1, 4);
+
+ /* load 4 blocks of data - decrypt round */
+ if (ctx->operation == AES_GCM_OP_DECRYPT)
+ for (int i = 0; i < 4; i++)
+ d[i] = sv[i];
+
+ /* GHASH multiply block 0 */
+ aes_gcm_ghash_mul_first (ctx, d[0], N_AES_LANES * 8);
+
+ /* AES rounds 2 and 3 */
+ aes_enc_round (r, k + 2, 4);
+ aes_enc_round (r, k + 3, 4);
+
+ /* GHASH multiply block 1 */
+ aes_gcm_ghash_mul_next (ctx, (d[1]));
+
+ /* AES rounds 4 and 5 */
+ aes_enc_round (r, k + 4, 4);
+ aes_enc_round (r, k + 5, 4);
+
+ /* GHASH multiply block 2 */
+ aes_gcm_ghash_mul_next (ctx, (d[2]));
+
+ /* AES rounds 6 and 7 */
+ aes_enc_round (r, k + 6, 4);
+ aes_enc_round (r, k + 7, 4);
+
+ /* GHASH multiply block 3 */
+ aes_gcm_ghash_mul_next (ctx, (d[3]));
+
+ /* AES rounds 8 and 9 */
+ aes_enc_round (r, k + 8, 4);
+ aes_enc_round (r, k + 9, 4);
+
+ /* load 4 blocks of data - encrypt round */
+ if (ctx->operation == AES_GCM_OP_ENCRYPT)
+ for (int i = 0; i < 4; i++)
+ d[i] = sv[i];
+
+ /* AES last round(s) */
+ aes_gcm_enc_last_round (ctx, r, d, k, 4);
+
+ /* store 4 blocks of data */
+ for (int i = 0; i < 4; i++)
+ dv[i] = d[i];
+
+ /* load next 4 blocks of data data - decrypt round */
+ if (ctx->operation == AES_GCM_OP_DECRYPT)
+ for (int i = 0; i < 4; i++)
+ d[i] = sv[i + 4];
+
+ /* GHASH multiply block 4 */
+ aes_gcm_ghash_mul_next (ctx, (d[0]));
+
+ /* AES rounds 0 and 1 */
+ aes_gcm_enc_first_round (ctx, r, 4);
+ aes_enc_round (r, k + 1, 4);
+
+ /* GHASH multiply block 5 */
+ aes_gcm_ghash_mul_next (ctx, (d[1]));
+
+ /* AES rounds 2 and 3 */
+ aes_enc_round (r, k + 2, 4);
+ aes_enc_round (r, k + 3, 4);
+
+ /* GHASH multiply block 6 */
+ aes_gcm_ghash_mul_next (ctx, (d[2]));
+
+ /* AES rounds 4 and 5 */
+ aes_enc_round (r, k + 4, 4);
+ aes_enc_round (r, k + 5, 4);
+
+ /* GHASH multiply block 7 */
+ aes_gcm_ghash_mul_next (ctx, (d[3]));
+
+ /* AES rounds 6 and 7 */
+ aes_enc_round (r, k + 6, 4);
+ aes_enc_round (r, k + 7, 4);
+
+ /* GHASH reduce 1st step */
+ aes_gcm_ghash_reduce (ctx);
+
+ /* AES rounds 8 and 9 */
+ aes_enc_round (r, k + 8, 4);
+ aes_enc_round (r, k + 9, 4);
+
+ /* GHASH reduce 2nd step */
+ aes_gcm_ghash_reduce2 (ctx);
+
+ /* load 4 blocks of data - encrypt round */
+ if (ctx->operation == AES_GCM_OP_ENCRYPT)
+ for (int i = 0; i < 4; i++)
+ d[i] = sv[i + 4];
+
+ /* AES last round(s) */
+ aes_gcm_enc_last_round (ctx, r, d, k, 4);
+
+ /* store data */
+ for (int i = 0; i < 4; i++)
+ dv[i + 4] = d[i];
+
+ /* GHASH final step */
+ aes_gcm_ghash_final (ctx);
+}
+
+static_always_inline void
+aes_gcm_mask_bytes (aes_data_t *d, uword n_bytes)
+{
+ const union
+ {
+ u8 b[64];
+ aes_data_t r;
+ } scale = {
+ .b = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
+ };
+
+ d[0] &= (aes_gcm_splat (n_bytes) > scale.r);
+}
+
+static_always_inline void
+aes_gcm_calc_last (aes_gcm_ctx_t *ctx, aes_data_t *d, int n_blocks,
+ u32 n_bytes)
+{
+ int n_lanes = (N_AES_LANES == 1 ? n_blocks : (n_bytes + 15) / 16) + 1;
+ n_bytes -= (n_blocks - 1) * N_AES_BYTES;
+ int i;
+
+ aes_gcm_enc_ctr0_round (ctx, 0);
+ aes_gcm_enc_ctr0_round (ctx, 1);
+
+ if (n_bytes != N_AES_BYTES)
+ aes_gcm_mask_bytes (d + n_blocks - 1, n_bytes);
+
+ aes_gcm_ghash_mul_first (ctx, d[0], n_lanes);
+
+ aes_gcm_enc_ctr0_round (ctx, 2);
+ aes_gcm_enc_ctr0_round (ctx, 3);
+
+ if (n_blocks > 1)
+ aes_gcm_ghash_mul_next (ctx, d[1]);
+
+ aes_gcm_enc_ctr0_round (ctx, 4);
+ aes_gcm_enc_ctr0_round (ctx, 5);
+
+ if (n_blocks > 2)
+ aes_gcm_ghash_mul_next (ctx, d[2]);
+
+ aes_gcm_enc_ctr0_round (ctx, 6);
+ aes_gcm_enc_ctr0_round (ctx, 7);
+
+ if (n_blocks > 3)
+ aes_gcm_ghash_mul_next (ctx, d[3]);
+
+ aes_gcm_enc_ctr0_round (ctx, 8);
+ aes_gcm_enc_ctr0_round (ctx, 9);
+
+ aes_gcm_ghash_mul_final_block (ctx);
+ aes_gcm_ghash_reduce (ctx);
+
+ for (i = 10; i < ctx->rounds; i++)
+ aes_gcm_enc_ctr0_round (ctx, i);
+
+ aes_gcm_ghash_reduce2 (ctx);
+
+ aes_gcm_ghash_final (ctx);
+
+ aes_gcm_enc_ctr0_round (ctx, i);
+}
+
+static_always_inline void
+aes_gcm_enc (aes_gcm_ctx_t *ctx, const u8 *src, u8 *dst, u32 n_left)
+{
+ aes_data_t d[4];
+
+ if (PREDICT_FALSE (n_left == 0))
+ {
+ int i;
+ for (i = 0; i < ctx->rounds + 1; i++)
+ aes_gcm_enc_ctr0_round (ctx, i);
+ return;
+ }
+
+ if (n_left < 4 * N_AES_BYTES)
+ {
+ ctx->last = 1;
+ if (n_left > 3 * N_AES_BYTES)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 4, n_left, /* with_ghash */ 0);
+ aes_gcm_calc_last (ctx, d, 4, n_left);
+ }
+ else if (n_left > 2 * N_AES_BYTES)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 3, n_left, /* with_ghash */ 0);
+ aes_gcm_calc_last (ctx, d, 3, n_left);
+ }
+ else if (n_left > N_AES_BYTES)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 2, n_left, /* with_ghash */ 0);
+ aes_gcm_calc_last (ctx, d, 2, n_left);
+ }
+ else
+ {
+ aes_gcm_calc (ctx, d, src, dst, 1, n_left, /* with_ghash */ 0);
+ aes_gcm_calc_last (ctx, d, 1, n_left);
+ }
+ return;
+ }
+
+ aes_gcm_calc (ctx, d, src, dst, 4, 4 * N_AES_BYTES, /* with_ghash */ 0);
+
+ /* next */
+ n_left -= 4 * N_AES_BYTES;
+ dst += 4 * N_AES_BYTES;
+ src += 4 * N_AES_BYTES;
+
+ for (int n = 8 * N_AES_BYTES; n_left >= n; n_left -= n, src += n, dst += n)
+ aes_gcm_calc_double (ctx, d, src, dst);
+
+ if (n_left >= 4 * N_AES_BYTES)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 4, 4 * N_AES_BYTES, /* with_ghash */ 1);
+
+ /* next */
+ n_left -= 4 * N_AES_BYTES;
+ dst += 4 * N_AES_BYTES;
+ src += 4 * N_AES_BYTES;
+ }
+
+ if (n_left == 0)
+ {
+ aes_gcm_calc_last (ctx, d, 4, 4 * N_AES_BYTES);
+ return;
+ }
+
+ ctx->last = 1;
+
+ if (n_left > 3 * N_AES_BYTES)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 4, n_left, /* with_ghash */ 1);
+ aes_gcm_calc_last (ctx, d, 4, n_left);
+ }
+ else if (n_left > 2 * N_AES_BYTES)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 3, n_left, /* with_ghash */ 1);
+ aes_gcm_calc_last (ctx, d, 3, n_left);
+ }
+ else if (n_left > N_AES_BYTES)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 2, n_left, /* with_ghash */ 1);
+ aes_gcm_calc_last (ctx, d, 2, n_left);
+ }
+ else
+ {
+ aes_gcm_calc (ctx, d, src, dst, 1, n_left, /* with_ghash */ 1);
+ aes_gcm_calc_last (ctx, d, 1, n_left);
+ }
+}
+
+static_always_inline void
+aes_gcm_dec (aes_gcm_ctx_t *ctx, const u8 *src, u8 *dst, uword n_left)
+{
+ aes_data_t d[4] = {};
+ ghash_ctx_t gd;
+
+ /* main encryption loop */
+ for (int n = 8 * N_AES_BYTES; n_left >= n; n_left -= n, dst += n, src += n)
+ aes_gcm_calc_double (ctx, d, src, dst);
+
+ if (n_left >= 4 * N_AES_BYTES)
+ {
+ aes_gcm_calc (ctx, d, src, dst, 4, 4 * N_AES_BYTES, /* with_ghash */ 1);
+
+ /* next */
+ n_left -= 4 * N_AES_BYTES;
+ dst += N_AES_BYTES * 4;
+ src += N_AES_BYTES * 4;
+ }
+
+ if (n_left)
+ {
+ ctx->last = 1;
+
+ if (n_left > 3 * N_AES_BYTES)
+ aes_gcm_calc (ctx, d, src, dst, 4, n_left, /* with_ghash */ 1);
+ else if (n_left > 2 * N_AES_BYTES)
+ aes_gcm_calc (ctx, d, src, dst, 3, n_left, /* with_ghash */ 1);
+ else if (n_left > N_AES_BYTES)
+ aes_gcm_calc (ctx, d, src, dst, 2, n_left, /* with_ghash */ 1);
+ else
+ aes_gcm_calc (ctx, d, src, dst, 1, n_left, /* with_ghash */ 1);
+ }
+
+ /* interleaved counter 0 encryption E(Y0, k) and ghash of final GCM
+ * (bit length) block */
+
+ aes_gcm_enc_ctr0_round (ctx, 0);
+ aes_gcm_enc_ctr0_round (ctx, 1);
+
+ ghash_mul_first (&gd, aes_gcm_final_block (ctx) ^ ctx->T,
+ ctx->Hi[NUM_HI - 1]);
+
+ aes_gcm_enc_ctr0_round (ctx, 2);
+ aes_gcm_enc_ctr0_round (ctx, 3);
+
+ ghash_reduce (&gd);
+
+ aes_gcm_enc_ctr0_round (ctx, 4);
+ aes_gcm_enc_ctr0_round (ctx, 5);
+
+ ghash_reduce2 (&gd);
+
+ aes_gcm_enc_ctr0_round (ctx, 6);
+ aes_gcm_enc_ctr0_round (ctx, 7);
+
+ ctx->T = ghash_final (&gd);
+
+ aes_gcm_enc_ctr0_round (ctx, 8);
+ aes_gcm_enc_ctr0_round (ctx, 9);
+
+ for (int i = 10; i < ctx->rounds + 1; i += 1)
+ aes_gcm_enc_ctr0_round (ctx, i);
+}
+
+static_always_inline int
+aes_gcm (const u8 *src, u8 *dst, const u8 *aad, u8 *ivp, u8 *tag,
+ u32 data_bytes, u32 aad_bytes, u8 tag_len,
+ const aes_gcm_key_data_t *kd, int aes_rounds, aes_gcm_op_t op)
+{
+ u8 *addt = (u8 *) aad;
+ u32x4 Y0;
+
+ aes_gcm_ctx_t _ctx = { .counter = 2,
+ .rounds = aes_rounds,
+ .operation = op,
+ .data_bytes = data_bytes,
+ .aad_bytes = aad_bytes,
+ .Ke = kd->Ke,
+ .Hi = kd->Hi },
+ *ctx = &_ctx;
+
+ /* initalize counter */
+ Y0 = (u32x4) (u64x2){ *(u64u *) ivp, 0 };
+ Y0[2] = *(u32u *) (ivp + 8);
+ Y0[3] = 1 << 24;
+ ctx->EY0 = (u8x16) Y0;
+
+#if N_AES_LANES == 4
+ ctx->Y = u32x16_splat_u32x4 (Y0) + (u32x16){
+ 0, 0, 0, 1 << 24, 0, 0, 0, 2 << 24, 0, 0, 0, 3 << 24, 0, 0, 0, 4 << 24,
+ };
+#elif N_AES_LANES == 2
+ ctx->Y =
+ u32x8_splat_u32x4 (Y0) + (u32x8){ 0, 0, 0, 1 << 24, 0, 0, 0, 2 << 24 };
+#else
+ ctx->Y = Y0 + (u32x4){ 0, 0, 0, 1 << 24 };
+#endif
+
+ /* calculate ghash for AAD */
+ aes_gcm_ghash (ctx, addt, aad_bytes);
+
+ /* ghash and encrypt/edcrypt */
+ if (op == AES_GCM_OP_ENCRYPT)
+ aes_gcm_enc (ctx, src, dst, data_bytes);
+ else if (op == AES_GCM_OP_DECRYPT)
+ aes_gcm_dec (ctx, src, dst, data_bytes);
+
+ /* final tag is */
+ ctx->T = u8x16_reflect (ctx->T) ^ ctx->EY0;
+
+ /* tag_len 16 -> 0 */
+ tag_len &= 0xf;
+
+ if (op == AES_GCM_OP_ENCRYPT || op == AES_GCM_OP_GMAC)
+ {
+ /* store tag */
+ if (tag_len)
+ u8x16_store_partial (ctx->T, tag, tag_len);
+ else
+ ((u8x16u *) tag)[0] = ctx->T;
+ }
+ else
+ {
+ /* check tag */
+ if (tag_len)
+ {
+ u16 mask = pow2_mask (tag_len);
+ u8x16 expected = u8x16_load_partial (tag, tag_len);
+ if ((u8x16_msb_mask (expected == ctx->T) & mask) == mask)
+ return 1;
+ }
+ else
+ {
+ if (u8x16_is_equal (ctx->T, *(u8x16u *) tag))
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static_always_inline void
+clib_aes_gcm_key_expand (aes_gcm_key_data_t *kd, const u8 *key,
+ aes_key_size_t ks)
+{
+ u8x16 H;
+ u8x16 ek[AES_KEY_ROUNDS (AES_KEY_256) + 1];
+ aes_expaned_key_t *Ke = (aes_expaned_key_t *) kd->Ke;
+
+ /* expand AES key */
+ aes_key_expand (ek, key, ks);
+ for (int i = 0; i < AES_KEY_ROUNDS (ks) + 1; i++)
+ Ke[i].lanes[0] = Ke[i].lanes[1] = Ke[i].lanes[2] = Ke[i].lanes[3] = ek[i];
+
+ /* pre-calculate H */
+ H = aes_encrypt_block (u8x16_zero (), ek, ks);
+ H = u8x16_reflect (H);
+ ghash_precompute (H, (u8x16 *) kd->Hi, ARRAY_LEN (kd->Hi));
+}
+
+static_always_inline void
+clib_aes128_gcm_enc (const aes_gcm_key_data_t *kd, const u8 *plaintext,
+ u32 data_bytes, const u8 *aad, u32 aad_bytes,
+ const u8 *iv, u32 tag_bytes, u8 *cyphertext, u8 *tag)
+{
+ aes_gcm (plaintext, cyphertext, aad, (u8 *) iv, tag, data_bytes, aad_bytes,
+ tag_bytes, kd, AES_KEY_ROUNDS (AES_KEY_128), AES_GCM_OP_ENCRYPT);
+}
+
+static_always_inline void
+clib_aes256_gcm_enc (const aes_gcm_key_data_t *kd, const u8 *plaintext,
+ u32 data_bytes, const u8 *aad, u32 aad_bytes,
+ const u8 *iv, u32 tag_bytes, u8 *cyphertext, u8 *tag)
+{
+ aes_gcm (plaintext, cyphertext, aad, (u8 *) iv, tag, data_bytes, aad_bytes,
+ tag_bytes, kd, AES_KEY_ROUNDS (AES_KEY_256), AES_GCM_OP_ENCRYPT);
+}
+
+static_always_inline int
+clib_aes128_gcm_dec (const aes_gcm_key_data_t *kd, const u8 *cyphertext,
+ u32 data_bytes, const u8 *aad, u32 aad_bytes,
+ const u8 *iv, const u8 *tag, u32 tag_bytes, u8 *plaintext)
+{
+ return aes_gcm (cyphertext, plaintext, aad, (u8 *) iv, (u8 *) tag,
+ data_bytes, aad_bytes, tag_bytes, kd,
+ AES_KEY_ROUNDS (AES_KEY_128), AES_GCM_OP_DECRYPT);
+}
+
+static_always_inline int
+clib_aes256_gcm_dec (const aes_gcm_key_data_t *kd, const u8 *cyphertext,
+ u32 data_bytes, const u8 *aad, u32 aad_bytes,
+ const u8 *iv, const u8 *tag, u32 tag_bytes, u8 *plaintext)
+{
+ return aes_gcm (cyphertext, plaintext, aad, (u8 *) iv, (u8 *) tag,
+ data_bytes, aad_bytes, tag_bytes, kd,
+ AES_KEY_ROUNDS (AES_KEY_256), AES_GCM_OP_DECRYPT);
+}
+
+static_always_inline void
+clib_aes128_gmac (const aes_gcm_key_data_t *kd, const u8 *data, u32 data_bytes,
+ const u8 *iv, u32 tag_bytes, u8 *tag)
+{
+ aes_gcm (0, 0, data, (u8 *) iv, tag, 0, data_bytes, tag_bytes, kd,
+ AES_KEY_ROUNDS (AES_KEY_128), AES_GCM_OP_GMAC);
+}
+
+static_always_inline void
+clib_aes256_gmac (const aes_gcm_key_data_t *kd, const u8 *data, u32 data_bytes,
+ const u8 *iv, u32 tag_bytes, u8 *tag)
+{
+ aes_gcm (0, 0, data, (u8 *) iv, tag, 0, data_bytes, tag_bytes, kd,
+ AES_KEY_ROUNDS (AES_KEY_256), AES_GCM_OP_GMAC);
+}
+
+#endif /* __crypto_aes_gcm_h__ */
diff --git a/src/vppinfra/crypto/ghash.h b/src/vppinfra/crypto/ghash.h
new file mode 100644
index 00000000000..66e3f6a673a
--- /dev/null
+++ b/src/vppinfra/crypto/ghash.h
@@ -0,0 +1,515 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2019 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+/*
+ *------------------------------------------------------------------
+ * Copyright(c) 2018, Intel Corporation All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES * LOSS OF USE,
+ * DATA, OR PROFITS * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *------------------------------------------------------------------
+ */
+
+/*
+ * Based on work by: Shay Gueron, Michael E. Kounavis, Erdinc Ozturk,
+ * Vinodh Gopal, James Guilford, Tomasz Kantecki
+ *
+ * References:
+ * [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on
+ * Intel Architecture Processors. August, 2010
+ * [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on
+ * Intel Architecture Processors. October, 2012.
+ * [3] intel-ipsec-mb library, https://github.com/01org/intel-ipsec-mb.git
+ *
+ * Definitions:
+ * GF Galois Extension Field GF(2^128) - finite field where elements are
+ * represented as polynomials with coefficients in GF(2) with the
+ * highest degree of 127. Polynomials are represented as 128-bit binary
+ * numbers where each bit represents one coefficient.
+ * e.g. polynomial x^5 + x^3 + x + 1 is represented in binary 101011.
+ * H hash key (128 bit)
+ * POLY irreducible polynomial x^127 + x^7 + x^2 + x + 1
+ * RPOLY irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1
+ * + addition in GF, which equals to XOR operation
+ * * multiplication in GF
+ *
+ * GF multiplication consists of 2 steps:
+ * - carry-less multiplication of two 128-bit operands into 256-bit result
+ * - reduction of 256-bit result into 128-bit with modulo POLY
+ *
+ * GHash is calculated on 128-bit blocks of data according to the following
+ * formula:
+ * GH = (GH + data) * hash_key
+ *
+ * To avoid bit-reflection of data, this code uses GF multipication
+ * with reversed polynomial:
+ * a * b * x^-127 mod RPOLY
+ *
+ * To improve computation speed table Hi is precomputed with powers of H',
+ * where H' is calculated as H<<1 mod RPOLY.
+ * This allows us to improve performance by deferring reduction. For example
+ * to caclulate ghash of 4 128-bit blocks of data (b0, b1, b2, b3), we can do:
+ *
+ * u8x16 Hi[4];
+ * ghash_precompute (H, Hi, 4);
+ *
+ * ghash_ctx_t _gd, *gd = &_gd;
+ * ghash_mul_first (gd, GH ^ b0, Hi[3]);
+ * ghash_mul_next (gd, b1, Hi[2]);
+ * ghash_mul_next (gd, b2, Hi[1]);
+ * ghash_mul_next (gd, b3, Hi[0]);
+ * ghash_reduce (gd);
+ * ghash_reduce2 (gd);
+ * GH = ghash_final (gd);
+ *
+ * Reduction step is split into 3 functions so it can be better interleaved
+ * with other code, (i.e. with AES computation).
+ */
+
+#ifndef __ghash_h__
+#define __ghash_h__
+
+static_always_inline u8x16
+gmul_lo_lo (u8x16 a, u8x16 b)
+{
+#if defined (__PCLMUL__)
+ return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x00);
+#elif defined (__ARM_FEATURE_CRYPTO)
+ return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
+ (poly64_t) vget_low_p64 ((poly64x2_t) b));
+#endif
+}
+
+static_always_inline u8x16
+gmul_hi_lo (u8x16 a, u8x16 b)
+{
+#if defined (__PCLMUL__)
+ return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x01);
+#elif defined (__ARM_FEATURE_CRYPTO)
+ return (u8x16) vmull_p64 ((poly64_t) vget_high_p64 ((poly64x2_t) a),
+ (poly64_t) vget_low_p64 ((poly64x2_t) b));
+#endif
+}
+
+static_always_inline u8x16
+gmul_lo_hi (u8x16 a, u8x16 b)
+{
+#if defined (__PCLMUL__)
+ return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x10);
+#elif defined (__ARM_FEATURE_CRYPTO)
+ return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
+ (poly64_t) vget_high_p64 ((poly64x2_t) b));
+#endif
+}
+
+static_always_inline u8x16
+gmul_hi_hi (u8x16 a, u8x16 b)
+{
+#if defined (__PCLMUL__)
+ return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x11);
+#elif defined (__ARM_FEATURE_CRYPTO)
+ return (u8x16) vmull_high_p64 ((poly64x2_t) a, (poly64x2_t) b);
+#endif
+}
+
+typedef struct
+{
+ u8x16 mid, hi, lo, tmp_lo, tmp_hi;
+ u8x32 hi2, lo2, mid2, tmp_lo2, tmp_hi2;
+ u8x64 hi4, lo4, mid4, tmp_lo4, tmp_hi4;
+ int pending;
+} ghash_ctx_t;
+
+static const u8x16 ghash_poly = {
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
+};
+
+static const u8x16 ghash_poly2 = {
+ 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
+};
+
+static_always_inline void
+ghash_mul_first (ghash_ctx_t *gd, u8x16 a, u8x16 b)
+{
+ /* a1 * b1 */
+ gd->hi = gmul_hi_hi (a, b);
+ /* a0 * b0 */
+ gd->lo = gmul_lo_lo (a, b);
+ /* a0 * b1 ^ a1 * b0 */
+ gd->mid = gmul_hi_lo (a, b) ^ gmul_lo_hi (a, b);
+
+ /* set gd->pending to 0 so next invocation of ghash_mul_next(...) knows that
+ there is no pending data in tmp_lo and tmp_hi */
+ gd->pending = 0;
+}
+
+static_always_inline void
+ghash_mul_next (ghash_ctx_t *gd, u8x16 a, u8x16 b)
+{
+ /* a1 * b1 */
+ u8x16 hi = gmul_hi_hi (a, b);
+ /* a0 * b0 */
+ u8x16 lo = gmul_lo_lo (a, b);
+
+ /* this branch will be optimized out by the compiler, and it allows us to
+ reduce number of XOR operations by using ternary logic */
+ if (gd->pending)
+ {
+ /* there is peding data from previous invocation so we can XOR */
+ gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, hi);
+ gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, lo);
+ gd->pending = 0;
+ }
+ else
+ {
+ /* there is no peding data from previous invocation so we postpone XOR */
+ gd->tmp_hi = hi;
+ gd->tmp_lo = lo;
+ gd->pending = 1;
+ }
+
+ /* gd->mid ^= a0 * b1 ^ a1 * b0 */
+ gd->mid = u8x16_xor3 (gd->mid, gmul_hi_lo (a, b), gmul_lo_hi (a, b));
+}
+
+static_always_inline void
+ghash_reduce (ghash_ctx_t *gd)
+{
+ u8x16 r;
+
+ /* Final combination:
+ gd->lo ^= gd->mid << 64
+ gd->hi ^= gd->mid >> 64 */
+ u8x16 midl = u8x16_word_shift_left (gd->mid, 8);
+ u8x16 midr = u8x16_word_shift_right (gd->mid, 8);
+
+ if (gd->pending)
+ {
+ gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, midl);
+ gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, midr);
+ }
+ else
+ {
+ gd->lo ^= midl;
+ gd->hi ^= midr;
+ }
+ r = gmul_hi_lo (ghash_poly2, gd->lo);
+ gd->lo ^= u8x16_word_shift_left (r, 8);
+}
+
+static_always_inline void
+ghash_reduce2 (ghash_ctx_t *gd)
+{
+ gd->tmp_lo = gmul_lo_lo (ghash_poly2, gd->lo);
+ gd->tmp_hi = gmul_lo_hi (ghash_poly2, gd->lo);
+}
+
+static_always_inline u8x16
+ghash_final (ghash_ctx_t *gd)
+{
+ return u8x16_xor3 (gd->hi, u8x16_word_shift_right (gd->tmp_lo, 4),
+ u8x16_word_shift_left (gd->tmp_hi, 4));
+}
+
+static_always_inline u8x16
+ghash_mul (u8x16 a, u8x16 b)
+{
+ ghash_ctx_t _gd, *gd = &_gd;
+ ghash_mul_first (gd, a, b);
+ ghash_reduce (gd);
+ ghash_reduce2 (gd);
+ return ghash_final (gd);
+}
+
+#if defined(__VPCLMULQDQ__) && defined(__AVX512F__)
+
+static const u8x64 ghash4_poly2 = {
+ 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+ 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+ 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+ 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+};
+
+static_always_inline u8x64
+gmul4_lo_lo (u8x64 a, u8x64 b)
+{
+ return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x00);
+}
+
+static_always_inline u8x64
+gmul4_hi_lo (u8x64 a, u8x64 b)
+{
+ return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x01);
+}
+
+static_always_inline u8x64
+gmul4_lo_hi (u8x64 a, u8x64 b)
+{
+ return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x10);
+}
+
+static_always_inline u8x64
+gmul4_hi_hi (u8x64 a, u8x64 b)
+{
+ return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x11);
+}
+
+static_always_inline void
+ghash4_mul_first (ghash_ctx_t *gd, u8x64 a, u8x64 b)
+{
+ gd->hi4 = gmul4_hi_hi (a, b);
+ gd->lo4 = gmul4_lo_lo (a, b);
+ gd->mid4 = gmul4_hi_lo (a, b) ^ gmul4_lo_hi (a, b);
+ gd->pending = 0;
+}
+
+static_always_inline void
+ghash4_mul_next (ghash_ctx_t *gd, u8x64 a, u8x64 b)
+{
+ u8x64 hi = gmul4_hi_hi (a, b);
+ u8x64 lo = gmul4_lo_lo (a, b);
+
+ if (gd->pending)
+ {
+ /* there is peding data from previous invocation so we can XOR */
+ gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, hi);
+ gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, lo);
+ gd->pending = 0;
+ }
+ else
+ {
+ /* there is no peding data from previous invocation so we postpone XOR */
+ gd->tmp_hi4 = hi;
+ gd->tmp_lo4 = lo;
+ gd->pending = 1;
+ }
+ gd->mid4 = u8x64_xor3 (gd->mid4, gmul4_hi_lo (a, b), gmul4_lo_hi (a, b));
+}
+
+static_always_inline void
+ghash4_reduce (ghash_ctx_t *gd)
+{
+ u8x64 r;
+
+ /* Final combination:
+ gd->lo4 ^= gd->mid4 << 64
+ gd->hi4 ^= gd->mid4 >> 64 */
+
+ u8x64 midl = u8x64_word_shift_left (gd->mid4, 8);
+ u8x64 midr = u8x64_word_shift_right (gd->mid4, 8);
+
+ if (gd->pending)
+ {
+ gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, midl);
+ gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, midr);
+ }
+ else
+ {
+ gd->lo4 ^= midl;
+ gd->hi4 ^= midr;
+ }
+
+ r = gmul4_hi_lo (ghash4_poly2, gd->lo4);
+ gd->lo4 ^= u8x64_word_shift_left (r, 8);
+}
+
+static_always_inline void
+ghash4_reduce2 (ghash_ctx_t *gd)
+{
+ gd->tmp_lo4 = gmul4_lo_lo (ghash4_poly2, gd->lo4);
+ gd->tmp_hi4 = gmul4_lo_hi (ghash4_poly2, gd->lo4);
+}
+
+static_always_inline u8x16
+ghash4_final (ghash_ctx_t *gd)
+{
+ u8x64 r;
+ u8x32 t;
+
+ r = u8x64_xor3 (gd->hi4, u8x64_word_shift_right (gd->tmp_lo4, 4),
+ u8x64_word_shift_left (gd->tmp_hi4, 4));
+
+ /* horizontal XOR of 4 128-bit lanes */
+ t = u8x64_extract_lo (r) ^ u8x64_extract_hi (r);
+ return u8x32_extract_hi (t) ^ u8x32_extract_lo (t);
+}
+#endif
+
+#if defined(__VPCLMULQDQ__)
+
+static const u8x32 ghash2_poly2 = {
+ 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
+};
+
+static_always_inline u8x32
+gmul2_lo_lo (u8x32 a, u8x32 b)
+{
+ return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x00);
+}
+
+static_always_inline u8x32
+gmul2_hi_lo (u8x32 a, u8x32 b)
+{
+ return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x01);
+}
+
+static_always_inline u8x32
+gmul2_lo_hi (u8x32 a, u8x32 b)
+{
+ return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x10);
+}
+
+static_always_inline u8x32
+gmul2_hi_hi (u8x32 a, u8x32 b)
+{
+ return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x11);
+}
+
+static_always_inline void
+ghash2_mul_first (ghash_ctx_t *gd, u8x32 a, u8x32 b)
+{
+ gd->hi2 = gmul2_hi_hi (a, b);
+ gd->lo2 = gmul2_lo_lo (a, b);
+ gd->mid2 = gmul2_hi_lo (a, b) ^ gmul2_lo_hi (a, b);
+ gd->pending = 0;
+}
+
+static_always_inline void
+ghash2_mul_next (ghash_ctx_t *gd, u8x32 a, u8x32 b)
+{
+ u8x32 hi = gmul2_hi_hi (a, b);
+ u8x32 lo = gmul2_lo_lo (a, b);
+
+ if (gd->pending)
+ {
+ /* there is peding data from previous invocation so we can XOR */
+ gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, hi);
+ gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, lo);
+ gd->pending = 0;
+ }
+ else
+ {
+ /* there is no peding data from previous invocation so we postpone XOR */
+ gd->tmp_hi2 = hi;
+ gd->tmp_lo2 = lo;
+ gd->pending = 1;
+ }
+ gd->mid2 = u8x32_xor3 (gd->mid2, gmul2_hi_lo (a, b), gmul2_lo_hi (a, b));
+}
+
+static_always_inline void
+ghash2_reduce (ghash_ctx_t *gd)
+{
+ u8x32 r;
+
+ /* Final combination:
+ gd->lo2 ^= gd->mid2 << 64
+ gd->hi2 ^= gd->mid2 >> 64 */
+
+ u8x32 midl = u8x32_word_shift_left (gd->mid2, 8);
+ u8x32 midr = u8x32_word_shift_right (gd->mid2, 8);
+
+ if (gd->pending)
+ {
+ gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, midl);
+ gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, midr);
+ }
+ else
+ {
+ gd->lo2 ^= midl;
+ gd->hi2 ^= midr;
+ }
+
+ r = gmul2_hi_lo (ghash2_poly2, gd->lo2);
+ gd->lo2 ^= u8x32_word_shift_left (r, 8);
+}
+
+static_always_inline void
+ghash2_reduce2 (ghash_ctx_t *gd)
+{
+ gd->tmp_lo2 = gmul2_lo_lo (ghash2_poly2, gd->lo2);
+ gd->tmp_hi2 = gmul2_lo_hi (ghash2_poly2, gd->lo2);
+}
+
+static_always_inline u8x16
+ghash2_final (ghash_ctx_t *gd)
+{
+ u8x32 r;
+
+ r = u8x32_xor3 (gd->hi2, u8x32_word_shift_right (gd->tmp_lo2, 4),
+ u8x32_word_shift_left (gd->tmp_hi2, 4));
+
+ /* horizontal XOR of 2 128-bit lanes */
+ return u8x32_extract_hi (r) ^ u8x32_extract_lo (r);
+}
+#endif
+
+static_always_inline void
+ghash_precompute (u8x16 H, u8x16 * Hi, int n)
+{
+ u8x16 r8;
+ u32x4 r32;
+ /* calcullate H<<1 mod poly from the hash key */
+ r8 = (u8x16) ((u64x2) H >> 63);
+ H = (u8x16) ((u64x2) H << 1);
+ H |= u8x16_word_shift_left (r8, 8);
+ r32 = (u32x4) u8x16_word_shift_right (r8, 8);
+#ifdef __SSE2__
+ r32 = u32x4_shuffle (r32, 0, 1, 2, 0);
+#else
+ r32[3] = r32[0];
+#endif
+ r32 = r32 == (u32x4) {1, 0, 0, 1};
+ Hi[n - 1] = H = H ^ ((u8x16) r32 & ghash_poly);
+
+ /* calculate H^(i + 1) */
+ for (int i = n - 2; i >= 0; i--)
+ Hi[i] = ghash_mul (H, Hi[i + 1]);
+}
+
+#endif /* __ghash_h__ */
+
diff --git a/src/vppinfra/crypto/poly1305.h b/src/vppinfra/crypto/poly1305.h
new file mode 100644
index 00000000000..cd6ea60cdf7
--- /dev/null
+++ b/src/vppinfra/crypto/poly1305.h
@@ -0,0 +1,234 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2023 Cisco Systems, Inc.
+ */
+
+#ifndef __clib_poly1305_h__
+#define __clib_poly1305_h__
+
+#include <vppinfra/clib.h>
+#include <vppinfra/vector.h>
+#include <vppinfra/cache.h>
+#include <vppinfra/string.h>
+
+/* implementation of DJB's poly1305 using 64-bit arithmetrics */
+
+typedef struct
+{
+ const u64 r[3], s[2];
+ u64 h[3];
+
+ /* partial data */
+ union
+ {
+ u8 as_u8[16];
+ u64 as_u64[2];
+ } partial;
+
+ size_t n_partial_bytes;
+} clib_poly1305_ctx;
+
+static_always_inline void
+clib_poly1305_init (clib_poly1305_ctx *ctx, const u8 key[32])
+{
+ u64u *k = (u64u *) key;
+ u64 *h = (u64 *) ctx->h;
+ u64 *r = (u64 *) ctx->r;
+ u64 *s = (u64 *) ctx->s;
+
+ /* initialize accumulator */
+ h[0] = h[1] = h[2] = 0;
+
+ /* clamp 1st half of the key and store it into r[] */
+ r[0] = k[0] & 0x0ffffffc0fffffff;
+ r[1] = k[1] & 0x0ffffffc0ffffffc;
+ s[0] = k[2];
+ s[1] = k[3];
+
+ /* precompute (r[1] >> 2) * 5 */
+ r[2] = r[1] + (r[1] >> 2);
+
+ ctx->n_partial_bytes = 0;
+}
+
+static_always_inline void
+_clib_poly1305_multiply_and_reduce (u64 h[3], const u64 r[3])
+{
+ union
+ {
+ struct
+ {
+ u64 lo, hi;
+ };
+ u128 n;
+ } l0, l1, l2;
+ u64 c;
+
+ /*
+ h2 h1 h0
+ x r1 r0
+ ---------------------------------------
+ r0 x h2 r0 x h1 r0 × h0
+ + r1 x h2 r1 x h1 r1 x h0
+ ---------------------------------------
+
+ for p = 2^130-5, following applies:
+ (r * 2^130) mod p == (r * 5) mod p
+
+ bits above 130 can be shifted right (divided by 2^130)
+ and multiplied by 5 per equation above
+
+ h2 h1 h0
+ x r1 r0
+ ----------------------------------------------
+ r0 x h2 r0 x h1 r0 × h0
+ + r1 x h0
+ + 5x (r1 >>2) x h2 5x (r1 >>2) x h1
+ ----------------------------------------------
+ [0:l2.lo] [l1.hi:l1.lo] [l0.hi:l0.lo]
+ */
+
+ l0.n = l1.n = l2.n = 0;
+ /* u64 x u64 = u128 multiplications */
+ l0.n += (u128) h[0] * r[0];
+ l0.n += (u128) h[1] * r[2]; /* r[2] holds precomputed (r[1] >> 2) * 5 */
+ l1.n += (u128) h[0] * r[1];
+ l1.n += (u128) h[1] * r[0];
+
+ /* u64 x u64 = u64 multiplications, as h[2] may have only lower 2 bits set
+ * and r[1] have clamped bits 60-63 */
+ l1.n += (u128) (h[2] * r[2]);
+ l2.n += (u128) (h[2] * r[0]);
+
+ /* propagate upper 64 bits to higher limb */
+ c = 0;
+ l1.lo = u64_add_with_carry (&c, l1.lo, l0.hi);
+ l2.lo = u64_add_with_carry (&c, l2.lo, l1.hi);
+
+ l2.hi = l2.lo;
+ /* keep bits [128:129] */
+ l2.lo &= 3;
+
+ /* bits 130 and above multiply with 5 and store to l2.hi */
+ l2.hi -= l2.lo;
+ l2.hi += l2.hi >> 2;
+
+ /* add l2.hi to l0.lo with carry propagation and store result to h2:h1:h0 */
+ c = 0;
+ h[0] = u64_add_with_carry (&c, l0.lo, l2.hi);
+ h[1] = u64_add_with_carry (&c, l1.lo, 0);
+ h[2] = u64_add_with_carry (&c, l2.lo, 0);
+}
+
+static_always_inline u32
+_clib_poly1305_add_blocks (clib_poly1305_ctx *ctx, const u8 *msg,
+ uword n_bytes, const u32 bit17)
+{
+ u64 r[3], h[3];
+
+ for (int i = 0; i < 3; i++)
+ {
+ h[i] = ctx->h[i];
+ r[i] = ctx->r[i];
+ }
+
+ for (const u64u *m = (u64u *) msg; n_bytes >= 16; n_bytes -= 16, m += 2)
+ {
+ u64 c = 0;
+
+ /* h += m */
+ h[0] = u64_add_with_carry (&c, h[0], m[0]);
+ h[1] = u64_add_with_carry (&c, h[1], m[1]);
+ h[2] = u64_add_with_carry (&c, h[2], bit17 ? 1 : 0);
+
+ /* h = (h * r) mod p */
+ _clib_poly1305_multiply_and_reduce (h, r);
+ }
+
+ for (int i = 0; i < 3; i++)
+ ctx->h[i] = h[i];
+
+ return n_bytes;
+}
+
+static_always_inline void
+clib_poly1305_update (clib_poly1305_ctx *ctx, const u8 *msg, uword len)
+{
+ uword n_left = len;
+
+ if (n_left == 0)
+ return;
+
+ if (ctx->n_partial_bytes)
+ {
+ u16 missing_bytes = 16 - ctx->n_partial_bytes;
+ if (PREDICT_FALSE (n_left < missing_bytes))
+ {
+ clib_memcpy_fast (ctx->partial.as_u8 + ctx->n_partial_bytes, msg,
+ n_left);
+ ctx->n_partial_bytes += n_left;
+ return;
+ }
+
+ clib_memcpy_fast (ctx->partial.as_u8 + ctx->n_partial_bytes, msg,
+ missing_bytes);
+ _clib_poly1305_add_blocks (ctx, ctx->partial.as_u8, 16, 1);
+ ctx->n_partial_bytes = 0;
+ n_left -= missing_bytes;
+ msg += missing_bytes;
+ }
+
+ n_left = _clib_poly1305_add_blocks (ctx, msg, n_left, 1);
+
+ if (n_left)
+ {
+ ctx->partial.as_u64[0] = ctx->partial.as_u64[1] = 0;
+ clib_memcpy_fast (ctx->partial.as_u8, msg + len - n_left, n_left);
+ ctx->n_partial_bytes = n_left;
+ }
+}
+
+static_always_inline void
+clib_poly1305_final (clib_poly1305_ctx *ctx, u8 *out)
+{
+ const u64 p[] = { 0xFFFFFFFFFFFFFFFB, 0xFFFFFFFFFFFFFFFF, 3 }; /* 2^128-5 */
+ const u64 *s = ctx->s;
+ u64u *t = (u64u *) out;
+ u64 h0, h1, t0, t1;
+ u64 c;
+
+ if (ctx->n_partial_bytes)
+ {
+ ctx->partial.as_u8[ctx->n_partial_bytes] = 1;
+ _clib_poly1305_add_blocks (ctx, ctx->partial.as_u8, 16, 0);
+ }
+
+ h0 = ctx->h[0];
+ h1 = ctx->h[1];
+
+ /* h may not be fully reduced, try to subtract 2^128-5 */
+ c = 0;
+ t0 = u64_sub_with_borrow (&c, h0, p[0]);
+ t1 = u64_sub_with_borrow (&c, h1, p[1]);
+ u64_sub_with_borrow (&c, ctx->h[2], p[2]);
+
+ if (!c)
+ {
+ h0 = t0;
+ h1 = t1;
+ }
+
+ c = 0;
+ t[0] = u64_add_with_carry (&c, h0, s[0]);
+ t[1] = u64_add_with_carry (&c, h1, s[1]);
+}
+
+static_always_inline void
+clib_poly1305 (const u8 *key, const u8 *msg, uword len, u8 *out)
+{
+ clib_poly1305_ctx ctx;
+ clib_poly1305_init (&ctx, key);
+ clib_poly1305_update (&ctx, msg, len);
+ clib_poly1305_final (&ctx, out);
+}
+
+#endif /* __clib_poly1305_h__ */
diff --git a/src/vppinfra/crypto/sha2.h b/src/vppinfra/crypto/sha2.h
new file mode 100644
index 00000000000..69a24a2d087
--- /dev/null
+++ b/src/vppinfra/crypto/sha2.h
@@ -0,0 +1,715 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2024 Cisco Systems, Inc.
+ */
+
+#ifndef included_sha2_h
+#define included_sha2_h
+
+#include <vppinfra/clib.h>
+#include <vppinfra/vector.h>
+#include <vppinfra/string.h>
+
+#define SHA256_ROTR(x, y) ((x >> y) | (x << (32 - y)))
+#define SHA256_CH(a, b, c) ((a & b) ^ (~a & c))
+#define SHA256_MAJ(a, b, c) ((a & b) ^ (a & c) ^ (b & c))
+#define SHA256_CSIGMA0(x) \
+ (SHA256_ROTR (x, 2) ^ SHA256_ROTR (x, 13) ^ SHA256_ROTR (x, 22));
+#define SHA256_CSIGMA1(x) \
+ (SHA256_ROTR (x, 6) ^ SHA256_ROTR (x, 11) ^ SHA256_ROTR (x, 25));
+#define SHA256_SSIGMA0(x) (SHA256_ROTR (x, 7) ^ SHA256_ROTR (x, 18) ^ (x >> 3))
+#define SHA256_SSIGMA1(x) \
+ (SHA256_ROTR (x, 17) ^ SHA256_ROTR (x, 19) ^ (x >> 10))
+
+#define SHA256_MSG_SCHED(w, j) \
+ { \
+ w[j] = w[j - 7] + w[j - 16]; \
+ w[j] += SHA256_SSIGMA0 (w[j - 15]); \
+ w[j] += SHA256_SSIGMA1 (w[j - 2]); \
+ }
+
+#define SHA256_TRANSFORM(s, w, i, k) \
+ { \
+ __typeof__ (s[0]) t1, t2; \
+ t1 = k + w[i] + s[7]; \
+ t1 += SHA256_CSIGMA1 (s[4]); \
+ t1 += SHA256_CH (s[4], s[5], s[6]); \
+ t2 = SHA256_CSIGMA0 (s[0]); \
+ t2 += SHA256_MAJ (s[0], s[1], s[2]); \
+ s[7] = s[6]; \
+ s[6] = s[5]; \
+ s[5] = s[4]; \
+ s[4] = s[3] + t1; \
+ s[3] = s[2]; \
+ s[2] = s[1]; \
+ s[1] = s[0]; \
+ s[0] = t1 + t2; \
+ }
+
+#define SHA512_ROTR(x, y) ((x >> y) | (x << (64 - y)))
+#define SHA512_CH(a, b, c) ((a & b) ^ (~a & c))
+#define SHA512_MAJ(a, b, c) ((a & b) ^ (a & c) ^ (b & c))
+#define SHA512_CSIGMA0(x) \
+ (SHA512_ROTR (x, 28) ^ SHA512_ROTR (x, 34) ^ SHA512_ROTR (x, 39))
+#define SHA512_CSIGMA1(x) \
+ (SHA512_ROTR (x, 14) ^ SHA512_ROTR (x, 18) ^ SHA512_ROTR (x, 41))
+#define SHA512_SSIGMA0(x) (SHA512_ROTR (x, 1) ^ SHA512_ROTR (x, 8) ^ (x >> 7))
+#define SHA512_SSIGMA1(x) \
+ (SHA512_ROTR (x, 19) ^ SHA512_ROTR (x, 61) ^ (x >> 6))
+
+#define SHA512_MSG_SCHED(w, j) \
+ { \
+ w[j] = w[j - 7] + w[j - 16]; \
+ w[j] += SHA512_SSIGMA0 (w[j - 15]); \
+ w[j] += SHA512_SSIGMA1 (w[j - 2]); \
+ }
+
+#define SHA512_TRANSFORM(s, w, i, k) \
+ { \
+ __typeof__ (s[0]) t1, t2; \
+ t1 = k + w[i] + s[7]; \
+ t1 += SHA512_CSIGMA1 (s[4]); \
+ t1 += SHA512_CH (s[4], s[5], s[6]); \
+ t2 = SHA512_CSIGMA0 (s[0]); \
+ t2 += SHA512_MAJ (s[0], s[1], s[2]); \
+ s[7] = s[6]; \
+ s[6] = s[5]; \
+ s[5] = s[4]; \
+ s[4] = s[3] + t1; \
+ s[3] = s[2]; \
+ s[2] = s[1]; \
+ s[1] = s[0]; \
+ s[0] = t1 + t2; \
+ }
+
+#if defined(__SHA__) && defined(__x86_64__)
+#define CLIB_SHA256_ISA_INTEL
+#define CLIB_SHA256_ISA
+#endif
+
+#ifdef __ARM_FEATURE_SHA2
+#define CLIB_SHA256_ISA_ARM
+#define CLIB_SHA256_ISA
+#endif
+
+static const u32 sha224_h[8] = { 0xc1059ed8, 0x367cd507, 0x3070dd17,
+ 0xf70e5939, 0xffc00b31, 0x68581511,
+ 0x64f98fa7, 0xbefa4fa4 };
+
+static const u32 sha256_h[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372,
+ 0xa54ff53a, 0x510e527f, 0x9b05688c,
+ 0x1f83d9ab, 0x5be0cd19 };
+
+static const u32 clib_sha2_256_k[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
+ 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
+ 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
+ 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
+ 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
+ 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+static const u64 sha384_h[8] = { 0xcbbb9d5dc1059ed8, 0x629a292a367cd507,
+ 0x9159015a3070dd17, 0x152fecd8f70e5939,
+ 0x67332667ffc00b31, 0x8eb44a8768581511,
+ 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4 };
+
+static const u64 sha512_h[8] = { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
+ 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
+ 0x510e527fade682d1, 0x9b05688c2b3e6c1f,
+ 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 };
+
+static const u64 sha512_224_h[8] = { 0x8c3d37c819544da2, 0x73e1996689dcd4d6,
+ 0x1dfab7ae32ff9c82, 0x679dd514582f9fcf,
+ 0x0f6d2b697bd44da8, 0x77e36f7304c48942,
+ 0x3f9d85a86a1d36c8, 0x1112e6ad91d692a1 };
+
+static const u64 sha512_256_h[8] = { 0x22312194fc2bf72c, 0x9f555fa3c84c64c2,
+ 0x2393b86b6f53b151, 0x963877195940eabd,
+ 0x96283ee2a88effe3, 0xbe5e1e2553863992,
+ 0x2b0199fc2c85b8aa, 0x0eb72ddc81c52ca2 };
+
+static const u64 clib_sha2_512_k[80] = {
+ 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f,
+ 0xe9b5dba58189dbbc, 0x3956c25bf348b538, 0x59f111f1b605d019,
+ 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242,
+ 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
+ 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235,
+ 0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
+ 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 0x2de92c6f592b0275,
+ 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
+ 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f,
+ 0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725,
+ 0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc,
+ 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
+ 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6,
+ 0x92722c851482353b, 0xa2bfe8a14cf10364, 0xa81a664bbc423001,
+ 0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218,
+ 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
+ 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99,
+ 0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
+ 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc,
+ 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
+ 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915,
+ 0xc67178f2e372532b, 0xca273eceea26619c, 0xd186b8c721c0c207,
+ 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba,
+ 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
+ 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc,
+ 0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
+ 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+};
+
+typedef enum
+{
+ CLIB_SHA2_224,
+ CLIB_SHA2_256,
+ CLIB_SHA2_384,
+ CLIB_SHA2_512,
+ CLIB_SHA2_512_224,
+ CLIB_SHA2_512_256,
+} clib_sha2_type_t;
+
+#define CLIB_SHA2_256_BLOCK_SIZE 64
+#define CLIB_SHA2_512_BLOCK_SIZE 128
+#define SHA2_MAX_BLOCK_SIZE CLIB_SHA2_512_BLOCK_SIZE
+#define SHA2_MAX_DIGEST_SIZE 64
+
+static const struct
+{
+ u8 block_size;
+ u8 digest_size;
+ const u32 *h32;
+ const u64 *h64;
+} clib_sha2_variants[] = {
+ [CLIB_SHA2_224] = {
+ .block_size = CLIB_SHA2_256_BLOCK_SIZE,
+ .digest_size = 28,
+ .h32 = sha224_h,
+ },
+ [CLIB_SHA2_256] = {
+ .block_size = CLIB_SHA2_256_BLOCK_SIZE,
+ .digest_size = 32,
+ .h32 = sha256_h,
+ },
+ [CLIB_SHA2_384] = {
+ .block_size = CLIB_SHA2_512_BLOCK_SIZE,
+ .digest_size = 48,
+ .h64 = sha384_h,
+ },
+ [CLIB_SHA2_512] = {
+ .block_size = CLIB_SHA2_512_BLOCK_SIZE,
+ .digest_size = 64,
+ .h64 = sha512_h,
+ },
+ [CLIB_SHA2_512_224] = {
+ .block_size = CLIB_SHA2_512_BLOCK_SIZE,
+ .digest_size = 28,
+ .h64 = sha512_224_h,
+ },
+ [CLIB_SHA2_512_256] = {
+ .block_size = CLIB_SHA2_512_BLOCK_SIZE,
+ .digest_size = 32,
+ .h64 = sha512_256_h,
+ },
+};
+
+typedef union
+{
+ u32 h32[8];
+ u64 h64[8];
+#ifdef CLIB_SHA256_ISA
+ u32x4 h32x4[2];
+#endif
+} clib_sha2_h_t;
+
+typedef struct
+{
+ u64 total_bytes;
+ u16 n_pending;
+ clib_sha2_h_t h;
+ union
+ {
+ u8 as_u8[SHA2_MAX_BLOCK_SIZE];
+ u64 as_u64[SHA2_MAX_BLOCK_SIZE / sizeof (u64)];
+ uword as_uword[SHA2_MAX_BLOCK_SIZE / sizeof (uword)];
+ } pending;
+} clib_sha2_state_t;
+
+typedef struct
+{
+ clib_sha2_type_t type;
+ u8 block_size;
+ u8 digest_size;
+ clib_sha2_state_t state;
+} clib_sha2_ctx_t;
+
+static_always_inline void
+clib_sha2_state_init (clib_sha2_state_t *state, clib_sha2_type_t type)
+{
+ clib_sha2_state_t st = {};
+
+ if (clib_sha2_variants[type].block_size == CLIB_SHA2_256_BLOCK_SIZE)
+ for (int i = 0; i < 8; i++)
+ st.h.h32[i] = clib_sha2_variants[type].h32[i];
+ else
+ for (int i = 0; i < 8; i++)
+ st.h.h64[i] = clib_sha2_variants[type].h64[i];
+
+ *state = st;
+}
+
+static_always_inline void
+clib_sha2_init (clib_sha2_ctx_t *ctx, clib_sha2_type_t type)
+{
+ clib_sha2_state_init (&ctx->state, type);
+ ctx->block_size = clib_sha2_variants[type].block_size;
+ ctx->digest_size = clib_sha2_variants[type].digest_size;
+ ctx->type = type;
+}
+
+#ifdef CLIB_SHA256_ISA
+static inline void
+clib_sha256_vec_cycle_w (u32x4 w[], u8 i)
+{
+ u8 j = (i + 1) % 4;
+ u8 k = (i + 2) % 4;
+ u8 l = (i + 3) % 4;
+#ifdef CLIB_SHA256_ISA_INTEL
+ w[i] = (u32x4) _mm_sha256msg1_epu32 ((__m128i) w[i], (__m128i) w[j]);
+ w[i] += (u32x4) _mm_alignr_epi8 ((__m128i) w[l], (__m128i) w[k], 4);
+ w[i] = (u32x4) _mm_sha256msg2_epu32 ((__m128i) w[i], (__m128i) w[l]);
+#elif defined(CLIB_SHA256_ISA_ARM)
+ w[i] = vsha256su1q_u32 (vsha256su0q_u32 (w[i], w[j]), w[k], w[l]);
+#endif
+}
+
+static inline void
+clib_sha256_vec_4_rounds (u32x4 w, u8 n, u32x4 s[])
+{
+#ifdef CLIB_SHA256_ISA_INTEL
+ u32x4 r = *(u32x4 *) (clib_sha2_256_k + 4 * n) + w;
+ s[0] = (u32x4) _mm_sha256rnds2_epu32 ((__m128i) s[0], (__m128i) s[1],
+ (__m128i) r);
+ r = (u32x4) u64x2_interleave_hi ((u64x2) r, (u64x2) r);
+ s[1] = (u32x4) _mm_sha256rnds2_epu32 ((__m128i) s[1], (__m128i) s[0],
+ (__m128i) r);
+#elif defined(CLIB_SHA256_ISA_ARM)
+ u32x4 r0, s0;
+ const u32x4u *k = (u32x4u *) clib_sha2_256_k;
+
+ r0 = w + k[n];
+ s0 = s[0];
+ s[0] = vsha256hq_u32 (s[0], s[1], r0);
+ s[1] = vsha256h2q_u32 (s[1], s0, r0);
+#endif
+}
+#endif
+
+#if defined(CLIB_SHA256_ISA)
+static inline u32x4
+clib_sha256_vec_load (u32x4 r)
+{
+#if defined(CLIB_SHA256_ISA_INTEL)
+ return u32x4_byte_swap (r);
+#elif defined(CLIB_SHA256_ISA_ARM)
+ return vreinterpretq_u32_u8 (vrev32q_u8 (vreinterpretq_u8_u32 (r)));
+#endif
+}
+
+static inline void
+clib_sha256_vec_shuffle (u32x4 d[2])
+{
+#if defined(CLIB_SHA256_ISA_INTEL)
+ /* {0, 1, 2, 3}, {4, 5, 6, 7} -> {7, 6, 3, 2}, {5, 4, 1, 0} */
+ u32x4 r;
+ r = (u32x4) _mm_shuffle_ps ((__m128) d[1], (__m128) d[0], 0xbb);
+ d[1] = (u32x4) _mm_shuffle_ps ((__m128) d[1], (__m128) d[0], 0x11);
+ d[0] = r;
+#endif
+}
+#endif
+
+static inline void
+clib_sha256_block (clib_sha2_state_t *st, const u8 *msg, uword n_blocks)
+{
+#if defined(CLIB_SHA256_ISA)
+ u32x4 h[2];
+ u32x4u *m = (u32x4u *) msg;
+
+ h[0] = st->h.h32x4[0];
+ h[1] = st->h.h32x4[1];
+
+ clib_sha256_vec_shuffle (h);
+
+ for (; n_blocks; m += 4, n_blocks--)
+ {
+ u32x4 s[2], w[4];
+
+ s[0] = h[0];
+ s[1] = h[1];
+
+ w[0] = clib_sha256_vec_load (m[0]);
+ w[1] = clib_sha256_vec_load (m[1]);
+ w[2] = clib_sha256_vec_load (m[2]);
+ w[3] = clib_sha256_vec_load (m[3]);
+
+ clib_sha256_vec_4_rounds (w[0], 0, s);
+ clib_sha256_vec_4_rounds (w[1], 1, s);
+ clib_sha256_vec_4_rounds (w[2], 2, s);
+ clib_sha256_vec_4_rounds (w[3], 3, s);
+
+ clib_sha256_vec_cycle_w (w, 0);
+ clib_sha256_vec_4_rounds (w[0], 4, s);
+ clib_sha256_vec_cycle_w (w, 1);
+ clib_sha256_vec_4_rounds (w[1], 5, s);
+ clib_sha256_vec_cycle_w (w, 2);
+ clib_sha256_vec_4_rounds (w[2], 6, s);
+ clib_sha256_vec_cycle_w (w, 3);
+ clib_sha256_vec_4_rounds (w[3], 7, s);
+
+ clib_sha256_vec_cycle_w (w, 0);
+ clib_sha256_vec_4_rounds (w[0], 8, s);
+ clib_sha256_vec_cycle_w (w, 1);
+ clib_sha256_vec_4_rounds (w[1], 9, s);
+ clib_sha256_vec_cycle_w (w, 2);
+ clib_sha256_vec_4_rounds (w[2], 10, s);
+ clib_sha256_vec_cycle_w (w, 3);
+ clib_sha256_vec_4_rounds (w[3], 11, s);
+
+ clib_sha256_vec_cycle_w (w, 0);
+ clib_sha256_vec_4_rounds (w[0], 12, s);
+ clib_sha256_vec_cycle_w (w, 1);
+ clib_sha256_vec_4_rounds (w[1], 13, s);
+ clib_sha256_vec_cycle_w (w, 2);
+ clib_sha256_vec_4_rounds (w[2], 14, s);
+ clib_sha256_vec_cycle_w (w, 3);
+ clib_sha256_vec_4_rounds (w[3], 15, s);
+
+ h[0] += s[0];
+ h[1] += s[1];
+ }
+
+ clib_sha256_vec_shuffle (h);
+
+ st->h.h32x4[0] = h[0];
+ st->h.h32x4[1] = h[1];
+#else
+ u32 w[64], s[8], i;
+ clib_sha2_h_t h;
+
+ h = st->h;
+
+ for (; n_blocks; msg += CLIB_SHA2_256_BLOCK_SIZE, n_blocks--)
+ {
+ for (i = 0; i < 8; i++)
+ s[i] = h.h32[i];
+
+ for (i = 0; i < 16; i++)
+ {
+ w[i] = clib_net_to_host_u32 ((((u32u *) msg)[i]));
+ SHA256_TRANSFORM (s, w, i, clib_sha2_256_k[i]);
+ }
+
+ for (i = 16; i < 64; i++)
+ {
+ SHA256_MSG_SCHED (w, i);
+ SHA256_TRANSFORM (s, w, i, clib_sha2_256_k[i]);
+ }
+
+ for (i = 0; i < 8; i++)
+ h.h32[i] += s[i];
+ }
+
+ st->h = h;
+#endif
+}
+
+static_always_inline void
+clib_sha512_block (clib_sha2_state_t *st, const u8 *msg, uword n_blocks)
+{
+ u64 w[80], s[8], i;
+ clib_sha2_h_t h;
+
+ h = st->h;
+
+ for (; n_blocks; msg += CLIB_SHA2_512_BLOCK_SIZE, n_blocks--)
+ {
+ for (i = 0; i < 8; i++)
+ s[i] = h.h64[i];
+
+ for (i = 0; i < 16; i++)
+ {
+ w[i] = clib_net_to_host_u64 ((((u64u *) msg)[i]));
+ SHA512_TRANSFORM (s, w, i, clib_sha2_512_k[i]);
+ }
+
+ for (i = 16; i < 80; i++)
+ {
+ SHA512_MSG_SCHED (w, i);
+ SHA512_TRANSFORM (s, w, i, clib_sha2_512_k[i]);
+ }
+
+ for (i = 0; i < 8; i++)
+ h.h64[i] += s[i];
+ }
+
+ st->h = h;
+}
+
+static_always_inline void
+clib_sha2_update_internal (clib_sha2_state_t *st, u8 block_size, const u8 *msg,
+ uword n_bytes)
+{
+ uword n_blocks;
+ if (st->n_pending)
+ {
+ uword n_left = block_size - st->n_pending;
+ if (n_bytes < n_left)
+ {
+ clib_memcpy_fast (st->pending.as_u8 + st->n_pending, msg, n_bytes);
+ st->n_pending += n_bytes;
+ return;
+ }
+ else
+ {
+ clib_memcpy_fast (st->pending.as_u8 + st->n_pending, msg, n_left);
+ if (block_size == CLIB_SHA2_512_BLOCK_SIZE)
+ clib_sha512_block (st, st->pending.as_u8, 1);
+ else
+ clib_sha256_block (st, st->pending.as_u8, 1);
+ st->n_pending = 0;
+ st->total_bytes += block_size;
+ n_bytes -= n_left;
+ msg += n_left;
+ }
+ }
+
+ if ((n_blocks = n_bytes / block_size))
+ {
+ if (block_size == CLIB_SHA2_512_BLOCK_SIZE)
+ clib_sha512_block (st, msg, n_blocks);
+ else
+ clib_sha256_block (st, msg, n_blocks);
+ n_bytes -= n_blocks * block_size;
+ msg += n_blocks * block_size;
+ st->total_bytes += n_blocks * block_size;
+ }
+
+ if (n_bytes)
+ {
+ clib_memset_u8 (st->pending.as_u8, 0, block_size);
+ clib_memcpy_fast (st->pending.as_u8, msg, n_bytes);
+ st->n_pending = n_bytes;
+ }
+ else
+ st->n_pending = 0;
+}
+
+static_always_inline void
+clib_sha2_update (clib_sha2_ctx_t *ctx, const u8 *msg, uword n_bytes)
+{
+ clib_sha2_update_internal (&ctx->state, ctx->block_size, msg, n_bytes);
+}
+
+static_always_inline void
+clib_sha2_final_internal (clib_sha2_state_t *st, u8 block_size, u8 digest_size,
+ u8 *digest)
+{
+ int i;
+
+ st->total_bytes += st->n_pending;
+ if (st->n_pending == 0)
+ {
+ clib_memset (st->pending.as_u8, 0, block_size);
+ st->pending.as_u8[0] = 0x80;
+ }
+ else if (st->n_pending + sizeof (u64) + sizeof (u8) > block_size)
+ {
+ st->pending.as_u8[st->n_pending] = 0x80;
+ if (block_size == CLIB_SHA2_512_BLOCK_SIZE)
+ clib_sha512_block (st, st->pending.as_u8, 1);
+ else
+ clib_sha256_block (st, st->pending.as_u8, 1);
+ clib_memset (st->pending.as_u8, 0, block_size);
+ }
+ else
+ st->pending.as_u8[st->n_pending] = 0x80;
+
+ st->pending.as_u64[block_size / 8 - 1] =
+ clib_net_to_host_u64 (st->total_bytes * 8);
+
+ if (block_size == CLIB_SHA2_512_BLOCK_SIZE)
+ {
+ clib_sha512_block (st, st->pending.as_u8, 1);
+ for (i = 0; i < digest_size / sizeof (u64); i++)
+ ((u64 *) digest)[i] = clib_net_to_host_u64 (st->h.h64[i]);
+
+ /* sha512-224 case - write half of u64 */
+ if (i * sizeof (u64) < digest_size)
+ ((u32 *) digest)[2 * i] = clib_net_to_host_u32 (st->h.h64[i] >> 32);
+ }
+ else
+ {
+ clib_sha256_block (st, st->pending.as_u8, 1);
+ for (i = 0; i < digest_size / sizeof (u32); i++)
+ *((u32 *) digest + i) = clib_net_to_host_u32 (st->h.h32[i]);
+ }
+}
+
+static_always_inline void
+clib_sha2_final (clib_sha2_ctx_t *ctx, u8 *digest)
+{
+ clib_sha2_final_internal (&ctx->state, ctx->block_size, ctx->digest_size,
+ digest);
+}
+
+static_always_inline void
+clib_sha2 (clib_sha2_type_t type, const u8 *msg, uword len, u8 *digest)
+{
+ clib_sha2_ctx_t ctx;
+ clib_sha2_init (&ctx, type);
+ clib_sha2_update (&ctx, msg, len);
+ clib_sha2_final (&ctx, digest);
+}
+
+#define clib_sha224(...) clib_sha2 (CLIB_SHA2_224, __VA_ARGS__)
+#define clib_sha256(...) clib_sha2 (CLIB_SHA2_256, __VA_ARGS__)
+#define clib_sha384(...) clib_sha2 (CLIB_SHA2_384, __VA_ARGS__)
+#define clib_sha512(...) clib_sha2 (CLIB_SHA2_512, __VA_ARGS__)
+#define clib_sha512_224(...) clib_sha2 (CLIB_SHA2_512_224, __VA_ARGS__)
+#define clib_sha512_256(...) clib_sha2 (CLIB_SHA2_512_256, __VA_ARGS__)
+
+/*
+ * HMAC
+ */
+
+typedef struct
+{
+ clib_sha2_h_t ipad_h;
+ clib_sha2_h_t opad_h;
+} clib_sha2_hmac_key_data_t;
+
+typedef struct
+{
+ clib_sha2_type_t type;
+ u8 block_size;
+ u8 digest_size;
+ clib_sha2_state_t ipad_state;
+ clib_sha2_state_t opad_state;
+} clib_sha2_hmac_ctx_t;
+
+static_always_inline void
+clib_sha2_hmac_key_data (clib_sha2_type_t type, const u8 *key, uword key_len,
+ clib_sha2_hmac_key_data_t *kd)
+{
+ u8 block_size = clib_sha2_variants[type].block_size;
+ u8 data[SHA2_MAX_BLOCK_SIZE] = {};
+ u8 ikey[SHA2_MAX_BLOCK_SIZE];
+ u8 okey[SHA2_MAX_BLOCK_SIZE];
+ clib_sha2_state_t ipad_state;
+ clib_sha2_state_t opad_state;
+
+ /* key */
+ if (key_len > block_size)
+ {
+ /* key is longer than block, calculate hash of key */
+ clib_sha2_ctx_t ctx;
+ clib_sha2_init (&ctx, type);
+ clib_sha2_update (&ctx, key, key_len);
+ clib_sha2_final (&ctx, (u8 *) data);
+ }
+ else
+ clib_memcpy_fast (data, key, key_len);
+
+ for (int i = 0, w = 0; w < block_size; w += sizeof (uword), i++)
+ {
+ ((uwordu *) ikey)[i] = ((uwordu *) data)[i] ^ 0x3636363636363636UL;
+ ((uwordu *) okey)[i] = ((uwordu *) data)[i] ^ 0x5c5c5c5c5c5c5c5cUL;
+ }
+
+ clib_sha2_state_init (&ipad_state, type);
+ clib_sha2_state_init (&opad_state, type);
+
+ if (block_size == CLIB_SHA2_512_BLOCK_SIZE)
+ {
+ clib_sha512_block (&ipad_state, ikey, 1);
+ clib_sha512_block (&opad_state, okey, 1);
+ }
+ else
+ {
+ clib_sha256_block (&ipad_state, ikey, 1);
+ clib_sha256_block (&opad_state, okey, 1);
+ }
+
+ kd->ipad_h = ipad_state.h;
+ kd->opad_h = opad_state.h;
+}
+
+static_always_inline void
+clib_sha2_hmac_init (clib_sha2_hmac_ctx_t *ctx, clib_sha2_type_t type,
+ clib_sha2_hmac_key_data_t *kd)
+{
+ u8 block_size = clib_sha2_variants[type].block_size;
+ u8 digest_size = clib_sha2_variants[type].digest_size;
+
+ *ctx = (clib_sha2_hmac_ctx_t) {
+ .type = type,
+ .block_size = block_size,
+ .digest_size = digest_size,
+ .ipad_state = {
+ .h = kd->ipad_h,
+ .total_bytes = block_size,
+ },
+ .opad_state = {
+ .h = kd->opad_h,
+ .total_bytes = block_size,
+ },
+ };
+}
+
+static_always_inline void
+clib_sha2_hmac_update (clib_sha2_hmac_ctx_t *ctx, const u8 *msg, uword len)
+{
+ clib_sha2_update_internal (&ctx->ipad_state, ctx->block_size, msg, len);
+}
+
+static_always_inline void
+clib_sha2_hmac_final (clib_sha2_hmac_ctx_t *ctx, u8 *digest)
+{
+ u8 i_digest[SHA2_MAX_DIGEST_SIZE];
+
+ clib_sha2_final_internal (&ctx->ipad_state, ctx->block_size,
+ ctx->digest_size, i_digest);
+ clib_sha2_update_internal (&ctx->opad_state, ctx->block_size, i_digest,
+ ctx->digest_size);
+ clib_sha2_final_internal (&ctx->opad_state, ctx->block_size,
+ ctx->digest_size, digest);
+}
+
+static_always_inline void
+clib_sha2_hmac (clib_sha2_type_t type, const u8 *key, uword key_len,
+ const u8 *msg, uword len, u8 *digest)
+{
+ clib_sha2_hmac_ctx_t _ctx, *ctx = &_ctx;
+ clib_sha2_hmac_key_data_t kd;
+
+ clib_sha2_hmac_key_data (type, key, key_len, &kd);
+ clib_sha2_hmac_init (ctx, type, &kd);
+ clib_sha2_hmac_update (ctx, msg, len);
+ clib_sha2_hmac_final (ctx, digest);
+}
+
+#define clib_hmac_sha224(...) clib_sha2_hmac (CLIB_SHA2_224, __VA_ARGS__)
+#define clib_hmac_sha256(...) clib_sha2_hmac (CLIB_SHA2_256, __VA_ARGS__)
+#define clib_hmac_sha384(...) clib_sha2_hmac (CLIB_SHA2_384, __VA_ARGS__)
+#define clib_hmac_sha512(...) clib_sha2_hmac (CLIB_SHA2_512, __VA_ARGS__)
+#define clib_hmac_sha512_224(...) \
+ clib_sha2_hmac (CLIB_SHA2_512_224, __VA_ARGS__)
+#define clib_hmac_sha512_256(...) \
+ clib_sha2_hmac (CLIB_SHA2_512_256, __VA_ARGS__)
+
+#endif /* included_sha2_h */