/*
 *------------------------------------------------------------------
 * gtpu_api.c - gtpu api
 *
 * Copyright (c) 2017 Intel and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

#include <vnet/interface.h>
#include <vnet/api_errno.h>
#include <vnet/feature/feature.h>
#include <vnet/fib/fib_table.h>

#include <vppinfra/byte_order.h>
#include <vlibmemory/api.h>
#include <vnet/ip/ip_types_api.h>
#include <gtpu/gtpu.h>

#include <vnet/format_fns.h>
#include <gtpu/gtpu.api_enum.h>
#include <gtpu/gtpu.api_types.h>

#define REPLY_MSG_ID_BASE gtm->msg_id_base
#include <vlibapi/api_helper_macros.h>

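/* API handler: enable or disable GTP-U bypass processing on an interface,
 * for the IPv4 or IPv6 path as requested. */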
static void
  vl_api_sw_interface_set_gtpu_bypass_t_handler
  (vl_api_sw_interface_set_gtpu_bypass_t * mp)
{
  vl_api_sw_interface_set_gtpu_bypass_reply_t *rmp;
  int rv = 0;
  u32 sw_if_index = ntohl (mp->sw_if_index);
  gtpu_main_t *gtm = &gtpu_main;

  VALIDATE_SW_IF_INDEX (mp);

  vnet_int_gtpu_bypass_mode (sw_if_index, mp->is_ipv6, mp->enable);
  BAD_SW_IF_INDEX_LABEL;

  REPLY_MACRO (VL_API_SW_INTERFACE_SET_GTPU_BYPASS_REPLY);
}

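/* API handler: create or delete a GTP-U tunnel. Resolves the encap VRF id
 * to a FIB index, decodes the endpoint addresses, validates them, and
 * replies with the sw_if_index of the tunnel interface. */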
static void vl_api_gtpu_add_del_tunnel_t_handler
  (vl_api_gtpu_add_del_tunnel_t * mp)
{
  vl_api_gtpu_add_del_tunnel_reply_t *rmp;
  int rv = 0;
  ip4_main_t *im = &ip4_main;
  gtpu_main_t *gtm = &gtpu_main;

  uword *p = hash_get (im->fib_index_by_table_id, ntohl (mp->encap_vrf_id));
  if (!p)
    {
      rv = VNET_API_ERROR_NO_SUCH_FIB;
      goto out;
    }

  vnet_gtpu_add_del_tunnel_args_t a = {
    .is_add = mp->is_add,
    .mcast_sw_if_index = ntohl (mp->mcast_sw_if_index),
    .encap_fib_index = p[0],
    .decap_next_index = ntohl (mp->decap_next_index),
    .teid = ntohl (mp->teid),
  };
  ip_address_decode (&mp->dst_address, &a.dst);
  ip_address_decode (&mp->src_address, &a.src);

  /* Check src & dst are different */
  if (ip46_address_cmp (&a.dst, &a.src) == 0)
    {
      rv = VNET_API_ERROR_SAME_SRC_DST;
      goto out;
    }
  if (ip46_address_is_multicast (&a.dst) &&
      !vnet_sw_if_index_is_api_valid (a.mcast_sw_if_index))
    {
      rv = VNET_API_ERROR_INVALID_SW_IF_INDEX;
      goto out;
    }

  u32 sw_if_index = ~0;
  rv = vnet_gtpu_add_del_tunnel (&a, &sw_if_index);

out:
  /* *INDENT-OFF* */
  REPLY_MACRO2(VL_API_GTPU_ADD_DEL_TUNNEL_REPLY,
  ({
    rmp->sw_if_index = ntohl (sw_if_index);
  }));
  /* *INDENT-ON* */
}

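/* Encode one tunnel as a gtpu_tunnel_details message and send it to the
 * requesting client. */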
static void send_gtpu_tunnel_details
  (gtpu_tunnel_t * t, vl_api_registration_t * reg, u32 context)
{
  vl_api_gtpu_tunnel_details_t *rmp;
  gtpu_main_t *gtm = &gtpu_main;
  ip4_main_t *im4 = &ip4_main;
  ip6_main_t *im6 = &ip6_main;
  u8 is_ipv6 = !ip46_address_is_ip4 (&t->dst);

  rmp = vl_msg_api_alloc (sizeof (*rmp));
  clib_memset (rmp, 0, sizeof (*rmp));
  rmp->_vl_msg_id = ntohs (VL_API_GTPU_TUNNEL_DETAILS + gtm->msg_id_base);

  ip_address_encode (&t->src, is_ipv6 ? IP46_TYPE_IP6 : IP46_TYPE_IP4,
		     &rmp->src_address);
  ip_address_encode (&t->dst, is_ipv6 ? IP46_TYPE_IP6 : IP46_TYPE_IP4,
		     &rmp->dst_address);

  rmp->encap_vrf_id =
    is_ipv6 ? htonl (im6->fibs[t->encap_fib_index].ft_table_id) :
    htonl (im4->fibs[t->encap_fib_index].ft_table_id);
  rmp->mcast_sw_if_index = htonl (t->mcast_sw_if_index);
  rmp->teid = htonl (t->teid);
  rmp->decap_next_index = htonl (t->decap_next_index);
  rmp->sw_if_index = htonl (t->sw_if_index);
  rmp->context = context;

  vl_api_send_msg (reg, (u8 *) rmp);
}

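/* API handler: dump all GTP-U tunnels, or only the tunnel bound to the
 * given sw_if_index. */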
static void
vl_api_gtpu_tunnel_dump_t_handler (vl_api_gtpu_tunnel_dump_t * mp)
{
  vl_api_registration_t *reg;
  gtpu_main_t *gtm = &gtpu_main;
  gtpu_tunnel_t *t;
  u32 sw_if_index;

  reg = vl_api_client_index_to_registration (mp->client_index);
  if (!reg)
    return;

  sw_if_index = ntohl (mp->sw_if_index);

  if (~0 == sw_if_index)
    {
      /* *INDENT-OFF* */
      pool_foreach (t, gtm->tunnels,
      ({
        send_gtpu_tunnel_details(t, reg, mp->context);
      }));
      /* *INDENT-ON* */
    }
  else
    {
      if ((sw_if_index >= vec_len (gtm->tunnel_index_by_sw_if_index)) ||
	  (~0 == gtm->tunnel_index_by_sw_if_index[sw_if_index]))
	{
	  return;
	}
      t = &gtm->tunnels[gtm->tunnel_index_by_sw_if_index[sw_if_index]];
      send_gtpu_tunnel_details (t, reg, mp->context);
    }
}

#include <gtpu/gtpu.api.c>
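/* Plugin API hookup: register the message table from the generated
 * gtpu.api.c above and remember this plugin's message-id base. */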
static clib_error_t *
gtpu_api_hookup (vlib_main_t * vm)
{
  gtpu_main_t *gtm = &gtpu_main;

  gtm->msg_id_base = setup_message_id_table ();
  return 0;
}

VLIB_API_INIT_FUNCTION (gtpu_api_hookup);

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */