summaryrefslogtreecommitdiffstats
path: root/src/vppinfra/vector_avx512.h
blob: 8a82650e1e8d89237359d389a34fac95b00d7910 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85

@media only all and (prefers-color-scheme: dark) {
.highlight .hll { background-color: #49483e }
.highlight .c { color: #75715e } /* Comment */
.highlight .err { color: #960050; background-color: #1e0010 } /* Error */
.highlight .k { color: #66d9ef } /* Keyword */
.highlight .l { color: #ae81ff } /* Literal */
.highlight .n { color: #f8f8f2 } /* Name */
.highlight .o { color: #f92672 } /* Operator */
.highlight .p { color: #f8f8f2 } /* Punctuation */
.highlight .ch { color: #75715e } /* Comment.Hashbang */
.highlight .cm { color: #75715e } /* Comment.Multiline */
.highlight .cp { color: #75715e } /* Comment.Preproc */
.highlight .cpf { color: #75715e } /* Comment.PreprocFile */
.highlight .c1 { color: #75715e } /* Comment.Single */
.highlight .cs { color: #75715e } /* Comment.Special */
.highlight .gd { color: #f92672 } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .gi { color: #a6e22e } /* Generic.Inserted */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #75715e } /* Generic.Subheading */
.highlight .kc { color: #66d9ef } /* Keyword.Constant */
.highlight .kd { color: #66d9ef } /* Keyword.Declaration */
.highlight .kn { color: #f92672 } /* Keyword.Namespace */
.highlight .kp { color: #66d9ef } /* Keyword.Pseudo */
.highlight .kr { color: #66d9ef } /* Keyword.Reserved */
.highlight .kt { color: #66d9ef } /* Keyword.Type */
.highlight .ld { color: #e6db74 } /* Literal.Date */
.highlight .m { color: #ae81ff } /* Literal.Number */
.highlight .s { color: #e6db74 } /* Literal.String */
.highlight .na { color: #a6e22e } /* Name.Attribute */
.highlight .nb { color: #f8f8f2 } /* Name.Builtin */
.highlight .nc { color: #a6e22e } /* Name.Class */
.highlight .no { color: #66d9ef } /* Name.Constant */
.highlight .nd { color: #a6e22e } /* Name.Decorator */
.highlight .ni { color: #f8f8f2 } /* Name.Entity */
.highlight .ne { color: #a6e22e } /* Name.Exception */
.highlight .nf { color: #a6e22e } /* Name.Function */
.highlight .nl { color: #f8f8f2 } /* Name.Label */
.highlight .nn { color: #f8f8f2 } /* Name.Namespace */
.highlight .nx { color: #a6e22e } /* Name.Other */
.highlight .py { color: #f8f8f2 } /* Name.Property */
.highlight .nt { color: #f92672 } /* Name.Tag */
.highlight .nv { color: #f8f8f2 } /* Name.Variable */
.highlight .ow { color: #f92672 } /* Operator.Word */
.highlight .w { color: #f8f8f2 } /* Text.Whitespace */
.highlight .mb { color: #ae81ff } /* Literal.Number.Bin */
.highlight .mf { color: #ae81ff } /* Literal.Number.Float */
.highlight .mh { color: #ae81ff } /* Literal.Number.Hex */
.highlight .mi { color: #ae81ff } /* Literal.Number.Integer */
.highlight .mo { color: #ae81ff } /* Literal.Number.Oct */
.highlight .sa { color: #e6db74 } /* Literal.String.Affix */
.highlight .sb { color: #e6db74 } /* Literal.String.Backtick */
.highlight .sc { color: #e6db74 } /* Literal.String.Char */
.highlight .dl { color: #e6db74 } /* Literal.String.Delimiter */
.highlight .sd { color: #e6db74 } /* Literal.String.Doc */
.highlight .s2 { color: #e6db74 } /* Literal.String.Double */
.highlight .se { color: #ae81ff } /* Literal.String.Escape */
.highlight .sh { color: #e6db74 } /* Literal.String.Heredoc */
.highlight .si { color: #e6db74 } /* Literal.String.Interpol */
.highlight .sx { color: #e6db74 } /* Literal.String.Other */
.highlight .sr { color: #e6db74 } /* Literal.String.Regex */
.highlight .s1 { color: #e6db74 } /* Literal.String.Single */
.highlight .ss { color: #e6db74 } /* Literal.String.Symbol */
.highlight .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #a6e22e } /* Name.Function.Magic */
.highlight .vc { color: #f8f8f2 } /* Name.Variable.Class */
.highlight .vg { color: #f8f8f2 } /* Name.Variable.Global */
.highlight .vi { color: #f8f8f2 } /* Name.Variable.Instance */
.highlight .vm { color: #f8f8f2 } /* Name.Variable.Magic */
.highlight .il { color: #ae81ff } /* Literal.Number.Integer.Long */
}
@media (prefers-color-scheme: light) {
.highlight .hll { background-color: #ffffcc }
.highlight .c { color: #888888 } /* Comment */
.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
.highlight .k { color: #008800; font-weight: bold } /* Keyword */
.highlight .ch { color: #888888 } /* Comment.Hashbang */
.highlight .cm { color: #888888 } /* Comment.Multiline */
.highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
.highlight .cpf { color: #888888 } /* Comment.PreprocFile */
.highlight .c1 { color: #888888 } /* Comment.Single */
.highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */
.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .gr { color: #aa0000 } /* Generic.Error */
.highlight .gh { color: #333333 } /* Generic.Heading */
.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
.highlight .go { color: #888888 } /* Generic.Output */
.highlight .gp { color: #555555 } /* Generic.Prompt */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #666666 } /* Generic.Subheading */
.highlight .gt { color: #aa0000 } /* Generic.Traceback */
.highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #008800 } /* Keyword.Pseudo */
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */
.highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */
.highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */
.highlight .na { color: #336699 } /* Name.Attribute */
.highlight .nb { color: #003388 } /* Name.Builtin */
.highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */
.highlight .no { color: #003366; font-weight: bold } /* Name.Constant */
.highlight .nd { color: #555555 } /* Name.Decorator */
.highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */
.highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */
.highlight .nl { color: #336699; font-style: italic } /* Name.Label */
.highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */
.highlight .py { color: #336699; font-weight: bold } /* Name.Property */
.highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */
.highlight .nv { color: #336699 } /* Name.Variable */
.highlight .ow { color: #008800 } /* Operator.Word */
.highlight .w { color: #bbbbbb } /* Text.Whitespace */
.highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */
.highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */
.highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */
.highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */
.highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */
.highlight .sa { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Affix */
.highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */
.highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */
.highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */
.highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */
.highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */
.highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */
.highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */
.highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */
.highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */
.highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */
.highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */
.highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */
.highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */
.highlight .vc { color: #336699 } /* Name.Variable.Class */
.highlight .vg { color: #dd7700 } /* Name.Variable.Global */
.highlight .vi { color: #3333bb } /* Name.Variable.Instance */
.highlight .vm { color: #336699 } /* Name.Variable.Magic */
.highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */
}
/*
 * Copyright (c) 2019 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in wr
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx512_h
#define included_vector_avx512_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32)  _(i,64,8,epi64)
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32)  _(u,64,8,epi64)
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal, is_zero_mask */
#define _(t, s, c, i) \
static_always_inline t##s##x##c						\
t##s##x##c##_splat (t##s x)						\
{ return (t##s##x##c) _mm512_set1_##i (x); }				\
\
static_always_inline t##s##x##c						\
t##s##x##c##_load_aligned (void *p)					\
{ return (t##s##x##c) _mm512_load_si512 (p); }				\
\
static_always_inline void						\
t##s##x##c##_store_aligned (t##s##x##c v, void *p)			\
{ _mm512_store_si512 ((__m512i *) p, (__m512i) v); }			\
\
static_always_inline t##s##x##c						\
t##s##x##c##_load_unaligned (void *p)					\
{ return (t##s##x##c) _mm512_loadu_si512 (p); }				\
\
static_always_inline void						\
t##s##x##c##_store_unaligned (t##s##x##c v, void *p)			\
{ _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); }			\
\
static_always_inline int						\
t##s##x##c##_is_all_zero (t##s##x##c v)					\
{ return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); }	\
\
static_always_inline int						\
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b)			\
{ return t##s##x##c##_is_all_zero (a ^ b); }				\
\
static_always_inline int						\
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x)			\
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }		\
\
static_always_inline u##c						\
t##s##x##c##_is_zero_mask (t##s##x##c v)				\
{ return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); }		\
\
static_always_inline t##s##x##c                                         \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b)                 \
{ return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b); } \
\
static_always_inline t##s##x##c                                         \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b)                 \
{ return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b); } \


foreach_avx512_vec512i foreach_avx512_vec512u
#undef _
/* *INDENT-ON* */

static_always_inline u32
u16x32_msb_mask (u16x32 v)
{
  return (u32) _mm512_movepi16_mask ((__m512i) v);
}

static_always_inline u32x16
u32x16_byte_swap (u32x16 v)
{
  u8x64 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u16x32
u16x32_byte_swap (u16x32 v)
{
  u8x64 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u32x8
u32x16_extract_lo (u32x16 v)
{
  return (u32x8) _mm512_extracti64x4_epi64 ((__m512i) v, 0);
}

static_always_inline u32x8
u32x16_extract_hi (u32x16 v)
{
  return (u32x8) _mm512_extracti64x4_epi64 ((__m512i) v, 1);
}

static_always_inline u8x32
u8x64_extract_lo (u8x64 v)
{
  return (u8x32) _mm512_extracti64x4_epi64 ((__m512i) v, 0);
}

static_always_inline u8x32
u8x64_extract_hi (u8x64 v)
{
  return (u8x32) _mm512_extracti64x4_epi64 ((__m512i) v, 1);
}

static_always_inline u32
u32x16_min_scalar (u32x16 v)
{
  return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
				      u32x16_extract_hi (v)));
}

static_always_inline u32x16
u32x16_insert_lo (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
}

static_always_inline u32x16
u32x16_insert_hi (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
}

static_always_inline u64x8
u64x8_permute (u64x8 a, u64x8 b, u64x8 mask)
{
  return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
					    (__m512i) b);
}


#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b, (__m512i) c, d)

#define u8x64_insert_u8x16(a, b, n) \
  (u8x64) _mm512_inserti64x2 ((__m512i) (a), (__m128i) (b), n)

#define u8x64_extract_u8x16(a, n) \
  (u8x16) _mm512_extracti64x2_epi64 ((__m512i) (a), n)

#define u8x64_word_shift_left(a,n)  (u8x64) _mm512_bslli_epi128((__m512i) a, n)
#define u8x64_word_shift_right(a,n) (u8x64) _mm512_bsrli_epi128((__m512i) a, n)

static_always_inline u8x64
u8x64_xor3 (u8x64 a, u8x64 b, u8x64 c)
{
  return (u8x64) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
					    (__m512i) c, 0x96);
}

static_always_inline u8x64
u8x64_reflect_u8x16 (u8x64 x)
{
  static const u8x64 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask);
}

static_always_inline u8x64
u8x64_mask_load (u8x64 a, void *p, u64 mask)
{
  return (u8x64) _mm512_mask_loadu_epi8 ((__m512i) a, mask, p);
}

static_always_inline void
u8x64_mask_store (u8x64 a, void *p, u64 mask)
{
  _mm512_mask_storeu_epi8 (p, mask, (__m512i) a);
}

static_always_inline u8x64
u8x64_splat_u8x16 (u8x16 a)
{
  return (u8x64) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_splat_u32x4 (u32x4 a)
{
  return (u32x16) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_mask_blend (u32x16 a, u32x16 b, u16 mask)
{
  return (u32x16) _mm512_mask_blend_epi32 (mask, (__m512i) a, (__m512i) b);
}

static_always_inline u8x64
u8x64_mask_blend (u8x64 a, u8x64 b, u64 mask)
{
  return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
}

static_always_inline void
u32x16_transpose (u32x16 m[16])
{
  __m512i r[16], a, b, c, d, x, y;

  /* *INDENT-OFF* */
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
  /* *INDENT-ON* */

  r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);

  r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);

  a = _mm512_unpacklo_epi64 (r[0], r[1]);
  b = _mm512_unpacklo_epi64 (r[2], r[3]);
  c = _mm512_unpacklo_epi64 (r[4], r[5]);
  d = _mm512_unpacklo_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpacklo_epi64 (r[8], r[9]);
  b = _mm512_unpacklo_epi64 (r[10], r[11]);
  c = _mm512_unpacklo_epi64 (r[12], r[13]);
  d = _mm512_unpacklo_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[0], r[1]);
  b = _mm512_unpackhi_epi64 (r[2], r[3]);
  c = _mm512_unpackhi_epi64 (r[4], r[5]);
  d = _mm512_unpackhi_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[8], r[9]);
  b = _mm512_unpackhi_epi64 (r[10], r[11]);
  c = _mm512_unpackhi_epi64 (r[12], r[13]);
  d = _mm512_unpackhi_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
}



static_always_inline void
u64x8_transpose (u64x8 m[8])
{
  __m512i r[8], x, y;

  /* *INDENT-OFF* */
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
  /* *INDENT-ON* */

  r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);

  x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
  m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
  m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);

  x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
  m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
  m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
}

#endif /* included_vector_avx512_h */
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */