summaryrefslogtreecommitdiffstats
path: root/src/vppinfra/vector_sse42.h
blob: dab22deff7c943bc2d32547a681bdfd241c5976c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
/* Colours for the `.highlight` token classes, applied only when the user
   agent reports prefers-color-scheme: dark.  The trailing comment on each
   rule names the token category the class stands for. */
@media only all and (prefers-color-scheme: dark) {
.highlight .hll { background-color: #49483e }
.highlight .c { color: #75715e } /* Comment */
.highlight .err { color: #960050; background-color: #1e0010 } /* Error */
.highlight .k { color: #66d9ef } /* Keyword */
.highlight .l { color: #ae81ff } /* Literal */
.highlight .n { color: #f8f8f2 } /* Name */
.highlight .o { color: #f92672 } /* Operator */
.highlight .p { color: #f8f8f2 } /* Punctuation */
.highlight .ch { color: #75715e } /* Comment.Hashbang */
.highlight .cm { color: #75715e } /* Comment.Multiline */
.highlight .cp { color: #75715e } /* Comment.Preproc */
.highlight .cpf { color: #75715e } /* Comment.PreprocFile */
.highlight .c1 { color: #75715e } /* Comment.Single */
.highlight .cs { color: #75715e } /* Comment.Special */
.highlight .gd { color: #f92672 } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .gi { color: #a6e22e } /* Generic.Inserted */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #75715e } /* Generic.Subheading */
.highlight .kc { color: #66d9ef } /* Keyword.Constant */
.highlight .kd { color: #66d9ef } /* Keyword.Declaration */
.highlight .kn { color: #f92672 } /* Keyword.Namespace */
.highlight .kp { color: #66d9ef } /* Keyword.Pseudo */
.highlight .kr { color: #66d9ef } /* Keyword.Reserved */
.highlight .kt { color: #66d9ef } /* Keyword.Type */
.highlight .ld { color: #e6db74 } /* Literal.Date */
.highlight .m { color: #ae81ff } /* Literal.Number */
.highlight .s { color: #e6db74 } /* Literal.String */
.highlight .na { color: #a6e22e } /* Name.Attribute */
.highlight .nb { color: #f8f8f2 } /* Name.Builtin */
.highlight .nc { color: #a6e22e } /* Name.Class */
.highlight .no { color: #66d9ef } /* Name.Constant */
.highlight .nd { color: #a6e22e } /* Name.Decorator */
.highlight .ni { color: #f8f8f2 } /* Name.Entity */
.highlight .ne { color: #a6e22e } /* Name.Exception */
.highlight .nf { color: #a6e22e } /* Name.Function */
.highlight .nl { color: #f8f8f2 } /* Name.Label */
.highlight .nn { color: #f8f8f2 } /* Name.Namespace */
.highlight .nx { color: #a6e22e } /* Name.Other */
.highlight .py { color: #f8f8f2 } /* Name.Property */
.highlight .nt { color: #f92672 } /* Name.Tag */
.highlight .nv { color: #f8f8f2 } /* Name.Variable */
.highlight .ow { color: #f92672 } /* Operator.Word */
.highlight .w { color: #f8f8f2 } /* Text.Whitespace */
.highlight .mb { color: #ae81ff } /* Literal.Number.Bin */
.highlight .mf { color: #ae81ff } /* Literal.Number.Float */
.highlight .mh { color: #ae81ff } /* Literal.Number.Hex */
.highlight .mi { color: #ae81ff } /* Literal.Number.Integer */
.highlight .mo { color: #ae81ff } /* Literal.Number.Oct */
.highlight .sa { color: #e6db74 } /* Literal.String.Affix */
.highlight .sb { color: #e6db74 } /* Literal.String.Backtick */
.highlight .sc { color: #e6db74 } /* Literal.String.Char */
.highlight .dl { color: #e6db74 } /* Literal.String.Delimiter */
.highlight .sd { color: #e6db74 } /* Literal.String.Doc */
.highlight .s2 { color: #e6db74 } /* Literal.String.Double */
.highlight .se { color: #ae81ff } /* Literal.String.Escape */
.highlight .sh { color: #e6db74 } /* Literal.String.Heredoc */
.highlight .si { color: #e6db74 } /* Literal.String.Interpol */
.highlight .sx { color: #e6db74 } /* Literal.String.Other */
.highlight .sr { color: #e6db74 } /* Literal.String.Regex */
.highlight .s1 { color: #e6db74 } /* Literal.String.Single */
.highlight .ss { color: #e6db74 } /* Literal.String.Symbol */
.highlight .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #a6e22e } /* Name.Function.Magic */
.highlight .vc { color: #f8f8f2 } /* Name.Variable.Class */
.highlight .vg { color: #f8f8f2 } /* Name.Variable.Global */
.highlight .vi { color: #f8f8f2 } /* Name.Variable.Instance */
.highlight .vm { color: #f8f8f2 } /* Name.Variable.Magic */
.highlight .il { color: #ae81ff } /* Literal.Number.Integer.Long */
}
/* Colours for the `.highlight` token classes, applied only when the user
   agent reports prefers-color-scheme: light.  The trailing comment on each
   rule names the token category the class stands for. */
@media (prefers-color-scheme: light) {
.highlight .hll { background-color: #ffffcc }
.highlight .c { color: #888888 } /* Comment */
.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
.highlight .k { color: #008800; font-weight: bold } /* Keyword */
.highlight .ch { color: #888888 } /* Comment.Hashbang */
.highlight .cm { color: #888888 } /* Comment.Multiline */
.highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
.highlight .cpf { color: #888888 } /* Comment.PreprocFile */
.highlight .c1 { color: #888888 } /* Comment.Single */
.highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */
.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .gr { color: #aa0000 } /* Generic.Error */
.highlight .gh { color: #333333 } /* Generic.Heading */
.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
.highlight .go { color: #888888 } /* Generic.Output */
.highlight .gp { color: #555555 } /* Generic.Prompt */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #666666 } /* Generic.Subheading */
.highlight .gt { color: #aa0000 } /* Generic.Traceback */
.highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #008800 } /* Keyword.Pseudo */
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */
.highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */
.highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */
.highlight .na { color: #336699 } /* Name.Attribute */
.highlight .nb { color: #003388 } /* Name.Builtin */
.highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */
.highlight .no { color: #003366; font-weight: bold } /* Name.Constant */
.highlight .nd { color: #555555 } /* Name.Decorator */
.highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */
.highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */
.highlight .nl { color: #336699; font-style: italic } /* Name.Label */
.highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */
.highlight .py { color: #336699; font-weight: bold } /* Name.Property */
.highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */
.highlight .nv { color: #336699 } /* Name.Variable */
.highlight .ow { color: #008800 } /* Operator.Word */
.highlight .w { color: #bbbbbb } /* Text.Whitespace */
.highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */
.highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */
.highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */
.highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */
.highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */
.highlight .sa { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Affix */
.highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */
.highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */
.highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */
.highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */
.highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */
.highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */
.highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */
.highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */
.highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */
.highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */
.highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */
.highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */
.highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */
.highlight .vc { color: #336699 } /* Name.Variable.Class */
.highlight .vg { color: #dd7700 } /* Name.Variable.Global */
.highlight .vi { color: #3333bb } /* Name.Variable.Instance */
.highlight .vm { color: #336699 } /* Name.Variable.Magic */
.highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */
}
/* Hey Emacs use -*- mode: C -*- */
/*
 * Copyright (c) 2015-2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

option version = "1.0.1";

/** \brief IPv6 SR LocalSID add/del request
    @param client_index - opaque cookie to identify the sender
    @param context - sender context, to match reply w/ request
    @param is_del Boolean of whether it's a delete instruction
    @param localsid_addr IPv6 address of the localsid
    @param end_psp Boolean of whether decapsulation is allowed in this function
    @param behavior Type of behavior (function) for this localsid
    @param sw_if_index Only for L2/L3 xconnect. OIF (outgoing interface).
     In the VRF variant it carries the fib_table instead.
    @param vlan_index Only for L2 xconnect. Outgoing VLAN tag.
    @param fib_table FIB table in which we should install the localsid entry
    @param nh_addr Next Hop IPv4/IPv6 address. Only for L2/L3 xconnect.
*/
autoreply define sr_localsid_add_del
{
  u32 client_index;
  u32 context;
  u8 is_del;
  u8 localsid_addr[16];
  u8 end_psp;
  u8 behavior;
  u32 sw_if_index;
  u32 vlan_index;
  u32 fib_table;
  u8 nh_addr[16];
};

/** \brief IPv6 SR policy add
    @param client_index - opaque cookie to identify the sender
    @param context - sender context, to match reply w/ request
    @param bsid_addr is the bindingSID of the SR Policy
    @param weight is the weight of the sid list. optional.
    @param is_encap is the behavior of the SR policy. (0.SRH insert // 1.Encapsulation)
    @param type is the type of the SR policy. (0.Default // 1.Spray)
    @param fib_table is the VRF where to install the FIB entry for the BSID
    @param n_segments presumably the number of IPv6 addresses carried in
     segments - TODO confirm against the message handler
    @param segments is a vector of IPv6 addresses composing the segment list
*/
autoreply define sr_policy_add
{
  u32 client_index;
  u32 context;
  u8 bsid_addr[16];
  u32 weight;
  u8 is_encap;
  u8 type;
  u32 fib_table;
  u8 n_segments;
  u8 segments[0];
};

/** \brief IPv6 SR policy modification
    @param client_index - opaque cookie to identify the sender
    @param context - sender context, to match reply w/ request
    @param bsid_addr is the bindingSID of the SR Policy
    @param sr_policy_index is the index of the SR policy
    @param fib_table is the VRF where to install the FIB entry for the BSID
    @param operation is the operation to perform (among the top ones)
    @param sl_index is the index of the Segment List to modify/delete
    @param weight is the weight of the sid list. optional.
    @param n_segments presumably the number of IPv6 addresses carried in
     segments - TODO confirm against the message handler
    @param segments is a vector of IPv6 addresses composing the segment list
    @param is_encap Mode. Encapsulation or SRH insertion.
     NOTE(review): documented here, but the message below has no is_encap
     field - confirm whether the field or this line should go.
*/
autoreply define sr_policy_mod
{
  u32 client_index;
  u32 context;
  u8 bsid_addr[16];
  u32 sr_policy_index;
  u32 fib_table;
  u8 operation;
  u32 sl_index;
  u32 weight;
  u8 n_segments;
  u8 segments[0];
};

/** \brief IPv6 SR policy deletion
    @param client_index - opaque cookie to identify the sender
    @param context - sender context, to match reply w/ request
    @param bsid_addr is the bindingSID of the SR Policy
    @param sr_policy_index is the index of the SR policy
*/
autoreply define sr_policy_del
{
  u32 client_index;
  u32 context;
  u8 bsid_addr[16];
  u32 sr_policy_index;
};

/** \brief IPv6 SR Set SRv6 encapsulation source
    @param client_index - opaque cookie to identify the sender
    @param context - sender context, to match reply w/ request
    @param encaps_source is the 16-byte address to use as the SRv6
     encapsulation source (presumably an IPv6 address - TODO confirm)
*/
autoreply define sr_set_encap_source
{
  u32 client_index;
  u32 context;
  u8 encaps_source[16];
};

/** \brief IPv6 SR steering add/del
    @param client_index - opaque cookie to identify the sender
    @param context - sender context, to match reply w/ request
    @param is_del presumably non-zero to delete the steering entry rather
     than add it - TODO confirm against the message handler
    @param bsid_addr is the bindingSID of the SR Policy (alt to sr_policy_index)
    @param sr_policy_index is the index of the SR Policy (alt to bsid_addr)
    @param table_id is the VRF where to install the FIB entry for the BSID
    @param prefix_addr is the IPv4/v6 address for L3 traffic type
    @param mask_width is the mask for L3 traffic type
    @param sw_if_index is the incoming interface for L2 traffic
    @param traffic_type describes the type of traffic
*/
autoreply define sr_steering_add_del
{
  u32 client_index;
  u32 context;
  u8 is_del;
  u8 bsid_addr[16];
  u32 sr_policy_index;
  u32 table_id;
  u8 prefix_addr[16];
  u32 mask_width;
  u32 sw_if_index;
  u8 traffic_type;
};

/** \brief Dump the list of SR LocalSIDs
    @param client_index - opaque cookie to identify the sender
    @param context - s
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>	/* for ASSERT */
#include <x86intrin.h>

/* 128 bit interleaves. */
/* Interleave the bytes of the upper 64-bit halves of a and b. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  __m128i va = (__m128i) a;
  __m128i vb = (__m128i) b;

  return (u8x16) _mm_unpackhi_epi8 (va, vb);
}

/* Interleave the bytes of the lower 64-bit halves of a and b. */
always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  __m128i va = (__m128i) a;
  __m128i vb = (__m128i) b;

  return (u8x16) _mm_unpacklo_epi8 (va, vb);
}

/* Interleave the 16-bit lanes of the upper 64-bit halves of a and b. */
always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  __m128i va = (__m128i) a;
  __m128i vb = (__m128i) b;

  return (u16x8) _mm_unpackhi_epi16 (va, vb);
}

/* Interleave the 16-bit lanes of the lower 64-bit halves of a and b. */
always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  __m128i va = (__m128i) a;
  __m128i vb = (__m128i) b;

  return (u16x8) _mm_unpacklo_epi16 (va, vb);
}

/* Interleave the 32-bit lanes of the upper 64-bit halves of a and b. */
always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  __m128i va = (__m128i) a;
  __m128i vb = (__m128i) b;

  return (u32x4) _mm_unpackhi_epi32 (va, vb);
}

/* Interleave the 32-bit lanes of the lower 64-bit halves of a and b. */
always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  __m128i va = (__m128i) a;
  __m128i vb = (__m128i) b;

  return (u32x4) _mm_unpacklo_epi32 (va, vb);
}

/* Pair up the upper 64-bit lanes of a and b. */
always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  __m128i va = (__m128i) a;
  __m128i vb = (__m128i) b;

  return (u64x2) _mm_unpackhi_epi64 (va, vb);
}

/* Pair up the lower 64-bit lanes of a and b. */
always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  __m128i va = (__m128i) a;
  __m128i vb = (__m128i) b;

  return (u64x2) _mm_unpacklo_epi64 (va, vb);
}

/* 64 bit interleaves. */
/* Interleave the upper bytes of two 64-bit vectors (MMX punpckhbw). */
always_inline u8x8
u8x8_interleave_hi (u8x8 a, u8x8 b)
{
  __m64 ma = (__m64) a;
  __m64 mb = (__m64) b;

  return (u8x8) _m_punpckhbw (ma, mb);
}

/* Interleave the lower bytes of two 64-bit vectors (MMX punpcklbw). */
always_inline u8x8
u8x8_interleave_lo (u8x8 a, u8x8 b)
{
  __m64 ma = (__m64) a;
  __m64 mb = (__m64) b;

  return (u8x8) _m_punpcklbw (ma, mb);
}

/* Interleave the upper 16-bit lanes of two 64-bit vectors (MMX punpckhwd). */
always_inline u16x4
u16x4_interleave_hi (u16x4 a, u16x4 b)
{
  __m64 ma = (__m64) a;
  __m64 mb = (__m64) b;

  return (u16x4) _m_punpckhwd (ma, mb);
}

/* Interleave the lower 16-bit lanes of two 64-bit vectors (MMX punpcklwd). */
always_inline u16x4
u16x4_interleave_lo (u16x4 a, u16x4 b)
{
  __m64 ma = (__m64) a;
  __m64 mb = (__m64) b;

  return (u16x4) _m_punpcklwd (ma, mb);
}

/* Pair up the upper 32-bit lanes of two 64-bit vectors (MMX punpckhdq). */
always_inline u32x2
u32x2_interleave_hi (u32x2 a, u32x2 b)
{
  __m64 ma = (__m64) a;
  __m64 mb = (__m64) b;

  return (u32x2) _m_punpckhdq (ma, mb);
}

/* Pair up the lower 32-bit lanes of two 64-bit vectors (MMX punpckldq). */
always_inline u32x2
u32x2_interleave_lo (u32x2 a, u32x2 b)
{
  __m64 ma = (__m64) a;
  __m64 mb = (__m64) b;

  return (u32x2) _m_punpckldq (ma, mb);
}

/* 128 bit packs. */
/* Narrow two 16-bit vectors into one byte vector with unsigned saturation;
   lo supplies the low eight result bytes, hi the high eight. */
always_inline u8x16
u16x8_pack (u16x8 lo, u16x8 hi)
{
  __m128i low_half = (__m128i) lo;
  __m128i high_half = (__m128i) hi;

  return (u8x16) _mm_packus_epi16 (low_half, high_half);
}

/* Narrow two signed 16-bit vectors into one signed byte vector with signed
   saturation; lo supplies the low eight result bytes, hi the high eight. */
always_inline i8x16
i16x8_pack (i16x8 lo, i16x8 hi)
{
  __m128i low_half = (__m128i) lo;
  __m128i high_half = (__m128i) hi;

  return (i8x16) _mm_packs_epi16 (low_half, high_half);
}

/* Narrow two 32-bit vectors into one u16x8; lo supplies the low four result
   lanes, hi the high four.

   Uses the unsigned-saturating pack (SSE4.1 _mm_packus_epi32) so that values
   in 0x8000..0xffff survive unchanged and larger values clamp to 0xffff, in
   line with u16x8_pack.  The previous signed-saturating _mm_packs_epi32
   clamped every value above 0x7fff to 0x7fff, which is wrong for an unsigned
   result.  The instruction still reads its inputs as signed 32-bit, so
   inputs with bit 31 set pack to 0. */
always_inline u16x8
u32x4_pack (u32x4 lo, u32x4 hi)
{
  return (u16x8) _mm_packus_epi32 ((__m128i) lo, (__m128i) hi);
}

/* 64 bit packs. */
/* Narrow two 64-bit vectors of 16-bit lanes into one byte vector with
   unsigned saturation (MMX packuswb). */
always_inline u8x8
u16x4_pack (u16x4 lo, u16x4 hi)
{
  __m64 low_half = (__m64) lo;
  __m64 high_half = (__m64) hi;

  return (u8x8) _m_packuswb (low_half, high_half);
}

/* Narrow two 64-bit vectors of signed 16-bit lanes into one signed byte
   vector with signed saturation (MMX packsswb). */
always_inline i8x8
i16x4_pack (i16x4 lo, i16x4 hi)
{
  __m64 low_half = (__m64) lo;
  __m64 high_half = (__m64) hi;

  return (i8x8) _m_packsswb (low_half, high_half);
}

/* Narrow two 64-bit vectors of 32-bit lanes into one vector of 16-bit lanes
   using MMX packssdw.  Note that packssdw saturates as signed 16-bit even
   though the result type here is unsigned. */
always_inline u16x4
u32x2_pack (u32x2 lo, u32x2 hi)
{
  __m64 low_half = (__m64) lo;
  __m64 high_half = (__m64) hi;

  return (u16x4) _m_packssdw (low_half, high_half);
}

/* Narrow two 64-bit vectors of signed 32-bit lanes into one vector of signed
   16-bit lanes with signed saturation (MMX packssdw). */
always_inline i16x4
i32x2_pack (i32x2 lo, i32x2 hi)
{
  __m64 low_half = (__m64) lo;
  __m64 high_half = (__m64) hi;

  return (i16x4) _m_packssdw (low_half, high_half);
}

#ifndef __ICC
/* Return x with its low 64 bits replaced by the 64-bit value read from a. */
always_inline u64x2
u64x2_read_lo (u64x2 x, u64 * a)
{
  __m64 *src = (__m64 *) a;
  __m128 merged = _mm_loadl_pi ((__m128) x, src);

  return (u64x2) merged;
}

/* Return x with its high 64 bits replaced by the 64-bit value read from a. */
always_inline u64x2
u64x2_read_hi (u64x2 x, u64 * a)
{
  __m64 *src = (__m64 *) a;
  __m128 merged = _mm_loadh_pi ((__m128) x, src);

  return (u64x2) merged;
}

/* Store the low 64 bits of x to the location a points at. */
always_inline void
u64x2_write_lo (u64x2 x, u64 * a)
{
  __m64 *dst = (__m64 *) a;

  _mm_storel_pi (dst, (__m128) x);
}

/* Store the high 64 bits of x to the location a points at. */
always_inline void
u64x2_write_hi (u64x2 x, u64 * a)
{
  __m64 *dst = (__m64 *) a;

  _mm_storeh_pi (dst, (__m128) x);
}
#endif

/* Unaligned loads/stores. */

#define _(t)						\
  always_inline void t##_store_unaligned (t x, t * a)	\
  { _mm_storeu_si128 ((__m128i *) a, (__m128i) x); }	\
  always_inline t t##_load_unaligned (t * a)		\
  { return (t) _mm_loadu_si128 ((__m128i *) a); }

_(u8x16) _(u16x8) _(u32x4) _(u64x2) _(i8x16) _(i16x8) _(i32x4) _(i64x2)
#undef _
/* Generate the unsigned and the signed flavour of a lane-wise binary
   operation on 128-bit vectors.

     n  lane width in bits
     m  number of lanes
     f  suffix of the generated function names (u<n>x<m>_<f>, i<n>x<m>_<f>)
     g  intrinsic stem; the unsigned function calls _mm_<g>_epu<n> and the
        signed one calls _mm_<g>_epi<n>

   Previously g carried the full "adds_epu"/"subs_epu" stem and both flavours
   pasted it verbatim, so the signed functions saturated as if their lanes
   were unsigned. */
#define _signed_binop(n,m,f,g)                                         \
  /* Unsigned */                                                       \
  always_inline u##n##x##m                                             \
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y)                        \
  { return (u##n##x##m) _mm_##g##_epu##n ((__m128i) x, (__m128i) y); } \
                                                                       \
  /* Signed */                                                         \
  always_inline i##n##x##m                                             \
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y)                        \
  { return (i##n##x##m) _mm_##g##_epi##n ((__m128i) x, (__m128i) y); }
/* Addition/subtraction with saturation. */
_signed_binop (8, 16, add_saturate, adds)
_signed_binop (16, 8, add_saturate, adds)
_signed_binop (8, 16, sub_saturate, subs)
_signed_binop (16, 8, sub_saturate, subs)
/* Multiplication. */

/* Lane-wise 16-bit multiply, keeping the low 16 bits of each product. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  __m128i vx = (__m128i) x;
  __m128i vy = (__m128i) y;

  return (i16x8) _mm_mullo_epi16 (vx, vy);
}

/* Lane-wise 16-bit multiply, keeping the low 16 bits of each product. */
always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  __m128i vx = (__m128i) x;
  __m128i vy = (__m128i) y;

  return (u16x8) _mm_mullo_epi16 (vx, vy);
}

/* Lane-wise signed 16-bit multiply, keeping the high 16 bits of each 32-bit
   product.

   Uses the signed intrinsic _mm_mulhi_epi16.  The previous code called the
   unsigned _mm_mulhi_epu16, which gives the wrong high half whenever either
   operand lane is negative. */
always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y);
}

/* Lane-wise unsigned 16-bit multiply, keeping the high 16 bits of each
   32-bit product. */
always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  __m128i vx = (__m128i) x;
  __m128i vy = (__m128i) y;

  return (u16x8) _mm_mulhi_epu16 (vx, vy);
}

/* 128 bit shifts. */

#define _(p,a,b,c,f)           \
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i)       \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); }                  \
                                                                               \
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll) _(i, 16, 8, right, sra) _(i, 32, 4, right, sra)
#undef _
/* 64 bit shifts.

   Each helper shifts every lane of x by the count held in the 64-bit vector
   i, using the matching MMX intrinsic.  Left shifts and unsigned right
   shifts are logical (psll / psrl); signed right shifts are arithmetic
   (psra).

   The stray ';' that followed every function body has been dropped: it
   formed an empty file-scope declaration, which ISO C11 does not permit and
   -Wpedantic warns about. */
always_inline u16x4
u16x4_shift_left (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_left (u32x2 x, u32x2 i)
{
  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline u16x4
u16x4_shift_right (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_right (u32x2 x, u32x2 i)
{
  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_left (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_left (i32x2 x, i32x2 i)
{
  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_right (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_right (i32x2 x, i32x2 i)
{
  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
}

/* Shift the whole 128-bit value a left/right by n bytes, filling with
   zeros.  n must be a compile-time constant for the underlying intrinsics.
   Arguments and the full expansion are parenthesized so the macros compose
   safely inside larger expressions and with expression arguments. */
#define u8x16_word_shift_left(a,n) \
  ((u8x16) _mm_slli_si128 ((__m128i) (a), (n)))
#define u8x16_word_shift_right(a,n) \
  ((u8x16) _mm_srli_si128 ((__m128i) (a), (n)))

/* Signed-byte flavour of the byte-wise whole-register shifts: reinterpret
   as u8x16, shift by n bytes, reinterpret the result as i8x16. */
#define i8x16_word_shift_left(a,n)				\
  ((i8x16) u8x16_word_shift_left ((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n)				\
  ((i8x16) u8x16_word_shift_right ((u8x16) (a), (n)))

/* Shift a vector of 16-bit lanes left/right by n whole lanes (n * 2 bytes),
   filling with zeros.  The signed variants return i16x8;
   i16x8_word_shift_left previously cast its result to u16x8, unlike
   i16x8_word_shift_right. */
#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

/* Shift a vector of 32-bit lanes left/right by n whole lanes (n * 4 bytes),
   filling with zeros.  The signed variants return i32x4;
   i32x4_word_shift_left previously cast its result to u32x4, unlike
   i32x4_word_shift_right. */
#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

/* Shift a vector of 64-bit lanes left/right by n whole lanes (n * 8 bytes),
   filling with zeros.  The signed variants return i64x2;
   i64x2_word_shift_left previously cast its result to u64x2, unlike
   i64x2_word_shift_right. */
#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))

/* SSE2 has no rotate instructions: use shifts to simulate them. */
/* Each invocation _(t, n, lr1, lr2) defines two functions on the vector
   type <t>x<n>:

     <t>x<n>_irotate_<lr1> (w, i)
       Integer rotate count.  Asserts 0 <= i <= BITS (t), then ORs
       w shifted in direction lr1 by i with w shifted in the opposite
       direction lr2 by BITS (t) - i.

     <t>x<n>_rotate_<lr1> (w, i)
       Vector rotate count.  Same construction, with the complementary
       count computed lane-wise as splat (BITS (t)) - i.

   NOTE(review): the vector-count form passes i and (j - i) straight to the
   <t>x<n>_shift_* helpers.  The SSE variable-shift instructions are
   documented as taking one count from the low 64 bits of the count operand,
   so a count replicated in every 16- or 32-bit lane may not behave as a
   per-lane count - confirm what callers are expected to pass in i. */
#define _(t,n,lr1,lr2)					\
  always_inline t##x##n					\
  t##x##n##_irotate_##lr1 (t##x##n w, int i)		\
  {							\
    ASSERT (i >= 0 && i <= BITS (t));			\
    return (t##x##n##_ishift_##lr1 (w, i)		\
	    | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
  }							\
							\
  always_inline t##x##n					\
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i)		\
  {							\
    t##x##n j = t##x##n##_splat (BITS (t));		\
    return (t##x##n##_shift_##lr1 (w, i)		\
	    | t##x##n##_shift_##lr2 (w, j - i));	\
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _

/* Whole-lane ("word") rotates, built from the *_word_shift_* macros.
   Each invocation _(t, n, lr1, lr2) defines, for the vector type <t>x<n>:

     <t>x<n>_word_rotate2_<lr1> (w0, w1, i)
       Asserts 0 <= i < m, where m is the lane count, then ORs w0 shifted
       by i lanes in direction lr1 with w1 shifted by m - i lanes in the
       opposite direction lr2.

     <t>x<n>_word_rotate_<lr1> (w0, i)
       The same with w1 == w0, i.e. a rotate of a single vector.

   The whole section is compiled out under clang.
   NOTE(review): presumably because the underlying byte-shift intrinsics
   need a compile-time constant count, and here i is only constant after
   inlining - confirm. */
#ifndef __clang__
#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i)	\
  {								\
    int m = sizeof (t##x##n) / sizeof (t);			\
    ASSERT (i >= 0 && i < m);					\
    return (t##x##n##_word_shift_##lr1 (w0, i)			\
	    | t##x##n##_word_shift_##lr2 (w1, m - i));		\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i)		\
  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }

_(u8, 16, left, right);
_(u8, 16, right, left);
_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
#endif

/* Return non-zero when every bit of x is clear (testz of x against itself). */
always_inline int
u8x16_is_all_zero (u8x16 x)
{
  __m128i bits = (__m128i) x;

  return _mm_testz_si128 (bits, bits);
}

/* Return non-zero when every bit of x is clear (testz of x against itself). */
always_inline int
u16x8_is_all_zero (u16x8 x)
{
  __m128i bits = (__m128i) x;

  return _mm_testz_si128 (bits, bits);
}

/* Return non-zero when every bit of x is clear (testz of x against itself). */
always_inline int
u32x4_is_all_zero (u32x4 x)
{
  __m128i bits = (__m128i) x;

  return _mm_testz_si128 (bits, bits);
}

/* Return non-zero when every bit of x is clear (testz of x against itself). */
always_inline int
u64x2_is_all_zero (u64x2 x)
{
  __m128i bits = (__m128i) x;

  return _mm_testz_si128 (bits, bits);
}

/* Permute the four 32-bit lanes of A according to the 8-bit immediate MASK:
   result lane k is A's lane number (MASK >> (2 * k)) & 3.  MASK must be a
   compile-time constant.

   Implemented with the _mm_shuffle_epi32 intrinsic (the same pshufd
   instruction) instead of an `asm volatile` statement, so the compiler can
   schedule it, fold constants and pick the instruction encoding. */
#define u32x4_select(A,MASK)						\
  ((u32x4) _mm_shuffle_epi32 ((__m128i) (A), (MASK)))

/* Replicate lane i of x into all four lanes: the lane number i is written
   into each of the four 2-bit selector fields of the u32x4_select mask. */
#define u32x4_splat_word(x,i)			\
  u32x4_select ((x), (((i) << 0)		\
		      | ((i) << 2)		\
		      | ((i) << 4)		\
		      | ((i) << 6)))

/* Extract low order 32 bit word. */
/* Return lane 0 (the low-order 32 bits) of x.

   Uses _mm_cvtsi128_si32, which is the same movd xmm -> r32 move the
   previous `asm volatile` emitted, but is visible to the optimizer. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  return (u32) _mm_cvtsi128_si32 ((__m128i) x);
}

/* Build a vector whose lane 0 is x and whose other lanes are zero.

   Uses _mm_cvtsi32_si128, which is the same movd r32 -> xmm move the
   previous `asm volatile` emitted, but is visible to the optimizer. */
always_inline u32x4
u32x4_set0 (u32 x)
{
  return (u32x4) _mm_cvtsi32_si128 ((int) x);
}

/* Signed wrapper around u32x4_set0: same bits, reinterpreted as i32x4. */
always_inline i32x4
i32x4_set0 (i32 x)
{
  u32x4 bits = u32x4_set0 ((u32) x);

  return (i32x4) bits;
}

/* Signed wrapper around u32x4_get0: lane 0 of x, converted to i32. */
always_inline i32
i32x4_get0 (i32x4 x)
{
  u32 low = u32x4_get0 ((u32x4) x);

  return (i32) low;
}

/* Converts all ones/zeros compare mask to bitmap. */
/* Collect the most significant bit of each of the 16 bytes of x into the
   low 16 bits of the result (bit k comes from byte k). */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{
  int msb_bits = _mm_movemask_epi8 ((__m128i) x);

  return msb_bits;
}

extern u8 u32x4_compare_word_mask_table[256];

/* Reduce the 16-bit per-byte mask of x through
   u32x4_compare_word_mask_table: each 8-bit half of the byte mask is looked
   up separately, and the entry for the upper half is shifted left by two
   before the two entries are ORed together. */
always_inline u32
u32x4_compare_word_mask (u32x4 x)
{
  u32 byte_bits = u8x16_compare_byte_mask ((u8x16) x);
  u32 lo_words = u32x4_compare_word_mask_table[byte_bits & 0xff];
  u32 hi_words = u32x4_compare_word_mask_table[(byte_bits >> 8) & 0xff];

  return lo_words | (hi_words << 2);
}

/* Compare x lane-wise against zero and return the per-byte mask of the
   comparison result. */
always_inline u32
u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zeros = { 0 };
  u8x16 eq = u8x16_is_equal (x, zeros);

  return u8x16_compare_byte_mask (eq);
}

/* Compare the 16-bit lanes of x against zero and return the per-byte mask
   of the comparison result. */
always_inline u32
u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zeros = { 0 };
  u8x16 eq = (u8x16) u16x8_is_equal (x, zeros);

  return u8x16_compare_byte_mask (eq);
}

/* Compare the 32-bit lanes of x against zero and return the per-byte mask
   of the comparison result. */
always_inline u32
u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zeros = { 0 };
  u8x16 eq = (u8x16) u32x4_is_equal (x, zeros);

  return u8x16_compare_byte_mask (eq);
}

/* Lane-wise maximum of two vectors of unsigned bytes. */
always_inline u8x16
u8x16_max (u8x16 x, u8x16 y)
{
  __m128i vx = (__m128i) x;
  __m128i vy = (__m128i) y;

  return (u8x16) _mm_max_epu8 (vx, vy);
}

/* Return the largest of the 16 unsigned bytes in x.  The vector is folded
   onto itself in halves (8, 4, 2, then 1 bytes) so that byte 0 ends up
   holding the overall maximum, which is then extracted. */
always_inline u32
u8x16_max_scalar (u8x16 x)
{
  u8x16 fold8 = u8x16_max (x, u8x16_word_shift_right (x, 8));
  u8x16 fold4 = u8x16_max (fold8, u8x16_word_shift_right (fold8, 4));
  u8x16 fold2 = u8x16_max (fold4, u8x16_word_shift_right (fold4, 2));
  u8x16 fold1 = u8x16_max (fold2, u8x16_word_shift_right (fold2, 1));

  /* Only byte 0 of the low 16-bit lane carries the answer. */
  return _mm_extract_epi16 ((__m128i) fold1, 0) & 0xff;
}

/* Lane-wise minimum of two vectors of unsigned bytes. */
always_inline u8x16
u8x16_min (u8x16 x, u8x16 y)
{
  __m128i vx = (__m128i) x;
  __m128i vy = (__m128i) y;

  return (u8x16) _mm_min_epu8 (vx, vy);
}

/* Fold x onto itself with u8x16_min after shifting right by 8, 4, 2 and
   1 bytes, then return byte 0 of the result.  Note that the byte shifts
   fill the vacated upper bytes with zeros. */
always_inline u8
u8x16_min_scalar (u8x16 x)
{
  u8x16 fold8 = u8x16_min (x, u8x16_word_shift_right (x, 8));
  u8x16 fold4 = u8x16_min (fold8, u8x16_word_shift_right (fold8, 4));
  u8x16 fold2 = u8x16_min (fold4, u8x16_word_shift_right (fold4, 2));
  u8x16 fold1 = u8x16_min (fold2, u8x16_word_shift_right (fold2, 1));

  /* Only byte 0 of the low 16-bit lane carries the answer. */
  return _mm_extract_epi16 ((__m128i) fold1, 0) & 0xff;
}

/* Lane-wise maximum of two vectors of signed 16-bit values. */
always_inline i16x8
i16x8_max (i16x8 x, i16x8 y)
{
  __m128i vx = (__m128i) x;
  __m128i vy = (__m128i) y;

  return (i16x8) _mm_max_epi16 (vx, vy);
}

/* Return the largest of the eight signed 16-bit lanes in x.  The vector is
   folded onto itself by 4, 2, then 1 lanes so that lane 0 ends up holding
   the overall maximum, which is then extracted. */
always_inline i16
i16x8_max_scalar (i16x8 x)
{
  i16x8 fold4 = i16x8_max (x, i16x8_word_shift_right (x, 4));
  i16x8 fold2 = i16x8_max (fold4, i16x8_word_shift_right (fold4, 2));
  i16x8 fold1 = i16x8_max (fold2, i16x8_word_shift_right (fold2, 1));

  return _mm_extract_epi16 ((__m128i) fold1, 0);
}

/* Lane-wise minimum of two vectors of signed 16-bit values. */
always_inline i16x8
i16x8_min (i16x8 x, i16x8 y)
{
  __m128i vx = (__m128i) x;
  __m128i vy = (__m128i) y;

  return (i16x8) _mm_min_epi16 (vx, vy);
}

/* Fold x onto itself with i16x8_min after shifting right by 4, 2 and
   1 lanes, then return lane 0 of the result.  Note that the lane shifts
   fill the vacated upper lanes with zeros. */
always_inline i16
i16x8_min_scalar (i16x8 x)
{
  i16x8 fold4 = i16x8_min (x, i16x8_word_shift_right (x, 4));
  i16x8 fold2 = i16x8_min (fold4, i16x8_word_shift_right (fold4, 2));
  i16x8 fold1 = i16x8_min (fold2, i16x8_word_shift_right (fold2, 1));

  return _mm_extract_epi16 ((__m128i) fold1, 0);
}

#undef _signed_binop

#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */