aboutsummaryrefslogtreecommitdiffstats
path: root/src/vppinfra/vector_neon.h
blob: 3ed783602969bc410cc50ab0da208377cdd220ed (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_neon_h
#define included_vector_neon_h
#include <arm_neon.h>

/* Arithmetic */
/*
 * Lane-wise saturating subtraction of eight 16-bit lanes.
 * The vqsubq_* intrinsics clamp the result to the lane type's range
 * (0 for the unsigned form, INT16_MIN/INT16_MAX for the signed form);
 * the plain vsubq_* intrinsics wrap modulo 2^16 and therefore do not
 * provide the behaviour these macro names promise.
 */
#define u16x8_sub_saturate(a,b) vqsubq_u16(a,b)
#define i16x8_sub_saturate(a,b) vqsubq_s16(a,b)

/*
 * Return 1 when all sixteen 8-bit lanes of x are zero, 0 otherwise.
 *
 * The across-lanes maximum is used instead of the across-lanes sum:
 * the sum is truncated to 8 bits, so non-zero lanes can add up to
 * zero (e.g. sixteen lanes of 0x10) and be misreported as all-zero.
 */
always_inline int
u8x16_is_all_zero (u8x16 x)
{
  return vmaxvq_u8 (x) == 0;
}

/*
 * Return 1 when all eight 16-bit lanes of x are zero, 0 otherwise.
 *
 * The across-lanes maximum is used instead of the across-lanes sum:
 * the sum is truncated to 16 bits, so non-zero lanes can add up to
 * zero (e.g. eight lanes of 0x2000) and be misreported as all-zero.
 */
always_inline int
u16x8_is_all_zero (u16x8 x)
{
  return vmaxvq_u16 (x) == 0;
}

/*
 * Return 1 when all four 32-bit lanes of x are zero, 0 otherwise.
 *
 * The across-lanes maximum is used instead of the across-lanes sum:
 * the sum is truncated to 32 bits, so non-zero lanes can add up to
 * zero (e.g. four lanes of 0x40000000) and be misreported as all-zero.
 */
always_inline int
u32x4_is_all_zero (u32x4 x)
{
  return vmaxvq_u32 (x) == 0;
}

/*
 * Return 1 when both 64-bit lanes of x are zero, 0 otherwise.
 *
 * Summing the two lanes wraps modulo 2^64 (two lanes of
 * 0x8000000000000000 add up to zero), so the vector is instead viewed
 * as four 32-bit lanes and their maximum is tested: the register is
 * all-zero exactly when that maximum is zero.
 */
always_inline int
u64x2_is_all_zero (u64x2 x)
{
  return vmaxvq_u32 (vreinterpretq_u32_u64 (x)) == 0;
}

/*
 * Converts all ones/zeros compare mask to bitmap.
 * Bit i of the result is the most significant bit of byte lane i of x.
 */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{
  /* Per-lane shift counts for vshlq_u8; a negative count shifts right,
     so bit 7 of lane i ends up at bit position (i % 8). */
  int8x16_t lane_shift =
    { -7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0 };
  u32 low_half, high_half;

  /* keep only the MSB of every byte and move it to its bitmap position */
  x = vshlq_u8 (vandq_u8 (x, vdupq_n_u8 (0x80)), lane_shift);

  /* three rounds of pairwise adds fold each group of 8 lanes into one:
     lane 0 then holds the bitmap of bytes 0..7, lane 1 of bytes 8..15 */
  x = vpaddq_u8 (x, x);
  x = vpaddq_u8 (x, x);
  x = vpaddq_u8 (x, x);

  low_half = vgetq_lane_u8 (x, 0);
  high_half = vgetq_lane_u8 (x, 1);
  return low_half | (high_half << 8);
}

/*
 * Build a 16-bit byte bitmap of the zero 16-bit lanes of input:
 * for every 16-bit lane equal to zero, the two bits corresponding to
 * that lane's two byte positions are set in the result.
 */
always_inline u32
u16x8_zero_byte_mask (u16x8 input)
{
  /* one distinct bit per byte position inside each 8-byte half */
  u8x16 position_bit = { 0x01, 0x02, 0x04, 0x08,
    0x10, 0x20, 0x40, 0x80,
    0x01, 0x02, 0x04, 0x08,
    0x10, 0x20, 0x40, 0x80
  };

  /* every byte of a 16-bit lane is 0xff when that lane is zero, else 0x00 */
  u8x16 lane_is_zero =
    vreinterpretq_u8_u16 (vceqq_u16 (input, vdupq_n_u16 (0)));

  /* min (0xff, bit) keeps the position bit, min (0x00, bit) clears it */
  u8x16 selected = vminq_u8 (lane_is_zero, position_bit);

  /* widening pairwise adds: 16 x u8 -> 8 x u16 -> 4 x u32 -> 2 x u64,
     leaving the bitmap of bytes 0..7 in lane 0 and of bytes 8..15 in lane 1 */
  u64x2 halves = vpaddlq_u32 (vpaddlq_u16 (vpaddlq_u8 (selected)));

  uint64_t low_half = vgetq_lane_u64 (halves, 0);
  uint64_t high_half = vgetq_lane_u64 (halves, 1);

  return (u32) (high_half << 8) + low_half;
}

/*
 * Return a 16-bit bitmap in which bit i is set when byte lane i of
 * input is zero.
 *
 * The comparison is done per byte. Reusing the 16-bit-lane variant
 * would only report a byte when its neighbour in the same 16-bit lane
 * is zero as well, so an isolated zero byte would be missed.
 * The all-ones/all-zeros compare result is turned into a bitmap by
 * u8x16_compare_byte_mask (), defined earlier in this file.
 */
always_inline u32
u8x16_zero_byte_mask (u8x16 input)
{
  return u8x16_compare_byte_mask (vceqq_u8 (input, vdupq_n_u8 (0)));
}

/*
 * Return a 16-bit byte bitmap of the zero 32-bit lanes of input: the
 * four bits covering a lane's bytes are set when that whole lane is zero.
 *
 * The comparison is done per 32-bit lane. Reusing the 16-bit-lane
 * variant would flag half of a non-zero lane whenever one of its
 * 16-bit halves happens to be zero (e.g. 0x00010000).
 * The all-ones/all-zeros compare result is turned into a bitmap by
 * u8x16_compare_byte_mask (), defined earlier in this file.
 */
always_inline u32
u32x4_zero_byte_mask (u32x4 input)
{
  return u8x16_compare_byte_mask (vreinterpretq_u8_u32
				  (vceqq_u32 (input, vdupq_n_u32 (0))));
}

/*
 * Return a 16-bit byte bitmap of the zero 64-bit lanes of input: the
 * eight bits covering a lane's bytes are set when that whole lane is zero.
 *
 * The comparison is done per 64-bit lane. Reusing the 16-bit-lane
 * variant would flag parts of a non-zero lane whenever one of its
 * 16-bit quarters happens to be zero.
 * The all-ones/all-zeros compare result is turned into a bitmap by
 * u8x16_compare_byte_mask (), defined earlier in this file.
 */
always_inline u32
u64x2_zero_byte_mask (u64x2 input)
{
  return u8x16_compare_byte_mask (vreinterpretq_u8_u64
				  (vceqq_u64 (input, vdupq_n_u64 (0))));
}



#endif /* included_vector_neon_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */
/* NOTE(review): the text below starts in the middle of a function (the
   build_chain () called by test_chain () further down, judging by the
   *b_ / *bi_ outputs); its signature, local declarations and the beginning
   of its first statement are not visible here. */
RAY_LEN (bufs));

  /* allocate n buffers; on a short allocation give back what was obtained */
  n_alloc = vlib_buffer_alloc (vm, bi, n);
  if (n_alloc != n)
    {
      vlib_buffer_free (vm, bi, n_alloc);
      return 0;
    }

  vlib_get_buffers (vm, bis, bufs, n);

  /* link each buffer to the following index and apply the per-buffer
     template (current_data, current_length, ref_count) */
  while (n > 0)
    {
      b[0]->next_buffer = bi[1];
      b[0]->flags |= VLIB_BUFFER_NEXT_PRESENT;
      b[0]->current_data = tmpl->current_data;
      b[0]->current_length = tmpl->current_length;
      /* a template ref_count of 0xff is replaced by 1 */
      b[0]->ref_count = 0xff == tmpl->ref_count ? 1 : tmpl->ref_count;

      if (rand)
	{
	  const u16 len = b[0]->current_length;
	  if (len)
	    {
	      /* append len random bytes to *rand, then copy those same
	         bytes into the buffer's current data */
	      vec_add (*rand, clib_random_buffer_get_data (randbuf, len), len);
	      void *dst = vlib_buffer_get_current (b[0]);
	      const void *src = vec_elt_at_index (*rand, vec_len (*rand) - len);
	      clib_memcpy_fast (dst, src, len);
	    }
	}

      b++;
      bi++;
      tmpl++;
      n--;
    }

  /* the last buffer processed terminates the chain */
  b[-1]->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  /* hand back the head of the chain: first buffer pointer and index */
  *b_ = bufs[0];
  *bi_ = bis[0];
  return 1;
}

/*
 * Compare the data held by buffer chain b against the reference byte
 * vector rand.
 * Returns 1 when every segment matches rand at its running offset, the
 * summed segment lengths equal vec_len (rand), and that sum equals
 * vlib_buffer_length_in_chain (); returns 0 otherwise.
 */
static int
check_chain (vlib_main_t *vm, vlib_buffer_t *b, const u8 *rand)
{
  int len_chain = vlib_buffer_length_in_chain (vm, b);
  int len;

  /* check for data corruption */
  if (clib_memcmp (vlib_buffer_get_current (b), vec_elt_at_index (rand, 0),
		   b->current_length))
    return 0;
  len = b->current_length;
  while (b->flags & VLIB_BUFFER_NEXT_PRESENT)
    {
      b = vlib_get_buffer (vm, b->next_buffer);
      if (clib_memcmp (vlib_buffer_get_current (b),
		       vec_elt_at_index (rand, len), b->current_length))
	return 0;
      len += b->current_length;
    }

  /* check for data truncation */
  if (len != vec_len (rand))
    return 0;

  /* check total length update is correct */
  if (len != len_chain)
    return 0;

  return 1;
}

/*
 * Build a chain of n buffers from tmpl, optionally clone it (when
 * clone_off is non-zero), linearize it and verify its contents with
 * check_chain ().
 * Returns the value returned by vlib_buffer_chain_linearize () when all
 * steps succeed, 0 otherwise. The buffers at bi[0] (and bi[1] when a clone
 * was made) are freed before returning on the paths that obtained them.
 */
static int
test_chain (vlib_main_t *vm, const chained_buffer_template_t *tmpl,
	    const u32 n, const int clone_off, clib_random_buffer_t *randbuf,
	    u8 **rand)
{
  vlib_buffer_t *b;
  u32 bi[2];
  int ret = 0;

  if (!build_chain (vm, tmpl, n, randbuf, rand, &b, bi))
    goto err0;

  if (clone_off)
    {
      /* request 2 clones; bi[] is overwritten with the clone indices and
         the test continues on the first one */
      if (2 != vlib_buffer_clone (vm, bi[0], bi, 2, clone_off))
	goto err1;
      b = vlib_get_buffer (vm, bi[0]);
    }

  if (!(ret = vlib_buffer_chain_linearize (vm, b)))
    goto err2;

  if (!check_chain (vm, b, *rand))
    {
      ret = 0;
      goto err2;
    }

  /* success falls through the cleanup labels below with ret preserved */
err2:
  if (clone_off)
    vlib_buffer_free_one (vm, bi[1]);
err1:
  vlib_buffer_free_one (vm, bi[0]);
err0:
  return ret;
}

/*
 * Run the linearization test cases: four fixed chain layouts followed by
 * 100 randomly generated ones.
 * Returns 1 when the end of the sequence is reached, 0 otherwise.
 * NOTE(review): the TEST () macro is not visible here; it presumably jumps
 * to the err label when its condition is false - confirm.
 */
static int
linearize_test (vlib_main_t *vm)
{
  chained_buffer_template_t tmpl[VLIB_BUFFER_LINEARIZE_MAX];
  clib_random_buffer_t randbuf;
  u32 data_size = vlib_buffer_get_default_data_size (vm);
  u8 *rand = 0;
  int ret = 0;
  int i;

  clib_random_buffer_init (&randbuf, 0);

  /* 0xff-filled templates: build_chain () maps ref_count 0xff to 1 */
  clib_memset (tmpl, 0xff, sizeof (tmpl));
  for (i = 0; i < 2; i++)
    {
      tmpl[i].current_data = -14;
      tmpl[i].current_length = 14 + data_size;
    }
  TEST (2 == test_chain (vm, tmpl, 2, 0, &randbuf, &rand),
	"linearize chain with negative current data");

  clib_memset (tmpl, 0xff, sizeof (tmpl));
  tmpl[0].current_data = 12;
  tmpl[0].current_length = data_size - 12;
  tmpl[1].current_data = 0;
  tmpl[1].current_length = 0;
  TEST (1 == test_chain (vm, tmpl, 2, 0, &randbuf, &rand),
	"linearize chain with empty next");

  clib_memset (tmpl, 0xff, sizeof (tmpl));
  tmpl[0].current_data = 0;
  tmpl[0].current_length = data_size - 17;
  tmpl[1].current_data = -5;
  tmpl[1].current_length = 3;
  tmpl[2].current_data = 17;
  tmpl[2].current_length = 9;
  tmpl[3].current_data = 3;
  tmpl[3].current_length = 5;
  TEST (1 == test_chain (vm, tmpl, 4, 0, &randbuf, &rand),
	"linearize chain into a single buffer");

  clib_memset (tmpl, 0xff, sizeof (tmpl));
  tmpl[0].current_data = 0;
  tmpl[0].current_length = data_size - 2;
  tmpl[1].current_data = -VLIB_BUFFER_PRE_DATA_SIZE;
  tmpl[1].current_length = 20;
  tmpl[2].current_data = data_size - 10;
  tmpl[2].current_length = 10;
  tmpl[3].current_data = 0;
  tmpl[3].current_length = data_size;
  TEST (2 == test_chain (vm, tmpl, 4, data_size - 1, &randbuf, &rand),
	"linearize cloned chain");

  clib_memset (tmpl, 0xff, sizeof (tmpl));
  for (i = 0; i < 100; i++)
    {
      /* chain length: one random byte reduced modulo the template array
         size, with a minimum of 1 */
      u8 *r = clib_random_buffer_get_data (&randbuf, 1);
      int n = clib_max (r[0] % ARRAY_LEN (tmpl), 1);
      int j;
      for (j = 0; j < n; j++)
	{
	  /* 3 random bytes per buffer: byte 0 picks current_data, bytes 1-2
	     (read as a u16) pick current_length */
	  r = clib_random_buffer_get_data (&randbuf, 3);
	  i16 current_data = (i16) r[0] - VLIB_BUFFER_PRE_DATA_SIZE;
	  u16 current_length = *(u16 *) (r + 1) % (data_size - current_data);
	  tmpl[j].current_data = current_data;
	  tmpl[j].current_length = current_length;
	}
      /* a clone offset (r[0] % 128) is used only when the random byte
         exceeds 250; otherwise the chain is not cloned */
      r = clib_random_buffer_get_data (&randbuf, 1);
      TEST (
	test_chain (vm, tmpl, n, r[0] > 250 ? r[0] % 128 : 0, &randbuf, &rand),
	"linearize random chain %d", i);
    }

  ret = 1;
err:
  clib_random_buffer_free (&randbuf);
  vec_free (rand);
  return ret;
}

/*
 * CLI handler: runs linearize_test () and returns an error when it
 * reports failure, 0 otherwise. input and cmd are not used.
 */
static clib_error_t *
test_linearize_fn (vlib_main_t * vm, unformat_input_t * input,
		   vlib_cli_command_t * cmd)
{

  if (!linearize_test (vm))
    {
      return clib_error_return (0, "linearize test failed");
    }

  return 0;
}

/* Registers test_linearize_fn under the CLI path given below. */
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (test_linearize_command, static) =
{
  .path = "test chained-buffer-linearization",
  .short_help = "test chained-buffer-linearization",
  .function = test_linearize_fn,
};
/* *INDENT-ON* */

/*
 * CLI handler: measures vlib_buffer_chain_linearize () on a fixed
 * 5-buffer chain. Runs 10 rounds of 100000 calls each and prints the
 * average clib_cpu_time_now () delta per call for every round.
 * Returns an error if a chain cannot be built, 0 otherwise.
 */
static clib_error_t *
test_linearize_speed_fn (vlib_main_t *vm, unformat_input_t *input,
			 vlib_cli_command_t *cmd)
{
  /* typical 9000-bytes TCP jumbo frames */
  /* NOTE(review): initializer order presumably is
     { current_data, current_length, ref_count } - the struct definition
     is not visible here, confirm. */
  const chained_buffer_template_t tmpl[5] = { { 14, 2034, 1 },
					      { 0, 2048, 1 },
					      { 0, 2048, 1 },
					      { 0, 2048, 1 },
					      { 0, 808, 1 } };
  int i, j;

  for (i = 0; i < 10; i++)
    {
      u64 tot = 0;
      for (j = 0; j < 100000; j++)
	{
	  vlib_buffer_t *b;
	  u32 bi;

	  /* no random buffer / reference vector: buffers are not filled */
	  if (!build_chain (vm, tmpl, 5, 0, 0, &b, &bi))
	    return clib_error_create ("build_chain() failed");

	  /* compiler barriers keep the timed call from being reordered
	     across the two timestamp reads */
	  CLIB_COMPILER_BARRIER ();
	  u64 start = clib_cpu_time_now ();
	  CLIB_COMPILER_BARRIER ();

	  vlib_buffer_chain_linearize (vm, b);

	  CLIB_COMPILER_BARRIER ();
	  tot += clib_cpu_time_now () - start;
	  CLIB_COMPILER_BARRIER ();

	  vlib_buffer_free_one (vm, bi);
	}
      /* j equals the number of timed calls (100000) at this point */
      vlib_cli_output (vm, "%.03f ticks/call", (f64) tot / j);
    }

  return 0;
}

/* Registers test_linearize_speed_fn under the CLI path given below. */
VLIB_CLI_COMMAND (test_linearize_speed_command, static) =
{
  .path = "test chained-buffer-linearization speed",
  .short_help = "test chained-buffer-linearization speed",
  .function = test_linearize_speed_fn,
};

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */