path: root/test/test/test_barrier.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2018 Intel Corporation
 */

/*
 * This is a simple functional test for the rte_smp_mb() implementation.
 * I.e. it makes sure that LOAD and STORE operations that precede the
 * rte_smp_mb() call are globally visible across the lcores
 * before the LOAD and STORE operations that follow it.
 * The test uses a simple implementation of Peterson's lock algorithm
 * (https://en.wikipedia.org/wiki/Peterson%27s_algorithm)
 * for two execution units to make sure that rte_smp_mb() prevents
 * store-load reordering from happening.
 * Also, when executed on a single lcore, it can be used as an approximate
 * estimate of the number of cycles a particular implementation of
 * rte_smp_mb() will take.
 */
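
/*
 * A sketch of the failure mode the barrier prevents: without a full
 * store-load barrier between the writes to flag[]/victim and the read of
 * flag[other], each lcore's store may still sit in its local store buffer
 * while the subsequent load is satisfied with a stale value:
 *
 *	lcore 0				lcore 1
 *	flag[0] = 1 (buffered)		flag[1] = 1 (buffered)
 *	victim = 0			victim = 1
 *	read flag[1] -> 0 (stale)	read flag[0] -> 0 (stale)
 *	enter critical section		enter critical section
 *
 * Both lcores would then update the protected counters concurrently,
 * which the validation phase below detects as a mismatch between the
 * shared and local sums.
 */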

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>

#include <rte_memory.h>
#include <rte_per_lcore.h>
#include <rte_launch.h>
#include <rte_atomic.h>
#include <rte_eal.h>
#include <rte_lcore.h>
#include <rte_pause.h>
#include <rte_random.h>
#include <rte_cycles.h>
#include <rte_vect.h>
#include <rte_debug.h>

#include "test.h"

/* upper bound for the per-iteration counter increment */
#define ADD_MAX		8
/* number of lock/update/unlock iterations per lcore */
#define ITER_MAX	0x1000000

enum plock_use_type {
	USE_MB,
	USE_SMP_MB,
	USE_NUM
};

struct plock {
	volatile uint32_t flag[2];	/* per-party intent to enter */
	volatile uint32_t victim;	/* which party yields on contention */
	enum plock_use_type utype;	/* which barrier implementation to use */
};

/*
 * A lock plus the two counters it protects.
 */
struct plock_test {
	struct plock lock;
	uint32_t val;
	uint32_t iter;
};

/*
 * Each active lcore shares a plock_test struct with its left and right
 * neighbours.
 */
struct lcore_plock_test {
	struct plock_test *pt[2]; /* shared, lock-protected data */
	uint32_t sum[2];          /* local copy of the shared data */
	uint32_t iter;            /* number of iterations to perform */
	uint32_t lc;              /* given lcore id */
};

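/*
 * Emit the store-load barrier selected at init time: either the full
 * memory barrier rte_mb(), or rte_smp_mb(), whose implementation this
 * test exercises.
 */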
static inline void
store_load_barrier(uint32_t utype)
{
	if (utype == USE_MB)
		rte_mb();
	else if (utype == USE_SMP_MB)
		rte_smp_mb();
	else
		RTE_VERIFY(0);
}

/*
 * Peterson lock implementation.
 */
static void
plock_lock(struct plock *l, uint32_t self)
{
	uint32_t other;

	other = self ^ 1;

	/* announce intent and yield priority to the other party */
	l->flag[self] = 1;
	l->victim = self;

	/* make the stores above globally visible before reading flag[other] */
	store_load_barrier(l->utype);

	/* wait while the other party holds or contends for the lock */
	while (l->flag[other] == 1 && l->victim == self)
		rte_pause();
}

static void
plock_unlock(struct plock *l, uint32_t self)
{
	/* make critical-section stores visible before releasing the lock */
	rte_smp_wmb();
	l->flag[self] = 0;
}

static void
plock_reset(struct plock *l, enum plock_use_type utype)
{
	memset(l, 0, sizeof(*l));
	l->utype = utype;
}

/*
 * grab the lock, update both counters, release the lock.
 */
static void
plock_add(struct plock_test *pt, uint32_t self, uint32_t n)
{
	plock_lock(&pt->lock, self);
	pt->iter++;
	pt->val += n;
	plock_unlock(&pt->lock, self);
}

static int
plock_test1_lcore(void *data)
{
	uint64_t tm;
	uint32_t i, lc, ln, n;
	struct lcore_plock_test *lpt;

	lpt = data;
	lc = rte_lcore_id();

	/* find lcore_plock_test struct for given lcore */
	for (ln = rte_lcore_count(); ln != 0 && lpt->lc != lc; lpt++, ln--)
		;

	if (ln == 0) {
		printf("%s(%u) error at init\n", __func__, lc);
		return -1;
	}

	n = rte_rand() % ADD_MAX;
	tm = rte_get_timer_cycles();

	/*
	 * for each iteration:
	 * - update the shared, lock-protected data in a safe manner
	 * - update the local copy of the shared data
	 */
	for (i = 0; i != lpt->iter; i++) {

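		/*
		 * each shared struct is used by exactly two neighbouring
		 * lcores: this lcore acts as party 0 on its pt[0] and as
		 * party 1 on its pt[1], matching Peterson's two-party lock
		 */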
		plock_add(lpt->pt[0], 0, n);
		plock_add(lpt->pt[1], 1, n);

		lpt->sum[0] += n;
		lpt->sum[1] += n;

		n = (n + 1) % ADD_MAX;
	}

	tm = rte_get_timer_cycles() - tm;

	printf("%s(%u): %u iterations finished, in %" PRIu64
		" cycles, %#Lf cycles/iteration, "
		"local sum={%u, %u}\n",
		__func__, lc, i, tm, (long double)tm / i,
		lpt->sum[0], lpt->sum[1]);
	return 0;
}

/*
 * For N active lcores we allocate N+1 lcore_plock_test structures.
 * Each active lcore shares one lcore_plock_test structure with its
 * left lcore neighbor and one lcore_plock_test structure with its
 * right lcore neighbor.
 * During the test each lcore updates data in both shared structures and
 * in its local copies. Then, in the validation phase, we check that the
 * shared and local data match.
 */
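/*
 * For example, with 3 active lcores the ring looks like:
 * lpt[0] uses {pt[0], pt[1]}, lpt[1] uses {pt[1], pt[2]},
 * lpt[2] uses {pt[2], pt[0]} (wrap-around).
 */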
static int
plock_test(uint32_t iter, enum plock_use_type utype)
{
	int32_t rc;
	uint32_t i, lc, n;
	uint32_t *sum;
	struct plock_test *pt;
	struct lcore_plock_test *lpt;

	/* init phase, allocate and initialize shared data */

	n = rte_lcore_count();
	pt = calloc(n + 1, sizeof(*pt));
	lpt = calloc(n, sizeof(*lpt));
	sum = calloc(n + 1, sizeof(*sum));

	printf("%s(iter=%u, utype=%u) started on %u lcores\n",
		__func__, iter, utype, n);

	if (pt == NULL || lpt == NULL || sum == NULL) {
		printf("%s: failed to allocate memory for %u lcores\n",
			__func__, n);
		free(pt);
		free(lpt);
		free(sum);
		return -ENOMEM;
	}

	for (i = 0; i != n + 1; i++)
		plock_reset(&pt[i].lock, utype);

	i = 0;
	RTE_LCORE_FOREACH(lc) {

		lpt[i].lc = lc;
		lpt[i].iter = iter;
		lpt[i].pt[0] = pt + i;
		lpt[i].pt[1] = pt + i + 1;
		i++;
	}

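	/* close the ring: the last lcore shares pt[0] with the first one */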
	lpt[i - 1].pt[1] = pt;

	for (i = 0; i != n; i++)
		printf("lpt[%u]={lc=%u, pt={%p, %p},};\n",
			i, lpt[i].lc, lpt[i].pt[0], lpt[i].pt[1]);
	/* test phase - start and wait for completion on each active lcore */

	rte_eal_mp_remote_launch(plock_test1_lcore, lpt, CALL_MASTER);
	rte_eal_mp_wait_lcore();

	/* validation phase - make sure that shared and local data match */

	for (i = 0; i != n; i++) {
		sum[i] += lpt[i].sum[0];
		sum[i + 1] += lpt[i].sum[1];
	}

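	/*
	 * lpt[n - 1].pt[1] wraps around to pt[0], so fold its expected
	 * sum (accumulated in sum[n]; note that i == n here) into sum[0].
	 */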
	sum[0] += sum[i];

	rc = 0;
	for (i = 0; i != n; i++) {
		printf("%s: sum[%u]=%u, pt[%u].val=%u, pt[%u].iter=%u;\n",
			__func__, i, sum[i], i, pt[i].val, i, pt[i].iter);

		/* race condition occurred, lock doesn't work properly */
		if (sum[i] != pt[i].val || 2 * iter != pt[i].iter) {
			printf("error: local and shared sums don't much\n");
			rc = -1;
		}
	}

	free(pt);
	free(lpt);
	free(sum);

	printf("%s(utype=%u) returns %d\n", __func__, utype, rc);
	return rc;
}

static int
test_barrier(void)
{
	int32_t i, ret, rc[USE_NUM];

	for (i = 0; i != RTE_DIM(rc); i++)
		rc[i] = plock_test(ITER_MAX, i);

	ret = 0;
	for (i = 0; i != RTE_DIM(rc); i++) {
		printf("%s for utype=%d %s\n",
			__func__, i, rc[i] == 0 ? "passed" : "failed");
		ret |= rc[i];
	}

	return ret;
}

REGISTER_TEST_COMMAND(barrier_autotest, test_barrier);