/*
* Copyright (c) 2017 Cisco and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @file
* @brief NAT64 DB
*/
#include <nat/nat64_db.h>
#include <nat/nat_ipfix_logging.h>
#include <nat/nat_inlines.h>
#include <nat/nat_syslog.h>
#include <vnet/fib/fib_table.h>
int
nat64_db_init (nat64_db_t * db, u32 bib_buckets, u32 bib_memory_size,
u32 st_buckets, u32 st_memory_size,
nat64_db_free_addr_port_function_t free_addr_port_cb)
{
clib_bihash_init_24_8 (&db->bib.in2out, "bib-in2out", bib_buckets,
bib_memory_size);
clib_bihash_init_24_8 (&db->bib.out2in, "bib-out2in", bib_buckets,
bib_memory_size);
clib_bihash_init_48_8 (&db->st.in2out, "st-in2out", st_buckets,
st_memory_size);
clib_bihash_init_48_8 (&db->st.out2in, "st-out2in", st_buckets,
st_memory_size);
db->free_addr_port_cb = free_addr_port_cb;
db->bib.limit = 10 * bib_buckets;
db->bib.bib_entries_num = 0;
db->st.limit = 10 * st_buckets;
db->st.st_entries_num = 0;
db->addr_free = 0;
return 0;
}
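/* Sizing sketch (illustrative only; the call below is hypothetical): the
 * entry limits are derived from the bucket counts, so e.g.
 *
 *   nat64_db_init (db, 1024, 128 << 20, 2048, 256 << 20, free_addr_port_cb);
 *
 * caps the BIB at 10 * 1024 = 10240 entries and the session table at
 * 10 * 2048 = 20480 entries. The memory sizes bound only the bihash heaps;
 * the entry pools grow on demand up to these limits. */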
nat64_db_bib_entry_t *
nat64_db_bib_entry_create (nat64_db_t * db, ip6_address_t * in_addr,
ip4_address_t * out_addr, u16 in_port,
u16 out_port, u32 fib_index, u8 proto,
u8 is_static)
{
nat64_db_bib_entry_t *bibe;
nat64_db_bib_entry_key_t bibe_key;
clib_bihash_kv_24_8_t kv;
fib_table_t *fib;
if (db->bib.bib_entries_num >= db->bib.limit)
{
db->free_addr_port_cb (db, out_addr, out_port, proto);
nat_ipfix_logging_max_bibs (db->bib.limit);
return 0;
}
/* create pool entry */
switch (ip_proto_to_snat_proto (proto))
{
/* *INDENT-OFF* */
#define _(N, i, n, s) \
case SNAT_PROTOCOL_##N: \
pool_get (db->bib._##n##_bib, bibe); \
kv.value = bibe - db->bib._##n##_bib; \
break;
foreach_snat_protocol
#undef _
/* *INDENT-ON* */
default:
pool_get (db->bib._unk_proto_bib, bibe);
kv.value = bibe - db->bib._unk_proto_bib;
break;
}
db->bib.bib_entries_num++;
clib_memset (bibe, 0, sizeof (*bibe));
bibe->in_addr.as_u64[0] = in_addr->as_u64[0];
bibe->in_addr.as_u64[1] = in_addr->as_u64[1];
bibe->in_port = in_port;
bibe->out_addr.as_u32 = out_addr->as_u32;
bibe->out_port = out_port;
bibe->fib_index = fib_index;
bibe->proto = proto;
bibe->is_static = is_static;
/* create hash lookup */
bibe_key.addr.as_u64[0] = bibe->in_addr.as_u64[0];
bibe_key.addr.as_u64[1] = bibe->in_addr.as_u64[1];
bibe_key.fib_index = bibe->fib_index;
bibe_key.port = bibe->in_port;
bibe_key.proto = bibe->proto;
bibe_key.rsvd = 0;
kv.key[0] = bibe_key.as_u64[0];
kv.key[1] = bibe_key.as_u64[1];
kv.key[2] = bibe_key.as_u64[2];
clib_bihash_add_del_24_8 (&db->bib.in2out, &kv, 1);
clib_memset (&bibe_key.addr, 0, sizeof (bibe_key.addr));
bibe_key.addr.ip4.as_u32 = bibe->out_addr.as_u32;
bibe_key.fib_index = 0;
bibe_key.port = bibe->out_port;
kv.key[0] = bibe_key.as_u64[0];
kv.key[1] = bibe_key.as_u64[1];
kv.key[2] = bibe_key.as_u64[2];
clib_bihash_add_del_24_8 (&db->bib.out2in, &kv, 1);
fib = fib_table_get (bibe->fib_index, FIB_PROTOCOL_IP6);
nat_ipfix_logging_nat64_bib (in_addr, out_addr, proto, in_port, out_port,
fib->ft_table_id, 1);
return bibe;
}
void
nat64_db_bib_entry_free (nat64_db_t * db, nat64_db_bib_entry_t * bibe)
{
nat64_db_bib_entry_key_t bibe_key;
clib_bihash_kv_24_8_t kv;
nat64_db_bib_entry_t *bib;
u32 *ste_to_be_free = 0, *ste_index, bibe_index;
nat64_db_st_entry_t *st, *ste;
fib_table_t *fib;
switch (ip_proto_to_snat_proto (bibe->proto))
{
/* *INDENT-OFF* */
#define _(N, i, n, s) \
case SNAT_PROTOCOL_##N: \
bib = db->bib._##n##_bib; \
st = db->st._##n##_st; \
break;
foreach_snat_protocol
#undef _
/* *INDENT-ON* */
default:
bib = db->bib._unk_proto_bib;
st = db->st._unk_proto_st;
break;
}
db->bib.bib_entries_num--;
bibe_index = bibe - bib;
/* delete ST entries for static BIB entry */
if (bibe->is_static)
{
pool_foreach (ste, st, ({
if (ste->bibe_index == bibe_index)
vec_add1 (ste_to_be_free, ste - st);
}));
vec_foreach (ste_index, ste_to_be_free)
nat64_db_st_entry_free (db, pool_elt_at_index (st, ste_index[0]));
vec_free (ste_to_be_free);
}
/* delete hash lookup */
bibe_key.addr.as_u64[0] = bibe->in_addr.as_u64[0];
bibe_key.addr.as_u64[1] = bibe->in_addr.as_u64[1];
bibe_key.fib_index = bibe->fib_index;
bibe_key.port = bibe->in_port;
bibe_key.proto = bibe->proto;
bibe_key.rsvd = 0;
kv.key[0] = bibe_key.as_u64[0];
kv.key[1] = bibe_key.as_u64[1];
kv.key[2] = bibe_key.as_u64[2];
clib_bihash_add_del_24_8 (&db->bib.in2out, &kv, 0);
clib_memset (&bibe_key.addr, 0, sizeof (bibe_key.addr));
bibe_key.addr.ip4.as_u32 = bibe->out_addr.as_u32;
bibe_key.fib_index = 0;
bibe_key.port = bibe->out_port;
kv.key[0] = bibe_key.as_u64[0];
kv.key[1] = bibe_key.as_u64[1];
kv.key[2] = bibe_key.as_u64[2];
clib_bihash_add_del_24_8 (&db->bib.out2in, &kv, 0);
if (!db->addr_free)
db->free_addr_port_cb (db, &bibe->out_addr, bibe->out_port, bibe->proto);
fib = fib_table_get (bibe->fib_index, FIB_PROTOCOL_IP6);
nat_ipfix_logging_nat64_bib (&bibe->in_addr, &bibe->out_addr, bibe->proto,
bibe->in_port, bibe->out_port,
fib->ft_table_id, 0);
/* delete from pool */
pool_put (bib, bibe);
}
nat64_db_bib_entry_t *
nat64_db_bib_entry_find (nat64_db_t * db, ip46_address_t * addr, u16 port,
u8 proto, u32 fib_index, u8 is_ip6)
{
nat64_db_bib_entry_t *bibe = 0;
nat64_db_bib_entry_key_t bibe_key;
clib_bihash_kv_24_8_t kv, value;
nat64_db_bib_entry_t *bib;
switch (ip_proto_to_snat_proto (proto))
{
/* *INDENT-OFF* */
#define _(N, i, n, s) \
case SNAT_PROTOCOL_##N: \
bib = db->bib._##n##_bib; \
break;
foreach_snat_protocol
#undef _
/* *INDENT-ON* */
default:
bib = db->bib._unk_proto_bib;
break;
}
bibe_key.addr.as_u64[0] = addr->as_u64[0];
bibe_key.addr.as_u64[1] = addr->as_u64[1];
bibe_key.fib_index = fib_index;
bibe_key.port = port;
bibe_key.proto = proto;
bibe_key.rsvd = 0;
kv.key[0] = bibe_key.as_u64[0];
kv.key[1] = bibe_key.as_u64[1];
kv.key[2] = bibe_key.as_u64[2];
if (!clib_bihash_search_24_8
(is_ip6 ? &db->bib.in2out : &db->bib.out2in, &kv, &value))
bibe = pool_elt_at_index (bib, value.value);
return bibe;
}
void
nat64_db_bib_walk (nat64_db_t * db, u8 proto,
nat64_db_bib_walk_fn_t fn, void *ctx)
{
nat64_db_bib_entry_t *bib, *bibe;
if (proto == 255)
{
/* *INDENT-OFF* */
#define _(N, i, n, s) \
bib = db->bib._##n##_bib; \
pool_foreach (bibe, bib, ({ \
if (fn (bibe, ctx)) \
return; \
}));
foreach_snat_protocol
#undef _
bib = db->bib._unk_proto_bib;
pool_foreach (bibe, bib, ({
if (fn (bibe, ctx))
return;
}));
/* *INDENT-ON* */
}
else
{
switch (ip_proto_to_snat_proto (proto))
{
/* *INDENT-OFF* */
#define _(N, i, n, s) \
case SNAT_PROTOCOL_##N: \
bib = db->bib._##n##_bib; \
break;
foreach_snat_protocol
#undef _
/* *INDENT-ON* */
default:
bib = db->bib._unk_proto_bib;
break;
}
/* *INDENT-OFF* */
pool_foreach (bibe, bib,
({
if (fn (bibe, ctx))
return;
}));
/* *INDENT-ON* */
}
}
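/* Usage sketch (illustrative; callback name hypothetical): pass proto = 255
 * to walk every protocol-specific pool plus the unknown-protocol pool:
 *
 *   nat64_db_bib_walk (db, 255, my_bib_cb, ctx);
 *
 * The walk stops as soon as the callback returns a non-zero value. */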
nat64_db_bib_entry_t *
nat64_db_bib_entry_by_index (nat64_db_t * db, u8 proto, u32 bibe_index)
{
nat64_db_bib_entry_t *bib;
switch (ip_proto_to_snat_proto (proto))
{
/* *INDENT-OFF* */
#define _(N, i, n, s) \
case SNAT_PROTOCOL_##N: \
bib = db->bib._##n##_bib; \
break;
foreach_snat_protocol
#undef _
/* *INDENT-ON* */
default:
bib = db->bib._unk_proto_bib;
break;
}
return pool_elt_at_index (bib, bibe_index);
}
void
nat64_db_st_walk (nat64_db_t * db, u8 proto,
nat64_db_st_walk_fn_t fn, void *ctx)
{
nat64_db_st_entry_t *st, *ste;
if (proto == 255)
{
/* *INDENT-OFF* */
#define _(N, i, n, s) \
st = db->st._##n##_st; \
pool_foreach (ste, st, ({ \
if (fn (ste, ctx)) \
return; \
}));
foreach_snat_protocol
#undef _
st = db->st._unk_proto_st;
pool_foreach (ste, st, ({
if (fn (ste, ctx))
return;
}));
/* *INDENT-ON* */
}
else
{
switch (ip_proto_to_snat_proto (proto))
{
/* *INDENT-OFF* */
#define _(N, i, n, s) \
case SNAT_PROTOCOL_##N: \
st = db->st._##n##_st; \
break;
foreach_snat_protocol
#undef _
/* *INDENT-ON* */
default:
st = db->st._unk_proto_st;
break;
}
/* *INDENT-OFF* */
pool_foreach (ste, st,
({
if (fn (ste, ctx))
return;
}));
/* *INDENT-ON* */
}
}
nat64_db_st_entry_t *
nat64_db_st_entry_create (nat64_db_t * db, nat64_db_bib_entry_t * bibe,
ip6_address_t * in_r_addr,
ip4_address_t * out_r_addr, u16 r_port)
{
nat64_db_st_entry_t *ste;
nat64_db_bib_entry_t *bib;
nat64_db_st_entry_key_t ste_key;
clib_bihash_kv_48_8_t kv;
fib_table_t *fib;
if (db->st.st_entries_num >= db->st.limit)
{
nat_ipfix_logging_max_sessions (db->st.limit);
return 0;
}
/* create pool entry */
switch (ip_proto_to_snat_proto (bibe->proto))
{
/* *INDENT-OFF* */
#define _(N, i, n, s) \
case SNAT_PROTOCOL_##N: \
pool_get (db->st._##n##_st, ste); \
kv.value = ste - db->st._##n##_st; \
bib = db->bib._##n##_bib; \
break;
foreach_snat_protocol
#undef _
/* *INDENT-ON* */
default:
pool_get (db->st._unk_proto_st, ste);
kv.value = ste - db->st._unk_proto_st;
bib = db->bib._unk_proto_bib;
break;
}
db->st.st_entries_num++;
clib_memset (ste, 0, sizeof (*ste));
ste->in_r_addr.as_u64[0] = in_r_addr->as_u64[0];
ste->in_r_addr.as_u64[1] = in_r_addr->as_u64[1];
ste->out_r_addr.as_u32 = out_r_addr->as_u32;
ste->r_port = r_port;
ste->bibe_index = bibe - bib;
ste->proto = bibe->proto;
/* increment session number for BIB entry */
bibe->ses_num++;
/* create hash lookup */
clib_memset (&ste_key, 0, sizeof (ste_key));
ste_key.l_addr.as_u64[0] = bibe->in_addr.as_u64[0];
ste_key.l_addr.as_u64[1] = bibe->in_addr.as_u64[1];
ste_key.r_addr.as_u64[0] = ste->in_r_addr.as_u64[0];
ste_key.r_addr.as_u64[1] = ste->in_r_addr.as_u64[1];
ste_key.fib_index = bibe->fib_index;
ste_key.l_port = bibe->in_port;
ste_key.r_port = ste->r_port;
ste_key.proto = ste->proto;
kv.key[0] = ste_key.as_u64[0];
kv.key[1] = ste_key.as_u64[1];
kv.key[2] = ste_key.as_u64[2];
kv.key[3] = ste_key.as_u64[3];
kv.key[4] = ste_key.as_u64[4];
kv.key[5] = ste_key.as_u64[5];
clib_bihash_add_del_48_8 (&db->st.in2out, &kv, 1);
clib_memset (&ste_key, 0, sizeof (ste_key));
ste_key.l_addr.ip4.as_u32 = bibe->out_addr.as_u32;
ste_key.r_addr.ip4.as_u32 = ste->out_r_addr.as_u32;
ste_key.l_port = bibe->out_port;
ste_key.r_port = ste->r_port;
ste_key.proto = ste->proto;
kv.key[0] = ste_key.as_u64[0];
kv.key[1] = ste_key.as_u64[1];
kv.key[2] = ste_key.as_u64[2];
kv.key[3] = ste_key.as_u64[3];
kv.key[4] = ste_key.as_u64[4];
kv.key[5] = ste_key.as_u64[5];
clib_bihash_add_del_48_8 (&db->st.out2in, &kv, 1);
fib = fib_table_get (bibe->fib_index, FIB_PROTOCOL_IP6);
nat_ipfix_logging_nat64_session (&bibe->in_addr, &bibe->out_addr,
bibe->proto, bibe->in_port, bibe->out_port,
&ste->in_r_addr, &ste->out_r_addr,
ste->r_port, ste->r_port, fib->ft_table_id,
1);
nat_syslog_nat64_sadd (bibe->fib_index, &bibe->in_addr, bibe->in_port,
&bibe->out_addr, bibe->out_port, &ste->out_r_addr,
ste->r_port, bibe->proto);
return ste;
}
void
nat64_db_st_entry_free (nat64_db_t * db, nat64_db_st_entry_t * ste)
{
nat64_db_st_entry_t *st;
nat64_db_bib_entry_t *bib, *bibe;
nat64_db_st_entry_key_t ste_key;
clib_bihash_kv_48_8_t kv;
fib_table_t *fib;
switch (ip_proto_to_snat_proto (ste->proto))
{
/* *INDENT-OFF* */
#define _(N, i, n, s) \
case SNAT_PROTOCOL_##N: \
st = db->st._##n##_st; \
bib = db->bib._##n##_bib; \
break;
foreach_snat_protocol
#undef _
/* *INDENT-ON* */
default:
st = db->st._unk_proto_st;
bib = db->bib._unk_proto_bib;
break;
}
bibe = pool_elt_at_index (bib, ste->bibe_index);
db->st.st_entries_num--;
/* delete hash lookup */
clib_memset (&ste_key, 0, sizeof (ste_key));
ste_key.l_addr.as_u64[0] = bibe->in_addr.as_u64[0];
ste_key.l_addr.as_u64[1] = bibe->in_addr.as_u64[1];
ste_key.r_addr.as_u64[0] = ste->in_r_addr.as_u64[0];
ste_key.r_addr.as_u64[1] = ste->in_r_addr.as_u64[1];
ste_key.fib_index = bibe->fib_index;
ste_key.l_port = bibe->in_port;
ste_key.r_port = ste->r_port;
ste_key.proto = ste->proto;
kv.key[0] = ste_key.as_u64[0];
kv.key[1] = ste_key.as_u64[1];
kv.key[2] = ste_key.as_u64[2];
kv.key[3] = ste_key.as_u64[3];
kv.key[4] = ste_key.as_u64[4];
kv.key[5] = ste_key.as_u64[5];
clib_bihash_add_del_48_8 (&db->st.in2out, &kv, 0);
clib_memset (&ste_key, 0, sizeof (ste_key));
ste_key.l_addr.ip4.as_u32 = bibe->out_addr.as_u32;
ste_key.r_addr.ip4.as_u32 = ste->out_r_addr.as_u32;
ste_key.l_port = bibe->out_port;
ste_key.r_port = ste->r_port;
ste_key.proto = ste->proto;
kv.key[0] = ste_key.as_u64[0];
kv.key[1] = ste_key.as_u64[1];
kv.key[2] = ste_key.as_u64[2];
kv.key[3] = ste_key.as_u64[3];
kv.key[4] = ste_key.as_u64[4];
kv.key[5] = ste_key.as_u64[5];
clib_bihash_add_del_48_8 (&db->st.out2in, &kv, 0);
fib = fib_table_get (bibe->fib_index, FIB_PROTOCOL_IP6);
nat_ipfix_logging_nat64_session (&bibe->in_addr, &bibe->out_addr,
bibe->proto, bibe->in_port, bibe->out_port,
&ste->in_r_addr, &ste->out_r_addr,
ste->r_port, ste->r_port, fib->ft_table_id,
0);
nat_syslog_nat64_sdel (bibe->fib_index, &bibe->in_addr, bibe->in_port,
&bibe->out_addr, bibe->out_port, &ste->out_r_addr,
ste->r_port, bibe->proto);
/* delete from pool */
pool_put (st, ste);
/* decrement session number for BIB entry */
bibe->ses_num--;
/* delete BIB entry if last session and dynamic */
if (!bibe->is_static && !bibe->ses_num)
nat64_db_bib_entry_free (db, bibe);
}
nat64_db_st_entry_t *
nat64_db_st_entry_find (nat64_db_t * db, ip46_address_t * l_addr,
ip46_address_t * r_addr, u16 l_port, u16 r_port,
u8 proto, u32 fib_index, u8 is_ip6)
{
nat64_db_st_entry_t *ste = 0;
nat64_db_st_entry_t *st;
nat64_db_st_entry_key_t ste_key;
clib_bihash_kv_48_8_t kv, value;
switch (ip_proto_to_snat_proto (proto))
{
/* *INDENT-OFF* */
#define _(N, i, n, s) \
case SNAT_PROTOCOL_##N: \
st = db->st._##n##_st; \
break;
foreach_snat_protocol
#undef _
/* *INDENT-ON* */
default:
st = db->st._unk_proto_st;
break;
}
clib_memset (&ste_key, 0, sizeof (ste_key));
ste_key.l_addr.as_u64[0] = l_addr->as_u64[0];
ste_key.l_addr.as_u64[1] = l_addr->as_u64[1];
ste_key.r_addr.as_u64[0] = r_addr->as_u64[0];
ste_key.r_addr.as_u64[1] = r_addr->as_u64[1];
ste_key.fib_index = fib_index;
ste_key.l_port = l_port;
ste_key.r_port = r_port;
ste_key.proto = proto;
kv.key[0] = ste_key.as_u64[0];
kv.key[1] = ste_key.as_u64[1];
kv.key[2] = ste_key.as_u64[2];
kv.key[3] = ste_key.as_u64[3];
kv.key[4] = ste_key.as_u64[4];
kv.key[5] = ste_key.as_u64[5];
if (!clib_bihash_search_48_8
(is_ip6 ? &db->st.in2out : &db->st.out2in, &kv, &value))
ste = pool_elt_at_index (st, value.value);
return ste;
}
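/* Lookup sketch (illustrative): is_ip6 selects the table and the address
 * encoding, e.g. an in2out (IPv6-keyed) lookup for a TCP session:
 *
 *   ste = nat64_db_st_entry_find (db, &l_addr, &r_addr, l_port, r_port,
 *                                 IP_PROTOCOL_TCP, fib_index, 1);
 *
 * With is_ip6 = 0 the out2in table is searched instead, with the IPv4
 * addresses carried in the ip46_address_t keys; note the out2in keys are
 * inserted with fib_index 0 above, so the lookup must match that. */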
u32
nat64_db_st_entry_get_index (nat64_db_t * db, nat64_db_st_entry_t * ste)
{
nat64_db_st_entry_t *st;
switch (ip_proto_to_snat_proto (ste->proto))
{
/* *INDENT-OFF* */
#define _(N, i, n, s) \
case SNAT_PROTOCOL_##N: \
st = db->st._##n##_st; \
break;
foreach_snat_protocol
#undef _
/* *INDENT-ON* */
default:
st = db->st._unk_proto_st;
return (u32) ~ 0;
}
return ste - st;
}
nat64_db_st_entry_t *
nat64_db_st_entry_by_index (nat64_db_t * db, u8 proto, u32 ste_index)
{
nat64_db_st_entry_t *st;
switch (ip_proto_to_snat_proto (proto))
{
/* *INDENT-OFF* */
#define _(N, i, n, s) \
case SNAT_PROTOCOL_##N: \
st = db->st._##n##_st; \
break;
foreach_snat_protocol
#undef _
/* *INDENT-ON* */
default:
st = db->st._unk_proto_st;
break;
}
return pool_elt_at_index (st, ste_index);
}
void
nad64_db_st_free_expired (nat64_db_t * db, u32 now)
{
u32 *ste_to_be_free = 0, *ste_index;
nat64_db_st_entry_t *st, *ste;
/* *INDENT-OFF* */
#define _(N, i, n, s) \
st = db->st._##n##_st; \
pool_foreach (ste, st, ({\
if (i == SNAT_PROTOCOL_TCP && !ste->tcp_state) \
continue; \
if (ste->expire < now) \
vec_add1 (ste_to_be_free, ste - st); \
})); \
vec_foreach (ste_index, ste_to_be_free) \
nat64_db_st_entry_free (db, pool_elt_at_index(st, ste_index[0])); \
vec_free (ste_to_be_free); \
ste_to_be_free = 0;
foreach_snat_protocol
#undef _
st = db->st._unk_proto_st;
pool_foreach (ste, st, ({
if (ste->expire < now)
vec_add1 (ste_to_be_free, ste - st);
}));
vec_foreach (ste_index, ste_to_be_free)
nat64_db_st_entry_free (db, pool_elt_at_index(st, ste_index[0]));
vec_free (ste_to_be_free);
/* *INDENT-ON* */
}
void
nat64_db_free_out_addr (nat64_db_t * db, ip4_address_t * out_addr)
{
u32 *ste_to_be_free = 0, *ste_index;
nat64_db_st_entry_t *st, *ste;
nat64_db_bib_entry_t *bibe;
db->addr_free = 1;
/* *INDENT-OFF* */
#define _(N, i, n, s) \
st = db->st._##n##_st; \
pool_foreach (ste, st, ({ \
bibe = pool_elt_at_index (db->bib._##n##_bib, ste->bibe_index); \
if (bibe->out_addr.as_u32 == out_addr->as_u32) \
vec_add1 (ste_to_be_free, ste - st); \
})); \
vec_foreach (ste_index, ste_to_be_free) \
nat64_db_st_entry_free (db, pool_elt_at_index(st, ste_index[0])); \
vec_free (ste_to_be_free); \
ste_to_be_free = 0;
foreach_snat_protocol
#undef _
st = db->st._unk_proto_st;
pool_foreach (ste, st, ({
bibe = pool_elt_at_index (db->bib._unk_proto_bib, ste->bibe_index);
if (bibe->out_addr.as_u32 == out_addr->as_u32)
vec_add1 (ste_to_be_free, ste - st);
}));
vec_foreach (ste_index, ste_to_be_free)
nat64_db_st_entry_free (db, pool_elt_at_index(st, ste_index[0]));
vec_free (ste_to_be_free);
db->addr_free = 0;
/* *INDENT-ON* */
}
/*
* fd.io coding-style-patch-verification: ON
*
* Local Variables:
* eval: (c-set-style "gnu")
* End:
*/
/*
* Copyright (c) 2016-2019 Cisco and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <vppinfra/sparse_vec.h>
#include <vnet/fib/ip4_fib.h>
#include <vnet/fib/ip6_fib.h>
#include <vnet/tcp/tcp_packet.h>
#include <vnet/tcp/tcp.h>
#include <vnet/session/session.h>
#include <math.h>
static char *tcp_error_strings[] = {
#define tcp_error(n,s) s,
#include <vnet/tcp/tcp_error.def>
#undef tcp_error
};
/* All TCP nodes have the same outgoing arcs */
#define foreach_tcp_state_next \
_ (DROP4, "ip4-drop") \
_ (DROP6, "ip6-drop") \
_ (TCP4_OUTPUT, "tcp4-output") \
_ (TCP6_OUTPUT, "tcp6-output")
typedef enum _tcp_established_next
{
#define _(s,n) TCP_ESTABLISHED_NEXT_##s,
foreach_tcp_state_next
#undef _
TCP_ESTABLISHED_N_NEXT,
} tcp_established_next_t;
typedef enum _tcp_rcv_process_next
{
#define _(s,n) TCP_RCV_PROCESS_NEXT_##s,
foreach_tcp_state_next
#undef _
TCP_RCV_PROCESS_N_NEXT,
} tcp_rcv_process_next_t;
typedef enum _tcp_syn_sent_next
{
#define _(s,n) TCP_SYN_SENT_NEXT_##s,
foreach_tcp_state_next
#undef _
TCP_SYN_SENT_N_NEXT,
} tcp_syn_sent_next_t;
typedef enum _tcp_listen_next
{
#define _(s,n) TCP_LISTEN_NEXT_##s,
foreach_tcp_state_next
#undef _
TCP_LISTEN_N_NEXT,
} tcp_listen_next_t;
/* Generic, state independent indices */
typedef enum _tcp_state_next
{
#define _(s,n) TCP_NEXT_##s,
foreach_tcp_state_next
#undef _
TCP_STATE_N_NEXT,
} tcp_state_next_t;
#define tcp_next_output(is_ip4) (is_ip4 ? TCP_NEXT_TCP4_OUTPUT \
: TCP_NEXT_TCP6_OUTPUT)
#define tcp_next_drop(is_ip4) (is_ip4 ? TCP_NEXT_DROP4 \
: TCP_NEXT_DROP6)
/**
* Validate segment sequence number. As per RFC793:
*
* Segment Receive Test
* Length Window
* ------- ------- -------------------------------------------
* 0 0 SEG.SEQ = RCV.NXT
* 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
* >0 0 not acceptable
* >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
* or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
*
* This ultimately consists of checking whether the segment falls within the
* window. The one important difference compared to RFC793 is that we use
* rcv_las, i.e., the rcv_nxt at the last ack sent, instead of rcv_nxt, since
* that's the peer's reference when computing our receive window.
*
* This:
* seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) && seq_geq (seq, tc->rcv_las)
* however, is too strict when we have retransmits. Instead we just check that
* the seq is not beyond the right edge and that the end of the segment is not
* less than the left edge.
*
* N.B. rcv_nxt and rcv_wnd are both updated in this node if acks are sent, so
* use rcv_nxt in the right edge window test instead of rcv_las.
*
*/
always_inline u8
tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq)
{
return (seq_geq (end_seq, tc->rcv_las)
&& seq_leq (seq, tc->rcv_nxt + tc->rcv_wnd));
}
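/* Worked example (illustrative): with rcv_las = 1000, rcv_nxt = 1500 and
 * rcv_wnd = 1000, a retransmitted segment [900, 1100) fails the strict
 * RFC793 test (900 < rcv_las) but is accepted here: its end, 1100, is not
 * below the left edge and its start, 900, does not exceed the right edge
 * rcv_nxt + rcv_wnd = 2500. */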
/**
* Parse TCP header options.
*
* @param th TCP header
* @param to TCP options data structure to be populated
* @param is_syn set if packet is syn
* @return -1 if parsing failed
*/
static inline int
tcp_options_parse (tcp_header_t * th, tcp_options_t * to, u8 is_syn)
{
const u8 *data;
u8 opt_len, opts_len, kind;
int j;
sack_block_t b;
opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t);
data = (const u8 *) (th + 1);
/* Zero out all flags but those set in SYN */
to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE
| TCP_OPTS_FLAG_TSTAMP | TCP_OPTS_FLAG_MSS);
for (; opts_len > 0; opts_len -= opt_len, data += opt_len)
{
kind = data[0];
/* Get options length */
if (kind == TCP_OPTION_EOL)
break;
else if (kind == TCP_OPTION_NOOP)
{
opt_len = 1;
continue;
}
else
{
/* broken options */
if (opts_len < 2)
return -1;
opt_len = data[1];
/* weird option length */
if (opt_len < 2 || opt_len > opts_len)
return -1;
}
/* Parse options */
switch (kind)
{
case TCP_OPTION_MSS:
if (!is_syn)
break;
if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th))
{
to->flags |= TCP_OPTS_FLAG_MSS;
to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2));
}
break;
case TCP_OPTION_WINDOW_SCALE:
if (!is_syn)
break;
if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th))
{
to->flags |= TCP_OPTS_FLAG_WSCALE;
to->wscale = data[2];
if (to->wscale > TCP_MAX_WND_SCALE)
to->wscale = TCP_MAX_WND_SCALE;
}
break;
case TCP_OPTION_TIMESTAMP:
if (is_syn)
to->flags |= TCP_OPTS_FLAG_TSTAMP;
if ((to->flags & TCP_OPTS_FLAG_TSTAMP)
&& opt_len == TCP_OPTION_LEN_TIMESTAMP)
{
to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2));
to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6));
}
break;
case TCP_OPTION_SACK_PERMITTED:
if (!is_syn)
break;
if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th))
to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
break;
case TCP_OPTION_SACK_BLOCK:
/* If SACK permitted was not advertised or a SYN, break */
if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th))
break;
/* If too short or not correctly formatted, break */
if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK))
break;
to->flags |= TCP_OPTS_FLAG_SACK;
to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK;
vec_reset_length (to->sacks);
for (j = 0; j < to->n_sack_blocks; j++)
{
b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 8 * j));
b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 8 * j));
vec_add1 (to->sacks, b);
}
break;
default:
/* Nothing to see here */
continue;
}
}
return 0;
}
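/* Wire-format sketch (illustrative) of options the parser above accepts in
 * a SYN, shown as kind/length/value byte sequences:
 *
 *   02 04 05 b4    MSS, len 4, mss = 0x05b4 = 1460
 *   01             NOOP, len 1 (implicit)
 *   03 03 07       window scale, len 3, wscale = 7
 *   04 02          SACK permitted, len 2
 *
 * Parsing this would set TCP_OPTS_FLAG_MSS, TCP_OPTS_FLAG_WSCALE and
 * TCP_OPTS_FLAG_SACK_PERMITTED in to->flags. */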
/**
* RFC1323: Check against wrapped sequence numbers (PAWS). If we have a
* timestamp to echo and it's less than tsval_recent, drop the segment
* but still send an ACK in order to retain TCP's mechanism for detecting
* and recovering from half-open connections
*
* Or at least that's what the theory says. It seems that this might not work
* very well with packet reordering and fast retransmit. XXX
*/
always_inline int
tcp_segment_check_paws (tcp_connection_t * tc)
{
return tcp_opts_tstamp (&tc->rcv_opts)
&& timestamp_lt (tc->rcv_opts.tsval, tc->tsval_recent);
}
/**
* Update tsval recent
*/
always_inline void
tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end)
{
/*
* RFC1323: If Last.ACK.sent falls within the range of sequence numbers
* of an incoming segment:
* SEG.SEQ <= Last.ACK.sent < SEG.SEQ + SEG.LEN
* then the TSval from the segment is copied to TS.Recent;
* otherwise, the TSval is ignored.
*/
if (tcp_opts_tstamp (&tc->rcv_opts) && seq_leq (seq, tc->rcv_las)
&& seq_leq (tc->rcv_las, seq_end))
{
ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval));
tc->tsval_recent = tc->rcv_opts.tsval;
tc->tsval_recent_age = tcp_time_now_w_thread (tc->c_thread_index);
}
}
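/* Example (illustrative): with rcv_las = 5000, a segment covering
 * [4800, 5200) straddles Last.ACK.sent (4800 <= 5000 <= 5200), so its
 * TSval is copied into tsval_recent; a segment covering [5200, 5400)
 * leaves tsval_recent untouched. */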
/**
* Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19
*
* It first verifies if the segment has a wrapped sequence number (PAWS) and
* then does the processing associated with the first four steps (ignoring
* security and precedence): sequence number, rst bit and syn bit checks.
*
* @return 0 if the segment passes validation.
*/
static int
tcp_segment_validate (tcp_worker_ctx_t * wrk, tcp_connection_t * tc0,
vlib_buffer_t * b0, tcp_header_t * th0, u32 * error0)
{
/* We could get a burst of RSTs interleaved with acks */
if (PREDICT_FALSE (tc0->state == TCP_STATE_CLOSED))
{
tcp_send_reset (tc0);
*error0 = TCP_ERROR_CONNECTION_CLOSED;
goto error;
}
if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0)))
{
*error0 = TCP_ERROR_SEGMENT_INVALID;
goto error;
}
if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts, 0)))
{
*error0 = TCP_ERROR_OPTIONS;
goto error;
}
if (PREDICT_FALSE (tcp_segment_check_paws (tc0)))
{
*error0 = TCP_ERROR_PAWS;
TCP_EVT (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number,
vnet_buffer (b0)->tcp.seq_end);
/* If tsval_recent was last updated more than 24 days ago (TCP_PAWS_IDLE),
* the stored value is stale, so take the segment's tsval instead of
* failing validation. */
if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE,
tcp_time_now_w_thread (tc0->c_thread_index)))
{
tc0->tsval_recent = tc0->rcv_opts.tsval;
clib_warning ("paws failed: 24-day old segment");
}
/* Drop after ack if not rst. Resets can fail paws check as per
* RFC 7323 sec. 5.2: When an <RST> segment is received, it MUST NOT
* be subjected to the PAWS check by verifying an acceptable value in
* SEG.TSval */
else if (!tcp_rst (th0))
{
tcp_program_ack (tc0);
TCP_EVT (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
goto error;
}
}
/* 1st: check sequence number */
if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number,
vnet_buffer (b0)->tcp.seq_end))
{
/* SYN/SYN-ACK retransmit */
if (tcp_syn (th0)
&& vnet_buffer (b0)->tcp.seq_number == tc0->rcv_nxt - 1)
{
tcp_options_parse (th0, &tc0->rcv_opts, 1);
if (tc0->state == TCP_STATE_SYN_RCVD)
{
tcp_send_synack (tc0);
TCP_EVT (TCP_EVT_SYN_RCVD, tc0, 0);
*error0 = TCP_ERROR_SYNS_RCVD;
}
else
{
tcp_program_ack (tc0);
TCP_EVT (TCP_EVT_SYNACK_RCVD, tc0);
*error0 = TCP_ERROR_SYN_ACKS_RCVD;
}
goto error;
}
/* If our window is 0 and the packet is in sequence, let it pass
* through for ack processing. It should be dropped later. */
if (tc0->rcv_wnd < tc0->snd_mss
&& tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
goto check_reset;
/* If we entered recovery and peer did so as well, there's a chance that
* dup acks won't be acceptable on either end because seq_end may be less
* than rcv_las. This can happen if acks are lost in both directions. */
if (tcp_in_recovery (tc0)
&& seq_geq (vnet_buffer (b0)->tcp.seq_number,
tc0->rcv_las - tc0->rcv_wnd)
&& seq_leq (vnet_buffer (b0)->tcp.seq_end,
tc0->rcv_nxt + tc0->rcv_wnd))
goto check_reset;
*error0 = TCP_ERROR_RCV_WND;
/* If we advertised a zero rcv_wnd and the segment is in the past or the
* next one that we expect, it is probably a window probe */
if ((tc0->flags & TCP_CONN_ZERO_RWND_SENT)
&& seq_lt (vnet_buffer (b0)->tcp.seq_end,
tc0->rcv_las + tc0->rcv_opts.mss))
*error0 = TCP_ERROR_ZERO_RWND;
tc0->errors.below_data_wnd += seq_lt (vnet_buffer (b0)->tcp.seq_end,
tc0->rcv_las);
/* If not RST, send dup ack */
if (!tcp_rst (th0))
{
tcp_program_dupack (tc0);
TCP_EVT (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
}
goto error;
check_reset:
;
}
/* 2nd: check the RST bit */
if (PREDICT_FALSE (tcp_rst (th0)))
{
tcp_connection_reset (tc0);
*error0 = TCP_ERROR_RST_RCVD;
goto error;
}
/* 3rd: check security and precedence (skip) */
/* 4th: check the SYN bit (in window) */
if (PREDICT_FALSE (tcp_syn (th0)))
{
/* As per RFC5961 send challenge ack instead of reset */
tcp_program_ack (tc0);
*error0 = TCP_ERROR_SPURIOUS_SYN;
goto error;
}
/* If segment in window, save timestamp */
tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number,
vnet_buffer (b0)->tcp.seq_end);
return 0;
error:
return -1;
}
always_inline int
tcp_rcv_ack_no_cc (tcp_connection_t * tc, vlib_buffer_t * b, u32 * error)
{
/* SND.UNA =< SEG.ACK =< SND.NXT */
if (!(seq_leq (tc->snd_una, vnet_buffer (b)->tcp.ack_number)
&& seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
{
if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max)
&& seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
{
tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
goto acceptable;
}
*error = TCP_ERROR_ACK_INVALID;
return -1;
}
acceptable:
tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
tc->snd_una = vnet_buffer (b)->tcp.ack_number;
*error = TCP_ERROR_ACK_OK;
return 0;
}
/**
* Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298
*
* Note that although in the original article srtt and rttvar are scaled
* to minimize round-off errors, here we don't. Instead, we rely on
* better precision time measurements.
*
* TODO support us rtt resolution
*/
static void
tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt)
{
int err, diff;
if (tc->srtt != 0)
{
err = mrtt - tc->srtt;
/* XXX A drop in RTT results in an RTTVAR increase and a bigger RTO.
* The increase should be bounded */
tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1);
diff = (clib_abs (err) - (int) tc->rttvar) >> 2;
tc->rttvar = clib_max ((int) tc->rttvar + diff, 1);
}
else
{
/* First measurement. */
tc->srtt = mrtt;
tc->rttvar = mrtt >> 1;
}
}
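/* Numeric sketch (illustrative) of the update above with RFC6298's
 * alpha = 1/8 and beta = 1/4: given srtt = 100, rttvar = 20 and a new
 * measurement mrtt = 140,
 *
 *   err     = 140 - 100 = 40
 *   srtt   -> 100 + (40 >> 3)        = 105
 *   rttvar -> 20 + ((40 - 20) >> 2)  = 25
 */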
#ifndef CLIB_MARCH_VARIANT
void
tcp_update_rto (tcp_connection_t * tc)
{
tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
tc->rto = clib_max (tc->rto, TCP_RTO_MIN);
}
#endif /* CLIB_MARCH_VARIANT */
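/* Continuing the sketch above, the RTO follows RFC6298's
 * RTO = SRTT + 4 * RTTVAR: with srtt = 105 and rttvar = 25 this gives
 * 105 + (25 << 2) = 205 ticks, then clamped to [TCP_RTO_MIN, TCP_RTO_MAX]. */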
/**
* Update RTT estimate and RTO timer
*
* Measure RTT: We have two sources of RTT measurements: TSOPT and ACK
* timing. Middle boxes are known to fiddle with TCP options so we
* should give higher priority to ACK timing.
*
* This should be called only if previously sent bytes have been acked.
*
* @return 1 if rtt is valid, 0 otherwise
*/
static int
tcp_update_rtt (tcp_connection_t * tc, tcp_rate_sample_t * rs, u32 ack)
{
u32 mrtt = 0;
/* Karn's rule, part 1. Don't use retransmitted segments to estimate
* RTT because they're ambiguous. */
if (tcp_in_cong_recovery (tc))
{
/* Accept rtt estimates for samples that have not been retransmitted */
if ((tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
&& !(rs->flags & TCP_BTS_IS_RXT))
{
mrtt = rs->rtt_time * THZ;
goto estimate_rtt;
}
goto done;
}
if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq))
{
f64 sample = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts;
tc->mrtt_us = tc->mrtt_us + (sample - tc->mrtt_us) * 0.125;
mrtt = clib_max ((u32) (sample * THZ), 1);
/* Allow measuring of a new RTT */
tc->rtt_ts = 0;
}
/* As per RFC7323 TSecr can be used for RTTM only if the segment advances
* snd_una, i.e., the left side of the send window:
* seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */
else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr)
{
u32 now = tcp_tstamp (tc);
mrtt = clib_max (now - tc->rcv_opts.tsecr, 1);
}
estimate_rtt:
/* Ignore dubious measurements */
if (mrtt == 0 || mrtt > TCP_RTT_MAX)
goto done;
tcp_estimate_rtt (tc, mrtt);
done:
/* If we got here something must've been ACKed so make sure boff is 0,
* even if mrtt is not valid, since we update the rto below */
tc->rto_boff = 0;
tcp_update_rto (tc);
return 0;
}
static void
tcp_estimate_initial_rtt (tcp_connection_t * tc)
{
u8 thread_index = vlib_num_workers () ? 1 : 0;
int mrtt;
if (tc->rtt_ts)
{
tc->mrtt_us = tcp_time_now_us (thread_index) - tc->rtt_ts;
tc->mrtt_us = clib_max (tc->mrtt_us, 0.0001);
mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1);
tc->rtt_ts = 0;
}
else
{
mrtt = tcp_time_now_w_thread (thread_index) - tc->rcv_opts.tsecr;
mrtt = clib_max (mrtt, 1);
/* Due to retransmits we don't know the initial mrtt */
if (tc->rto_boff && mrtt > 1 * THZ)
mrtt = 1 * THZ;
tc->mrtt_us = (f64) mrtt * TCP_TICK;
}
if (mrtt > 0 && mrtt < TCP_RTT_MAX)
tcp_estimate_rtt (tc, mrtt);
tcp_update_rto (tc);
}
always_inline u8
tcp_recovery_no_snd_space (tcp_connection_t * tc)
{
u32 space;
ASSERT (tcp_in_cong_recovery (tc));
if (tcp_in_recovery (tc))
space = tcp_available_output_snd_space (tc);
else
space = tcp_fastrecovery_prr_snd_space (tc);
return (space < tc->snd_mss + tc->burst_acked);
}
/**
* Dequeue bytes for connections that have received acks in the last burst
*/
static void
tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk)
{
u32 thread_index = wrk->vm->thread_index;
u32 *pending_deq_acked;
tcp_connection_t *tc;
int i;
if (!vec_len (wrk->pending_deq_acked))
return;
pending_deq_acked = wrk->pending_deq_acked;
for (i = 0; i < vec_len (pending_deq_acked); i++)
{
tc = tcp_connection_get (pending_deq_acked[i], thread_index);
tc->flags &= ~TCP_CONN_DEQ_PENDING;
if (tc->burst_acked)
{
/* Dequeue the newly ACKed bytes */
session_tx_fifo_dequeue_drop (&tc->connection, tc->burst_acked);
tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
if (PREDICT_FALSE (tc->flags & TCP_CONN_PSH_PENDING))
{
if (seq_leq (tc->psh_seq, tc->snd_una))
tc->flags &= ~TCP_CONN_PSH_PENDING;
}
/* If everything has been acked, stop retransmit timer
* otherwise update. */
tcp_retransmit_timer_update (tc);
/* Update pacer based on our new cwnd estimate */
tcp_connection_tx_pacer_update (tc);
}
/* Reset the pacer if we've been idle, i.e., no data sent or if
* we're in recovery and snd space constrained */
if (tc->data_segs_out == tc->prev_dsegs_out
|| (tcp_in_cong_recovery (tc) && tcp_recovery_no_snd_space (tc)))
transport_connection_tx_pacer_reset_bucket (&tc->connection);
tc->prev_dsegs_out = tc->data_segs_out;
tc->burst_acked = 0;
}
_vec_len (wrk->pending_deq_acked) = 0;
}
static void
tcp_program_dequeue (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
{
if (!(tc->flags & TCP_CONN_DEQ_PENDING))
{
vec_add1 (wrk->pending_deq_acked, tc->c_c_index);
tc->flags |= TCP_CONN_DEQ_PENDING;
}
tc->burst_acked += tc->bytes_acked;
}
#ifndef CLIB_MARCH_VARIANT
static u32
scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
{
ASSERT (!pool_is_free_index (sb->holes, hole - sb->holes));
return hole - sb->holes;
}
static u32
scoreboard_hole_bytes (sack_scoreboard_hole_t * hole)
{
return hole->end - hole->start;
}
sack_scoreboard_hole_t *
scoreboard_get_hole (sack_scoreboard_t * sb, u32 index)
{
if (index != TCP_INVALID_SACK_HOLE_INDEX)
return pool_elt_at_index (sb->holes, index);
return 0;
}
sack_scoreboard_hole_t *
scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
{
if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
return pool_elt_at_index (sb->holes, hole->next);
return 0;
}
sack_scoreboard_hole_t *
scoreboard_prev_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
{
if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
return pool_elt_at_index (sb->holes, hole->prev);
return 0;
}
sack_scoreboard_hole_t *
scoreboard_first_hole (sack_scoreboard_t * sb)
{
if (sb->head != TCP_INVALID_SACK_HOLE_INDEX)
return pool_elt_at_index (sb->holes, sb->head);
return 0;
}
sack_scoreboard_hole_t *
scoreboard_last_hole (sack_scoreboard_t * sb)
{
if (sb->tail != TCP_INVALID_SACK_HOLE_INDEX)
return pool_elt_at_index (sb->holes, sb->tail);
return 0;
}
static void
scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
{
sack_scoreboard_hole_t *next, *prev;
if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
{
next = pool_elt_at_index (sb->holes, hole->next);
next->prev = hole->prev;
}
else
{
sb->tail = hole->prev;
}
if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
{
prev = pool_elt_at_index (sb->holes, hole->prev);
prev->next = hole->next;
}
else
{
sb->head = hole->next;
}
if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole)
sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
/* Poison the entry */
if (CLIB_DEBUG > 0)
clib_memset (hole, 0xfe, sizeof (*hole));
pool_put (sb->holes, hole);
}
static sack_scoreboard_hole_t *
scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index,
u32 start, u32 end)
{
sack_scoreboard_hole_t *hole, *next, *prev;
u32 hole_index;
pool_get (sb->holes, hole);
clib_memset (hole, 0, sizeof (*hole));
hole->start = start;
hole->end = end;
hole_index = scoreboard_hole_index (sb, hole);
prev = scoreboard_get_hole (sb, prev_index);
if (prev)
{
hole->prev = prev_index;
hole->next = prev->next;
if ((next = scoreboard_next_hole (sb, hole)))
next->prev = hole_index;
else
sb->tail = hole_index;
prev->next = hole_index;
}
else
{
sb->head = hole_index;
hole->prev = TCP_INVALID_SACK_HOLE_INDEX;
hole->next = TCP_INVALID_SACK_HOLE_INDEX;
}
return hole;
}
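/* Scoreboard sketch (illustrative): with snd_una = 100, snd_nxt = 1000 and
 * SACKed ranges [300, 400) and [600, 700), the doubly-linked hole list is
 *
 *   head -> [100, 300) <-> [400, 600) <-> [700, 1000) <- tail
 *
 * scoreboard_insert_hole() links a new hole after prev_index, or at the
 * head when prev_index is TCP_INVALID_SACK_HOLE_INDEX. */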
always_inline void
scoreboard_update_sacked_rxt (sack_scoreboard_t * sb, u32 start, u32 end,
u8 has_rxt)
{
if (!has_rxt || seq_geq (start, sb->high_rxt))
return;
sb->rxt_sacked +=
seq_lt (end, sb->high_rxt) ? (end - start) : (sb->high_rxt - start);
}
always_inline void
scoreboard_update_bytes (sack_scoreboard_t * sb, u32 ack, u32 snd_mss)
{
sack_scoreboard_hole_t *left, *right;
u32 sacked = 0, blks = 0, old_sacked;
old_sacked = sb->sacked_bytes;
sb->last_lost_bytes = 0;
sb->lost_bytes = 0;
sb->sacked_bytes = 0;
right = scoreboard_last_hole (sb);
if (!right)
{
sb->sacked_bytes = sb->high_sacked - ack;
return;
}
if (seq_gt (sb->high_sacked, right->end))
{
sacked = sb->high_sacked - right->end;
blks = 1;
}
while (sacked < (TCP_DUPACK_THRESHOLD - 1) * snd_mss
&& blks < TCP_DUPACK_THRESHOLD)
{
if (right->is_lost)
sb->lost_bytes += scoreboard_hole_bytes (right);
left = scoreboard_prev_hole (sb, right);
if (!left)
{
ASSERT (right->start == ack || sb->is_reneging);
sacked += right->start - ack;
right = 0;
break;
}
sacked += right->start - left->end;
blks++;
right = left;
}
/* right is first lost */
while (right)
{
sb->lost_bytes += scoreboard_hole_bytes (right);
sb->last_lost_bytes += right->is_lost ? 0 : (right->end - right->start);
right->is_lost = 1;
left = scoreboard_prev_hole (sb, right);
if (!left)
{
ASSERT (right->start == ack || sb->is_reneging);
sacked += right->start - ack;
break;
}
sacked += right->start - left->end;
right = left;
}
sb->sacked_bytes = sacked;
sb->last_sacked_bytes = sacked - (old_sacked - sb->last_bytes_delivered);
}
/**
* Figure out the next hole to retransmit
*
* Follows logic proposed in RFC6675 Sec. 4, NextSeg()
*/
sack_scoreboard_hole_t *
scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
sack_scoreboard_hole_t * start,
u8 have_unsent, u8 * can_rescue, u8 * snd_limited)
{
sack_scoreboard_hole_t *hole = 0;
hole = start ? start : scoreboard_first_hole (sb);
while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost)
hole = scoreboard_next_hole (sb, hole);
/* Nothing, return */
if (!hole)
{
sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
return 0;
}
/* Rule (1): if higher than rxt, less than high_sacked and lost */
if (hole->is_lost && seq_lt (hole->start, sb->high_sacked))
{
sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
}
else
{
/* Rule (2): available unsent data */
if (have_unsent)
{
sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
return 0;
}
/* Rule (3): if hole not lost */
else if (seq_lt (hole->start, sb->high_sacked))
{
/* And we didn't already retransmit it */
if (seq_leq (hole->end, sb->high_rxt))
{
sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
return 0;
}
*snd_limited = 0;
sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
}
/* Rule (4): if hole beyond high_sacked */
else
{
ASSERT (seq_geq (hole->start, sb->high_sacked));
*snd_limited = 1;
*can_rescue = 1;
/* HighRxt MUST NOT be updated */
return 0;
}
}
if (hole && seq_lt (sb->high_rxt, hole->start))
sb->high_rxt = hole->start;
return hole;
}
void
scoreboard_init_rxt (sack_scoreboard_t * sb, u32 snd_una)
{
sack_scoreboard_hole_t *hole;
hole = scoreboard_first_hole (sb);
if (hole)
{
snd_una = seq_gt (snd_una, hole->start) ? snd_una : hole->start;
sb->cur_rxt_hole = sb->head;
}
sb->high_rxt = snd_una;
sb->rescue_rxt = snd_una - 1;
}
void
scoreboard_init (sack_scoreboard_t * sb)
{
sb->head = TCP_INVALID_SACK_HOLE_INDEX;
sb->tail = TCP_INVALID_SACK_HOLE_INDEX;
sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
}
void
scoreboard_clear (sack_scoreboard_t * sb)
{
sack_scoreboard_hole_t *hole;
while ((hole = scoreboard_first_hole (sb)))
{
scoreboard_remove_hole (sb, hole);
}
ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX);
ASSERT (pool_elts (sb->holes) == 0);
sb->sacked_bytes = 0;
sb->last_sacked_bytes = 0;
sb->last_bytes_delivered = 0;
sb->lost_bytes = 0;
sb->last_lost_bytes = 0;
sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
sb->is_reneging = 0;
}
void
scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end)
{
sack_scoreboard_hole_t *last_hole;
clib_warning ("sack reneging");
scoreboard_clear (sb);
last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
start, end);
last_hole->is_lost = 1;
sb->tail = scoreboard_hole_index (sb, last_hole);
sb->high_sacked = start;
scoreboard_init_rxt (sb, start);
}
#endif /* CLIB_MARCH_VARIANT */
/**
* Test that scoreboard is sane after recovery
*
* Returns 1 if the scoreboard is empty or if the first hole is beyond
* snd_una.
*/
static u8
tcp_scoreboard_is_sane_post_recovery (tcp_connection_t * tc)
{
sack_scoreboard_hole_t *hole;
hole = scoreboard_first_hole (&tc->sack_sb);
return (!hole || (seq_geq (hole->start, tc->snd_una)
&& seq_lt (hole->end, tc->snd_nxt)));
}
#ifndef CLIB_MARCH_VARIANT
void
tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
{
sack_scoreboard_hole_t *hole, *next_hole;
sack_scoreboard_t *sb = &tc->sack_sb;
sack_block_t *blk, *rcv_sacks;
u32 blk_index = 0, i, j;
u8 has_rxt;
sb->last_sacked_bytes = 0;
sb->last_bytes_delivered = 0;
sb->rxt_sacked = 0;
if (!tcp_opts_sack (&tc->rcv_opts)
&& sb->head == TCP_INVALID_SACK_HOLE_INDEX)
return;
has_rxt = tcp_in_cong_recovery (tc);
/* Remove invalid blocks */
blk = tc->rcv_opts.sacks;
while (blk < vec_end (tc->rcv_opts.sacks))
{
if (seq_lt (blk->start, blk->end)
&& seq_gt (blk->start, tc->snd_una)
&& seq_gt (blk->start, ack)
&& seq_lt (blk->start, tc->snd_nxt)
&& seq_leq (blk->end, tc->snd_nxt))
{
blk++;
continue;
}
vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks);
}
/* Add block for cumulative ack */
if (seq_gt (ack, tc->snd_una))
{
vec_add2 (tc->rcv_opts.sacks, blk, 1);
blk->start = tc->snd_una;
blk->end = ack;
}
if (vec_len (tc->rcv_opts.sacks) == 0)
return;
tcp_scoreboard_trace_add (tc, ack);
/* Make sure blocks are ordered */
rcv_sacks = tc->rcv_opts.sacks;
for (i = 0; i < vec_len (rcv_sacks); i++)
for (j = i + 1; j < vec_len (rcv_sacks); j++)
if (seq_lt (rcv_sacks[j].start, rcv_sacks[i].start))
{
sack_block_t tmp = rcv_sacks[i];
rcv_sacks[i] = rcv_sacks[j];
rcv_sacks[j] = tmp;
}
if (sb->head == TCP_INVALID_SACK_HOLE_INDEX)
{
/* Handle reneging as a special case */
if (PREDICT_FALSE (sb->is_reneging))
{
/* No holes, only sacked bytes */
if (seq_leq (tc->snd_nxt, sb->high_sacked))
{
/* No progress made so return */
if (seq_leq (ack, tc->snd_una))
return;
/* Update sacked bytes delivered and return */
sb->last_bytes_delivered = ack - tc->snd_una;
sb->sacked_bytes -= sb->last_bytes_delivered;
sb->is_reneging = seq_lt (ack, sb->high_sacked);
return;
}
/* New hole above high sacked. Add it and process normally */
hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
sb->high_sacked, tc->snd_nxt);
sb->tail = scoreboard_hole_index (sb, hole);
}
/* Not reneging and no holes. Insert a first hole that covers all
* outstanding bytes */
else
{
hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
tc->snd_una, tc->snd_nxt);
sb->tail = scoreboard_hole_index (sb, hole);
}
sb->high_sacked = rcv_sacks[vec_len (rcv_sacks) - 1].end;
}
else
{
/* If we have holes but snd_nxt is beyond the last hole, update
* last hole end or add new hole after high sacked */
hole = scoreboard_last_hole (sb);
if (seq_gt (tc->snd_nxt, hole->end))
{
if (seq_geq (hole->start, sb->high_sacked))
{
hole->end = tc->snd_nxt;
}
/* New hole after high sacked block */
else if (seq_lt (sb->high_sacked, tc->snd_nxt))
{
scoreboard_insert_hole (sb, sb->tail, sb->high_sacked,
tc->snd_nxt);
}
}
/* Keep track of max byte sacked for when the last hole
* is acked */
sb->high_sacked = seq_max (rcv_sacks[vec_len (rcv_sacks) - 1].end,
sb->high_sacked);
}
/* Walk the holes with the SACK blocks */
hole = pool_elt_at_index (sb->holes, sb->head);
if (PREDICT_FALSE (sb->is_reneging))
sb->last_bytes_delivered += hole->start - tc->snd_una;
while (hole && blk_index < vec_len (rcv_sacks))
{
blk = &rcv_sacks[blk_index];
if (seq_leq (blk->start, hole->start))
{
/* Block covers hole. Remove hole */
if (seq_geq (blk->end, hole->end))
{
next_hole = scoreboard_next_hole (sb, hole);
/* If covered by ack, compute delivered bytes */
if (blk->end == ack)
{
u32 sacked = next_hole ? next_hole->start : sb->high_sacked;
if (PREDICT_FALSE (seq_lt (ack, sacked)))
{
sb->last_bytes_delivered += ack - hole->end;
sb->is_reneging = 1;
}
else
{
sb->last_bytes_delivered += sacked - hole->end;
sb->is_reneging = 0;
}
}
scoreboard_update_sacked_rxt (sb, hole->start, hole->end,
has_rxt);
scoreboard_remove_hole (sb, hole);
hole = next_hole;
}
/* Partial 'head' overlap */
else
{
if (seq_gt (blk->end, hole->start))
{
scoreboard_update_sacked_rxt (sb, hole->start, blk->end,
has_rxt);
hole->start = blk->end;
}
blk_index++;
}
}
else
{
/* Hole must be split */
if (seq_lt (blk->end, hole->end))
{
u32 hole_index = scoreboard_hole_index (sb, hole);
next_hole = scoreboard_insert_hole (sb, hole_index, blk->end,
hole->end);
/* Pool might've moved */
hole = scoreboard_get_hole (sb, hole_index);
hole->end = blk->start;
scoreboard_update_sacked_rxt (sb, blk->start, blk->end,
has_rxt);
blk_index++;
ASSERT (hole->next == scoreboard_hole_index (sb, next_hole));
}
else if (seq_lt (blk->start, hole->end))
{
scoreboard_update_sacked_rxt (sb, blk->start, hole->end,
has_rxt);
hole->end = blk->start;
}
hole = scoreboard_next_hole (sb, hole);
}
}
scoreboard_update_bytes (sb, ack, tc->snd_mss);
ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc));
ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc)
|| sb->sacked_bytes <= tc->snd_nxt - seq_max (tc->snd_una, ack));
ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_nxt
- seq_max (tc->snd_una, ack) || tcp_in_recovery (tc));
ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
|| sb->is_reneging || sb->holes[sb->head].start == ack);
ASSERT (sb->last_lost_bytes <= sb->lost_bytes);
ASSERT ((ack - tc->snd_una) + sb->last_sacked_bytes
- sb->last_bytes_delivered >= sb->rxt_sacked);
ASSERT ((ack - tc->snd_una) >= tc->sack_sb.last_bytes_delivered
|| (tc->flags & TCP_CONN_FINSNT));
TCP_EVT (TCP_EVT_CC_SCOREBOARD, tc);
}
#endif /* CLIB_MARCH_VARIANT */
/**
* Try to update snd_wnd based on feedback received from peer.
*
* If successful, and new window is 'effectively' 0, activate persist
* timer.
*/
static void
tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd)
{
/* If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set
* SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */
if (seq_lt (tc->snd_wl1, seq)
|| (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack)))
{
tc->snd_wnd = snd_wnd;
tc->snd_wl1 = seq;
tc->snd_wl2 = ack;
TCP_EVT (TCP_EVT_SND_WND, tc);
if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss))
{
/* Set persist timer if not set and we just got 0 wnd */
if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)
&& !tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT))
tcp_persist_timer_set (tc);
}
else
{
tcp_persist_timer_reset (tc);
if (PREDICT_FALSE (!tcp_in_recovery (tc) && tc->rto_boff > 0))
{
tc->rto_boff = 0;
tcp_update_rto (tc);
}
}
}
}
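/* Example (illustrative): with snd_wl1 = 50 and snd_wl2 = 200, a segment
 * carrying seq = 50, ack = 250, wnd = 8192 passes the second branch of the
 * test above (snd_wl1 == seq and snd_wl2 <= ack), so snd_wnd becomes 8192
 * and (snd_wl1, snd_wl2) is updated to (50, 250). A stale segment with
 * seq = 40 would leave the window untouched. */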
/**
* Init loss recovery/fast recovery.
*
* Triggered by dup acks as opposed to timer timeout. Note that cwnd is
* updated in @ref tcp_cc_handle_event after fast retransmit
*/
static void
tcp_cc_init_congestion (tcp_connection_t * tc)
{
tcp_fastrecovery_on (tc);
tc->snd_congestion = tc->snd_nxt;
tc->cwnd_acc_bytes = 0;
tc->snd_rxt_bytes = 0;
tc->rxt_delivered = 0;
tc->prr_delivered = 0;
tc->prr_start = tc->snd_una;
tc->prev_ssthresh = tc->ssthresh;
tc->prev_cwnd = tc->cwnd;
tc->snd_rxt_ts = tcp_tstamp (tc);
tcp_cc_congestion (tc);
/* Post retransmit update cwnd to ssthresh and account for the
* three segments that have left the network and should've been
* buffered at the receiver XXX */
if (!tcp_opts_sack_permitted (&tc->rcv_opts))
tc->cwnd += 3 * tc->snd_mss;
tc->fr_occurences += 1;
TCP_EVT (TCP_EVT_CC_EVT, tc, 4);
}
static void
tcp_cc_congestion_undo (tcp_connection_t * tc)
{
tc->cwnd = tc->prev_cwnd;
tc->ssthresh = tc->prev_ssthresh;
tcp_cc_undo_recovery (tc);
ASSERT (tc->rto_boff == 0);
TCP_EVT (TCP_EVT_CC_EVT, tc, 5);
}
static inline u8
tcp_cc_is_spurious_timeout_rxt (tcp_connection_t * tc)
{
return (tcp_in_recovery (tc) && tc->rto_boff == 1
&& tc->snd_rxt_ts
&& tcp_opts_tstamp (&tc->rcv_opts)
&& timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
}
static inline u8
tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
{
return (tcp_cc_is_spurious_timeout_rxt (tc));
}
static inline u8
tcp_should_fastrecover_sack (tcp_connection_t * tc)
{
return (tc->sack_sb.lost_bytes
|| ((TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss
< tc->sack_sb.sacked_bytes));
}
static inline u8
tcp_should_fastrecover (tcp_connection_t * tc, u8 has_sack)
{
if (!has_sack)
{
/* If either of the two conditions below holds, reset dupacks because
* we're probably after a timeout (RFC6582 heuristics).
* That is, if the cumulative ack does not cover more than the congestion
* threshold, and:
* 1) The following doesn't hold: the congestion window is greater
* than SMSS bytes and the difference between highest_ack
* and prev_highest_ack is at most 4*SMSS bytes
* 2) The echoed timestamp in the last non-dup ack does not equal the
* stored timestamp
*/
if (seq_leq (tc->snd_una, tc->snd_congestion)
&& ((!(tc->cwnd > tc->snd_mss
&& tc->bytes_acked <= 4 * tc->snd_mss))
|| (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
{
tc->rcv_dupacks = 0;
return 0;
}
}
return ((tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
|| tcp_should_fastrecover_sack (tc));
}
static int
tcp_cc_recover (tcp_connection_t * tc)
{
sack_scoreboard_hole_t *hole;
u8 is_spurious = 0;
ASSERT (tcp_in_cong_recovery (tc));
if (tcp_cc_is_spurious_retransmit (tc))
{
tcp_cc_congestion_undo (tc);
is_spurious = 1;
}
tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ );
tc->rcv_dupacks = 0;
/* Previous recovery left us congested. Continue sending as part
* of the current recovery event with an updated snd_congestion */
if (tc->sack_sb.sacked_bytes)
{
tc->snd_congestion = tc->snd_nxt;
tcp_program_retransmit (tc);
return is_spurious;
}
tc->rxt_delivered = 0;
tc->snd_rxt_bytes = 0;
tc->snd_rxt_ts = 0;
tc->prr_delivered = 0;
tc->rtt_ts = 0;
tc->flags &= ~TCP_CONN_RXT_PENDING;
hole = scoreboard_first_hole (&tc->sack_sb);
if (hole && hole->start == tc->snd_una && hole->end == tc->snd_nxt)
scoreboard_clear (&tc->sack_sb);
if (!tcp_in_recovery (tc) && !is_spurious)
tcp_cc_recovered (tc);
tcp_fastrecovery_off (tc);
tcp_fastrecovery_first_off (tc);
tcp_recovery_off (tc);
TCP_EVT (TCP_EVT_CC_EVT, tc, 3);
ASSERT (tc->rto_boff == 0);
ASSERT (!tcp_in_cong_recovery (tc));
ASSERT (tcp_scoreboard_is_sane_post_recovery (tc));
return is_spurious;
}
static void
tcp_cc_update (tcp_connection_t * tc, tcp_rate_sample_t * rs)
{
ASSERT (!tcp_in_cong_recovery (tc) || tcp_is_lost_fin (tc));
/* Congestion avoidance */
tcp_cc_rcv_ack (tc, rs);
/* If a cumulative ack, make sure dupacks is 0 */
tc->rcv_dupacks = 0;
/* When dupacks hits the threshold we only enter fast retransmit if
* cumulative ack covers more than snd_congestion. Should snd_una
* wrap this test may fail under otherwise valid circumstances.
* Therefore, proactively update snd_congestion when wrap detected. */
if (PREDICT_FALSE
(seq_leq (tc->snd_congestion, tc->snd_una - tc->bytes_acked)
&& seq_gt (tc->snd_congestion, tc->snd_una)))
tc->snd_congestion = tc->snd_una - 1;
}
/**
* One function to rule them all ... and in the darkness bind them
*/
static void
tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs,
u32 is_dack)
{
u8 has_sack = tcp_opts_sack_permitted (&tc->rcv_opts);
/*
* If not in recovery, figure out if we should enter
*/
if (!tcp_in_cong_recovery (tc))
{
ASSERT (is_dack);
tc->rcv_dupacks++;
TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1);
tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
if (tcp_should_fastrecover (tc, has_sack))
{
tcp_cc_init_congestion (tc);
if (has_sack)
scoreboard_init_rxt (&tc->sack_sb, tc->snd_una);
tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ );
tcp_program_retransmit (tc);
}
return;
}
/*
* Already in recovery
*/
/*
* Process (re)transmit feedback. Output path uses this to decide how much
* more data to release into the network
*/
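/* prr_delivered estimates the data delivered to the peer during
* recovery, in the spirit of RFC 6937 (proportional rate reduction).
* With SACK it is computed from the scoreboard; without SACK each
* dupack is assumed to signal one delivered segment. */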
if (has_sack)
{
if (!tc->bytes_acked && tc->sack_sb.rxt_sacked)
tcp_fastrecovery_first_on (tc);
tc->rxt_delivered += tc->sack_sb.rxt_sacked;
tc->prr_delivered += tc->bytes_acked + tc->sack_sb.last_sacked_bytes
- tc->sack_sb.last_bytes_delivered;
tcp_program_retransmit (tc);
}
else
{
if (is_dack)
{
tc->rcv_dupacks += 1;
TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1);
}
tc->rxt_delivered = clib_max (tc->rxt_delivered + tc->bytes_acked,
tc->snd_rxt_bytes);
if (is_dack)
tc->prr_delivered += clib_min (tc->snd_mss,
tc->snd_nxt - tc->snd_una);
else
tc->prr_delivered += tc->bytes_acked - clib_min (tc->bytes_acked,
tc->snd_mss *
tc->rcv_dupacks);
/* If partial ack, assume that the first un-acked segment was lost */
if (tc->bytes_acked || tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
tcp_fastrecovery_first_on (tc);
tcp_program_retransmit (tc);
}
/*
* See if we can exit and stop retransmitting
*/
if (seq_geq (tc->snd_una, tc->snd_congestion))
{
/* If the recovery was spurious, everything has already been updated */
if (tcp_cc_recover (tc))
{
tc->tsecr_last_ack = tc->rcv_opts.tsecr;
return;
}
/* Treat as congestion avoidance ack */
tcp_cc_rcv_ack (tc, rs);
return;
}
/*
* Notify cc of the event
*/
if (!tc->bytes_acked)
{
tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
return;
}
/* RFC6675: If the incoming ACK is a cumulative acknowledgment,
* reset dupacks to 0. Also needed if in congestion recovery */
tc->rcv_dupacks = 0;
if (tcp_in_recovery (tc))
tcp_cc_rcv_ack (tc, rs);
else
tcp_cc_rcv_cong_ack (tc, TCP_CC_PARTIALACK, rs);
}
/**
* Check if an ack is a duplicate ack as per RFC5681 Sec. 2, i.e., it
* repeats the highest cumulative ack while data is still outstanding,
* carries no payload and does not update the send window.
*/
always_inline u8
tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd,
u32 prev_snd_una)
{
return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una)
&& seq_gt (tc->snd_nxt, tc->snd_una)
&& (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number)
&& (prev_snd_wnd == tc->snd_wnd));
}
/**
* Checks if ack is a congestion control event.
*/
static u8
tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b,
u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack)
{
/* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are
* defined to be 'duplicate' as well */
*is_dack = tc->sack_sb.last_sacked_bytes
|| tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una);
/* If reneging, wait for timer based retransmits */
if (PREDICT_FALSE (tcp_is_lost_fin (tc) || tc->sack_sb.is_reneging))
return 0;
return (*is_dack || tcp_in_cong_recovery (tc));
}
/**
* Process incoming ACK
*/
static int
tcp_rcv_ack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b,
tcp_header_t * th, u32 * error)
{
u32 prev_snd_wnd, prev_snd_una;
tcp_rate_sample_t rs = { 0 };
u8 is_dack;
TCP_EVT (TCP_EVT_CC_STAT, tc);
/* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */
if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
{
/* We've probably entered recovery and snd_nxt was rewound, so the
* peer is acking data we sent earlier. Update snd_nxt and accept
* the ack */
if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max)
&& seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
{
tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
goto process_ack;
}
tc->errors.above_ack_wnd += 1;
*error = TCP_ERROR_ACK_FUTURE;
TCP_EVT (TCP_EVT_ACK_RCV_ERR, tc, 0, vnet_buffer (b)->tcp.ack_number);
return -1;
}
/* If old ACK, probably it's an old dupack */
if (PREDICT_FALSE (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)))
{
tc->errors.below_ack_wnd += 1;
*error = TCP_ERROR_ACK_OLD;
TCP_EVT (TCP_EVT_ACK_RCV_ERR, tc, 1, vnet_buffer (b)->tcp.ack_number);
if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
tcp_cc_handle_event (tc, 0, 1);
/* Don't drop yet */
return 0;
}
process_ack:
/*
* Looks okay, process feedback
*/
if (tcp_opts_sack_permitted (&tc->rcv_opts))
tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
prev_snd_wnd = tc->snd_wnd;
prev_snd_una = tc->snd_una;
tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number,
vnet_buffer (b)->tcp.ack_number,
clib_net_to_host_u16 (th->window) << tc->snd_wscale);
tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
tc->snd_una = vnet_buffer (b)->tcp.ack_number;
tcp_validate_txf_size (tc, tc->bytes_acked);
if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
tcp_bt_sample_delivery_rate (tc, &rs);
tcp_program_dequeue (wrk, tc);
if (tc->bytes_acked)
tcp_update_rtt (tc, &rs, vnet_buffer (b)->tcp.ack_number);
TCP_EVT (TCP_EVT_ACK_RCVD, tc);
/*
* Check if we have congestion event
*/
if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack))
{
tcp_cc_handle_event (tc, &rs, is_dack);
tc->dupacks_in += is_dack;
if (!tcp_in_cong_recovery (tc))
{
*error = TCP_ERROR_ACK_OK;
return 0;
}
*error = TCP_ERROR_ACK_DUP;
if (vnet_buffer (b)->tcp.data_len || tcp_is_fin (th))
return 0;
return -1;
}
/*
* Update congestion control (slow start/congestion avoidance)
*/
tcp_cc_update (tc, &rs);
*error = TCP_ERROR_ACK_OK;
return 0;
}
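/** Queue the connection for a deferred disconnect notification, making
* sure it is added to the worker's pending list only once. */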
static void
tcp_program_disconnect (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
{
if (!tcp_disconnect_pending (tc))
{
vec_add1 (wrk->pending_disconnects, tc->c_c_index);
tcp_disconnect_pending_on (tc);
}
}
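/** Flush the worker's pending disconnects, notifying the session layer
* that the transports are closing. */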
static void
tcp_handle_disconnects (tcp_worker_ctx_t * wrk)
{
u32 thread_index, *pending_disconnects;
tcp_connection_t *tc;
int i;
if (!vec_len (wrk->pending_disconnects))
return;
thread_index = wrk->vm->thread_index;
pending_disconnects = wrk->pending_disconnects;
for (i = 0; i < vec_len (pending_disconnects); i++)
{
tc = tcp_connection_get (pending_disconnects[i], thread_index);
tcp_disconnect_pending_off (tc);
session_transport_closing_notify (&tc->connection);
}
_vec_len (wrk->pending_disconnects) = 0;
}
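/** Handle a FIN received in order: ack it, move to CLOSE-WAIT and
* program a disconnect notification for the session layer. */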
static void
tcp_rcv_fin (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b,
u32 * error)
{
/* Reject out-of-order fins */
if (vnet_buffer (b)->tcp.seq_end != tc->rcv_nxt)
return;
/* Account for the FIN and send ack */
tc->rcv_nxt += 1;
tc->flags |= TCP_CONN_FINRCVD;
tcp_program_ack (tc);
/* Enter CLOSE-WAIT and notify session. To avoid lingering
* in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
tcp_connection_set_state (tc, TCP_STATE_CLOSE_WAIT);
tcp_program_disconnect (wrk, tc);
tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, tcp_cfg.closewait_time);
TCP_EVT (TCP_EVT_FIN_RCVD, tc);
*error = TCP_ERROR_FIN_RCVD;
}
#ifndef CLIB_MARCH_VARIANT
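/** Check that no two consecutive SACK blocks touch; adjacent blocks
* should have been merged. */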
static u8
tcp_sack_vector_is_sane (sack_block_t * sacks)
{
int i;
for (i = 1; i < vec_len (sacks); i++)
{
if (sacks[i - 1].end == sacks[i].start)
return 0;
}
return 1;
}
/**
* Build SACK list as per RFC2018.
*
* Makes sure the first block contains the segment that generated the current
* ACK and the following ones are the ones most recently reported in SACK
* blocks.
*
* @param tc TCP connection for which the SACK list is updated
* @param start Start sequence number of the newest SACK block
* @param end End sequence of the newest SACK block
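*
* For example, with rcv_nxt at 100 and an existing block [150, 200), a
* new out-of-order segment [120, 150) becomes the first block and is
* merged with the overlapping old one, leaving a single block [120, 200).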
*/
void
tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end)
{
sack_block_t *new_list = tc->snd_sacks_fl, *block = 0;
int i;
/* If the first segment is ooo add it to the list. Last write might've moved
* rcv_nxt over the first segment. */
if (seq_lt (tc->rcv_nxt, start))
{
vec_add2 (new_list, block, 1);
block->start = start;
block->end = end;
}
/* Find the blocks still worth keeping. */
for (i = 0; i < vec_len (tc->snd_sacks); i++)
{
/* Discard if rcv_nxt advanced beyond current block */
if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt))
continue;
/* Merge or drop if segment overlapped by the new segment */
if (block && (seq_geq (tc->snd_sacks[i].end, new_list[0].start)
&& seq_leq (tc->snd_sacks[i].start, new_list[0].end)))
{
if (seq_lt (tc->snd_sacks[i].start, new_list[0].start))
new_list[0].start = tc->snd_sacks[i].start;
if (seq_lt (new_list[0].end, tc->snd_sacks[i].end))
new_list[0].end = tc->snd_sacks[i].end;
continue;
}
/* Save to new SACK list if we have space. */
if (vec_len (new_list) < TCP_MAX_SACK_BLOCKS)
vec_add1 (new_list, tc->snd_sacks[i]);
}
ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS);
/* Replace old vector with new one */
vec_reset_length (tc->snd_sacks);
tc->snd_sacks_fl = tc->snd_sacks;
tc->snd_sacks = new_list;
/* Segments should not 'touch' */
ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks));
}
u32
tcp_sack_list_bytes (tcp_connection_t * tc)
{
u32 bytes = 0, i;
for (i = 0; i < vec_len (tc->snd_sacks); i++)
bytes += tc->snd_sacks[i].end - tc->snd_sacks[i].start;
return bytes;
}
#endif /* CLIB_MARCH_VARIANT */
/** Enqueue data for delivery to application */
static int
tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
u16 data_len)
{
int written, error = TCP_ERROR_ENQUEUED;
ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
ASSERT (data_len);
written = session_enqueue_stream_connection (&tc->connection, b, 0,
1 /* queue event */ , 1);
tc->bytes_in += written;
TCP_EVT (TCP_EVT_INPUT, tc, 0, data_len, written);
/* Update rcv_nxt */
if (PREDICT_TRUE (written == data_len))
{
tc->rcv_nxt += written;
}
/* More bytes written than the segment held because in-order data
* allowed previously buffered out-of-order segments to be dequeued
* as well. */
else if (written > data_len)
{
tc->rcv_nxt += written;
TCP_EVT (TCP_EVT_CC_INPUT, tc, data_len, written);
}
else if (written > 0)
{
/* We've written something but FIFO is probably full now */
tc->rcv_nxt += written;
error = TCP_ERROR_PARTIALLY_ENQUEUED;
}
else
{
return TCP_ERROR_FIFO_FULL;
}
/* Update SACK list if need be */
if (tcp_opts_sack_permitted (&tc->rcv_opts))
{
/* Remove SACK blocks that have been delivered */
tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt);
}
return error;
}
/** Enqueue out-of-order data */
static int
tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b,
u16 data_len)
{
session_t *s0;
int rv, offset;
ASSERT (seq_gt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
ASSERT (data_len);
/* Enqueue out-of-order data with relative offset */
rv = session_enqueue_stream_connection (&tc->connection, b,
vnet_buffer (b)->tcp.seq_number -
tc->rcv_nxt, 0 /* queue event */ ,
0);
/* Nothing written */
if (rv)
{
TCP_EVT (TCP_EVT_INPUT, tc, 1, data_len, 0);
return TCP_ERROR_FIFO_FULL;
}
TCP_EVT (TCP_EVT_INPUT, tc, 1, data_len, data_len);
tc->bytes_in += data_len;
/* Update SACK list if in use */
if (tcp_opts_sack_permitted (&tc->rcv_opts))
{
ooo_segment_t *newest;
u32 start, end;
s0 = session_get (tc->c_s_index, tc->c_thread_index);
/* Get the newest segment from the fifo */
newest = svm_fifo_newest_ooo_segment (s0->rx_fifo);
if (newest)
{
offset = ooo_segment_offset_prod (s0->rx_fifo, newest);
ASSERT (offset <= vnet_buffer (b)->tcp.seq_number - tc->rcv_nxt);
start = tc->rcv_nxt + offset;
end = start + ooo_segment_length (s0->rx_fifo, newest);
tcp_update_sack_list (tc, start, end);
svm_fifo_newest_ooo_segment_reset (s0->rx_fifo);
TCP_EVT (TCP_EVT_CC_SACKS, tc);
}
}
return TCP_ERROR_ENQUEUED_OOO;
}
/**
* Check if the ack can be delayed. Returns 1 if the ack may be delayed
* and 0 if it must be sent immediately, e.g., if TCP_ALWAYS_ACK is set,
* an ack was explicitly requested or we're almost out of send space.
*/
always_inline int
tcp_can_delack (tcp_connection_t * tc)
{
/* Send ack if ... */
if (TCP_ALWAYS_ACK
/* just sent a rcv wnd 0
|| (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0 */
/* constrained to send ack */
|| (tc->flags & TCP_CONN_SNDACK) != 0
/* we're almost out of tx wnd */
|| tcp_available_cc_snd_space (tc) < 4 * tc->snd_mss)
return 0;
return 1;
}
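/** Advance past the first n_bytes_to_drop bytes of a possibly chained
* buffer and adjust the tcp data length accordingly. */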
static int
tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop)
{
u32 discard, first = b->current_length;
vlib_main_t *vm = vlib_get_main ();
/* Handle multi-buffer segments */
if (n_bytes_to_drop > b->current_length)
{
if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
return -1;
do
{
discard = clib_min (n_bytes_to_drop, b->current_length);
vlib_buffer_advance (b, discard);
b = vlib_get_buffer (vm, b->next_buffer);
n_bytes_to_drop -= discard;
}
while (n_bytes_to_drop);
if (n_bytes_to_drop > first)
b->total_length_not_including_first_buffer -= n_bytes_to_drop - first;
}
else
vlib_buffer_advance (b, n_bytes_to_drop);
vnet_buffer (b)->tcp.data_len -= n_bytes_to_drop;
return 0;
}
/**
* Receive data segment for connection and program acks
*
* Handles both in-order and out-of-order data.
*/
static int
tcp_segment_rcv (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
vlib_buffer_t * b)
{
u32 error, n_bytes_to_drop, n_data_bytes;
vlib_buffer_advance (b, vnet_buffer (b)->tcp.data_offset);
n_data_bytes = vnet_buffer (b)->tcp.data_len;
ASSERT (n_data_bytes);
tc->data_segs_in += 1;
/* Handle out-of-order data */
if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt))
{
/* Old sequence numbers allowed through because they overlapped
* the rx window */
if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt))
{
/* Completely in the past (possible retransmit). Ack
* retransmissions since we may not have any data to send */
if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
{
tcp_program_ack (tc);
error = TCP_ERROR_SEGMENT_OLD;
goto done;
}
/* Chop off the bytes in the past and see if what is left
* can be enqueued in order */
n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
n_data_bytes -= n_bytes_to_drop;
vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt;
if (tcp_buffer_discard_bytes (b, n_bytes_to_drop))
{
error = TCP_ERROR_SEGMENT_OLD;
goto done;
}
goto in_order;
}
/* RFC2581: Enqueue and send DUPACK for fast retransmit */
error = tcp_session_enqueue_ooo (tc, b, n_data_bytes);
tcp_program_dupack (tc);
TCP_EVT (TCP_EVT_DUPACK_SENT, tc, vnet_buffer (b)->tcp);
tc->errors.above_data_wnd += seq_gt (vnet_buffer (b)->tcp.seq_end,
tc->rcv_las + tc->rcv_wnd);
goto done;
}
in_order:
/* In-order data, enqueue. The fifo figures out by itself whether any
* out-of-order segments can be enqueued once the fifo tail offset
* changes. */
error = tcp_session_enqueue_data (tc, b, n_data_bytes);
if (tcp_can_delack (tc))
{
if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
tcp_timer_set (tc, TCP_TIMER_DELACK, tcp_cfg.delack_time);
goto done;
}
tcp_program_ack (tc);
done:
return error;
}
typedef struct
{
tcp_header_t tcp_header;
tcp_connection_t tcp_connection;
} tcp_rx_trace_t;
static u8 *
format_tcp_rx_trace (u8 * s, va_list * args)
{
CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
u32 indent = format_get_indent (s);
s = format (s, "%U\n%U%U",
format_tcp_header, &t->tcp_header, 128,
format_white_space, indent,
format_tcp_connection, &t->tcp_connection, 1);
return s;
}
static u8 *
format_tcp_rx_trace_short (u8 * s, va_list * args)
{
CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
s = format (s, "%d -> %d (%U)",
clib_net_to_host_u16 (t->tcp_header.dst_port),
clib_net_to_host_u16 (t->tcp_header.src_port), format_tcp_state,
t->tcp_connection.state);
return s;
}
static void
tcp_set_rx_trace_data (tcp_rx_trace_t * t0, tcp_connection_t * tc0,
tcp_header_t * th0, vlib_buffer_t * b0, u8 is_ip4)
{
if (tc0)
{
clib_memcpy_fast (&t0->tcp_connection, tc0,
sizeof (t0->tcp_connection));
}
else
{
th0 = tcp_buffer_hdr (b0);
}
clib_memcpy_fast (&t0->tcp_header, th0, sizeof (t0->tcp_header));
}
static void
tcp_established_trace_frame (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * frame, u8 is_ip4)
{
u32 *from, n_left;
n_left = frame->n_vectors;
from = vlib_frame_vector_args (frame);
while (n_left >= 1)
{
tcp_connection_t *tc0;
tcp_rx_trace_t *t0;
tcp_header_t *th0;
vlib_buffer_t *b0;
u32 bi0;
bi0 = from[0];
b0 = vlib_get_buffer (vm, bi0);
if (b0->flags & VLIB_BUFFER_IS_TRACED)
{
t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
vm->thread_index);
th0 = tcp_buffer_hdr (b0);
tcp_set_rx_trace_data (t0, tc0, th0, b0, is_ip4);
}
from += 1;
n_left -= 1;
}
}
always_inline void
tcp_node_inc_counter_i (vlib_main_t * vm, u32 tcp4_node, u32 tcp6_node,
u8 is_ip4, u32 evt, u32 val)
{
if (is_ip4)
vlib_node_increment_counter (vm, tcp4_node, evt, val);
else
vlib_node_increment_counter (vm, tcp6_node, evt, val);
}
#define tcp_maybe_inc_counter(node_id, err, count) \
{ \
if (next0 != tcp_next_drop (is_ip4)) \
tcp_node_inc_counter_i (vm, tcp4_##node_id##_node.index, \
tcp6_##node_id##_node.index, is_ip4, err, \
1); \
}
#define tcp_inc_counter(node_id, err, count) \
tcp_node_inc_counter_i (vm, tcp4_##node_id##_node.index, \
tcp6_##node_id##_node.index, is_ip4, \
err, count)
#define tcp_maybe_inc_err_counter(cnts, err) \
{ \
cnts[err] += (next0 != tcp_next_drop (is_ip4)); \
}
#define tcp_inc_err_counter(cnts, err, val) \
{ \
cnts[err] += val; \
}
#define tcp_store_err_counters(node_id, cnts) \
{ \
int i; \
for (i = 0; i < TCP_N_ERROR; i++) \
if (cnts[i]) \
tcp_inc_counter(node_id, i, cnts[i]); \
}
always_inline uword
tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * frame, int is_ip4)
{
u32 thread_index = vm->thread_index, errors = 0;
tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
u32 n_left_from, *from, *first_buffer;
u16 err_counters[TCP_N_ERROR] = { 0 };
if (node->flags & VLIB_NODE_FLAG_TRACE)
tcp_established_trace_frame (vm, node, frame, is_ip4);
first_buffer = from = vlib_frame_vector_args (frame);
n_left_from = frame->n_vectors;
while (n_left_from > 0)
{
u32 bi0, error0 = TCP_ERROR_ACK_OK;
vlib_buffer_t *b0;
tcp_header_t *th0;
tcp_connection_t *tc0;
if (n_left_from > 1)
{
vlib_buffer_t *pb;
pb = vlib_get_buffer (vm, from[1]);
vlib_prefetch_buffer_header (pb, LOAD);
CLIB_PREFETCH (pb->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
}
bi0 = from[0];
from += 1;
n_left_from -= 1;
b0 = vlib_get_buffer (vm, bi0);
tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
thread_index);
if (PREDICT_FALSE (tc0 == 0))
{
error0 = TCP_ERROR_INVALID_CONNECTION;
goto done;
}
th0 = tcp_buffer_hdr (b0);
/* TODO header prediction fast path */
/* 1-4: check SEQ, RST, SYN */
if (PREDICT_FALSE (tcp_segment_validate (wrk, tc0, b0, th0, &error0)))
{
TCP_EVT (TCP_EVT_SEG_INVALID, tc0, vnet_buffer (b0)->tcp);
goto done;
}
/* 5: check the ACK field */
if (PREDICT_FALSE (tcp_rcv_ack (wrk, tc0, b0, th0, &error0)))
goto done;
/* 6: check the URG bit TODO */
/* 7: process the segment text */
if (vnet_buffer (b0)->tcp.data_len)
error0 = tcp_segment_rcv (wrk, tc0, b0);
/* 8: check the FIN bit */
if (PREDICT_FALSE (tcp_is_fin (th0)))
tcp_rcv_fin (wrk, tc0, b0, &error0);
done:
tcp_inc_err_counter (err_counters, error0, 1);
}
errors = session_main_flush_enqueue_events (TRANSPORT_PROTO_TCP,
thread_index);
err_counters[TCP_ERROR_MSG_QUEUE_FULL] = errors;
tcp_store_err_counters (established, err_counters);
tcp_handle_postponed_dequeues (wrk);
tcp_handle_disconnects (wrk);
vlib_buffer_free (vm, first_buffer, frame->n_vectors);
return frame->n_vectors;
}
VLIB_NODE_FN (tcp4_established_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
return tcp46_established_inline (vm, node, from_frame, 1 /* is_ip4 */ );
}
VLIB_NODE_FN (tcp6_established_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
return tcp46_established_inline (vm, node, from_frame, 0 /* is_ip4 */ );
}
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (tcp4_established_node) =
{
.name = "tcp4-established",
/* Takes a vector of packets. */
.vector_size = sizeof (u32),
.n_errors = TCP_N_ERROR,
.error_strings = tcp_error_strings,
.n_next_nodes = TCP_ESTABLISHED_N_NEXT,
.next_nodes =
{
#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
foreach_tcp_state_next
#undef _
},
.format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (tcp6_established_node) =
{
.name = "tcp6-established",
/* Takes a vector of packets. */
.vector_size = sizeof (u32),
.n_errors = TCP_N_ERROR,
.error_strings = tcp_error_strings,
.n_next_nodes = TCP_ESTABLISHED_N_NEXT,
.next_nodes =
{
#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
foreach_tcp_state_next
#undef _
},
.format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
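/**
* Sanity check that the connection returned by a session lookup matches
* the packet's addresses and ports, allowing for listeners, proxies and
* half-open connections where only a partial match is expected.
*/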
static u8
tcp_lookup_is_valid (tcp_connection_t * tc, vlib_buffer_t * b,
tcp_header_t * hdr)
{
transport_connection_t *tmp = 0;
u64 handle;
if (!tc)
return 1;
/* Proxy case */
if (tc->c_lcl_port == 0 && tc->state == TCP_STATE_LISTEN)
return 1;
u8 is_ip_valid = 0, val_l, val_r;
if (tc->connection.is_ip4)
{
ip4_header_t *ip4_hdr = (ip4_header_t *) vlib_buffer_get_current (b);
val_l = !ip4_address_compare (&ip4_hdr->dst_address,
&tc->connection.lcl_ip.ip4);
val_l = val_l || ip_is_zero (&tc->connection.lcl_ip, 1);
val_r = !ip4_address_compare (&ip4_hdr->src_address,
&tc->connection.rmt_ip.ip4);
val_r = val_r || tc->state == TCP_STATE_LISTEN;
is_ip_valid = val_l && val_r;
}
else
{
ip6_header_t *ip6_hdr = (ip6_header_t *) vlib_buffer_get_current (b);
val_l = !ip6_address_compare (&ip6_hdr->dst_address,
&tc->connection.lcl_ip.ip6);
val_l = val_l || ip_is_zero (&tc->connection.lcl_ip, 0);
val_r = !ip6_address_compare (&ip6_hdr->src_address,
&tc->connection.rmt_ip.ip6);
val_r = val_r || tc->state == TCP_STATE_LISTEN;
is_ip_valid = val_l && val_r;
}
u8 is_valid = (tc->c_lcl_port == hdr->dst_port
&& (tc->state == TCP_STATE_LISTEN
|| tc->c_rmt_port == hdr->src_port) && is_ip_valid);
if (!is_valid)
{
handle = session_lookup_half_open_handle (&tc->connection);
tmp = session_lookup_half_open_connection (handle & 0xFFFFFFFF,
tc->c_proto, tc->c_is_ip4);
if (tmp)
{
if (tmp->lcl_port == hdr->dst_port
&& tmp->rmt_port == hdr->src_port)
{
TCP_DBG ("half-open is valid!");
is_valid = 1;
}
}
}
return is_valid;
}
/**
* Lookup transport connection
*/
static tcp_connection_t *
tcp_lookup_connection (u32 fib_index, vlib_buffer_t * b, u8 thread_index,
u8 is_ip4)
{
tcp_header_t *tcp;
transport_connection_t *tconn;
tcp_connection_t *tc;
u8 is_filtered = 0;
if (is_ip4)
{
ip4_header_t *ip4;
ip4 = vlib_buffer_get_current (b);
tcp = ip4_next_header (ip4);
tconn = session_lookup_connection_wt4 (fib_index,
&ip4->dst_address,
&ip4->src_address,
tcp->dst_port,
tcp->src_port,
TRANSPORT_PROTO_TCP,
thread_index, &is_filtered);
tc = tcp_get_connection_from_transport (tconn);
ASSERT (tcp_lookup_is_valid (tc, b, tcp));
}
else
{
ip6_header_t *ip6;
ip6 = vlib_buffer_get_current (b);
tcp = ip6_next_header (ip6);
tconn = session_lookup_connection_wt6 (fib_index,
&ip6->dst_address,
&ip6->src_address,
tcp->dst_port,
tcp->src_port,
TRANSPORT_PROTO_TCP,
thread_index, &is_filtered);
tc = tcp_get_connection_from_transport (tconn);
ASSERT (tcp_lookup_is_valid (tc, b, tcp));
}
return tc;
}
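/**
* Enable TSO for the connection if the output interface, found by a fib
* lookup on the remote address, advertises GSO support.
*/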
always_inline void
tcp_check_tx_offload (tcp_connection_t * tc, int is_ipv4)
{
vnet_main_t *vnm = vnet_get_main ();
const dpo_id_t *dpo;
const load_balance_t *lb;
vnet_hw_interface_t *hw_if;
u32 sw_if_idx, lb_idx;
if (is_ipv4)
{
ip4_address_t *dst_addr = &(tc->c_rmt_ip.ip4);
lb_idx = ip4_fib_forwarding_lookup (tc->c_fib_index, dst_addr);
}
else
{
ip6_address_t *dst_addr = &(tc->c_rmt_ip.ip6);
lb_idx = ip6_fib_table_fwding_lookup (tc->c_fib_index, dst_addr);
}
lb = load_balance_get (lb_idx);
dpo = load_balance_get_bucket_i (lb, 0);
sw_if_idx = dpo->dpoi_index;
hw_if = vnet_get_sup_hw_interface (vnm, sw_if_idx);
if (hw_if->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO)
tc->cfg_flags |= TCP_CFG_F_TSO;
}
always_inline uword
tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * from_frame, int is_ip4)
{
u32 n_left_from, *from, *first_buffer, errors = 0;
u32 my_thread_index = vm->thread_index;
tcp_worker_ctx_t *wrk = tcp_get_worker (my_thread_index);
from = first_buffer = vlib_frame_vector_args (from_frame);
n_left_from = from_frame->n_vectors;
while (n_left_from > 0)
{
u32 bi0, ack0, seq0, error0 = TCP_ERROR_NONE;
tcp_connection_t *tc0, *new_tc0;
tcp_header_t *tcp0 = 0;
tcp_rx_trace_t *t0;
vlib_buffer_t *b0;
bi0 = from[0];
from += 1;
n_left_from -= 1;
b0 = vlib_get_buffer (vm, bi0);
tc0 =
tcp_half_open_connection_get (vnet_buffer (b0)->tcp.connection_index);
if (PREDICT_FALSE (tc0 == 0))
{
error0 = TCP_ERROR_INVALID_CONNECTION;
goto drop;
}
/* Half-open completed recently but the connection wasn't removed
* yet by the owning thread */
if (PREDICT_FALSE (tc0->flags & TCP_CONN_HALF_OPEN_DONE))
{
/* Make sure the connection actually exists */
ASSERT (tcp_lookup_connection (tc0->c_fib_index, b0,
my_thread_index, is_ip4));
error0 = TCP_ERROR_SPURIOUS_SYN_ACK;
goto drop;
}
ack0 = vnet_buffer (b0)->tcp.ack_number;
seq0 = vnet_buffer (b0)->tcp.seq_number;
tcp0 = tcp_buffer_hdr (b0);
/* Crude check to see whether the connection handle matches the
* packet. The connection probably just switched to established */
if (PREDICT_FALSE (tcp0->dst_port != tc0->c_lcl_port
|| tcp0->src_port != tc0->c_rmt_port))
{
error0 = TCP_ERROR_INVALID_CONNECTION;
goto drop;
}
if (PREDICT_FALSE (!tcp_ack (tcp0) && !tcp_rst (tcp0)
&& !tcp_syn (tcp0)))
{
error0 = TCP_ERROR_SEGMENT_INVALID;
goto drop;
}
/* SYNs consume sequence numbers */
vnet_buffer (b0)->tcp.seq_end += tcp_is_syn (tcp0);
/*
* 1. check the ACK bit
*/
/*
* If the ACK bit is set
* If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless
* the RST bit is set, if so drop the segment and return)
* <SEQ=SEG.ACK><CTL=RST>
* and discard the segment. Return.
* If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
*/
if (tcp_ack (tcp0))
{
if (seq_leq (ack0, tc0->iss) || seq_gt (ack0, tc0->snd_nxt))
{
if (!tcp_rst (tcp0))
tcp_send_reset_w_pkt (tc0, b0, my_thread_index, is_ip4);
error0 = TCP_ERROR_RCV_WND;
goto drop;
}
/* Make sure ACK is valid */
if (seq_gt (tc0->snd_una, ack0))
{
error0 = TCP_ERROR_ACK_INVALID;
goto drop;
}
}
/*
* 2. check the RST bit
*/
if (tcp_rst (tcp0))
{
/* If ACK is acceptable, signal client that peer is not
* willing to accept connection and drop connection*/
if (tcp_ack (tcp0))
tcp_connection_reset (tc0);
error0 = TCP_ERROR_RST_RCVD;
goto drop;
}
/*
* 3. check the security and precedence (skipped)
*/
/*
* 4. check the SYN bit
*/
/* No SYN flag. Drop. */
if (!tcp_syn (tcp0))
{
error0 = TCP_ERROR_SEGMENT_INVALID;
goto drop;
}
/* Parse options */
if (tcp_options_parse (tcp0, &tc0->rcv_opts, 1))
{
error0 = TCP_ERROR_OPTIONS;
goto drop;
}
/* Valid SYN or SYN-ACK. Move connection from half-open pool to
* current thread's pool. */
new_tc0 = tcp_connection_alloc_w_base (my_thread_index, tc0);
new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end;
new_tc0->irs = seq0;
new_tc0->timers[TCP_TIMER_RETRANSMIT_SYN] = TCP_TIMER_HANDLE_INVALID;
new_tc0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX];
/* If this is not the owning thread, wait for the syn retransmit
* timer to expire and clean up then */
if (tcp_half_open_connection_cleanup (tc0))
tc0->flags |= TCP_CONN_HALF_OPEN_DONE;
if (tcp_opts_tstamp (&new_tc0->rcv_opts))
{
new_tc0->tsval_recent = new_tc0->rcv_opts.tsval;
new_tc0->tsval_recent_age = tcp_time_now ();
}
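/* RFC1323: window scaling applies only if both sides sent the option,
* so if the peer did not, stop scaling our receive window as well
* (snd_wscale is already 0) */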
if (tcp_opts_wscale (&new_tc0->rcv_opts))
new_tc0->snd_wscale = new_tc0->rcv_opts.wscale;
else
new_tc0->rcv_wscale = 0;
new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
<< new_tc0->snd_wscale;
new_tc0->snd_wl1 = seq0;
new_tc0->snd_wl2 = ack0;
tcp_connection_init_vars (new_tc0);
/* SYN-ACK: See if we can switch to ESTABLISHED state */
if (PREDICT_TRUE (tcp_ack (tcp0)))
{
/* Our SYN is ACKed: we have iss < ack = snd_una */
/* TODO Dequeue acknowledged segments if we support Fast Open */
new_tc0->snd_una = ack0;
new_tc0->state = TCP_STATE_ESTABLISHED;
/* Make sure las is initialized for the wnd computation */
new_tc0->rcv_las = new_tc0->rcv_nxt;
/* Notify app that we have connection. If session layer can't
* allocate session send reset */
if (session_stream_connect_notify (&new_tc0->connection, 0))
{
tcp_send_reset_w_pkt (new_tc0, b0, my_thread_index, is_ip4);
tcp_connection_cleanup (new_tc0);
error0 = TCP_ERROR_CREATE_SESSION_FAIL;
goto drop;
}
new_tc0->tx_fifo_size =
transport_tx_fifo_size (&new_tc0->connection);
/* Update rtt with the syn-ack sample */
tcp_estimate_initial_rtt (new_tc0);
TCP_EVT (TCP_EVT_SYNACK_RCVD, new_tc0);
error0 = TCP_ERROR_SYN_ACKS_RCVD;
}
/* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */
else
{
new_tc0->state = TCP_STATE_SYN_RCVD;
/* Notify app that we have connection */
if (session_stream_connect_notify (&new_tc0->connection, 0))
{
tcp_connection_cleanup (new_tc0);
tcp_send_reset_w_pkt (tc0, b0, my_thread_index, is_ip4);
TCP_EVT (TCP_EVT_RST_SENT, tc0);
error0 = TCP_ERROR_CREATE_SESSION_FAIL;
goto drop;
}
new_tc0->tx_fifo_size =
transport_tx_fifo_size (&new_tc0->connection);
new_tc0->rtt_ts = 0;
tcp_init_snd_vars (new_tc0);
tcp_send_synack (new_tc0);
error0 = TCP_ERROR_SYNS_RCVD;
goto drop;
}
if (!(new_tc0->cfg_flags & TCP_CFG_F_NO_TSO))
tcp_check_tx_offload (new_tc0, is_ip4);
/* Read data, if any */
if (PREDICT_FALSE (vnet_buffer (b0)->tcp.data_len))
{
clib_warning ("rcvd data in syn-sent");
error0 = tcp_segment_rcv (wrk, new_tc0, b0);
if (error0 == TCP_ERROR_ACK_OK)
error0 = TCP_ERROR_SYN_ACKS_RCVD;
}
else
{
/* Send ack now instead of programming it because connection was
* just established and it's not optional. */
tcp_send_ack (new_tc0);
}
drop:
tcp_inc_counter (syn_sent, error0, 1);
if (PREDICT_FALSE ((b0->flags & VLIB_BUFFER_IS_TRACED) && tcp0 != 0))
{
t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
clib_memcpy_fast (&t0->tcp_header, tcp0, sizeof (t0->tcp_header));
clib_memcpy_fast (&t0->tcp_connection, tc0,
sizeof (t0->tcp_connection));
}
}
errors = session_main_flush_enqueue_events (TRANSPORT_PROTO_TCP,
my_thread_index);
tcp_inc_counter (syn_sent, TCP_ERROR_MSG_QUEUE_FULL, errors);
vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
return from_frame->n_vectors;
}
VLIB_NODE_FN (tcp4_syn_sent_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
return tcp46_syn_sent_inline (vm, node, from_frame, 1 /* is_ip4 */ );
}
VLIB_NODE_FN (tcp6_syn_sent_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
return tcp46_syn_sent_inline (vm, node, from_frame, 0 /* is_ip4 */ );
}
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (tcp4_syn_sent_node) =
{
.name = "tcp4-syn-sent",
/* Takes a vector of packets. */
.vector_size = sizeof (u32),
.n_errors = TCP_N_ERROR,
.error_strings = tcp_error_strings,
.n_next_nodes = TCP_SYN_SENT_N_NEXT,
.next_nodes =
{
#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
foreach_tcp_state_next
#undef _
},
.format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (tcp6_syn_sent_node) =
{
.name = "tcp6-syn-sent",
/* Takes a vector of packets. */
.vector_size = sizeof (u32),
.n_errors = TCP_N_ERROR,
.error_strings = tcp_error_strings,
.n_next_nodes = TCP_SYN_SENT_N_NEXT,
.next_nodes =
{
#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
foreach_tcp_state_next
#undef _
},
.format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
/**
* Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED
* as per RFC793 p. 64
*/
always_inline uword
tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * from_frame, int is_ip4)
{
u32 thread_index = vm->thread_index, errors = 0, *first_buffer;
tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
u32 n_left_from, *from, max_dequeue;
from = first_buffer = vlib_frame_vector_args (from_frame);
n_left_from = from_frame->n_vectors;
while (n_left_from > 0)
{
u32 bi0, error0 = TCP_ERROR_NONE;
tcp_header_t *tcp0 = 0;
tcp_connection_t *tc0;
vlib_buffer_t *b0;
u8 is_fin0;
bi0 = from[0];
from += 1;
n_left_from -= 1;
b0 = vlib_get_buffer (vm, bi0);
tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
thread_index);
if (PREDICT_FALSE (tc0 == 0))
{
error0 = TCP_ERROR_INVALID_CONNECTION;
goto drop;
}
tcp0 = tcp_buffer_hdr (b0);
is_fin0 = tcp_is_fin (tcp0);
if (CLIB_DEBUG)
{
if (!(tc0->connection.flags & TRANSPORT_CONNECTION_F_NO_LOOKUP))
{
tcp_connection_t *tmp;
tmp = tcp_lookup_connection (tc0->c_fib_index, b0, thread_index,
is_ip4);
if (tmp->state != tc0->state)
{
if (tc0->state != TCP_STATE_CLOSED)
clib_warning ("state changed");
goto drop;
}
}
}
/*
* Special treatment for CLOSED
*/
if (PREDICT_FALSE (tc0->state == TCP_STATE_CLOSED))
{
error0 = TCP_ERROR_CONNECTION_CLOSED;
goto drop;
}
/*
* For all other states (except LISTEN)
*/
/* 1-4: check SEQ, RST, SYN */
if (PREDICT_FALSE (tcp_segment_validate (wrk, tc0, b0, tcp0, &error0)))
goto drop;
/* 5: check the ACK field */
switch (tc0->state)
{
case TCP_STATE_SYN_RCVD:
/* Make sure the segment is exactly right */
if (tc0->rcv_nxt != vnet_buffer (b0)->tcp.seq_number || is_fin0)
{
tcp_connection_reset (tc0);
error0 = TCP_ERROR_SEGMENT_INVALID;
goto drop;
}
/*
* If the segment acknowledgment is not acceptable, form a
* reset segment,
* <SEQ=SEG.ACK><CTL=RST>
* and send it.
*/
if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
{
tcp_connection_reset (tc0);
goto drop;
}
/* Update rtt and rto */
tcp_estimate_initial_rtt (tc0);
tcp_connection_tx_pacer_update (tc0);
/* Switch state to ESTABLISHED */
tc0->state = TCP_STATE_ESTABLISHED;
TCP_EVT (TCP_EVT_STATE_CHANGE, tc0);
if (!(tc0->cfg_flags & TCP_CFG_F_NO_TSO))
tcp_check_tx_offload (tc0, is_ip4);
/* Initialize session variables */
tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
<< tc0->rcv_opts.wscale;
tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
/* Reset SYN-ACK retransmit and SYN_RCV establish timers */
tcp_retransmit_timer_reset (tc0);
if (session_stream_accept_notify (&tc0->connection))
{
error0 = TCP_ERROR_MSG_QUEUE_FULL;
tcp_connection_reset (tc0);
goto drop;
}
error0 = TCP_ERROR_ACK_OK;
break;
case TCP_STATE_ESTABLISHED:
/* We can get packets in established state here because they
* were enqueued before state change */
if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
goto drop;
break;
case TCP_STATE_FIN_WAIT_1:
/* In addition to the processing for the ESTABLISHED state, if
* our FIN is now acknowledged then enter FIN-WAIT-2 and
* continue processing in that state. */
if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
goto drop;
/* Still have to send the FIN */
if (tc0->flags & TCP_CONN_FINPNDG)
{
/* TX fifo finally drained */
max_dequeue = transport_max_tx_dequeue (&tc0->connection);
if (max_dequeue <= tc0->burst_acked)
tcp_send_fin (tc0);
/* If a fin was received and data was acked, extend wait */
else if ((tc0->flags & TCP_CONN_FINRCVD) && tc0->bytes_acked)
tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
tcp_cfg.closewait_time);
}
/* If FIN is ACKed */
else if (tc0->snd_una == tc0->snd_nxt)
{
/* Stop all retransmit timers because we have nothing more
* to send. */
tcp_connection_timers_reset (tc0);
/* We already have a FIN but didn't transition to CLOSING
* because of outstanding tx data. Close the connection. */
if (tc0->flags & TCP_CONN_FINRCVD)
{
tcp_connection_set_state (tc0, TCP_STATE_CLOSED);
tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE,
tcp_cfg.cleanup_time);
session_transport_closed_notify (&tc0->connection);
goto drop;
}
tcp_connection_set_state (tc0, TCP_STATE_FIN_WAIT_2);
/* Enable waitclose because we're willing to wait for peer's
* FIN but not indefinitely. */
tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.finwait2_time);
/* Don't try to dequeue the acked FIN byte */
if (tc0->burst_acked > 1)
session_tx_fifo_dequeue_drop (&tc0->connection,
tc0->burst_acked - 1);
tc0->burst_acked = 0;
}
break;
case TCP_STATE_FIN_WAIT_2:
/* In addition to the processing for the ESTABLISHED state, if
* the retransmission queue is empty, the user's CLOSE can be
* acknowledged ("ok") but do not delete the TCB. */
if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
goto drop;
tc0->burst_acked = 0;
break;
case TCP_STATE_CLOSE_WAIT:
/* Do the same processing as for the ESTABLISHED state. */
if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
goto drop;
if (!(tc0->flags & TCP_CONN_FINPNDG))
break;
/* Still have outstanding tx data */
max_dequeue = transport_max_tx_dequeue (&tc0->connection);
if (max_dequeue > tc0->burst_acked)
break;
tcp_send_fin (tc0);
tcp_connection_timers_reset (tc0);
tcp_connection_set_state (tc0, TCP_STATE_LAST_ACK);
tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.lastack_time);
break;
case TCP_STATE_CLOSING:
/* In addition to the processing for the ESTABLISHED state, if
* the ACK acknowledges our FIN then enter the TIME-WAIT state,
* otherwise ignore the segment. */
if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
goto drop;
if (tc0->snd_una != tc0->snd_nxt)
goto drop;
tcp_connection_timers_reset (tc0);
tcp_connection_set_state (tc0, TCP_STATE_TIME_WAIT);
tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.timewait_time);
session_transport_closed_notify (&tc0->connection);
goto drop;
break;
case TCP_STATE_LAST_ACK:
/* The only thing that [should] arrive in this state is an
* acknowledgment of our FIN. If our FIN is now acknowledged,
* delete the TCB, enter the CLOSED state, and return. */
if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
goto drop;
/* Apparently our ACK for the peer's FIN was lost */
if (is_fin0 && tc0->snd_una != tc0->snd_nxt)
{
tcp_send_fin (tc0);
goto drop;
}
tcp_connection_set_state (tc0, TCP_STATE_CLOSED);
session_transport_closed_notify (&tc0->connection);
/* Don't free the connection from the data path since
* we can't ensure that we have no packets already enqueued
* to output. Rely instead on the waitclose timer */
tcp_connection_timers_reset (tc0);
tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.cleanup_time);
goto drop;
break;
case TCP_STATE_TIME_WAIT:
/* The only thing that can arrive in this state is a
* retransmission of the remote FIN. Acknowledge it, and restart
* the 2 MSL timeout. */
if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
goto drop;
if (!is_fin0)
goto drop;
tcp_program_ack (tc0);
tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.timewait_time);
goto drop;
break;
default:
ASSERT (0);
}
/* 6: check the URG bit TODO */
/* 7: process the segment text */
switch (tc0->state)
{
case TCP_STATE_ESTABLISHED:
case TCP_STATE_FIN_WAIT_1:
case TCP_STATE_FIN_WAIT_2:
if (vnet_buffer (b0)->tcp.data_len)
error0 = tcp_segment_rcv (wrk, tc0, b0);
break;
case TCP_STATE_CLOSE_WAIT:
case TCP_STATE_CLOSING:
case TCP_STATE_LAST_ACK:
case TCP_STATE_TIME_WAIT:
/* This should not occur, since a FIN has been received from the
* remote side. Ignore the segment text. */
break;
}
/* 8: check the FIN bit */
if (!is_fin0)
goto drop;
TCP_EVT (TCP_EVT_FIN_RCVD, tc0);
switch (tc0->state)
{
case TCP_STATE_ESTABLISHED:
/* Account for the FIN and send ack */
tc0->rcv_nxt += 1;
tcp_program_ack (tc0);
tcp_connection_set_state (tc0, TCP_STATE_CLOSE_WAIT);
tcp_program_disconnect (wrk, tc0);
tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.closewait_time);
break;
case TCP_STATE_SYN_RCVD:
/* Send FIN-ACK, enter LAST-ACK and because the app was not
* notified yet, set a cleanup timer instead of relying on
* disconnect notify and the implicit close call. */
tcp_connection_timers_reset (tc0);
tc0->rcv_nxt += 1;
tcp_send_fin (tc0);
tcp_connection_set_state (tc0, TCP_STATE_LAST_ACK);
tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.lastack_time);
break;
case TCP_STATE_CLOSE_WAIT:
case TCP_STATE_CLOSING:
case TCP_STATE_LAST_ACK:
/* move along .. */
break;
case TCP_STATE_FIN_WAIT_1:
tc0->rcv_nxt += 1;
if (tc0->flags & TCP_CONN_FINPNDG)
{
/* If data is outstanding, stay in FIN_WAIT_1 and try to finish
* sending it. Since we already received a fin, do not wait
* for too long. */
tc0->flags |= TCP_CONN_FINRCVD;
tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
tcp_cfg.closewait_time);
}
else
{
tcp_connection_set_state (tc0, TCP_STATE_CLOSING);
tcp_program_ack (tc0);
/* Wait for ACK for our FIN but not forever */
tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
tcp_cfg.closing_time);
}
break;
case TCP_STATE_FIN_WAIT_2:
/* Got FIN, send ACK! Be more aggressive with resource cleanup */
tc0->rcv_nxt += 1;
tcp_connection_set_state (tc0, TCP_STATE_TIME_WAIT);
tcp_connection_timers_reset (tc0);
tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.timewait_time);
tcp_program_ack (tc0);
session_transport_closed_notify (&tc0->connection);
break;
case TCP_STATE_TIME_WAIT:
/* Remain in the TIME-WAIT state. Restart the time-wait
* timeout.
*/
tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, tcp_cfg.timewait_time);
break;
}
error0 = TCP_ERROR_FIN_RCVD;
drop:
tcp_inc_counter (rcv_process, error0, 1);
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
tcp_rx_trace_t *t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4);
}
}
errors = session_main_flush_enqueue_events (TRANSPORT_PROTO_TCP,
thread_index);
tcp_inc_counter (rcv_process, TCP_ERROR_MSG_QUEUE_FULL, errors);
tcp_handle_postponed_dequeues (wrk);
tcp_handle_disconnects (wrk);
vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
return from_frame->n_vectors;
}
VLIB_NODE_FN (tcp4_rcv_process_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
return tcp46_rcv_process_inline (vm, node, from_frame, 1 /* is_ip4 */ );
}
VLIB_NODE_FN (tcp6_rcv_process_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
return tcp46_rcv_process_inline (vm, node, from_frame, 0 /* is_ip4 */ );
}
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (tcp4_rcv_process_node) =
{
.name = "tcp4-rcv-process",
/* Takes a vector of packets. */
.vector_size = sizeof (u32),
.n_errors = TCP_N_ERROR,
.error_strings = tcp_error_strings,
.n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
.next_nodes =
{
#define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
foreach_tcp_state_next
#undef _
},
.format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (tcp6_rcv_process_node) =
{
.name = "tcp6-rcv-process",
/* Takes a vector of packets. */
.vector_size = sizeof (u32),
.n_errors = TCP_N_ERROR,
.error_strings = tcp_error_strings,
.n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
.next_nodes =
{
#define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
foreach_tcp_state_next
#undef _
},
.format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
/**
* LISTEN state processing as per RFC 793 p. 65
*/
always_inline uword
tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * from_frame, int is_ip4)
{
u32 n_left_from, *from, n_syns = 0, *first_buffer;
u32 my_thread_index = vm->thread_index;
from = first_buffer = vlib_frame_vector_args (from_frame);
n_left_from = from_frame->n_vectors;
while (n_left_from > 0)
{
u32 bi0;
vlib_buffer_t *b0;
tcp_rx_trace_t *t0;
tcp_header_t *th0 = 0;
tcp_connection_t *lc0;
ip4_header_t *ip40;
ip6_header_t *ip60;
tcp_connection_t *child0;
u32 error0 = TCP_ERROR_NONE;
bi0 = from[0];
from += 1;
n_left_from -= 1;
b0 = vlib_get_buffer (vm, bi0);
lc0 = tcp_listener_get (vnet_buffer (b0)->tcp.connection_index);
if (is_ip4)
{
ip40 = vlib_buffer_get_current (b0);
th0 = ip4_next_header (ip40);
}
else
{
ip60 = vlib_buffer_get_current (b0);
th0 = ip6_next_header (ip60);
}
/* Create child session. For syn-flood protection use filter */
/* 1. first check for an RST: handled in dispatch */
/* if (tcp_rst (th0))
goto drop;
*/
/* 2. second check for an ACK: handled in dispatch */
/* if (tcp_ack (th0))
{
tcp_send_reset (b0, is_ip4);
goto drop;
}
*/
/* 3. check for a SYN (did that already) */
/* Make sure connection wasn't just created */
child0 = tcp_lookup_connection (lc0->c_fib_index, b0, my_thread_index,
is_ip4);
if (PREDICT_FALSE (child0->state != TCP_STATE_LISTEN))
{
error0 = TCP_ERROR_CREATE_EXISTS;
goto drop;
}
/* Create child session and send SYN-ACK */
child0 = tcp_connection_alloc (my_thread_index);
child0->c_lcl_port = th0->dst_port;
child0->c_rmt_port = th0->src_port;
child0->c_is_ip4 = is_ip4;
child0->state = TCP_STATE_SYN_RCVD;
child0->c_fib_index = lc0->c_fib_index;
child0->cc_algo = lc0->cc_algo;
if (is_ip4)
{
child0->c_lcl_ip4.as_u32 = ip40->dst_address.as_u32;
child0->c_rmt_ip4.as_u32 = ip40->src_address.as_u32;
}
else
{
clib_memcpy_fast (&child0->c_lcl_ip6, &ip60->dst_address,
sizeof (ip6_address_t));
clib_memcpy_fast (&child0->c_rmt_ip6, &ip60->src_address,
sizeof (ip6_address_t));
}
if (tcp_options_parse (th0, &child0->rcv_opts, 1))
{
error0 = TCP_ERROR_OPTIONS;
tcp_connection_free (child0);
goto drop;
}
child0->irs = vnet_buffer (b0)->tcp.seq_number;
child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1;
child0->rcv_las = child0->rcv_nxt;
child0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX];
/* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
* segments are used to initialize PAWS. */
if (tcp_opts_tstamp (&child0->rcv_opts))
{
child0->tsval_recent = child0->rcv_opts.tsval;
child0->tsval_recent_age = tcp_time_now ();
}
if (tcp_opts_wscale (&child0->rcv_opts))
child0->snd_wscale = child0->rcv_opts.wscale;
child0->snd_wnd = clib_net_to_host_u16 (th0->window)
<< child0->snd_wscale;
child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
tcp_connection_init_vars (child0);
child0->rto = TCP_RTO_MIN;
if (session_stream_accept (&child0->connection, lc0->c_s_index,
lc0->c_thread_index, 0 /* notify */ ))
{
tcp_connection_cleanup (child0);
error0 = TCP_ERROR_CREATE_SESSION_FAIL;
goto drop;
}
TCP_EVT (TCP_EVT_SYN_RCVD, child0, 1);
child0->tx_fifo_size = transport_tx_fifo_size (&child0->connection);
tcp_send_synack (child0);
drop:
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
clib_memcpy_fast (&t0->tcp_header, th0, sizeof (t0->tcp_header));
clib_memcpy_fast (&t0->tcp_connection, lc0,
sizeof (t0->tcp_connection));
}
n_syns += (error0 == TCP_ERROR_NONE);
}
tcp_inc_counter (listen, TCP_ERROR_SYNS_RCVD, n_syns);
vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
return from_frame->n_vectors;
}
VLIB_NODE_FN (tcp4_listen_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
return tcp46_listen_inline (vm, node, from_frame, 1 /* is_ip4 */ );
}
VLIB_NODE_FN (tcp6_listen_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
return tcp46_listen_inline (vm, node, from_frame, 0 /* is_ip4 */ );
}
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (tcp4_listen_node) =
{
.name = "tcp4-listen",
/* Takes a vector of packets. */
.vector_size = sizeof (u32),
.n_errors = TCP_N_ERROR,
.error_strings = tcp_error_strings,
.n_next_nodes = TCP_LISTEN_N_NEXT,
.next_nodes =
{
#define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
foreach_tcp_state_next
#undef _
},
.format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (tcp6_listen_node) =
{
.name = "tcp6-listen",
/* Takes a vector of packets. */
.vector_size = sizeof (u32),
.n_errors = TCP_N_ERROR,
.error_strings = tcp_error_strings,
.n_next_nodes = TCP_LISTEN_N_NEXT,
.next_nodes =
{
#define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
foreach_tcp_state_next
#undef _
},
.format_trace = format_tcp_rx_trace_short,
};
/* *INDENT-ON* */
typedef enum _tcp_input_next
{
TCP_INPUT_NEXT_DROP,
TCP_INPUT_NEXT_LISTEN,
TCP_INPUT_NEXT_RCV_PROCESS,
TCP_INPUT_NEXT_SYN_SENT,
TCP_INPUT_NEXT_ESTABLISHED,
TCP_INPUT_NEXT_RESET,
TCP_INPUT_NEXT_PUNT,
TCP_INPUT_N_NEXT
} tcp_input_next_t;
#define foreach_tcp4_input_next \
_ (DROP, "ip4-drop") \
_ (LISTEN, "tcp4-listen") \
_ (RCV_PROCESS, "tcp4-rcv-process") \
_ (SYN_SENT, "tcp4-syn-sent") \
_ (ESTABLISHED, "tcp4-established") \
_ (RESET, "tcp4-reset") \
_ (PUNT, "ip4-punt")
#define foreach_tcp6_input_next \
_ (DROP, "ip6-drop") \
_ (LISTEN, "tcp6-listen") \
_ (RCV_PROCESS, "tcp6-rcv-process") \
_ (SYN_SENT, "tcp6-syn-sent") \
_ (ESTABLISHED, "tcp6-established") \
_ (RESET, "tcp6-reset") \
_ (PUNT, "ip6-punt")
#define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN)
static void
tcp_input_trace_frame (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_buffer_t ** bs, u32 n_bufs, u8 is_ip4)
{
tcp_connection_t *tc;
tcp_header_t *tcp;
tcp_rx_trace_t *t;
int i;
for (i = 0; i < n_bufs; i++)
{
if (bs[i]->flags & VLIB_BUFFER_IS_TRACED)
{
t = vlib_add_trace (vm, node, bs[i], sizeof (*t));
tc = tcp_connection_get (vnet_buffer (bs[i])->tcp.connection_index,
vm->thread_index);
tcp = vlib_buffer_get_current (bs[i]);
tcp_set_rx_trace_data (t, tc, tcp, bs[i], is_ip4);
}
}
}
static void
tcp_input_set_error_next (tcp_main_t * tm, u16 * next, u32 * error, u8 is_ip4)
{
if (*error == TCP_ERROR_FILTERED || *error == TCP_ERROR_WRONG_THREAD)
{
*next = TCP_INPUT_NEXT_DROP;
}
else if ((is_ip4 && tm->punt_unknown4) || (!is_ip4 && tm->punt_unknown6))
{
*next = TCP_INPUT_NEXT_PUNT;
*error = TCP_ERROR_PUNT;
}
else
{
*next = TCP_INPUT_NEXT_RESET;
*error = TCP_ERROR_NO_LISTENER;
}
}
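/**
* Parse ip and tcp headers, validate segment length, fill in the
* buffer's tcp metadata (seq/ack numbers, data offset and length) and
* look up the owning connection unless the caller already knows it
* (is_nolookup).
*/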
always_inline tcp_connection_t *
tcp_input_lookup_buffer (vlib_buffer_t * b, u8 thread_index, u32 * error,
u8 is_ip4, u8 is_nolookup)
{
u32 fib_index = vnet_buffer (b)->ip.fib_index;
int n_advance_bytes, n_data_bytes;
transport_connection_t *tc;
tcp_header_t *tcp;
u8 result = 0;
if (is_ip4)
{
ip4_header_t *ip4 = vlib_buffer_get_current (b);
int ip_hdr_bytes = ip4_header_bytes (ip4);
if (PREDICT_FALSE (b->current_length < ip_hdr_bytes + sizeof (*tcp)))
{
*error = TCP_ERROR_LENGTH;
return 0;
}
tcp = ip4_next_header (ip4);
vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip4;
n_advance_bytes = (ip_hdr_bytes + tcp_header_bytes (tcp));
n_data_bytes = clib_net_to_host_u16 (ip4->length) - n_advance_bytes;
/* Length check. Checksum computed by ipx_local no need to compute again */
if (PREDICT_FALSE (n_data_bytes < 0))
{
*error = TCP_ERROR_LENGTH;
return 0;
}
if (!is_nolookup)
tc = session_lookup_connection_wt4 (fib_index, &ip4->dst_address,
&ip4->src_address, tcp->dst_port,
tcp->src_port,
TRANSPORT_PROTO_TCP, thread_index,
&result);
}
else
{
ip6_header_t *ip6 = vlib_buffer_get_current (b);
if (PREDICT_FALSE (b->current_length < sizeof (*ip6) + sizeof (*tcp)))
{
*error = TCP_ERROR_LENGTH;
return 0;
}
tcp = ip6_next_header (ip6);
vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip6;
n_advance_bytes = tcp_header_bytes (tcp);
n_data_bytes = clib_net_to_host_u16 (ip6->payload_length)
- n_advance_bytes;
n_advance_bytes += sizeof (ip6[0]);
if (PREDICT_FALSE (n_data_bytes < 0))
{
*error = TCP_ERROR_LENGTH;
return 0;
}
if (!is_nolookup)
{
if (PREDICT_FALSE
(ip6_address_is_link_local_unicast (&ip6->dst_address)))
{
ip4_main_t *im = &ip4_main;
fib_index = vec_elt (im->fib_index_by_sw_if_index,
vnet_buffer (b)->sw_if_index[VLIB_RX]);
}
tc = session_lookup_connection_wt6 (fib_index, &ip6->dst_address,
&ip6->src_address,
tcp->dst_port, tcp->src_port,
TRANSPORT_PROTO_TCP,
thread_index, &result);
}
}
if (is_nolookup)
tc =
(transport_connection_t *) tcp_connection_get (vnet_buffer (b)->
tcp.connection_index,
thread_index);
vnet_buffer (b)->tcp.seq_number = clib_net_to_host_u32 (tcp->seq_number);
vnet_buffer (b)->tcp.ack_number = clib_net_to_host_u32 (tcp->ack_number);
vnet_buffer (b)->tcp.data_offset = n_advance_bytes;
vnet_buffer (b)->tcp.data_len = n_data_bytes;
vnet_buffer (b)->tcp.seq_end = vnet_buffer (b)->tcp.seq_number
+ n_data_bytes;
vnet_buffer (b)->tcp.flags = 0;
*error = result ? TCP_ERROR_NONE + result : *error;
return tcp_get_connection_from_transport (tc);
}
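/**
* Use the state/flags dispatch table to pick the next node and error
* for the buffer, warning on unexpected flag combinations.
*/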
static inline void
tcp_input_dispatch_buffer (tcp_main_t * tm, tcp_connection_t * tc,
vlib_buffer_t * b, u16 * next, u32 * error)
{
tcp_header_t *tcp;
u8 flags;
tcp = tcp_buffer_hdr (b);
flags = tcp->flags & filter_flags;
*next = tm->dispatch_table[tc->state][flags].next;
*error = tm->dispatch_table[tc->state][flags].error;
tc->segs_in += 1;
if (PREDICT_FALSE (*error == TCP_ERROR_DISPATCH
|| *next == TCP_INPUT_NEXT_RESET))
{
/* Overload tcp flags to store state */
tcp_state_t state = tc->state;
vnet_buffer (b)->tcp.flags = tc->state;
if (*error == TCP_ERROR_DISPATCH)
clib_warning ("tcp conn %u disp error state %U flags %U",
tc->c_c_index, format_tcp_state, state,
format_tcp_flags, (int) flags);
}
}
always_inline uword
tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * frame, int is_ip4, u8 is_nolookup)
{
u32 n_left_from, *from, thread_index = vm->thread_index;
tcp_main_t *tm = vnet_get_tcp_main ();
vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
u16 nexts[VLIB_FRAME_SIZE], *next;
tcp_set_time_now (tcp_get_worker (thread_index));
from = vlib_frame_vector_args (frame);
n_left_from = frame->n_vectors;
vlib_get_buffers (vm, from, bufs, n_left_from);
b = bufs;
next = nexts;
while (n_left_from >= 4)
{
u32 error0 = TCP_ERROR_NO_LISTENER, error1 = TCP_ERROR_NO_LISTENER;
tcp_connection_t *tc0, *tc1;
{
vlib_prefetch_buffer_header (b[2], STORE);
CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
vlib_prefetch_buffer_header (b[3], STORE);
CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
}
next[0] = next[1] = TCP_INPUT_NEXT_DROP;
tc0 = tcp_input_lookup_buffer (b[0], thread_index, &error0, is_ip4,
is_nolookup);
tc1 = tcp_input_lookup_buffer (b[1], thread_index, &error1, is_ip4,
is_nolookup);
if (PREDICT_TRUE (!tc0 + !tc1 == 0))
{
ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
ASSERT (tcp_lookup_is_valid (tc1, b[1], tcp_buffer_hdr (b[1])));
vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
vnet_buffer (b[1])->tcp.connection_index = tc1->c_c_index;
tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], &error0);
tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], &error1);
}
else
{
if (PREDICT_TRUE (tc0 != 0))
{
ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], &error0);
}
else
tcp_input_set_error_next (tm, &next[0], &error0, is_ip4);
if (PREDICT_TRUE (tc1 != 0))
{
ASSERT (tcp_lookup_is_valid (tc1, b[1], tcp_buffer_hdr (b[1])));
vnet_buffer (b[1])->tcp.connection_index = tc1->c_c_index;
tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], &error1);
}
else
tcp_input_set_error_next (tm, &next[1], &error1, is_ip4);
}
b += 2;
next += 2;
n_left_from -= 2;
}
while (n_left_from > 0)
{
tcp_connection_t *tc0;
u32 error0 = TCP_ERROR_NO_LISTENER;
if (n_left_from > 1)
{
vlib_prefetch_buffer_header (b[1], STORE);
CLIB_PREFETCH (b[1]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
}
next[0] = TCP_INPUT_NEXT_DROP;
tc0 = tcp_input_lookup_buffer (b[0], thread_index, &error0, is_ip4,
is_nolookup);
if (PREDICT_TRUE (tc0 != 0))
{
ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], &error0);
}
else
tcp_input_set_error_next (tm, &next[0], &error0, is_ip4);
b += 1;
next += 1;
n_left_from -= 1;
}
if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
tcp_input_trace_frame (vm, node, bufs, frame->n_vectors, is_ip4);
vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
return frame->n_vectors;
}
VLIB_NODE_FN (tcp4_input_nolookup_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ,
1 /* is_nolookup */ );
}
VLIB_NODE_FN (tcp6_input_nolookup_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ,
1 /* is_nolookup */ );
}
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (tcp4_input_nolookup_node) =
{
.name = "tcp4-input-nolookup",
/* Takes a vector of packets. */
.vector_size = sizeof (u32),
.n_errors = TCP_N_ERROR,
.error_strings = tcp_error_strings,
.n_next_nodes = TCP_INPUT_N_NEXT,
.next_nodes =
{
#define _(s,n) [TCP_INPUT_NEXT_##s] = n,
foreach_tcp4_input_next
#undef _
},
.format_buffer = format_tcp_header,
.format_trace = format_tcp_rx_trace,
};
/* *INDENT-ON* */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (tcp6_input_nolookup_node) =
{
.name = "tcp6-input-nolookup",
/* Takes a vector of packets. */
.vector_size = sizeof (u32),
.n_errors = TCP_N_ERROR,
.error_strings = tcp_error_strings,
.n_next_nodes = TCP_INPUT_N_NEXT,
.next_nodes =
{
#define _(s,n) [TCP_INPUT_NEXT_##s] = n,
foreach_tcp6_input_next
#undef _
},
.format_buffer = format_tcp_header,
.format_trace = format_tcp_rx_trace,
};
/* *INDENT-ON* */
VLIB_NODE_FN (tcp4_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ,
0 /* is_nolookup */ );
}
VLIB_NODE_FN (tcp6_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * from_frame)
{
return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ,
0 /* is_nolookup */ );
}
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (tcp4_input_node) =
{
.name = "tcp4-input",
/* Takes a vector of packets. */
.vector_size = sizeof (u32),
.n_errors = TCP_N_ERROR,
.error_strings = tcp_error_strings,
.n_next_nodes = TCP_INPUT_N_NEXT,
.next_nodes =
{
#define _(s,n) [TCP_INPUT_NEXT_##s] = n,
foreach_tcp4_input_next
#undef _
},
.format_buffer = format_tcp_header,
.format_trace = format_tcp_rx_trace,
};
/* *INDENT-ON* */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (tcp6_input_node) =
{
.name = "tcp6-input",
/* Takes a vector of packets. */
.vector_size = sizeof (u32),
.n_errors = TCP_N_ERROR,
.error_strings = tcp_error_strings,
.n_next_nodes = TCP_INPUT_N_NEXT,
.next_nodes =
{
#define _(s,n) [TCP_INPUT_NEXT_##s] = n,
foreach_tcp6_input_next
#undef _
},
.format_buffer = format_tcp_header,
.format_trace = format_tcp_rx_trace,
};
/* *INDENT-ON* */
#ifndef CLIB_MARCH_VARIANT
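/* The dispatch table maps a (connection state, TCP flags) pair to a next
 * node and an error counter. At dispatch time it is indexed with the
 * FIN/SYN/RST/ACK bits of the incoming header, roughly as in this
 * illustrative sketch (not the exact implementation):
 *
 *   flags = tcp->flags & (TCP_FLAG_FIN | TCP_FLAG_SYN
 *                         | TCP_FLAG_RST | TCP_FLAG_ACK);
 *   *next = tm->dispatch_table[tc->state][flags].next;
 *   *error = tm->dispatch_table[tc->state][flags].error;
 */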
static void
tcp_dispatch_table_init (tcp_main_t * tm)
{
int i, j;
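  /* Default every (state, flags) entry to drop with a dispatch error */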
for (i = 0; i < ARRAY_LEN (tm->dispatch_table); i++)
for (j = 0; j < ARRAY_LEN (tm->dispatch_table[i]); j++)
{
tm->dispatch_table[i][j].next = TCP_INPUT_NEXT_DROP;
tm->dispatch_table[i][j].error = TCP_ERROR_DISPATCH;
}
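  /* Convenience macro: set the next node and error for a (state, flags)
   * pair */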
#define _(t,f,n,e) \
do { \
tm->dispatch_table[TCP_STATE_##t][f].next = (n); \
tm->dispatch_table[TCP_STATE_##t][f].error = (e); \
} while (0)
/* RFC 793: in LISTEN, drop a segment carrying RST; answer a segment
 * carrying ACK with a RST */
_(LISTEN, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
_(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_ACK_INVALID);
_(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_INVALID_CONNECTION);
_(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE);
_(LISTEN, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET,
TCP_ERROR_ACK_INVALID);
_(LISTEN, TCP_FLAG_SYN | TCP_FLAG_RST, TCP_INPUT_NEXT_DROP,
TCP_ERROR_SEGMENT_INVALID);
_(LISTEN, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP,
TCP_ERROR_SEGMENT_INVALID);
_(LISTEN, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP,
TCP_ERROR_INVALID_CONNECTION);
_(LISTEN, TCP_FLAG_FIN, TCP_INPUT_NEXT_RESET, TCP_ERROR_SEGMENT_INVALID);
_(LISTEN, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET,
TCP_ERROR_SEGMENT_INVALID);
_(LISTEN, TCP_FLAG_FIN | TCP_FLAG_RST, TCP_INPUT_NEXT_DROP,
TCP_ERROR_SEGMENT_INVALID);
_(LISTEN, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP,
TCP_ERROR_NONE);
_(LISTEN, TCP_FLAG_FIN | TCP_FLAG_SYN, TCP_INPUT_NEXT_DROP,
TCP_ERROR_SEGMENT_INVALID);
_(LISTEN, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP,
TCP_ERROR_SEGMENT_INVALID);
_(LISTEN, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST, TCP_INPUT_NEXT_DROP,
TCP_ERROR_SEGMENT_INVALID);
_(LISTEN, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
/* ACK for a SYN-ACK -> tcp-rcv-process. */
_(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_SYN | TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(SYN_RCVD, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
/* SYN-ACK for a SYN */
_(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT,
TCP_ERROR_NONE);
_(SYN_SENT, TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
_(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
_(SYN_SENT, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT,
TCP_ERROR_NONE);
_(SYN_SENT, TCP_FLAG_FIN, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
_(SYN_SENT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT,
TCP_ERROR_NONE);
/* ACK for an established connection -> tcp-established. */
_(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
/* FIN for an established connection -> tcp-established. */
_(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED,
TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED,
TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED,
TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED,
TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED,
TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_SYN | TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED,
TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
_(ESTABLISHED, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
/* ACK or FIN-ACK to our FIN */
_(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(FIN_WAIT_1, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
/* FIN in reply to our FIN from the other side */
_(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_SYN | TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
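/* CLOSING: simultaneous close in progress; segments go to tcp-rcv-process */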
_(CLOSING, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
_(CLOSING, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_SYN | TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_FIN | TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_FIN | TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(CLOSING, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
/* FIN confirming that the peer (app) has closed */
_(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(FIN_WAIT_2, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_2, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
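/* CLOSE_WAIT: the peer's FIN was seen; waiting for the app to close */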
_(CLOSE_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(CLOSE_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(CLOSE_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(CLOSE_WAIT, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
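/* LAST_ACK: waiting for the ACK of our FIN */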
_(LAST_ACK, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
_(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_SYN | TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
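/* TIME_WAIT: absorb retransmitted FINs and stray segments during the
 * 2MSL wait */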
_(TIME_WAIT, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(TIME_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(TIME_WAIT, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
_(TIME_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
/* RFC 793 CLOSED: an incoming segment containing a RST is discarded. An
 * incoming segment not containing a RST causes a RST to be sent in
 * response. */
_(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
_(CLOSED, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP,
TCP_ERROR_CONNECTION_CLOSED);
_(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
_(CLOSED, TCP_FLAG_SYN, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
_(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET,
TCP_ERROR_NONE);
#undef _
}
static clib_error_t *
tcp_input_init (vlib_main_t * vm)
{
clib_error_t *error = 0;
tcp_main_t *tm = vnet_get_tcp_main ();
if ((error = vlib_call_init_function (vm, tcp_init)))
return error;
/* Initialize dispatch table. */
tcp_dispatch_table_init (tm);
return error;
}
VLIB_INIT_FUNCTION (tcp_input_init);
#endif /* CLIB_MARCH_VARIANT */
/*
* fd.io coding-style-patch-verification: ON
*
* Local Variables:
* eval: (c-set-style "gnu")
* End:
*/