summaryrefslogtreecommitdiffstats
path: root/src/vppinfra/bihash_template.c
blob: 56c410b5a51eda2da00f3f765beb141fe3fe8c74 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** @cond DOCUMENTATION_IS_IN_BIHASH_DOC_H */

void BV (clib_bihash_init)
  (BVT (clib_bihash) * h, char *name, u32 nbuckets, uword memory_size)
{
  void *oldheap;
  int i;

  nbuckets = 1 << (max_log2 (nbuckets));

  h->name = (u8 *) name;
  h->nbuckets = nbuckets;
  h->log2_nbuckets = max_log2 (nbuckets);
  h->cache_hits = 0;
  h->cache_misses = 0;

  h->mheap = mheap_alloc (0 /* use VM */ , memory_size);

  oldheap = clib_mem_set_heap (h->mheap);
  vec_validate_aligned (h->buckets, nbuckets - 1, CLIB_CACHE_LINE_BYTES);
  h->writer_lock = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
					   CLIB_CACHE_LINE_BYTES);
  h->writer_lock[0] = 0;

  for (i = 0; i < nbuckets; i++)
    BV (clib_bihash_reset_cache) (h->buckets + i);

  clib_mem_set_heap (oldheap);
}

void BV (clib_bihash_free) (BVT (clib_bihash) * h)
{
  mheap_free (h->mheap);
  memset (h, 0, sizeof (*h));
}

static
BVT (clib_bihash_value) *
BV (value_alloc) (BVT (clib_bihash) * h, u32 log2_pages)
{
  BVT (clib_bihash_value) * rv = 0;
  void *oldheap;

  ASSERT (h->writer_lock[0]);
  if (log2_pages >= vec_len (h->freelists) || h->freelists[log2_pages] == 0)
    {
      oldheap = clib_mem_set_heap (h->mheap);

      vec_validate (h->freelists, log2_pages);
      rv = clib_mem_alloc_aligned ((sizeof (*rv) * (1 << log2_pages)),
				   CLIB_CACHE_LINE_BYTES);
      clib_mem_set_heap (oldheap);
      goto initialize;
    }
  rv = h->freelists[log2_pages];
  h->freelists[log2_pages] = rv->next_free;

initialize:
  ASSERT (rv);
  /*
   * Latest gcc complains that the length arg is zero
   * if we replace (1<<log2_pages) with vec_len(rv).
   * No clue.
   */
  memset (rv, 0xff, sizeof (*rv) * (1 << log2_pages));
  return rv;
}

static void
BV (value_free) (BVT (clib_bihash) * h, BVT (clib_bihash_value) * v,
		 u32 log2_pages)
{
  ASSERT (h->writer_lock[0]);

  ASSERT (vec_len (h->freelists) > log2_pages);

  v->next_free = h->freelists[log2_pages];
  h->freelists[log2_pages] = v;
}

static inline void
BV (make_working_copy) (BVT (clib_bihash) * h, BVT (clib_bihash_bucket) * b)
{
  BVT (clib_bihash_value) * v;
  BVT (clib_bihash_bucket) working_bucket __attribute__ ((aligned (8)));
  void *oldheap;
  BVT (clib_bihash_value) * working_copy;
  u32 thread_index = os_get_thread_index ();
  int log2_working_copy_length;

  if (thread_index >= vec_len (h->working_copies))
    {
      oldheap = clib_mem_set_heap (h->mheap);
      vec_validate (h->working_copies, thread_index);
      vec_validate_init_empty (h->working_copy_lengths, thread_index, ~0);
      clib_mem_set_heap (oldheap);
    }

  /*
   * working_copies are per-cpu so that near-simultaneous
   * updates from multiple threads will not result in sporadic, spurious
   * lookup failures.
   */
  working_copy = h->working_copies[thread_index];
  log2_working_copy_length = h->working_copy_lengths[thread_index];

  h->saved_bucket.as_u64 = b->as_u64;
  oldheap = clib_mem_set_heap (h->mheap);

  if (b->log2_pages > log2_working_copy_length)
    {
      if (working_copy)
	clib_mem_free (working_copy);

      working_copy = clib_mem_alloc_aligned
	(sizeof (working_copy[0]) * (1 << b->log2_pages),
	 CLIB_CACHE_LINE_BYTES);
      h->working_copy_lengths[thread_index] = b->log2_pages;
      h->working_copies[thread_index] = working_copy;
    }

  clib_mem_set_heap (oldheap);

  /* Lock the bucket... */
  while (BV (clib_bihash_lock_bucket) (b) == 0)
    ;

  v = BV (clib_bihash_get_value) (h, b->offset);

  clib_memcpy (working_copy, v, sizeof (*v) * (1 << b->log2_pages));
  working_bucket.as_u64 = b->as_u64;
  working_bucket.offset = BV (clib_bihash_get_offset) (h, working_copy);
  CLIB_MEMORY_BARRIER ();
  b->as_u64 = working_bucket.as_u64;
  h->working_copies[thread_index] = working_copy;
}

static
BVT (clib_bihash_value) *
BV (split_and_rehash)
  (BVT (clib_bihash) * h,
   BVT (clib_bihash_value) * old_values, u32 old_log2_pages,
   u32 new_log2_pages)
{
  BVT (clib_bihash_value) * new_values, *new_v;
  int i, j, length_in_kvs;

  new_values = BV (value_alloc) (h, new_log2_pages);
  length_in_kvs = (1 << old_log2_pages) * BIHASH_KVP_PER_PAGE;

  for (i = 0; i < length_in_kvs; i++)
    {
      u64 new_hash;

      /* Entry not in use? Forget it */
      if (BV (clib_bihash_is_free) (&(old_values->kvp[i])))
	continue;

      /* rehash the item onto its new home-page */
      new_hash = BV (clib_bihash_hash) (&(old_values->kvp[i]));
      new_hash >>= h->log2_nbuckets;
      new_hash &= (1 << new_log2_pages) - 1;
      new_v = &new_values[new_hash];

      /* Across the new home-page */
      for (j = 0; j < BIHASH_KVP_PER_PAGE; j++)
	{
	  /* Empty slot */
	  if (BV (clib_bihash_is_free) (&(new_v->kvp[j])))
	    {
	      clib_memcpy (&(new_v->kvp[j]), &(old_values->kvp[i]),
			   sizeof (new_v->kvp[j]));
	      goto doublebreak;
	    }
	}
      /* Crap. Tell caller to try again */
      BV (value_free) (h, new_values, new_log2_pages);
      return 0;
    doublebreak:;
    }

  return new_values;
}

static
BVT (clib_bihash_value) *
BV (split_and_rehash_linear)
  (BVT (clib_bihash) * h,
   BVT (clib_bihash_value) * old_values, u32 old_log2_pages,
   u32 new_log2_pages)
{
  BVT (clib_bihash_value) * new_values;
  int i, j, new_length, old_length;

  new_values = BV (value_alloc) (h, new_log2_pages);
  new_length = (1 << new_log2_pages) * BIHASH_KVP_PER_PAGE;
  old_length = (1 << old_log2_pages) * BIHASH_KVP_PER_PAGE;

  j = 0;
  /* Across the old value array */
  for (i = 0; i < old_length; i++)
    {
      /* Find a free slot in the new linear scan bucket */
      for (; j < new_length; j++)
	{
	  /* Old value not in use? Forget it. */
	  if (BV (clib_bihash_is_free) (&(old_values->kvp[i])))
	    goto doublebreak;

	  /* New value should never be in use */
	  if (BV (clib_bihash_is_free) (&(new_values->kvp[j])))
	    {
	      /* Copy the old value and move along */
	      clib_memcpy (&(new_values->kvp[j]), &(old_values->kvp[i]),
			   sizeof (new_values->kvp[j]));
	      j++;
	      goto doublebreak;
	    }
	}
      /* This should never happen... */
      clib_warning ("BUG: linear rehash failed!");
      BV (value_free) (h, new_values, new_log2_pages);
      return 0;

    doublebreak:;
    }
  return new_values;
}

int BV (clib_bihash_add_del)
  (BVT (clib_bihash) * h, BVT (clib_bihash_kv) * add_v, int is_add)
{
  u32 bucket_index;
  BVT (clib_bihash_bucket) * b, tmp_b;
  BVT (clib_bihash_value) * v, *new_v, *save_new_v, *working_copy;
  int rv = 0;
  int i, limit;
  u64 hash, new_hash;
  u32 new_log2_pages, old_log2_pages;
  u32 thread_index = os_get_thread_index ();
  int mark_bucket_linear;
  int resplit_once;

  hash = BV (clib_bihash_hash) (add_v);

  bucket_index = hash & (h->nbuckets - 1);
  b = &h->buckets[bucket_index];

  hash >>= h->log2_nbuckets;

  tmp_b.linear_search = 0;

  while (__sync_lock_test_and_set (h->writer_lock, 1))
    ;

  /* First elt in the bucket? */
  if (b->offset == 0)
    {
      if (is_add == 0)
	{
	  rv = -1;
	  goto unlock;
	}

      v = BV (value_alloc) (h, 0);

      *v->kvp = *add_v;
      tmp_b.as_u64 = 0;
      tmp_b.offset = BV (clib_bihash_get_offset) (h, v);

      b->as_u64 = tmp_b.as_u64;
      goto unlock;
    }

  /* Note: this leaves the cache disabled */
  BV (make_working_copy) (h, b);

  v = BV (clib_bihash_get_value) (h, h->saved_bucket.offset);

  limit = BIHASH_KVP_PER_PAGE;
  v += (b->linear_search == 0) ? hash & ((1 << b->log2_pages) - 1) : 0;
  if (b->linear_search)
    limit <<= b->log2_pages;

  if (is_add)
    {
      /*
       * For obvious (in hindsight) reasons, see if we're supposed to
       * replace an existing key, then look for an empty slot.
       */
      for (i = 0; i < limit; i++)
	{
	  if (!memcmp (&(v->kvp[i]), &add_v->key, sizeof (add_v->key)))
	    {
	      clib_memcpy (&(v->kvp[i]), add_v, sizeof (*add_v));
	      CLIB_MEMORY_BARRIER ();
	      /* Restore the previous (k,v) pairs */
	      b->as_u64 = h->saved_bucket.as_u64;
	      goto unlock;
	    }
	}
      for (i = 0; i < limit; i++)
	{
	  if (BV (clib_bihash_is_free) (&(v->kvp[i])))
	    {
	      clib_memcpy (&(v->kvp[i]), add_v, sizeof (*add_v));
	      CLIB_MEMORY_BARRIER ();
	      b->as_u64 = h->saved_bucket.as_u64;
	      goto unlock;
	    }
	}
      /* no room at the inn... split case... */
    }
  else
    {
      for (i = 0; i < limit; i++)
	{
	  if (!memcmp (&(v->kvp[i]), &add_v->key, sizeof (add_v->key)))
	    {
	      memset (&(v->kvp[i]), 0xff, sizeof (*(add_v)));
	      CLIB_MEMORY_BARRIER ();
	      b->as_u64 = h->saved_bucket.as_u64;
	      goto unlock;
	    }
	}
      rv = -3;
      b->as_u64 = h->saved_bucket.as_u64;
      goto unlock;
    }

  old_log2_pages = h->saved_bucket.log2_pages;
  new_log2_pages = old_log2_pages + 1;
  mark_bucket_linear = 0;

  working_copy = h->working_copies[thread_index];
  resplit_once = 0;

  new_v = BV (split_and_rehash) (h, working_copy, old_log2_pages,
				 new_log2_pages);
  if (new_v == 0)
    {
    try_resplit:
      resplit_once = 1;
      new_log2_pages++;
      /* Try re-splitting. If that fails, fall back to linear search */
      new_v = BV (split_and_rehash) (h, working_copy, old_log2_pages,
				     new_log2_pages);
      if (new_v == 0)
	{
	mark_linear:
	  new_log2_pages--;
	  /* pinned collisions, use linear search */
	  new_v =
	    BV (split_and_rehash_linear) (h, working_copy, old_log2_pages,
					  new_log2_pages);
	  mark_bucket_linear = 1;
	}
    }

  /* Try to add the new entry */
  save_new_v = new_v;
  new_hash = BV (clib_bihash_hash) (add_v);
  limit = BIHASH_KVP_PER_PAGE;
  if (mark_bucket_linear)
    limit <<= new_log2_pages;
  new_hash >>= h->log2_nbuckets;
  new_hash &= (1 << new_log2_pages) - 1;
  new_v += mark_bucket_linear ? 0 : new_hash;

  for (i = 0; i < limit; i++)
    {
      if (BV (clib_bihash_is_free) (&(new_v->kvp[i])))
	{
	  clib_memcpy (&(new_v->kvp[i]), add_v, sizeof (*add_v));
	  goto expand_ok;
	}
    }

  /* Crap. Try again */
  BV (value_free) (h, save_new_v, new_log2_pages);
  /*
   * If we've already doubled the size of the bucket once,
   * fall back to linear search now.
   */
  if (resplit_once)
    goto mark_linear;
  else
    goto try_resplit;

expand_ok:
  /* Keep track of the number of linear-scan buckets */
  if (tmp_b.linear_search ^ mark_bucket_linear)
    h->linear_buckets += (mark_bucket_linear == 1) ? 1 : -1;

  tmp_b.log2_pages = new_log2_pages;
  tmp_b.offset = BV (clib_bihash_get_offset) (h, save_new_v);
  tmp_b.linear_search = mark_bucket_linear;

  CLIB_MEMORY_BARRIER ();
  b->as_u64 = tmp_b.as_u64;
  v = BV (clib_bihash_get_value) (h, h->saved_bucket.offset);
  BV (value_free) (h, v, old_log2_pages);

unlock:
  BV (clib_bihash_reset_cache) (b);
  BV (clib_bihash_unlock_bucket) (b);
  CLIB_MEMORY_BARRIER ();
  h->writer_lock[0] = 0;
  return rv;
}

int BV (clib_bihash_search)
  (BVT (clib_bihash) * h,
   BVT (clib_bihash_kv) * search_key, BVT (clib_bihash_kv) * valuep)
{
  u64 hash;
  u32 bucket_index;
  BVT (clib_bihash_value) * v;
#if BIHASH_KVP_CACHE_SIZE > 0
  BVT (clib_bihash_kv) * kvp;
#endif
  BVT (clib_bihash_bucket) * b;
  int i, limit;

  ASSERT (valuep);

  hash = BV (clib_bihash_hash) (search_key);

  bucket_index = hash & (h->nbuckets - 1);
  b = &h->buckets[bucket_index];

  if (b->offset == 0)
    return -1;

#if BIHASH_KVP_CACHE_SIZE > 0
  /* Check the cache, if currently enabled */
  if (PREDICT_TRUE ((b->cache_lru & (1 << 15)) == 0))
    {
      limit = BIHASH_KVP_CACHE_SIZE;
      kvp = b->cache;
      for (i = 0; i < limit; i++)
	{
	  if (BV (clib_bihash_key_compare) (kvp[i].key, search_key->key))
	    {
	      *valuep = kvp[i];
	      h->cache_hits++;
	      return 0;
	    }
	}
    }
#endif

  hash >>= h->log2_nbuckets;

  v = BV (clib_bihash_get_value) (h, b->offset);
  limit = BIHASH_KVP_PER_PAGE;
  v += (b->linear_search == 0) ? hash & ((1 << b->log2_pages) - 1) : 0;
  if (PREDICT_FALSE (b->linear_search))
    limit <<= b->log2_pages;

  for (i = 0; i < limit; i++)
    {
      if (BV (clib_bihash_key_compare) (v->kvp[i].key, search_key->key))
	{
	  *valuep = v->kvp[i];

#if BIHASH_KVP_CACHE_SIZE > 0
	  u8 cache_slot;
	  /* Shut off the cache */
	  if (BV (clib_bihash_lock_bucket) (b))
	    {
	      cache_slot = BV (clib_bihash_get_lru) (b);
	      b->cache[cache_slot] = v->kvp[i];
	      BV (clib_bihash_update_lru) (b, cache_slot);

	      /* Reenable the cache */
	      BV (clib_bihash_unlock_bucket) (b);
	      h->cache_misses++;
	    }
#endif
	  return 0;
	}
    }
  return -1;
}

u8 *BV (format_bihash_lru) (u8 * s, va_list * args)
{
#if BIHASH_KVP_SIZE > 0
  int i;
  BVT (clib_bihash_bucket) * b = va_arg (*args, BVT (clib_bihash_bucket) *);
  u16 cache_lru = b->cache_lru;

  s = format (s, "cache %s, order ", cache_lru & (1 << 15) ? "on" : "off");

  for (i = 0; i < BIHASH_KVP_CACHE_SIZE; i++)
    s = format (s, "[%d] ", ((cache_lru >> (3 * i)) & 7));

  return (s);
#else
  return format (s, "cache not configured");
#endif
}

void
BV (clib_bihash_update_lru_not_inline) (BVT (clib_bihash_bucket) * b, u8 slot)
{
#if BIHASH_KVP_SIZE > 0
  BV (clib_bihash_update_lru) (b, slot);
#endif
}

u8 *BV (format_bihash) (u8 * s, va_list * args)
{
  BVT (clib_bihash) * h = va_arg (*args, BVT (clib_bihash) *);
  int verbose = va_arg (*args, int);
  BVT (clib_bihash_bucket) * b;
  BVT (clib_bihash_value) * v;
  int i, j, k;
  u64 active_elements = 0;

  s = format (s, "Hash table %s\n", h->name ? h->name : (u8 *) "(unnamed)");

  for (i = 0; i < h->nbuckets; i++)
    {
      b = &h->buckets[i];
      if (b->offset == 0)
	{
	  if (verbose > 1)
	    s = format (s, "[%d]: empty\n", i);
	  continue;
	}

      if (verbose)
	{
	  s = format (s, "[%d]: heap offset %d, len %d, linear %d\n", i,
		      b->offset, (1 << b->log2_pages), b->linear_search);
	}

      v = BV (clib_bihash_get_value) (h, b->offset);
      for (j = 0; j < (1 << b->log2_pages); j++)
	{
	  for (k = 0; k < BIHASH_KVP_PER_PAGE; k++)
	    {
	      if (BV (clib_bihash_is_free) (&v->kvp[k]))
		{
		  if (verbose > 1)
		    s = format (s, "    %d: empty\n",
				j * BIHASH_KVP_PER_PAGE + k);
		  continue;
		}
	      if (verbose)
		{
		  s = format (s, "    %d: %U\n",
			      j * BIHASH_KVP_PER_PAGE + k,
			      BV (format_bihash_kvp), &(v->kvp[k]));
		}
	      active_elements++;
	    }
	  v++;
	}
    }

  s = format (s, "    %lld active elements\n", active_elements);
  s = format (s, "    %d free lists\n", vec_len (h->freelists));
  s = format (s, "    %d linear search buckets\n", h->linear_buckets);
  s = format (s, "    %lld cache hits, %lld cache misses\n",
	      h->cache_hits, h->cache_misses);
  return s;
}

void BV (clib_bihash_foreach_key_value_pair)
  (BVT (clib_bihash) * h, void *callback, void *arg)
{
  int i, j, k;
  BVT (clib_bihash_bucket) * b;
  BVT (clib_bihash_value) * v;
  void (*fp) (BVT (clib_bihash_kv) *, void *) = callback;

  for (i = 0; i < h->nbuckets; i++)
    {
      b = &h->buckets[i];
      if (b->offset == 0)
	continue;

      v = BV (clib_bihash_get_value) (h, b->offset);
      for (j = 0; j < (1 << b->log2_pages); j++)
	{
	  for (k = 0; k < BIHASH_KVP_PER_PAGE; k++)
	    {
	      if (BV (clib_bihash_is_free) (&v->kvp[k]))
		continue;

	      (*fp) (&v->kvp[k], arg);
	    }
	  v++;
	}
    }
}

/** @endcond */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */
{ color: #66d9ef } /* Keyword.Constant */ .highlight .kd { color: #66d9ef } /* Keyword.Declaration */ .highlight .kn { color: #f92672 } /* Keyword.Namespace */ .highlight .kp { color: #66d9ef } /* Keyword.Pseudo */ .highlight .kr { color: #66d9ef } /* Keyword.Reserved */ .highlight .kt { color: #66d9ef } /* Keyword.Type */ .highlight .ld { color: #e6db74 } /* Literal.Date */ .highlight .m { color: #ae81ff } /* Literal.Number */ .highlight .s { color: #e6db74 } /* Literal.String */ .highlight .na { color: #a6e22e } /* Name.Attribute */ .highlight .nb { color: #f8f8f2 } /* Name.Builtin */ .highlight .nc { color: #a6e22e } /* Name.Class */ .highlight .no { color: #66d9ef } /* Name.Constant */ .highlight .nd { color: #a6e22e } /* Name.Decorator */ .highlight .ni { color: #f8f8f2 } /* Name.Entity */ .highlight .ne { color: #a6e22e } /* Name.Exception */ .highlight .nf { color: #a6e22e } /* Name.Function */ .highlight .nl { color: #f8f8f2 } /* Name.Label */ .highlight .nn { color: #f8f8f2 } /* Name.Namespace */ .highlight .nx { color: #a6e22e } /* Name.Other */ .highlight .py { color: #f8f8f2 } /* Name.Property */ .highlight .nt { color: #f92672 } /* Name.Tag */ .highlight .nv { color: #f8f8f2 } /* Name.Variable */ .highlight .ow { color: #f92672 } /* Operator.Word */ .highlight .w { color: #f8f8f2 } /* Text.Whitespace */ .highlight .mb { color: #ae81ff } /* Literal.Number.Bin */ .highlight .mf { color: #ae81ff } /* Literal.Number.Float */ .highlight .mh { color: #ae81ff } /* Literal.Number.Hex */ .highlight .mi { color: #ae81ff } /* Literal.Number.Integer */ .highlight .mo { color: #ae81ff } /* Literal.Number.Oct */ .highlight .sa { color: #e6db74 } /* Literal.String.Affix */ .highlight .sb { color: #e6db74 } /* Literal.String.Backtick */ .highlight .sc { color: #e6db74 } /* Literal.String.Char */ .highlight .dl { color: #e6db74 } /* Literal.String.Delimiter */ .highlight .sd { color: #e6db74 } /* Literal.String.Doc */ .highlight .s2 { color: #e6db74 } /* Literal.String.Double */ .highlight .se { color: #ae81ff } /* Literal.String.Escape */ .highlight .sh { color: #e6db74 } /* Literal.String.Heredoc */ .highlight .si { color: #e6db74 } /* Literal.String.Interpol */ .highlight .sx { color: #e6db74 } /* Literal.String.Other */ .highlight .sr { color: #e6db74 } /* Literal.String.Regex */ .highlight .s1 { color: #e6db74 } /* Literal.String.Single */ .highlight .ss { color: #e6db74 } /* Literal.String.Symbol */ .highlight .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */ .highlight .fm { color: #a6e22e } /* Name.Function.Magic */ .highlight .vc { color: #f8f8f2 } /* Name.Variable.Class */ .highlight .vg { color: #f8f8f2 } /* Name.Variable.Global */ .highlight .vi { color: #f8f8f2 } /* Name.Variable.Instance */ .highlight .vm { color: #f8f8f2 } /* Name.Variable.Magic */ .highlight .il { color: #ae81ff } /* Literal.Number.Integer.Long */ } @media (prefers-color-scheme: light) { .highlight .hll { background-color: #ffffcc } .highlight .c { color: #888888 } /* Comment */ .highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */ .highlight .k { color: #008800; font-weight: bold } /* Keyword */ .highlight .ch { color: #888888 } /* Comment.Hashbang */ .highlight .cm { color: #888888 } /* Comment.Multiline */ .highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */ .highlight .cpf { color: #888888 } /* Comment.PreprocFile */ .highlight .c1 { color: #888888 } /* Comment.Single */ .highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */ .highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */ .highlight .ge { font-style: italic } /* Generic.Emph */ .highlight .gr { color: #aa0000 } /* Generic.Error */ .highlight .gh { color: #333333 } /* Generic.Heading */ .highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */ .highlight .go { color: #888888 } /* Generic.Output */ .highlight .gp { color: #555555 } /* Generic.Prompt */ .highlight .gs { font-weight: bold } /* Generic.Strong */ .highlight .gu { color: #666666 } /* Generic.Subheading */ .highlight .gt { color: #aa0000 } /* Generic.Traceback */ .highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */ .highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */ .highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */ .highlight .kp { color: #008800 } /* Keyword.Pseudo */ .highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */ .highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */ .highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */ .highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */ .highlight .na { color: #336699 } /* Name.Attribute */ .highlight .nb { color: #003388 } /* Name.Builtin */ .highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */ .highlight .no { color: #003366; font-weight: bold } /* Name.Constant */ .highlight .nd { color: #555555 } /* Name.Decorator */ .highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */ .highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */ .highlight .nl { color: #336699; font-style: italic } /* Name.Label */ .highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */ .highlight .py { color: #336699; font-weight: bold } /* Name.Property */ .highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */ .highlight .nv { color: #336699 } /* Name.Variable */ .highlight .ow { color: #008800 } /* Operator.Word */ .highlight .w { color: #bbbbbb } /* Text.Whitespace */ .highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */ .highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */ .highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */ .highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */ .highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */ .highlight .sa { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Affix */ .highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */ .highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */ .highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */ .highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */ .highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */ .highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */ .highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */ .highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */ .highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */ .highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */ .highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */ .highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */ .highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */ .highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */ .highlight .vc { color: #336699 } /* Name.Variable.Class */ .highlight .vg { color: #dd7700 } /* Name.Variable.Global */ .highlight .vi { color: #3333bb } /* Name.Variable.Instance */ .highlight .vm { color: #336699 } /* Name.Variable.Magic */ .highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */ }
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * ip/ip4_forward.c: IP v4 forwarding
 *
 * Copyright (c) 2008 Eliot Dresselhaus
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include <vnet/vnet.h>
#include <vnet/ip/ip.h>
#include <vnet/ethernet/ethernet.h>	/* for ethernet_header_t */
#include <vnet/ethernet/arp_packet.h>	/* for ethernet_arp_header_t */
#include <vnet/ppp/ppp.h>
#include <vnet/srp/srp.h>	/* for srp_hw_interface_class */
#include <vnet/api_errno.h>	/* for API error numbers */
#include <vnet/fib/fib_table.h>	/* for FIB table and entry creation */
#include <vnet/fib/fib_entry.h>	/* for FIB table and entry creation */
#include <vnet/fib/fib_urpf_list.h>	/* for FIB uRPF check */
#include <vnet/fib/ip4_fib.h>
#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/classify_dpo.h>
#include <vnet/mfib/mfib_table.h>	/* for mFIB table and entry creation */

/**
 * @file
 * @brief IPv4 Forwarding.
 *
 * This file contains the source code for IPv4 forwarding.
 */

void
ip4_forward_next_trace (vlib_main_t * vm,
			vlib_node_runtime_t * node,
			vlib_frame_t * frame,
			vlib_rx_or_tx_t which_adj_index);

always_inline uword
ip4_lookup_inline (vlib_main_t * vm,
		   vlib_node_runtime_t * node,
		   vlib_frame_t * frame,
		   int lookup_for_responses_to_locally_received_packets)
{
  ip4_main_t *im = &ip4_main;
  vlib_combined_counter_main_t *cm = &load_balance_main.lbm_to_counters;
  u32 n_left_from, n_left_to_next, *from, *to_next;
  ip_lookup_next_t next;
  u32 cpu_index = os_get_cpu_number ();

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next = node->cached_next_index;

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);

      while (n_left_from >= 8 && n_left_to_next >= 4)
	{
	  vlib_buffer_t *p0, *p1, *p2, *p3;
	  ip4_header_t *ip0, *ip1, *ip2, *ip3;
	  __attribute__ ((unused)) tcp_header_t *tcp0, *tcp1, *tcp2, *tcp3;
	  ip_lookup_next_t next0, next1, next2, next3;
	  const load_balance_t *lb0, *lb1, *lb2, *lb3;
	  ip4_fib_mtrie_t *mtrie0, *mtrie1, *mtrie2, *mtrie3;
	  ip4_fib_mtrie_leaf_t leaf0, leaf1, leaf2, leaf3;
	  ip4_address_t *dst_addr0, *dst_addr1, *dst_addr2, *dst_addr3;
	  __attribute__ ((unused)) u32 pi0, fib_index0, lb_index0,
	    is_tcp_udp0;
	  __attribute__ ((unused)) u32 pi1, fib_index1, lb_index1,
	    is_tcp_udp1;
	  __attribute__ ((unused)) u32 pi2, fib_index2, lb_index2,
	    is_tcp_udp2;
	  __attribute__ ((unused)) u32 pi3, fib_index3, lb_index3,
	    is_tcp_udp3;
	  flow_hash_config_t flow_hash_config0, flow_hash_config1;
	  flow_hash_config_t flow_hash_config2, flow_hash_config3;
	  u32 hash_c0, hash_c1, hash_c2, hash_c3;
	  const dpo_id_t *dpo0, *dpo1, *dpo2, *dpo3;

	  /* Prefetch next iteration. */
	  {
	    vlib_buffer_t *p4, *p5, *p6, *p7;

	    p4 = vlib_get_buffer (vm, from[4]);
	    p5 = vlib_get_buffer (vm, from[5]);
	    p6 = vlib_get_buffer (vm, from[6]);
	    p7 = vlib_get_buffer (vm, from[7]);

	    vlib_prefetch_buffer_header (p4, LOAD);
	    vlib_prefetch_buffer_header (p5, LOAD);
	    vlib_prefetch_buffer_header (p6, LOAD);
	    vlib_prefetch_buffer_header (p7, LOAD);

	    CLIB_PREFETCH (p4->data, sizeof (ip0[0]), LOAD);
	    CLIB_PREFETCH (p5->data, sizeof (ip0[0]), LOAD);
	    CLIB_PREFETCH (p6->data, sizeof (ip0[0]), LOAD);
	    CLIB_PREFETCH (p7->data, sizeof (ip0[0]), LOAD);
	  }

	  pi0 = to_next[0] = from[0];
	  pi1 = to_next[1] = from[1];
	  pi2 = to_next[2] = from[2];
	  pi3 = to_next[3] = from[3];

	  from += 4;
	  to_next += 4;
	  n_left_to_next -= 4;
	  n_left_from -= 4;

	  p0 = vlib_get_buffer (vm, pi0);
	  p1 = vlib_get_buffer (vm, pi1);
	  p2 = vlib_get_buffer (vm, pi2);
	  p3 = vlib_get_buffer (vm, pi3);

	  ip0 = vlib_buffer_get_current (p0);
	  ip1 = vlib_buffer_get_current (p1);
	  ip2 = vlib_buffer_get_current (p2);
	  ip3 = vlib_buffer_get_current (p3);

	  dst_addr0 = &ip0->dst_address;
	  dst_addr1 = &ip1->dst_address;
	  dst_addr2 = &ip2->dst_address;
	  dst_addr3 = &ip3->dst_address;

	  fib_index0 =
	    vec_elt (im->fib_index_by_sw_if_index,
		     vnet_buffer (p0)->sw_if_index[VLIB_RX]);
	  fib_index1 =
	    vec_elt (im->fib_index_by_sw_if_index,
		     vnet_buffer (p1)->sw_if_index[VLIB_RX]);
	  fib_index2 =
	    vec_elt (im->fib_index_by_sw_if_index,
		     vnet_buffer (p2)->sw_if_index[VLIB_RX]);
	  fib_index3 =
	    vec_elt (im->fib_index_by_sw_if_index,
		     vnet_buffer (p3)->sw_if_index[VLIB_RX]);
	  fib_index0 =
	    (vnet_buffer (p0)->sw_if_index[VLIB_TX] ==
	     (u32) ~ 0) ? fib_index0 : vnet_buffer (p0)->sw_if_index[VLIB_TX];
	  fib_index1 =
	    (vnet_buffer (p1)->sw_if_index[VLIB_TX] ==
	     (u32) ~ 0) ? fib_index1 : vnet_buffer (p1)->sw_if_index[VLIB_TX];
	  fib_index2 =
	    (vnet_buffer (p2)->sw_if_index[VLIB_TX] ==
	     (u32) ~ 0) ? fib_index2 : vnet_buffer (p2)->sw_if_index[VLIB_TX];
	  fib_index3 =
	    (vnet_buffer (p3)->sw_if_index[VLIB_TX] ==
	     (u32) ~ 0) ? fib_index3 : vnet_buffer (p3)->sw_if_index[VLIB_TX];


	  if (!lookup_for_responses_to_locally_received_packets)
	    {
	      mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
	      mtrie1 = &ip4_fib_get (fib_index1)->mtrie;
	      mtrie2 = &ip4_fib_get (fib_index2)->mtrie;
	      mtrie3 = &ip4_fib_get (fib_index3)->mtrie;

	      leaf0 = leaf1 = leaf2 = leaf3 = IP4_FIB_MTRIE_LEAF_ROOT;

	      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 0);
	      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 0);
	      leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 0);
	      leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 0);
	    }

	  tcp0 = (void *) (ip0 + 1);
	  tcp1 = (void *) (ip1 + 1);
	  tcp2 = (void *) (ip2 + 1);
	  tcp3 = (void *) (ip3 + 1);

	  is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
			 || ip0->protocol == IP_PROTOCOL_UDP);
	  is_tcp_udp1 = (ip1->protocol == IP_PROTOCOL_TCP
			 || ip1->protocol == IP_PROTOCOL_UDP);
	  is_tcp_udp2 = (ip2->protocol == IP_PROTOCOL_TCP
			 || ip2->protocol == IP_PROTOCOL_UDP);
	  is_tcp_udp3 = (ip1->protocol == IP_PROTOCOL_TCP
			 || ip1->protocol == IP_PROTOCOL_UDP);

	  if (!lookup_for_responses_to_locally_received_packets)
	    {
	      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 1);
	      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 1);
	      leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 1);
	      leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 1);
	    }

	  if (!lookup_for_responses_to_locally_received_packets)
	    {
	      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
	      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 2);
	      leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 2);
	      leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 2);
	    }

	  if (!lookup_for_responses_to_locally_received_packets)
	    {
	      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
	      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 3);
	      leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 3);
	      leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 3);
	    }

	  if (lookup_for_responses_to_locally_received_packets)
	    {
	      lb_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
	      lb_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_RX];
	      lb_index2 = vnet_buffer (p2)->ip.adj_index[VLIB_RX];
	      lb_index3 = vnet_buffer (p3)->ip.adj_index[VLIB_RX];
	    }
	  else
	    {
	      /* Handle default route. */
	      leaf0 =
		(leaf0 ==
		 IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
	      leaf1 =
		(leaf1 ==
		 IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1);
	      leaf2 =
		(leaf2 ==
		 IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie2->default_leaf : leaf2);
	      leaf3 =
		(leaf3 ==
		 IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie3->default_leaf : leaf3);
	      lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
	      lb_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
	      lb_index2 = ip4_fib_mtrie_leaf_get_adj_index (leaf2);
	      lb_index3 = ip4_fib_mtrie_leaf_get_adj_index (leaf3);
	    }

	  lb0 = load_balance_get (lb_index0);
	  lb1 = load_balance_get (lb_index1);
	  lb2 = load_balance_get (lb_index2);
	  lb3 = load_balance_get (lb_index3);

	  /* Use flow hash to compute multipath adjacency. */
	  hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
	  hash_c1 = vnet_buffer (p1)->ip.flow_hash = 0;
	  hash_c2 = vnet_buffer (p2)->ip.flow_hash = 0;
	  hash_c3 = vnet_buffer (p3)->ip.flow_hash = 0;
	  if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
	    {
	      flow_hash_config0 = lb0->lb_hash_config;
	      hash_c0 = vnet_buffer (p0)->ip.flow_hash =
		ip4_compute_flow_hash (ip0, flow_hash_config0);
	    }
	  if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
	    {
	      flow_hash_config1 = lb1->lb_hash_config;
	      hash_c1 = vnet_buffer (p1)->ip.flow_hash =
		ip4_compute_flow_hash (ip1, flow_hash_config1);
	    }
	  if (PREDICT_FALSE (lb2->lb_n_buckets > 1))
	    {
	      flow_hash_config2 = lb2->lb_hash_config;
	      hash_c2 = vnet_buffer (p2)->ip.flow_hash =
		ip4_compute_flow_hash (ip2, flow_hash_config2);
	    }
	  if (PREDICT_FALSE (lb3->lb_n_buckets > 1))
	    {
	      flow_hash_config3 = lb3->lb_hash_config;
	      hash_c3 = vnet_buffer (p3)->ip.flow_hash =
		ip4_compute_flow_hash (ip3, flow_hash_config3);
	    }

	  ASSERT (lb0->lb_n_buckets > 0);
	  ASSERT (is_pow2 (lb0->lb_n_buckets));
	  ASSERT (lb1->lb_n_buckets > 0);
	  ASSERT (is_pow2 (lb1->lb_n_buckets));
	  ASSERT (lb2->lb_n_buckets > 0);
	  ASSERT (is_pow2 (lb2->lb_n_buckets));
	  ASSERT (lb3->lb_n_buckets > 0);
	  ASSERT (is_pow2 (lb3->lb_n_buckets));

	  dpo0 = load_balance_get_bucket_i (lb0,
					    (hash_c0 &
					     (lb0->lb_n_buckets_minus_1)));
	  dpo1 = load_balance_get_bucket_i (lb1,
					    (hash_c1 &
					     (lb1->lb_n_buckets_minus_1)));
	  dpo2 = load_balance_get_bucket_i (lb2,
					    (hash_c2 &
					     (lb2->lb_n_buckets_minus_1)));
	  dpo3 = load_balance_get_bucket_i (lb3,
					    (hash_c3 &
					     (lb3->lb_n_buckets_minus_1)));

	  next0 = dpo0->dpoi_next_node;
	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
	  next1 = dpo1->dpoi_next_node;
	  vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
	  next2 = dpo2->dpoi_next_node;
	  vnet_buffer (p2)->ip.adj_index[VLIB_TX] = dpo2->dpoi_index;
	  next3 = dpo3->dpoi_next_node;
	  vnet_buffer (p3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index;

	  vlib_increment_combined_counter
	    (cm, cpu_index, lb_index0, 1,
	     vlib_buffer_length_in_chain (vm, p0)
	     + sizeof (ethernet_header_t));
	  vlib_increment_combined_counter
	    (cm, cpu_index, lb_index1, 1,
	     vlib_buffer_length_in_chain (vm, p1)
	     + sizeof (ethernet_header_t));
	  vlib_increment_combined_counter
	    (cm, cpu_index, lb_index2, 1,
	     vlib_buffer_length_in_chain (vm, p2)
	     + sizeof (ethernet_header_t));
	  vlib_increment_combined_counter
	    (cm, cpu_index, lb_index3, 1,
	     vlib_buffer_length_in_chain (vm, p3)
	     + sizeof (ethernet_header_t));

	  vlib_validate_buffer_enqueue_x4 (vm, node, next,
					   to_next, n_left_to_next,
					   pi0, pi1, pi2, pi3,
					   next0, next1, next2, next3);
	}

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  vlib_buffer_t *p0;
	  ip4_header_t *ip0;
	  __attribute__ ((unused)) tcp_header_t *tcp0;
	  ip_lookup_next_t next0;
	  const load_balance_t *lb0;
	  ip4_fib_mtrie_t *mtrie0;
	  ip4_fib_mtrie_leaf_t leaf0;
	  ip4_address_t *dst_addr0;
	  __attribute__ ((unused)) u32 pi0, fib_index0, is_tcp_udp0, lbi0;
	  flow_hash_config_t flow_hash_config0;
	  const dpo_id_t *dpo0;
	  u32 hash_c0;

	  pi0 = from[0];
	  to_next[0] = pi0;

	  p0 = vlib_get_buffer (vm, pi0);

	  ip0 = vlib_buffer_get_current (p0);

	  dst_addr0 = &ip0->dst_address;

	  fib_index0 =
	    vec_elt (im->fib_index_by_sw_if_index,
		     vnet_buffer (p0)->sw_if_index[VLIB_RX]);
	  fib_index0 =
	    (vnet_buffer (p0)->sw_if_index[VLIB_TX] ==
	     (u32) ~ 0) ? fib_index0 : vnet_buffer (p0)->sw_if_index[VLIB_TX];

	  if (!lookup_for_responses_to_locally_received_packets)
	    {
	      mtrie0 = &ip4_fib_get (fib_index0)->mtrie;

	      leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;

	      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 0);
	    }

	  tcp0 = (void *) (ip0 + 1);

	  is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
			 || ip0->protocol == IP_PROTOCOL_UDP);

	  if (!lookup_for_responses_to_locally_received_packets)
	    leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 1);

	  if (!lookup_for_responses_to_locally_received_packets)
	    leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);

	  if (!lookup_for_responses_to_locally_received_packets)
	    leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);

	  if (lookup_for_responses_to_locally_received_packets)
	    lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
	  else
	    {
	      /* Handle default route. */
	      leaf0 =
		(leaf0 ==
		 IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
	      lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
	    }

	  lb0 = load_balance_get (lbi0);

	  /* Use flow hash to compute multipath adjacency. */
	  hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
	  if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
	    {
	      flow_hash_config0 = lb0->lb_hash_config;

	      hash_c0 = vnet_buffer (p0)->ip.flow_hash =
		ip4_compute_flow_hash (ip0, flow_hash_config0);
	    }

	  ASSERT (lb0->lb_n_buckets > 0);
	  ASSERT (is_pow2 (lb0->lb_n_buckets));

	  dpo0 = load_balance_get_bucket_i (lb0,
					    (hash_c0 &
					     (lb0->lb_n_buckets_minus_1)));

	  next0 = dpo0->dpoi_next_node;
	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

	  vlib_increment_combined_counter
	    (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));

	  from += 1;
	  to_next += 1;
	  n_left_to_next -= 1;
	  n_left_from -= 1;

	  if (PREDICT_FALSE (next0 != next))
	    {
	      n_left_to_next += 1;
	      vlib_put_next_frame (vm, node, next, n_left_to_next);
	      next = next0;
	      vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
	      to_next[0] = pi0;
	      to_next += 1;
	      n_left_to_next -= 1;
	    }
	}

      vlib_put_next_frame (vm, node, next, n_left_to_next);
    }

  if (node->flags & VLIB_NODE_FLAG_TRACE)
    ip4_forward_next_trace (vm, node, frame, VLIB_TX);

  return frame->n_vectors;
}

/** @brief IPv4 lookup node.
    @node ip4-lookup

    This is the main IPv4 lookup dispatch node.

    @param vm vlib_main_t corresponding to the current thread
    @param node vlib_node_runtime_t
    @param frame vlib_frame_t whose contents should be dispatched

    @par Graph mechanics: buffer metadata, next index usage

    @em Uses:
    - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code>
        - Indicates the @c sw_if_index value of the interface that the
	  packet was received on.
    - <code>vnet_buffer(b)->sw_if_index[VLIB_TX]</code>
        - When the value is @c ~0 then the node performs a longest prefix
          match (LPM) for the packet destination address in the FIB attached
          to the receive interface.
        - Otherwise perform LPM for the packet destination address in the
          indicated FIB. In this case <code>[VLIB_TX]</code> is a FIB index
          value (0, 1, ...) and not a VRF id.

    @em Sets:
    - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
        - The lookup result adjacency index.

    <em>Next Index:</em>
    - Dispatches the packet to the node index found in
      ip_adjacency_t @c adj->lookup_next_index
      (where @c adj is the lookup result adjacency).
*/
static uword
ip4_lookup (vlib_main_t * vm,
	    vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  return ip4_lookup_inline (vm, node, frame,
			    /* lookup_for_responses_to_locally_received_packets */
			    0);

}

static u8 *format_ip4_lookup_trace (u8 * s, va_list * args);

VLIB_REGISTER_NODE (ip4_lookup_node) =
{
.function = ip4_lookup,.name = "ip4-lookup",.vector_size =
    sizeof (u32),.format_trace = format_ip4_lookup_trace,.n_next_nodes =
    IP_LOOKUP_N_NEXT,.next_nodes = IP4_LOOKUP_NEXT_NODES,};

VLIB_NODE_FUNCTION_MULTIARCH (ip4_lookup_node, ip4_lookup);

always_inline uword
ip4_load_balance (vlib_main_t * vm,
		  vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters;
  u32 n_left_from, n_left_to_next, *from, *to_next;
  ip_lookup_next_t next;
  u32 cpu_index = os_get_cpu_number ();

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next = node->cached_next_index;

  if (node->flags & VLIB_NODE_FLAG_TRACE)
    ip4_forward_next_trace (vm, node, frame, VLIB_TX);

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);


      while (n_left_from >= 4 && n_left_to_next >= 2)
	{
	  ip_lookup_next_t next0, next1;
	  const load_balance_t *lb0, *lb1;
	  vlib_buffer_t *p0, *p1;
	  u32 pi0, lbi0, hc0, pi1, lbi1, hc1;
	  const ip4_header_t *ip0, *ip1;
	  const dpo_id_t *dpo0, *dpo1;

	  /* Prefetch next iteration. */
	  {
	    vlib_buffer_t *p2, *p3;

	    p2 = vlib_get_buffer (vm, from[2]);
	    p3 = vlib_get_buffer (vm, from[3]);

	    vlib_prefetch_buffer_header (p2, STORE);
	    vlib_prefetch_buffer_header (p3, STORE);

	    CLIB_PREFETCH (p2->data, sizeof (ip0[0]), STORE);
	    CLIB_PREFETCH (p3->data, sizeof (ip0[0]), STORE);
	  }

	  pi0 = to_next[0] = from[0];
	  pi1 = to_next[1] = from[1];

	  from += 2;
	  n_left_from -= 2;
	  to_next += 2;
	  n_left_to_next -= 2;

	  p0 = vlib_get_buffer (vm, pi0);
	  p1 = vlib_get_buffer (vm, pi1);

	  ip0 = vlib_buffer_get_current (p0);
	  ip1 = vlib_buffer_get_current (p1);
	  lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
	  lbi1 = vnet_buffer (p1)->ip.adj_index[VLIB_TX];

	  lb0 = load_balance_get (lbi0);
	  lb1 = load_balance_get (lbi1);

	  /*
	   * this node is for via FIBs we can re-use the hash value from the
	   * to node if present.
	   * We don't want to use the same hash value at each level in the recursion
	   * graph as that would lead to polarisation
	   */
	  hc0 = hc1 = 0;

	  if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
	    {
	      if (PREDICT_TRUE (vnet_buffer (p0)->ip.flow_hash))
		{
		  hc0 = vnet_buffer (p0)->ip.flow_hash =
		    vnet_buffer (p0)->ip.flow_hash >> 1;
		}
	      else
		{
		  hc0 = vnet_buffer (p0)->ip.flow_hash =
		    ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
		}
	    }
	  if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
	    {
	      if (PREDICT_TRUE (vnet_buffer (p1)->ip.flow_hash))
		{
		  hc1 = vnet_buffer (p1)->ip.flow_hash =
		    vnet_buffer (p1)->ip.flow_hash >> 1;
		}
	      else
		{
		  hc1 = vnet_buffer (p1)->ip.flow_hash =
		    ip4_compute_flow_hash (ip1, lb1->lb_hash_config);
		}
	    }

	  dpo0 =
	    load_balance_get_bucket_i (lb0,
				       hc0 & (lb0->lb_n_buckets_minus_1));
	  dpo1 =
	    load_balance_get_bucket_i (lb1,
				       hc1 & (lb1->lb_n_buckets_minus_1));

	  next0 = dpo0->dpoi_next_node;
	  next1 = dpo1->dpoi_next_node;

	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
	  vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;

	  vlib_increment_combined_counter
	    (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
	  vlib_increment_combined_counter
	    (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1));

	  vlib_validate_buffer_enqueue_x2 (vm, node, next,
					   to_next, n_left_to_next,
					   pi0, pi1, next0, next1);
	}

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  ip_lookup_next_t next0;
	  const load_balance_t *lb0;
	  vlib_buffer_t *p0;
	  u32 pi0, lbi0, hc0;
	  const ip4_header_t *ip0;
	  const dpo_id_t *dpo0;

	  pi0 = from[0];
	  to_next[0] = pi0;
	  from += 1;
	  to_next += 1;
	  n_left_to_next -= 1;
	  n_left_from -= 1;

	  p0 = vlib_get_buffer (vm, pi0);

	  ip0 = vlib_buffer_get_current (p0);
	  lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];

	  lb0 = load_balance_get (lbi0);

	  hc0 = 0;
	  if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
	    {
	      if (PREDICT_TRUE (vnet_buffer (p0)->ip.flow_hash))
		{
		  hc0 = vnet_buffer (p0)->ip.flow_hash =
		    vnet_buffer (p0)->ip.flow_hash >> 1;
		}
	      else
		{
		  hc0 = vnet_buffer (p0)->ip.flow_hash =
		    ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
		}
	    }

	  dpo0 =
	    load_balance_get_bucket_i (lb0,
				       hc0 & (lb0->lb_n_buckets_minus_1));

	  next0 = dpo0->dpoi_next_node;
	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

	  vlib_increment_combined_counter
	    (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));

	  vlib_validate_buffer_enqueue_x1 (vm, node, next,
					   to_next, n_left_to_next,
					   pi0, next0);
	}

      vlib_put_next_frame (vm, node, next, n_left_to_next);
    }

  return frame->n_vectors;
}

VLIB_REGISTER_NODE (ip4_load_balance_node) =
{
.function = ip4_load_balance,.name = "ip4-load-balance",.vector_size =
    sizeof (u32),.sibling_of = "ip4-lookup",.format_trace =
    format_ip4_lookup_trace,};

VLIB_NODE_FUNCTION_MULTIARCH (ip4_load_balance_node, ip4_load_balance);

/* get first interface address */
ip4_address_t *
ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
			     ip_interface_address_t ** result_ia)
{
  ip_lookup_main_t *lm = &im->lookup_main;
  ip_interface_address_t *ia = 0;
  ip4_address_t *result = 0;

  /* *INDENT-OFF* */
  foreach_ip_interface_address
    (lm, ia, sw_if_index,
     1 /* honor unnumbered */ ,
     ({
       ip4_address_t * a =
         ip_interface_address_get_address (lm, ia);
       result = a;
       break;
     }));
  /* *INDENT-OFF* */
  if (result_ia)
    *result_ia = result ? ia : 0;
  return result;
}

static void
ip4_add_interface_routes (u32 sw_if_index,
			  ip4_main_t * im, u32 fib_index,
			  ip_interface_address_t * a)
{
  ip_lookup_main_t *lm = &im->lookup_main;
  ip4_address_t *address = ip_interface_address_get_address (lm, a);
  fib_prefix_t pfx = {
    .fp_len = a->address_length,
    .fp_proto = FIB_PROTOCOL_IP4,
    .fp_addr.ip4 = *address,
  };

  a->neighbor_probe_adj_index = ~0;

  if (pfx.fp_len < 32)
    {
      fib_node_index_t fei;

      fei = fib_table_entry_update_one_path (fib_index, &pfx,
                                             FIB_SOURCE_INTERFACE,
                                             (FIB_ENTRY_FLAG_CONNECTED |
                                              FIB_ENTRY_FLAG_ATTACHED),
                                             FIB_PROTOCOL_IP4,
                                             /* No next-hop address */
                                             NULL,
					     sw_if_index,
                                             // invalid FIB index
                                             ~0,
					     1,
                                             // no out-label stack
                                             NULL,
					     FIB_ROUTE_PATH_FLAG_NONE);
      a->neighbor_probe_adj_index = fib_entry_get_adj (fei);
    }

  pfx.fp_len = 32;

  if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
    {
      u32 classify_table_index =
	lm->classify_table_index_by_sw_if_index[sw_if_index];
      if (classify_table_index != (u32) ~ 0)
	{
	  dpo_id_t dpo = DPO_INVALID;

	  dpo_set (&dpo,
		   DPO_CLASSIFY,
		   DPO_PROTO_IP4,
		   classify_dpo_create (DPO_PROTO_IP4, classify_table_index));

	  fib_table_entry_special_dpo_add (fib_index,
					   &pfx,
					   FIB_SOURCE_CLASSIFY,
					   FIB_ENTRY_FLAG_NONE, &dpo);
	  dpo_reset (&dpo);
	}
    }

  fib_table_entry_update_one_path (fib_index, &pfx,
                                   FIB_SOURCE_INTERFACE,
                                   (FIB_ENTRY_FLAG_CONNECTED |
                                    FIB_ENTRY_FLAG_LOCAL),
                                   FIB_PROTOCOL_IP4,
                                   &pfx.fp_addr,
                                   sw_if_index,
                                   // invalid FIB index
                                   ~0,
				   1, NULL,
				   FIB_ROUTE_PATH_FLAG_NONE);
}

static void
ip4_del_interface_routes (ip4_main_t * im,
			  u32 fib_index,
			  ip4_address_t * address, u32 address_length)
{
  fib_prefix_t pfx = {
    .fp_len = address_length,
    .fp_proto = FIB_PROTOCOL_IP4,
    .fp_addr.ip4 = *address,
  };

  if (pfx.fp_len < 32)
    {
      fib_table_entry_delete (fib_index, &pfx, FIB_SOURCE_INTERFACE);
    }

  pfx.fp_len = 32;
  fib_table_entry_delete (fib_index, &pfx, FIB_SOURCE_INTERFACE);
}

void
ip4_sw_interface_enable_disable (u32 sw_if_index, u32 is_enable)
{
  ip4_main_t *im = &ip4_main;

  vec_validate_init_empty (im->ip_enabled_by_sw_if_index, sw_if_index, 0);

  /*
   * enable/disable only on the 1<->0 transition
   */
  if (is_enable)
    {
      if (1 != ++im->ip_enabled_by_sw_if_index[sw_if_index])
	return;
    }
  else
    {
      ASSERT (im->ip_enabled_by_sw_if_index[sw_if_index] > 0);
      if (0 != --im->ip_enabled_by_sw_if_index[sw_if_index])
	return;
    }
  vnet_feature_enable_disable ("ip4-unicast", "ip4-drop", sw_if_index,
			       !is_enable, 0, 0);


  vnet_feature_enable_disable ("ip4-multicast", "ip4-drop",
			       sw_if_index, !is_enable, 0, 0);
}

static clib_error_t *
ip4_add_del_interface_address_internal (vlib_main_t * vm,
					u32 sw_if_index,
					ip4_address_t * address,
					u32 address_length, u32 is_del)
{
  vnet_main_t *vnm = vnet_get_main ();
  ip4_main_t *im = &ip4_main;
  ip_lookup_main_t *lm = &im->lookup_main;
  clib_error_t *error = 0;
  u32 if_address_index, elts_before;
  ip4_address_fib_t ip4_af, *addr_fib = 0;

  vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
  ip4_addr_fib_init (&ip4_af, address,
		     vec_elt (im->fib_index_by_sw_if_index, sw_if_index));
  vec_add1 (addr_fib, ip4_af);

  /* FIXME-LATER
   * there is no support for adj-fib handling in the presence of overlapping
   * subnets on interfaces. Easy fix - disallow overlapping subnets, like
   * most routers do.
   */
  /* *INDENT-OFF* */
  if (!is_del)
    {
      /* When adding an address check that it does not conflict
         with an existing address. */
      ip_interface_address_t *ia;
      foreach_ip_interface_address
        (&im->lookup_main, ia, sw_if_index,
         0 /* honor unnumbered */ ,
         ({
           ip4_address_t * x =
             ip_interface_address_get_address
             (&im->lookup_main, ia);
           if (ip4_destination_matches_route
               (im, address, x, ia->address_length) ||
               ip4_destination_matches_route (im,
                                              x,
                                              address,
                                              address_length))
             return
               clib_error_create
               ("failed to add %U which conflicts with %U for interface %U",
                format_ip4_address_and_length, address,
                address_length,
                format_ip4_address_and_length, x,
                ia->address_length,
                format_vnet_sw_if_index_name, vnm,
                sw_if_index);
         }));
    }
  /* *INDENT-ON* */

  elts_before = pool_elts (lm->if_address_pool);

  error = ip_interface_address_add_del
    (lm, sw_if_index, addr_fib, address_length, is_del, &if_address_index);
  if (error)
    goto done;

  ip4_sw_interface_enable_disable (sw_if_index, !is_del);

  if (is_del)
    ip4_del_interface_routes (im, ip4_af.fib_index, address, address_length);
  else
    ip4_add_interface_routes (sw_if_index,
			      im, ip4_af.fib_index,
			      pool_elt_at_index
			      (lm->if_address_pool, if_address_index));

  /* If pool did not grow/shrink: add duplicate address. */
  if (elts_before != pool_elts (lm->if_address_pool))
    {
      ip4_add_del_interface_address_callback_t *cb;
      vec_foreach (cb, im->add_del_interface_address_callbacks)
	cb->function (im, cb->function_opaque, sw_if_index,
		      address, address_length, if_address_index, is_del);
    }

done:
  vec_free (addr_fib);
  return error;
}

clib_error_t *
ip4_add_del_interface_address (vlib_main_t * vm,
			       u32 sw_if_index,
			       ip4_address_t * address,
			       u32 address_length, u32 is_del)
{
  return ip4_add_del_interface_address_internal
    (vm, sw_if_index, address, address_length, is_del);
}

/* Built-in ip4 unicast rx feature path definition */
/* *INDENT-OFF* */
VNET_FEATURE_ARC_INIT (ip4_unicast, static) =
{
  .arc_name = "ip4-unicast",
  .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
  .arc_index_ptr = &ip4_main.lookup_main.ucast_feature_arc_index,
};

VNET_FEATURE_INIT (ip4_flow_classify, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-flow-classify",
  .runs_before = VNET_FEATURES ("ip4-inacl"),
};

VNET_FEATURE_INIT (ip4_inacl, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-inacl",
  .runs_before = VNET_FEATURES ("ip4-source-check-via-rx"),
};

VNET_FEATURE_INIT (ip4_source_check_1, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-source-check-via-rx",
  .runs_before = VNET_FEATURES ("ip4-source-check-via-any"),
};

VNET_FEATURE_INIT (ip4_source_check_2, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-source-check-via-any",
  .runs_before = VNET_FEATURES ("ip4-policer-classify"),
};

VNET_FEATURE_INIT (ip4_source_and_port_range_check_rx, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-source-and-port-range-check-rx",
  .runs_before = VNET_FEATURES ("ip4-policer-classify"),
};

VNET_FEATURE_INIT (ip4_policer_classify, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-policer-classify",
  .runs_before = VNET_FEATURES ("ipsec-input-ip4"),
};

VNET_FEATURE_INIT (ip4_ipsec, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ipsec-input-ip4",
  .runs_before = VNET_FEATURES ("vpath-input-ip4"),
};

VNET_FEATURE_INIT (ip4_vpath, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "vpath-input-ip4",
  .runs_before = VNET_FEATURES ("ip4-vxlan-bypass"),
};

VNET_FEATURE_INIT (ip4_vxlan_bypass, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-vxlan-bypass",
  .runs_before = VNET_FEATURES ("ip4-lookup"),
};

VNET_FEATURE_INIT (ip4_drop, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-drop",
  .runs_before = VNET_FEATURES ("ip4-lookup"),
};

VNET_FEATURE_INIT (ip4_lookup, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-lookup",
  .runs_before = 0,	/* not before any other features */
};

/* Built-in ip4 multicast rx feature path definition */
VNET_FEATURE_ARC_INIT (ip4_multicast, static) =
{
  .arc_name = "ip4-multicast",
  .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
  .arc_index_ptr = &ip4_main.lookup_main.mcast_feature_arc_index,
};

VNET_FEATURE_INIT (ip4_vpath_mc, static) =
{
  .arc_name = "ip4-multicast",
  .node_name = "vpath-input-ip4",
  .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
};

VNET_FEATURE_INIT (ip4_mc_drop, static) =
{
  .arc_name = "ip4-multicast",
  .node_name = "ip4-drop",
  .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
};

VNET_FEATURE_INIT (ip4_lookup_mc, static) =
{
  .arc_name = "ip4-multicast",
  .node_name = "ip4-mfib-forward-lookup",
  .runs_before = 0,	/* last feature */
};

/* Source and port-range check ip4 tx feature path definition */
VNET_FEATURE_ARC_INIT (ip4_output, static) =
{
  .arc_name = "ip4-output",
  .start_nodes = VNET_FEATURES ("ip4-rewrite", "ip4-midchain"),
  .arc_index_ptr = &ip4_main.lookup_main.output_feature_arc_index,
};

VNET_FEATURE_INIT (ip4_source_and_port_range_check_tx, static) =
{
  .arc_name = "ip4-output",
  .node_name = "ip4-source-and-port-range-check-tx",
  .runs_before = VNET_FEATURES ("ipsec-output-ip4"),
};

VNET_FEATURE_INIT (ip4_ipsec_output, static) =
{
  .arc_name = "ip4-output",
  .node_name = "ipsec-output-ip4",
  .runs_before = VNET_FEATURES ("interface-output"),
};

/* Built-in ip4 tx feature path definition */
VNET_FEATURE_INIT (ip4_interface_output, static) =
{
  .arc_name = "ip4-output",
  .node_name = "interface-output",
  .runs_before = 0,	/* not before any other features */
};
/* *INDENT-ON* */

static clib_error_t *
ip4_sw_interface_add_del (vnet_main_t * vnm, u32 sw_if_index, u32 is_add)
{
  ip4_main_t *im = &ip4_main;

  /* Fill in lookup tables with default table (0). */
  vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
  vec_validate (im->mfib_index_by_sw_if_index, sw_if_index);

  vnet_feature_enable_disable ("ip4-unicast", "ip4-drop", sw_if_index,
			       is_add, 0, 0);

  vnet_feature_enable_disable ("ip4-multicast", "ip4-drop", sw_if_index,
			       is_add, 0, 0);

  return /* no error */ 0;
}

VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip4_sw_interface_add_del);

/* Global IP4 main. */
ip4_main_t ip4_main;

clib_error_t *
ip4_lookup_init (vlib_main_t * vm)
{
  ip4_main_t *im = &ip4_main;
  clib_error_t *error;
  uword i;

  if ((error = vlib_call_init_function (vm, vnet_feature_init)))
    return error;

  for (i = 0; i < ARRAY_LEN (im->fib_masks); i++)
    {
      u32 m;

      if (i < 32)
	m = pow2_mask (i) << (32 - i);
      else
	m = ~0;
      im->fib_masks[i] = clib_host_to_net_u32 (m);
    }

  ip_lookup_init (&im->lookup_main, /* is_ip6 */ 0);

  /* Create FIB with index 0 and table id of 0. */
  fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0);
  mfib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0);

  {
    pg_node_t *pn;
    pn = pg_get_node (ip4_lookup_node.index);
    pn->unformat_edit = unformat_pg_ip4_header;
  }

  {
    ethernet_arp_header_t h;

    memset (&h, 0, sizeof (h));

    /* Set target ethernet address to all zeros. */
    memset (h.ip4_over_ethernet[1].ethernet, 0,
	    sizeof (h.ip4_over_ethernet[1].ethernet));

#define _16(f,v) h.f = clib_host_to_net_u16 (v);
#define _8(f,v) h.f = v;
    _16 (l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet);
    _16 (l3_type, ETHERNET_TYPE_IP4);
    _8 (n_l2_address_bytes, 6);
    _8 (n_l3_address_bytes, 4);
    _16 (opcode, ETHERNET_ARP_OPCODE_request);
#undef _16
#undef _8

    vlib_packet_template_init (vm, &im->ip4_arp_request_packet_template,
			       /* data */ &h,
			       sizeof (h),
			       /* alloc chunk size */ 8,
			       "ip4 arp");
  }

  return error;
}

VLIB_INIT_FUNCTION (ip4_lookup_init);

typedef struct
{
  /* Adjacency taken. */
  u32 dpo_index;
  u32 flow_hash;
  u32 fib_index;

  /* Packet data, possibly *after* rewrite. */
  u8 packet_data[64 - 1 * sizeof (u32)];
}
ip4_forward_next_trace_t;

u8 *
format_ip4_forward_next_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
  uword indent = format_get_indent (s);
  s = format (s, "%U%U",
	      format_white_space, indent,
	      format_ip4_header, t->packet_data, sizeof (t->packet_data));
  return s;
}

static u8 *
format_ip4_lookup_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
  uword indent = format_get_indent (s);

  s = format (s, "fib %d dpo-idx %d flow hash: 0x%08x",
	      t->fib_index, t->dpo_index, t->flow_hash);
  s = format (s, "\n%U%U",
	      format_white_space, indent,
	      format_ip4_header, t->packet_data, sizeof (t->packet_data));
  return s;
}

static u8 *
format_ip4_rewrite_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
  uword indent = format_get_indent (s);

  s = format (s, "tx_sw_if_index %d dpo-idx %d : %U flow hash: 0x%08x",
	      t->fib_index, t->dpo_index, format_ip_adjacency,
	      t->dpo_index, FORMAT_IP_ADJACENCY_NONE, t->flow_hash);
  s = format (s, "\n%U%U",
	      format_white_space, indent,
	      format_ip_adjacency_packet_data,
	      t->dpo_index, t->packet_data, sizeof (t->packet_data));
  return s;
}

/* Common trace function for all ip4-forward next nodes. */
void
ip4_forward_next_trace (vlib_main_t * vm,
			vlib_node_runtime_t * node,
			vlib_frame_t * frame, vlib_rx_or_tx_t which_adj_index)
{
  u32 *from, n_left;
  ip4_main_t *im = &ip4_main;

  n_left = frame->n_vectors;
  from = vlib_frame_vector_args (frame);

  while (n_left >= 4)
    {
      u32 bi0, bi1;
      vlib_buffer_t *b0, *b1;
      ip4_forward_next_trace_t *t0, *t1;

      /* Prefetch next iteration. */
      vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
      vlib_prefetch_buffer_with_index (vm, from[3], LOAD);

      bi0 = from[0];
      bi1 = from[1];

      b0 = vlib_get_buffer (vm, bi0);
      b1 = vlib_get_buffer (vm, bi1);

      if (b0->flags & VLIB_BUFFER_IS_TRACED)
	{
	  t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
	  t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
	  t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
	  t0->fib_index =
	    (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
	     (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
	    vec_elt (im->fib_index_by_sw_if_index,
		     vnet_buffer (b0)->sw_if_index[VLIB_RX]);

	  clib_memcpy (t0->packet_data,
		       vlib_buffer_get_current (b0),
		       sizeof (t0->packet_data));
	}
      if (b1->flags & VLIB_BUFFER_IS_TRACED)
	{
	  t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
	  t1->dpo_index = vnet_buffer (b1)->ip.adj_index[which_adj_index];
	  t1->flow_hash = vnet_buffer (b1)->ip.flow_hash;
	  t1->fib_index =
	    (vnet_buffer (b1)->sw_if_index[VLIB_TX] !=
	     (u32) ~ 0) ? vnet_buffer (b1)->sw_if_index[VLIB_TX] :
	    vec_elt (im->fib_index_by_sw_if_index,
		     vnet_buffer (b1)->sw_if_index[VLIB_RX]);
	  clib_memcpy (t1->packet_data, vlib_buffer_get_current (b1),
		       sizeof (t1->packet_data));
	}
      from += 2;
      n_left -= 2;
    }

  while (n_left >= 1)
    {
      u32 bi0;
      vlib_buffer_t *b0;
      ip4_forward_next_trace_t *t0;

      bi0 = from[0];

      b0 = vlib_get_buffer (vm, bi0);

      if (b0->flags & VLIB_BUFFER_IS_TRACED)
	{
	  t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
	  t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
	  t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
	  t0->fib_index =
	    (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
	     (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
	    vec_elt (im->fib_index_by_sw_if_index,
		     vnet_buffer (b0)->sw_if_index[VLIB_RX]);
	  clib_memcpy (t0->packet_data, vlib_buffer_get_current (b0),
		       sizeof (t0->packet_data));
	}
      from += 1;
      n_left -= 1;
    }
}

static uword
ip4_drop_or_punt (vlib_main_t * vm,
		  vlib_node_runtime_t * node,
		  vlib_frame_t * frame, ip4_error_t error_code)
{
  u32 *buffers = vlib_frame_vector_args (frame);
  uword n_packets = frame->n_vectors;

  vlib_error_drop_buffers (vm, node, buffers,
			   /* stride */ 1,
			   n_packets,
			   /* next */ 0,
			   ip4_input_node.index, error_code);

  if (node->flags & VLIB_NODE_FLAG_TRACE)
    ip4_forward_next_trace (vm, node, frame, VLIB_TX);

  return n_packets;
}

static uword
ip4_drop (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_DROP);
}

static uword
ip4_punt (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_PUNT);
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_drop_node, static) =
{
  .function = ip4_drop,.
  name = "ip4-drop",
  .vector_size = sizeof (u32),
  .format_trace = format_ip4_forward_next_trace,
  .n_next_nodes = 1,
  .next_nodes = {
    [0] = "error-drop",
  },
};

VLIB_NODE_FUNCTION_MULTIARCH (ip4_drop_node, ip4_drop);

VLIB_REGISTER_NODE (ip4_punt_node, static) =
{
  .function = ip4_punt,
  .name = "ip4-punt",
  .vector_size = sizeof (u32),
  .format_trace = format_ip4_forward_next_trace,
  .n_next_nodes = 1,
  .next_nodes = {
    [0] = "error-punt",
  },
};

VLIB_NODE_FUNCTION_MULTIARCH (ip4_punt_node, ip4_punt);
/* *INDENT-ON */

/* Compute TCP/UDP/ICMP4 checksum in software. */
u16
ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0,
			      ip4_header_t * ip0)
{
  ip_csum_t sum0;
  u32 ip_header_length, payload_length_host_byte_order;
  u32 n_this_buffer, n_bytes_left;
  u16 sum16;
  void *data_this_buffer;

  /* Initialize checksum with ip header. */
  ip_header_length = ip4_header_bytes (ip0);
  payload_length_host_byte_order =
    clib_net_to_host_u16 (ip0->length) - ip_header_length;
  sum0 =
    clib_host_to_net_u32 (payload_length_host_byte_order +
			  (ip0->protocol << 16));

  if (BITS (uword) == 32)
    {
      sum0 =
	ip_csum_with_carry (sum0,
			    clib_mem_unaligned (&ip0->src_address, u32));
      sum0 =
	ip_csum_with_carry (sum0,
			    clib_mem_unaligned (&ip0->dst_address, u32));
    }
  else
    sum0 =
      ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u64));

  n_bytes_left = n_this_buffer = payload_length_host_byte_order;
  data_this_buffer = (void *) ip0 + ip_header_length;
  if (n_this_buffer + ip_header_length > p0->current_length)
    n_this_buffer =
      p0->current_length >
      ip_header_length ? p0->current_length - ip_header_length : 0;
  while (1)
    {
      sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer);
      n_bytes_left -= n_this_buffer;
      if (n_bytes_left == 0)
	break;

      ASSERT (p0->flags & VLIB_BUFFER_NEXT_PRESENT);
      p0 = vlib_get_buffer (vm, p0->next_buffer);
      data_this_buffer = vlib_buffer_get_current (p0);
      n_this_buffer = p0->current_length;
    }

  sum16 = ~ip_csum_fold (sum0);

  return sum16;
}

u32
ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0)
{
  ip4_header_t *ip0 = vlib_buffer_get_current (p0);
  udp_header_t *udp0;
  u16 sum16;

  ASSERT (ip0->protocol == IP_PROTOCOL_TCP
	  || ip0->protocol == IP_PROTOCOL_UDP);

  udp0 = (void *) (ip0 + 1);
  if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0)
    {
      p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
		    | IP_BUFFER_L4_CHECKSUM_CORRECT);
      return p0->flags;
    }

  sum16 = ip4_tcp_udp_compute_checksum (vm, p0, ip0);

  p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
		| ((sum16 == 0) << LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT));

  return p0->flags;
}

/* *INDENT-OFF* */
VNET_FEATURE_ARC_INIT (ip4_local) =
{
  .arc_name  = "ip4-local",
  .start_nodes = VNET_FEATURES ("ip4-local"),
};
/* *INDENT-ON* */

static inline uword
ip4_local_inline (vlib_main_t * vm,
		  vlib_node_runtime_t * node,
		  vlib_frame_t * frame, int head_of_feature_arc)
{
  ip4_main_t *im = &ip4_main;
  ip_lookup_main_t *lm = &im->lookup_main;
  ip_local_next_t next_index;
  u32 *from, *to_next, n_left_from, n_left_to_next;
  vlib_node_runtime_t *error_node =
    vlib_node_get_runtime (vm, ip4_input_node.index);
  u8 arc_index = vnet_feat_arc_ip4_local.feature_arc_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  if (node->flags & VLIB_NODE_FLAG_TRACE)
    ip4_forward_next_trace (vm, node, frame, VLIB_TX);

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from >= 4 && n_left_to_next >= 2)
	{
	  vlib_buffer_t *p0, *p1;
	  ip4_header_t *ip0, *ip1;
	  udp_header_t *udp0, *udp1;
	  ip4_fib_mtrie_t *mtrie0, *mtrie1;
	  ip4_fib_mtrie_leaf_t leaf0, leaf1;
	  const dpo_id_t *dpo0, *dpo1;
	  const load_balance_t *lb0, *lb1;
	  u32 pi0, ip_len0, udp_len0, flags0, next0, fib_index0, lbi0;
	  u32 pi1, ip_len1, udp_len1, flags1, next1, fib_index1, lbi1;
	  i32 len_diff0, len_diff1;
	  u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
	  u8 error1, is_udp1, is_tcp_udp1, good_tcp_udp1, proto1;
	  u32 sw_if_index0, sw_if_index1;

	  pi0 = to_next[0] = from[0];
	  pi1 = to_next[1] = from[1];
	  from += 2;
	  n_left_from -= 2;
	  to_next += 2;
	  n_left_to_next -= 2;

	  next0 = next1 = IP_LOCAL_NEXT_DROP;

	  p0 = vlib_get_buffer (vm, pi0);
	  p1 = vlib_get_buffer (vm, pi1);

	  ip0 = vlib_buffer_get_current (p0);
	  ip1 = vlib_buffer_get_current (p1);

	  vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data;
	  vnet_buffer (p1)->ip.start_of_ip_header = p1->current_data;

	  sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
	  sw_if_index1 = vnet_buffer (p1)->sw_if_index[VLIB_RX];

	  fib_index0 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index0);
	  fib_index1 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index1);

	  fib_index0 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index0);
	  fib_index0 =
	    (vnet_buffer (p0)->sw_if_index[VLIB_TX] ==
	     (u32) ~ 0) ? fib_index0 : vnet_buffer (p0)->sw_if_index[VLIB_TX];

	  fib_index1 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index1);
	  fib_index1 =
	    (vnet_buffer (p1)->sw_if_index[VLIB_TX] ==
	     (u32) ~ 0) ? fib_index1 : vnet_buffer (p1)->sw_if_index[VLIB_TX];

	  mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
	  mtrie1 = &ip4_fib_get (fib_index1)->mtrie;

	  leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;

	  leaf0 =
	    ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0);
	  leaf1 =
	    ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 0);

	  /* Treat IP frag packets as "experimental" protocol for now
	     until support of IP frag reassembly is implemented */
	  proto0 = ip4_is_fragment (ip0) ? 0xfe : ip0->protocol;
	  proto1 = ip4_is_fragment (ip1) ? 0xfe : ip1->protocol;

	  if (head_of_feature_arc == 0)
	    {
	      error0 = error1 = IP4_ERROR_UNKNOWN_PROTOCOL;
	      goto skip_checks;
	    }

	  is_udp0 = proto0 == IP_PROTOCOL_UDP;
	  is_udp1 = proto1 == IP_PROTOCOL_UDP;
	  is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP;
	  is_tcp_udp1 = is_udp1 || proto1 == IP_PROTOCOL_TCP;

	  flags0 = p0->flags;
	  flags1 = p1->flags;

	  good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
	  good_tcp_udp1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;

	  udp0 = ip4_next_header (ip0);
	  udp1 = ip4_next_header (ip1);

	  /* Don't verify UDP checksum for packets with explicit zero checksum. */
	  good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
	  good_tcp_udp1 |= is_udp1 && udp1->checksum == 0;

	  leaf0 =
	    ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);
	  leaf1 =
	    ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 1);

	  /* Verify UDP length. */
	  ip_len0 = clib_net_to_host_u16 (ip0->length);
	  ip_len1 = clib_net_to_host_u16 (ip1->length);
	  udp_len0 = clib_net_to_host_u16 (udp0->length);
	  udp_len1 = clib_net_to_host_u16 (udp1->length);

	  len_diff0 = ip_len0 - udp_len0;
	  len_diff1 = ip_len1 - udp_len1;

	  len_diff0 = is_udp0 ? len_diff0 : 0;
	  len_diff1 = is_udp1 ? len_diff1 : 0;

	  if (PREDICT_FALSE (!(is_tcp_udp0 & is_tcp_udp1
			       & good_tcp_udp0 & good_tcp_udp1)))
	    {
	      if (is_tcp_udp0)
		{
		  if (is_tcp_udp0
		      && !(flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
		    flags0 = ip4_tcp_udp_validate_checksum (vm, p0);
		  good_tcp_udp0 =
		    (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
		  good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
		}
	      if (is_tcp_udp1)
		{
		  if (is_tcp_udp1
		      && !(flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
		    flags1 = ip4_tcp_udp_validate_checksum (vm, p1);
		  good_tcp_udp1 =
		    (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
		  good_tcp_udp1 |= is_udp1 && udp1->checksum == 0;
		}
	    }

	  good_tcp_udp0 &= len_diff0 >= 0;
	  good_tcp_udp1 &= len_diff1 >= 0;

	  leaf0 =
	    ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
	  leaf1 =
	    ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 2);

	  error0 = error1 = IP4_ERROR_UNKNOWN_PROTOCOL;

	  error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0;
	  error1 = len_diff1 < 0 ? IP4_ERROR_UDP_LENGTH : error1;

	  ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
	  error0 = (is_tcp_udp0 && !good_tcp_udp0
		    ? IP4_ERROR_TCP_CHECKSUM + is_udp0 : error0);
	  error1 = (is_tcp_udp1 && !good_tcp_udp1
		    ? IP4_ERROR_TCP_CHECKSUM + is_udp1 : error1);

	  leaf0 =
	    ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
	  leaf1 =
	    ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 3);
	  leaf0 =
	    (leaf0 ==
	     IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
	  leaf1 =
	    (leaf1 ==
	     IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1);

	  vnet_buffer (p0)->ip.adj_index[VLIB_RX] = lbi0 =
	    ip4_fib_mtrie_leaf_get_adj_index (leaf0);
	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = lbi0;

	  vnet_buffer (p1)->ip.adj_index[VLIB_RX] = lbi1 =
	    ip4_fib_mtrie_leaf_get_adj_index (leaf1);
	  vnet_buffer (p1)->ip.adj_index[VLIB_TX] = lbi1;

	  lb0 = load_balance_get (lbi0);
	  lb1 = load_balance_get (lbi1);
	  dpo0 = load_balance_get_bucket_i (lb0, 0);
	  dpo1 = load_balance_get_bucket_i (lb1, 0);

	  /*
	   * Must have a route to source otherwise we drop the packet.
	   * ip4 broadcasts are accepted, e.g. to make dhcp client work
	   *
	   * The checks are:
	   *  - the source is a recieve => it's from us => bogus, do this
	   *    first since it sets a different error code.
	   *  - uRPF check for any route to source - accept if passes.
	   *  - allow packets destined to the broadcast address from unknown sources
	   */
	  error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
		     dpo0->dpoi_type == DPO_RECEIVE) ?
		    IP4_ERROR_SPOOFED_LOCAL_PACKETS : error0);
	  error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
		     !fib_urpf_check_size (lb0->lb_urpf) &&
		     ip0->dst_address.as_u32 != 0xFFFFFFFF)
		    ? IP4_ERROR_SRC_LOOKUP_MISS : error0);
	  error1 = ((error1 == IP4_ERROR_UNKNOWN_PROTOCOL &&
		     dpo1->dpoi_type == DPO_RECEIVE) ?
		    IP4_ERROR_SPOOFED_LOCAL_PACKETS : error1);
	  error1 = ((error1 == IP4_ERROR_UNKNOWN_PROTOCOL &&
		     !fib_urpf_check_size (lb1->lb_urpf) &&
		     ip1->dst_address.as_u32 != 0xFFFFFFFF)
		    ? IP4_ERROR_SRC_LOOKUP_MISS : error1);

	skip_checks:

	  next0 = lm->local_next_by_ip_protocol[proto0];
	  next1 = lm->local_next_by_ip_protocol[proto1];

	  next0 =
	    error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
	  next1 =
	    error1 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next1;

	  p0->error = error0 ? error_node->errors[error0] : 0;
	  p1->error = error1 ? error_node->errors[error1] : 0;

	  if (head_of_feature_arc)
	    {
	      if (PREDICT_TRUE (error0 == (u8) IP4_ERROR_UNKNOWN_PROTOCOL))
		vnet_feature_arc_start (arc_index, sw_if_index0, &next0, p0);
	      if (PREDICT_TRUE (error1 == (u8) IP4_ERROR_UNKNOWN_PROTOCOL))
		vnet_feature_arc_start (arc_index, sw_if_index1, &next1, p1);
	    }

	  vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next,
					   n_left_to_next, pi0, pi1,
					   next0, next1);
	}

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  vlib_buffer_t *p0;
	  ip4_header_t *ip0;
	  udp_header_t *udp0;
	  ip4_fib_mtrie_t *mtrie0;
	  ip4_fib_mtrie_leaf_t leaf0;
	  u32 pi0, next0, ip_len0, udp_len0, flags0, fib_index0, lbi0;
	  i32 len_diff0;
	  u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
	  load_balance_t *lb0;
	  const dpo_id_t *dpo0;
	  u32 sw_if_index0;

	  pi0 = to_next[0] = from[0];
	  from += 1;
	  n_left_from -= 1;
	  to_next += 1;
	  n_left_to_next -= 1;

	  next0 = IP_LOCAL_NEXT_DROP;

	  p0 = vlib_get_buffer (vm, pi0);

	  ip0 = vlib_buffer_get_current (p0);

	  vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data;

	  sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];

	  fib_index0 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index0);

	  fib_index0 =
	    (vnet_buffer (p0)->sw_if_index[VLIB_TX] ==
	     (u32) ~ 0) ? fib_index0 : vnet_buffer (p0)->sw_if_index[VLIB_TX];

	  mtrie0 = &ip4_fib_get (fib_index0)->mtrie;

	  leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;

	  leaf0 =
	    ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0);

	  /* Treat IP frag packets as "experimental" protocol for now
	     until support of IP frag reassembly is implemented */
	  proto0 = ip4_is_fragment (ip0) ? 0xfe : ip0->protocol;

	  if (head_of_feature_arc == 0)
	    {
	      error0 = IP4_ERROR_UNKNOWN_PROTOCOL;
	      goto skip_check;
	    }

	  is_udp0 = proto0 == IP_PROTOCOL_UDP;
	  is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP;

	  flags0 = p0->flags;

	  good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;

	  udp0 = ip4_next_header (ip0);

	  /* Don't verify UDP checksum for packets with explicit zero checksum. */
	  good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;

	  leaf0 =
	    ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);

	  /* Verify UDP length. */
	  ip_len0 = clib_net_to_host_u16 (ip0->length);
	  udp_len0 = clib_net_to_host_u16 (udp0->length);

	  len_diff0 = ip_len0 - udp_len0;

	  len_diff0 = is_udp0 ? len_diff0 : 0;

	  if (PREDICT_FALSE (!(is_tcp_udp0 & good_tcp_udp0)))
	    {
	      if (is_tcp_udp0)
		{
		  if (is_tcp_udp0
		      && !(flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
		    flags0 = ip4_tcp_udp_validate_checksum (vm, p0);
		  good_tcp_udp0 =
		    (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
		  good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
		}
	    }

	  good_tcp_udp0 &= len_diff0 >= 0;

	  leaf0 =
	    ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);

	  error0 = IP4_ERROR_UNKNOWN_PROTOCOL;

	  error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0;

	  ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
	  error0 = (is_tcp_udp0 && !good_tcp_udp0
		    ? IP4_ERROR_TCP_CHECKSUM + is_udp0 : error0);

	  leaf0 =
	    ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
	  leaf0 =
	    (leaf0 ==
	     IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);

	  lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = lbi0;

	  lb0 = load_balance_get (lbi0);
	  dpo0 = load_balance_get_bucket_i (lb0, 0);

	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] =
	    vnet_buffer (p0)->ip.adj_index[VLIB_RX] = lbi0;

	  error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
		     dpo0->dpoi_type == DPO_RECEIVE) ?
		    IP4_ERROR_SPOOFED_LOCAL_PACKETS : error0);
	  error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
		     !fib_urpf_check_size (lb0->lb_urpf) &&
		     ip0->dst_address.as_u32 != 0xFFFFFFFF)
		    ? IP4_ERROR_SRC_LOOKUP_MISS : error0);

	skip_check:

	  next0 = lm->local_next_by_ip_protocol[proto0];

	  next0 =
	    error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;

	  p0->error = error0 ? error_node->errors[error0] : 0;

	  if (head_of_feature_arc)
	    {
	      if (PREDICT_TRUE (error0 == (u8) IP4_ERROR_UNKNOWN_PROTOCOL))
		vnet_feature_arc_start (arc_index, sw_if_index0, &next0, p0);
	    }

	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
					   n_left_to_next, pi0, next0);

	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

static uword
ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  return ip4_local_inline (vm, node, frame, 1 /* head of feature arc */ );
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_local_node) =
{
  .function = ip4_local,
  .name = "ip4-local",
  .vector_size = sizeof (u32),
  .format_trace = format_ip4_forward_next_trace,
  .n_next_nodes = IP_LOCAL_N_NEXT,
  .next_nodes =
  {
    [IP_LOCAL_NEXT_DROP] = "error-drop",
    [IP_LOCAL_NEXT_PUNT] = "error-punt",
    [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup",
    [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",},
};
/* *INDENT-ON* */

VLIB_NODE_FUNCTION_MULTIARCH (ip4_local_node, ip4_local);

static uword
ip4_local_end_of_arc (vlib_main_t * vm,
		      vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  return ip4_local_inline (vm, node, frame, 0 /* head of feature arc */ );
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_local_end_of_arc_node,static) = {
  .function = ip4_local_end_of_arc,
  .name = "ip4-local-end-of-arc",
  .vector_size = sizeof (u32),

  .format_trace = format_ip4_forward_next_trace,
  .sibling_of = "ip4-local",
};

VLIB_NODE_FUNCTION_MULTIARCH (ip4_local_end_of_arc_node, ip4_local_end_of_arc)

VNET_FEATURE_INIT (ip4_local_end_of_arc, static) = {
  .arc_name = "ip4-local",
  .node_name = "ip4-local-end-of-arc",
  .runs_before = 0, /* not before any other features */
};
/* *INDENT-ON* */

void
ip4_register_protocol (u32 protocol, u32 node_index)
{
  vlib_main_t *vm = vlib_get_main ();
  ip4_main_t *im = &ip4_main;
  ip_lookup_main_t *lm = &im->lookup_main;

  ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
  lm->local_next_by_ip_protocol[protocol] =
    vlib_node_add_next (vm, ip4_local_node.index, node_index);
}

static clib_error_t *
show_ip_local_command_fn (vlib_main_t * vm,
			  unformat_input_t * input, vlib_cli_command_t * cmd)
{
  ip4_main_t *im = &ip4_main;
  ip_lookup_main_t *lm = &im->lookup_main;
  int i;

  vlib_cli_output (vm, "Protocols handled by ip4_local");
  for (i = 0; i < ARRAY_LEN (lm->local_next_by_ip_protocol); i++)
    {
      if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT)
	vlib_cli_output (vm, "%d", i);
    }
  return 0;
}



/*?
 * Display the set of protocols handled by the local IPv4 stack.
 *
 * @cliexpar
 * Example of how to display local protocol table:
 * @cliexstart{show ip local}
 * Protocols handled by ip4_local
 * 1
 * 17
 * 47
 * @cliexend
?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_ip_local, static) =
{
  .path = "show ip local",
  .function = show_ip_local_command_fn,
  .short_help = "show ip local",
};
/* *INDENT-ON* */

always_inline uword
ip4_arp_inline (vlib_main_t * vm,
		vlib_node_runtime_t * node,
		vlib_frame_t * frame, int is_glean)
{
  vnet_main_t *vnm = vnet_get_main ();
  ip4_main_t *im = &ip4_main;
  ip_lookup_main_t *lm = &im->lookup_main;
  u32 *from, *to_next_drop;
  uword n_left_from, n_left_to_next_drop, next_index;
  static f64 time_last_seed_change = -1e100;
  static u32 hash_seeds[3];
  static uword hash_bitmap[256 / BITS (uword)];
  f64 time_now;

  if (node->flags & VLIB_NODE_FLAG_TRACE)
    ip4_forward_next_trace (vm, node, frame, VLIB_TX);

  time_now = vlib_time_now (vm);
  if (time_now - time_last_seed_change > 1e-3)
    {
      uword i;
      u32 *r = clib_random_buffer_get_data (&vm->random_buffer,
					    sizeof (hash_seeds));
      for (i = 0; i < ARRAY_LEN (hash_seeds); i++)
	hash_seeds[i] = r[i];

      /* Mark all hash keys as been no-seen before. */
      for (i = 0; i < ARRAY_LEN (hash_bitmap); i++)
	hash_bitmap[i] = 0;

      time_last_seed_change = time_now;
    }

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  if (next_index == IP4_ARP_NEXT_DROP)
    next_index = IP4_ARP_N_NEXT;	/* point to first interface */

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, IP4_ARP_NEXT_DROP,
			   to_next_drop, n_left_to_next_drop);

      while (n_left_from > 0 && n_left_to_next_drop > 0)
	{
	  u32 pi0, adj_index0, a0, b0, c0, m0, sw_if_index0, drop0;
	  ip_adjacency_t *adj0;
	  vlib_buffer_t *p0;
	  ip4_header_t *ip0;
	  uword bm0;

	  pi0 = from[0];

	  p0 = vlib_get_buffer (vm, pi0);

	  adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
	  adj0 = ip_get_adjacency (lm, adj_index0);
	  ip0 = vlib_buffer_get_current (p0);

	  a0 = hash_seeds[0];
	  b0 = hash_seeds[1];
	  c0 = hash_seeds[2];

	  sw_if_index0 = adj0->rewrite_header.sw_if_index;
	  vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0;

	  if (is_glean)
	    {
	      /*
	       * this is the Glean case, so we are ARPing for the
	       * packet's destination
	       */
	      a0 ^= ip0->dst_address.data_u32;
	    }
	  else
	    {
	      a0 ^= adj0->sub_type.nbr.next_hop.ip4.data_u32;
	    }
	  b0 ^= sw_if_index0;

	  hash_v3_finalize32 (a0, b0, c0);

	  c0 &= BITS (hash_bitmap) - 1;
	  c0 = c0 / BITS (uword);
	  m0 = (uword) 1 << (c0 % BITS (uword));

	  bm0 = hash_bitmap[c0];
	  drop0 = (bm0 & m0) != 0;

	  /* Mark it as seen. */
	  hash_bitmap[c0] = bm0 | m0;

	  from += 1;
	  n_left_from -= 1;
	  to_next_drop[0] = pi0;
	  to_next_drop += 1;
	  n_left_to_next_drop -= 1;

	  p0->error =
	    node->errors[drop0 ? IP4_ARP_ERROR_DROP :
			 IP4_ARP_ERROR_REQUEST_SENT];

	  /*
	   * the adj has been updated to a rewrite but the node the DPO that got
	   * us here hasn't - yet. no big deal. we'll drop while we wait.
	   */
	  if (IP_LOOKUP_NEXT_REWRITE == adj0->lookup_next_index)
	    continue;

	  if (drop0)
	    continue;

	  /*
	   * Can happen if the control-plane is programming tables
	   * with traffic flowing; at least that's today's lame excuse.
	   */
	  if ((is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_GLEAN)
	      || (!is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP))
	    {
	      p0->error = node->errors[IP4_ARP_ERROR_NON_ARP_ADJ];
	    }
	  else
	    /* Send ARP request. */
	    {
	      u32 bi0 = 0;
	      vlib_buffer_t *b0;
	      ethernet_arp_header_t *h0;
	      vnet_hw_interface_t *hw_if0;

	      h0 =
		vlib_packet_template_get_packet (vm,
						 &im->ip4_arp_request_packet_template,
						 &bi0);

	      /* Add rewrite/encap string for ARP packet. */
	      vnet_rewrite_one_header (adj0[0], h0,
				       sizeof (ethernet_header_t));

	      hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);

	      /* Src ethernet address in ARP header. */
	      clib_memcpy (h0->ip4_over_ethernet[0].ethernet,
			   hw_if0->hw_address,
			   sizeof (h0->ip4_over_ethernet[0].ethernet));

	      if (is_glean)
		{
		  /* The interface's source address is stashed in the Glean Adj */
		  h0->ip4_over_ethernet[0].ip4 =
		    adj0->sub_type.glean.receive_addr.ip4;

		  /* Copy in destination address we are requesting. This is the
		   * glean case, so it's the packet's destination.*/
		  h0->ip4_over_ethernet[1].ip4.data_u32 =
		    ip0->dst_address.data_u32;
		}
	      else
		{
		  /* Src IP address in ARP header. */
		  if (ip4_src_address_for_packet (lm, sw_if_index0,
						  &h0->
						  ip4_over_ethernet[0].ip4))
		    {
		      /* No source address available */
		      p0->error =
			node->errors[IP4_ARP_ERROR_NO_SOURCE_ADDRESS];
		      vlib_buffer_free (vm, &bi0, 1);
		      continue;
		    }

		  /* Copy in destination address we are requesting from the
		     incomplete adj */
		  h0->ip4_over_ethernet[1].ip4.data_u32 =
		    adj0->sub_type.nbr.next_hop.ip4.as_u32;
		}

	      vlib_buffer_copy_trace_flag (vm, p0, bi0);
	      b0 = vlib_get_buffer (vm, bi0);
	      vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;

	      vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes);

	      vlib_set_next_frame_buffer (vm, node,
					  adj0->rewrite_header.next_index,
					  bi0);
	    }
	}

      vlib_put_next_frame (vm, node, IP4_ARP_NEXT_DROP, n_left_to_next_drop);
    }

  return frame->n_vectors;
}

static uword
ip4_arp (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  return (ip4_arp_inline (vm, node, frame, 0));
}

static uword
ip4_glean (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  return (ip4_arp_inline (vm, node, frame, 1));
}

static char *ip4_arp_error_strings[] = {
  [IP4_ARP_ERROR_DROP] = "address overflow drops",
  [IP4_ARP_ERROR_REQUEST_SENT] = "ARP requests sent",
  [IP4_ARP_ERROR_NON_ARP_ADJ] = "ARPs to non-ARP adjacencies",
  [IP4_ARP_ERROR_REPLICATE_DROP] = "ARP replication completed",
  [IP4_ARP_ERROR_REPLICATE_FAIL] = "ARP replication failed",
  [IP4_ARP_ERROR_NO_SOURCE_ADDRESS] = "no source address for ARP request",
};

VLIB_REGISTER_NODE (ip4_arp_node) =
{
  .function = ip4_arp,.name = "ip4-arp",.vector_size =
    sizeof (u32),.format_trace = format_ip4_forward_next_trace,.n_errors =
    ARRAY_LEN (ip4_arp_error_strings),.error_strings =
    ip4_arp_error_strings,.n_next_nodes = IP4_ARP_N_NEXT,.next_nodes =
  {
  [IP4_ARP_NEXT_DROP] = "error-drop",}
,};

VLIB_REGISTER_NODE (ip4_glean_node) =
{
  .function = ip4_glean,.name = "ip4-glean",.vector_size =
    sizeof (u32),.format_trace = format_ip4_forward_next_trace,.n_errors =
    ARRAY_LEN (ip4_arp_error_strings),.error_strings =
    ip4_arp_error_strings,.n_next_nodes = IP4_ARP_N_NEXT,.next_nodes =
  {
  [IP4_ARP_NEXT_DROP] = "error-drop",}
,};

#define foreach_notrace_ip4_arp_error           \
_(DROP)                                         \
_(REQUEST_SENT)                                 \
_(REPLICATE_DROP)                               \
_(REPLICATE_FAIL)

clib_error_t *
arp_notrace_init (vlib_main_t * vm)
{
  vlib_node_runtime_t *rt = vlib_node_get_runtime (vm, ip4_arp_node.index);

  /* don't trace ARP request packets */
#define _(a)                                    \
    vnet_pcap_drop_trace_filter_add_del         \
        (rt->errors[IP4_ARP_ERROR_##a],         \
         1 /* is_add */);
  foreach_notrace_ip4_arp_error;
#undef _
  return 0;
}

VLIB_INIT_FUNCTION (arp_notrace_init);


/* Send an ARP request to see if given destination is reachable on given interface. */
clib_error_t *
ip4_probe_neighbor (vlib_main_t * vm, ip4_address_t * dst, u32 sw_if_index)
{
  vnet_main_t *vnm = vnet_get_main ();
  ip4_main_t *im = &ip4_main;
  ethernet_arp_header_t *h;
  ip4_address_t *src;
  ip_interface_address_t *ia;
  ip_adjacency_t *adj;
  vnet_hw_interface_t *hi;
  vnet_sw_interface_t *si;
  vlib_buffer_t *b;
  u32 bi = 0;

  si = vnet_get_sw_interface (vnm, sw_if_index);

  if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
    {
      return clib_error_return (0, "%U: interface %U down",
				format_ip4_address, dst,
				format_vnet_sw_if_index_name, vnm,
				sw_if_index);
    }

  src =
    ip4_interface_address_matching_destination (im, dst, sw_if_index, &ia);
  if (!src)
    {
      vnm->api_errno = VNET_API_ERROR_NO_MATCHING_INTERFACE;
      return clib_error_return
	(0,
	 "no matching interface address for destination %U (interface %U)",
	 format_ip4_address, dst, format_vnet_sw_if_index_name, vnm,
	 sw_if_index);
    }

  adj = ip_get_adjacency (&im->lookup_main, ia->neighbor_probe_adj_index);

  h =
    vlib_packet_template_get_packet (vm,
				     &im->ip4_arp_request_packet_template,
				     &bi);

  hi = vnet_get_sup_hw_interface (vnm, sw_if_index);

  clib_memcpy (h->ip4_over_ethernet[0].ethernet, hi->hw_address,
	       sizeof (h->ip4_over_ethernet[0].ethernet));

  h->ip4_over_ethernet[0].ip4 = src[0];
  h->ip4_over_ethernet[1].ip4 = dst[0];

  b = vlib_get_buffer (vm, bi);
  vnet_buffer (b)->sw_if_index[VLIB_RX] =
    vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index;

  /* Add encapsulation string for software interface (e.g. ethernet header). */
  vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t));
  vlib_buffer_advance (b, -adj->rewrite_header.data_bytes);

  {
    vlib_frame_t *f = vlib_get_frame_to_node (vm, hi->output_node_index);
    u32 *to_next = vlib_frame_vector_args (f);
    to_next[0] = bi;
    f->n_vectors = 1;
    vlib_put_frame_to_node (vm, hi->output_node_index, f);
  }

  return /* no error */ 0;
}

typedef enum
{
  IP4_REWRITE_NEXT_DROP,
  IP4_REWRITE_NEXT_ICMP_ERROR,
} ip4_rewrite_next_t;

always_inline uword
ip4_rewrite_inline (vlib_main_t * vm,
		    vlib_node_runtime_t * node,
		    vlib_frame_t * frame,
		    int do_counters, int is_midchain, int is_mcast)
{
  ip_lookup_main_t *lm = &ip4_main.lookup_main;
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  vlib_node_runtime_t *error_node =
    vlib_node_get_runtime (vm, ip4_input_node.index);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  u32 cpu_index = os_get_cpu_number ();

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from >= 4 && n_left_to_next >= 2)
	{
	  ip_adjacency_t *adj0, *adj1;
	  vlib_buffer_t *p0, *p1;
	  ip4_header_t *ip0, *ip1;
	  u32 pi0, rw_len0, next0, error0, checksum0, adj_index0;
	  u32 pi1, rw_len1, next1, error1, checksum1, adj_index1;
	  u32 tx_sw_if_index0, tx_sw_if_index1;

	  /* Prefetch next iteration. */
	  {
	    vlib_buffer_t *p2, *p3;

	    p2 = vlib_get_buffer (vm, from[2]);
	    p3 = vlib_get_buffer (vm, from[3]);

	    vlib_prefetch_buffer_header (p2, STORE);
	    vlib_prefetch_buffer_header (p3, STORE);

	    CLIB_PREFETCH (p2->data, sizeof (ip0[0]), STORE);
	    CLIB_PREFETCH (p3->data, sizeof (ip0[0]), STORE);
	  }

	  pi0 = to_next[0] = from[0];
	  pi1 = to_next[1] = from[1];

	  from += 2;
	  n_left_from -= 2;
	  to_next += 2;
	  n_left_to_next -= 2;

	  p0 = vlib_get_buffer (vm, pi0);
	  p1 = vlib_get_buffer (vm, pi1);

	  adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
	  adj_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_TX];

	  /*
	   * pre-fetch the per-adjacency counters
	   */
	  if (do_counters)
	    {
	      vlib_prefetch_combined_counter (&adjacency_counters,
					      cpu_index, adj_index0);
	      vlib_prefetch_combined_counter (&adjacency_counters,
					      cpu_index, adj_index1);
	    }

	  /* We should never rewrite a pkt using the MISS adjacency */
	  ASSERT (adj_index0 && adj_index1);

	  ip0 = vlib_buffer_get_current (p0);
	  ip1 = vlib_buffer_get_current (p1);

	  error0 = error1 = IP4_ERROR_NONE;
	  next0 = next1 = IP4_REWRITE_NEXT_DROP;

	  /* Decrement TTL & update checksum.
	     Works either endian, so no need for byte swap. */
	  if (PREDICT_TRUE (!(p0->flags & VNET_BUFFER_LOCALLY_ORIGINATED)))
	    {
	      i32 ttl0 = ip0->ttl;

	      /* Input node should have reject packets with ttl 0. */
	      ASSERT (ip0->ttl > 0);

	      checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);
	      checksum0 += checksum0 >= 0xffff;

	      ip0->checksum = checksum0;
	      ttl0 -= 1;
	      ip0->ttl = ttl0;

	      /*
	       * If the ttl drops below 1 when forwarding, generate
	       * an ICMP response.
	       */
	      if (PREDICT_FALSE (ttl0 <= 0))
		{
		  error0 = IP4_ERROR_TIME_EXPIRED;
		  vnet_buffer (p0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
		  icmp4_error_set_vnet_buffer (p0, ICMP4_time_exceeded,
					       ICMP4_time_exceeded_ttl_exceeded_in_transit,
					       0);
		  next0 = IP4_REWRITE_NEXT_ICMP_ERROR;
		}

	      /* Verify checksum. */
	      ASSERT (ip0->checksum == ip4_header_checksum (ip0));
	    }
	  else
	    {
	      p0->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED;
	    }
	  if (PREDICT_TRUE (!(p1->flags & VNET_BUFFER_LOCALLY_ORIGINATED)))
	    {
	      i32 ttl1 = ip1->ttl;

	      /* Input node should have reject packets with ttl 0. */
	      ASSERT (ip1->ttl > 0);

	      checksum1 = ip1->checksum + clib_host_to_net_u16 (0x0100);
	      checksum1 += checksum1 >= 0xffff;

	      ip1->checksum = checksum1;
	      ttl1 -= 1;
	      ip1->ttl = ttl1;

	      /*
	       * If the ttl drops below 1 when forwarding, generate
	       * an ICMP response.
	       */
	      if (PREDICT_FALSE (ttl1 <= 0))
		{
		  error1 = IP4_ERROR_TIME_EXPIRED;
		  vnet_buffer (p1)->sw_if_index[VLIB_TX] = (u32) ~ 0;
		  icmp4_error_set_vnet_buffer (p1, ICMP4_time_exceeded,
					       ICMP4_time_exceeded_ttl_exceeded_in_transit,
					       0);
		  next1 = IP4_REWRITE_NEXT_ICMP_ERROR;
		}

	      /* Verify checksum. */
	      ASSERT (ip0->checksum == ip4_header_checksum (ip0));
	      ASSERT (ip1->checksum == ip4_header_checksum (ip1));
	    }
	  else
	    {
	      p1->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED;
	    }

	  /* Rewrite packet header and updates lengths. */
	  adj0 = ip_get_adjacency (lm, adj_index0);
	  adj1 = ip_get_adjacency (lm, adj_index1);

	  /* Worth pipelining. No guarantee that adj0,1 are hot... */
	  rw_len0 = adj0[0].rewrite_header.data_bytes;
	  rw_len1 = adj1[0].rewrite_header.data_bytes;
	  vnet_buffer (p0)->ip.save_rewrite_length = rw_len0;
	  vnet_buffer (p1)->ip.save_rewrite_length = rw_len1;

	  /* Check MTU of outgoing interface. */
	  error0 =
	    (vlib_buffer_length_in_chain (vm, p0) >
	     adj0[0].
	     rewrite_header.max_l3_packet_bytes ? IP4_ERROR_MTU_EXCEEDED :
	     error0);
	  error1 =
	    (vlib_buffer_length_in_chain (vm, p1) >
	     adj1[0].
	     rewrite_header.max_l3_packet_bytes ? IP4_ERROR_MTU_EXCEEDED :
	     error1);

	  /* Don't adjust the buffer for ttl issue; icmp-error node wants
	   * to see the IP headerr */
	  if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
	    {
	      next0 = adj0[0].rewrite_header.next_index;
	      p0->current_data -= rw_len0;
	      p0->current_length += rw_len0;
	      tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
	      vnet_buffer (p0)->sw_if_index[VLIB_TX] = tx_sw_if_index0;

	      if (PREDICT_FALSE
		  (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
		vnet_feature_arc_start (lm->output_feature_arc_index,
					tx_sw_if_index0, &next0, p0);
	    }
	  if (PREDICT_TRUE (error1 == IP4_ERROR_NONE))
	    {
	      next1 = adj1[0].rewrite_header.next_index;
	      p1->current_data -= rw_len1;
	      p1->current_length += rw_len1;

	      tx_sw_if_index1 = adj1[0].rewrite_header.sw_if_index;
	      vnet_buffer (p1)->sw_if_index[VLIB_TX] = tx_sw_if_index1;

	      if (PREDICT_FALSE
		  (adj1[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
		vnet_feature_arc_start (lm->output_feature_arc_index,
					tx_sw_if_index1, &next1, p1);
	    }

	  /* Guess we are only writing on simple Ethernet header. */
	  vnet_rewrite_two_headers (adj0[0], adj1[0],
				    ip0, ip1, sizeof (ethernet_header_t));

	  /*
	   * Bump the per-adjacency counters
	   */
	  if (do_counters)
	    {
	      vlib_increment_combined_counter
		(&adjacency_counters,
		 cpu_index,
		 adj_index0, 1,
		 vlib_buffer_length_in_chain (vm, p0) + rw_len0);

	      vlib_increment_combined_counter
		(&adjacency_counters,
		 cpu_index,
		 adj_index1, 1,
		 vlib_buffer_length_in_chain (vm, p1) + rw_len1);
	    }

	  if (is_midchain)
	    {
	      adj0->sub_type.midchain.fixup_func (vm, adj0, p0);
	      adj1->sub_type.midchain.fixup_func (vm, adj1, p1);
	    }
	  if (is_mcast)
	    {
	      /*
	       * copy bytes from the IP address into the MAC rewrite
	       */
	      vnet_fixup_one_header (adj0[0], &ip0->dst_address, ip0);
	      vnet_fixup_one_header (adj1[0], &ip1->dst_address, ip1);
	    }

	  vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
					   to_next, n_left_to_next,
					   pi0, pi1, next0, next1);
	}

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  ip_adjacency_t *adj0;
	  vlib_buffer_t *p0;
	  ip4_header_t *ip0;
	  u32 pi0, rw_len0, adj_index0, next0, error0, checksum0;
	  u32 tx_sw_if_index0;

	  pi0 = to_next[0] = from[0];

	  p0 = vlib_get_buffer (vm, pi0);

	  adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];

	  /* We should never rewrite a pkt using the MISS adjacency */
	  ASSERT (adj_index0);

	  adj0 = ip_get_adjacency (lm, adj_index0);

	  ip0 = vlib_buffer_get_current (p0);

	  error0 = IP4_ERROR_NONE;
	  next0 = IP4_REWRITE_NEXT_DROP;	/* drop on error */

	  /* Decrement TTL & update checksum. */
	  if (PREDICT_TRUE (!(p0->flags & VNET_BUFFER_LOCALLY_ORIGINATED)))
	    {
	      i32 ttl0 = ip0->ttl;

	      checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);

	      checksum0 += checksum0 >= 0xffff;

	      ip0->checksum = checksum0;

	      ASSERT (ip0->ttl > 0);

	      ttl0 -= 1;

	      ip0->ttl = ttl0;

	      ASSERT (ip0->checksum == ip4_header_checksum (ip0));

	      if (PREDICT_FALSE (ttl0 <= 0))
		{
		  /*
		   * If the ttl drops below 1 when forwarding, generate
		   * an ICMP response.
		   */
		  error0 = IP4_ERROR_TIME_EXPIRED;
		  next0 = IP4_REWRITE_NEXT_ICMP_ERROR;
		  vnet_buffer (p0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
		  icmp4_error_set_vnet_buffer (p0, ICMP4_time_exceeded,
					       ICMP4_time_exceeded_ttl_exceeded_in_transit,
					       0);
		}
	    }
	  else
	    {
	      p0->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED;
	    }

	  if (do_counters)
	    vlib_prefetch_combined_counter (&adjacency_counters,
					    cpu_index, adj_index0);

	  /* Guess we are only writing on simple Ethernet header. */
	  vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
	  if (is_mcast)
	    {
	      /*
	       * copy bytes from the IP address into the MAC rewrite
	       */
	      vnet_fixup_one_header (adj0[0], &ip0->dst_address, ip0);
	    }

	  /* Update packet buffer attributes/set output interface. */
	  rw_len0 = adj0[0].rewrite_header.data_bytes;
	  vnet_buffer (p0)->ip.save_rewrite_length = rw_len0;

	  if (do_counters)
	    vlib_increment_combined_counter
	      (&adjacency_counters,
	       cpu_index, adj_index0, 1,
	       vlib_buffer_length_in_chain (vm, p0) + rw_len0);

	  /* Check MTU of outgoing interface. */
	  error0 = (vlib_buffer_length_in_chain (vm, p0)
		    > adj0[0].rewrite_header.max_l3_packet_bytes
		    ? IP4_ERROR_MTU_EXCEEDED : error0);

	  p0->error = error_node->errors[error0];

	  /* Don't adjust the buffer for ttl issue; icmp-error node wants
	   * to see the IP headerr */
	  if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
	    {
	      p0->current_data -= rw_len0;
	      p0->current_length += rw_len0;
	      tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;

	      vnet_buffer (p0)->sw_if_index[VLIB_TX] = tx_sw_if_index0;
	      next0 = adj0[0].rewrite_header.next_index;

	      if (is_midchain)
		{
		  adj0->sub_type.midchain.fixup_func (vm, adj0, p0);
		}

	      if (PREDICT_FALSE
		  (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
		vnet_feature_arc_start (lm->output_feature_arc_index,
					tx_sw_if_index0, &next0, p0);

	    }

	  from += 1;
	  n_left_from -= 1;
	  to_next += 1;
	  n_left_to_next -= 1;

	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next,
					   pi0, next0);
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  /* Need to do trace after rewrites to pick up new packet data. */
  if (node->flags & VLIB_NODE_FLAG_TRACE)
    ip4_forward_next_trace (vm, node, frame, VLIB_TX);

  return frame->n_vectors;
}


/** @brief IPv4 rewrite node.
    @node ip4-rewrite

    This is the IPv4 transit-rewrite node: decrement TTL, fix the ipv4
    header checksum, fetch the ip adjacency, check the outbound mtu,
    apply the adjacency rewrite, and send pkts to the adjacency
    rewrite header's rewrite_next_index.

    @param vm vlib_main_t corresponding to the current thread
    @param node vlib_node_runtime_t
    @param frame vlib_frame_t whose contents should be dispatched

    @par Graph mechanics: buffer metadata, next index usage

    @em Uses:
    - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
        - the rewrite adjacency index
    - <code>adj->lookup_next_index</code>
        - Must be IP_LOOKUP_NEXT_REWRITE or IP_LOOKUP_NEXT_ARP, otherwise
          the packet will be dropped.
    - <code>adj->rewrite_header</code>
        - Rewrite string length, rewrite string, next_index

    @em Sets:
    - <code>b->current_data, b->current_length</code>
        - Updated net of applying the rewrite string

    <em>Next Indices:</em>
    - <code> adj->rewrite_header.next_index </code>
      or @c error-drop
*/
static uword
ip4_rewrite (vlib_main_t * vm,
	     vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  if (adj_are_counters_enabled ())
    return ip4_rewrite_inline (vm, node, frame, 1, 0, 0);
  else
    return ip4_rewrite_inline (vm, node, frame, 0, 0, 0);
}

static uword
ip4_midchain (vlib_main_t * vm,
	      vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  if (adj_are_counters_enabled ())
    return ip4_rewrite_inline (vm, node, frame, 1, 1, 0);
  else
    return ip4_rewrite_inline (vm, node, frame, 0, 1, 0);
}

static uword
ip4_rewrite_mcast (vlib_main_t * vm,
		   vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  if (adj_are_counters_enabled ())
    return ip4_rewrite_inline (vm, node, frame, 1, 0, 1);
  else
    return ip4_rewrite_inline (vm, node, frame, 0, 0, 1);
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_rewrite_node) = {
  .function = ip4_rewrite,
  .name = "ip4-rewrite",
  .vector_size = sizeof (u32),

  .format_trace = format_ip4_rewrite_trace,

  .n_next_nodes = 2,
  .next_nodes = {
    [IP4_REWRITE_NEXT_DROP] = "error-drop",
    [IP4_REWRITE_NEXT_ICMP_ERROR] = "ip4-icmp-error",
  },
};
VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_node, ip4_rewrite)

VLIB_REGISTER_NODE (ip4_rewrite_mcast_node) = {
  .function = ip4_rewrite_mcast,
  .name = "ip4-rewrite-mcast",
  .vector_size = sizeof (u32),

  .format_trace = format_ip4_rewrite_trace,
  .sibling_of = "ip4-rewrite",
};
VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_mcast_node, ip4_rewrite_mcast)

VLIB_REGISTER_NODE (ip4_midchain_node) = {
  .function = ip4_midchain,
  .name = "ip4-midchain",
  .vector_size = sizeof (u32),
  .format_trace = format_ip4_forward_next_trace,
  .sibling_of =  "ip4-rewrite",
};
VLIB_NODE_FUNCTION_MULTIARCH (ip4_midchain_node, ip4_midchain);
/* *INDENT-ON */

static clib_error_t *
add_del_interface_table (vlib_main_t * vm,
			 unformat_input_t * input, vlib_cli_command_t * cmd)
{
  vnet_main_t *vnm = vnet_get_main ();
  ip_interface_address_t *ia;
  clib_error_t *error = 0;
  u32 sw_if_index, table_id;

  sw_if_index = ~0;

  if (!unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
    {
      error = clib_error_return (0, "unknown interface `%U'",
				 format_unformat_error, input);
      goto done;
    }

  if (unformat (input, "%d", &table_id))
    ;
  else
    {
      error = clib_error_return (0, "expected table id `%U'",
				 format_unformat_error, input);
      goto done;
    }

  /*
   * If the interface already has in IP address, then a change int
   * VRF is not allowed. The IP address applied must first be removed.
   * We do not do that automatically here, since VPP has no knowledge
   * of whether thoses subnets are valid in the destination VRF.
   */
  /* *INDENT-OFF* */
  foreach_ip_interface_address (&ip4_main.lookup_main,
                                ia, sw_if_index,
                                1 /* honor unnumbered */,
  ({
      ip4_address_t * a;

      a = ip_interface_address_get_address (&ip4_main.lookup_main, ia);
      error = clib_error_return (0, "interface %U has address %U",
                                 format_vnet_sw_if_index_name, vnm,
                                 sw_if_index,
                                 format_ip4_address, a);
      goto done;
   }));
   /* *INDENT-ON* */

{
  ip4_main_t *im = &ip4_main;
  u32 fib_index;

  fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, table_id);

  vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
  im->fib_index_by_sw_if_index[sw_if_index] = fib_index;

  fib_index = mfib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, table_id);
  vec_validate (im->mfib_index_by_sw_if_index, sw_if_index);
  im->mfib_index_by_sw_if_index[sw_if_index] = fib_index;
}

done:
return error;
}

/*?
 * Place the indicated interface into the supplied IPv4 FIB table (also known
 * as a VRF). If the FIB table does not exist, this command creates it. To
 * display the current IPv4 FIB table, use the command '<em>show ip fib</em>'.
 * FIB table will only be displayed if a route has been added to the table, or
 * an IP Address is assigned to an interface in the table (which adds a route
 * automatically).
 *
 * @note IP addresses added after setting the interface IP table are added to
 * the indicated FIB table. If an IP address is added prior to changing the
 * table then this is an error. The control plane must remove these addresses
 * first and then change the table. VPP will not automatically move the
 * addresses from the old to the new table as it does not know the validity
 * of such a change.
 *
 * @cliexpar
 * Example of how to add an interface to an IPv4 FIB table (where 2 is the table-id):
 * @cliexcmd{set interface ip table GigabitEthernet2/0/0 2}
 ?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (set_interface_ip_table_command, static) =
{
  .path = "set interface ip table",
  .function = add_del_interface_table,
  .short_help = "set interface ip table <interface> <table-id>",
};
/* *INDENT-ON* */

int
ip4_lookup_validate (ip4_address_t * a, u32 fib_index0)
{
  ip4_fib_mtrie_t *mtrie0;
  ip4_fib_mtrie_leaf_t leaf0;
  u32 lbi0;

  mtrie0 = &ip4_fib_get (fib_index0)->mtrie;

  leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
  leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 0);
  leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 1);
  leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 2);
  leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 3);

  /* Handle default route. */
  leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);

  lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);

  return lbi0 == ip4_fib_table_lookup_lb (ip4_fib_get (fib_index0), a);
}

static clib_error_t *
test_lookup_command_fn (vlib_main_t * vm,
			unformat_input_t * input, vlib_cli_command_t * cmd)
{
  ip4_fib_t *fib;
  u32 table_id = 0;
  f64 count = 1;
  u32 n;
  int i;
  ip4_address_t ip4_base_address;
  u64 errors = 0;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "table %d", &table_id))
	{
	  /* Make sure the entry exists. */
	  fib = ip4_fib_get (table_id);
	  if ((fib) && (fib->index != table_id))
	    return clib_error_return (0, "<fib-index> %d does not exist",
				      table_id);
	}
      else if (unformat (input, "count %f", &count))
	;

      else if (unformat (input, "%U",
			 unformat_ip4_address, &ip4_base_address))
	;
      else
	return clib_error_return (0, "unknown input `%U'",
				  format_unformat_error, input);
    }

  n = count;

  for (i = 0; i < n; i++)
    {
      if (!ip4_lookup_validate (&ip4_base_address, table_id))
	errors++;

      ip4_base_address.as_u32 =
	clib_host_to_net_u32 (1 +
			      clib_net_to_host_u32 (ip4_base_address.as_u32));
    }

  if (errors)
    vlib_cli_output (vm, "%llu errors out of %d lookups\n", errors, n);
  else
    vlib_cli_output (vm, "No errors in %d lookups\n", n);

  return 0;
}

/*?
 * Perform a lookup of an IPv4 Address (or range of addresses) in the
 * given FIB table to determine if there is a conflict with the
 * adjacency table. The fib-id can be determined by using the
 * '<em>show ip fib</em>' command. If fib-id is not entered, default value
 * of 0 is used.
 *
 * @todo This command uses fib-id, other commands use table-id (not
 * just a name, they are different indexes). Would like to change this
 * to table-id for consistency.
 *
 * @cliexpar
 * Example of how to run the test lookup command:
 * @cliexstart{test lookup 172.16.1.1 table 1 count 2}
 * No errors in 2 lookups
 * @cliexend
?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (lookup_test_command, static) =
{
  .path = "test lookup",
  .short_help = "test lookup <ipv4-addr> [table <fib-id>] [count <nn>]",
  .function = test_lookup_command_fn,
};
/* *INDENT-ON* */

int
vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config)
{
  ip4_main_t *im4 = &ip4_main;
  ip4_fib_t *fib;
  uword *p = hash_get (im4->fib_index_by_table_id, table_id);

  if (p == 0)
    return VNET_API_ERROR_NO_SUCH_FIB;

  fib = ip4_fib_get (p[0]);

  fib->flow_hash_config = flow_hash_config;
  return 0;
}

static clib_error_t *
set_ip_flow_hash_command_fn (vlib_main_t * vm,
			     unformat_input_t * input,
			     vlib_cli_command_t * cmd)
{
  int matched = 0;
  u32 table_id = 0;
  u32 flow_hash_config = 0;
  int rv;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "table %d", &table_id))
	matched = 1;
#define _(a,v) \
    else if (unformat (input, #a)) { flow_hash_config |= v; matched=1;}
      foreach_flow_hash_bit
#undef _
	else
	break;
    }

  if (matched == 0)
    return clib_error_return (0, "unknown input `%U'",
			      format_unformat_error, input);

  rv = vnet_set_ip4_flow_hash (table_id, flow_hash_config);
  switch (rv)
    {
    case 0:
      break;

    case VNET_API_ERROR_NO_SUCH_FIB:
      return clib_error_return (0, "no such FIB table %d", table_id);

    default:
      clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config);
      break;
    }

  return 0;
}

/*?
 * Configure the set of IPv4 fields used by the flow hash.
 *
 * @cliexpar
 * Example of how to set the flow hash on a given table:
 * @cliexcmd{set ip flow-hash table 7 dst sport dport proto}
 * Example of display the configured flow hash:
 * @cliexstart{show ip fib}
 * ipv4-VRF:0, fib_index 0, flow hash: src dst sport dport proto
 * 0.0.0.0/0
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:0 buckets:1 uRPF:0 to:[0:0]]
 *     [0] [@0]: dpo-drop ip6
 * 0.0.0.0/32
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:1 buckets:1 uRPF:1 to:[0:0]]
 *     [0] [@0]: dpo-drop ip6
 * 224.0.0.0/8
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:3 buckets:1 uRPF:3 to:[0:0]]
 *     [0] [@0]: dpo-drop ip6
 * 6.0.1.2/32
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:30 buckets:1 uRPF:29 to:[0:0]]
 *     [0] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
 * 7.0.0.1/32
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:31 buckets:4 uRPF:30 to:[0:0]]
 *     [0] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
 *     [1] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
 *     [2] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
 *     [3] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
 * 240.0.0.0/8
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:2 buckets:1 uRPF:2 to:[0:0]]
 *     [0] [@0]: dpo-drop ip6
 * 255.255.255.255/32
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:4 buckets:1 uRPF:4 to:[0:0]]
 *     [0] [@0]: dpo-drop ip6
 * ipv4-VRF:7, fib_index 1, flow hash: dst sport dport proto
 * 0.0.0.0/0
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:12 buckets:1 uRPF:11 to:[0:0]]
 *     [0] [@0]: dpo-drop ip6
 * 0.0.0.0/32
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:13 buckets:1 uRPF:12 to:[0:0]]
 *     [0] [@0]: dpo-drop ip6
 * 172.16.1.0/24
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:17 buckets:1 uRPF:16 to:[0:0]]
 *     [0] [@4]: ipv4-glean: af_packet0
 * 172.16.1.1/32
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:18 buckets:1 uRPF:17 to:[1:84]]
 *     [0] [@2]: dpo-receive: 172.16.1.1 on af_packet0
 * 172.16.1.2/32
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:21 buckets:1 uRPF:20 to:[0:0]]
 *     [0] [@5]: ipv4 via 172.16.1.2 af_packet0: IP4: 02:fe:9e:70:7a:2b -> 26:a5:f6:9c:3a:36
 * 172.16.2.0/24
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:19 buckets:1 uRPF:18 to:[0:0]]
 *     [0] [@4]: ipv4-glean: af_packet1
 * 172.16.2.1/32
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:20 buckets:1 uRPF:19 to:[0:0]]
 *     [0] [@2]: dpo-receive: 172.16.2.1 on af_packet1
 * 224.0.0.0/8
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:15 buckets:1 uRPF:14 to:[0:0]]
 *     [0] [@0]: dpo-drop ip6
 * 240.0.0.0/8
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:14 buckets:1 uRPF:13 to:[0:0]]
 *     [0] [@0]: dpo-drop ip6
 * 255.255.255.255/32
 *   unicast-ip4-chain
 *   [@0]: dpo-load-balance: [index:16 buckets:1 uRPF:15 to:[0:0]]
 *     [0] [@0]: dpo-drop ip6
 * @cliexend
?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (set_ip_flow_hash_command, static) =
{
  .path = "set ip flow-hash",
  .short_help =
  "set ip flow-hash table <table-id> [src] [dst] [sport] [dport] [proto] [reverse]",
  .function = set_ip_flow_hash_command_fn,
};
/* *INDENT-ON* */

int
vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index,
			     u32 table_index)
{
  vnet_main_t *vnm = vnet_get_main ();
  vnet_interface_main_t *im = &vnm->interface_main;
  ip4_main_t *ipm = &ip4_main;
  ip_lookup_main_t *lm = &ipm->lookup_main;
  vnet_classify_main_t *cm = &vnet_classify_main;
  ip4_address_t *if_addr;

  if (pool_is_free_index (im->sw_interfaces, sw_if_index))
    return VNET_API_ERROR_NO_MATCHING_INTERFACE;

  if (table_index != ~0 && pool_is_free_index (cm->tables, table_index))
    return VNET_API_ERROR_NO_SUCH_ENTRY;

  vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index);
  lm->classify_table_index_by_sw_if_index[sw_if_index] = table_index;

  if_addr = ip4_interface_first_address (ipm, sw_if_index, NULL);

  if (NULL != if_addr)
    {
      fib_prefix_t pfx = {
	.fp_len = 32,
	.fp_proto = FIB_PROTOCOL_IP4,
	.fp_addr.ip4 = *if_addr,
      };
      u32 fib_index;

      fib_index = fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP4,
						       sw_if_index);


      if (table_index != (u32) ~ 0)
	{
	  dpo_id_t dpo = DPO_INVALID;

	  dpo_set (&dpo,
		   DPO_CLASSIFY,
		   DPO_PROTO_IP4,
		   classify_dpo_create (DPO_PROTO_IP4, table_index));

	  fib_table_entry_special_dpo_add (fib_index,
					   &pfx,
					   FIB_SOURCE_CLASSIFY,
					   FIB_ENTRY_FLAG_NONE, &dpo);
	  dpo_reset (&dpo);
	}
      else
	{
	  fib_table_entry_special_remove (fib_index,
					  &pfx, FIB_SOURCE_CLASSIFY);
	}
    }

  return 0;
}

static clib_error_t *
set_ip_classify_command_fn (vlib_main_t * vm,
			    unformat_input_t * input,
			    vlib_cli_command_t * cmd)
{
  u32 table_index = ~0;
  int table_index_set = 0;
  u32 sw_if_index = ~0;
  int rv;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "table-index %d", &table_index))
	table_index_set = 1;
      else if (unformat (input, "intfc %U", unformat_vnet_sw_interface,
			 vnet_get_main (), &sw_if_index))
	;
      else
	break;
    }

  if (table_index_set == 0)
    return clib_error_return (0, "classify table-index must be specified");

  if (sw_if_index == ~0)
    return clib_error_return (0, "interface / subif must be specified");

  rv = vnet_set_ip4_classify_intfc (vm, sw_if_index, table_index);

  switch (rv)
    {
    case 0:
      break;

    case VNET_API_ERROR_NO_MATCHING_INTERFACE:
      return clib_error_return (0, "No such interface");

    case VNET_API_ERROR_NO_SUCH_ENTRY:
      return clib_error_return (0, "No such classifier table");
    }
  return 0;
}

/*?
 * Assign a classification table to an interface. The classification
 * table is created using the '<em>classify table</em>' and '<em>classify session</em>'
 * commands. Once the table is create, use this command to filter packets
 * on an interface.
 *
 * @cliexpar
 * Example of how to assign a classification table to an interface:
 * @cliexcmd{set ip classify intfc GigabitEthernet2/0/0 table-index 1}
?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (set_ip_classify_command, static) =
{
    .path = "set ip classify",
    .short_help =
    "set ip classify intfc <interface> table-index <classify-idx>",
    .function = set_ip_classify_command_fn,
};
/* *INDENT-ON* */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */