aboutsummaryrefslogtreecommitdiffstats
path: root/tests/vpp/device/flow/2n1l-10ge2p1x710-ethip4-flow-ip4-gtpu-scapy.robot
AgeCommit message (Expand)AuthorFilesLines
2022-06-28feat(telemetry): ReworkPeter Mikus1-2/+2
2021-12-13doc: fixing documentation structureViliam Luc1-7/+11
2021-10-08Add flow test suitesxinfeng zhao1-0/+123
285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 
785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978
/*
 * Copyright (c) 2017 Intel and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <kubeproxy/kp.h>
#include <vnet/plugin/plugin.h>
#include <vpp/app/version.h>
#include <vnet/api_errno.h>
#include <vnet/udp/udp.h>

//GC runs at most once every so many seconds
#define KP_GARBAGE_RUN 60

//After this many seconds, it is assumed that inter-core race conditions will not occur.
#define KP_CONCURRENCY_TIMEOUT 10

kp_main_t kp_main;

/*
 * Writer lock guarding kp_main.
 *
 * kp_get_writer_lock() spins until the atomic test-and-set on
 * kp_main.writer_lock returns 0, i.e. until this caller is the one that
 * flipped the word from 0 to 1.
 * kp_put_writer_lock() releases the word with the builtin that pairs with
 * __sync_lock_test_and_set, so the stores made while holding the lock are
 * published before the lock is seen as free.
 */
#define kp_get_writer_lock() do {} while(__sync_lock_test_and_set (kp_main.writer_lock, 1))
#define kp_put_writer_lock() __sync_lock_release (kp_main.writer_lock)

static void kp_pod_stack (kp_pod_t *pod);

/**
 * Clear every bit of @p prefix that lies beyond its first @p plen bits.
 *
 * The address is handled as two 64-bit words holding network-order data:
 * as_u64[0] carries bits 0..63 and as_u64[1] carries bits 64..127.
 *
 * @param prefix address to normalize in place.
 * @param plen   prefix length in bits; a value of 128 or more leaves the
 *               address untouched.
 */
void ip46_prefix_normalize(ip46_address_t *prefix, u8 plen)
{
  //Nothing to mask. Returning here also keeps the shift counts below
  //within [0, 63]; a plen above 128 used to produce a negative shift.
  if (plen >= 128)
    return;

  if (plen == 0) {
    prefix->as_u64[0] = 0;
    prefix->as_u64[1] = 0;
  } else if (plen <= 64) {
    prefix->as_u64[0] &= clib_host_to_net_u64(~0ULL << (64 - plen));
    prefix->as_u64[1] = 0;
  } else {
    prefix->as_u64[1] &= clib_host_to_net_u64(~0ULL << (128 - plen));
  }
}

/**
 * unformat helper parsing "<address>/<length>" into an ip46 prefix.
 *
 * Expected va_args: ip46_address_t * (output address), u8 * (output prefix
 * length) and ip46_type_t (which families are acceptable).
 * An IPv4 prefix is stored with its length shifted by 96 and with the three
 * pad words zeroed. IPv4 lengths above 32 and IPv6 lengths above 128 are
 * rejected.
 *
 * @return 1 when a prefix was parsed, 0 otherwise.
 */
uword unformat_ip46_prefix (unformat_input_t * input, va_list * args)
{
  ip46_address_t *addr = va_arg (*args, ip46_address_t *);
  u8 *plen_out = va_arg (*args, u8 *);
  ip46_type_t wanted = va_arg (*args, ip46_type_t);
  u32 parsed_len;

  //IPv4 is tried first unless the caller asked for IPv6 only
  if ((wanted != IP46_TYPE_IP6) &&
      unformat(input, "%U/%u", unformat_ip4_address, &addr->ip4, &parsed_len))
    {
      if (parsed_len > 32)
        return 0;
      *plen_out = parsed_len + 96;
      addr->pad[0] = addr->pad[1] = addr->pad[2] = 0;
      return 1;
    }

  if ((wanted != IP46_TYPE_IP4) &&
      unformat(input, "%U/%u", unformat_ip6_address, &addr->ip6, &parsed_len))
    {
      if (parsed_len > 128)
        return 0;
      *plen_out = parsed_len;
      return 1;
    }

  return 0;
}

/**
 * format helper printing an ip46 prefix as "<address>/<length>".
 *
 * Expected va_args: ip46_address_t *, u32 prefix length (va_arg cannot use
 * u8 or u16) and ip46_type_t. With IP46_TYPE_IP4 or IP46_TYPE_IP6 the family
 * is forced; with any other type the prefix is printed as IPv4 only when its
 * length is at least 96 and the address itself is an IPv4 one.
 * An IPv4 prefix is printed with 96 subtracted from its length.
 */
u8 *format_ip46_prefix (u8 * s, va_list * args)
{
  ip46_address_t *addr = va_arg (*args, ip46_address_t *);
  u32 plen = va_arg (*args, u32);
  ip46_type_t type = va_arg (*args, ip46_type_t);

  int as_ip4 = (type == IP46_TYPE_IP4) ||
               ((type != IP46_TYPE_IP6) &&
                (plen >= 96) && ip46_address_is_ip4(addr));

  if (as_ip4)
    return format(s, "%U/%d", format_ip4_address, &addr->ip4, plen - 96);

  return format(s, "%U/%d", format_ip6_address, &addr->ip6, plen);
}

/*
 * Node name lists handed to dpo_register_new_type() for the NAT4 DPO type,
 * one NULL-terminated list per DPO protocol.
 */
static const char * const kp_dpo_nat4_ip4[] = { "kp4-nat4", NULL };
static const char * const kp_dpo_nat4_ip6[] = { "kp6-nat4", NULL };
static const char * const * const kp_dpo_nat4_nodes[DPO_PROTO_NUM] = {
  [DPO_PROTO_IP4] = kp_dpo_nat4_ip4,
  [DPO_PROTO_IP6] = kp_dpo_nat4_ip6,
};

/*
 * Same as above for the NAT6 DPO type.
 */
static const char * const kp_dpo_nat6_ip4[] = { "kp4-nat6", NULL };
static const char * const kp_dpo_nat6_ip6[] = { "kp6-nat6", NULL };
static const char * const * const kp_dpo_nat6_nodes[DPO_PROTO_NUM] = {
  [DPO_PROTO_IP4] = kp_dpo_nat6_ip4,
  [DPO_PROTO_IP6] = kp_dpo_nat6_ip6,
};

/**
 * Current time used by the sticky hash tables: vlib_time_now() plus a
 * constant 10000, truncated to u32.
 */
u32 kp_hash_time_now(vlib_main_t * vm)
{
  double shifted = vlib_time_now(vm) + 10000;

  return (u32) shifted;
}

/**
 * format helper printing a summary of kp_main: the number of VIPs, the
 * number of PODs (pool count minus one) and, for every thread that has a
 * sticky hash table, its timeout and usage.
 * No va_args are consumed.
 */
u8 *format_kp_main (u8 * s, va_list * args)
{
  kp_main_t *kpm = &kp_main;
  vlib_thread_main_t *tm = vlib_get_thread_main();
  u32 core;

  s = format(s, "kp_main");
  s = format(s, " #vips: %u\n", pool_elts(kpm->vips));
  s = format(s, " #pods: %u\n", pool_elts(kpm->pods) - 1);

  for (core = 0; core < tm->n_vlib_mains; core++)
    {
      kp_hash_t *sticky = kpm->per_cpu[core].sticky_ht;

      //Threads without a sticky table are not listed
      if (!sticky)
        continue;

      s = format(s, "core %d\n", core);
      s = format(s, "  timeout: %ds\n", sticky->timeout);
      s = format(s, "  usage: %d / %d\n", kp_hash_elts(sticky, kp_hash_time_now(vlib_get_main())),  kp_hash_size(sticky));
    }

  return s;
}

/*
 * Textual name of each VIP type, indexed by kp_vip_type_t value.
 * Used both to print a type (format_kp_vip_type) and as the keyword
 * matched when parsing one (unformat_kp_vip_type).
 */
static char *kp_vip_type_strings[] = {
    [KP_VIP_TYPE_IP4_NAT44] = "ip4-nat44",
    [KP_VIP_TYPE_IP4_NAT46] = "ip4-nat46",
    [KP_VIP_TYPE_IP6_NAT64] = "ip6-nat64",
    [KP_VIP_TYPE_IP6_NAT66] = "ip6-nat66",
};

/**
 * format helper printing the name of a VIP type.
 *
 * Expected va_arg: kp_vip_type_t. Values outside [0, KP_VIP_N_TYPES) are
 * printed as "_WRONG_TYPE_".
 */
u8 *format_kp_vip_type (u8 * s, va_list * args)
{
  kp_vip_type_t vipt = va_arg (*args, kp_vip_type_t);

  //The name is passed as an argument rather than as the format string, so
  //a table entry can never be interpreted as format directives.
  if ((u32) vipt < KP_VIP_N_TYPES)
    return format(s, "%s", kp_vip_type_strings[vipt]);

  return format(s, "_WRONG_TYPE_");
}

/**
 * unformat helper parsing a VIP type keyword.
 *
 * Expected va_arg: kp_vip_type_t * receiving the matched type. The keywords
 * are tried in table order.
 *
 * @return 1 when a keyword matched, 0 otherwise.
 */
uword unformat_kp_vip_type (unformat_input_t * input, va_list * args)
{
  kp_vip_type_t *result = va_arg (*args, kp_vip_type_t *);
  u32 candidate = 0;

  while (candidate < KP_VIP_N_TYPES)
    {
      if (unformat(input, kp_vip_type_strings[candidate]))
        {
          *result = candidate;
          return 1;
        }
      candidate++;
    }

  return 0;
}

/**
 * format helper printing a one-line description of a VIP: type, prefix,
 * the three ports (converted with ntohs), the new-flow table size, the
 * number of attached PODs and " removed" when the VIP is no longer used.
 *
 * Expected va_arg: kp_vip_t *.
 */
u8 *format_kp_vip (u8 * s, va_list * args)
{
  kp_vip_t *vip = va_arg (*args, kp_vip_t *);
  const char *state_suffix = " removed";

  if (vip->flags & KP_VIP_FLAGS_USED)
    state_suffix = "";

  s = format(s, "%U %U port:%u target_port:%u node_port:%u "
                "new_size:%u #pod:%u%s",
             format_kp_vip_type, vip->type,
             format_ip46_prefix, &vip->prefix, vip->plen, IP46_TYPE_ANY,
             ntohs(vip->port), ntohs(vip->target_port),
             ntohs(vip->node_port),
             vip->new_flow_table_mask + 1,
             pool_elts(vip->pod_indexes),
             state_suffix);
  return s;
}

/**
 * format helper printing a POD as "<address> used" or "<address> removed".
 *
 * Expected va_arg: kp_pod_t *.
 */
u8 *format_kp_pod (u8 * s, va_list * args)
{
  kp_pod_t *pod = va_arg (*args, kp_pod_t *);
  const char *state = "removed";

  if (pod->flags & KP_POD_FLAGS_USED)
    state = "used";

  s = format(s, "%U %s", format_ip46_address,
             &pod->address, IP46_TYPE_ANY,
             state);
  return s;
}

/**
 * format helper printing a multi-line description of a VIP.
 *
 * Expected va_arg: kp_vip_t *. The output contains the VIP header line
 * (type, pool index, prefix, ports), the new-flow table size, every VIP
 * counter, and one line per attached POD with its number of buckets in the
 * new-flow table, its reference count and its DPO index.
 * Every line is indented by the indentation found in @p s on entry.
 */
u8 *format_kp_vip_detailed (u8 * s, va_list * args)
{
  kp_main_t *kpm = &kp_main;
  kp_vip_t *vip = va_arg (*args, kp_vip_t *);
  uword indent = format_get_indent (s);

  s = format(s, "%U %U [%u] %U port:%u target_port:%u node_port:%u%s\n"
                   "%U  new_size:%u\n",
                  format_white_space, indent,
                  format_kp_vip_type, vip->type,
                  vip - kpm->vips, format_ip46_prefix, &vip->prefix, vip->plen, IP46_TYPE_ANY,
		  ntohs(vip->port), ntohs(vip->target_port),
		  ntohs(vip->node_port),
                  (vip->flags & KP_VIP_FLAGS_USED)?"":" removed",
                  format_white_space, indent,
                  vip->new_flow_table_mask + 1);

  //Print counters
  //NOTE(review): the return type of vlib_get_simple_counter() is not visible
  //here; confirm that it matches the %d directive.
  s = format(s, "%U  counters:\n",
             format_white_space, indent);
  u32 i;
  for (i=0; i<KP_N_VIP_COUNTERS; i++)
    s = format(s, "%U    %s: %d\n",
               format_white_space, indent,
               kpm->vip_counters[i].name,
               vlib_get_simple_counter(&kpm->vip_counters[i], vip - kpm->vips));


  s = format(s, "%U  #pod:%u\n",
             format_white_space, indent,
             pool_elts(vip->pod_indexes));

  //Let's count the buckets for each POD
  //count[] is indexed by position in the global POD pool, not by position
  //in vip->pod_indexes.
  u32 *count = 0;
  vec_validate(count, pool_len(kpm->pods)); //Possibly big alloc for not much...
  kp_new_flow_entry_t *nfe;
  vec_foreach(nfe, vip->new_flow_table)
    count[nfe->pod_index]++;

  //One line per POD attached to this VIP
  kp_pod_t *pod;
  u32 *pod_index;
  pool_foreach(pod_index, vip->pod_indexes, {
      pod = &kpm->pods[*pod_index];
      s = format(s, "%U    %U %d buckets   %d flows  dpo:%u %s\n",
                   format_white_space, indent,
                   format_ip46_address, &pod->address, IP46_TYPE_ANY,
                   count[pod - kpm->pods],
                   vlib_refcount_get(&kpm->pod_refcount, pod - kpm->pods),
                   pod->dpo.dpoi_index,
                   (pod->flags & KP_POD_FLAGS_USED)?"used":" removed");
  });

  vec_free(count);

  /*
  Disabled: dump of the whole new-flow table, one line per bucket.
  s = format(s, "%U  new flows table:\n", format_white_space, indent);
  kp_new_flow_entry_t *nfe;
  vec_foreach(nfe, vip->new_flow_table) {
    s = format(s, "%U    %d: %d\n", format_white_space, indent, nfe - vip->new_flow_table, nfe->pod_index);
  }
  */
  return s;
}

/*
 * Per-POD state of the permutation walk used when the new-flow table of a
 * VIP is rebuilt.
 */
typedef struct {
  u32 pod_index; //Index of the POD in the global POD pool
  u32 last;      //Next bucket this POD will try to claim
  u32 skip;      //Step added to 'last' after each attempt
} kp_pseudorand_t;

static int kp_pseudorand_compare(void *a, void *b)
{
  kp_pod_t *poda, *podb;
  kp_main_t *kpm = &kp_main;
  poda = &kpm->pods[((kp_pseudorand_t *)a)->pod_index];
  podb = &kpm->pods[((kp_pseudorand_t *)b)->pod_index];
  return memcmp(&poda->address, &podb->address, sizeof(podb->address));
}

/**
 * Release the PODs of @p vip that are no longer needed.
 *
 * Runs at most once every KP_GARBAGE_RUN seconds per VIP. A POD is
 * released when it is not flagged as used, was last used more than
 * KP_CONCURRENCY_TIMEOUT seconds ago and has a reference count of zero.
 * Releasing a POD detaches it from its next-hop FIB entry, deletes that
 * entry and returns both the VIP slot and the POD to their pools.
 *
 * Must be called with the writer lock held.
 */
static void kp_vip_garbage_collection(kp_vip_t *vip)
{
  kp_main_t *kpm = &kp_main;
  ASSERT (kpm->writer_lock[0]);

  u32 now = (u32) vlib_time_now(vlib_get_main());
  if (!clib_u32_loop_gt(now, vip->last_garbage_collection + KP_GARBAGE_RUN))
    return;

  vip->last_garbage_collection = now;
  kp_pod_t *pod;
  u32 *pod_index;
  u32 *stale_slots = 0, *slot;

  //First pass: only remember which slots of vip->pod_indexes are stale.
  //Pool elements must not be freed while pool_foreach walks the pool.
  pool_foreach(pod_index, vip->pod_indexes, {
      pod = &kpm->pods[*pod_index];
      if (!(pod->flags & KP_POD_FLAGS_USED) && //Not used
          clib_u32_loop_gt(now, pod->last_used + KP_CONCURRENCY_TIMEOUT) && //Not recently used
          (vlib_refcount_get(&kpm->pod_refcount, pod - kpm->pods) == 0)) //Not referenced
        vec_add1(stale_slots, pod_index - vip->pod_indexes);
  });

  //Second pass: actually release them
  vec_foreach(slot, stale_slots) {
    pod_index = &vip->pod_indexes[*slot];
    pod = &kpm->pods[*pod_index];

    fib_entry_child_remove(pod->next_hop_fib_entry_index,
                           pod->next_hop_child_index);
    fib_table_entry_delete_index(pod->next_hop_fib_entry_index,
                                 FIB_SOURCE_RR);
    pod->next_hop_fib_entry_index = FIB_NODE_INDEX_INVALID;

    pool_put(vip->pod_indexes, pod_index);
    pool_put(kpm->pods, pod);
  }

  vec_free(stale_slots);
}

/**
 * Run garbage collection on every VIP, then release the VIPs that are both
 * flagged as unused and left without any POD.
 *
 * Takes and releases the writer lock.
 *
 * NOTE(review): vip->new_flow_table is not released here; confirm whether
 * it is freed elsewhere before the VIP slot is reused.
 */
void kp_garbage_collection()
{
  kp_main_t *kpm = &kp_main;
  kp_get_writer_lock();
  kp_vip_t *vip;
  u32 *to_be_removed_vips = 0, *i;

  //VIPs are only recorded here and released after the walk
  pool_foreach(vip, kpm->vips, {
      kp_vip_garbage_collection(vip);

      if (!(vip->flags & KP_VIP_FLAGS_USED) &&
          (pool_elts(vip->pod_indexes) == 0)) {
        vec_add1(to_be_removed_vips, vip - kpm->vips);
      }
  });

  vec_foreach(i, to_be_removed_vips) {
    vip = &kpm->vips[*i];
    //Free what the VIP owns before handing the VIP itself back to the
    //pool, so that no field is touched once the element has been returned.
    pool_free(vip->pod_indexes);
    pool_put(kpm->vips, vip);
  }

  vec_free(to_be_removed_vips);
  kp_put_writer_lock();
}

/**
 * Rebuild the new-flow table of @p vip from the PODs currently flagged as
 * used.
 *
 * The table has vip->new_flow_table_mask + 1 buckets. When no POD of the
 * VIP is used, every bucket points to POD index 0. Otherwise the used PODs
 * are sorted by address, each one gets a start bucket and an odd step
 * derived from a hash of its address, and the PODs take turns claiming
 * their next free bucket until the table is full.
 * The new table replaces vip->new_flow_table and the previous one is freed.
 *
 * Must be called with the writer lock held.
 */
static void kp_vip_update_new_flow_table(kp_vip_t *vip)
{
  kp_main_t *kpm = &kp_main;
  kp_new_flow_entry_t *old_table;
  u32 i, *pod_index;
  kp_new_flow_entry_t *new_flow_table = 0;
  kp_pod_t *pod;
  kp_pseudorand_t *pr, *sort_arr = 0;

  ASSERT (kpm->writer_lock[0]); //We must have the lock

  //Check if some POD is configured or not
  i = 0;
  pool_foreach(pod_index, vip->pod_indexes, {
      pod = &kpm->pods[*pod_index];
      if (pod->flags & KP_POD_FLAGS_USED) { //At least one POD is in use
        i = 1;
        goto out; //Not sure 'break' works in this macro-loop
      }
  });

out:
  if (i == 0) {
    //Only the default. i.e. no POD
    vec_validate(new_flow_table, vip->new_flow_table_mask);
    for (i=0; i<vec_len(new_flow_table); i++)
      new_flow_table[i].pod_index = 0;

    goto finished;
  }

  //First, let's sort the PODs
  vec_alloc(sort_arr, pool_elts(vip->pod_indexes));

  i = 0;
  pool_foreach(pod_index, vip->pod_indexes, {
      pod = &kpm->pods[*pod_index];
      if (!(pod->flags & KP_POD_FLAGS_USED)) //Not used anymore
        continue;

      sort_arr[i].pod_index = pod - kpm->pods;
      i++;
  });
  _vec_len(sort_arr) = i;

  vec_sort_with_function(sort_arr, kp_pseudorand_compare);

  //Now let's pseudo-randomly generate permutations
  vec_foreach(pr, sort_arr) {
    pod = &kpm->pods[pr->pod_index];

    u64 seed = clib_xxhash(pod->address.as_u64[0] ^
                           pod->address.as_u64[1]);
    /* We have 2^n buckets.
     * skip must be coprime with 2^n.
     * So skip must be odd.
     * MagLev actually states that M should be prime,
     * but this has a big computation cost (% operation).
     * Using 2^n is cheaper (& operation).
     */
    pr->skip = ((seed & 0xffffffff) | 1) & vip->new_flow_table_mask;
    pr->last = (seed >> 32) & vip->new_flow_table_mask;
  }

  //Let's create a new flow table; ~0 marks a bucket nobody claimed yet
  vec_validate(new_flow_table, vip->new_flow_table_mask);
  for (i=0; i<vec_len(new_flow_table); i++)
    new_flow_table[i].pod_index = ~0;

  //Round-robin over the PODs: each turn, a POD walks its permutation until
  //it finds a free bucket. The loop is left once every bucket is taken.
  u32 done = 0;
  while (1) {
    vec_foreach(pr, sort_arr) {
      while (1) {
        u32 last = pr->last;
        pr->last = (pr->last + pr->skip) & vip->new_flow_table_mask;
        if (new_flow_table[last].pod_index == ~0) {
          new_flow_table[last].pod_index = pr->pod_index;
          break;
        }
      }
      done++;
      if (done == vec_len(new_flow_table))
        goto finished;
    }
  }

finished:
  //Both paths end here. sort_arr is still 0 on the no-POD path, and the
  //fill loop above can only be left through this label, so this is the one
  //place where the scratch vector can be released.
  vec_free(sort_arr);

  //Swap in the new table, then drop the old one
  old_table = vip->new_flow_table;
  vip->new_flow_table = new_flow_table;
  vec_free(old_table);
}

/**
 * Set the global kube-proxy parameters.
 *
 * @param per_cpu_sticky_buckets value stored in kp_main; must pass is_pow2().
 * @param flow_timeout           value stored in kp_main.
 * @return 0 on success, VNET_API_ERROR_INVALID_MEMORY_SIZE when
 *         @p per_cpu_sticky_buckets is rejected by is_pow2().
 */
int kp_conf(u32 per_cpu_sticky_buckets, u32 flow_timeout)
{
  kp_main_t *kpm = &kp_main;

  if (is_pow2(per_cpu_sticky_buckets))
    {
      //The lock is not strictly required here; it is taken as a reminder
      //that it exists.
      kp_get_writer_lock();
      kpm->per_cpu_sticky_buckets = per_cpu_sticky_buckets;
      kpm->flow_timeout = flow_timeout;
      kp_put_writer_lock();
      return 0;
    }

  return VNET_API_ERROR_INVALID_MEMORY_SIZE;
}

/**
 * Look up a used VIP by prefix.
 *
 * @p prefix is normalized in place before the comparison. Only VIPs that
 * carry KP_VIP_FLAGS_USED are considered.
 *
 * @param prefix    prefix to look for (modified: normalized to @p plen).
 * @param plen      prefix length.
 * @param vip_index receives the pool index of the VIP when found.
 * @return 0 when found, VNET_API_ERROR_NO_SUCH_ENTRY otherwise.
 *
 * Must be called with the writer lock held.
 */
static
int kp_vip_find_index_with_lock(ip46_address_t *prefix, u8 plen, u32 *vip_index)
{
  kp_main_t *kpm = &kp_main;
  kp_vip_t *vip;
  ASSERT (kpm->writer_lock[0]); //This must be called with the lock owned
  ip46_prefix_normalize(prefix, plen);
  pool_foreach(vip, kpm->vips, {
      //VIP flags are tested with the VIP flag constant, as everywhere else
      if ((vip->flags & KP_VIP_FLAGS_USED) &&
          vip->plen == plen &&
          vip->prefix.as_u64[0] == prefix->as_u64[0] &&
          vip->prefix.as_u64[1] == prefix->as_u64[1]) {
        *vip_index = vip - kpm->vips;
        return 0;
      }
  });
  return VNET_API_ERROR_NO_SUCH_ENTRY;
}

/**
 * Locked wrapper around kp_vip_find_index_with_lock(): takes the writer
 * lock, performs the lookup and releases the lock.
 *
 * @return the value returned by kp_vip_find_index_with_lock().
 */
int kp_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index)
{
  kp_get_writer_lock();
  int rv = kp_vip_find_index_with_lock(prefix, plen, vip_index);
  kp_put_writer_lock();

  return rv;
}

/**
 * Look up, among the PODs attached to @p vip, the one whose address equals
 * @p address. PODs are matched whether or not they are flagged as used.
 *
 * @param pod_index receives the index of the POD in the global POD pool.
 * @return 0 when found, -1 otherwise.
 *
 * Must be called with the writer lock held.
 */
static int kp_pod_find_index_vip(kp_vip_t *vip, ip46_address_t *address, u32 *pod_index)
{
  kp_main_t *kpm = &kp_main;
  kp_pod_t *candidate;
  u32 *slot;

  ASSERT (kpm->writer_lock[0]); //This must be called with the lock owned

  pool_foreach(slot, vip->pod_indexes, {
      candidate = &kpm->pods[*slot];
      if (candidate->vip_index != (vip - kpm->vips))
        continue;
      if (candidate->address.as_u64[0] != address->as_u64[0] ||
          candidate->address.as_u64[1] != address->as_u64[1])
        continue;

      *pod_index = candidate - kpm->pods;
      return 0;
  });

  return -1;
}

/**
 * Attach PODs to a VIP.
 *
 * Every address is first checked. An address matching a POD of the VIP that
 * is flagged as used fails the whole call; one matching a POD that is no
 * longer used causes that POD to be flagged as used again; any other
 * address must be of the family expected by the VIP and results in a new
 * POD. Addresses repeated inside @p addresses are only taken once.
 * New PODs get a next-hop FIB entry, are stacked on it and, for NAT4 VIPs,
 * get a SNAT mapping. The new-flow table of the VIP is then recomputed and
 * garbage collection is given a chance to run.
 *
 * Takes and releases the writer lock.
 *
 * @param vip_index pool index of the VIP.
 * @param addresses array of @p n POD addresses.
 * @param n         number of addresses.
 * @return 0 on success, VNET_API_ERROR_NO_SUCH_ENTRY when the VIP does not
 *         exist, VNET_API_ERROR_VALUE_EXIST when an address is already a
 *         used POD of the VIP, VNET_API_ERROR_INVALID_ADDRESS_FAMILY when an
 *         address has the wrong family.
 */
int kp_vip_add_pods(u32 vip_index, ip46_address_t *addresses, u32 n)
{
  kp_main_t *kpm = &kp_main;
  kp_get_writer_lock();
  kp_vip_t *vip;
  if (!(vip = kp_vip_get_by_index(vip_index))) {
    kp_put_writer_lock();
    return VNET_API_ERROR_NO_SUCH_ENTRY;
  }

  ip46_type_t type = kp_vip_is_nat4(vip)?IP46_TYPE_IP4:IP46_TYPE_IP6;
  u32 *to_be_added = 0;   //Indexes into addresses[] of the PODs to create
  u32 *to_be_updated = 0; //Pool indexes of existing PODs to flag as used again
  u32 i;
  u32 *ip;
  kp_snat_mapping_t *m;
  kp_snat4_key_t m_key4;
  clib_bihash_kv_8_8_t kv;

  //Sanity check
  //Nothing is modified in this loop, so a failure leaves the VIP untouched.
  while (n--) {

    if (!kp_pod_find_index_vip(vip, &addresses[n], &i)) {
      if (kpm->pods[i].flags & KP_POD_FLAGS_USED) {
        vec_free(to_be_added);
        vec_free(to_be_updated);
        kp_put_writer_lock();
        return VNET_API_ERROR_VALUE_EXIST;
      }
      vec_add1(to_be_updated, i);
      goto next;
    }

    if (ip46_address_type(&addresses[n]) != type) {
      vec_free(to_be_added);
      vec_free(to_be_updated);
      kp_put_writer_lock();
      return VNET_API_ERROR_INVALID_ADDRESS_FAMILY;
    }

    if (n) {
      u32 n2 = n;
      while(n2--) //Check for duplicates
        if (addresses[n2].as_u64[0] == addresses[n].as_u64[0] &&
            addresses[n2].as_u64[1] == addresses[n].as_u64[1])
          goto next; //The copy at the lower index will be handled later
    }

    vec_add1(to_be_added, n);

next:
    continue;
  }

  //Update reused PODs
  vec_foreach(ip, to_be_updated) {
    kpm->pods[*ip].flags = KP_POD_FLAGS_USED;
  }
  vec_free(to_be_updated);

  //Create those who have to be created
  vec_foreach(ip, to_be_added) {
    kp_pod_t *pod;
    u32 *pod_index;
    pool_get(kpm->pods, pod);
    pod->address = addresses[*ip];
    pod->flags = KP_POD_FLAGS_USED;
    pod->vip_index = vip_index;
    pool_get(vip->pod_indexes, pod_index);
    *pod_index = pod - kpm->pods;

    /*
     * become a child of the FIB entry
     * so we are informed when its forwarding changes
     */
    fib_prefix_t nh = {};
    if (kp_vip_is_nat4(vip)) {
	nh.fp_addr.ip4 = pod->address.ip4;
	nh.fp_len = 32;
	nh.fp_proto = FIB_PROTOCOL_IP4;
    } else {
	nh.fp_addr.ip6 = pod->address.ip6;
	nh.fp_len = 128;
	nh.fp_proto = FIB_PROTOCOL_IP6;
    }

    pod->next_hop_fib_entry_index =
	fib_table_entry_special_add(0,
				    &nh,
				    FIB_SOURCE_RR,
				    FIB_ENTRY_FLAG_NONE);
    pod->next_hop_child_index =
	fib_entry_child_add(pod->next_hop_fib_entry_index,
			    kpm->fib_node_type,
			    pod - kpm->pods);

    kp_pod_stack(pod);

    /* Add SNAT static mapping */
    /* NOTE(review): the mapping is taken from the pool before the address
     * family is tested; for non-NAT4 VIPs it stays zeroed and is not
     * inserted into any table. */
    pool_get (kpm->snat_mappings, m);
    memset (m, 0, sizeof (*m));
    if (kp_vip_is_nat4(vip)) {
	/* Lookup key: POD address and the VIP target port */
	m_key4.addr = pod->address.ip4;
	m_key4.port = vip->target_port;
	m_key4.protocol = 0;
	m_key4.fib_index = 0;

        m->vip.ip4 = vip->prefix.ip4;;
        m->node_ip.ip4.as_u32 = 0;
        m->pod_ip.ip4 = pod->address.ip4;
        m->vip_is_ipv6 = 0;
        m->node_ip_is_ipv6 = 0;
        m->pod_ip_is_ipv6 = 0;
        m->port = vip->port;
        m->node_port = vip->node_port;
        m->target_port = vip->target_port;
	m->vrf_id = 0;
	m->fib_index = 0;

	/* The value stored in the hash is the index of the mapping in its pool */
	kv.key = m_key4.as_u64;
	kv.value = m - kpm->snat_mappings;
	clib_bihash_add_del_8_8(&kpm->mapping_by_pod, &kv, 1);
    } else {
	/* TBD */
    }

  }
  vec_free(to_be_added);

  //Recompute flows
  kp_vip_update_new_flow_table(vip);

  //Garbage collection maybe
  kp_vip_garbage_collection(vip);

  kp_put_writer_lock();
  return 0;
}

/**
 * Detach PODs from a VIP.
 *
 * Every address must match a POD attached to the VIP, otherwise nothing is
 * changed and an error is returned. The matching PODs lose their
 * KP_POD_FLAGS_USED flag and get their last_used time set to now; they are
 * not freed here. The new-flow table of the VIP is then recomputed.
 * Addresses repeated inside @p addresses are only taken once.
 *
 * The caller is expected to hold the writer lock.
 *
 * @param vip_index pool index of the VIP.
 * @param addresses array of @p n POD addresses.
 * @param n         number of addresses.
 * @return 0 on success, VNET_API_ERROR_NO_SUCH_ENTRY when the VIP or one of
 *         the addresses is unknown.
 */
int kp_vip_del_pods_withlock(u32 vip_index, ip46_address_t *addresses, u32 n)
{
  kp_main_t *kpm = &kp_main;
  u32 now = (u32) vlib_time_now(vlib_get_main());
  u32 *ip = 0;

  kp_vip_t *vip;
  if (!(vip = kp_vip_get_by_index(vip_index))) {
    return VNET_API_ERROR_NO_SUCH_ENTRY;
  }

  u32 *indexes = NULL;
  while (n--) {
    u32 i;
    if (kp_pod_find_index_vip(vip, &addresses[n], &i)) {
      vec_free(indexes);
      return VNET_API_ERROR_NO_SUCH_ENTRY;
    }

    //Check for duplicates among addresses[0 .. n-1]. The scan starts at
    //n - 1, so the immediate predecessor is compared as well.
    u32 n2 = n;
    int duplicate = 0;
    while (n2--) {
      if (addresses[n2].as_u64[0] == addresses[n].as_u64[0] &&
          addresses[n2].as_u64[1] == addresses[n].as_u64[1]) {
        duplicate = 1;
        break;
      }
    }
    if (duplicate)
      continue; //The copy at the lower index will be handled later

    vec_add1(indexes, i);
  }

  //Garbage collection maybe
  kp_vip_garbage_collection(vip);

  if (indexes != NULL) {
    vec_foreach(ip, indexes) {
      kpm->pods[*ip].flags &= ~KP_POD_FLAGS_USED;
      kpm->pods[*ip].last_used = now;
    }

    //Recompute flows
    kp_vip_update_new_flow_table(vip);
  }

  vec_free(indexes);
  return 0;
}

/**
 * Locked wrapper around kp_vip_del_pods_withlock(): takes the writer lock,
 * detaches the PODs and releases the lock.
 *
 * @return the value returned by kp_vip_del_pods_withlock().
 */
int kp_vip_del_pods(u32 vip_index, ip46_address_t *addresses, u32 n)
{
  int rv;

  kp_get_writer_lock();
  rv = kp_vip_del_pods_withlock(vip_index, addresses, n);
  kp_put_writer_lock();

  return rv;
}

/**
 * Install the VIP prefix in FIB table 0 (ip4 or ip6) as an exclusive entry
 * pointing to a kube-proxy DPO whose index is the VIP pool index.
 * For an IPv4 VIP the FIB prefix length is the stored length minus 96.
 */
static void kp_vip_add_adjacency(kp_main_t *kpm, kp_vip_t *vip)
{
  fib_prefix_t pfx = {};
  dpo_id_t dpo = DPO_INVALID;
  dpo_proto_t proto;

  if (!kp_vip_is_ip4(vip)) {
    pfx.fp_proto = FIB_PROTOCOL_IP6;
    pfx.fp_len = vip->plen;
    pfx.fp_addr.ip6 = vip->prefix.ip6;
    proto = DPO_PROTO_IP6;
  } else {
    pfx.fp_proto = FIB_PROTOCOL_IP4;
    pfx.fp_len = vip->plen - 96;
    pfx.fp_addr.ip4 = vip->prefix.ip4;
    proto = DPO_PROTO_IP4;
  }

  //The DPO type follows the NAT flavour of the VIP
  dpo_set(&dpo,
          kp_vip_is_nat4(vip) ? kpm->dpo_nat4_type : kpm->dpo_nat6_type,
          proto,
          vip - kpm->vips);

  fib_table_entry_special_dpo_add(0, &pfx, FIB_SOURCE_PLUGIN_HI,
                                  FIB_ENTRY_FLAG_EXCLUSIVE, &dpo);
  dpo_reset(&dpo);
}

/**
 * Delete the adjacency associated with the VIP: remove the VIP prefix
 * from FIB table 0 for source FIB_SOURCE_PLUGIN_HI.
 * For an IPv4 VIP the FIB prefix length is the stored length minus 96.
 */
static void kp_vip_del_adjacency(kp_main_t *kpm, kp_vip_t *vip)
{
  fib_prefix_t pfx = {};

  if (!kp_vip_is_ip4(vip)) {
    pfx.fp_proto = FIB_PROTOCOL_IP6;
    pfx.fp_len = vip->plen;
    pfx.fp_addr.ip6 = vip->prefix.ip6;
  } else {
    pfx.fp_proto = FIB_PROTOCOL_IP4;
    pfx.fp_len = vip->plen - 96;
    pfx.fp_addr.ip4 = vip->prefix.ip4;
  }

  fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_PLUGIN_HI);
}

/**
 * Create a VIP.
 *
 * All checks are performed before anything is allocated, so a failing call
 * leaves no partially created VIP behind.
 *
 * @param prefix      VIP prefix (modified: normalized to @p plen).
 * @param plen        prefix length.
 * @param type        VIP type; an IPv4 prefix requires one of the IP4 types.
 * @param new_length  number of buckets of the new-flow table; must pass
 *                    is_pow2().
 * @param vip_index   receives the pool index of the new VIP. It is also
 *                    written by the existence lookup when the prefix is
 *                    already configured.
 * @param port        service port, host byte order.
 * @param target_port POD port, host byte order.
 * @param node_port   node port, host byte order; registered as a UDP
 *                    destination port for both the ip4 and ip6 nodeport
 *                    nodes.
 * @return 0 on success, VNET_API_ERROR_VALUE_EXIST when the prefix or the
 *         node port is already configured,
 *         VNET_API_ERROR_INVALID_MEMORY_SIZE when @p new_length is rejected,
 *         VNET_API_ERROR_INVALID_ADDRESS_FAMILY when @p type does not match
 *         an IPv4 prefix.
 *
 * Takes and releases the writer lock.
 */
int kp_vip_add(ip46_address_t *prefix, u8 plen, kp_vip_type_t type,
	       u32 new_length, u32 *vip_index,
	       u16 port, u16 target_port, u16 node_port)
{
  kp_main_t *kpm = &kp_main;
  vlib_main_t *vm = kpm->vlib_main;
  kp_vip_t *vip;
  u32 key, *key_copy;
  uword * entry;

  kp_get_writer_lock();
  ip46_prefix_normalize(prefix, plen);

  if (!kp_vip_find_index_with_lock(prefix, plen, vip_index)) {
    kp_put_writer_lock();
    return VNET_API_ERROR_VALUE_EXIST;
  }

  if (!is_pow2(new_length)) {
    kp_put_writer_lock();
    return VNET_API_ERROR_INVALID_MEMORY_SIZE;
  }

  if (ip46_prefix_is_ip4(prefix, plen) &&
      (type != KP_VIP_TYPE_IP4_NAT44) &&
      (type != KP_VIP_TYPE_IP4_NAT46)) {
    kp_put_writer_lock();
    return VNET_API_ERROR_INVALID_ADDRESS_FAMILY;
  }

  //Refuse a node port that is already mapped. This is done before the VIP
  //is allocated and its FIB entry installed, so that nothing has to be
  //undone on failure.
  //NOTE(review): key is a u32 while kp_init creates nodeport_by_key with a
  //u16 key size; confirm this is intended on big-endian hosts.
  key = clib_host_to_net_u16(node_port);
  entry = hash_get_mem (kpm->nodeport_by_key, &key);
  if (entry) {
    kp_put_writer_lock();
    return VNET_API_ERROR_VALUE_EXIST;
  }

  //Allocate
  pool_get(kpm->vips, vip);

  //Init
  vip->prefix = *prefix;
  vip->plen = plen;
  vip->port = clib_host_to_net_u16(port);
  vip->target_port = clib_host_to_net_u16(target_port);
  vip->node_port = clib_host_to_net_u16(node_port);
  vip->last_garbage_collection = (u32) vlib_time_now(vlib_get_main());
  vip->type = type;
  vip->flags = KP_VIP_FLAGS_USED;
  vip->pod_indexes = 0;

  //Validate counters
  u32 i;
  for (i = 0; i < KP_N_VIP_COUNTERS; i++) {
    vlib_validate_simple_counter(&kpm->vip_counters[i], vip - kpm->vips);
    vlib_zero_simple_counter(&kpm->vip_counters[i], vip - kpm->vips);
  }

  //Configure new flow table
  vip->new_flow_table_mask = new_length - 1;
  vip->new_flow_table = 0;

  //Create a new flow hash table full of the default entry
  kp_vip_update_new_flow_table(vip);

  //Create adjacency to direct traffic
  kp_vip_add_adjacency(kpm, vip);

  //Create mapping from nodeport to vip_index
  key_copy = clib_mem_alloc (sizeof (*key_copy));
  clib_memcpy (key_copy, &key, sizeof (*key_copy));
  hash_set_mem (kpm->nodeport_by_key, key_copy, vip - kpm->vips);

  /* receive packets destined to NodeIP:NodePort */
  udp_register_dst_port (vm, node_port, kp4_nodeport_node.index, 1);
  udp_register_dst_port (vm, node_port, kp6_nodeport_node.index, 0);

  //Return result
  *vip_index = vip - kpm->vips;

  kp_put_writer_lock();
  return 0;
}

/**
 * Delete a VIP: detach all of its PODs, remove its FIB entry and clear its
 * KP_VIP_FLAGS_USED flag. The VIP itself is not returned to the pool here.
 *
 * Takes and releases the writer lock.
 *
 * @return 0 on success, VNET_API_ERROR_NO_SUCH_ENTRY when the VIP does not
 *         exist.
 */
int kp_vip_del(u32 vip_index)
{
  kp_main_t *kpm = &kp_main;
  kp_vip_t *vip;
  ip46_address_t *addrs = 0;
  u32 *slot;

  kp_get_writer_lock();
  vip = kp_vip_get_by_index(vip_index);
  if (!vip) {
    kp_put_writer_lock();
    return VNET_API_ERROR_NO_SUCH_ENTRY;
  }

  //FIXME: This operation is actually not working
  //We will need to remove state before performing this.

  //Remove all PODs: gather their addresses first, then detach them at once
  pool_foreach(slot, vip->pod_indexes, {
      vec_add1(addrs, kpm->pods[*slot].address);
  });
  if (vec_len(addrs))
    kp_vip_del_pods_withlock(vip_index, addrs, vec_len(addrs));
  vec_free(addrs);

  //Delete adjacency
  kp_vip_del_adjacency(kpm, vip);

  //Mark the VIP as unused
  vip->flags &= ~KP_VIP_FLAGS_USED;

  kp_put_writer_lock();
  return 0;
}

/* Plugin registration record: build version and human-readable description. */
/* *INDENT-OFF* */
VLIB_PLUGIN_REGISTER () = {
    .version = VPP_BUILD_VER,
    .description = "kube-proxy data plane",
};
/* *INDENT-ON* */

/**
 * format callback of the kube-proxy DPO types: prints the VIP whose pool
 * index is the DPO index.
 *
 * Expected va_args: index_t (VIP index) and u32 (indent, unused).
 */
u8 *format_kp_dpo (u8 * s, va_list * va)
{
  kp_main_t *kpm = &kp_main;
  index_t vip_index = va_arg (*va, index_t);
  CLIB_UNUSED(u32 indent) = va_arg (*va, u32);
  kp_vip_t *vip;

  vip = pool_elt_at_index (kpm->vips, vip_index);
  s = format (s, "%U", format_kp_vip, vip);
  return s;
}

/* DPO lock callback: intentionally does nothing. */
static void
kp_dpo_lock (dpo_id_t *dpo)
{
}

/* DPO unlock callback: intentionally does nothing. */
static void
kp_dpo_unlock (dpo_id_t *dpo)
{
}

/**
 * FIB node callback: return the fib_node embedded in the POD found at
 * @p index in the global POD pool.
 */
static fib_node_t *
kp_fib_node_get_node (fib_node_index_t index)
{
  kp_pod_t *pod = pool_elt_at_index (kp_main.pods, index);

  return &pod->fib_node;
}

/* FIB node callback invoked when the last lock goes away: nothing to do. */
static void
kp_fib_node_last_lock_gone (fib_node_t *node)
{
  /* intentionally empty */
}

/**
 * Recover the POD that embeds @p node by stepping back from the fib_node
 * member to the start of the enclosing kp_pod_t.
 */
static kp_pod_t *
kp_pod_from_fib_node (fib_node_t *node)
{
  char *base = (char *) node;

  base -= STRUCT_OFFSET_OF(kp_pod_t, fib_node);
  return (kp_pod_t *) base;
}

/**
 * Stack the DPO of @p pod on the forwarding contributed by its next-hop FIB
 * entry. The DPO type follows the NAT flavour of the owning VIP and the DPO
 * protocol follows the address family of the VIP prefix.
 */
static void
kp_pod_stack (kp_pod_t *pod)
{
  kp_main_t *kpm = &kp_main;
  kp_vip_t *owner = &kpm->vips[pod->vip_index];

  dpo_stack(kp_vip_is_nat4(owner) ? kpm->dpo_nat4_type : kpm->dpo_nat6_type,
            kp_vip_is_ip4(owner) ? DPO_PROTO_IP4 : DPO_PROTO_IP6,
            &pod->dpo,
            fib_entry_contribute_ip_forwarding(pod->next_hop_fib_entry_index));
}

/**
 * FIB back-walk callback: restack the POD that embeds @p node, then let the
 * walk continue.
 */
static fib_node_back_walk_rc_t
kp_fib_node_back_walk_notify (fib_node_t *node,
			       fib_node_back_walk_ctx_t *ctx)
{
  kp_pod_t *pod = kp_pod_from_fib_node(node);

  kp_pod_stack(pod);
  return FIB_NODE_BACK_WALK_CONTINUE;
}

/**
 * Enable (is_del == 0) or disable (is_del != 0) the "kp-nat4-in2out"
 * feature of the "ip4-unicast" arc on an interface.
 *
 * @return always 0; the result of vnet_feature_enable_disable() is not
 *         propagated.
 */
int kp_nat4_interface_add_del (u32 sw_if_index, int is_del)
{
  int enable = is_del ? 0 : 1;

  vnet_feature_enable_disable ("ip4-unicast", "kp-nat4-in2out",
                               sw_if_index, enable, 0, 0);
  return 0;
}

/**
 * Plugin init function: set up kp_main.
 *
 * Allocates the per-thread state vector and the writer lock word, applies
 * the default sticky-bucket count and flow timeout, registers the two DPO
 * types and the FIB node type, creates the default POD as the first element
 * of the POD pool, creates the nodeport hash and the POD mapping bihash,
 * and names the VIP counters.
 *
 * @return always NULL.
 */
clib_error_t *
kp_init (vlib_main_t * vm)
{
  vlib_thread_main_t *tm = vlib_get_thread_main ();
  kp_main_t *kpm = &kp_main;
  kpm->vnet_main = vnet_get_main ();
  kpm->vlib_main = vm;

  kp_pod_t *default_pod;
  /* Callbacks used for the FIB node embedded in each POD */
  fib_node_vft_t kp_fib_node_vft = {
      .fnv_get = kp_fib_node_get_node,
      .fnv_last_lock = kp_fib_node_last_lock_gone,
      .fnv_back_walk = kp_fib_node_back_walk_notify,
  };
  /* Callbacks shared by the NAT4 and NAT6 DPO types */
  dpo_vft_t kp_vft = {
      .dv_lock = kp_dpo_lock,
      .dv_unlock = kp_dpo_unlock,
      .dv_format = format_kp_dpo,
  };

  kpm->vips = 0;
  kpm->per_cpu = 0;
  //One per_cpu entry per vlib main
  vec_validate(kpm->per_cpu, tm->n_vlib_mains - 1);
  //The lock word gets a cache line of its own
  kpm->writer_lock = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,  CLIB_CACHE_LINE_BYTES);
  kpm->writer_lock[0] = 0;
  kpm->per_cpu_sticky_buckets = KP_DEFAULT_PER_CPU_STICKY_BUCKETS;
  kpm->flow_timeout = KP_DEFAULT_FLOW_TIMEOUT;
  kpm->dpo_nat4_type = dpo_register_new_type(&kp_vft, kp_dpo_nat4_nodes);
  kpm->dpo_nat6_type = dpo_register_new_type(&kp_vft, kp_dpo_nat6_nodes);
  kpm->fib_node_type = fib_node_register_new_type(&kp_fib_node_vft);

  //Init POD reference counters
  vlib_refcount_init(&kpm->pod_refcount);

  //Allocate and init default POD.
  //It is the first element taken from the freshly emptied pool; its DPO
  //sends packets to KP_NEXT_DROP and it belongs to no VIP.
  kpm->pods = 0;
  pool_get(kpm->pods, default_pod);
  default_pod->flags = 0;
  default_pod->dpo.dpoi_next_node = KP_NEXT_DROP;
  default_pod->vip_index = ~0;
  default_pod->address.ip6.as_u64[0] = 0xffffffffffffffffL;
  default_pod->address.ip6.as_u64[1] = 0xffffffffffffffffL;

  //Node port to VIP index lookup, keyed by a u16
  kpm->nodeport_by_key
    = hash_create_mem (0, sizeof(u16), sizeof (uword));

  clib_bihash_init_8_8 (&kpm->mapping_by_pod,
                        "mapping_by_pod", KP_MAPPING_BUCKETS,
			KP_MAPPING_MEMORY_SIZE);

  //Give each VIP counter the name listed in kp_foreach_vip_counter
#define _(a,b,c) kpm->vip_counters[c].name = b;
  kp_foreach_vip_counter
#undef _
  return NULL;
}

VLIB_INIT_FUNCTION (kp_init);