/*
 * ethernet/arp.c: IP v4 ARP node
 *
 * Copyright (c) 2010 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/ip/ip.h>
#include <vnet/ip/ip6.h>
#include <vnet/ethernet/ethernet.h>
#include <vnet/ethernet/arp.h>
#include <vnet/l2/l2_input.h>
#include <vppinfra/mhash.h>
#include <vnet/fib/ip4_fib.h>
#include <vnet/fib/fib_entry_src.h>
#include <vnet/adj/adj_nbr.h>
#include <vnet/adj/adj_mcast.h>
#include <vnet/mpls/mpls.h>

/**
 * @file
 * @brief IPv4 ARP.
 *
 * This file contains code to manage the IPv4 ARP tables (IP Address
 * to MAC Address lookup).
 */


void vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length);

/**
 * @brief Per-interface ARP configuration and state
 */
typedef struct ethernet_arp_interface_t_
{
  /**
   * Hash table of ARP entries.
   * Since this hash table is per-interface, the key is only the IPv4 address.
   */
  uword *arp_entries;
} ethernet_arp_interface_t;

typedef struct
{
  ip4_address_t lo_addr;
  ip4_address_t hi_addr;
  u32 fib_index;
} ethernet_proxy_arp_t;

typedef struct
{
  u32 next_index;
  uword node_index;
  uword type_opaque;
  uword data;
  /* Used for arp event notification only */
  void *data_callback;
  u32 pid;
} pending_resolution_t;

typedef struct
{
  /* Hash tables mapping name to opcode. */
  uword *opcode_by_name;

  /* lite beer "glean" adjacency handling */
  uword *pending_resolutions_by_address;
  pending_resolution_t *pending_resolutions;

  /* Mac address change notification */
  uword *mac_changes_by_address;
  pending_resolution_t *mac_changes;

  ethernet_arp_ip4_entry_t *ip4_entry_pool;

  /* ARP attack mitigation */
  u32 arp_delete_rotor;
  u32 limit_arp_cache_size;

  /** Per interface state */
  ethernet_arp_interface_t *ethernet_arp_by_sw_if_index;

  /* Proxy arp vector */
  ethernet_proxy_arp_t *proxy_arps;

  uword wc_ip4_arp_publisher_node;
  uword wc_ip4_arp_publisher_et;
} ethernet_arp_main_t;

static ethernet_arp_main_t ethernet_arp_main;

typedef struct
{
  u32 sw_if_index;
  ethernet_arp_ip4_over_ethernet_address_t a;
  int is_static;
  int is_no_fib_entry;
  int flags;
#define ETHERNET_ARP_ARGS_REMOVE (1<<0)
#define ETHERNET_ARP_ARGS_FLUSH  (1<<1)
#define ETHERNET_ARP_ARGS_POPULATE  (1<<2)
#define ETHERNET_ARP_ARGS_WC_PUB  (1<<3)
} vnet_arp_set_ip4_over_ethernet_rpc_args_t;

static const u8 vrrp_prefix[] = { 0x00, 0x00, 0x5E, 0x00, 0x01 };

/* Node index for send_garp_na_process */
u32 send_garp_na_process_node_index;

static void
set_ip4_over_ethernet_rpc_callback (vnet_arp_set_ip4_over_ethernet_rpc_args_t
				    * a);

static u8 *
format_ethernet_arp_hardware_type (u8 * s, va_list * va)
{
  ethernet_arp_hardware_type_t h = va_arg (*va, ethernet_arp_hardware_type_t);
  char *t = 0;
  switch (h)
    {
#define _(n,f) case n: t = #f; break;
      foreach_ethernet_arp_hardware_type;
#undef _

    default:
      return format (s, "unknown 0x%x", h);
    }

  return format (s, "%s", t);
}

static u8 *
format_ethernet_arp_opcode (u8 * s, va_list * va)
{
  ethernet_arp_opcode_t o = va_arg (*va, ethernet_arp_opcode_t);
  char *t = 0;
  switch (o)
    {
#define _(f) case ETHERNET_ARP_OPCODE_##f: t = #f; break;
      foreach_ethernet_arp_opcode;
#undef _

    default:
      return format (s, "unknown 0x%x", o);
    }

  return format (s, "%s", t);
}

static uword
unformat_ethernet_arp_opcode_host_byte_order (unformat_input_t * input,
					      va_list * args)
{
  int *result = va_arg (*args, int *);
  ethernet_arp_main_t *am = &ethernet_arp_main;
  int x, i;

  /* Numeric opcode. */
  if (unformat (input, "0x%x", &x) || unformat (input, "%d", &x))
    {
      if (x >= (1 << 16))
	return 0;
      *result = x;
      return 1;
    }

  /* Named type. */
  if (unformat_user (input, unformat_vlib_number_by_name,
		     am->opcode_by_name, &i))
    {
      *result = i;
      return 1;
    }

  return 0;
}

static uword
unformat_ethernet_arp_opcode_net_byte_order (unformat_input_t * input,
					     va_list * args)
{
  int *result = va_arg (*args, int *);
  if (!unformat_user
      (input, unformat_ethernet_arp_opcode_host_byte_order, result))
    return 0;

  *result = clib_host_to_net_u16 ((u16) * result);
  return 1;
}

static u8 *
format_ethernet_arp_header (u8 * s, va_list * va)
{
  ethernet_arp_header_t *a = va_arg (*va, ethernet_arp_header_t *);
  u32 max_header_bytes = va_arg (*va, u32);
  u32 indent;
  u16 l2_type, l3_type;

  if (max_header_bytes != 0 && sizeof (a[0]) > max_header_bytes)
    return format (s, "ARP header truncated");

  l2_type = clib_net_to_host_u16 (a->l2_type);
  l3_type = clib_net_to_host_u16 (a->l3_type);

  indent = format_get_indent (s);

  s = format (s, "%U, type %U/%U, address size %d/%d",
	      format_ethernet_arp_opcode, clib_net_to_host_u16 (a->opcode),
	      format_ethernet_arp_hardware_type, l2_type,
	      format_ethernet_type, l3_type,
	      a->n_l2_address_bytes, a->n_l3_address_bytes);

  if (l2_type == ETHERNET_ARP_HARDWARE_TYPE_ethernet
      && l3_type == ETHERNET_TYPE_IP4)
    {
      s = format (s, "\n%U%U/%U -> %U/%U",
		  format_white_space, indent,
		  format_ethernet_address, a->ip4_over_ethernet[0].ethernet,
		  format_ip4_address, &a->ip4_over_ethernet[0].ip4,
		  format_ethernet_address, a->ip4_over_ethernet[1].ethernet,
		  format_ip4_address, &a->ip4_over_ethernet[1].ip4);
    }
  else
    {
      uword n2 = a->n_l2_address_bytes;
      uword n3 = a->n_l3_address_bytes;
      s = format (s, "\n%U%U/%U -> %U/%U",
		  format_white_space, indent,
		  format_hex_bytes, a->data + 0 * n2 + 0 * n3, n2,
		  format_hex_bytes, a->data + 1 * n2 + 0 * n3, n3,
		  format_hex_bytes, a->data + 1 * n2 + 1 * n3, n2,
		  format_hex_bytes, a->data + 2 * n2 + 1 * n3, n3);
    }

  return s;
}

u8 *
format_ethernet_arp_ip4_entry (u8 * s, va_list * va)
{
  vnet_main_t *vnm = va_arg (*va, vnet_main_t *);
  ethernet_arp_ip4_entry_t *e = va_arg (*va, ethernet_arp_ip4_entry_t *);
  vnet_sw_interface_t *si;
  u8 *flags = 0;

  if (!e)
    return format (s, "%=12s%=16s%=6s%=20s%=24s", "Time", "IP4",
		   "Flags", "Ethernet", "Interface");

  si = vnet_get_sw_interface (vnm, e->sw_if_index);

  if (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC)
    flags = format (flags, "S");

  if (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_DYNAMIC)
    flags = format (flags, "D");

  if (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_NO_FIB_ENTRY)
    flags = format (flags, "N");

  s = format (s, "%=12U%=16U%=6s%=20U%U",
	      format_vlib_time, vnm->vlib_main, e->time_last_updated,
	      format_ip4_address, &e->ip4_address,
	      flags ? (char *) flags : "",
	      format_ethernet_address, e->ethernet_address,
	      format_vnet_sw_interface_name, vnm, si);

  vec_free (flags);
  return s;
}

typedef struct
{
  u8 packet_data[64];
} ethernet_arp_input_trace_t;

static u8 *
format_ethernet_arp_input_trace (u8 * s, va_list * va)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
  ethernet_arp_input_trace_t *t = va_arg (*va, ethernet_arp_input_trace_t *);

  s = format (s, "%U",
	      format_ethernet_arp_header,
	      t->packet_data, sizeof (t->packet_data));

  return s;
}

static u8 *
format_arp_term_input_trace (u8 * s, va_list * va)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
  ethernet_arp_input_trace_t *t = va_arg (*va, ethernet_arp_input_trace_t *);

  /* arp-term trace data saved is either arp or ip6/icmp6 packet:
     - for arp, the 1st 16-bit field is hw type of value of 0x0001.
     - for ip6, the first nibble has value of 6. */
  s = format (s, "%U", t->packet_data[0] == 0 ?
	      format_ethernet_arp_header : format_ip6_header,
	      t->packet_data, sizeof (t->packet_data));

  return s;
}

static void
arp_nbr_probe (ip_adjacency_t * adj)
{
  vnet_main_t *vnm = vnet_get_main ();
  ip4_main_t *im = &ip4_main;
  ip_interface_address_t *ia;
  ethernet_arp_header_t *h;
  vnet_hw_interface_t *hi;
  vnet_sw_interface_t *si;
  ip4_address_t *src;
  vlib_buffer_t *b;
  vlib_main_t *vm;
  u32 bi = 0;

  vm = vlib_get_main ();

  si = vnet_get_sw_interface (vnm, adj->rewrite_header.sw_if_index);

  if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
    {
      return;
    }

  src =
    ip4_interface_address_matching_destination (im,
						&adj->sub_type.nbr.next_hop.
						ip4,
						adj->rewrite_header.
						sw_if_index, &ia);
  if (!src)
    {
      return;
    }

  h =
    vlib_packet_template_get_packet (vm, &im->ip4_arp_request_packet_template,
				     &bi);
  if (!h)
    return;

  hi = vnet_get_sup_hw_interface (vnm, adj->rewrite_header.sw_if_index);

  clib_memcpy (h->ip4_over_ethernet[0].ethernet,
	       hi->hw_address, sizeof (h->ip4_over_ethernet[0].ethernet));

  h->ip4_over_ethernet[0].ip4 = src[0];
  h->ip4_over_ethernet[1].ip4 = adj->sub_type.nbr.next_hop.ip4;

  b = vlib_get_buffer (vm, bi);
  vnet_buffer (b)->sw_if_index[VLIB_RX] =
    vnet_buffer (b)->sw_if_index[VLIB_TX] = adj->rewrite_header.sw_if_index;

  /* Add encapsulation string for software interface (e.g. ethernet header). */
  vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t));
  vlib_buffer_advance (b, -adj->rewrite_header.data_bytes);

  {
    vlib_frame_t *f = vlib_get_frame_to_node (vm, hi->output_node_index);
    u32 *to_next = vlib_frame_vector_args (f);
    to_next[0] = bi;
    f->n_vectors = 1;
    vlib_put_frame_to_node (vm, hi->output_node_index, f);
  }
}

static void
arp_mk_complete (adj_index_t ai, ethernet_arp_ip4_entry_t * e)
{
  adj_nbr_update_rewrite
    (ai, ADJ_NBR_REWRITE_FLAG_COMPLETE,
     ethernet_build_rewrite (vnet_get_main (),
			     e->sw_if_index,
			     adj_get_link_type (ai), e->ethernet_address));
}

static void
arp_mk_incomplete (adj_index_t ai)
{
  ip_adjacency_t *adj = adj_get (ai);

  adj_nbr_update_rewrite
    (ai,
     ADJ_NBR_REWRITE_FLAG_INCOMPLETE,
     ethernet_build_rewrite (vnet_get_main (),
			     adj->rewrite_header.sw_if_index,
			     VNET_LINK_ARP,
			     VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST));
}

static ethernet_arp_ip4_entry_t *
arp_entry_find (ethernet_arp_interface_t * eai, const ip4_address_t * addr)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_arp_ip4_entry_t *e = NULL;
  uword *p;

  if (NULL != eai->arp_entries)
    {
      p = hash_get (eai->arp_entries, addr->as_u32);
      if (!p)
	return (NULL);

      e = pool_elt_at_index (am->ip4_entry_pool, p[0]);
    }

  return (e);
}

static adj_walk_rc_t
arp_mk_complete_walk (adj_index_t ai, void *ctx)
{
  ethernet_arp_ip4_entry_t *e = ctx;

  arp_mk_complete (ai, e);

  return (ADJ_WALK_RC_CONTINUE);
}

static adj_walk_rc_t
arp_mk_incomplete_walk (adj_index_t ai, void *ctx)
{
  arp_mk_incomplete (ai);

  return (ADJ_WALK_RC_CONTINUE);
}

void
arp_update_adjacency (vnet_main_t * vnm, u32 sw_if_index, u32 ai)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_arp_interface_t *arp_int;
  ethernet_arp_ip4_entry_t *e;
  ip_adjacency_t *adj;

  adj = adj_get (ai);

  vec_validate (am->ethernet_arp_by_sw_if_index, sw_if_index);
  arp_int = &am->ethernet_arp_by_sw_if_index[sw_if_index];
  e = arp_entry_find (arp_int, &adj->sub_type.nbr.next_hop.ip4);

  switch (adj->lookup_next_index)
    {
    case IP_LOOKUP_NEXT_GLEAN:
      adj_glean_update_rewrite (ai);
      break;
    case IP_LOOKUP_NEXT_ARP:
      if (NULL != e)
	{
	  adj_nbr_walk_nh4 (sw_if_index,
			    &e->ip4_address, arp_mk_complete_walk, e);
	}
      else
	{
	  /*
	   * no matching ARP entry.
	   * construct the rewrite required to for an ARP packet, and stick
	   * that in the adj's pipe to smoke.
	   */
	  adj_nbr_update_rewrite
	    (ai,
	     ADJ_NBR_REWRITE_FLAG_INCOMPLETE,
	     ethernet_build_rewrite
	     (vnm,
	      sw_if_index,
	      VNET_LINK_ARP,
	      VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST));

	  /*
	   * since the FIB has added this adj for a route, it makes sense it
	   * may want to forward traffic sometime soon. Let's send a
	   * speculative ARP. just one. If we were to do periodically that
	   * wouldn't be bad either, but that's more code than i'm prepared to
	   * write at this time for relatively little reward.
	   */
	  arp_nbr_probe (adj);
	}
      break;
    case IP_LOOKUP_NEXT_MCAST:
      {
	/*
	 * Construct a partial rewrite from the known ethernet mcast dest MAC
	 */
	u8 *rewrite;
	u8 offset;

	rewrite = ethernet_build_rewrite (vnm,
					  sw_if_index,
					  adj->ia_link,
					  ethernet_ip4_mcast_dst_addr ());
	offset = vec_len (rewrite) - 2;

	/*
	 * Complete the remaining fields of the adj's rewrite to direct the
	 * complete of the rewrite at switch time by copying in the IP
	 * dst address's bytes.
	 * Ofset is 2 bytes into the MAC desintation address.
	 */
	adj_mcast_update_rewrite (ai, rewrite, offset);

	break;
      }
    case IP_LOOKUP_NEXT_DROP:
    case IP_LOOKUP_NEXT_PUNT:
    case IP_LOOKUP_NEXT_LOCAL:
    case IP_LOOKUP_NEXT_REWRITE:
    case IP_LOOKUP_NEXT_MCAST_MIDCHAIN:
    case IP_LOOKUP_NEXT_MIDCHAIN:
    case IP_LOOKUP_NEXT_ICMP_ERROR:
    case IP_LOOKUP_N_NEXT:
      ASSERT (0);
      break;
    }
}

static void
arp_adj_fib_add (ethernet_arp_ip4_entry_t * e, u32 fib_index)
{
  fib_prefix_t pfx = {
    .fp_len = 32,
    .fp_proto = FIB_PROTOCOL_IP4,
    .fp_addr.ip4 = e->ip4_address,
  };

  e->fib_entry_index =
    fib_table_entry_path_add (fib_index, &pfx, FIB_SOURCE_ADJ,
			      FIB_ENTRY_FLAG_ATTACHED,
			      DPO_PROTO_IP4, &pfx.fp_addr,
			      e->sw_if_index, ~0, 1, NULL,
			      FIB_ROUTE_PATH_FLAG_NONE);
  fib_table_lock (fib_index, FIB_PROTOCOL_IP4, FIB_SOURCE_ADJ);
}

void
arp_adj_fib_remove (ethernet_arp_ip4_entry_t * e, u32 fib_index)
{
  if (FIB_NODE_INDEX_INVALID != e->fib_entry_index)
    {
      fib_prefix_t pfx = {
	.fp_len = 32,
	.fp_proto = FIB_PROTOCOL_IP4,
	.fp_addr.ip4 = e->ip4_address,
      };
      u32 fib_index;

      fib_index = ip4_fib_table_get_index_for_sw_if_index (e->sw_if_index);

      fib_table_entry_path_remove (fib_index, &pfx,
				   FIB_SOURCE_ADJ,
				   DPO_PROTO_IP4,
				   &pfx.fp_addr,
				   e->sw_if_index, ~0, 1,
				   FIB_ROUTE_PATH_FLAG_NONE);
      fib_table_unlock (fib_index, FIB_PROTOCOL_IP4, FIB_SOURCE_ADJ);
    }
}

static ethernet_arp_ip4_entry_t *
force_reuse_arp_entry (void)
{
  ethernet_arp_ip4_entry_t *e;
  ethernet_arp_main_t *am = &ethernet_arp_main;
  u32 count = 0;
  u32 index = pool_next_index (am->ip4_entry_pool, am->arp_delete_rotor);
  if (index == ~0)		/* Try again from elt 0 */
    index = pool_next_index (am->ip4_entry_pool, index);

  /* Find a non-static random entry to free up for reuse */
  do
    {
      if ((count++ == 100) || (index == ~0))
	return NULL;		/* give up after 100 entries */
      e = pool_elt_at_index (am->ip4_entry_pool, index);
      am->arp_delete_rotor = index;
      index = pool_next_index (am->ip4_entry_pool, index);
    }
  while (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC);

  /* Remove ARP entry from its interface and update fib */
  hash_unset
    (am->ethernet_arp_by_sw_if_index[e->sw_if_index].arp_entries,
     e->ip4_address.as_u32);
  arp_adj_fib_remove
    (e, ip4_fib_table_get_index_for_sw_if_index (e->sw_if_index));
  adj_nbr_walk_nh4 (e->sw_if_index,
		    &e->ip4_address, arp_mk_incomplete_walk, NULL);
  return e;
}

static int
vnet_arp_set_ip4_over_ethernet_internal (vnet_main_t * vnm,
					 vnet_arp_set_ip4_over_ethernet_rpc_args_t
					 * args)
{
  ethernet_arp_ip4_entry_t *e = 0;
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_arp_ip4_over_ethernet_address_t *a = &args->a;
  vlib_main_t *vm = vlib_get_main ();
  int make_new_arp_cache_entry = 1;
  uword *p;
  pending_resolution_t *pr, *mc;
  ethernet_arp_interface_t *arp_int;
  int is_static = args->is_static;
  u32 sw_if_index = args->sw_if_index;
  int is_no_fib_entry = args->is_no_fib_entry;

  vec_validate (am->ethernet_arp_by_sw_if_index, sw_if_index);

  arp_int = &am->ethernet_arp_by_sw_if_index[sw_if_index];

  if (NULL != arp_int->arp_entries)
    {
      p = hash_get (arp_int->arp_entries, a->ip4.as_u32);
      if (p)
	{
	  e = pool_elt_at_index (am->ip4_entry_pool, p[0]);

	  /* Refuse to over-write static arp. */
	  if (!is_static && (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC))
	    return -2;
	  make_new_arp_cache_entry = 0;
	}
    }

  if (make_new_arp_cache_entry)
    {
      if (am->limit_arp_cache_size &&
	  pool_elts (am->ip4_entry_pool) >= am->limit_arp_cache_size)
	{
	  e = force_reuse_arp_entry ();
	  if (NULL == e)
	    return -2;
	}
      else
	pool_get (am->ip4_entry_pool, e);

      if (NULL == arp_int->arp_entries)
	arp_int->arp_entries = hash_create (0, sizeof (u32));

      hash_set (arp_int->arp_entries, a->ip4.as_u32, e - am->ip4_entry_pool);

      e->sw_if_index = sw_if_index;
      e->ip4_address = a->ip4;
      e->fib_entry_index = FIB_NODE_INDEX_INVALID;
      clib_memcpy (e->ethernet_address,
		   a->ethernet, sizeof (e->ethernet_address));

      if (!is_no_fib_entry)
	{
	  arp_adj_fib_add (e,
			   ip4_fib_table_get_index_for_sw_if_index
			   (e->sw_if_index));
	}
      else
	{
	  e->flags |= ETHERNET_ARP_IP4_ENTRY_FLAG_NO_FIB_ENTRY;
	}
    }
  else
    {
      /*
       * prevent a DoS attack from the data-plane that
       * spams us with no-op updates to the MAC address
       */
      if (0 == memcmp (e->ethernet_address,
		       a->ethernet, sizeof (e->ethernet_address)))
	{
	  e->time_last_updated = vlib_time_now (vm);
	  goto check_customers;
	}

      /* Update time stamp and ethernet address. */
      clib_memcpy (e->ethernet_address, a->ethernet,
		   sizeof (e->ethernet_address));
    }

  e->time_last_updated = vlib_time_now (vm);
  if (is_static)
    {
      e->flags &= ~ETHERNET_ARP_IP4_ENTRY_FLAG_DYNAMIC;
      e->flags |= ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC;
    }
  else
    {
      e->flags &= ~ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC;
      e->flags |= ETHERNET_ARP_IP4_ENTRY_FLAG_DYNAMIC;
    }

  adj_nbr_walk_nh4 (sw_if_index, &e->ip4_address, arp_mk_complete_walk, e);

check_customers:
  /* Customer(s) waiting for this address to be resolved? */
  p = hash_get (am->pending_resolutions_by_address, a->ip4.as_u32);
  if (p)
    {
      u32 next_index;
      next_index = p[0];

      while (next_index != (u32) ~ 0)
	{
	  pr = pool_elt_at_index (am->pending_resolutions, next_index);
	  vlib_process_signal_event (vm, pr->node_index,
				     pr->type_opaque, pr->data);
	  next_index = pr->next_index;
	  pool_put (am->pending_resolutions, pr);
	}

      hash_unset (am->pending_resolutions_by_address, a->ip4.as_u32);
    }

  /* Customer(s) requesting ARP event for this address? */
  p = hash_get (am->mac_changes_by_address, a->ip4.as_u32);
  if (p)
    {
      u32 next_index;
      next_index = p[0];

      while (next_index != (u32) ~ 0)
	{
	  int (*fp) (u32, u8 *, u32, u32);
	  int rv = 1;
	  mc = pool_elt_at_index (am->mac_changes, next_index);
	  fp = mc->data_callback;

	  /* Call the user's data callback, return 1 to suppress dup events */
	  if (fp)
	    rv = (*fp) (mc->data, a->ethernet, sw_if_index, 0);

	  /*
	   * Signal the resolver process, as long as the user
	   * says they want to be notified
	   */
	  if (rv == 0)
	    vlib_process_signal_event (vm, mc->node_index,
				       mc->type_opaque, mc->data);
	  next_index = mc->next_index;
	}
    }

  return 0;
}

void
vnet_register_ip4_arp_resolution_event (vnet_main_t * vnm,
					void *address_arg,
					uword node_index,
					uword type_opaque, uword data)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ip4_address_t *address = address_arg;
  uword *p;
  pending_resolution_t *pr;

  pool_get (am->pending_resolutions, pr);

  pr->next_index = ~0;
  pr->node_index = node_index;
  pr->type_opaque = type_opaque;
  pr->data = data;
  pr->data_callback = 0;

  p = hash_get (am->pending_resolutions_by_address, address->as_u32);
  if (p)
    {
      /* Insert new resolution at the head of the list */
      pr->next_index = p[0];
      hash_unset (am->pending_resolutions_by_address, address->as_u32);
    }

  hash_set (am->pending_resolutions_by_address, address->as_u32,
	    pr - am->pending_resolutions);
}

int
vnet_add_del_ip4_arp_change_event (vnet_main_t * vnm,
				   void *data_callback,
				   u32 pid,
				   void *address_arg,
				   uword node_index,
				   uword type_opaque, uword data, int is_add)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ip4_address_t *address = address_arg;

  /* Try to find an existing entry */
  u32 *first = (u32 *) hash_get (am->mac_changes_by_address, address->as_u32);
  u32 *p = first;
  pending_resolution_t *mc;
  while (p && *p != ~0)
    {
      mc = pool_elt_at_index (am->mac_changes, *p);
      if (mc->node_index == node_index && mc->type_opaque == type_opaque
	  && mc->pid == pid)
	break;
      p = &mc->next_index;
    }

  int found = p && *p != ~0;
  if (is_add)
    {
      if (found)
	return VNET_API_ERROR_ENTRY_ALREADY_EXISTS;

      pool_get (am->mac_changes, mc);
      *mc = (pending_resolution_t)
      {
      .next_index = ~0,.node_index = node_index,.type_opaque =
	  type_opaque,.data = data,.data_callback = data_callback,.pid =
	  pid,};

      /* Insert new resolution at the end of the list */
      u32 new_idx = mc - am->mac_changes;
      if (p)
	p[0] = new_idx;
      else
	hash_set (am->mac_changes_by_address, address->as_u32, new_idx);
    }
  else
    {
      if (!found)
	return VNET_API_ERROR_NO_SUCH_ENTRY;

      /* Clients may need to clean up pool entries, too */
      void (*fp) (u32, u8 *) = data_callback;
      if (fp)
	(*fp) (mc->data, 0 /* no new mac addrs */ );

      /* Remove the entry from the list and delete the entry */
      *p = mc->next_index;
      pool_put (am->mac_changes, mc);

      /* Remove from hash if we deleted the last entry */
      if (*p == ~0 && p == first)
	hash_unset (am->mac_changes_by_address, address->as_u32);
    }
  return 0;
}

/* Either we drop the packet or we send a reply to the sender. */
typedef enum
{
  ARP_INPUT_NEXT_DROP,
  ARP_INPUT_NEXT_REPLY_TX,
  ARP_INPUT_N_NEXT,
} arp_input_next_t;

#define foreach_ethernet_arp_error					\
  _ (replies_sent, "ARP replies sent")					\
  _ (l2_type_not_ethernet, "L2 type not ethernet")			\
  _ (l3_type_not_ip4, "L3 type not IP4")				\
  _ (l3_src_address_not_local, "IP4 source address not local to subnet") \
  _ (l3_dst_address_not_local, "IP4 destination address not local to subnet") \
  _ (l3_dst_address_unset, "IP4 destination address is unset")          \
  _ (l3_src_address_is_local, "IP4 source address matches local interface") \
  _ (l3_src_address_learned, "ARP request IP4 source address learned")  \
  _ (replies_received, "ARP replies received")				\
  _ (opcode_not_request, "ARP opcode not request")                      \
  _ (proxy_arp_replies_sent, "Proxy ARP replies sent")			\
  _ (l2_address_mismatch, "ARP hw addr does not match L2 frame src addr") \
  _ (gratuitous_arp, "ARP probe or announcement dropped") \
  _ (interface_no_table, "Interface is not mapped to an IP table") \
  _ (interface_not_ip_enabled, "Interface is not IP enabled") \

typedef enum
{
#define _(sym,string) ETHERNET_ARP_ERROR_##sym,
  foreach_ethernet_arp_error
#undef _
    ETHERNET_ARP_N_ERROR,
} ethernet_arp_input_error_t;

static int
arp_unnumbered (vlib_buffer_t * p0,
		u32 input_sw_if_index, u32 conn_sw_if_index)
{
  vnet_main_t *vnm = vnet_get_main ();
  vnet_interface_main_t *vim = &vnm->interface_main;
  vnet_sw_interface_t *si;

  /* verify that the input interface is unnumbered to the connected.
   * the connected interface is the interface on which the subnet is
   * configured */
  si = &vim->sw_interfaces[input_sw_if_index];

  if (!(si->flags & VNET_SW_INTERFACE_FLAG_UNNUMBERED &&
	(si->unnumbered_sw_if_index == conn_sw_if_index)))
    {
      /* the input interface is not unnumbered to the interface on which
       * the sub-net is configured that covers the ARP request.
       * So this is not the case for unnumbered.. */
      return 0;
    }

  return !0;
}

static u32
arp_learn (vnet_main_t * vnm,
	   ethernet_arp_main_t * am, u32 sw_if_index, void *addr)
{
  vnet_arp_set_ip4_over_ethernet (vnm, sw_if_index, addr, 0, 0);
  return (ETHERNET_ARP_ERROR_l3_src_address_learned);
}

static uword
arp_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  vnet_main_t *vnm = vnet_get_main ();
  ip4_main_t *im4 = &ip4_main;
  u32 n_left_from, next_index, *from, *to_next;
  u32 n_replies_sent = 0, n_proxy_arp_replies_sent = 0;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  if (node->flags & VLIB_NODE_FLAG_TRACE)
    vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
				   /* stride */ 1,
				   sizeof (ethernet_arp_input_trace_t));

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  vlib_buffer_t *p0;
	  vnet_hw_interface_t *hw_if0;
	  ethernet_arp_header_t *arp0;
	  ethernet_header_t *eth_rx, *eth_tx;
	  ip4_address_t *if_addr0, proxy_src;
	  u32 pi0, error0, next0, sw_if_index0, conn_sw_if_index0, fib_index0;
	  u8 is_request0, dst_is_local0, is_unnum0, is_vrrp_reply0;
	  ethernet_proxy_arp_t *pa;
	  fib_node_index_t dst_fei, src_fei;
	  fib_prefix_t pfx0;
	  fib_entry_flag_t src_flags, dst_flags;
	  u8 *rewrite0, rewrite0_len;

	  pi0 = from[0];
	  to_next[0] = pi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;
	  pa = 0;

	  p0 = vlib_get_buffer (vm, pi0);
	  arp0 = vlib_buffer_get_current (p0);
	  /* Fill in ethernet header. */
	  eth_rx = ethernet_buffer_get_header (p0);

	  is_request0 = arp0->opcode
	    == clib_host_to_net_u16 (ETHERNET_ARP_OPCODE_request);

	  error0 = ETHERNET_ARP_ERROR_replies_sent;

	  error0 =
	    (arp0->l2_type !=
	     clib_net_to_host_u16 (ETHERNET_ARP_HARDWARE_TYPE_ethernet) ?
	     ETHERNET_ARP_ERROR_l2_type_not_ethernet : error0);
	  error0 =
	    (arp0->l3_type !=
	     clib_net_to_host_u16 (ETHERNET_TYPE_IP4) ?
	     ETHERNET_ARP_ERROR_l3_type_not_ip4 : error0);
	  error0 =
	    (0 == arp0->ip4_over_ethernet[0].ip4.as_u32 ?
	     ETHERNET_ARP_ERROR_l3_dst_address_unset : error0);

	  sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];

	  /* not playing the ARP game if the interface is not IPv4 enabled */
	  error0 =
	    (im4->ip_enabled_by_sw_if_index[sw_if_index0] == 0 ?
	     ETHERNET_ARP_ERROR_interface_not_ip_enabled : error0);

	  if (error0)
	    goto drop2;

	  /* Check that IP address is local and matches incoming interface. */
	  fib_index0 = ip4_fib_table_get_index_for_sw_if_index (sw_if_index0);
	  if (~0 == fib_index0)
	    {
	      error0 = ETHERNET_ARP_ERROR_interface_no_table;
	      goto drop2;

	    }
	  dst_fei = ip4_fib_table_lookup (ip4_fib_get (fib_index0),
					  &arp0->ip4_over_ethernet[1].ip4,
					  32);
	  dst_flags = fib_entry_get_flags (dst_fei);

	  conn_sw_if_index0 = fib_entry_get_resolving_interface (dst_fei);

	  /* Honor unnumbered interface, if any */
	  is_unnum0 = sw_if_index0 != conn_sw_if_index0;

	  {
	    /*
	     * we're looking for FIB entries that indicate the source
	     * is attached. There may be more specific non-attached
	     * routes that match the source, but these do not influence
	     * whether we respond to an ARP request, i.e. they do not
	     * influence whether we are the correct way for the sender
	     * to reach us, they only affect how we reach the sender.
	     */
	    fib_entry_t *src_fib_entry;
	    fib_entry_src_t *src;
	    fib_source_t source;
	    fib_prefix_t pfx;
	    int attached;
	    int mask;

	    mask = 32;
	    attached = 0;

	    do
	      {
		src_fei = ip4_fib_table_lookup (ip4_fib_get (fib_index0),
						&arp0->
						ip4_over_ethernet[0].ip4,
						mask);
		src_fib_entry = fib_entry_get (src_fei);

		/*
		 * It's possible that the source that provides the
		 * flags we need, or the flags we must not have,
		 * is not the best source, so check then all.
		 */
                /* *INDENT-OFF* */
                FOR_EACH_SRC_ADDED(src_fib_entry, src, source,
                ({
                  src_flags = fib_entry_get_flags_for_source (src_fei, source);

                  /* Reject requests/replies with our local interface
                     address. */
                  if (FIB_ENTRY_FLAG_LOCAL & src_flags)
                    {
                      error0 = ETHERNET_ARP_ERROR_l3_src_address_is_local;
                      /*
                       * When VPP has an interface whose address is also
                       * applied to a TAP interface on the host, then VPP's
                       * TAP interface will be unnumbered  to the 'real'
                       * interface and do proxy ARP from the host.
                       * The curious aspect of this setup is that ARP requests
                       * from the host will come from the VPP's own address.
                       * So don't drop immediately here, instead go see if this
                       * is a proxy ARP case.
                       */
                      goto drop1;
                    }
                  /* A Source must also be local to subnet of matching
                   * interface address. */
                  if ((FIB_ENTRY_FLAG_ATTACHED & src_flags) ||
                      (FIB_ENTRY_FLAG_CONNECTED & src_flags))
                    {
                      attached = 1;
                      break;
                    }
                  /*
                   * else
                   *  The packet was sent from an address that is not
                   *  connected nor attached i.e. it is not from an
                   *  address that is covered by a link's sub-net,
                   *  nor is it a already learned host resp.
                   */
                }));
                /* *INDENT-ON* */

		/*
		 * shorter mask lookup for the next iteration.
		 */
		fib_entry_get_prefix (src_fei, &pfx);
		mask = pfx.fp_len - 1;

		/*
		 * continue until we hit the default route or we find
		 * the attached we are looking for. The most likely
		 * outcome is we find the attached with the first source
		 * on the first lookup.
		 */
	      }
	    while (!attached &&
		   !fib_entry_is_sourced (src_fei, FIB_SOURCE_DEFAULT_ROUTE));

	    if (!attached)
	      {
		/*
		 * the matching route is a not attached, i.e. it was
		 * added as a result of routing, rather than interface/ARP
		 * configuration. If the matching route is not a host route
		 * (i.e. a /32)
		 */
		error0 = ETHERNET_ARP_ERROR_l3_src_address_not_local;
		goto drop2;
	      }
	  }

	  if (fib_entry_is_sourced (dst_fei, FIB_SOURCE_ADJ))
	    {
	      /*
	       * We matched an adj-fib on ths source subnet (a /32 previously
	       * added as a result of ARP). If this request is a gratuitous
	       * ARP, then learn from it.
	       * The check for matching an adj-fib, is to prevent hosts
	       * from spamming us with gratuitous ARPS that might otherwise
	       * blow our ARP cache
	       */
	      if (arp0->ip4_over_ethernet[0].ip4.as_u32 ==
		  arp0->ip4_over_ethernet[1].ip4.as_u32)
		error0 = arp_learn (vnm, am, sw_if_index0,
				    &arp0->ip4_over_ethernet[0]);
	      goto drop2;
	    }
	  else if (!(FIB_ENTRY_FLAG_CONNECTED & dst_flags))
	    {
	      error0 = ETHERNET_ARP_ERROR_l3_dst_address_not_local;
	      goto drop1;
	    }

	  if (sw_if_index0 != fib_entry_get_resolving_interface (src_fei))
	    {
	      /*
	       * The interface the ARP was received on is not the interface
	       * on which the covering prefix is configured. Maybe this is a
	       * case for unnumbered.
	       */
	      is_unnum0 = 1;
	    }

	  dst_is_local0 = (FIB_ENTRY_FLAG_LOCAL & dst_flags);
	  fib_entry_get_prefix (dst_fei, &pfx0);
	  if_addr0 = &pfx0.fp_addr.ip4;

	  is_vrrp_reply0 =
	    ((arp0->opcode ==
	      clib_host_to_net_u16 (ETHERNET_ARP_OPCODE_reply))
	     &&
	     (!memcmp
	      (arp0->ip4_over_ethernet[0].ethernet, vrrp_prefix,
	       sizeof (vrrp_prefix))));

	  /* Trash ARP packets whose ARP-level source addresses do not
	     match their L2-frame-level source addresses, unless it's
	     a reply from a VRRP virtual router */
	  if (memcmp
	      (eth_rx->src_address, arp0->ip4_over_ethernet[0].ethernet,
	       sizeof (eth_rx->src_address)) && !is_vrrp_reply0)
	    {
	      error0 = ETHERNET_ARP_ERROR_l2_address_mismatch;
	      goto drop2;
	    }

	  /* Learn or update sender's mapping only for replies to addresses
	   * that are local to the subnet */
	  if (arp0->opcode ==
	      clib_host_to_net_u16 (ETHERNET_ARP_OPCODE_reply))
	    {
	      if (dst_is_local0)
		error0 = arp_learn (vnm, am, sw_if_index0,
				    &arp0->ip4_over_ethernet[0]);
	      else
		/* a reply for a non-local destination could be a GARP.
		 * GARPs for hosts we know were handled above, so this one
		 * we drop */
		error0 = ETHERNET_ARP_ERROR_l3_dst_address_not_local;

	      goto drop1;
	    }
	  else if (arp0->opcode ==
		   clib_host_to_net_u16 (ETHERNET_ARP_OPCODE_request) &&
		   (dst_is_local0 == 0))
	    {
	      goto drop1;
	    }

	send_reply:
	  /* Send a reply.
	     An adjacency to the sender is not always present,
	     so we use the interface to build us a rewrite string
	     which will contain all the necessary tags. */
	  rewrite0 = ethernet_build_rewrite (vnm, sw_if_index0,
					     VNET_LINK_ARP,
					     eth_rx->src_address);
	  rewrite0_len = vec_len (rewrite0);

	  /* Figure out how much to rewind current data from adjacency. */
	  vlib_buffer_advance (p0, -rewrite0_len);
	  eth_tx = vlib_buffer_get_current (p0);

	  vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0;
	  hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);

	  /* Send reply back through input interface */
	  vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0;
	  next0 = ARP_INPUT_NEXT_REPLY_TX;

	  arp0->opcode = clib_host_to_net_u16 (ETHERNET_ARP_OPCODE_reply);

	  arp0->ip4_over_ethernet[1] = arp0->ip4_over_ethernet[0];

	  clib_memcpy (arp0->ip4_over_ethernet[0].ethernet,
		       hw_if0->hw_address, 6);
	  clib_mem_unaligned (&arp0->ip4_over_ethernet[0].ip4.data_u32, u32) =
	    if_addr0->data_u32;

	  /* Hardware must be ethernet-like. */
	  ASSERT (vec_len (hw_if0->hw_address) == 6);

	  /* the rx nd tx ethernet headers wil overlap in the case
	   * when we received a tagged VLAN=0 packet, but we are sending
	   * back untagged */
	  clib_memcpy (eth_tx, rewrite0, vec_len (rewrite0));
	  vec_free (rewrite0);

	  if (NULL == pa)
	    {
	      if (is_unnum0)
		{
		  if (!arp_unnumbered (p0, sw_if_index0, conn_sw_if_index0))
		    goto drop2;
		}
	    }

	  /* We are going to reply to this request, so, in the absence of
	     errors, learn the sender */
	  if (!error0)
	    error0 = arp_learn (vnm, am, sw_if_index0,
				&arp0->ip4_over_ethernet[1]);

	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
					   n_left_to_next, pi0, next0);

	  n_replies_sent += 1;
	  continue;

	drop1:
	  if (arp0->ip4_over_ethernet[0].ip4.as_u32 ==
	      arp0->ip4_over_ethernet[1].ip4.as_u32)
	    {
	      error0 = ETHERNET_ARP_ERROR_gratuitous_arp;
	      goto drop2;
	    }
	  /* See if proxy arp is configured for the address */
	  if (is_request0)
	    {
	      vnet_sw_interface_t *si;
	      u32 this_addr = clib_net_to_host_u32
		(arp0->ip4_over_ethernet[1].ip4.as_u32);
	      u32 fib_index0;

	      si = vnet_get_sw_interface (vnm, sw_if_index0);

	      if (!(si->flags & VNET_SW_INTERFACE_FLAG_PROXY_ARP))
		goto drop2;

	      fib_index0 = vec_elt (im4->fib_index_by_sw_if_index,
				    sw_if_index0);

	      vec_foreach (pa, am->proxy_arps)
	      {
		u32 lo_addr = clib_net_to_host_u32 (pa->lo_addr.as_u32);
		u32 hi_addr = clib_net_to_host_u32 (pa->hi_addr.as_u32);

		/* an ARP request hit in the proxy-arp table? */
		if ((this_addr >= lo_addr && this_addr <= hi_addr) &&
		    (fib_index0 == pa->fib_index))
		  {
		    proxy_src.as_u32 =
		      arp0->ip4_over_ethernet[1].ip4.data_u32;

		    /*
		     * change the interface address to the proxied
		     */
		    if_addr0 = &proxy_src;
		    is_unnum0 = 0;
		    n_proxy_arp_replies_sent++;
		    goto send_reply;
		  }
	      }
	    }

	drop2:

	  next0 = ARP_INPUT_NEXT_DROP;
	  p0->error = node->errors[error0];

	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
					   n_left_to_next, pi0, next0);
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  vlib_error_count (vm, node->node_index,
		    ETHERNET_ARP_ERROR_replies_sent,
		    n_replies_sent - n_proxy_arp_replies_sent);

  vlib_error_count (vm, node->node_index,
		    ETHERNET_ARP_ERROR_proxy_arp_replies_sent,
		    n_proxy_arp_replies_sent);
  return frame->n_vectors;
}

static char *ethernet_arp_error_strings[] = {
#define _(sym,string) string,
  foreach_ethernet_arp_error
#undef _
};

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (arp_input_node, static) =
{
  .function = arp_input,
  .name = "arp-input",
  .vector_size = sizeof (u32),
  .n_errors = ETHERNET_ARP_N_ERROR,
  .error_strings = ethernet_arp_error_strings,
  .n_next_nodes = ARP_INPUT_N_NEXT,
  .next_nodes = {
    [ARP_INPUT_NEXT_DROP] = "error-drop",
    [ARP_INPUT_NEXT_REPLY_TX] = "interface-output",
  },
  .format_buffer = format_ethernet_arp_header,
  .format_trace = format_ethernet_arp_input_trace,
};
/* *INDENT-ON* */

static int
ip4_arp_entry_sort (void *a1, void *a2)
{
  ethernet_arp_ip4_entry_t *e1 = a1;
  ethernet_arp_ip4_entry_t *e2 = a2;

  int cmp;
  vnet_main_t *vnm = vnet_get_main ();

  cmp = vnet_sw_interface_compare (vnm, e1->sw_if_index, e2->sw_if_index);
  if (!cmp)
    cmp = ip4_address_compare (&e1->ip4_address, &e2->ip4_address);
  return cmp;
}

ethernet_arp_ip4_entry_t *
ip4_neighbors_pool (void)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  return am->ip4_entry_pool;
}

ethernet_arp_ip4_entry_t *
ip4_neighbor_entries (u32 sw_if_index)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_arp_ip4_entry_t *n, *ns = 0;

  /* *INDENT-OFF* */
  pool_foreach (n, am->ip4_entry_pool, ({
    if (sw_if_index != ~0 && n->sw_if_index != sw_if_index)
      continue;
    vec_add1 (ns, n[0]);
  }));
  /* *INDENT-ON* */

  if (ns)
    vec_sort_with_function (ns, ip4_arp_entry_sort);
  return ns;
}

static clib_error_t *
show_ip4_arp (vlib_main_t * vm,
	      unformat_input_t * input, vlib_cli_command_t * cmd)
{
  vnet_main_t *vnm = vnet_get_main ();
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_arp_ip4_entry_t *e, *es;
  ethernet_proxy_arp_t *pa;
  clib_error_t *error = 0;
  u32 sw_if_index;

  /* Filter entries by interface if given. */
  sw_if_index = ~0;
  (void) unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index);

  es = ip4_neighbor_entries (sw_if_index);
  if (es)
    {
      vlib_cli_output (vm, "%U", format_ethernet_arp_ip4_entry, vnm, 0);
      vec_foreach (e, es)
      {
	vlib_cli_output (vm, "%U", format_ethernet_arp_ip4_entry, vnm, e);
      }
      vec_free (es);
    }

  if (vec_len (am->proxy_arps))
    {
      vlib_cli_output (vm, "Proxy arps enabled for:");
      vec_foreach (pa, am->proxy_arps)
      {
	vlib_cli_output (vm, "Fib_index %d   %U - %U ",
			 pa->fib_index,
			 format_ip4_address, &pa->lo_addr,
			 format_ip4_address, &pa->hi_addr);
      }
    }

  return error;
}

/*?
 * Display all the IPv4 ARP entries.
 *
 * @cliexpar
 * Example of how to display the IPv4 ARP table:
 * @cliexstart{show ip arp}
 *    Time      FIB        IP4       Flags      Ethernet              Interface
 *    346.3028   0       6.1.1.3            de:ad:be:ef:ba:be   GigabitEthernet2/0/0
 *   3077.4271   0       6.1.1.4       S    de:ad:be:ef:ff:ff   GigabitEthernet2/0/0
 *   2998.6409   1       6.2.2.3            de:ad:be:ef:00:01   GigabitEthernet2/0/0
 * Proxy arps enabled for:
 * Fib_index 0   6.0.0.1 - 6.0.0.11
 * @cliexend
 ?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_ip4_arp_command, static) = {
  .path = "show ip arp",
  .function = show_ip4_arp,
  .short_help = "show ip arp",
};
/* *INDENT-ON* */

typedef struct
{
  pg_edit_t l2_type, l3_type;
  pg_edit_t n_l2_address_bytes, n_l3_address_bytes;
  pg_edit_t opcode;
  struct
  {
    pg_edit_t ethernet;
    pg_edit_t ip4;
  } ip4_over_ethernet[2];
} pg_ethernet_arp_header_t;

static inline void
pg_ethernet_arp_header_init (pg_ethernet_arp_header_t * p)
{
  /* Initialize fields that are not bit fields in the IP header. */
#define _(f) pg_edit_init (&p->f, ethernet_arp_header_t, f);
  _(l2_type);
  _(l3_type);
  _(n_l2_address_bytes);
  _(n_l3_address_bytes);
  _(opcode);
  _(ip4_over_ethernet[0].ethernet);
  _(ip4_over_ethernet[0].ip4);
  _(ip4_over_ethernet[1].ethernet);
  _(ip4_over_ethernet[1].ip4);
#undef _
}

uword
unformat_pg_arp_header (unformat_input_t * input, va_list * args)
{
  pg_stream_t *s = va_arg (*args, pg_stream_t *);
  pg_ethernet_arp_header_t *p;
  u32 group_index;

  p = pg_create_edit_group (s, sizeof (p[0]), sizeof (ethernet_arp_header_t),
			    &group_index);
  pg_ethernet_arp_header_init (p);

  /* Defaults. */
  pg_edit_set_fixed (&p->l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet);
  pg_edit_set_fixed (&p->l3_type, ETHERNET_TYPE_IP4);
  pg_edit_set_fixed (&p->n_l2_address_bytes, 6);
  pg_edit_set_fixed (&p->n_l3_address_bytes, 4);

  if (!unformat (input, "%U: %U/%U -> %U/%U",
		 unformat_pg_edit,
		 unformat_ethernet_arp_opcode_net_byte_order, &p->opcode,
		 unformat_pg_edit,
		 unformat_ethernet_address, &p->ip4_over_ethernet[0].ethernet,
		 unformat_pg_edit,
		 unformat_ip4_address, &p->ip4_over_ethernet[0].ip4,
		 unformat_pg_edit,
		 unformat_ethernet_address, &p->ip4_over_ethernet[1].ethernet,
		 unformat_pg_edit,
		 unformat_ip4_address, &p->ip4_over_ethernet[1].ip4))
    {
      /* Free up any edits we may have added. */
      pg_free_edit_group (s);
      return 0;
    }
  return 1;
}

clib_error_t *
ip4_set_arp_limit (u32 arp_limit)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;

  am->limit_arp_cache_size = arp_limit;
  return 0;
}

/**
 * @brief Control Plane hook to remove an ARP entry
 */
int
vnet_arp_unset_ip4_over_ethernet (vnet_main_t * vnm,
				  u32 sw_if_index, void *a_arg)
{
  ethernet_arp_ip4_over_ethernet_address_t *a = a_arg;
  vnet_arp_set_ip4_over_ethernet_rpc_args_t args;

  args.sw_if_index = sw_if_index;
  args.flags = ETHERNET_ARP_ARGS_REMOVE;
  clib_memcpy (&args.a, a, sizeof (*a));

  vl_api_rpc_call_main_thread (set_ip4_over_ethernet_rpc_callback,
			       (u8 *) & args, sizeof (args));
  return 0;
}

/**
 * @brief Internally generated event to flush the ARP cache on an
 * interface state change event.
 * A flush will remove dynamic ARP entries, and for statics remove the MAC
 * address from the corresponding adjacencies.
 */
static int
vnet_arp_flush_ip4_over_ethernet (vnet_main_t * vnm,
				  u32 sw_if_index, void *a_arg)
{
  ethernet_arp_ip4_over_ethernet_address_t *a = a_arg;
  vnet_arp_set_ip4_over_ethernet_rpc_args_t args;

  args.sw_if_index = sw_if_index;
  args.flags = ETHERNET_ARP_ARGS_FLUSH;
  clib_memcpy (&args.a, a, sizeof (*a));

  vl_api_rpc_call_main_thread (set_ip4_over_ethernet_rpc_callback,
			       (u8 *) & args, sizeof (args));
  return 0;
}

/**
 * @brief Internally generated event to populate the ARP cache on an
 * interface state change event.
 * For static entries this will re-source the adjacencies.
 *
 * @param sw_if_index The interface on which the ARP entires are acted
 */
static int
vnet_arp_populate_ip4_over_ethernet (vnet_main_t * vnm,
				     u32 sw_if_index, void *a_arg)
{
  ethernet_arp_ip4_over_ethernet_address_t *a = a_arg;
  vnet_arp_set_ip4_over_ethernet_rpc_args_t args;

  args.sw_if_index = sw_if_index;
  args.flags = ETHERNET_ARP_ARGS_POPULATE;
  clib_memcpy (&args.a, a, sizeof (*a));

  vl_api_rpc_call_main_thread (set_ip4_over_ethernet_rpc_callback,
			       (u8 *) & args, sizeof (args));
  return 0;
}

/**
 * @brief publish wildcard arp event
 * @param sw_if_index The interface on which the ARP entires are acted
 */
static int
vnet_arp_wc_publish (u32 sw_if_index, void *a_arg)
{
  ethernet_arp_ip4_over_ethernet_address_t *a = a_arg;
  vnet_arp_set_ip4_over_ethernet_rpc_args_t args = {
    .flags = ETHERNET_ARP_ARGS_WC_PUB,
    .sw_if_index = sw_if_index,
    .a = *a
  };

  vl_api_rpc_call_main_thread (set_ip4_over_ethernet_rpc_callback,
			       (u8 *) & args, sizeof (args));
  return 0;
}

static void
vnet_arp_wc_publish_internal (vnet_main_t * vnm,
			      vnet_arp_set_ip4_over_ethernet_rpc_args_t *
			      args)
{
  vlib_main_t *vm = vlib_get_main ();
  ethernet_arp_main_t *am = &ethernet_arp_main;
  uword ni = am->wc_ip4_arp_publisher_node;
  uword et = am->wc_ip4_arp_publisher_et;

  if (ni == (uword) ~ 0)
    return;
  wc_arp_report_t *r =
    vlib_process_signal_event_data (vm, ni, et, 1, sizeof *r);
  r->ip4 = args->a.ip4.as_u32;
  r->sw_if_index = args->sw_if_index;
  memcpy (r->mac, args->a.ethernet, sizeof r->mac);
}

void
wc_arp_set_publisher_node (uword node_index, uword event_type)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  am->wc_ip4_arp_publisher_node = node_index;
  am->wc_ip4_arp_publisher_et = event_type;
}

/*
 * arp_add_del_interface_address
 *
 * callback when an interface address is added or deleted
 */
static void
arp_add_del_interface_address (ip4_main_t * im,
			       uword opaque,
			       u32 sw_if_index,
			       ip4_address_t * address,
			       u32 address_length,
			       u32 if_address_index, u32 is_del)
{
  /*
   * Flush the ARP cache of all entries covered by the address
   * that is being removed.
   */
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_arp_ip4_entry_t *e;

  if (vec_len (am->ethernet_arp_by_sw_if_index) <= sw_if_index)
    return;

  if (is_del)
    {
      ethernet_arp_interface_t *eai;
      u32 i, *to_delete = 0;
      hash_pair_t *pair;

      eai = &am->ethernet_arp_by_sw_if_index[sw_if_index];

      /* *INDENT-OFF* */
      hash_foreach_pair (pair, eai->arp_entries,
      ({
	e = pool_elt_at_index(am->ip4_entry_pool,
			      pair->value[0]);
	if (ip4_destination_matches_route (im, &e->ip4_address,
					   address, address_length))
	  {
	    vec_add1 (to_delete, e - am->ip4_entry_pool);
	  }
      }));
      /* *INDENT-ON* */

      for (i = 0; i < vec_len (to_delete); i++)
	{
	  ethernet_arp_ip4_over_ethernet_address_t delme;
	  e = pool_elt_at_index (am->ip4_entry_pool, to_delete[i]);

	  clib_memcpy (&delme.ethernet, e->ethernet_address, 6);
	  delme.ip4.as_u32 = e->ip4_address.as_u32;

	  vnet_arp_flush_ip4_over_ethernet (vnet_get_main (),
					    e->sw_if_index, &delme);
	}

      vec_free (to_delete);
    }
}

static void
arp_table_bind (ip4_main_t * im,
		uword opaque,
		u32 sw_if_index, u32 new_fib_index, u32 old_fib_index)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_arp_interface_t *eai;
  ethernet_arp_ip4_entry_t *e;
  hash_pair_t *pair;

  /*
   * the IP table that the interface is bound to has changed.
   * reinstall all the adj fibs.
   */

  if (vec_len (am->ethernet_arp_by_sw_if_index) <= sw_if_index)
    return;

  eai = &am->ethernet_arp_by_sw_if_index[sw_if_index];

  /* *INDENT-OFF* */
  hash_foreach_pair (pair, eai->arp_entries,
  ({
    e = pool_elt_at_index(am->ip4_entry_pool,
                          pair->value[0]);
    /*
     * remove the adj-fib from the old table and add to the new
     */
    arp_adj_fib_remove(e, old_fib_index);
    arp_adj_fib_add(e, new_fib_index);
  }));
  /* *INDENT-ON* */

}

static clib_error_t *
ethernet_arp_init (vlib_main_t * vm)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ip4_main_t *im = &ip4_main;
  clib_error_t *error;
  pg_node_t *pn;

  if ((error = vlib_call_init_function (vm, ethernet_init)))
    return error;

  ethernet_register_input_type (vm, ETHERNET_TYPE_ARP, arp_input_node.index);

  pn = pg_get_node (arp_input_node.index);
  pn->unformat_edit = unformat_pg_arp_header;

  am->opcode_by_name = hash_create_string (0, sizeof (uword));
#define _(o) hash_set_mem (am->opcode_by_name, #o, ETHERNET_ARP_OPCODE_##o);
  foreach_ethernet_arp_opcode;
#undef _

  /* $$$ configurable */
  am->limit_arp_cache_size = 50000;

  am->pending_resolutions_by_address = hash_create (0, sizeof (uword));
  am->mac_changes_by_address = hash_create (0, sizeof (uword));
  am->wc_ip4_arp_publisher_node = (uword) ~ 0;

  /* don't trace ARP error packets */
  {
    vlib_node_runtime_t *rt =
      vlib_node_get_runtime (vm, arp_input_node.index);

#define _(a,b)                                  \
    vnet_pcap_drop_trace_filter_add_del         \
        (rt->errors[ETHERNET_ARP_ERROR_##a],    \
         1 /* is_add */);
    foreach_ethernet_arp_error
#undef _
  }

  ip4_add_del_interface_address_callback_t cb;
  cb.function = arp_add_del_interface_address;
  cb.function_opaque = 0;
  vec_add1 (im->add_del_interface_address_callbacks, cb);

  ip4_table_bind_callback_t cbt;
  cbt.function = arp_table_bind;
  cbt.function_opaque = 0;
  vec_add1 (im->table_bind_callbacks, cbt);

  return 0;
}

VLIB_INIT_FUNCTION (ethernet_arp_init);

static void
arp_entry_free (ethernet_arp_interface_t * eai, ethernet_arp_ip4_entry_t * e)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;

  arp_adj_fib_remove
    (e, ip4_fib_table_get_index_for_sw_if_index (e->sw_if_index));
  hash_unset (eai->arp_entries, e->ip4_address.as_u32);
  pool_put (am->ip4_entry_pool, e);
}

static inline int
vnet_arp_unset_ip4_over_ethernet_internal (vnet_main_t * vnm,
					   vnet_arp_set_ip4_over_ethernet_rpc_args_t
					   * args)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_arp_ip4_entry_t *e;
  ethernet_arp_interface_t *eai;

  if (vec_len (am->ethernet_arp_by_sw_if_index) <= args->sw_if_index)
    return 0;

  eai = &am->ethernet_arp_by_sw_if_index[args->sw_if_index];

  e = arp_entry_find (eai, &args->a.ip4);

  if (NULL != e)
    {
      adj_nbr_walk_nh4 (e->sw_if_index,
			&e->ip4_address, arp_mk_incomplete_walk, NULL);
      arp_entry_free (eai, e);
    }

  return 0;
}

static int
vnet_arp_flush_ip4_over_ethernet_internal (vnet_main_t * vnm,
					   vnet_arp_set_ip4_over_ethernet_rpc_args_t
					   * args)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_arp_ip4_entry_t *e;
  ethernet_arp_interface_t *eai;

  if (vec_len (am->ethernet_arp_by_sw_if_index) <= args->sw_if_index)
    return 0;

  eai = &am->ethernet_arp_by_sw_if_index[args->sw_if_index];

  e = arp_entry_find (eai, &args->a.ip4);

  if (NULL != e)
    {
      adj_nbr_walk_nh4 (e->sw_if_index,
			&e->ip4_address, arp_mk_incomplete_walk, e);

      /*
       * The difference between flush and unset, is that an unset
       * means delete for static and dynamic entries. A flush
       * means delete only for dynamic. Flushing is what the DP
       * does in response to interface events. unset is only done
       * by the control plane.
       */
      if (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC)
	{
	  e->flags &= ~ETHERNET_ARP_IP4_ENTRY_FLAG_DYNAMIC;
	}
      else if (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_DYNAMIC)
	{
	  arp_entry_free (eai, e);
	}
    }
  return (0);
}

static int
vnet_arp_populate_ip4_over_ethernet_internal (vnet_main_t * vnm,
					      vnet_arp_set_ip4_over_ethernet_rpc_args_t
					      * args)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_arp_ip4_entry_t *e;
  ethernet_arp_interface_t *eai;

  vec_validate (am->ethernet_arp_by_sw_if_index, args->sw_if_index);
  eai = &am->ethernet_arp_by_sw_if_index[args->sw_if_index];

  e = arp_entry_find (eai, &args->a.ip4);

  if (NULL != e)
    {
      adj_nbr_walk_nh4 (e->sw_if_index,
			&e->ip4_address, arp_mk_complete_walk, e);
    }
  return (0);
}

static void
set_ip4_over_ethernet_rpc_callback (vnet_arp_set_ip4_over_ethernet_rpc_args_t
				    * a)
{
  vnet_main_t *vm = vnet_get_main ();
  ASSERT (vlib_get_thread_index () == 0);

  if (a->flags & ETHERNET_ARP_ARGS_REMOVE)
    vnet_arp_unset_ip4_over_ethernet_internal (vm, a);
  else if (a->flags & ETHERNET_ARP_ARGS_FLUSH)
    vnet_arp_flush_ip4_over_ethernet_internal (vm, a);
  else if (a->flags & ETHERNET_ARP_ARGS_POPULATE)
    vnet_arp_populate_ip4_over_ethernet_internal (vm, a);
  else if (a->flags & ETHERNET_ARP_ARGS_WC_PUB)
    vnet_arp_wc_publish_internal (vm, a);
  else
    vnet_arp_set_ip4_over_ethernet_internal (vm, a);
}

/**
 * @brief Invoked when the interface's admin state changes
 */
static clib_error_t *
ethernet_arp_sw_interface_up_down (vnet_main_t * vnm,
				   u32 sw_if_index, u32 flags)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_arp_ip4_entry_t *e;
  u32 i, *to_delete = 0;

  /* *INDENT-OFF* */
  pool_foreach (e, am->ip4_entry_pool,
  ({
    if (e->sw_if_index == sw_if_index)
      vec_add1 (to_delete,
		e - am->ip4_entry_pool);
  }));
  /* *INDENT-ON* */

  for (i = 0; i < vec_len (to_delete); i++)
    {
      ethernet_arp_ip4_over_ethernet_address_t delme;
      e = pool_elt_at_index (am->ip4_entry_pool, to_delete[i]);

      clib_memcpy (&delme.ethernet, e->ethernet_address, 6);
      delme.ip4.as_u32 = e->ip4_address.as_u32;

      if (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)
	{
	  vnet_arp_populate_ip4_over_ethernet (vnm, e->sw_if_index, &delme);
	}
      else
	{
	  vnet_arp_flush_ip4_over_ethernet (vnm, e->sw_if_index, &delme);
	}

    }
  vec_free (to_delete);

  return 0;
}

VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ethernet_arp_sw_interface_up_down);

static void
increment_ip4_and_mac_address (ethernet_arp_ip4_over_ethernet_address_t * a)
{
  u8 old;
  int i;

  for (i = 3; i >= 0; i--)
    {
      old = a->ip4.as_u8[i];
      a->ip4.as_u8[i] += 1;
      if (old < a->ip4.as_u8[i])
	break;
    }

  for (i = 5; i >= 0; i--)
    {
      old = a->ethernet[i];
      a->ethernet[i] += 1;
      if (old < a->ethernet[i])
	break;
    }
}

int
vnet_arp_set_ip4_over_ethernet (vnet_main_t * vnm,
				u32 sw_if_index, void *a_arg,
				int is_static, int is_no_fib_entry)
{
  ethernet_arp_ip4_over_ethernet_address_t *a = a_arg;
  vnet_arp_set_ip4_over_ethernet_rpc_args_t args;

  args.sw_if_index = sw_if_index;
  args.is_static = is_static;
  args.is_no_fib_entry = is_no_fib_entry;
  args.flags = 0;
  clib_memcpy (&args.a, a, sizeof (*a));

  vl_api_rpc_call_main_thread (set_ip4_over_ethernet_rpc_callback,
			       (u8 *) & args, sizeof (args));
  return 0;
}

void
proxy_arp_walk (proxy_arp_walk_t cb, void *data)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_proxy_arp_t *pa;

  vec_foreach (pa, am->proxy_arps)
  {
    if (!cb (&pa->lo_addr, &pa->hi_addr, pa->fib_index, data))
      break;
  }
}

int
vnet_proxy_arp_add_del (ip4_address_t * lo_addr,
			ip4_address_t * hi_addr, u32 fib_index, int is_del)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_proxy_arp_t *pa;
  u32 found_at_index = ~0;

  vec_foreach (pa, am->proxy_arps)
  {
    if (pa->lo_addr.as_u32 == lo_addr->as_u32 &&
	pa->hi_addr.as_u32 == hi_addr->as_u32 && pa->fib_index == fib_index)
      {
	found_at_index = pa - am->proxy_arps;
	break;
      }
  }

  if (found_at_index != ~0)
    {
      /* Delete, otherwise it's already in the table */
      if (is_del)
	vec_delete (am->proxy_arps, 1, found_at_index);
      return 0;
    }
  /* delete, no such entry */
  if (is_del)
    return VNET_API_ERROR_NO_SUCH_ENTRY;

  /* add, not in table */
  vec_add2 (am->proxy_arps, pa, 1);
  pa->lo_addr.as_u32 = lo_addr->as_u32;
  pa->hi_addr.as_u32 = hi_addr->as_u32;
  pa->fib_index = fib_index;
  return 0;
}

/*
 * Remove any proxy arp entries asdociated with the
 * specificed fib.
 */
int
vnet_proxy_arp_fib_reset (u32 fib_id)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_proxy_arp_t *pa;
  u32 *entries_to_delete = 0;
  u32 fib_index;
  int i;

  fib_index = fib_table_find (FIB_PROTOCOL_IP4, fib_id);
  if (~0 == fib_index)
    return VNET_API_ERROR_NO_SUCH_ENTRY;

  vec_foreach (pa, am->proxy_arps)
  {
    if (pa->fib_index == fib_index)
      {
	vec_add1 (entries_to_delete, pa - am->proxy_arps);
      }
  }

  for (i = 0; i < vec_len (entries_to_delete); i++)
    {
      vec_delete (am->proxy_arps, 1, entries_to_delete[i]);
    }

  vec_free (entries_to_delete);

  return 0;
}

static clib_error_t *
ip_arp_add_del_command_fn (vlib_main_t * vm,
			   unformat_input_t * input, vlib_cli_command_t * cmd)
{
  vnet_main_t *vnm = vnet_get_main ();
  u32 sw_if_index;
  ethernet_arp_ip4_over_ethernet_address_t lo_addr, hi_addr, addr;
  int addr_valid = 0;
  int is_del = 0;
  int count = 1;
  u32 fib_index = 0;
  u32 fib_id;
  int is_static = 0;
  int is_no_fib_entry = 0;
  int is_proxy = 0;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      /* set ip arp TenGigE1/1/0/1 1.2.3.4 aa:bb:... or aabb.ccdd... */
      if (unformat (input, "%U %U %U",
		    unformat_vnet_sw_interface, vnm, &sw_if_index,
		    unformat_ip4_address, &addr.ip4,
		    unformat_ethernet_address, &addr.ethernet))
	addr_valid = 1;

      else if (unformat (input, "delete") || unformat (input, "del"))
	is_del = 1;

      else if (unformat (input, "static"))
	is_static = 1;

      else if (unformat (input, "no-fib-entry"))
	is_no_fib_entry = 1;

      else if (unformat (input, "count %d", &count))
	;

      else if (unformat (input, "fib-id %d", &fib_id))
	{
	  fib_index = fib_table_find (FIB_PROTOCOL_IP4, fib_id);

	  if (~0 == fib_index)
	    return clib_error_return (0, "fib ID %d doesn't exist\n", fib_id);
	}

      else if (unformat (input, "proxy %U - %U",
			 unformat_ip4_address, &lo_addr.ip4,
			 unformat_ip4_address, &hi_addr.ip4))
	is_proxy = 1;
      else
	break;
    }

  if (is_proxy)
    {
      (void) vnet_proxy_arp_add_del (&lo_addr.ip4, &hi_addr.ip4,
				     fib_index, is_del);
      return 0;
    }

  if (addr_valid)
    {
      int i;

      for (i = 0; i < count; i++)
	{
	  if (is_del == 0)
	    {
	      uword event_type, *event_data = 0;

	      /* Park the debug CLI until the arp entry is installed */
	      vnet_register_ip4_arp_resolution_event
		(vnm, &addr.ip4, vlib_current_process (vm),
		 1 /* type */ , 0 /* data */ );

	      vnet_arp_set_ip4_over_ethernet
		(vnm, sw_if_index, &addr, is_static, is_no_fib_entry);

	      vlib_process_wait_for_event (vm);
	      event_type = vlib_process_get_events (vm, &event_data);
	      vec_reset_length (event_data);
	      if (event_type != 1)
		clib_warning ("event type %d unexpected", event_type);
	    }
	  else
	    vnet_arp_unset_ip4_over_ethernet (vnm, sw_if_index, &addr);

	  increment_ip4_and_mac_address (&addr);
	}
    }
  else
    {
      return clib_error_return (0, "unknown input `%U'",
				format_unformat_error, input);
    }

  return 0;
}

/* *INDENT-OFF* */
/*?
 * Add or delete IPv4 ARP cache entries.
 *
 * @note 'set ip arp' options (e.g. delete, static, 'fib-id <id>',
 * 'count <number>', 'interface ip4_addr mac_addr') can be added in
 * any order and combination.
 *
 * @cliexpar
 * @parblock
 * Add or delete IPv4 ARP cache entries as follows. MAC Address can be in
 * either aa:bb:cc:dd:ee:ff format or aabb.ccdd.eeff format.
 * @cliexcmd{set ip arp GigabitEthernet2/0/0 6.0.0.3 dead.beef.babe}
 * @cliexcmd{set ip arp delete GigabitEthernet2/0/0 6.0.0.3 de:ad:be:ef:ba:be}
 *
 * To add or delete an IPv4 ARP cache entry to or from a specific fib
 * table:
 * @cliexcmd{set ip arp fib-id 1 GigabitEthernet2/0/0 6.0.0.3 dead.beef.babe}
 * @cliexcmd{set ip arp fib-id 1 delete GigabitEthernet2/0/0 6.0.0.3 dead.beef.babe}
 *
 * Add or delete IPv4 static ARP cache entries as follows:
 * @cliexcmd{set ip arp static GigabitEthernet2/0/0 6.0.0.3 dead.beef.babe}
 * @cliexcmd{set ip arp static delete GigabitEthernet2/0/0 6.0.0.3 dead.beef.babe}
 *
 * For testing / debugging purposes, the 'set ip arp' command can add or
 * delete multiple entries. Supply the 'count N' parameter:
 * @cliexcmd{set ip arp count 10 GigabitEthernet2/0/0 6.0.0.3 dead.beef.babe}
 * @endparblock
 ?*/
VLIB_CLI_COMMAND (ip_arp_add_del_command, static) = {
  .path = "set ip arp",
  .short_help =
  "set ip arp [del] <intfc> <ip-address> <mac-address> [static] [no-fib-entry] [count <count>] [fib-id <fib-id>] [proxy <lo-addr> - <hi-addr>]",
  .function = ip_arp_add_del_command_fn,
};
/* *INDENT-ON* */

static clib_error_t *
set_int_proxy_arp_command_fn (vlib_main_t * vm,
			      unformat_input_t *
			      input, vlib_cli_command_t * cmd)
{
  vnet_main_t *vnm = vnet_get_main ();
  u32 sw_if_index;
  vnet_sw_interface_t *si;
  int enable = 0;
  int intfc_set = 0;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "%U", unformat_vnet_sw_interface,
		    vnm, &sw_if_index))
	intfc_set = 1;
      else if (unformat (input, "enable") || unformat (input, "on"))
	enable = 1;
      else if (unformat (input, "disable") || unformat (input, "off"))
	enable = 0;
      else
	break;
    }

  if (intfc_set == 0)
    return clib_error_return (0, "unknown input '%U'",
			      format_unformat_error, input);

  si = vnet_get_sw_interface (vnm, sw_if_index);
  ASSERT (si);
  if (enable)
    si->flags |= VNET_SW_INTERFACE_FLAG_PROXY_ARP;
  else
    si->flags &= ~VNET_SW_INTERFACE_FLAG_PROXY_ARP;

  return 0;
}

/* *INDENT-OFF* */
/*?
 * Enable proxy-arp on an interface. The vpp stack will answer ARP
 * requests for the indicated address range. Multiple proxy-arp
 * ranges may be provisioned.
 *
 * @note Proxy ARP as a technology is infamous for blackholing traffic.
 * Also, the underlying implementation has not been performance-tuned.
 * Avoid creating an unnecessarily large set of ranges.
 *
 * @cliexpar
 * To enable proxy arp on a range of addresses, use:
 * @cliexcmd{set ip arp proxy 6.0.0.1 - 6.0.0.11}
 * Append 'del' to delete a range of proxy ARP addresses:
 * @cliexcmd{set ip arp proxy 6.0.0.1 - 6.0.0.11 del}
 * You must then specifically enable proxy arp on individual interfaces:
 * @cliexcmd{set interface proxy-arp GigabitEthernet0/8/0 enable}
 * To disable proxy arp on an individual interface:
 * @cliexcmd{set interface proxy-arp GigabitEthernet0/8/0 disable}
 ?*/
VLIB_CLI_COMMAND (set_int_proxy_enable_command, static) = {
  .path = "set interface proxy-arp",
  .short_help =
  "set interface proxy-arp <intfc> [enable|disable]",
  .function = set_int_proxy_arp_command_fn,
};
/* *INDENT-ON* */


/*
 * ARP/ND Termination in a L2 Bridge Domain based on IP4/IP6 to MAC
 * hash tables mac_by_ip4 and mac_by_ip6 for each BD.
 */
typedef enum
{
  ARP_TERM_NEXT_L2_OUTPUT,
  ARP_TERM_NEXT_DROP,
  ARP_TERM_N_NEXT,
} arp_term_next_t;

u32 arp_term_next_node_index[32];

static uword
arp_term_l2bd (vlib_main_t * vm,
	       vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  l2input_main_t *l2im = &l2input_main;
  u32 n_left_from, next_index, *from, *to_next;
  u32 n_replies_sent = 0;
  u16 last_bd_index = ~0;
  l2_bridge_domain_t *last_bd_config = 0;
  l2_input_config_t *cfg0;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  vlib_buffer_t *p0;
	  ethernet_header_t *eth0;
	  ethernet_arp_header_t *arp0;
	  ip6_header_t *iph0;
	  u8 *l3h0;
	  u32 pi0, error0, next0, sw_if_index0;
	  u16 ethertype0;
	  u16 bd_index0;
	  u32 ip0;
	  u8 *macp0;
	  u8 is_vrrp_reply0;

	  pi0 = from[0];
	  to_next[0] = pi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  p0 = vlib_get_buffer (vm, pi0);
	  // Terminate only local (SHG == 0) ARP
	  if (vnet_buffer (p0)->l2.shg != 0)
	    goto next_l2_feature;

	  eth0 = vlib_buffer_get_current (p0);
	  l3h0 = (u8 *) eth0 + vnet_buffer (p0)->l2.l2_len;
	  ethertype0 = clib_net_to_host_u16 (*(u16 *) (l3h0 - 2));
	  arp0 = (ethernet_arp_header_t *) l3h0;

	  if (PREDICT_FALSE ((ethertype0 != ETHERNET_TYPE_ARP) ||
			     (arp0->opcode !=
			      clib_host_to_net_u16
			      (ETHERNET_ARP_OPCODE_request))))
	    goto check_ip6_nd;

	  /* Must be ARP request packet here */
	  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) &&
			     (p0->flags & VLIB_BUFFER_IS_TRACED)))
	    {
	      u8 *t0 = vlib_add_trace (vm, node, p0,
				       sizeof (ethernet_arp_input_trace_t));
	      clib_memcpy (t0, l3h0, sizeof (ethernet_arp_input_trace_t));
	    }

	  error0 = ETHERNET_ARP_ERROR_replies_sent;
	  error0 =
	    (arp0->l2_type !=
	     clib_net_to_host_u16 (ETHERNET_ARP_HARDWARE_TYPE_ethernet)
	     ? ETHERNET_ARP_ERROR_l2_type_not_ethernet : error0);
	  error0 =
	    (arp0->l3_type !=
	     clib_net_to_host_u16 (ETHERNET_TYPE_IP4) ?
	     ETHERNET_ARP_ERROR_l3_type_not_ip4 : error0);

	  sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];

	  if (error0)
	    goto drop;

	  is_vrrp_reply0 =
	    ((arp0->opcode ==
	      clib_host_to_net_u16 (ETHERNET_ARP_OPCODE_reply))
	     &&
	     (!memcmp
	      (arp0->ip4_over_ethernet[0].ethernet, vrrp_prefix,
	       sizeof (vrrp_prefix))));

	  /* Trash ARP packets whose ARP-level source addresses do not
	     match their L2-frame-level source addresses, unless it's
	     a reply from a VRRP virtual router */
	  if (PREDICT_FALSE
	      (memcmp (eth0->src_address, arp0->ip4_over_ethernet[0].ethernet,
		       sizeof (eth0->src_address)) && !is_vrrp_reply0))
	    {
	      error0 = ETHERNET_ARP_ERROR_l2_address_mismatch;
	      goto drop;
	    }

	  /* Check if anyone want ARP request events for L2 BDs */
	  {
	    ethernet_arp_main_t *am = &ethernet_arp_main;
	    if (am->wc_ip4_arp_publisher_node != (uword) ~ 0)
	      vnet_arp_wc_publish (sw_if_index0, &arp0->ip4_over_ethernet[0]);
	  }

	  /* lookup BD mac_by_ip4 hash table for MAC entry */
	  ip0 = arp0->ip4_over_ethernet[1].ip4.as_u32;
	  bd_index0 = vnet_buffer (p0)->l2.bd_index;
	  if (PREDICT_FALSE ((bd_index0 != last_bd_index)
			     || (last_bd_index == (u16) ~ 0)))
	    {
	      last_bd_index = bd_index0;
	      last_bd_config = vec_elt_at_index (l2im->bd_configs, bd_index0);
	    }
	  macp0 = (u8 *) hash_get (last_bd_config->mac_by_ip4, ip0);

	  if (PREDICT_FALSE (!macp0))
	    goto next_l2_feature;	/* MAC not found */

	  /* MAC found, send ARP reply -
	     Convert ARP request packet to ARP reply */
	  arp0->opcode = clib_host_to_net_u16 (ETHERNET_ARP_OPCODE_reply);
	  arp0->ip4_over_ethernet[1] = arp0->ip4_over_ethernet[0];
	  arp0->ip4_over_ethernet[0].ip4.as_u32 = ip0;
	  clib_memcpy (arp0->ip4_over_ethernet[0].ethernet, macp0, 6);
	  clib_memcpy (eth0->dst_address, eth0->src_address, 6);
	  clib_memcpy (eth0->src_address, macp0, 6);
	  n_replies_sent += 1;

	output_response:
	  /* For BVI, need to use l2-fwd node to send ARP reply as
	     l2-output node cannot output packet to BVI properly */
	  cfg0 = vec_elt_at_index (l2im->configs, sw_if_index0);
	  if (PREDICT_FALSE (cfg0->bvi))
	    {
	      vnet_buffer (p0)->l2.feature_bitmap |= L2INPUT_FEAT_FWD;
	      vnet_buffer (p0)->sw_if_index[VLIB_RX] = 0;
	      goto next_l2_feature;
	    }

	  /* Send ARP/ND reply back out input interface through l2-output */
	  vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0;
	  next0 = ARP_TERM_NEXT_L2_OUTPUT;
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next, pi0,
					   next0);
	  continue;

	check_ip6_nd:
	  /* IP6 ND event notification or solicitation handling to generate
	     local response instead of flooding */
	  iph0 = (ip6_header_t *) l3h0;
	  if (PREDICT_FALSE (ethertype0 == ETHERNET_TYPE_IP6 &&
			     iph0->protocol == IP_PROTOCOL_ICMP6 &&
			     !ip6_address_is_unspecified
			     (&iph0->src_address)))
	    {
	      sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
	      if (vnet_ip6_nd_term
		  (vm, node, p0, eth0, iph0, sw_if_index0,
		   vnet_buffer (p0)->l2.bd_index))
		goto output_response;
	    }

	next_l2_feature:
	  {
	    next0 = vnet_l2_feature_next (p0, arp_term_next_node_index,
					  L2INPUT_FEAT_ARP_TERM);
	    vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					     to_next, n_left_to_next,
					     pi0, next0);
	    continue;
	  }

	drop:
	  if (0 == arp0->ip4_over_ethernet[0].ip4.as_u32 ||
	      (arp0->ip4_over_ethernet[0].ip4.as_u32 ==
	       arp0->ip4_over_ethernet[1].ip4.as_u32))
	    {
	      error0 = ETHERNET_ARP_ERROR_gratuitous_arp;
	    }
	  next0 = ARP_TERM_NEXT_DROP;
	  p0->error = node->errors[error0];

	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next, pi0,
					   next0);
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  vlib_error_count (vm, node->node_index,
		    ETHERNET_ARP_ERROR_replies_sent, n_replies_sent);
  return frame->n_vectors;
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (arp_term_l2bd_node, static) = {
  .function = arp_term_l2bd,
  .name = "arp-term-l2bd",
  .vector_size = sizeof (u32),
  .n_errors = ETHERNET_ARP_N_ERROR,
  .error_strings = ethernet_arp_error_strings,
  .n_next_nodes = ARP_TERM_N_NEXT,
  .next_nodes = {
    [ARP_TERM_NEXT_L2_OUTPUT] = "l2-output",
    [ARP_TERM_NEXT_DROP] = "error-drop",
  },
  .format_buffer = format_ethernet_arp_header,
  .format_trace = format_arp_term_input_trace,
};
/* *INDENT-ON* */

clib_error_t *
arp_term_init (vlib_main_t * vm)
{
  // Initialize the feature next-node indexes
  feat_bitmap_init_next_nodes (vm,
			       arp_term_l2bd_node.index,
			       L2INPUT_N_FEAT,
			       l2input_get_feat_names (),
			       arp_term_next_node_index);
  return 0;
}

VLIB_INIT_FUNCTION (arp_term_init);

void
change_arp_mac (u32 sw_if_index, ethernet_arp_ip4_entry_t * e)
{
  if (e->sw_if_index == sw_if_index)
    {
      adj_nbr_walk_nh4 (e->sw_if_index,
			&e->ip4_address, arp_mk_complete_walk, e);
    }
}

void
ethernet_arp_change_mac (u32 sw_if_index)
{
  ethernet_arp_main_t *am = &ethernet_arp_main;
  ethernet_arp_ip4_entry_t *e;
  adj_index_t ai;

  /* *INDENT-OFF* */
  pool_foreach (e, am->ip4_entry_pool,
  ({
    change_arp_mac (sw_if_index, e);
  }));
  /* *INDENT-ON* */

  ai = adj_glean_get (FIB_PROTOCOL_IP4, sw_if_index);

  if (ADJ_INDEX_INVALID != ai)
    adj_glean_update_rewrite (ai);
}

void
send_ip4_garp (vlib_main_t * vm, u32 sw_if_index)
{
  ip4_main_t *i4m = &ip4_main;
  ip4_address_t *ip4_addr = ip4_interface_first_address (i4m, sw_if_index, 0);

  send_ip4_garp_w_addr (vm, ip4_addr, sw_if_index);
}

void
send_ip4_garp_w_addr (vlib_main_t * vm,
		      const ip4_address_t * ip4_addr, u32 sw_if_index)
{
  ip4_main_t *i4m = &ip4_main;
  vnet_main_t *vnm = vnet_get_main ();
  u8 *rewrite, rewrite_len;
  vnet_hw_interface_t *hi = vnet_get_sup_hw_interface (vnm, sw_if_index);

  if (ip4_addr)
    {
      clib_warning ("Sending GARP for IP4 address %U on sw_if_idex %d",
		    format_ip4_address, ip4_addr, sw_if_index);

      /* Form GARP packet for output - Gratuitous ARP is an ARP request packet
         where the interface IP/MAC pair is used for both source and request
         MAC/IP pairs in the request */
      u32 bi = 0;
      ethernet_arp_header_t *h = vlib_packet_template_get_packet
	(vm, &i4m->ip4_arp_request_packet_template, &bi);

      if (!h)
	return;

      clib_memcpy (h->ip4_over_ethernet[0].ethernet, hi->hw_address,
		   sizeof (h->ip4_over_ethernet[0].ethernet));
      clib_memcpy (h->ip4_over_ethernet[1].ethernet, hi->hw_address,
		   sizeof (h->ip4_over_ethernet[1].ethernet));
      h->ip4_over_ethernet[0].ip4 = ip4_addr[0];
      h->ip4_over_ethernet[1].ip4 = ip4_addr[0];

      /* Setup MAC header with ARP Etype and broadcast DMAC */
      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
      rewrite =
	ethernet_build_rewrite (vnm, sw_if_index, VNET_LINK_ARP,
				VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST);
      rewrite_len = vec_len (rewrite);
      vlib_buffer_advance (b, -rewrite_len);
      ethernet_header_t *e = vlib_buffer_get_current (b);
      clib_memcpy (e->dst_address, rewrite, rewrite_len);
      vec_free (rewrite);

      /* Send GARP packet out the specified interface */
      vnet_buffer (b)->sw_if_index[VLIB_RX] =
	vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index;
      vlib_frame_t *f = vlib_get_frame_to_node (vm, hi->output_node_index);
      u32 *to_next = vlib_frame_vector_args (f);
      to_next[0] = bi;
      f->n_vectors = 1;
      vlib_put_frame_to_node (vm, hi->output_node_index, f);
    }
}

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */