summaryrefslogtreecommitdiffstats
path: root/src/vnet/ip/ip_checksum.c
blob: d586649646dead038dc3899de84895d9798d85a8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * ip4/ip_checksum.c: ip/tcp/udp checksums
 *
 * Copyright (c) 2008 Eliot Dresselhaus
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include <vnet/ip/ip.h>

/*
 * Generic C implementation of the incremental IP checksum.
 *
 * Accumulates the n_bytes bytes starting at _data into the running
 * checksum 'sum' and returns the updated running sum as an ip_csum_t.
 * No final fold to 16 bits and no complement is performed in this
 * function.
 *
 * Two accumulators are used: sum1 is seeded with the caller's running
 * sum, sum0 with zero. The main loop feeds alternate ip_csum_t-sized
 * words into each accumulator and the two are merged just before
 * returning.
 */
static ip_csum_t
_ip_incremental_checksum (ip_csum_t sum, void *_data, uword n_bytes)
{
  uword data = pointer_to_uword (_data);
  ip_csum_t sum0, sum1;

  sum0 = 0;
  sum1 = sum;

  /*
   * Align pointer to 64 bits. The ip checksum is a 16-bit
   * one's complement sum. It's impractical to optimize
   * the calculation if the incoming address is odd.
   */
  /*
   * _(t): when at least sizeof (t) bytes remain, t is narrower than
   * ip_csum_t and the address is not a multiple of 2 * sizeof (t),
   * consume a single t so that the address becomes so aligned.
   * A plain '+=' is used here: sum0 starts at zero and receives at most
   * one u16 and one u32 in this phase.
   */
#define _(t)					\
do {						\
  if (n_bytes >= sizeof (t)			\
      && sizeof (t) < sizeof (ip_csum_t)	\
      && (data % (2 * sizeof (t))) != 0)	\
    {						\
      sum0 += * uword_to_pointer (data, t *);	\
      data += sizeof (t);			\
      n_bytes -= sizeof (t);			\
    }						\
} while (0)

  /* Only attempt alignment for even addresses; odd ones go straight on. */
  if (PREDICT_TRUE ((data & 1) == 0))
    {
      _(u16);
      if (BITS (ip_csum_t) > 32)
	_(u32);
    }
#undef _

  /* Bulk phase: two ip_csum_t words per iteration, one per accumulator. */
  {
    ip_csum_t *d = uword_to_pointer (data, ip_csum_t *);

    while (n_bytes >= 2 * sizeof (d[0]))
      {
	sum0 = ip_csum_with_carry (sum0, d[0]);
	sum1 = ip_csum_with_carry (sum1, d[1]);
	d += 2;
	n_bytes -= 2 * sizeof (d[0]);
      }

    data = pointer_to_uword (d);
  }

  /*
   * _(t): tail phase. When at least sizeof (t) bytes remain and t is no
   * wider than ip_csum_t, add one t into sum0 via ip_csum_with_carry().
   * Applied from the widest to the narrowest type so that each remaining
   * byte is consumed exactly once.
   */
#define _(t)								\
do {									\
  if (n_bytes >= sizeof (t) && sizeof (t) <= sizeof (ip_csum_t))	\
    {									\
      sum0 = ip_csum_with_carry (sum0, * uword_to_pointer (data, t *));	\
      data += sizeof (t);						\
      n_bytes -= sizeof (t);						\
    }									\
} while (0)

  if (BITS (ip_csum_t) > 32)
    _(u64);
  _(u32);
  _(u16);
  _(u8);

#undef _

  /* Combine even and odd sums. */
  sum0 = ip_csum_with_carry (sum0, sum1);

  return sum0;
}

/*
 * Note: the tcp / udp checksum calculation is performance critical
 * [e.g. when NIC h/w offload is not available],
 * so it's worth producing architecture-dependent code.
 *
 * ip_incremental_checksum() is an always-inlined static
 * function which uses the function pointer we set up in
 * ip_checksum_init().
 */

/* Checksum routine installed by ip_checksum_init() below. */
ip_csum_t (*vnet_incremental_checksum_fp) (ip_csum_t, void *, uword);

/*
 * Init hook: point vnet_incremental_checksum_fp at the generic C
 * implementation defined in this file. Always succeeds.
 */
static clib_error_t *
ip_checksum_init (vlib_main_t * vm)
{
  clib_error_t *error = 0;

  vnet_incremental_checksum_fp = _ip_incremental_checksum;

  return error;
}

VLIB_INIT_FUNCTION (ip_checksum_init);

#if CLIB_DEBUG > 0

/* A 20-byte IPv4 header used as the checksum test vector. */
static const char test_pkt[] = {
  0x45, 0x00, 0x00, 0x3c, 0x5d, 0x6f, 0x40, 0x00,
  0x40, 0x06, 0x3f, 0x6b, 0x0a, 0x76, 0x72, 0x44,
  0x0a, 0x56, 0x16, 0xd2,
};

/*
 * Debug CLI handler for "test ip checksum".
 *
 * Copies test_pkt to each of the 8 possible byte offsets inside a scratch
 * vector, computes ip4_header_checksum() at that offset and prints the
 * result next to the expected value, so that every alignment case of the
 * checksum code gets exercised.
 *
 * Always returns 0 (no error); results are only printed.
 */
static clib_error_t *
test_ip_checksum_fn (vlib_main_t * vm,
		     unformat_input_t * input, vlib_cli_command_t * cmd)
{
  u16 csum;
  ip4_header_t *hp;
  u8 *align_test = 0;
  int offset;

  /* Room for the packet at the largest offset (7). */
  vec_validate (align_test, ARRAY_LEN (test_pkt) + 7);

  for (offset = 0; offset < 8; offset++)
    {
      memcpy (align_test + offset, test_pkt, ARRAY_LEN (test_pkt));

      hp = (ip4_header_t *) (align_test + offset);
      csum = ip4_header_checksum (hp);

      vlib_cli_output (vm, "offset %d checksum %u expected result 27455",
		       offset, (u32) csum);
    }

  /* Release the scratch vector; it used to leak on every invocation. */
  vec_free (align_test);

  return 0;
}

/* *INDENT-OFF* */
VLIB_CLI_COMMAND (test_checksum, static) =
{
  .path = "test ip checksum",
  .short_help = "test ip checksum",
  .function = test_ip_checksum_fn,
};
/* *INDENT-ON* */

#endif /* CLIB_DEBUG */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */
teral.String.Affix */ .highlight .sb { color: #e6db74 } /* Literal.String.Backtick */ .highlight .sc { color: #e6db74 } /* Literal.String.Char */ .highlight .dl { color: #e6db74 } /* Literal.String.Delimiter */ .highlight .sd { color: #e6db74 } /* Literal.String.Doc */ .highlight .s2 { color: #e6db74 } /* Literal.String.Double */ .highlight .se { color: #ae81ff } /* Literal.String.Escape */ .highlight .sh { color: #e6db74 } /* Literal.String.Heredoc */ .highlight .si { color: #e6db74 } /* Literal.String.Interpol */ .highlight .sx { color: #e6db74 } /* Literal.String.Other */ .highlight .sr { color: #e6db74 } /* Literal.String.Regex */ .highlight .s1 { color: #e6db74 } /* Literal.String.Single */ .highlight .ss { color: #e6db74 } /* Literal.String.Symbol */ .highlight .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */ .highlight .fm { color: #a6e22e } /* Name.Function.Magic */ .highlight .vc { color: #f8f8f2 } /* Name.Variable.Class */ .highlight .vg { color: #f8f8f2 } /* Name.Variable.Global */ .highlight .vi { color: #f8f8f2 } /* Name.Variable.Instance */ .highlight .vm { color: #f8f8f2 } /* Name.Variable.Magic */ .highlight .il { color: #ae81ff } /* Literal.Number.Integer.Long */ } @media (prefers-color-scheme: light) { .highlight .hll { background-color: #ffffcc } .highlight .c { color: #888888 } /* Comment */ .highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */ .highlight .k { color: #008800; font-weight: bold } /* Keyword */ .highlight .ch { color: #888888 } /* Comment.Hashbang */ .highlight .cm { color: #888888 } /* Comment.Multiline */ .highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */ .highlight .cpf { color: #888888 } /* Comment.PreprocFile */ .highlight .c1 { color: #888888 } /* Comment.Single */ .highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */ .highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */ .highlight .ge { font-style: italic 
} /* Generic.Emph */ .highlight .gr { color: #aa0000 } /* Generic.Error */ .highlight .gh { color: #333333 } /* Generic.Heading */ .highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */ .highlight .go { color: #888888 } /* Generic.Output */ .highlight .gp { color: #555555 } /* Generic.Prompt */ .highlight .gs { font-weight: bold } /* Generic.Strong */ .highlight .gu { color: #666666 } /* Generic.Subheading */ .highlight .gt { color: #aa0000 } /* Generic.Traceback */ .highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */ .highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */ .highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */ .highlight .kp { color: #008800 } /* Keyword.Pseudo */ .highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */ .highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */ .highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */ .highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */ .highlight .na { color: #336699 } /* Name.Attribute */ .highlight .nb { color: #003388 } /* Name.Builtin */ .highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */ .highlight .no { color: #003366; font-weight: bold } /* Name.Constant */ .highlight .nd { color: #555555 } /* Name.Decorator */ .highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */ .highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */ .highlight .nl { color: #336699; font-style: italic } /* Name.Label */ .highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */ .highlight .py { color: #336699; font-weight: bold } /* Name.Property */ .highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */ .highlight .nv { color: #336699 } /* Name.Variable */ .highlight .ow { color: #008800 } /* Operator.Word */ .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 
.highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */ .highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */ .highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */ .highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */ .highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */ .highlight .sa { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Affix */ .highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */ .highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */ .highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */ .highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */ .highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */ .highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */ .highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */ .highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */ .highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */ .highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */ .highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */ .highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */ .highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */ .highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */ .highlight .vc { color: #336699 } /* Name.Variable.Class */ .highlight .vg { color: #dd7700 } /* Name.Variable.Global */ .highlight .vi { color: #3333bb } /* Name.Variable.Instance */ .highlight .vm { color: #336699 } /* Name.Variable.Magic */ .highlight .il { color: #0000DD; font-weight: bold } /* 
Literal.Number.Integer.Long */ }
/*
 * Copyright (c) 2020 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/fib/fib_source.h>
#include <vnet/fib/fib_table.h>
#include <vnet/fib/fib_entry_track.h>
#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/drop_dpo.h>

#include <cnat/cnat_translation.h>
#include <cnat/cnat_session.h>
#include <cnat/cnat_client.h>

/* Pool of all translations; a translation's 'index' is its pool index. */
cnat_translation_t *cnat_translation_pool;
/* 8/8 bihash mapping a (proto, port, client or interface) key to a
 * translation index. */
clib_bihash_8_8_t cnat_translation_db;
/* Pool of pending interface-address watches created by
 * cnat_translation_watch_addr(). */
addr_resolution_t *tr_resolutions;
/* Vector of address add/del callbacks, indexed by resolution type. */
cnat_if_addr_add_cb_t *cnat_if_addr_add_cbs;

/* FIB node type of a translation; registered in cnat_translation_init(). */
static fib_node_type_t cnat_translation_fib_node_type;

/* Combined counters, validated and zeroed per translation index when a
 * translation is created. */
vlib_combined_counter_main_t cnat_translation_counters = {
  .name = "cnat-translation",
  .stat_segment_name = "/net/cnat-translation",
};

/**
 * Register interest in the addresses of the interface an endpoint refers to.
 *
 * Nothing is recorded when the endpoint carries no interface
 * (ce_sw_if_index is INDEX_INVALID).
 *
 * @param cti    index of the translation the watch belongs to
 * @param opaque caller-defined value stored in the watch entry
 * @param ep     endpoint whose interface / address family are watched
 * @param type   kind of resolution (selects the callback run later)
 */
void
cnat_translation_watch_addr (index_t cti, u64 opaque, cnat_endpoint_t * ep,
			     cnat_addr_resol_type_t type)
{
  addr_resolution_t *watch;

  if (INDEX_INVALID != ep->ce_sw_if_index)
    {
      pool_get (tr_resolutions, watch);
      watch->cti = cti;
      watch->opaque = opaque;
      watch->type = type;
      watch->sw_if_index = ep->ce_sw_if_index;
      watch->af = ep->ce_ip.version;
    }
}

/* Try to resolve both ends (source, then destination) of a path.
 * The outcome of each resolution is not examined here. */
static void
cnat_resolve_ep_tuple (cnat_endpoint_tuple_t * path)
{
  (void) cnat_resolve_ep (&path->src_ep);
  (void) cnat_resolve_ep (&path->dst_ep);
}

/**
 * Drop address watches.
 *
 * @param cti  translation whose watches are removed; INDEX_INVALID matches
 *             the watches of every translation
 * @param type resolution type to remove; CNAT_RESOLV_ADDR_ANY matches all
 */
void
cnat_translation_unwatch_addr (u32 cti, cnat_addr_resol_type_t type)
{
  index_t *doomed = 0, *it;
  addr_resolution_t *watch;

  /* Collect the matching entries first, release them in a second pass. */
  pool_foreach (watch, tr_resolutions)
    {
      if (INDEX_INVALID != cti && watch->cti != cti)
	continue;
      if (CNAT_RESOLV_ADDR_ANY != type && watch->type != type)
	continue;
      vec_add1 (indexes_placeholder_unused_guard (doomed), watch - tr_resolutions);
    }

  vec_foreach (it, doomed)
    pool_put_index (tr_resolutions, *it);

  vec_free (doomed);
}

/* Stop tracking the FIB entry of a tracker. Trackers that are not marked
 * CNAT_TRK_ACTIVE are left alone (only fully resolved endpoints are
 * tracked). */
static void
cnat_tracker_release (cnat_ep_trk_t * trk)
{
  if (trk->ct_flags & CNAT_TRK_ACTIVE)
    fib_entry_untrack (trk->ct_fei, trk->ct_sibling);
}

/**
 * Start tracking the FIB entry covering a tracker's TX endpoint.
 *
 * A tracker becomes CNAT_TRK_ACTIVE only when both its TX and RX
 * endpoints are resolved; otherwise the flag is cleared and nothing is
 * tracked. For an active tracker the forwarding contributed by the
 * tracked entry is stored in trk->ct_dpo.
 *
 * @param cti index of the owning translation (child index for the FIB)
 * @param trk tracker to (re)evaluate
 */
static void
cnat_tracker_track (index_t cti, cnat_ep_trk_t * trk)
{
  cnat_endpoint_t *tx = &trk->ct_ep[VLIB_TX];
  cnat_endpoint_t *rx = &trk->ct_ep[VLIB_RX];
  fib_prefix_t pfx;

  if (!(tx->ce_flags & CNAT_EP_FLAG_RESOLVED) ||
      !(rx->ce_flags & CNAT_EP_FLAG_RESOLVED))
    {
      /* Not fully resolved: nothing to track. */
      trk->ct_flags &= ~CNAT_TRK_ACTIVE;
      return;
    }

  trk->ct_flags |= CNAT_TRK_ACTIVE;

  ip_address_to_fib_prefix (&tx->ce_ip, &pfx);
  trk->ct_fei = fib_entry_track (CNAT_FIB_TABLE, &pfx,
				 cnat_translation_fib_node_type, cti,
				 &trk->ct_sibling);

  fib_entry_contribute_forwarding (trk->ct_fei,
				   fib_forw_chain_type_from_fib_proto
				   (pfx.fp_proto), &trk->ct_dpo);
}

/* format() helper: print a cnat_lb_type_t as "default", "maglev" or
 * "unknown". */
u8 *
format_cnat_lb_type (u8 *s, va_list *args)
{
  cnat_lb_type_t lb_type = va_arg (*args, int);

  switch (lb_type)
    {
    case CNAT_LB_DEFAULT:
      return (format (s, "default"));
    case CNAT_LB_MAGLEV:
      return (format (s, "maglev"));
    default:
      return (format (s, "unknown"));
    }
}

/* unformat() helper: parse "default" or "maglev" into a cnat_lb_type_t.
 * Returns 1 when a keyword matched, 0 otherwise (output left untouched). */
uword
unformat_cnat_lb_type (unformat_input_t *input, va_list *args)
{
  cnat_lb_type_t *result = va_arg (*args, cnat_lb_type_t *);

  if (unformat (input, "default"))
    {
      *result = CNAT_LB_DEFAULT;
      return 1;
    }

  if (unformat (input, "maglev"))
    {
      *result = CNAT_LB_MAGLEV;
      return 1;
    }

  return 0;
}

/**
 * Add a translation to the bihash
 *
 * @param cci the ID of the parent client (invalid if vip not resolved)
 * @param vip the translation endpoint
 * @param proto the translation proto
 * @param cti the translation index to be used as value
 */
static void
cnat_add_translation_to_db (index_t cci, cnat_endpoint_t * vip,
			    ip_protocol_t proto, index_t cti)
{
  clib_bihash_kv_8_8_t bkey;
  u64 key;
  if (INDEX_INVALID == cci)
    {
      key = proto << 8 | 0x80 | vip->ce_ip.version;
      key = key << 16 | vip->ce_port;
      key = key << 32 | (u32) vip->ce_sw_if_index;
    }
  else
    {
      key = proto << 8;
      key = key << 16 | vip->ce_port;
      key = key << 32 | (u32) cci;
    }

  bkey.key = key;
  bkey.value = cti;

  clib_bihash_add_del_8_8 (&cnat_translation_db, &bkey, 1);
}

/**
 * Remove a translation from the bihash
 *
 * @param cci the ID of the parent client
 * @param vip the translation endpoint
 * @param proto the translation proto
 */
static void
cnat_remove_translation_from_db (index_t cci, cnat_endpoint_t * vip,
				 ip_protocol_t proto)
{
  clib_bihash_kv_8_8_t bkey;
  u64 key;
  if (INDEX_INVALID == cci)
    {
      key = proto << 8 | 0x80 | vip->ce_ip.version;
      key = key << 16 | vip->ce_port;
      key = key << 32 | (u32) vip->ce_sw_if_index;
    }
  else
    {
      key = proto << 8;
      key = key << 16 | vip->ce_port;
      key = key << 32 | (u32) cci;
    }

  bkey.key = key;

  clib_bihash_add_del_8_8 (&cnat_translation_db, &bkey, 0);
}

/* Per-backend working state used while building a maglev lookup table
 * in cnat_translation_init_maglev(). */
typedef struct
{
  /* Tracker (backend path) this entry describes */
  cnat_ep_trk_t *trk;
  /* Position of the backend in the translation's active paths */
  u32 index;
  /* First slot probed: h1 % maglev_len */
  u32 offset;
  /* Probe stride: h2 % (maglev_len - 1) + 1, hence never zero */
  u32 skip;
} cnat_maglev_entry_t;

/**
 * Ordering function for maglev backends (used with
 * vec_sort_with_function).
 *
 * Backends are ordered by TX address, TX port, RX address and RX port,
 * in that order of significance.
 *
 * @param _a pointer to a cnat_maglev_entry_t
 * @param _b pointer to a cnat_maglev_entry_t
 * @return negative, zero or positive when a sorts before, equal to or
 *         after b
 */
static int
cnat_maglev_entry_compare (void *_a, void *_b)
{
  cnat_ep_trk_t *a = ((cnat_maglev_entry_t *) _a)->trk;
  cnat_ep_trk_t *b = ((cnat_maglev_entry_t *) _b)->trk;
  int rv = 0;
  if ((rv =
	 ip_address_cmp (&a->ct_ep[VLIB_TX].ce_ip, &b->ct_ep[VLIB_TX].ce_ip)))
    return rv;
  /* Compare a against b: the previous code subtracted a's port from
   * itself, so ports never took part in the ordering. */
  if ((rv = a->ct_ep[VLIB_TX].ce_port - b->ct_ep[VLIB_TX].ce_port))
    return rv;
  if ((rv =
	 ip_address_cmp (&a->ct_ep[VLIB_RX].ce_ip, &b->ct_ep[VLIB_RX].ce_ip)))
    return rv;
  if ((rv = a->ct_ep[VLIB_RX].ce_port - b->ct_ep[VLIB_RX].ce_port))
    return rv;
  return 0;
}

/**
 * (Re)build the maglev lookup table ct->lb_maglev of a translation.
 *
 * For every active path two hashes are derived from its TX/RX addresses
 * and ports; they give the backend's starting slot (offset) and probe
 * stride (skip). Backends are then sorted and, in round-robin order, each
 * one claims the next free slot of its own probe sequence until
 * cm->maglev_len slots have been assigned. A slot holds the index of the
 * backend in ct->ct_active_paths.
 *
 * Does nothing when the translation has no active path.
 *
 * NOTE(review): vec_validate (v, maglev_len) makes index maglev_len valid,
 * so the vector has maglev_len + 1 elements and the last one keeps the
 * (u32) -1 "free" marker - confirm this is what the readers expect.
 * NOTE(review): the filling loop relies on every probe sequence reaching
 * a free slot; presumably maglev_len is configured as a prime > 1 -
 * verify against the configuration code.
 */
static void
cnat_translation_init_maglev (cnat_translation_t *ct)
{
  cnat_maglev_entry_t *backends = NULL, *bk;
  cnat_main_t *cm = &cnat_main;
  u32 done = 0;
  cnat_ep_trk_t *trk;
  int ep_idx = 0;

  vec_foreach (trk, ct->ct_active_paths)
    {
      /* Named 'entry' so it no longer shadows the iterator 'bk' above. */
      cnat_maglev_entry_t entry;
      u32 h1, h2;

      if (AF_IP4 == ip_addr_version (&trk->ct_ep[VLIB_TX].ce_ip))
	{
	  u32 a, b, c;
	  a = ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32;
	  b = (u64) trk->ct_ep[VLIB_TX].ce_port << 16 |
	      (u64) trk->ct_ep[VLIB_RX].ce_port;
	  c = ip_addr_v4 (&trk->ct_ep[VLIB_RX].ce_ip).data_u32;
	  hash_v3_mix32 (a, b, c);
	  hash_v3_finalize32 (a, b, c);
	  h1 = c;
	  h2 = b;
	}
      else
	{
	  u64 a, b, c;
	  /* Fold each 128-bit address to 64 bits before mixing. */
	  a = ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[0] ^
	      ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[1];
	  b = (u64) trk->ct_ep[VLIB_TX].ce_port << 16 |
	      (u64) trk->ct_ep[VLIB_RX].ce_port;
	  c = ip_addr_v6 (&trk->ct_ep[VLIB_RX].ce_ip).as_u64[0] ^
	      ip_addr_v6 (&trk->ct_ep[VLIB_RX].ce_ip).as_u64[1];
	  hash_mix64 (a, b, c);
	  h1 = c;
	  h2 = b;
	}

      entry.offset = h1 % cm->maglev_len;
      /* + 1 keeps the stride in [1, maglev_len - 1]. */
      entry.skip = h2 % (cm->maglev_len - 1) + 1;
      entry.index = ep_idx++;
      entry.trk = trk;
      vec_add1 (backends, entry);
    }

  if (0 == ep_idx)
    return;

  vec_sort_with_function (backends, cnat_maglev_entry_compare);

  /* Don't free if previous vector exists, just zero */
  vec_validate (ct->lb_maglev, cm->maglev_len);
  vec_set (ct->lb_maglev, -1);

  while (1)
    {
      vec_foreach (bk, backends)
	{
	  /* Walk this backend's probe sequence up to the first free slot. */
	  u32 next = 0;
	  u32 c = (bk->offset + next * bk->skip) % cm->maglev_len;
	  while (ct->lb_maglev[c] != (u32) -1)
	    {
	      next++;
	      c = (bk->offset + next * bk->skip) % cm->maglev_len;
	    }
	  ct->lb_maglev[c] = bk->index;
	  done++;
	  if (done == cm->maglev_len)
	    goto finished;
	}
    }

finished:
  vec_free (backends);
}

/**
 * (Re)build the forwarding of a translation.
 *
 * Recomputes ct->ct_active_paths from the paths flagged CNAT_TRK_ACTIVE,
 * creates a load-balance object with one bucket per active path, rebuilds
 * the maglev table when that LB type is selected and there is at least
 * one active path, then stacks the result in ct->ct_lb and marks the
 * translation CNAT_TRANSLATION_STACKED.
 */
static void
cnat_translation_stack (cnat_translation_t * ct)
{
  fib_protocol_t fproto =
    ip_address_family_to_fib_proto (ct->ct_vip.ce_ip.version);
  dpo_proto_t dproto = fib_proto_to_dpo (fproto);
  cnat_ep_trk_t *path;
  u32 n_buckets = 0;
  index_t lbi;

  /* Start over from an empty active set, keeping the allocation. */
  vec_reset_length (ct->ct_active_paths);

  vec_foreach (path, ct->ct_paths)
    {
      if (path->ct_flags & CNAT_TRK_ACTIVE)
	vec_add1 (ct->ct_active_paths, *path);
    }

  lbi = load_balance_create (vec_len (ct->ct_active_paths), dproto,
			     IP_FLOW_HASH_DEFAULT);

  vec_foreach (path, ct->ct_active_paths)
    {
      load_balance_set_bucket (lbi, n_buckets, &path->ct_dpo);
      n_buckets++;
    }

  if (CNAT_LB_MAGLEV == ct->lb_type && n_buckets > 0)
    cnat_translation_init_maglev (ct);

  dpo_set (&ct->ct_lb, DPO_LOAD_BALANCE, dproto, lbi);
  dpo_stack (cnat_client_dpo, dproto, &ct->ct_lb, &ct->ct_lb);
  ct->flags |= CNAT_TRANSLATION_STACKED;
}

/**
 * Delete a translation.
 *
 * Resets its load-balance DPO, releases the trackers of its active paths,
 * removes it from the translation DB, notifies the client layer, drops
 * all of its address watches and returns the object to the pool.
 *
 * @param id index of the translation to delete
 * @return 0 on success, VNET_API_ERROR_NO_SUCH_ENTRY when id is not in use
 */
int
cnat_translation_delete (u32 id)
{
  cnat_translation_t *xlate;
  cnat_ep_trk_t *path;

  if (pool_is_free_index (cnat_translation_pool, id))
    {
      return (VNET_API_ERROR_NO_SUCH_ENTRY);
    }

  xlate = pool_elt_at_index (cnat_translation_pool, id);

  dpo_reset (&xlate->ct_lb);

  vec_foreach (path, xlate->ct_active_paths)
    {
      cnat_tracker_release (path);
    }

  cnat_remove_translation_from_db (xlate->ct_cci, &xlate->ct_vip,
				   xlate->ct_proto);
  cnat_client_translation_deleted (xlate->ct_cci);
  cnat_translation_unwatch_addr (id, CNAT_RESOLV_ADDR_ANY);
  pool_put (cnat_translation_pool, xlate);

  return (0);
}

/**
 * Create a translation, or replace the paths of an existing one.
 *
 * The translation is looked up by (vip, port, proto); when none exists a
 * new one is allocated, entered in the translation DB and given fresh
 * counters. In both cases the previous paths are released and replaced
 * by 'paths', address watches are re-created and the translation is
 * re-stacked.
 *
 * @param vip     the virtual endpoint (address or interface, plus port)
 * @param proto   the IP protocol of the translation
 * @param paths   vector of backend (source, destination) endpoint tuples
 * @param flags   translation flags, also handed to cnat_client_add()
 * @param lb_type load-balancing type; only stored when the translation is
 *                newly created
 * @return the index of the translation
 */
u32
cnat_translation_update (cnat_endpoint_t *vip, ip_protocol_t proto,
			 cnat_endpoint_tuple_t *paths, u8 flags,
			 cnat_lb_type_t lb_type)
{
  cnat_endpoint_tuple_t *path;
  const cnat_client_t *cc;
  cnat_translation_t *ct;
  cnat_ep_trk_t *trk;
  index_t cci;

  cnat_lazy_init ();
  /* A non-zero return is handled as "the vip has no address yet". */
  if (cnat_resolve_ep (vip))
    {
      /* vip only contains a sw_if_index for now */
      ct = cnat_find_translation (vip->ce_sw_if_index, vip->ce_port, proto);
      cci = INDEX_INVALID;
    }
  else
    {
      /* do we know of this ep's vip */
      cci = cnat_client_add (&vip->ce_ip, flags);
      cc = cnat_client_get (cci);

      ct = cnat_find_translation (cc->parent_cci, vip->ce_port, proto);
    }

  if (NULL == ct)
    {
      /* First time this (vip, port, proto) is seen: create it. */
      pool_get_zero (cnat_translation_pool, ct);

      clib_memcpy (&ct->ct_vip, vip, sizeof (*vip));
      ct->ct_proto = proto;
      ct->ct_cci = cci;
      ct->index = ct - cnat_translation_pool;
      ct->lb_type = lb_type;

      cnat_add_translation_to_db (cci, vip, proto, ct->index);
      cnat_client_translation_added (cci);

      vlib_validate_combined_counter (&cnat_translation_counters, ct->index);
      vlib_zero_combined_counter (&cnat_translation_counters, ct->index);
    }
  ct->flags = flags;

  /* Start from a clean set of address watches for this translation. */
  cnat_translation_unwatch_addr (ct->index, CNAT_RESOLV_ADDR_ANY);
  cnat_translation_watch_addr (ct->index, 0, vip,
			       CNAT_RESOLV_ADDR_TRANSLATION);

  /* Release the trackers of the paths being replaced. */
  vec_foreach (trk, ct->ct_paths)
  {
    cnat_tracker_release (trk);
  }

  vec_reset_length (ct->ct_paths);
  /* Not stacked while paths are rebuilt: the back-walk handler ignores
   * notifications for translations without this flag. */
  ct->flags &= ~CNAT_TRANSLATION_STACKED;

  u64 path_idx = 0;
  vec_foreach (path, paths)
  {
    cnat_resolve_ep_tuple (path);
    /* The watch's opaque packs the path index (upper 32 bits) with the
     * direction (VLIB_RX / VLIB_TX) in the low bits. */
    cnat_translation_watch_addr (ct->index,
				 path_idx << 32 | VLIB_RX, &path->src_ep,
				 CNAT_RESOLV_ADDR_BACKEND);
    cnat_translation_watch_addr (ct->index,
				 path_idx << 32 | VLIB_TX, &path->dst_ep,
				 CNAT_RESOLV_ADDR_BACKEND);
    path_idx++;

    vec_add2 (ct->ct_paths, trk, 1);

    /* TX side is the path's destination, RX side its source. */
    clib_memcpy (&trk->ct_ep[VLIB_TX], &path->dst_ep,
		 sizeof (trk->ct_ep[VLIB_TX]));
    clib_memcpy (&trk->ct_ep[VLIB_RX], &path->src_ep,
		 sizeof (trk->ct_ep[VLIB_RX]));
    trk->ct_flags = path->ep_flags;

    cnat_tracker_track (ct->index, trk);
  }

  cnat_translation_stack (ct);

  return (ct->index);
}

/**
 * Visit every translation in the pool.
 *
 * @param cb  called with each translation index and ctx; the walk stops
 *            as soon as it returns zero
 * @param ctx opaque value handed to cb
 */
void
cnat_translation_walk (cnat_translation_walk_cb_t cb, void *ctx)
{
  u32 cti;

  pool_foreach_index (cti, cnat_translation_pool)
    {
      if (0 == cb (cti, ctx))
	break;
    }
}

/* format() helper for a tracker: "<rx-endpoint>-><tx-endpoint>", then
 * its FIB entry index and its DPO on separate, indented lines.
 * Arguments: cnat_ep_trk_t *, u32 indent. */
static u8 *
format_cnat_ep_trk (u8 * s, va_list * args)
{
  cnat_ep_trk_t *trk = va_arg (*args, cnat_ep_trk_t *);
  u32 pad = va_arg (*args, u32);
  cnat_endpoint_t *src = &trk->ct_ep[VLIB_RX];
  cnat_endpoint_t *dst = &trk->ct_ep[VLIB_TX];

  s = format (s, "%U->%U", format_cnat_endpoint, src,
	      format_cnat_endpoint, dst);
  s = format (s, "\n%Ufib-entry:%d", format_white_space, pad, trk->ct_fei);
  s = format (s, "\n%U%U", format_white_space, pad,
	      format_dpo_id, &trk->ct_dpo, 6);

  return (s);
}

/**
 * format() helper for a translation.
 *
 * Prints the index, VIP, protocol and LB type, then every path, then the
 * load-balance DPO when it still exists. For maglev translations one
 * extra line per path shows, as a hex bitmap, which lookup-table slots
 * hold that backend number.
 *
 * Argument: cnat_translation_t *.
 */
u8 *
format_cnat_translation (u8 * s, va_list * args)
{
  cnat_translation_t *ct = va_arg (*args, cnat_translation_t *);
  cnat_main_t *cm = &cnat_main;
  cnat_ep_trk_t *ck;

  s = format (s, "[%d] ", ct->index);
  s = format (s, "%U %U ", format_cnat_endpoint, &ct->ct_vip,
	      format_ip_protocol, ct->ct_proto);
  s = format (s, "lb:%U ", format_cnat_lb_type, ct->lb_type);

  vec_foreach (ck, ct->ct_paths)
    s = format (s, "\n%U", format_cnat_ep_trk, ck, 2);

  /* If printing a trace, the LB object might be deleted */
  if (!pool_is_free_index (load_balance_pool, ct->ct_lb.dpoi_index))
    {
      s = format (s, "\n via:");
      s = format (s, "\n%U%U",
		  format_white_space, 2, format_dpo_id, &ct->ct_lb, 2);
    }

  u32 bid = 0;
  if (CNAT_LB_MAGLEV == ct->lb_type)
    {
      s = format (s, "\nmaglev backends map");
      uword *bitmap = NULL;
      clib_bitmap_alloc (bitmap, cm->maglev_len);
      vec_foreach (ck, ct->ct_paths)
	{
	  clib_bitmap_zero (bitmap);
	  for (u32 i = 0; i < vec_len (ct->lb_maglev); i++)
	    if (ct->lb_maglev[i] == bid)
	      /* clib_bitmap_set() returns the bitmap, which may have been
	       * reallocated; always keep the returned pointer. */
	      bitmap = clib_bitmap_set (bitmap, i, 1);
	  s = format (s, "\n  backend#%d: %U", bid, format_bitmap_hex, bitmap);

	  bid++;
	}
      clib_bitmap_free (bitmap);
    }

  return (s);
}

/**
 * CLI handler for "show cnat translation [<index>]".
 *
 * Without argument every translation in the pool is printed. With an
 * index, that translation is printed when it exists; otherwise an
 * "Invalid policy ID" line is output. (Previously a supplied index was
 * always reported as invalid and never displayed.)
 *
 * @return NULL on success, an error when the input cannot be parsed
 */
static clib_error_t *
cnat_translation_show (vlib_main_t * vm,
		       unformat_input_t * input, vlib_cli_command_t * cmd)
{
  index_t cti;
  cnat_translation_t *ct;

  cti = INDEX_INVALID;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "%d", &cti))
	;
      else
	return (clib_error_return (0, "unknown input '%U'",
				   format_unformat_error, input));
    }

  if (INDEX_INVALID == cti)
    {
      /* No index given: dump them all. */
      pool_foreach_index (cti, cnat_translation_pool)
       {
	ct = pool_elt_at_index (cnat_translation_pool, cti);
        vlib_cli_output(vm, "%U", format_cnat_translation, ct);
      }
    }
  else if (pool_is_free_index (cnat_translation_pool, cti))
    {
      vlib_cli_output (vm, "Invalid policy ID:%d", cti);
    }
  else
    {
      ct = pool_elt_at_index (cnat_translation_pool, cti);
      vlib_cli_output (vm, "%U", format_cnat_translation, ct);
    }

  return (NULL);
}

/**
 * Delete every translation.
 *
 * Indices are snapshotted first and deleted in a second pass; the pool is
 * asserted to be empty afterwards.
 *
 * @return always 0
 */
int
cnat_translation_purge (void)
{
  index_t *snapshot = NULL, *it;
  index_t cti;

  pool_foreach_index (cti, cnat_translation_pool)
    {
      vec_add1 (snapshot, cti);
    }

  vec_foreach (it, snapshot)
    {
      cnat_translation_delete (*it);
    }

  ASSERT (0 == pool_elts (cnat_translation_pool));

  vec_free (snapshot);

  return (0);
}

/*
 * CLI registration: "show cnat translation", handled by
 * cnat_translation_show() and flagged as MP safe.
 * NOTE(review): the short help advertises <VIP> while the handler parses
 * a decimal translation index - confirm which one is intended.
 */
VLIB_CLI_COMMAND (cnat_translation_show_cmd_node, static) = {
  .path = "show cnat translation",
  .function = cnat_translation_show,
  .short_help = "show cnat translation <VIP>",
  .is_mp_safe = 1,
};

/* FIB graph 'get' hook: map a translation index to its embedded
 * fib_node_t. */
static fib_node_t *
cnat_translation_get_node (fib_node_index_t index)
{
  return (&cnat_translation_get (index)->ct_node);
}

/* Recover the translation that embeds 'node' as its ct_node member. */
static cnat_translation_t *
cnat_translation_get_from_node (fib_node_t * node)
{
  char *base = ((char *) node) - STRUCT_OFFSET_OF (cnat_translation_t,
						   ct_node);

  return ((cnat_translation_t *) base);
}

/* FIB graph 'last lock' hook. */
static void
cnat_translation_last_lock_gone (fib_node_t * node)
{
  /* Intentionally empty: no action is taken here. */
}

/*
 * A back walk has reached this translation: rebuild its forwarding.
 *
 * Translations that are not currently stacked are skipped; with more
 * than FIB_PATH_LIST_POPULAR paths this hook might be invoked during
 * path tracking (cnat_tracker_track), i.e. before the translation is
 * ready to be stacked.
 *
 * The walk always continues.
 */
static fib_node_back_walk_rc_t
cnat_translation_back_walk_notify (fib_node_t * node,
				   fib_node_back_walk_ctx_t * ctx)
{
  cnat_translation_t *xlate = cnat_translation_get_from_node (node);

  if (xlate->flags & CNAT_TRANSLATION_STACKED)
    cnat_translation_stack (xlate);

  return (FIB_NODE_BACK_WALK_CONTINUE);
}

/*
 * The translation's graph node virtual function table.
 * Registered with the FIB in cnat_translation_init(), which yields
 * cnat_translation_fib_node_type.
 */
static const fib_node_vft_t cnat_translation_vft = {
  /* index -> embedded fib_node_t */
  .fnv_get = cnat_translation_get_node,
  /* empty handler */
  .fnv_last_lock = cnat_translation_last_lock_gone,
  /* re-stack the translation when a tracked entry changes */
  .fnv_back_walk = cnat_translation_back_walk_notify,
};

/**
 * CLI handler for "cnat translation".
 *
 * Parses one line made of: "add", "del <index>", "proto <proto>",
 * "vip <endpoint>" (exclusive) or "real <endpoint>" (non exclusive),
 * any number of "to <endpoint-tuple>" paths, and an optional LB type
 * ("default" or "maglev"). Then either deletes the given translation or
 * creates/updates the one described.
 *
 * Fixes over the previous version:
 *  - lb_type defaults to CNAT_LB_DEFAULT instead of being read
 *    uninitialised when no LB type is typed;
 *  - an add without "vip"/"real" is rejected instead of using an
 *    uninitialised endpoint;
 *  - a failing delete is reported to the user.
 *
 * @return NULL on success, otherwise an error describing the problem
 */
static clib_error_t *
cnat_translation_cli_add_del (vlib_main_t * vm,
			      unformat_input_t * input,
			      vlib_cli_command_t * cmd)
{
  u32 del_index = INDEX_INVALID;
  ip_protocol_t proto = IP_PROTOCOL_TCP;
  cnat_endpoint_t vip;
  u8 vip_set = 0;
  u8 flags = CNAT_FLAG_EXCLUSIVE;
  cnat_endpoint_tuple_t tmp, *paths = NULL, *path;
  unformat_input_t _line_input, *line_input = &_line_input;
  clib_error_t *e = 0;
  cnat_lb_type_t lb_type = CNAT_LB_DEFAULT;

  /* Get a line of input. */
  if (!unformat_user (input, unformat_line_input, line_input))
    return 0;

  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (line_input, "add"))
	del_index = INDEX_INVALID;
      else if (unformat (line_input, "del %d", &del_index))
	;
      else
	if (unformat (line_input, "proto %U", unformat_ip_protocol, &proto))
	;
      else if (unformat (line_input, "vip %U", unformat_cnat_ep, &vip))
	{
	  flags = CNAT_FLAG_EXCLUSIVE;
	  vip_set = 1;
	}
      else if (unformat (line_input, "real %U", unformat_cnat_ep, &vip))
	{
	  flags = 0;
	  vip_set = 1;
	}
      else if (unformat (line_input, "to %U", unformat_cnat_ep_tuple, &tmp))
	{
	  vec_add2 (paths, path, 1);
	  clib_memcpy (path, &tmp, sizeof (cnat_endpoint_tuple_t));
	}
      else if (unformat (line_input, "%U", unformat_cnat_lb_type, &lb_type))
	;
      else
	{
	  e = clib_error_return (0, "unknown input '%U'",
				 format_unformat_error, line_input);
	  goto done;
	}
    }

  if (INDEX_INVALID == del_index)
    {
      /* Creating or updating needs an endpoint to translate. */
      if (!vip_set)
	{
	  e = clib_error_return (0, "missing 'vip' or 'real' endpoint");
	  goto done;
	}
      cnat_translation_update (&vip, proto, paths, flags, lb_type);
    }
  else if (cnat_translation_delete (del_index))
    e = clib_error_return (0, "no such translation %d", del_index);

done:
  vec_free (paths);
  unformat_free (line_input);
  return (e);
}

/*
 * CLI registration: "cnat translation", handled by
 * cnat_translation_cli_add_del().
 * NOTE(review): the handler also accepts an LB type keyword ("default" or
 * "maglev") and "del <index>", neither of which is spelled out in the
 * short help.
 */
VLIB_CLI_COMMAND (cnat_translation_cli_add_del_command, static) =
{
  .path = "cnat translation",
  .short_help = "cnat translation [add|del] proto [TCP|UDP] [vip|real] [ip|sw_if_index [v6]] [port] [to [ip|sw_if_index [v6]] [port]->[ip|sw_if_index [v6]] [port]]",
  .function = cnat_translation_cli_add_del,
};

/**
 * Interface address add/del callback for watches on a translation's VIP.
 *
 * On an add, nothing happens when the VIP is already resolved. Otherwise
 * the translation is taken out of the DB, its VIP is (un)resolved and it
 * is re-entered in the DB under its new key:
 *  - on a delete the VIP becomes unresolved and its client is released;
 *    if the interface still has an address of that family the VIP is
 *    immediately re-resolved to it;
 *  - on an add (or such a re-resolution) a client is added for the
 *    address and the VIP is marked resolved.
 *
 * @param ar      the watch that matched
 * @param address the address added or deleted; on a delete it is
 *                handed to cnat_resolve_addr() and then used as the new
 *                VIP address when that call returns 0
 * @param is_del  non-zero when the address was removed
 */
static void
cnat_if_addr_add_del_translation_cb (addr_resolution_t * ar,
				     ip_address_t * address, u8 is_del)
{
  cnat_translation_t *ct;
  ct = cnat_translation_get (ar->cti);
  if (!is_del && ct->ct_vip.ce_flags & CNAT_EP_FLAG_RESOLVED)
    return;

  /* Remove with the current key, before ct_cci / ct_vip change. */
  cnat_remove_translation_from_db (ct->ct_cci, &ct->ct_vip, ct->ct_proto);

  if (is_del)
    {
      ct->ct_vip.ce_flags &= ~CNAT_EP_FLAG_RESOLVED;
      /* Release the client while its index is still known. The previous
       * code invalidated ct_cci first and so always passed INDEX_INVALID
       * here, never releasing the real client. */
      cnat_client_translation_deleted (ct->ct_cci);
      ct->ct_cci = INDEX_INVALID;
      /* Are there remaining addresses ? */
      if (0 == cnat_resolve_addr (ar->sw_if_index, ar->af, address))
	is_del = 0;
    }

  if (!is_del)
    {
      ct->ct_cci = cnat_client_add (address, ct->flags);
      cnat_client_translation_added (ct->ct_cci);
      ip_address_copy (&ct->ct_vip.ce_ip, address);
      ct->ct_vip.ce_flags |= CNAT_EP_FLAG_RESOLVED;
    }

  cnat_add_translation_to_db (ct->ct_cci, &ct->ct_vip, ct->ct_proto,
			      ct->index);
}

/*
 * Interface address add/del callback for watches on a backend endpoint.
 *
 * The watch's opaque value identifies the endpoint: path index in the
 * upper 32 bits, direction (index into ct_ep[]) in the low 4 bits.
 * On an add, nothing happens when the endpoint is already resolved.
 * Otherwise the endpoint is (un)resolved, its tracker is re-evaluated
 * and the translation is re-stacked.
 *
 * NOTE(review): path_idx is used to index ct->ct_paths without a bounds
 * check - presumably watches are always dropped before paths change;
 * confirm.
 * NOTE(review): cnat_tracker_track() is called without a prior
 * cnat_tracker_release(); if the tracker was active the earlier FIB
 * tracking does not appear to be undone here - confirm.
 */
static void
cnat_if_addr_add_del_backend_cb (addr_resolution_t * ar,
				 ip_address_t * address, u8 is_del)
{
  cnat_translation_t *ct;
  cnat_ep_trk_t *trk;
  cnat_endpoint_t *ep;

  u8 direction = ar->opaque & 0xf;
  u32 path_idx = ar->opaque >> 32;

  ct = cnat_translation_get (ar->cti);

  trk = &ct->ct_paths[path_idx];
  ep = &trk->ct_ep[direction];

  if (!is_del && ep->ce_flags & CNAT_EP_FLAG_RESOLVED)
    return;

  ASSERT (ep->ce_sw_if_index == ar->sw_if_index);

  if (is_del)
    {
      ep->ce_flags &= ~CNAT_EP_FLAG_RESOLVED;
      /* Are there remaining addresses ? */
      if (0 == cnat_resolve_addr (ar->sw_if_index, ar->af, address))
	is_del = 0;
    }

  if (!is_del)
    {
      ip_address_copy (&ep->ce_ip, address);
      ep->ce_flags |= CNAT_EP_FLAG_RESOLVED;
    }

  /* Clear the flag so that back walks triggered while tracking are
   * ignored by cnat_translation_back_walk_notify(). */
  ct->flags &= ~CNAT_TRANSLATION_STACKED;
  cnat_tracker_track (ar->cti, trk);

  cnat_translation_stack (ct);
  ct->flags |= CNAT_TRANSLATION_STACKED;
}

/* Dispatch an interface address change to every watch registered for
 * that interface and address family, through the callback registered
 * for the watch's type. */
static void
cnat_if_addr_add_del_callback (u32 sw_if_index, ip_address_t * address,
			       u8 is_del)
{
  addr_resolution_t *watch;

  pool_foreach (watch, tr_resolutions)
    {
      if (watch->sw_if_index == sw_if_index &&
	  watch->af == ip_addr_version (address))
	cnat_if_addr_add_cbs[watch->type] (watch, address, is_del);
    }
}

/* IPv6 interface address hook: wrap the address in an ip_address_t and
 * hand it to the address-family independent dispatcher. */
static void
cnat_ip6_if_addr_add_del_callback (struct ip6_main_t *im,
				   uword opaque, u32 sw_if_index,
				   ip6_address_t * address,
				   u32 address_length, u32 if_address_index,
				   u32 is_del)
{
  ip_address_t generic;

  ip_address_set (&generic, address, AF_IP6);

  cnat_if_addr_add_del_callback (sw_if_index, &generic, is_del);
}

/* IPv4 interface address hook: wrap the address in an ip_address_t and
 * hand it to the address-family independent dispatcher. */
static void
cnat_ip4_if_addr_add_del_callback (struct ip4_main_t *im,
				   uword opaque, u32 sw_if_index,
				   ip4_address_t * address,
				   u32 address_length, u32 if_address_index,
				   u32 is_del)
{
  ip_address_t generic;

  ip_address_set (&generic, address, AF_IP4);

  cnat_if_addr_add_del_callback (sw_if_index, &generic, is_del);
}

/**
 * Register the callback run on interface address changes for one type of
 * address resolution. A later registration for the same type replaces
 * the earlier one.
 *
 * @param typ resolution type, used as index in the callback vector
 * @param fn  callback to invoke
 */
void
cnat_translation_register_addr_add_cb (cnat_addr_resol_type_t typ,
				       cnat_if_addr_add_cb_t fn)
{
  cnat_if_addr_add_cb_t *slot;

  /* Make sure the vector is large enough before taking the slot. */
  vec_validate (cnat_if_addr_add_cbs, CNAT_ADDR_N_RESOLUTIONS);

  slot = &cnat_if_addr_add_cbs[typ];
  *slot = fn;
}

/**
 * Module init: registers the translation FIB node type, creates the
 * translation DB, hooks the IPv4 and IPv6 interface address add/del
 * notifications and registers the backend and translation address
 * callbacks.
 *
 * @return always NULL (no error)
 */
static clib_error_t *
cnat_translation_init (vlib_main_t * vm)
{
  cnat_main_t *cm = &cnat_main;
  ip4_add_del_interface_address_callback_t v4_hook = {
    .function = cnat_ip4_if_addr_add_del_callback,
  };
  ip6_add_del_interface_address_callback_t v6_hook = {
    .function = cnat_ip6_if_addr_add_del_callback,
  };

  cnat_translation_fib_node_type =
    fib_node_register_new_type (&cnat_translation_vft);

  clib_bihash_init_8_8 (&cnat_translation_db, "CNat translation DB",
			cm->translation_hash_buckets,
			cm->translation_hash_memory);

  /* Get told about interface address changes, for both families. */
  vec_add1 (ip4_main.add_del_interface_address_callbacks, v4_hook);
  vec_add1 (ip6_main.add_del_interface_address_callbacks, v6_hook);

  cnat_translation_register_addr_add_cb (CNAT_RESOLV_ADDR_BACKEND,
					 cnat_if_addr_add_del_backend_cb);
  cnat_translation_register_addr_add_cb (CNAT_RESOLV_ADDR_TRANSLATION,
					 cnat_if_addr_add_del_translation_cb);

  return (NULL);
}

VLIB_INIT_FUNCTION (cnat_translation_init);

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */