aboutsummaryrefslogtreecommitdiffstats
path: root/src/vnet/gre/node.c
blob: 86f7a6eeea49c2c8d1c6a11749dfd18680077d3f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
.highlight .hll { background-color: #ffffcc }
.highlight .c { color: #888888 } /* Comment */
.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
.highlight .k { color: #008800; font-weight: bold } /* Keyword */
.highlight .ch { color: #888888 } /* Comment.Hashbang */
.highlight .cm { color: #888888 } /* Comment.Multiline */
.highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
.highlight .cpf { color: #888888 } /* Comment.PreprocFile */
.highlight .c1 { color: #888888 } /* Comment.Single */
.highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */
.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .gr { color: #aa0000 } /* Generic.Error */
.highlight .gh { color: #333333 } /* Generic.Heading */
.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
.highlight .go { color: #888888 } /* Generic.Output */
.highlight .gp { color: #555555 } /* Generic.Prompt */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #666666 } /* Generic.Subheading */
.highlight .gt { color: #aa0000 } /* Generic.Traceback */
.highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #008800 } /* Keyword.Pseudo */
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */
.highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */
.highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */
.highlight .na { color: #336699 } /* Name.Attribute */
.highlight .nb { color: #003388 } /* Name.Builtin */
.highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */
.highlight .no { color: #003366; font-weight: bold } /* Name.Constant */
.highlight .nd { color: #555555 } /* Name.Decorator */
.highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */
.highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */
.highlight .nl { color: #336699; font-style: italic } /* Name.Label */
.highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */
.highlight .py { color: #336699; font-weight: bold } /* Name.Property */
.highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */
.highlight .nv { color: #336699 } /* Name.Variable */
.highlight .ow { color: #008800 } /* Operator.Word */
.highlight .w { color: #bbbbbb } /* Text.Whitespace */
.highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */
.highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */
.highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */
.highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */
.highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */
.highlight .sa { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Affix */
.highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */
.highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */
.highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */
.highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */
.highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */
.highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */
.highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */
.highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */
.highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */
.highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */
.highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */
.highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */
.highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */
.highlight .vc { color: #336699 } /* Name.Variable.Class */
.highlight .vg { color: #dd7700 } /* Name.Variable.Global */
.highlight .vi { color: #3333bb } /* Name.Variable.Instance */
.highlight .vm { color: #336699 } /* Name.Variable.Magic */
.highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */
=== vnet classifier theory of operation ===

The vnet classifier trades off simplicity and perf / scale
characteristics. At a certain level, it's a dumb robot. Given an
incoming packet, search an ordered list of (mask, match) tables. If
the classifier finds a matching entry, take the indicated action. If
not, take a last-resort action.

We use the MMX-unit to match or hash 16 octets at a time. For hardware
backward compatibility, the code does not [currently] use 256-bit
(32-octet) vector instructions.

Effective use of the classifier centers around building table lists
which "hit" as soon as practicable. In many cases, established
sessions hit in the first table. In this mode of operation, the
classifier easily processes multiple MPPS / core - even with millions
of sessions in the data base. Searching 357 tables on a regular basis
will neatly solve the halting problem.

==== Basic operation ====

The classifier mask-and-match operation proceeds as follows. Given a
starting classifier table index, lay hands on the indicated mask
vector.  When building tables, we arrange for the mask to obey
mmx-unit (16-octet) alignment.

We know that the first octet of packet data starts on a cache-line
boundary. Further, it's reasonably likely that folks won't want to use
the generalized classifier on the L2 header; preferring to decode the
Ethertype manually. That scheme makes it easy to select among ip4 /
ip6 / MPLS, etc. classifier table sets.

A no-vlan-tag L2 header is 14 octets long. A typical ipv4 header
begins with the octets 0x4500: version=4, header_length=5, DSCP=0,
ECN=0. If one doesn't intend to classify on (DSCP, ECN) - the typical
case - we program the classifier to skip the first 16-octet vector.

To classify untagged ipv4 packets on source address, we program the
classifier to skip one vector, and mask-and-match one vector.

The basic mask-and-match operation looks like this:

 switch (t->match_n_vectors)
   {
   case 1:
     result = (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
     break;
     
   case 2:
     result =  (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
     result |= (data[1 + t->skip_n_vectors] & mask[1]) ^ key[1];
     break;
     
     <etc>
    }

 result_mask = u32x4_zero_byte_mask (result);
 if (result_mask == 0xffff)
     return (v);

Net of setup, it costs a couple of clock cycles to mask-and-match 16
octets.

At the risk of belaboring an obvious point, the control-plane
'''must''' pay attention to detail. When skipping one (or more)
vectors, masks and matches must reflect that decision. See
.../vnet/vnet/classify/vnet_classify.c:unformat_classify_[mask|match]. Note
that vec_validate (xxx, 13) creates a 14-element vector.

==== Creating a classifier table ====

To create a new classifier table via the control-plane API, send a
"classify_add_del_table" message. The underlying action routine,
vnet_classify_add_del_table(...), is located in
.../vnet/vnet/classify/vnet_classify.c, and has the following
prototype:

 int vnet_classify_add_del_table (vnet_classify_main_t * cm,
                                  u8 * mask, 
                                  u32 nbuckets,
                                  u32 memory_size,
                                  u32 skip,
                                  u32 match,
                                  u32 next_table_index,
                                  u32 miss_next_index,
                                  u32 * table_index,
                                  int is_add)

Pass cm = &vnet_classify_main if calling this routine directly. Mask,
skip(_n_vectors) and match(_n_vectors) are as described above. Mask
need not be aligned, but it must be match*16 octets in length. To
avoid having your head explode, be absolutely certain that '''only'''
the bits you intend to match on are set.

The classifier uses thread-safe, no-reader-locking-required
bounded-index extensible hashing. Nbuckets is the [fixed] size of the
hash bucket vector. The algorithm works in constant time regardless of
hash collisions, but wastes space when the bucket array is too
small. A good rule of thumb: let nbuckets = approximate number of
entries expected.

At a significant cost in complexity, it would be possible to resize the
bucket array dynamically. We have no plans to implement that function.

Each classifier table has its own clib mheap memory allocation
arena. To pick the memory_size parameter, note that each classifier
table entry needs 16*(1 + match_n_vectors) bytes. Within reason, aim a
bit high. Clib mheap memory uses o/s level virtual memory - not wired
or hugetlb memory - so it's best not to scrimp on size.

The "next_table_index" parameter is as described: the pool index in
vnet_classify_main.tables of the next table to search. Code ~0 to
indicate the end of the table list. 0 is a valid table index!

We often create classification tables in reverse order -
last-table-searched to first-table-searched - so we can easily set
this parameter. Of course, one can manually adjust the data structure
after-the-fact.

Specific classifier client nodes - for example,
.../vnet/vnet/classify/ip_classify.c - interpret the "miss_next_index"
parameter as a vpp graph-node next index. When packet classification
fails to produce a match, ip_classify_inline sends packets to the
indicated disposition. A classifier application might program this
parameter to send packets which don't match an existing session to a
"first-sign-of-life, create-new-session" node.

Finally, the is_add parameter indicates whether to add or delete the
indicated table. The delete case implicitly terminates all sessions
with extreme prejudice, by freeing the specified clib mheap.

==== Creating a classifier session ====

To create a new classifier session via the control-plane API, send a
"classify_add_del_session" message. The underlying action routine,
vnet_classify_add_del_session(...), is located in
.../vnet/vnet/classify/vnet_classify.c, and has the following
prototype:

int vnet_classify_add_del_session (vnet_classify_main_t * cm, 
                                   u32 table_index, 
                                   u8 * match, 
                                   u32 hit_next_index,
                                   u32 opaque_index, 
                                   i32 advance,
                                   int is_add)

Pass cm = &vnet_classify_main if calling this routine directly. Table
index specifies the table which receives the new session / contains
the session to delete depending on is_add.

Match is the key for the indicated session. It need not be aligned,
but it must be table->match_n_vectors*16 octets in length. As a
courtesy, vnet_classify_add_del_session applies the table's mask to
the stored key-value. In this way, one can create a sess
/*
 * node.c: gre packet processing
 *
 * Copyright (c) 2012 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vlib/vlib.h>
#include <vnet/pg/pg.h>
#include <vnet/gre/gre.h>
#include <vnet/mpls/mpls.h>
#include <vppinfra/sparse_vec.h>

#define foreach_gre_input_next			\
_(PUNT, "error-punt")                           \
_(DROP, "error-drop")                           \
_(ETHERNET_INPUT, "ethernet-input")             \
_(IP4_INPUT, "ip4-input")                       \
_(IP6_INPUT, "ip6-input")			\
_(MPLS_INPUT, "mpls-input")

typedef enum {
#define _(s,n) GRE_INPUT_NEXT_##s,
  foreach_gre_input_next
#undef _
  GRE_INPUT_N_NEXT,
} gre_input_next_t;

typedef struct {
  u32 tunnel_id;
  u32 length;
  ip4_address_t src;
  ip4_address_t dst;
} gre_rx_trace_t;

u8 * format_gre_rx_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  gre_rx_trace_t * t = va_arg (*args, gre_rx_trace_t *);
    
  s = format (s, "GRE: tunnel %d len %d src %U dst %U",
              t->tunnel_id, clib_net_to_host_u16(t->length),
              format_ip4_address, &t->src.as_u8,
              format_ip4_address, &t->dst.as_u8);
  return s;
}

typedef struct {
  /* Sparse vector mapping gre protocol in network byte order
     to next index. */
  u16 * next_by_protocol;
} gre_input_runtime_t;

static uword
gre_input (vlib_main_t * vm,
	   vlib_node_runtime_t * node,
	   vlib_frame_t * from_frame)
{
  gre_main_t * gm = &gre_main;
  gre_input_runtime_t * rt = (void *) node->runtime_data;
  __attribute__((unused)) u32 n_left_from, next_index, * from, * to_next;
  u64 cached_tunnel_key = (u64) ~0;
  u32 cached_tunnel_sw_if_index = 0, tunnel_sw_if_index = 0;

  u32 cpu_index = os_get_cpu_number();
  u32 len;
  vnet_interface_main_t *im = &gm->vnet_main->interface_main;

  from = vlib_frame_vector_args (from_frame);
  n_left_from = from_frame->n_vectors;

  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index,
			   to_next, n_left_to_next);

      while (n_left_from >= 4 && n_left_to_next >= 2)
	{
	  u32 bi0, bi1;
	  vlib_buffer_t * b0, * b1;
	  gre_header_t * h0, * h1;
          u16 version0, version1;
          int verr0, verr1;
	  u32 i0, i1, next0, next1, protocol0, protocol1;
          ip4_header_t *ip0, *ip1;

	  /* Prefetch next iteration. */
	  {
	    vlib_buffer_t * p2, * p3;

	    p2 = vlib_get_buffer (vm, from[2]);
	    p3 = vlib_get_buffer (vm, from[3]);

	    vlib_prefetch_buffer_header (p2, LOAD);
	    vlib_prefetch_buffer_header (p3, LOAD);

	    CLIB_PREFETCH (p2->data, sizeof (h0[0]), LOAD);
	    CLIB_PREFETCH (p3->data, sizeof (h1[0]), LOAD);
	  }

	  bi0 = from[0];
	  bi1 = from[1];
	  to_next[0] = bi0;
	  to_next[1] = bi1;
	  from += 2;
	  to_next += 2;
	  n_left_to_next -= 2;
	  n_left_from -= 2;

	  b0 = vlib_get_buffer (vm, bi0);
	  b1 = vlib_get_buffer (vm, bi1);

          /* ip4_local hands us the ip header, not the gre header */
          ip0 = vlib_buffer_get_current (b0);
          ip1 = vlib_buffer_get_current (b1);

          /* Save src + dst ip4 address, e.g. for mpls-o-gre */
          vnet_buffer(b0)->gre.src = ip0->src_address.as_u32;
          vnet_buffer(b0)->gre.dst = ip0->dst_address.as_u32;
          vnet_buffer(b1)->gre.src = ip1->src_address.as_u32;
          vnet_buffer(b1)->gre.dst = ip1->dst_address.as_u32;

          vlib_buffer_advance (b0, sizeof (*ip0));
          vlib_buffer_advance (b1, sizeof (*ip1));

	  h0 = vlib_buffer_get_current (b0);
	  h1 = vlib_buffer_get_current (b1);

	  /* Index sparse array with network byte order. */
	  protocol0 = h0->protocol;
	  protocol1 = h1->protocol;
	  sparse_vec_index2 (rt->next_by_protocol, protocol0, protocol1,
                             &i0, &i1);
          next0 = vec_elt(rt->next_by_protocol, i0);
          next1 = vec_elt(rt->next_by_protocol, i1);

	  b0->error = node->errors[i0 == SPARSE_VEC_INVALID_INDEX ? GRE_ERROR_UNKNOWN_PROTOCOL : GRE_ERROR_NONE];
	  b1->error = node->errors[i1 == SPARSE_VEC_INVALID_INDEX ? GRE_ERROR_UNKNOWN_PROTOCOL : GRE_ERROR_NONE];
          
          version0 = clib_net_to_host_u16 (h0->flags_and_version);
          verr0 =  version0 & GRE_VERSION_MASK;
          version1 = clib_net_to_host_u16 (h1->flags_and_version);
          verr1 =  version1 & GRE_VERSION_MASK;

          b0->error = verr0 ? node->errors[GRE_ERROR_UNSUPPORTED_VERSION]
              : b0->error;
          next0 = verr0 ? GRE_INPUT_NEXT_DROP : next0;
          b1->error = verr1 ? node->errors[GRE_ERROR_UNSUPPORTED_VERSION]
              : b1->error;
          next1 = verr1 ? GRE_INPUT_NEXT_DROP : next1;


          /* RPF check for ip4/ip6 input */
          if (PREDICT_TRUE(next0 == GRE_INPUT_NEXT_IP4_INPUT
			   || next0 == GRE_INPUT_NEXT_IP6_INPUT
			   || next0 == GRE_INPUT_NEXT_ETHERNET_INPUT
			   || next0 == GRE_INPUT_NEXT_MPLS_INPUT))
            {
              u64 key = ((u64)(vnet_buffer(b0)->gre.dst) << 32) |
                         (u64)(vnet_buffer(b0)->gre.src);

              if (cached_tunnel_key != key)
                {
                  vnet_hw_interface_t * hi;
                  gre_tunnel_t * t;
                  uword * p;

                  p = hash_get (gm->tunnel_by_key, key);
                  if (!p)
                    {
                      next0 = GRE_INPUT_NEXT_DROP;
                      b0->error = node->errors[GRE_ERROR_NO_SUCH_TUNNEL];
                      goto drop0;
                    }
                  t = pool_elt_at_index (gm->tunnels, p[0]);
                  hi = vnet_get_hw_interface (gm->vnet_main,
                            t->hw_if_index);
                  tunnel_sw_if_index = hi->sw_if_index;

                  cached_tunnel_sw_if_index = tunnel_sw_if_index;
                }
              else
                {
                  tunnel_sw_if_index = cached_tunnel_sw_if_index;
                }
            }
          else
            {
		next0 = GRE_INPUT_NEXT_DROP;
                goto drop0;
            }
          len = vlib_buffer_length_in_chain (vm, b0);
          vlib_increment_combined_counter (im->combined_sw_if_counters
                                           + VNET_INTERFACE_COUNTER_RX,
                                           cpu_index,
                                           tunnel_sw_if_index,
                                           1 /* packets */,
                                           len /* bytes */);

          vnet_buffer(b0)->sw_if_index[VLIB_RX] = tunnel_sw_if_index;

drop0:
          if (PREDICT_TRUE(next1 == GRE_INPUT_NEXT_IP4_INPUT
			   || next1 == GRE_INPUT_NEXT_IP6_INPUT
			   || next1 == GRE_INPUT_NEXT_ETHERNET_INPUT
			   || next1 == GRE_INPUT_NEXT_MPLS_INPUT))
            {
              u64 key = ((u64)(vnet_buffer(b1)->gre.dst) << 32) |
                         (u64)(vnet_buffer(b1)->gre.src);

              if (cached_tunnel_key != key)
                {
                  vnet_hw_interface_t * hi;
                  gre_tunnel_t * t;
                  uword * p;

                  p = hash_get (gm->tunnel_by_key, key);
                  if (!p)
                    {
                      next1 = GRE_INPUT_NEXT_DROP;
                      b1->error = node->errors[GRE_ERROR_NO_SUCH_TUNNEL];
                      goto drop1;
                    }
                  t = pool_elt_at_index (gm->tunnels, p[0]);
                  hi = vnet_get_hw_interface (gm->vnet_main,
                            t->hw_if_index);
                  tunnel_sw_if_index = hi->sw_if_index;

                  cached_tunnel_sw_if_index = tunnel_sw_if_index;
                }
              else
                {
                  tunnel_sw_if_index = cached_tunnel_sw_if_index;
                }
            }
          else
            {
		next1 = GRE_INPUT_NEXT_DROP;
                goto drop1;
            }
          len = vlib_buffer_length_in_chain (vm, b1);
          vlib_increment_combined_counter (im->combined_sw_if_counters
                                           + VNET_INTERFACE_COUNTER_RX,
                                           cpu_index,
                                           tunnel_sw_if_index,
                                           1 /* packets */,
                                           len /* bytes */);

          vnet_buffer(b1)->sw_if_index[VLIB_RX] = tunnel_sw_if_index;

drop1:
          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              gre_rx_trace_t *tr = vlib_add_trace (vm, node,
                                                   b0, sizeof (*tr));
              tr->tunnel_id = tunnel_sw_if_index;
              tr->length = ip0->length;
              tr->src.as_u32 = ip0->src_address.as_u32;
              tr->dst.as_u32 = ip0->dst_address.as_u32;
            }

          if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
            {
              gre_rx_trace_t *tr = vlib_add_trace (vm, node,
                                                   b1, sizeof (*tr));
              tr->tunnel_id = tunnel_sw_if_index;
              tr->length = ip1->length;
              tr->src.as_u32 = ip1->src_address.as_u32;
              tr->dst.as_u32 = ip1->dst_address.as_u32;
            }

          vlib_buffer_advance (b0, sizeof (*h0));
          vlib_buffer_advance (b1, sizeof (*h1));

	  vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
					   to_next, n_left_to_next,
					   bi0, bi1, next0, next1);
	}
    
      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t * b0;
	  gre_header_t * h0;
          ip4_header_t * ip0;
          u16 version0;
          int verr0;
	  u32 i0, next0;

	  bi0 = from[0];
	  to_next[0] = bi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  b0 = vlib_get_buffer (vm, bi0);
          ip0 = vlib_buffer_get_current (b0);

          vnet_buffer(b0)->gre.src = ip0->src_address.as_u32;
          vnet_buffer(b0)->gre.dst = ip0->dst_address.as_u32;

          vlib_buffer_advance (b0, sizeof (*ip0));

	  h0 = vlib_buffer_get_current (b0);

	  i0 = sparse_vec_index (rt->next_by_protocol, h0->protocol);
          next0 = vec_elt(rt->next_by_protocol, i0);

	  b0->error = 
              node->errors[i0 == SPARSE_VEC_INVALID_INDEX 
                           ? GRE_ERROR_UNKNOWN_PROTOCOL : GRE_ERROR_NONE];
	  
          version0 = clib_net_to_host_u16 (h0->flags_and_version);
          verr0 =  version0 & GRE_VERSION_MASK;
          b0->error = verr0 ? node->errors[GRE_ERROR_UNSUPPORTED_VERSION] 
              : b0->error;
          next0 = verr0 ? GRE_INPUT_NEXT_DROP : next0;


          /* For IP payload we need to find source interface
             so we can increase counters and help forward node to
             pick right FIB */
          /* RPF check for ip4/ip6 input */
          if (PREDICT_TRUE(next0 == GRE_INPUT_NEXT_IP4_INPUT
			   || next0 == GRE_INPUT_NEXT_IP6_INPUT
			   || next0 == GRE_INPUT_NEXT_ETHERNET_INPUT
			   || next0 == GRE_INPUT_NEXT_MPLS_INPUT))
            {
              u64 key = ((u64)(vnet_buffer(b0)->gre.dst) << 32) |
                         (u64)(vnet_buffer(b0)->gre.src);

              if (cached_tunnel_key != key)
                {
                  vnet_hw_interface_t * hi;
                  gre_tunnel_t * t;
                  uword * p;

                  p = hash_get (gm->tunnel_by_key, key);
                  if (!p)
                    {
                      next0 = GRE_INPUT_NEXT_DROP;
                      b0->error = node->errors[GRE_ERROR_NO_SUCH_TUNNEL];
                      goto drop;
                    }
                  t = pool_elt_at_index (gm->tunnels, p[0]);
                  hi = vnet_get_hw_interface (gm->vnet_main,
                            t->hw_if_index);
                  tunnel_sw_if_index = hi->sw_if_index;

                  cached_tunnel_sw_if_index = tunnel_sw_if_index;
                }
              else
                {
                  tunnel_sw_if_index = cached_tunnel_sw_if_index;
                }
            }
          else
            {
		next0 = GRE_INPUT_NEXT_DROP;
                goto drop;
            }
          len = vlib_buffer_length_in_chain (vm, b0);
          vlib_increment_combined_counter (im->combined_sw_if_counters
                                           + VNET_INTERFACE_COUNTER_RX,
                                           cpu_index,
                                           tunnel_sw_if_index,
                                           1 /* packets */,
                                           len /* bytes */);

          vnet_buffer(b0)->sw_if_index[VLIB_RX] = tunnel_sw_if_index;

drop:
          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) 
            {
              gre_rx_trace_t *tr = vlib_add_trace (vm, node, 
                                                   b0, sizeof (*tr));
              tr->tunnel_id = tunnel_sw_if_index;
              tr->length = ip0->length;
              tr->src.as_u32 = ip0->src_address.as_u32;
              tr->dst.as_u32 = ip0->dst_address.as_u32;
            }

          vlib_buffer_advance (b0, sizeof (*h0));

	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next,
					   bi0, next0);
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }
  vlib_node_increment_counter (vm, gre_input_node.index,
                               GRE_ERROR_PKTS_DECAP, from_frame->n_vectors);
  return from_frame->n_vectors;
}

static char * gre_error_strings[] = {
#define gre_error(n,s) s,
#include "error.def"
#undef gre_error
};

VLIB_REGISTER_NODE (gre_input_node) = {
  .function = gre_input,
  .name = "gre-input",
  /* Takes a vector of packets. */
  .vector_size = sizeof (u32),

  .runtime_data_bytes = sizeof (gre_input_runtime_t),

  .n_errors = GRE_N_ERROR,
  .error_strings = gre_error_strings,

  .n_next_nodes = GRE_INPUT_N_NEXT,
  .next_nodes = {
#define _(s,n) [GRE_INPUT_NEXT_##s] = n,
    foreach_gre_input_next
#undef _
  },

  .format_buffer = format_gre_header_with_length,
  .format_trace = format_gre_rx_trace,
  .unformat_buffer = unformat_gre_header,
};

VLIB_NODE_FUNCTION_MULTIARCH (gre_input_node, gre_input)

void
gre_register_input_protocol (vlib_main_t * vm,
			     gre_protocol_t protocol,
			     u32 node_index)
{
  gre_main_t * em = &gre_main;
  gre_protocol_info_t * pi;
  gre_input_runtime_t * rt;
  u16 * n;

  {
    clib_error_t * error = vlib_call_init_function (vm, gre_input_init);
    if (error)
      clib_error_report (error);
  }

  pi = gre_get_protocol_info (em, protocol);
  pi->node_index = node_index;
  pi->next_index = vlib_node_add_next (vm, 
				       gre_input_node.index,
				       node_index);

  /* Setup gre protocol -> next index sparse vector mapping. */
  rt = vlib_node_get_runtime_data (vm, gre_input_node.index);
  n = sparse_vec_validate (rt->next_by_protocol, 
                           clib_host_to_net_u16 (protocol));
  n[0] = pi->next_index;
}

static void
gre_setup_node (vlib_main_t * vm, u32 node_index)
{
  vlib_node_t * n = vlib_get_node (vm, node_index);
  pg_node_t * pn = pg_get_node (node_index);

  n->format_buffer = format_gre_header_with_length;
  n->unformat_buffer = unformat_gre_header;
  pn->unformat_edit = unformat_pg_gre_header;
}

static clib_error_t * gre_input_init (vlib_main_t * vm)
{
  gre_input_runtime_t * rt;
  vlib_node_t *ethernet_input, *ip4_input, *ip6_input, *mpls_unicast_input;

  {
    clib_error_t * error; 
    error = vlib_call_init_function (vm, gre_init);
    if (error)
      clib_error_report (error);
  }

  gre_setup_node (vm, gre_input_node.index);

  rt = vlib_node_get_runtime_data (vm, gre_input_node.index);

  rt->next_by_protocol = sparse_vec_new
    (/* elt bytes */ sizeof (rt->next_by_protocol[0]),
     /* bits in index */ BITS (((gre_header_t *) 0)->protocol));

  /* These could be moved to the supported protocol input node defn's */
  ethernet_input = vlib_get_node_by_name (vm, (u8 *)"ethernet-input");
  ASSERT(ethernet_input);
  ip4_input = vlib_get_node_by_name (vm, (u8 *)"ip4-input");
  ASSERT(ip4_input);
  ip6_input = vlib_get_node_by_name (vm, (u8 *)"ip6-input");
  ASSERT(ip6_input);
  mpls_unicast_input = vlib_get_node_by_name (vm, (u8 *)"mpls-input");
  ASSERT(mpls_unicast_input);

  gre_register_input_protocol (vm, GRE_PROTOCOL_teb,
                               ethernet_input->index);

  gre_register_input_protocol (vm, GRE_PROTOCOL_ip4, 
                               ip4_input->index);

  gre_register_input_protocol (vm, GRE_PROTOCOL_ip6, 
                               ip6_input->index);

  gre_register_input_protocol (vm, GRE_PROTOCOL_mpls_unicast,
                               mpls_unicast_input->index);

  ip4_register_protocol (IP_PROTOCOL_GRE, gre_input_node.index);

  return 0;
}

VLIB_INIT_FUNCTION (gre_input_init);