/*
 * Copyright (c) 2017-2019 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef SRC_VNET_SESSION_SEGMENT_MANAGER_H_
#define SRC_VNET_SESSION_SEGMENT_MANAGER_H_

#include <svm/message_queue.h>
#include <vppinfra/lock.h>
#include <vppinfra/valloc.h>
#include <svm/fifo_segment.h>

typedef struct _segment_manager_props
{
  u32 rx_fifo_size;			/**< receive fifo size */
  u32 tx_fifo_size;			/**< transmit fifo size */
  u32 evt_q_size;			/**< event queue length */
  u32 segment_size;			/**< first segment size */
  u32 prealloc_fifos;			/**< preallocated fifo pairs */
  u32 add_segment_size;			/**< additional segment size */
  u8 add_segment:1;			/**< can add new segments flag */
  u8 use_mq_eventfd:1;			/**< use eventfds for mqs flag */
  u8 reserved:6;			/**< reserved flags */
  ssvm_segment_type_t segment_type;	/**< seg type: if set to SSVM_N_TYPES,
					     private segments are used */
} segment_manager_props_t;
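
/*
 * Illustrative sketch (not part of this header): how an application layer
 * might fill in these properties before any segments are created. The
 * concrete sizes below are arbitrary example values, not defaults.
 *
 *   segment_manager_props_t props;
 *   segment_manager_props_init (&props);
 *   props.rx_fifo_size = 4 << 10;
 *   props.tx_fifo_size = 4 << 10;
 *   props.segment_size = 1 << 20;
 *   props.add_segment = 1;
 */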

typedef struct _segment_manager
{
  /** Pool of segments allocated by this manager */
  fifo_segment_t *segments;

  /** rwlock that protects the segments pool */
  clib_rwlock_t segments_rwlock;

  /** Owner app worker index */
  u32 app_wrk_index;

  /**
   * First segment should not be deleted unless segment manager is deleted.
   * This also indicates that the segment manager is the first to have been
   * allocated for the app.
   */
  u8 first_is_protected;

  /**
   * App event queue allocated in first segment
   */
  svm_msg_q_t *event_queue;
} segment_manager_t;

typedef struct segment_manager_main_init_args_
{
  u64 baseva;
  u64 size;
} segment_manager_main_init_args_t;

#define SEGMENT_MANAGER_INVALID_APP_INDEX ((u32) ~0)

segment_manager_t *segment_manager_alloc (void);
int segment_manager_init (segment_manager_t * sm, u32 first_seg_size,
			  u32 prealloc_fifo_pairs);
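
/*
 * Illustrative sketch, assuming the caller sits in the session layer:
 * allocate a manager and let it create its first segment, optionally with
 * preallocated fifo pairs. The sizes are example values.
 *
 *   segment_manager_t *sm = segment_manager_alloc ();
 *   if (segment_manager_init (sm, 1 << 20, 0))
 *     return -1;
 */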

/**
 * Cleanup segment manager
 *
 * @param sm	segment manager to be freed
 */
void segment_manager_free (segment_manager_t * sm);

/**
 * Initiate segment manager cleanup
 *
 * @param sm	segment manager to be freed
 */
void segment_manager_init_free (segment_manager_t * sm);
segment_manager_t *segment_manager_get (u32 index);
segment_manager_t *segment_manager_get_if_valid (u32 index);
u32 segment_manager_index (segment_manager_t * sm);

int segment_manager_add_segment (segment_manager_t * sm, u32 segment_size);
void segment_manager_del_segment (segment_manager_t * sm,
				  fifo_segment_t * fs);
fifo_segment_t *segment_manager_get_segment (segment_manager_t * sm,
					     u32 segment_index);
fifo_segment_t *segment_manager_get_segment_w_handle (u64 sh);
fifo_segment_t *segment_manager_get_segment_w_lock (segment_manager_t * sm,
						    u32 segment_index);
int segment_manager_add_first_segment (segment_manager_t * sm,
				       u32 segment_size);
u64 segment_manager_make_segment_handle (u32 segment_manager_index,
					 u32 segment_index);
u64 segment_manager_segment_handle (segment_manager_t * sm,
				    fifo_segment_t * segment);
void segment_manager_segment_reader_unlock (segment_manager_t * sm);
void segment_manager_segment_writer_unlock (segment_manager_t * sm);
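
/*
 * Illustrative sketch of the locking discipline implied by the declarations
 * above: a segment obtained with the _w_lock variant must be released with
 * the matching reader unlock once the caller is done with it.
 *
 *   fifo_segment_t *fs;
 *   u64 segment_handle;
 *
 *   fs = segment_manager_get_segment_w_lock (sm, segment_index);
 *   segment_handle = segment_manager_segment_handle (sm, fs);
 *   segment_manager_segment_reader_unlock (sm);
 */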

int segment_manager_alloc_session_fifos (segment_manager_t * sm,
					 svm_fifo_t ** rx_fifo,
					 svm_fifo_t ** tx_fifo);
int segment_manager_try_alloc_fifos (fifo_segment_t * fs,
				     u32 rx_fifo_size, u32 tx_fifo_size,
				     svm_fifo_t ** rx_fifo,
				     svm_fifo_t ** tx_fifo);
void segment_manager_dealloc_fifos (svm_fifo_t * rx_fifo,
				    svm_fifo_t * tx_fifo);
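
/*
 * Illustrative sketch: allocating a rx/tx fifo pair for a session and
 * releasing it on session cleanup. Error handling is left to the caller.
 *
 *   svm_fifo_t *rx_fifo, *tx_fifo;
 *
 *   if (segment_manager_alloc_session_fifos (sm, &rx_fifo, &tx_fifo))
 *     return -1;
 *   ...
 *   segment_manager_dealloc_fifos (rx_fifo, tx_fifo);
 */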

/**
 * Grows fifo owned by segment manager
 *
 * @param sm	segment manager that owns the fifo
 * @param f	fifo to be grown
 * @param size	number of bytes to add to fifo
 * @return	0 on success, negative number otherwise
 */
int segment_manager_grow_fifo (segment_manager_t * sm, svm_fifo_t * f,
			       u32 size);

/**
 * Request to shrink fifo owned by segment manager
 *
 * If this is not called by the producer, no attempt is made to reduce the
 * size until the producer tries to enqueue more data. To collect the chunks
 * that are to be removed call @ref segment_manager_collect_fifo_chunks.
 *
 * Size reduction does not affect fifo chunk boundaries. Therefore chunks are
 * not split and the number of bytes actually removed may be less than or
 * equal to what was requested.
 *
 * @param sm		segment manager that owns the fifo
 * @param f		fifo to be shrunk
 * @param size		number of bytes to remove from fifo
 * @param is_producer	flag that indicates if the caller is the producer
 * 			for the fifo
 * @return		actual number of bytes to be removed
 */
int segment_manager_shrink_fifo (segment_manager_t * sm, svm_fifo_t * f,
				 u32 size, u8 is_producer);
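
/*
 * Illustrative sketch tying the shrink and collect calls together: request
 * a size reduction as the producer and, once SVM_FIFO_F_COLLECT_CHUNKS is
 * set on the fifo, return the unused chunks to the segment freelist with
 * @ref segment_manager_collect_fifo_chunks (declared below). The size is an
 * example value.
 *
 *   segment_manager_shrink_fifo (sm, f, 4 << 10, 1);
 *   ...
 *   segment_manager_collect_fifo_chunks (sm, f);
 */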

/**
 * Collect fifo chunks that are no longer used
 *
 * This should not be called unless SVM_FIFO_F_COLLECT_CHUNKS is set for
 * the fifo. The chunks are returned to the fifo segment freelist.
 *
 * @param sm		segment manager that owns the fifo
 * @param f		fifo whose chunks are to be collected
 * @return		0 on success, error otherwise
 */
int segment_manager_collect_fifo_chunks (segment_manager_t * sm,
					 svm_fifo_t * f);
u8 segment_manager_has_fifos (segment_manager_t * sm);

svm_msg_q_t *segment_manager_alloc_queue (fifo_segment_t * fs,
					  segment_manager_props_t * props);
void segment_manager_dealloc_queue (segment_manager_t * sm, svm_queue_t * q);
svm_msg_q_t *segment_manager_event_queue (segment_manager_t * sm);
u32 segment_manager_evt_q_expected_size (u32 q_size);

void segment_manager_app_detach (segment_manager_t * sm);

/**
 * Cleanup segment manager sessions
 *
 * Initiates disconnects for all sessions 'owned' by a segment manager by
 * leveraging the backpointers that fifos keep.
 *
 * @param sm	segment manager whose sessions are to be disconnected
 */
void segment_manager_del_sessions (segment_manager_t * sm);
void segment_manager_format_sessions (segment_manager_t * sm, int verbose);

void segment_manager_main_init (segment_manager_main_init_args_t * a);

segment_manager_props_t *segment_manager_props_init (segment_manager_props_t *
						     sm);

#endif /* SRC_VNET_SESSION_SEGMENT_MANAGER_H_ */
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */
/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * @file
 * @brief Local TCP/IP stack punt infrastructure.
 *
 * Provides a set of VPP nodes together with the relevant APIs and CLI
 * commands in order to adjust and dispatch packets from the VPP data plane
 * to the local TCP/IP stack.
 */

#include <vnet/ip/ip.h>
#include <vlib/vlib.h>
#include <vnet/pg/pg.h>
#include <vnet/udp/udp.h>
#include <vnet/tcp/tcp.h>
#include <vnet/sctp/sctp.h>
#include <vnet/ip/punt.h>
#include <vppinfra/sparse_vec.h>
#include <vlib/unix/unix.h>

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <stdlib.h>

#define foreach_punt_next			\
  _ (PUNT4, "ip4-punt")                         \
  _ (PUNT6, "ip6-punt")

typedef enum
{
#define _(s,n) PUNT_NEXT_##s,
  foreach_punt_next
#undef _
    PUNT_N_NEXT,
} punt_next_t;

enum punt_socket_rx_next_e
{
  PUNT_SOCKET_RX_NEXT_INTERFACE_OUTPUT,
  PUNT_SOCKET_RX_NEXT_IP4_LOOKUP,
  PUNT_SOCKET_RX_NEXT_IP6_LOOKUP,
  PUNT_SOCKET_RX_N_NEXT
};

#define punt_next_punt(is_ip4) (is_ip4 ? PUNT_NEXT_PUNT4 : PUNT_NEXT_PUNT6)

extern vlib_node_registration_t udp4_punt_node;
extern vlib_node_registration_t udp6_punt_node;
extern vlib_node_registration_t udp4_punt_socket_node;
extern vlib_node_registration_t udp6_punt_socket_node;
static vlib_node_registration_t punt_socket_rx_node;

extern punt_main_t punt_main;

#ifndef CLIB_MARCH_VARIANT
punt_main_t punt_main;

char *
vnet_punt_get_server_pathname (void)
{
  punt_main_t *pm = &punt_main;
  return pm->sun_path;
}
#endif /* CLIB_MARCH_VARIANT */

/** @brief IPv4/IPv6 UDP punt node main loop.

    This is the main loop inline function for IPv4/IPv6 UDP punt
    transition node.

    @param vm vlib_main_t corresponding to the current thread
    @param node vlib_node_runtime_t
    @param frame vlib_frame_t whose contents should be dispatched
    @param is_ipv4 indicates if called for IPv4 or IPv6 node
*/
always_inline uword
udp46_punt_inline (vlib_main_t * vm,
		   vlib_node_runtime_t * node,
		   vlib_frame_t * from_frame, int is_ip4)
{
  u32 n_left_from, *from, *to_next;
  word advance;

  from = vlib_frame_vector_args (from_frame);
  n_left_from = from_frame->n_vectors;

  /* udp[46]_lookup hands us the data payload, not the IP header */
  if (is_ip4)
    advance = -(sizeof (ip4_header_t) + sizeof (udp_header_t));
  else
    advance = -(sizeof (ip6_header_t) + sizeof (udp_header_t));

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, punt_next_punt (is_ip4), to_next,
			   n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t *b0;

	  bi0 = from[0];
	  to_next[0] = bi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  b0 = vlib_get_buffer (vm, bi0);
	  vlib_buffer_advance (b0, advance);
	  b0->error = node->errors[PUNT_ERROR_UDP_PORT];
	}

      vlib_put_next_frame (vm, node, punt_next_punt (is_ip4), n_left_to_next);
    }

  return from_frame->n_vectors;
}

static char *punt_error_strings[] = {
#define punt_error(n,s) s,
#include "punt_error.def"
#undef punt_error
};

/** @brief IPv4 UDP punt node.
    @node ip4-udp-punt

    This is the IPv4 UDP punt transition node. It is registered as a next
    node of "ip4-udp-lookup" for the UDP port(s) requested for punt.
    The buffer's current data pointer is adjusted to the original packet
    IPv4 header. All buffers are dispatched to "error-punt".

    @param vm vlib_main_t corresponding to the current thread
    @param node vlib_node_runtime_t
    @param frame vlib_frame_t whose contents should be dispatched

    @par Graph mechanics: next index usage

    @em Sets:
    - <code>vnet_buffer(b)->current_data</code>
    - <code>vnet_buffer(b)->current_len</code>

    <em>Next Index:</em>
    - Dispatches the packet to the "error-punt" node
*/
VLIB_NODE_FN (udp4_punt_node) (vlib_main_t * vm,
			       vlib_node_runtime_t * node,
			       vlib_frame_t * from_frame)
{
  return udp46_punt_inline (vm, node, from_frame, 1 /* is_ip4 */ );
}

/** @brief IPv6 UDP punt node.
    @node ip6-udp-punt

    This is the IPv6 UDP punt transition node. It is registered as a next
    node of "ip6-udp-lookup" for the UDP port(s) requested for punt.
    The buffer's current data pointer is adjusted to the original packet
    IPv6 header. All buffers are dispatched to "error-punt".

    @param vm vlib_main_t corresponding to the current thread
    @param node vlib_node_runtime_t
    @param frame vlib_frame_t whose contents should be dispatched

    @par Graph mechanics: next index usage

    @em Sets:
    - <code>vnet_buffer(b)->current_data</code>
    - <code>vnet_buffer(b)->current_len</code>

    <em>Next Index:</em>
    - Dispatches the packet to the "error-punt" node
*/
VLIB_NODE_FN (udp6_punt_node) (vlib_main_t * vm,
			       vlib_node_runtime_t * node,
			       vlib_frame_t * from_frame)
{
  return udp46_punt_inline (vm, node, from_frame, 0 /* is_ip4 */ );
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (udp4_punt_node) = {
  .name = "ip4-udp-punt",
  /* Takes a vector of packets. */
  .vector_size = sizeof (u32),

  .n_errors = PUNT_N_ERROR,
  .error_strings = punt_error_strings,

  .n_next_nodes = PUNT_N_NEXT,
  .next_nodes = {
#define _(s,n) [PUNT_NEXT_##s] = n,
     foreach_punt_next
#undef _
  },
};

VLIB_REGISTER_NODE (udp6_punt_node) = {
  .name = "ip6-udp-punt",
  /* Takes a vector of packets. */
  .vector_size = sizeof (u32),

  .n_errors = PUNT_N_ERROR,
  .error_strings = punt_error_strings,

  .n_next_nodes = PUNT_N_NEXT,
  .next_nodes = {
#define _(s,n) [PUNT_NEXT_##s] = n,
     foreach_punt_next
#undef _
  },
};

/* *INDENT-ON* */

static punt_client_t *
punt_client_get (bool is_ip4, u16 port)
{
  punt_main_t *pm = &punt_main;
  punt_client_t *v =
    is_ip4 ? pm->clients_by_dst_port4 : pm->clients_by_dst_port6;

  u16 i = sparse_vec_index (v, port);
  if (i == SPARSE_VEC_INVALID_INDEX)
    return 0;

  return &vec_elt (v, i);
}

static struct sockaddr_un *
punt_socket_get (bool is_ip4, u16 port)
{
  punt_client_t *v = punt_client_get (is_ip4, port);
  if (v)
    return &v->caddr;

  return NULL;
}

#ifndef CLIB_MARCH_VARIANT
static int
punt_socket_register (bool is_ip4, u8 protocol, u16 port,
		      char *client_pathname)
{
  punt_main_t *pm = &punt_main;
  punt_client_t c, *n;
  punt_client_t *v = is_ip4 ? pm->clients_by_dst_port4 :
    pm->clients_by_dst_port6;

  if (strncmp (client_pathname, vnet_punt_get_server_pathname (),
	       UNIX_PATH_MAX) == 0)
    return -1;

  clib_memset (&c, 0, sizeof (c));
  /* c was zeroed above; copy at most size - 1 bytes so the path stays
   * NUL terminated and a short client_pathname is not over-read */
  strncpy (c.caddr.sun_path, client_pathname, sizeof (c.caddr.sun_path) - 1);
  c.caddr.sun_family = AF_UNIX;
  c.port = port;
  c.protocol = protocol;
  n = sparse_vec_validate (v, port);
  n[0] = c;
  return 0;
}

/* $$$$ Just leaves the mapping in place for now */
static void
punt_socket_unregister (bool is_ip4, u8 protocol, u16 port)
{
  return;
}
#endif /* CLIB_MARCH_VARIANT */

typedef struct
{
  punt_client_t client;
  u8 is_midchain;
} udp_punt_trace_t;

static u8 *
format_udp_punt_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  udp_punt_trace_t *t = va_arg (*args, udp_punt_trace_t *);
  u32 indent = format_get_indent (s);
  s = format (s, "to: %s", t->client.caddr.sun_path);
  if (t->is_midchain)
    {
      s = format (s, "\n%U(buffer is part of chain)", format_white_space,
		  indent);
    }
  return s;
}

always_inline uword
udp46_punt_socket_inline (vlib_main_t * vm,
			  vlib_node_runtime_t * node,
			  vlib_frame_t * frame, bool is_ip4)
{
  u32 *buffers = vlib_frame_vector_args (frame);
  uword n_packets = frame->n_vectors;
  struct iovec *iovecs = 0;
  punt_main_t *pm = &punt_main;
  int i;

  u32 node_index = is_ip4 ? udp4_punt_socket_node.index :
    udp6_punt_socket_node.index;

  for (i = 0; i < n_packets; i++)
    {
      struct iovec *iov;
      vlib_buffer_t *b;
      uword l;
      punt_packetdesc_t packetdesc;

      b = vlib_get_buffer (vm, buffers[i]);

      /* Reverse UDP Punt advance */
      udp_header_t *udp;
      if (is_ip4)
	{
	  vlib_buffer_advance (b, -(sizeof (ip4_header_t) +
				    sizeof (udp_header_t)));
	  ip4_header_t *ip = vlib_buffer_get_current (b);
	  udp = (udp_header_t *) (ip + 1);
	}
      else
	{
	  vlib_buffer_advance (b, -(sizeof (ip6_header_t) +
				    sizeof (udp_header_t)));
	  ip6_header_t *ip = vlib_buffer_get_current (b);
	  udp = (udp_header_t *) (ip + 1);
	}

      u16 port = clib_net_to_host_u16 (udp->dst_port);

      /*
       * Find registered client
       * If no registered client, drop packet and count
       */
      struct sockaddr_un *caddr;
      caddr = punt_socket_get (is_ip4, port);
      if (!caddr)
	{
	  vlib_node_increment_counter (vm, node_index,
				       PUNT_ERROR_SOCKET_TX_ERROR, 1);
	  goto error;
	}

      punt_client_t *c = NULL;
      if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
	{
	  c = punt_client_get (is_ip4, port);
	  udp_punt_trace_t *t;
	  t = vlib_add_trace (vm, node, b, sizeof (t[0]));
	  clib_memcpy_fast (&t->client, c, sizeof (t->client));
	}

      /* Re-set iovecs if present. */
      if (iovecs)
	_vec_len (iovecs) = 0;

      /* Add packet descriptor */
      packetdesc.sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX];
      packetdesc.action = 0;
      vec_add2 (iovecs, iov, 1);
      iov->iov_base = &packetdesc;
      iov->iov_len = sizeof (packetdesc);

      /** VLIB buffer chain -> Unix iovec(s). */
      vlib_buffer_advance (b, -(sizeof (ethernet_header_t)));
      vec_add2 (iovecs, iov, 1);
      iov->iov_base = b->data + b->current_data;
      iov->iov_len = l = b->current_length;

      if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
	{
	  do
	    {
	      b = vlib_get_buffer (vm, b->next_buffer);
	      if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
		{
		  if (PREDICT_FALSE (!c))
		    {
		      c = punt_client_get (is_ip4, port);
		    }
		  udp_punt_trace_t *t;
		  t = vlib_add_trace (vm, node, b, sizeof (t[0]));
		  clib_memcpy_fast (&t->client, c, sizeof (t->client));
		  t->is_midchain = 1;
		}

	      vec_add2 (iovecs, iov, 1);

	      iov->iov_base = b->data + b->current_data;
	      iov->iov_len = b->current_length;
	      l += b->current_length;
	    }
	  while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
	}

      struct msghdr msg = {
	.msg_name = caddr,
	.msg_namelen = sizeof (*caddr),
	.msg_iov = iovecs,
	.msg_iovlen = vec_len (iovecs),
      };

      if (sendmsg (pm->socket_fd, &msg, 0) < (ssize_t) l)
	vlib_node_increment_counter (vm, node_index,
				     PUNT_ERROR_SOCKET_TX_ERROR, 1);
      else
	vlib_node_increment_counter (vm, node_index, PUNT_ERROR_SOCKET_TX, 1);

    }

error:
  vlib_buffer_free (vm, buffers, n_packets);

  return n_packets;
}
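
/*
 * Hedged sketch of what an external client on the punt unix socket would
 * receive from the loop above: a datagram whose first bytes are the
 * punt_packetdesc_t written into iovec[0], followed by the packet starting
 * at its ethernet header. The buffer size and client_fd are hypothetical.
 *
 *   u8 buf[2048];
 *   ssize_t n = recv (client_fd, buf, sizeof (buf), 0);
 *   if (n > (ssize_t) sizeof (punt_packetdesc_t))
 *     {
 *       punt_packetdesc_t *desc = (punt_packetdesc_t *) buf;
 *       u8 *frame = buf + sizeof (desc[0]);
 *       ... desc->sw_if_index identifies the VPP rx interface,
 *           frame points at the ethernet header ...
 *     }
 */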

static uword
udp4_punt_socket (vlib_main_t * vm,
		  vlib_node_runtime_t * node, vlib_frame_t * from_frame)
{
  return udp46_punt_socket_inline (vm, node, from_frame, true /* is_ip4 */ );
}

static uword
udp6_punt_socket (vlib_main_t * vm,
		  vlib_node_runtime_t * node, vlib_frame_t * from_frame)
{
  return udp46_punt_socket_inline (vm, node, from_frame, false /* is_ip4 */ );
}


/* *INDENT-OFF* */
VLIB_REGISTER_NODE (udp4_punt_socket_node) = {
  .function = udp4_punt_socket,
  .name = "ip4-udp-punt-socket",
  .format_trace = format_udp_punt_trace,
  .flags = VLIB_NODE_FLAG_IS_DROP,
  /* Takes a vector of packets. */
  .vector_size = sizeof (u32),
  .n_errors = PUNT_N_ERROR,
  .error_strings = punt_error_strings,
};
VLIB_REGISTER_NODE (udp6_punt_socket_node) = {
  .function = udp6_punt_socket,
  .name = "ip6-udp-punt-socket",
  .format_trace = format_udp_punt_trace,
  .flags = VLIB_NODE_FLAG_IS_DROP,
  .vector_size = sizeof (u32),
  .n_errors = PUNT_N_ERROR,
  .error_strings = punt_error_strings,
};
/* *INDENT-ON* */

typedef struct
{
  enum punt_action_e action;
  u32 sw_if_index;
} punt_trace_t;

static u8 *
format_punt_trace (u8 * s, va_list * va)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
  vnet_main_t *vnm = vnet_get_main ();
  punt_trace_t *t = va_arg (*va, punt_trace_t *);
  s = format (s, "%U Action: %d", format_vnet_sw_if_index_name,
	      vnm, t->sw_if_index, t->action);
  return s;
}

static uword
punt_socket_rx_fd (vlib_main_t * vm, vlib_node_runtime_t * node, u32 fd)
{
  const uword buffer_size = vlib_buffer_get_default_data_size (vm);
  u32 n_trace = vlib_get_trace_count (vm, node);
  u32 next = node->cached_next_index;
  u32 n_left_to_next, next_index;
  u32 *to_next;
  u32 error = PUNT_ERROR_NONE;
  vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);

  /* $$$$ Only dealing with one buffer at a time for now */

  u32 bi;
  vlib_buffer_t *b;
  punt_packetdesc_t packetdesc;
  ssize_t size;
  struct iovec io[2];

  if (vlib_buffer_alloc (vm, &bi, 1) != 1)
    {
      error = PUNT_ERROR_NOBUFFER;
      goto error;
    }

  b = vlib_get_buffer (vm, bi);
  io[0].iov_base = &packetdesc;
  io[0].iov_len = sizeof (packetdesc);
  io[1].iov_base = b->data;
  io[1].iov_len = buffer_size;

  size = readv (fd, io, 2);
  /* We need at least the packet descriptor plus a header */
  if (size <= (int) (sizeof (packetdesc) + sizeof (ip4_header_t)))
    {
      vlib_buffer_free (vm, &bi, 1);
      error = PUNT_ERROR_READV;
      goto error;
    }

  b->flags = VNET_BUFFER_F_LOCALLY_ORIGINATED;
  b->current_length = size - sizeof (packetdesc);

  VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b);

  switch (packetdesc.action)
    {
    case PUNT_L2:
      vnet_buffer (b)->sw_if_index[VLIB_TX] = packetdesc.sw_if_index;
      next_index = PUNT_SOCKET_RX_NEXT_INTERFACE_OUTPUT;
      break;

    case PUNT_IP4_ROUTED:
      vnet_buffer (b)->sw_if_index[VLIB_RX] = packetdesc.sw_if_index;
      vnet_buffer (b)->sw_if_index[VLIB_TX] = ~0;
      next_index = PUNT_SOCKET_RX_NEXT_IP4_LOOKUP;
      break;

    case PUNT_IP6_ROUTED:
      vnet_buffer (b)->sw_if_index[VLIB_RX] = packetdesc.sw_if_index;
      vnet_buffer (b)->sw_if_index[VLIB_TX] = ~0;
      next_index = PUNT_SOCKET_RX_NEXT_IP6_LOOKUP;
      break;

    default:
      error = PUNT_ERROR_ACTION;
      vlib_buffer_free (vm, &bi, 1);
      goto error;
    }

  if (PREDICT_FALSE (n_trace > 0))
    {
      punt_trace_t *t;
      vlib_trace_buffer (vm, node, next_index, b, 1 /* follow_chain */ );
      vlib_set_trace_count (vm, node, --n_trace);
      t = vlib_add_trace (vm, node, b, sizeof (*t));
      t->sw_if_index = packetdesc.sw_if_index;
      t->action = packetdesc.action;
    }

  to_next[0] = bi;
  to_next++;
  n_left_to_next--;

  vlib_validate_buffer_enqueue_x1 (vm, node, next, to_next, n_left_to_next,
				   bi, next_index);
  vlib_put_next_frame (vm, node, next, n_left_to_next);
  return 1;

error:
  vlib_node_increment_counter (vm, punt_socket_rx_node.index, error, 1);
  return 0;
}

static uword
punt_socket_rx (vlib_main_t * vm,
		vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  punt_main_t *pm = &punt_main;
  u32 total_count = 0;
  int i;

  for (i = 0; i < vec_len (pm->ready_fds); i++)
    total_count += punt_socket_rx_fd (vm, node, pm->ready_fds[i]);

  /* All ready fds were serviced above; reset the vector in one go rather
   * than deleting in place, which would skip entries as the vector shrinks */
  vec_reset_length (pm->ready_fds);
  return total_count;
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (punt_socket_rx_node, static) =
{
 .function = punt_socket_rx,
 .name = "punt-socket-rx",
 .type = VLIB_NODE_TYPE_INPUT,
 .state = VLIB_NODE_STATE_INTERRUPT,
 .vector_size = 1,
 .n_errors = PUNT_N_ERROR,
 .error_strings = punt_error_strings,
 .n_next_nodes = PUNT_SOCKET_RX_N_NEXT,
 .next_nodes = {
		[PUNT_SOCKET_RX_NEXT_INTERFACE_OUTPUT] = "interface-output",
		[PUNT_SOCKET_RX_NEXT_IP4_LOOKUP] = "ip4-lookup",
		[PUNT_SOCKET_RX_NEXT_IP6_LOOKUP] = "ip6-lookup",
		},
 .format_trace = format_punt_trace,
};
/* *INDENT-ON* */

static clib_error_t *
punt_socket_read_ready (clib_file_t * uf)
{
  vlib_main_t *vm = vlib_get_main ();
  punt_main_t *pm = &punt_main;

  /** Schedule the rx node */
  vlib_node_set_interrupt_pending (vm, punt_socket_rx_node.index);
  vec_add1 (pm->ready_fds, uf->file_descriptor);

  return 0;
}

#ifndef CLIB_MARCH_VARIANT
clib_error_t *
vnet_punt_socket_add (vlib_main_t * vm, u32 header_version,
		      bool is_ip4, u8 protocol, u16 port,
		      char *client_pathname)
{
  punt_main_t *pm = &punt_main;

  if (!pm->is_configured)
    return clib_error_return (0, "socket is not configured");

  if (header_version != PUNT_PACKETDESC_VERSION)
    return clib_error_return (0, "Invalid packet descriptor version");

  /* For now we only support UDP punt */
  if (protocol != IP_PROTOCOL_UDP)
    return clib_error_return (0,
			      "only UDP protocol (%d) is supported, got %d",
			      IP_PROTOCOL_UDP, protocol);

  if (port == (u16) ~ 0)
    return clib_error_return (0, "UDP port number required");

  /* Register client */
  if (punt_socket_register (is_ip4, protocol, port, client_pathname) < 0)
    return clib_error_return (0,
			      "Punt socket: Invalid client path: %s",
			      client_pathname);

  u32 node_index = is_ip4 ? udp4_punt_socket_node.index :
    udp6_punt_socket_node.index;

  udp_register_dst_port (vm, port, node_index, is_ip4);

  return 0;
}
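
/*
 * Illustrative sketch, e.g. from an init or API handler: register an
 * external client socket for punted IPv4 UDP packets to port 9999. The
 * port number and client path are hypothetical example values.
 *
 *   clib_error_t *err;
 *   err = vnet_punt_socket_add (vm, PUNT_PACKETDESC_VERSION, true,
 *                               IP_PROTOCOL_UDP, 9999,
 *                               "/run/vpp/punt_client.sock");
 *   if (err)
 *     clib_error_report (err);
 */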

clib_error_t *
vnet_punt_socket_del (vlib_main_t * vm, bool is_ip4, u8 l4_protocol, u16 port)
{
  punt_main_t *pm = &punt_main;

  if (!pm->is_configured)
    return clib_error_return (0, "socket is not configured");

  punt_socket_unregister (is_ip4, l4_protocol, port);
  udp_unregister_dst_port (vm, port, is_ip4);

  return 0;
}

/**
 * @brief Request IP traffic punt to the local TCP/IP stack.
 *
 * @em Note
 * - UDP, TCP and SCTP are the only protocols supported in the current
 *   implementation; punting of specific ports is supported for UDP only
 *
 * @param vm       vlib_main_t corresponding to the current thread
 * @param ipv      IP protocol version.
 *                 4 - IPv4, 6 - IPv6, ~0 for both IPv6 and IPv4
 * @param protocol 8-bit L4 protocol value
 *                 UDP is 17
 *                 TCP is 6
 * @param port     16-bit L4 (TCP/UDP) port number when applicable (UDP only)
 *
 * @returns 0 on success, non-zero value otherwise
 */
clib_error_t *
vnet_punt_add_del (vlib_main_t * vm, u8 ipv, u8 protocol, u16 port,
		   bool is_add)
{

  /* For now we only support TCP, UDP and SCTP punt */
  if (protocol != IP_PROTOCOL_UDP &&
      protocol != IP_PROTOCOL_TCP && protocol != IP_PROTOCOL_SCTP)
    return clib_error_return (0,
			      "only UDP (%d), TCP (%d) and SCTP (%d) protocols are supported, got %d",
			      IP_PROTOCOL_UDP, IP_PROTOCOL_TCP,
			      IP_PROTOCOL_SCTP, protocol);

  if (ipv != (u8) ~ 0 && ipv != 4 && ipv != 6)
    return clib_error_return (0, "IP version must be 4 or 6, got %d", ipv);

  if (port == (u16) ~ 0)
    {
      if ((ipv == 4) || (ipv == (u8) ~ 0))
	{
	  if (protocol == IP_PROTOCOL_UDP)
	    udp_punt_unknown (vm, 1, is_add);
	  else if (protocol == IP_PROTOCOL_TCP)
	    tcp_punt_unknown (vm, 1, is_add);
	  else if (protocol == IP_PROTOCOL_SCTP)
	    sctp_punt_unknown (vm, 1, is_add);
	}

      if ((ipv == 6) || (ipv == (u8) ~ 0))
	{
	  if (protocol == IP_PROTOCOL_UDP)
	    udp_punt_unknown (vm, 0, is_add);
	  else if (protocol == IP_PROTOCOL_TCP)
	    tcp_punt_unknown (vm, 0, is_add);
	  else if (protocol == IP_PROTOCOL_SCTP)
	    sctp_punt_unknown (vm, 0, is_add);
	}

      return 0;
    }

  else if (is_add)
    {
      if (protocol == IP_PROTOCOL_TCP || protocol == IP_PROTOCOL_SCTP)
	return clib_error_return (0,
				  "punt TCP/SCTP ports is not supported yet");

      if (ipv == 4 || ipv == (u8) ~ 0)
	udp_register_dst_port (vm, port, udp4_punt_node.index, 1);

      if (ipv == 6 || ipv == (u8) ~ 0)
	udp_register_dst_port (vm, port, udp6_punt_node.index, 0);

      return 0;
    }
  else
    {
      if (protocol == IP_PROTOCOL_TCP || protocol == IP_PROTOCOL_SCTP)
	return clib_error_return (0,
				  "punt TCP/SCTP ports is not supported yet");
      if (ipv == 4 || ipv == (u8) ~ 0)
	udp_unregister_dst_port (vm, port, 1);

      if (ipv == 6 || ipv == (u8) ~ 0)
	udp_unregister_dst_port (vm, port, 0);

      return 0;
    }
}
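
/*
 * Illustrative sketch of a direct caller, mirroring what the CLI below does
 * for both address families: punt IPv4 UDP port 500 (an arbitrary example)
 * to the ip4-udp-punt node.
 *
 *   clib_error_t *err = vnet_punt_add_del (vm, 4, IP_PROTOCOL_UDP, 500,
 *                                          true);
 */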
#endif /* CLIB_MARCH_VARIANT */

static clib_error_t *
punt_cli (vlib_main_t * vm,
	  unformat_input_t * input, vlib_cli_command_t * cmd)
{
  u32 port = ~0;
  bool is_add = true;
  u32 protocol = ~0;
  clib_error_t *error = NULL;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "del"))
	is_add = false;
      else if (unformat (input, "all"))
	;
      else if (unformat (input, "%d", &port))
	;
      else if (unformat (input, "udp"))
	protocol = IP_PROTOCOL_UDP;
      else if (unformat (input, "tcp"))
	protocol = IP_PROTOCOL_TCP;
      else
	{
	  error = clib_error_return (0, "parse error: '%U'",
				     format_unformat_error, input);
	  goto done;
	}
    }

  /* punt both IPv6 and IPv4 when used in CLI */
  error = vnet_punt_add_del (vm, ~0, protocol, port, is_add);
  if (error)
    {
      clib_error_report (error);
    }

done:
  return error;
}

/*?
 * The set of '<em>set punt</em>' commands allows specific IP traffic to
 * be punted to the host TCP/IP stack.
 *
 * @em Note
 * - UDP is the only protocol supported in the current implementation
 * - All TCP traffic is currently punted to the host by default
 *
 * @cliexpar
 * @parblock
 * Example of how to request NTP traffic to be punted
 * @cliexcmd{set punt udp 123}
 *
 * Example of how to request all 'unknown' UDP traffic to be punted
 * @cliexcmd{set punt udp all}
 *
 * Example of how to stop all 'unknown' UDP traffic from being punted
 * @cliexcmd{set punt udp del all}
 * @endparblock
?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (punt_command, static) = {
  .path = "set punt",
  .short_help = "set punt [udp|tcp] [del] <all | port-num1 [port-num2 ...]>",
  .function = punt_cli,
};
/* *INDENT-ON* */

#ifndef CLIB_MARCH_VARIANT
static clib_error_t *
punt_socket_register_cmd (vlib_main_t * vm,
			  unformat_input_t * input, vlib_cli_command_t * cmd)
{
  bool is_ipv4 = true;
  u32 protocol = ~0;
  u32 port = ~0;
  u8 *socket_name = 0;
  clib_error_t *error = NULL;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "ipv4"))
	;
      else if (unformat (input, "ipv6"))
	is_ipv4 = false;
      else if (unformat (input, "udp"))
	protocol = IP_PROTOCOL_UDP;
      else if (unformat (input, "tcp"))
	protocol = IP_PROTOCOL_TCP;
      else if (unformat (input, "%d", &port))
	;
      else if (unformat (input, "socket %s", &socket_name))
	;
      else
	{
	  error = clib_error_return (0, "parse error: '%U'",
				     format_unformat_error, input);
	  goto done;
	}
    }

  error =
    vnet_punt_socket_add (vm, 1, is_ipv4, protocol, port,
			  (char *) socket_name);
done:
  return error;
}

/*?
 *
 * @cliexpar
 * @cliexcmd{punt socket register}
 ?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (punt_socket_register_command, static) =
{
  .path = "punt socket register",
  .function = punt_socket_register_cmd,
  .short_help = "punt socket register [ipv4|ipv6] [udp|tcp]> <all | port-num1 [port-num2 ...]> <socket>",
  .is_mp_safe = 1,
};
/* *INDENT-ON* */

static clib_error_t *
punt_socket_deregister_cmd (vlib_main_t * vm,
			    unformat_input_t * input,
			    vlib_cli_command_t * cmd)
{
  bool is_ipv4 = true;
  u32 protocol = ~0;
  u32 port = ~0;
  clib_error_t *error = NULL;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "ipv4"))
	;
      else if (unformat (input, "ipv6"))
	is_ipv4 = false;
      else if (unformat (input, "udp"))
	protocol = IP_PROTOCOL_UDP;
      else if (unformat (input, "tcp"))
	protocol = IP_PROTOCOL_TCP;
      else if (unformat (input, "%d", &port))
	;
      else
	{
	  error = clib_error_return (0, "parse error: '%U'",
				     format_unformat_error, input);
	  goto done;
	}
    }

  error = vnet_punt_socket_del (vm, is_ipv4, protocol, port);
done:
  return error;
}

/*?
 *
 * @cliexpar
 * @cliexcmd{punt socket deregister}
 ?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (punt_socket_deregister_command, static) =
{
  .path = "punt socket deregister",
  .function = punt_socket_deregister_cmd,
  .short_help = "punt socket deregister [ipv4|ipv6] [udp|tcp]> <all | port-num1 [port-num2 ...]>",
  .is_mp_safe = 1,
};
/* *INDENT-ON* */

punt_socket_detail_t *
punt_socket_entries (u8 ipv)
{
  punt_main_t *pm = &punt_main;
  punt_client_t *pc;
  punt_socket_detail_t *ps = 0;
  bool is_valid;

  punt_client_t *v = !ipv ? pm->clients_by_dst_port4 :
    pm->clients_by_dst_port6;

  vec_foreach (pc, v)
  {
    if (pc && pc->port != 0)
      {
	is_valid = false;
	if (pc->protocol == IP_PROTOCOL_UDP)
	  {
	    is_valid = udp_is_valid_dst_port (pc->port, !ipv);
	  }
	if (is_valid)
	  {
	    punt_socket_detail_t detail = {
	      .ipv = ipv,
	      .l4_protocol = pc->protocol,
	      .l4_port = pc->port
	    };
	    memcpy (detail.pathname, pc->caddr.sun_path,
		    sizeof (pc->caddr.sun_path));
	    vec_add1 (ps, detail);
	  }
      }
  }
  return ps;
}

u8 *
format_punt_socket (u8 * s, va_list * args)
{
  punt_client_t *clients = va_arg (*args, punt_client_t *);
  u8 *is_ipv6 = va_arg (*args, u8 *);
  punt_client_t *pc;
  bool is_valid;

  vec_foreach (pc, clients)
  {
    if (pc && pc->port != 0)
      {
	is_valid = false;
	if (pc->protocol == IP_PROTOCOL_UDP)
	  {
	    is_valid = udp_is_valid_dst_port (pc->port, !(*is_ipv6));
	  }
	if (is_valid)
	  {
	    s = format (s, " punt %s port %d to socket %s \n",
			(pc->protocol == IP_PROTOCOL_UDP) ? "UDP" : "TCP",
			pc->port, pc->caddr.sun_path);
	  }
      }
  }

  return (s);
}

static clib_error_t *
punt_socket_show_cmd (vlib_main_t * vm,
		      unformat_input_t * input, vlib_cli_command_t * cmd)
{
  u8 is_ipv6 = 0;
  punt_main_t *pm = &punt_main;
  clib_error_t *error = NULL;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "ipv4"))
	is_ipv6 = 0;
      else if (unformat (input, "ipv6"))
	is_ipv6 = 1;
      else
	{
	  error = clib_error_return (0, "parse error: '%U'",
				     format_unformat_error, input);
	  goto done;
	}
    }

  punt_client_t *v =
    is_ipv6 ? pm->clients_by_dst_port6 : pm->clients_by_dst_port4;
  vlib_cli_output (vm, "%U", format_punt_socket, v, &is_ipv6);

done:
  return (error);
}

/*?
 *
 * @cliexpar
 * @cliexcmd{show punt socket registrations ipv4}
 ?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_punt_socket_registration_command, static) =
{
  .path = "show punt socket registrations",
  .function = punt_socket_show_cmd,
  .short_help = "show punt socket registrations [ipv4|ipv6]",
  .is_mp_safe = 1,
};
/* *INDENT-ON* */

clib_error_t *
ip_punt_init (vlib_main_t * vm)
{
  punt_main_t *pm = &punt_main;

  pm->clients_by_dst_port6 = sparse_vec_new
    (sizeof (pm->clients_by_dst_port6[0]),
     BITS (((udp_header_t *) 0)->dst_port));
  pm->clients_by_dst_port4 = sparse_vec_new
    (sizeof (pm->clients_by_dst_port4[0]),
     BITS (((udp_header_t *) 0)->dst_port));

  pm->is_configured = false;
  pm->interface_output_node = vlib_get_node_by_name (vm,
						     (u8 *)
						     "interface-output");
  return 0;
}

VLIB_INIT_FUNCTION (ip_punt_init);
#endif /* CLIB_MARCH_VARIANT */

static clib_error_t *
punt_config (vlib_main_t * vm, unformat_input_t * input)
{
  punt_main_t *pm = &punt_main;
  char *socket_path = 0;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "socket %s", &socket_path))
	strncpy (pm->sun_path, socket_path, UNIX_PATH_MAX - 1);
      else
	return clib_error_return (0, "unknown input `%U'",
				  format_unformat_error, input);
    }

  if (socket_path == 0)
    return 0;

  /* UNIX domain socket */
  struct sockaddr_un addr;
  if ((pm->socket_fd = socket (AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0)) == -1)
    {
      return clib_error_return (0, "socket error");
    }

  clib_memset (&addr, 0, sizeof (addr));
  addr.sun_family = AF_UNIX;
  if (*socket_path == '\0')
    {
      *addr.sun_path = '\0';
      strncpy (addr.sun_path + 1, socket_path + 1,
	       sizeof (addr.sun_path) - 2);
    }
  else
    {
      strncpy (addr.sun_path, socket_path, sizeof (addr.sun_path) - 1);
      unlink (socket_path);
    }

  if (bind (pm->socket_fd, (struct sockaddr *) &addr, sizeof (addr)) == -1)
    {
      return clib_error_return (0, "bind error");
    }

  /* Register socket */
  clib_file_main_t *fm = &file_main;
  clib_file_t template = { 0 };
  template.read_function = punt_socket_read_ready;
  template.file_descriptor = pm->socket_fd;
  template.description = format (0, "%s", socket_path);
  pm->clib_file_index = clib_file_add (fm, &template);

  pm->is_configured = true;

  return 0;
}

VLIB_CONFIG_FUNCTION (punt_config, "punt");
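
/*
 * The "punt" stanza registered above is read from startup.conf. A minimal
 * configuration enabling the punt socket might look like the following;
 * the path is a hypothetical example.
 *
 *   punt {
 *     socket /run/vpp/punt.sock
 *   }
 */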

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */