author    Damjan Marion <damarion@cisco.com>  2016-12-19 23:05:39 +0100
committer Damjan Marion <damarion@cisco.com>  2016-12-28 12:25:14 +0100
commit    7cd468a3d7dee7d6c92f69a0bb7061ae208ec727 (patch)
tree      5de62f8dbd3a752f5a676ca600e43d2652d1ff1a /src/vnet/devices
parent    696f1adec0df3b8f161862566dd9c86174302658 (diff)
Reorganize source tree to use single autotools instance
Change-Id: I7b51f88292e057c6443b12224486f2d0c9f8ae23
Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'src/vnet/devices')
-rw-r--r--  src/vnet/devices/af_packet/af_packet.api | 71
-rw-r--r--  src/vnet/devices/af_packet/af_packet.c | 366
-rw-r--r--  src/vnet/devices/af_packet/af_packet.h | 69
-rw-r--r--  src/vnet/devices/af_packet/af_packet_api.c | 143
-rw-r--r--  src/vnet/devices/af_packet/cli.c | 144
-rw-r--r--  src/vnet/devices/af_packet/device.c | 250
-rw-r--r--  src/vnet/devices/af_packet/node.c | 288
-rw-r--r--  src/vnet/devices/devices.c | 91
-rw-r--r--  src/vnet/devices/devices.h | 53
-rw-r--r--  src/vnet/devices/dpdk/cli.c | 1296
-rw-r--r--  src/vnet/devices/dpdk/device.c | 840
-rw-r--r--  src/vnet/devices/dpdk/dpdk.h | 534
-rw-r--r--  src/vnet/devices/dpdk/dpdk_priv.h | 132
-rw-r--r--  src/vnet/devices/dpdk/format.c | 763
-rw-r--r--  src/vnet/devices/dpdk/hqos.c | 775
-rwxr-xr-x  src/vnet/devices/dpdk/init.c | 1803
-rw-r--r--  src/vnet/devices/dpdk/ipsec/cli.c | 141
-rw-r--r--  src/vnet/devices/dpdk/ipsec/crypto_node.c | 210
-rw-r--r--  src/vnet/devices/dpdk/ipsec/dir.dox | 18
-rw-r--r--  src/vnet/devices/dpdk/ipsec/dpdk_crypto_ipsec_doc.md | 73
-rw-r--r--  src/vnet/devices/dpdk/ipsec/esp.h | 295
-rw-r--r--  src/vnet/devices/dpdk/ipsec/esp_decrypt.c | 583
-rw-r--r--  src/vnet/devices/dpdk/ipsec/esp_encrypt.c | 598
-rw-r--r--  src/vnet/devices/dpdk/ipsec/ipsec.c | 313
-rw-r--r--  src/vnet/devices/dpdk/ipsec/ipsec.h | 227
-rw-r--r--  src/vnet/devices/dpdk/node.c | 687
-rw-r--r--  src/vnet/devices/dpdk/qos_doc.md | 404
-rw-r--r--  src/vnet/devices/netmap/cli.c | 146
-rw-r--r--  src/vnet/devices/netmap/device.c | 261
-rw-r--r--  src/vnet/devices/netmap/net_netmap.h | 650
-rw-r--r--  src/vnet/devices/netmap/netmap.api | 74
-rw-r--r--  src/vnet/devices/netmap/netmap.c | 316
-rw-r--r--  src/vnet/devices/netmap/netmap.h | 164
-rw-r--r--  src/vnet/devices/netmap/netmap_api.c | 137
-rw-r--r--  src/vnet/devices/netmap/node.c | 300
-rw-r--r--  src/vnet/devices/nic/ixge.c | 2938
-rw-r--r--  src/vnet/devices/nic/ixge.h | 1293
-rw-r--r--  src/vnet/devices/nic/sfp.c | 117
-rw-r--r--  src/vnet/devices/nic/sfp.h | 117
-rw-r--r--  src/vnet/devices/ssvm/node.c | 343
-rw-r--r--  src/vnet/devices/ssvm/ssvm_eth.c | 491
-rw-r--r--  src/vnet/devices/ssvm/ssvm_eth.h | 141
-rw-r--r--  src/vnet/devices/virtio/dir.dox | 27
-rw-r--r--  src/vnet/devices/virtio/vhost-user.c | 3314
-rw-r--r--  src/vnet/devices/virtio/vhost-user.h | 350
-rw-r--r--  src/vnet/devices/virtio/vhost_user.api | 125
-rw-r--r--  src/vnet/devices/virtio/vhost_user_api.c | 262
47 files changed, 22733 insertions, 0 deletions
diff --git a/src/vnet/devices/af_packet/af_packet.api b/src/vnet/devices/af_packet/af_packet.api
new file mode 100644
index 00000000000..9fb2a2070f2
--- /dev/null
+++ b/src/vnet/devices/af_packet/af_packet.api
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2015-2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** \brief Create host-interface
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param host_if_name - interface name
+ @param hw_addr - interface MAC
+ @param use_random_hw_addr - use a randomly generated MAC
+*/
+define af_packet_create
+{
+ u32 client_index;
+ u32 context;
+
+ u8 host_if_name[64];
+ u8 hw_addr[6];
+ u8 use_random_hw_addr;
+};
+
+/** \brief Create host-interface response
+ @param context - sender context, to match reply w/ request
+ @param retval - return value for request
+*/
+define af_packet_create_reply
+{
+ u32 context;
+ i32 retval;
+ u32 sw_if_index;
+};
+
+/** \brief Delete host-interface
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param host_if_name - interface name
+*/
+define af_packet_delete
+{
+ u32 client_index;
+ u32 context;
+
+ u8 host_if_name[64];
+};
+
+/** \brief Delete host-interface response
+ @param context - sender context, to match reply w/ request
+ @param retval - return value for request
+*/
+define af_packet_delete_reply
+{
+ u32 context;
+ i32 retval;
+};
+
+/*
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
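For context, a minimal client-side sketch of driving this API follows. It is not part of the patch: vl_msg_api_alloc and the generated vl_api_af_packet_create_t type come from VPP's binary-API machinery, while my_client_index and send_to_vpp() are placeholders for whatever connection state and transport the client already has.

  /* Hypothetical sketch: build and send an af_packet_create request. */
  vl_api_af_packet_create_t *mp = vl_msg_api_alloc (sizeof (*mp));
  memset (mp, 0, sizeof (*mp));
  mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_AF_PACKET_CREATE);
  mp->client_index = my_client_index;   /* assigned when the client connects */
  mp->context = 42;                     /* echoed back in af_packet_create_reply */
  strncpy ((char *) mp->host_if_name, "eth1", sizeof (mp->host_if_name) - 1);
  mp->use_random_hw_addr = 1;           /* let VPP generate the MAC */
  send_to_vpp ((u8 *) mp);              /* placeholder for the actual send call */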
diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c
new file mode 100644
index 00000000000..91c3988b439
--- /dev/null
+++ b/src/vnet/devices/af_packet/af_packet.c
@@ -0,0 +1,366 @@
+/*
+ *------------------------------------------------------------------
+ * af_packet.c - linux kernel packet interface
+ *
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <vnet/devices/af_packet/af_packet.h>
+
+#define AF_PACKET_DEBUG_SOCKET 0
+
+#define AF_PACKET_TX_FRAMES_PER_BLOCK 1024
+#define AF_PACKET_TX_FRAME_SIZE (2048 * 5)
+#define AF_PACKET_TX_BLOCK_NR 1
+#define AF_PACKET_TX_FRAME_NR (AF_PACKET_TX_BLOCK_NR * \
+ AF_PACKET_TX_FRAMES_PER_BLOCK)
+#define AF_PACKET_TX_BLOCK_SIZE (AF_PACKET_TX_FRAME_SIZE * \
+ AF_PACKET_TX_FRAMES_PER_BLOCK)
+
+#define AF_PACKET_RX_FRAMES_PER_BLOCK 1024
+#define AF_PACKET_RX_FRAME_SIZE (2048 * 5)
+#define AF_PACKET_RX_BLOCK_NR 1
+#define AF_PACKET_RX_FRAME_NR (AF_PACKET_RX_BLOCK_NR * \
+ AF_PACKET_RX_FRAMES_PER_BLOCK)
+#define AF_PACKET_RX_BLOCK_SIZE (AF_PACKET_RX_FRAME_SIZE * \
+ AF_PACKET_RX_FRAMES_PER_BLOCK)
+
+#if AF_PACKET_DEBUG_SOCKET == 1
+#define DBG_SOCK(args...) clib_warning(args);
+#else
+#define DBG_SOCK(args...)
+#endif
+
+/*defined in net/if.h but clashes with dpdk headers */
+unsigned int if_nametoindex (const char *ifname);
+
+typedef struct tpacket_req tpacket_req_t;
+
+static u32
+af_packet_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi,
+ u32 flags)
+{
+ /* nothing for now */
+ return 0;
+}
+
+static clib_error_t *
+af_packet_fd_read_ready (unix_file_t * uf)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ af_packet_main_t *apm = &af_packet_main;
+ u32 idx = uf->private_data;
+
+ apm->pending_input_bitmap =
+ clib_bitmap_set (apm->pending_input_bitmap, idx, 1);
+
+ /* Schedule the rx node */
+ vlib_node_set_interrupt_pending (vm, af_packet_input_node.index);
+
+ return 0;
+}
+
+static int
+create_packet_v2_sock (u8 * name, tpacket_req_t * rx_req,
+ tpacket_req_t * tx_req, int *fd, u8 ** ring)
+{
+ int ret, err;
+ struct sockaddr_ll sll;
+ uint host_if_index;
+ int ver = TPACKET_V2;
+ socklen_t req_sz = sizeof (struct tpacket_req);
+ u32 ring_sz = rx_req->tp_block_size * rx_req->tp_block_nr +
+ tx_req->tp_block_size * tx_req->tp_block_nr;
+
+ host_if_index = if_nametoindex ((const char *) name);
+
+ if (!host_if_index)
+ {
+ DBG_SOCK ("Wrong host interface name");
+ ret = VNET_API_ERROR_INVALID_INTERFACE;
+ goto error;
+ }
+
+ if ((*fd = socket (AF_PACKET, SOCK_RAW, htons (ETH_P_ALL))) < 0)
+ {
+ DBG_SOCK ("Failed to create socket");
+ ret = VNET_API_ERROR_SYSCALL_ERROR_1;
+ goto error;
+ }
+
+ if ((err =
+ setsockopt (*fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof (ver))) < 0)
+ {
+ DBG_SOCK ("Failed to set rx packet interface version");
+ ret = VNET_API_ERROR_SYSCALL_ERROR_1;
+ goto error;
+ }
+
+ int opt = 1;
+ if ((err =
+ setsockopt (*fd, SOL_PACKET, PACKET_LOSS, &opt, sizeof (opt))) < 0)
+ {
+ DBG_SOCK ("Failed to set packet tx ring error handling option");
+ ret = VNET_API_ERROR_SYSCALL_ERROR_1;
+ goto error;
+ }
+
+ if ((err =
+ setsockopt (*fd, SOL_PACKET, PACKET_RX_RING, rx_req, req_sz)) < 0)
+ {
+ DBG_SOCK ("Failed to set packet rx ring options");
+ ret = VNET_API_ERROR_SYSCALL_ERROR_1;
+ goto error;
+ }
+
+ if ((err =
+ setsockopt (*fd, SOL_PACKET, PACKET_TX_RING, tx_req, req_sz)) < 0)
+ {
+ DBG_SOCK ("Failed to set packet rx ring options");
+ ret = VNET_API_ERROR_SYSCALL_ERROR_1;
+ goto error;
+ }
+
+ *ring =
+ mmap (NULL, ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, *fd,
+ 0);
+ if (*ring == MAP_FAILED)
+ {
+ DBG_SOCK ("mmap failure");
+ ret = VNET_API_ERROR_SYSCALL_ERROR_1;
+ goto error;
+ }
+
+ memset (&sll, 0, sizeof (sll));
+ sll.sll_family = PF_PACKET;
+ sll.sll_protocol = htons (ETH_P_ALL);
+ sll.sll_ifindex = host_if_index;
+
+ if ((err = bind (*fd, (struct sockaddr *) &sll, sizeof (sll))) < 0)
+ {
+ DBG_SOCK ("Failed to bind rx packet socket (error %d)", err);
+ ret = VNET_API_ERROR_SYSCALL_ERROR_1;
+ goto error;
+ }
+
+ return 0;
+error:
+ if (*fd >= 0)
+ close (*fd);
+ *fd = -1;
+ return ret;
+}
+
+int
+af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set,
+ u32 * sw_if_index)
+{
+ af_packet_main_t *apm = &af_packet_main;
+ int ret, fd = -1;
+ struct tpacket_req *rx_req = 0;
+ struct tpacket_req *tx_req = 0;
+ u8 *ring = 0;
+ af_packet_if_t *apif = 0;
+ u8 hw_addr[6];
+ clib_error_t *error;
+ vnet_sw_interface_t *sw;
+ vnet_main_t *vnm = vnet_get_main ();
+ uword *p;
+ uword if_index;
+ u8 *host_if_name_dup = vec_dup (host_if_name);
+
+ p = mhash_get (&apm->if_index_by_host_if_name, host_if_name);
+ if (p)
+ {
+ return VNET_API_ERROR_SUBIF_ALREADY_EXISTS;
+ }
+
+ vec_validate (rx_req, 0);
+ rx_req->tp_block_size = AF_PACKET_RX_BLOCK_SIZE;
+ rx_req->tp_frame_size = AF_PACKET_RX_FRAME_SIZE;
+ rx_req->tp_block_nr = AF_PACKET_RX_BLOCK_NR;
+ rx_req->tp_frame_nr = AF_PACKET_RX_FRAME_NR;
+
+ vec_validate (tx_req, 0);
+ tx_req->tp_block_size = AF_PACKET_TX_BLOCK_SIZE;
+ tx_req->tp_frame_size = AF_PACKET_TX_FRAME_SIZE;
+ tx_req->tp_block_nr = AF_PACKET_TX_BLOCK_NR;
+ tx_req->tp_frame_nr = AF_PACKET_TX_FRAME_NR;
+
+ ret = create_packet_v2_sock (host_if_name, rx_req, tx_req, &fd, &ring);
+
+ if (ret != 0)
+ goto error;
+
+ /* So far everything looks good, let's create interface */
+ pool_get (apm->interfaces, apif);
+ if_index = apif - apm->interfaces;
+
+ apif->fd = fd;
+ apif->rx_ring = ring;
+ apif->tx_ring = ring + rx_req->tp_block_size * rx_req->tp_block_nr;
+ apif->rx_req = rx_req;
+ apif->tx_req = tx_req;
+ apif->host_if_name = host_if_name_dup;
+ apif->per_interface_next_index = ~0;
+ apif->next_tx_frame = 0;
+ apif->next_rx_frame = 0;
+
+ {
+ unix_file_t template = { 0 };
+ template.read_function = af_packet_fd_read_ready;
+ template.file_descriptor = fd;
+ template.private_data = if_index;
+ template.flags = UNIX_FILE_EVENT_EDGE_TRIGGERED;
+ apif->unix_file_index = unix_file_add (&unix_main, &template);
+ }
+
+ /*use configured or generate random MAC address */
+ if (hw_addr_set)
+ clib_memcpy (hw_addr, hw_addr_set, 6);
+ else
+ {
+ f64 now = vlib_time_now (vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+
+ clib_memcpy (hw_addr + 2, &rnd, sizeof (rnd));
+ hw_addr[0] = 2;
+ hw_addr[1] = 0xfe;
+ }
+
+ error = ethernet_register_interface (vnm, af_packet_device_class.index,
+ if_index, hw_addr, &apif->hw_if_index,
+ af_packet_eth_flag_change);
+
+ if (error)
+ {
+ memset (apif, 0, sizeof (*apif));
+ pool_put (apm->interfaces, apif);
+ clib_error_report (error);
+ ret = VNET_API_ERROR_SYSCALL_ERROR_1;
+ goto error;
+ }
+
+ sw = vnet_get_hw_sw_interface (vnm, apif->hw_if_index);
+ apif->sw_if_index = sw->sw_if_index;
+
+ vnet_hw_interface_set_flags (vnm, apif->hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP);
+
+ mhash_set_mem (&apm->if_index_by_host_if_name, host_if_name_dup, &if_index,
+ 0);
+ if (sw_if_index)
+ *sw_if_index = apif->sw_if_index;
+ return 0;
+
+error:
+ vec_free (host_if_name_dup);
+ vec_free (rx_req);
+ vec_free (tx_req);
+ return ret;
+}
+
+int
+af_packet_delete_if (vlib_main_t * vm, u8 * host_if_name)
+{
+ vnet_main_t *vnm = vnet_get_main ();
+ af_packet_main_t *apm = &af_packet_main;
+ af_packet_if_t *apif;
+ uword *p;
+ uword if_index;
+ u32 ring_sz;
+
+ p = mhash_get (&apm->if_index_by_host_if_name, host_if_name);
+ if (p == NULL)
+ {
+ clib_warning ("Host interface %s does not exist", host_if_name);
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+ }
+ apif = pool_elt_at_index (apm->interfaces, p[0]);
+ if_index = apif - apm->interfaces;
+
+ /* bring down the interface */
+ vnet_hw_interface_set_flags (vnm, apif->hw_if_index, 0);
+
+ /* clean up */
+ if (apif->unix_file_index != ~0)
+ {
+ unix_file_del (&unix_main, unix_main.file_pool + apif->unix_file_index);
+ apif->unix_file_index = ~0;
+ }
+ else
+ close (apif->fd);
+
+ ring_sz = apif->rx_req->tp_block_size * apif->rx_req->tp_block_nr +
+ apif->tx_req->tp_block_size * apif->tx_req->tp_block_nr;
+ if (munmap (apif->rx_ring, ring_sz))
+ clib_warning ("Host interface %s could not free rx/tx ring",
+ host_if_name);
+ apif->rx_ring = NULL;
+ apif->tx_ring = NULL;
+ apif->fd = -1;
+
+ vec_free (apif->rx_req);
+ apif->rx_req = NULL;
+ vec_free (apif->tx_req);
+ apif->tx_req = NULL;
+
+ vec_free (apif->host_if_name);
+ apif->host_if_name = NULL;
+
+ mhash_unset (&apm->if_index_by_host_if_name, host_if_name, &if_index);
+
+ ethernet_delete_interface (vnm, apif->hw_if_index);
+
+ pool_put (apm->interfaces, apif);
+
+ return 0;
+}
+
+static clib_error_t *
+af_packet_init (vlib_main_t * vm)
+{
+ af_packet_main_t *apm = &af_packet_main;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+
+ memset (apm, 0, sizeof (af_packet_main_t));
+
+ mhash_init_vec_string (&apm->if_index_by_host_if_name, sizeof (uword));
+
+ vec_validate_aligned (apm->rx_buffers, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (af_packet_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/af_packet/af_packet.h b/src/vnet/devices/af_packet/af_packet.h
new file mode 100644
index 00000000000..19e2523d6c9
--- /dev/null
+++ b/src/vnet/devices/af_packet/af_packet.h
@@ -0,0 +1,69 @@
+/*
+ *------------------------------------------------------------------
+ * af_packet.h - linux kernel packet interface header file
+ *
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ u8 *host_if_name;
+ int fd;
+ struct tpacket_req *rx_req;
+ struct tpacket_req *tx_req;
+ u8 *rx_ring;
+ u8 *tx_ring;
+ u32 hw_if_index;
+ u32 sw_if_index;
+ u32 unix_file_index;
+
+ u32 next_rx_frame;
+ u32 next_tx_frame;
+
+ u32 per_interface_next_index;
+ u8 is_admin_up;
+} af_packet_if_t;
+
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ af_packet_if_t *interfaces;
+
+ /* bitmap of pending rx interfaces */
+ uword *pending_input_bitmap;
+
+ /* rx buffer cache */
+ u32 **rx_buffers;
+
+ /* hash of host interface names */
+ mhash_t if_index_by_host_if_name;
+} af_packet_main_t;
+
+af_packet_main_t af_packet_main;
+extern vnet_device_class_t af_packet_device_class;
+extern vlib_node_registration_t af_packet_input_node;
+
+int af_packet_create_if (vlib_main_t * vm, u8 * host_if_name,
+ u8 * hw_addr_set, u32 * sw_if_index);
+int af_packet_delete_if (vlib_main_t * vm, u8 * host_if_name);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
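As a usage note (not part of the patch), the two functions exported here can be called directly from init or CLI code. A minimal sketch, assuming vm is the usual vlib_main_t pointer and "eth1" is just an example host interface name:

  /* sketch only: create an af_packet interface bound to host "eth1",
   * letting the driver pick a random MAC, then report the result */
  u8 *name = format (0, "eth1");
  vec_add1 (name, 0);                 /* NUL-terminate, as the CLI/API handlers do */
  u32 sw_if_index;
  int rv = af_packet_create_if (vm, name, 0 /* hw_addr_set */, &sw_if_index);
  if (rv)
    clib_warning ("af_packet_create_if returned %d", rv);
  vec_free (name);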
diff --git a/src/vnet/devices/af_packet/af_packet_api.c b/src/vnet/devices/af_packet/af_packet_api.c
new file mode 100644
index 00000000000..414c838cdf7
--- /dev/null
+++ b/src/vnet/devices/af_packet/af_packet_api.c
@@ -0,0 +1,143 @@
+/*
+ *------------------------------------------------------------------
+ * af_packet_api.c - af-packet api
+ *
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vnet/vnet.h>
+#include <vlibmemory/api.h>
+
+#include <vnet/interface.h>
+#include <vnet/api_errno.h>
+#include <vnet/devices/af_packet/af_packet.h>
+
+#include <vnet/vnet_msg_enum.h>
+
+#define vl_typedefs /* define message structures */
+#include <vnet/vnet_all_api_h.h>
+#undef vl_typedefs
+
+#define vl_endianfun /* define message structures */
+#include <vnet/vnet_all_api_h.h>
+#undef vl_endianfun
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__)
+#define vl_printfun
+#include <vnet/vnet_all_api_h.h>
+#undef vl_printfun
+
+#include <vlibapi/api_helper_macros.h>
+
+#define foreach_vpe_api_msg \
+_(AF_PACKET_CREATE, af_packet_create) \
+_(AF_PACKET_DELETE, af_packet_delete)
+
+static void
+vl_api_af_packet_create_t_handler (vl_api_af_packet_create_t * mp)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ vl_api_af_packet_create_reply_t *rmp;
+ int rv = 0;
+ u8 *host_if_name = NULL;
+ u32 sw_if_index;
+
+ host_if_name = format (0, "%s", mp->host_if_name);
+ vec_add1 (host_if_name, 0);
+
+ rv = af_packet_create_if (vm, host_if_name,
+ mp->use_random_hw_addr ? 0 : mp->hw_addr,
+ &sw_if_index);
+
+ vec_free (host_if_name);
+
+ /* *INDENT-OFF* */
+ REPLY_MACRO2(VL_API_AF_PACKET_CREATE_REPLY,
+ ({
+ rmp->sw_if_index = clib_host_to_net_u32(sw_if_index);
+ }));
+ /* *INDENT-ON* */
+}
+
+static void
+vl_api_af_packet_delete_t_handler (vl_api_af_packet_delete_t * mp)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ vl_api_af_packet_delete_reply_t *rmp;
+ int rv = 0;
+ u8 *host_if_name = NULL;
+
+ host_if_name = format (0, "%s", mp->host_if_name);
+ vec_add1 (host_if_name, 0);
+
+ rv = af_packet_delete_if (vm, host_if_name);
+
+ vec_free (host_if_name);
+
+ REPLY_MACRO (VL_API_AF_PACKET_DELETE_REPLY);
+}
+
+/*
+ * af_packet_api_hookup
+ * Add vpe's API message handlers to the table.
+ * vlib has already mapped shared memory and
+ * added the client registration handlers.
+ * See .../vlib-api/vlibmemory/memclnt_vlib.c:memclnt_process()
+ */
+#define vl_msg_name_crc_list
+#include <vnet/vnet_all_api_h.h>
+#undef vl_msg_name_crc_list
+
+static void
+setup_message_id_table (api_main_t * am)
+{
+#define _(id,n,crc) vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id);
+ foreach_vl_msg_name_crc_af_packet;
+#undef _
+}
+
+static clib_error_t *
+af_packet_api_hookup (vlib_main_t * vm)
+{
+ api_main_t *am = &api_main;
+
+#define _(N,n) \
+ vl_msg_api_set_handlers(VL_API_##N, #n, \
+ vl_api_##n##_t_handler, \
+ vl_noop_handler, \
+ vl_api_##n##_t_endian, \
+ vl_api_##n##_t_print, \
+ sizeof(vl_api_##n##_t), 1);
+ foreach_vpe_api_msg;
+#undef _
+
+ /*
+ * Set up the (msg_name, crc, message-id) table
+ */
+ setup_message_id_table (am);
+
+ return 0;
+}
+
+VLIB_API_INIT_FUNCTION (af_packet_api_hookup);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/af_packet/cli.c b/src/vnet/devices/af_packet/cli.c
new file mode 100644
index 00000000000..2cbd415289e
--- /dev/null
+++ b/src/vnet/devices/af_packet/cli.c
@@ -0,0 +1,144 @@
+/*
+ *------------------------------------------------------------------
+ * af_packet.c - linux kernel packet interface
+ *
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <fcntl.h> /* for open */
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h> /* for iovec */
+#include <netinet/in.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <vnet/devices/af_packet/af_packet.h>
+
+static clib_error_t *
+af_packet_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ u8 *host_if_name = NULL;
+ u8 hwaddr[6];
+ u8 *hw_addr_ptr = 0;
+ u32 sw_if_index;
+ int r;
+
+ /* Get a line of input. */
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "name %s", &host_if_name))
+ ;
+ else
+ if (unformat
+ (line_input, "hw-addr %U", unformat_ethernet_address, hwaddr))
+ hw_addr_ptr = hwaddr;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ if (host_if_name == NULL)
+ return clib_error_return (0, "missing host interface name");
+
+ r = af_packet_create_if (vm, host_if_name, hw_addr_ptr, &sw_if_index);
+ vec_free (host_if_name);
+
+ if (r == VNET_API_ERROR_SYSCALL_ERROR_1)
+ return clib_error_return (0, "%s (errno %d)", strerror (errno), errno);
+
+ if (r == VNET_API_ERROR_INVALID_INTERFACE)
+ return clib_error_return (0, "Invalid interface name");
+
+ if (r == VNET_API_ERROR_SUBIF_ALREADY_EXISTS)
+ return clib_error_return (0, "Interface elready exists");
+
+ vlib_cli_output (vm, "%U\n", format_vnet_sw_if_index_name, vnet_get_main (),
+ sw_if_index);
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (af_packet_create_command, static) = {
+ .path = "create host-interface",
+ .short_help = "create host-interface name <interface name> [hw-addr <mac>]",
+ .function = af_packet_create_command_fn,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+af_packet_delete_command_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ u8 *host_if_name = NULL;
+
+ /* Get a line of input. */
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "name %s", &host_if_name))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ if (host_if_name == NULL)
+ return clib_error_return (0, "missing host interface name");
+
+ af_packet_delete_if (vm, host_if_name);
+ vec_free (host_if_name);
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (af_packet_delete_command, static) = {
+ .path = "delete host-interface",
+ .short_help = "delete host-interface name <interface name>",
+ .function = af_packet_delete_command_fn,
+};
+/* *INDENT-ON* */
+
+clib_error_t *
+af_packet_cli_init (vlib_main_t * vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (af_packet_cli_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/af_packet/device.c b/src/vnet/devices/af_packet/device.c
new file mode 100644
index 00000000000..1fb4000f6e6
--- /dev/null
+++ b/src/vnet/devices/af_packet/device.c
@@ -0,0 +1,250 @@
+/*
+ *------------------------------------------------------------------
+ * af_packet.c - linux kernel packet interface
+ *
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <linux/if_packet.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <vnet/devices/af_packet/af_packet.h>
+
+#define foreach_af_packet_tx_func_error \
+_(FRAME_NOT_READY, "tx frame not ready") \
+_(TXRING_EAGAIN, "tx sendto temporary failure") \
+_(TXRING_FATAL, "tx sendto fatal failure") \
+_(TXRING_OVERRUN, "tx ring overrun")
+
+typedef enum
+{
+#define _(f,s) AF_PACKET_TX_ERROR_##f,
+ foreach_af_packet_tx_func_error
+#undef _
+ AF_PACKET_TX_N_ERROR,
+} af_packet_tx_func_error_t;
+
+static char *af_packet_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_af_packet_tx_func_error
+#undef _
+};
+
+
+static u8 *
+format_af_packet_device_name (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+ af_packet_main_t *apm = &af_packet_main;
+ af_packet_if_t *apif = pool_elt_at_index (apm->interfaces, i);
+
+ s = format (s, "host-%s", apif->host_if_name);
+ return s;
+}
+
+static u8 *
+format_af_packet_device (u8 * s, va_list * args)
+{
+ s = format (s, "Linux PACKET socket interface");
+ return s;
+}
+
+static u8 *
+format_af_packet_tx_trace (u8 * s, va_list * args)
+{
+ s = format (s, "Unimplemented...");
+ return s;
+}
+
+static uword
+af_packet_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * frame)
+{
+ af_packet_main_t *apm = &af_packet_main;
+ u32 *buffers = vlib_frame_args (frame);
+ u32 n_left = frame->n_vectors;
+ u32 n_sent = 0;
+ vnet_interface_output_runtime_t *rd = (void *) node->runtime_data;
+ af_packet_if_t *apif =
+ pool_elt_at_index (apm->interfaces, rd->dev_instance);
+ int block = 0;
+ u32 block_size = apif->tx_req->tp_block_size;
+ u32 frame_size = apif->tx_req->tp_frame_size;
+ u32 frame_num = apif->tx_req->tp_frame_nr;
+ u8 *block_start = apif->tx_ring + block * block_size;
+ u32 tx_frame = apif->next_tx_frame;
+ struct tpacket2_hdr *tph;
+ u32 frame_not_ready = 0;
+
+ while (n_left > 0)
+ {
+ u32 len;
+ u32 offset = 0;
+ vlib_buffer_t *b0;
+ n_left--;
+ u32 bi = buffers[0];
+ buffers++;
+
+ tph = (struct tpacket2_hdr *) (block_start + tx_frame * frame_size);
+
+ if (PREDICT_FALSE
+ (tph->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)))
+ {
+ frame_not_ready++;
+ goto next;
+ }
+
+ do
+ {
+ b0 = vlib_get_buffer (vm, bi);
+ len = b0->current_length;
+ clib_memcpy ((u8 *) tph +
+ TPACKET_ALIGN (sizeof (struct tpacket2_hdr)) + offset,
+ vlib_buffer_get_current (b0), len);
+ offset += len;
+ }
+ while ((bi = b0->next_buffer));
+
+ tph->tp_len = tph->tp_snaplen = offset;
+ tph->tp_status = TP_STATUS_SEND_REQUEST;
+ n_sent++;
+ next:
+ /* check if we've exhausted the ring */
+ if (PREDICT_FALSE (frame_not_ready + n_sent == frame_num))
+ break;
+
+ tx_frame = (tx_frame + 1) % frame_num;
+ }
+
+ CLIB_MEMORY_BARRIER ();
+
+ if (PREDICT_TRUE (n_sent))
+ {
+ apif->next_tx_frame = tx_frame;
+
+ if (PREDICT_FALSE (sendto (apif->fd, NULL, 0,
+ MSG_DONTWAIT, NULL, 0) == -1))
+ {
+ /* Uh-oh, drop & move on, but count whether it was fatal or not.
+ * Note that we have no reliable way to properly determine the
+ * disposition of the packets we just enqueued for delivery.
+ */
+ vlib_error_count (vm, node->node_index,
+ unix_error_is_fatal (errno) ?
+ AF_PACKET_TX_ERROR_TXRING_FATAL :
+ AF_PACKET_TX_ERROR_TXRING_EAGAIN, n_sent);
+ }
+ }
+
+ if (PREDICT_FALSE (frame_not_ready))
+ vlib_error_count (vm, node->node_index,
+ AF_PACKET_TX_ERROR_FRAME_NOT_READY, frame_not_ready);
+
+ if (PREDICT_FALSE (frame_not_ready + n_sent == frame_num))
+ vlib_error_count (vm, node->node_index, AF_PACKET_TX_ERROR_TXRING_OVERRUN,
+ n_left);
+
+ vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
+ return frame->n_vectors;
+}
+
+static void
+af_packet_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
+ u32 node_index)
+{
+ af_packet_main_t *apm = &af_packet_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ af_packet_if_t *apif =
+ pool_elt_at_index (apm->interfaces, hw->dev_instance);
+
+ /* Shut off redirection */
+ if (node_index == ~0)
+ {
+ apif->per_interface_next_index = node_index;
+ return;
+ }
+
+ apif->per_interface_next_index =
+ vlib_node_add_next (vlib_get_main (), af_packet_input_node.index,
+ node_index);
+}
+
+static void
+af_packet_clear_hw_interface_counters (u32 instance)
+{
+ /* Nothing for now */
+}
+
+static clib_error_t *
+af_packet_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index,
+ u32 flags)
+{
+ af_packet_main_t *apm = &af_packet_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ af_packet_if_t *apif =
+ pool_elt_at_index (apm->interfaces, hw->dev_instance);
+ u32 hw_flags;
+
+ apif->is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+
+ if (apif->is_admin_up)
+ hw_flags = VNET_HW_INTERFACE_FLAG_LINK_UP;
+ else
+ hw_flags = 0;
+
+ vnet_hw_interface_set_flags (vnm, hw_if_index, hw_flags);
+
+ return 0;
+}
+
+static clib_error_t *
+af_packet_subif_add_del_function (vnet_main_t * vnm,
+ u32 hw_if_index,
+ struct vnet_sw_interface_t *st, int is_add)
+{
+ /* Nothing for now */
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VNET_DEVICE_CLASS (af_packet_device_class) = {
+ .name = "af-packet",
+ .tx_function = af_packet_interface_tx,
+ .format_device_name = format_af_packet_device_name,
+ .format_device = format_af_packet_device,
+ .format_tx_trace = format_af_packet_tx_trace,
+ .tx_function_n_errors = AF_PACKET_TX_N_ERROR,
+ .tx_function_error_strings = af_packet_tx_func_error_strings,
+ .rx_redirect_to_node = af_packet_set_interface_next_node,
+ .clear_counters = af_packet_clear_hw_interface_counters,
+ .admin_up_down_function = af_packet_interface_admin_up_down,
+ .subif_add_del_function = af_packet_subif_add_del_function,
+};
+
+VLIB_DEVICE_TX_FUNCTION_MULTIARCH (af_packet_device_class,
+ af_packet_interface_tx)
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c
new file mode 100644
index 00000000000..72004320c67
--- /dev/null
+++ b/src/vnet/devices/af_packet/node.c
@@ -0,0 +1,288 @@
+/*
+ *------------------------------------------------------------------
+ * af_packet.c - linux kernel packet interface
+ *
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <linux/if_packet.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/devices.h>
+#include <vnet/feature/feature.h>
+
+#include <vnet/devices/af_packet/af_packet.h>
+
+#define foreach_af_packet_input_error
+
+typedef enum
+{
+#define _(f,s) AF_PACKET_INPUT_ERROR_##f,
+ foreach_af_packet_input_error
+#undef _
+ AF_PACKET_INPUT_N_ERROR,
+} af_packet_input_error_t;
+
+static char *af_packet_input_error_strings[] = {
+#define _(n,s) s,
+ foreach_af_packet_input_error
+#undef _
+};
+
+typedef struct
+{
+ u32 next_index;
+ u32 hw_if_index;
+ int block;
+ struct tpacket2_hdr tph;
+} af_packet_input_trace_t;
+
+static u8 *
+format_af_packet_input_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ af_packet_input_trace_t *t = va_arg (*args, af_packet_input_trace_t *);
+ uword indent = format_get_indent (s);
+
+ s = format (s, "af_packet: hw_if_index %d next-index %d",
+ t->hw_if_index, t->next_index);
+
+ s =
+ format (s,
+ "\n%Utpacket2_hdr:\n%Ustatus 0x%x len %u snaplen %u mac %u net %u"
+ "\n%Usec 0x%x nsec 0x%x vlan %U"
+#ifdef TP_STATUS_VLAN_TPID_VALID
+ " vlan_tpid %u"
+#endif
+ ,
+ format_white_space, indent + 2,
+ format_white_space, indent + 4,
+ t->tph.tp_status,
+ t->tph.tp_len,
+ t->tph.tp_snaplen,
+ t->tph.tp_mac,
+ t->tph.tp_net,
+ format_white_space, indent + 4,
+ t->tph.tp_sec,
+ t->tph.tp_nsec, format_ethernet_vlan_tci, t->tph.tp_vlan_tci
+#ifdef TP_STATUS_VLAN_TPID_VALID
+ , t->tph.tp_vlan_tpid
+#endif
+ );
+ return s;
+}
+
+always_inline void
+buffer_add_to_chain (vlib_main_t * vm, u32 bi, u32 first_bi, u32 prev_bi)
+{
+ vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+ vlib_buffer_t *first_b = vlib_get_buffer (vm, first_bi);
+ vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_bi);
+
+ /* update first buffer */
+ first_b->total_length_not_including_first_buffer += b->current_length;
+
+ /* update previous buffer */
+ prev_b->next_buffer = bi;
+ prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+
+ /* update current buffer */
+ b->next_buffer = 0;
+}
+
+always_inline uword
+af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * frame, u32 device_idx)
+{
+ af_packet_main_t *apm = &af_packet_main;
+ af_packet_if_t *apif = pool_elt_at_index (apm->interfaces, device_idx);
+ struct tpacket2_hdr *tph;
+ u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+ u32 block = 0;
+ u32 rx_frame;
+ u32 n_free_bufs;
+ u32 n_rx_packets = 0;
+ u32 n_rx_bytes = 0;
+ u32 *to_next = 0;
+ u32 block_size = apif->rx_req->tp_block_size;
+ u32 frame_size = apif->rx_req->tp_frame_size;
+ u32 frame_num = apif->rx_req->tp_frame_nr;
+ u8 *block_start = apif->rx_ring + block * block_size;
+ uword n_trace = vlib_get_trace_count (vm, node);
+ u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm,
+ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+ u32 min_bufs = apif->rx_req->tp_frame_size / n_buffer_bytes;
+ int cpu_index = node->cpu_index;
+
+ if (apif->per_interface_next_index != ~0)
+ next_index = apif->per_interface_next_index;
+
+ n_free_bufs = vec_len (apm->rx_buffers[cpu_index]);
+ if (PREDICT_FALSE (n_free_bufs < VLIB_FRAME_SIZE))
+ {
+ vec_validate (apm->rx_buffers[cpu_index],
+ VLIB_FRAME_SIZE + n_free_bufs - 1);
+ n_free_bufs +=
+ vlib_buffer_alloc (vm, &apm->rx_buffers[cpu_index][n_free_bufs],
+ VLIB_FRAME_SIZE);
+ _vec_len (apm->rx_buffers[cpu_index]) = n_free_bufs;
+ }
+
+ rx_frame = apif->next_rx_frame;
+ tph = (struct tpacket2_hdr *) (block_start + rx_frame * frame_size);
+ while ((tph->tp_status & TP_STATUS_USER) && (n_free_bufs > min_bufs))
+ {
+ vlib_buffer_t *b0 = 0, *first_b0 = 0;
+ u32 next0 = next_index;
+
+ u32 n_left_to_next;
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ while ((tph->tp_status & TP_STATUS_USER) && (n_free_bufs > min_bufs) &&
+ n_left_to_next)
+ {
+ u32 data_len = tph->tp_snaplen;
+ u32 offset = 0;
+ u32 bi0 = 0, first_bi0 = 0, prev_bi0;
+
+ while (data_len)
+ {
+ /* grab free buffer */
+ u32 last_empty_buffer =
+ vec_len (apm->rx_buffers[cpu_index]) - 1;
+ prev_bi0 = bi0;
+ bi0 = apm->rx_buffers[cpu_index][last_empty_buffer];
+ b0 = vlib_get_buffer (vm, bi0);
+ _vec_len (apm->rx_buffers[cpu_index]) = last_empty_buffer;
+ n_free_bufs--;
+
+ /* copy data */
+ u32 bytes_to_copy =
+ data_len > n_buffer_bytes ? n_buffer_bytes : data_len;
+ b0->current_data = 0;
+ clib_memcpy (vlib_buffer_get_current (b0),
+ (u8 *) tph + tph->tp_mac + offset, bytes_to_copy);
+
+ /* fill buffer header */
+ b0->current_length = bytes_to_copy;
+
+ if (offset == 0)
+ {
+ b0->total_length_not_including_first_buffer = 0;
+ b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = apif->sw_if_index;
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ first_bi0 = bi0;
+ first_b0 = vlib_get_buffer (vm, first_bi0);
+ }
+ else
+ buffer_add_to_chain (vm, bi0, first_bi0, prev_bi0);
+
+ offset += bytes_to_copy;
+ data_len -= bytes_to_copy;
+ }
+ n_rx_packets++;
+ n_rx_bytes += tph->tp_snaplen;
+ to_next[0] = first_bi0;
+ to_next += 1;
+ n_left_to_next--;
+
+ /* trace */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (first_b0);
+ if (PREDICT_FALSE (n_trace > 0))
+ {
+ af_packet_input_trace_t *tr;
+ vlib_trace_buffer (vm, node, next0, first_b0, /* follow_chain */
+ 0);
+ vlib_set_trace_count (vm, node, --n_trace);
+ tr = vlib_add_trace (vm, node, first_b0, sizeof (*tr));
+ tr->next_index = next0;
+ tr->hw_if_index = apif->hw_if_index;
+ clib_memcpy (&tr->tph, tph, sizeof (struct tpacket2_hdr));
+ }
+
+ /* redirect if feature path enabled */
+ vnet_feature_start_device_input_x1 (apif->sw_if_index, &next0, b0,
+ 0);
+
+ /* enqueue and take next packet */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, first_bi0, next0);
+
+ /* next packet */
+ tph->tp_status = TP_STATUS_KERNEL;
+ rx_frame = (rx_frame + 1) % frame_num;
+ tph = (struct tpacket2_hdr *) (block_start + rx_frame * frame_size);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ apif->next_rx_frame = rx_frame;
+
+ vlib_increment_combined_counter
+ (vnet_get_main ()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ os_get_cpu_number (), apif->hw_if_index, n_rx_packets, n_rx_bytes);
+
+ return n_rx_packets;
+}
+
+static uword
+af_packet_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ int i;
+ u32 n_rx_packets = 0;
+
+ af_packet_main_t *apm = &af_packet_main;
+
+ /* *INDENT-OFF* */
+ clib_bitmap_foreach (i, apm->pending_input_bitmap,
+ ({
+ clib_bitmap_set (apm->pending_input_bitmap, i, 0);
+ n_rx_packets += af_packet_device_input_fn(vm, node, frame, i);
+ }));
+ /* *INDENT-ON* */
+
+ return n_rx_packets;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (af_packet_input_node) = {
+ .function = af_packet_input_fn,
+ .name = "af-packet-input",
+ .sibling_of = "device-input",
+ .format_trace = format_af_packet_input_trace,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .state = VLIB_NODE_STATE_INTERRUPT,
+ .n_errors = AF_PACKET_INPUT_N_ERROR,
+ .error_strings = af_packet_input_error_strings,
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (af_packet_input_node, af_packet_input_fn)
+/* *INDENT-ON* */
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/devices.c b/src/vnet/devices/devices.c
new file mode 100644
index 00000000000..cd4386ebdca
--- /dev/null
+++ b/src/vnet/devices/devices.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/devices/devices.h>
+#include <vnet/feature/feature.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+
+static uword
+device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (device_input_node) = {
+ .function = device_input_fn,
+ .name = "device-input",
+ .type = VLIB_NODE_TYPE_INPUT,
+ .state = VLIB_NODE_STATE_DISABLED,
+ .n_next_nodes = VNET_DEVICE_INPUT_N_NEXT_NODES,
+ .next_nodes = VNET_DEVICE_INPUT_NEXT_NODES,
+};
+
+/* Table defines how much we need to advance current data pointer
+ in the buffer if we shortcut to l3 nodes */
+
+const u32 __attribute__((aligned (CLIB_CACHE_LINE_BYTES)))
+device_input_next_node_advance[((VNET_DEVICE_INPUT_N_NEXT_NODES /
+ CLIB_CACHE_LINE_BYTES) +1) * CLIB_CACHE_LINE_BYTES] =
+{
+ [VNET_DEVICE_INPUT_NEXT_IP4_INPUT] = sizeof (ethernet_header_t),
+ [VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT] = sizeof (ethernet_header_t),
+ [VNET_DEVICE_INPUT_NEXT_IP6_INPUT] = sizeof (ethernet_header_t),
+ [VNET_DEVICE_INPUT_NEXT_MPLS_INPUT] = sizeof (ethernet_header_t),
+};
+
+VNET_FEATURE_ARC_INIT (device_input, static) =
+{
+ .arc_name = "device-input",
+ .start_nodes = VNET_FEATURES ("device-input"),
+ .end_node = "ethernet-input",
+ .arc_index_ptr = &feature_main.device_input_feature_arc_index,
+};
+
+VNET_FEATURE_INIT (l2_patch, static) = {
+ .arc_name = "device-input",
+ .node_name = "l2-patch",
+ .runs_before = VNET_FEATURES ("ethernet-input"),
+};
+
+VNET_FEATURE_INIT (worker_handoff, static) = {
+ .arc_name = "device-input",
+ .node_name = "worker-handoff",
+ .runs_before = VNET_FEATURES ("ethernet-input"),
+};
+
+VNET_FEATURE_INIT (span_input, static) = {
+ .arc_name = "device-input",
+ .node_name = "span-input",
+ .runs_before = VNET_FEATURES ("ethernet-input"),
+};
+
+VNET_FEATURE_INIT (ethernet_input, static) = {
+ .arc_name = "device-input",
+ .node_name = "ethernet-input",
+ .runs_before = 0, /* not before any other features */
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
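For illustration (not part of the patch), a plugin that wants to see packets on the device-input arc registers its own node in the same style as the registrations above; "my-filter" is a hypothetical graph node name:

  /* hypothetical sketch: hook a node onto the device-input arc so it
   * runs after the driver but before ethernet-input */
  VNET_FEATURE_INIT (my_filter, static) = {
    .arc_name = "device-input",
    .node_name = "my-filter",
    .runs_before = VNET_FEATURES ("ethernet-input"),
  };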
diff --git a/src/vnet/devices/devices.h b/src/vnet/devices/devices.h
new file mode 100644
index 00000000000..c46dab904c3
--- /dev/null
+++ b/src/vnet/devices/devices.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_vnet_vnet_device_h
+#define included_vnet_vnet_device_h
+
+#include <vnet/unix/pcap.h>
+#include <vnet/l3_types.h>
+
+typedef enum
+{
+ VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT,
+ VNET_DEVICE_INPUT_NEXT_IP4_INPUT,
+ VNET_DEVICE_INPUT_NEXT_IP6_INPUT,
+ VNET_DEVICE_INPUT_NEXT_MPLS_INPUT,
+ VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT,
+ VNET_DEVICE_INPUT_NEXT_DROP,
+ VNET_DEVICE_INPUT_N_NEXT_NODES,
+} vnet_device_input_next_t;
+
+#define VNET_DEVICE_INPUT_NEXT_NODES { \
+ [VNET_DEVICE_INPUT_NEXT_DROP] = "error-drop", \
+ [VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT] = "ethernet-input", \
+ [VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT] = "ip4-input-no-checksum", \
+ [VNET_DEVICE_INPUT_NEXT_IP4_INPUT] = "ip4-input", \
+ [VNET_DEVICE_INPUT_NEXT_IP6_INPUT] = "ip6-input", \
+ [VNET_DEVICE_INPUT_NEXT_MPLS_INPUT] = "mpls-input", \
+}
+
+extern vlib_node_registration_t device_input_node;
+extern const u32 device_input_next_node_advance[];
+
+#endif /* included_vnet_vnet_device_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
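To illustrate the advance table declared above (a sketch, not code from this patch): a driver that has already classified the packet and chosen an L3 next node advances the buffer past the ethernet header by the amount recorded for that next index.

  /* sketch: b0 and next0 come from the driver's own rx loop */
  u32 next0 = VNET_DEVICE_INPUT_NEXT_IP4_INPUT;
  vlib_buffer_advance (b0, device_input_next_node_advance[next0]);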
diff --git a/src/vnet/devices/dpdk/cli.c b/src/vnet/devices/dpdk/cli.c
new file mode 100644
index 00000000000..538a00fd975
--- /dev/null
+++ b/src/vnet/devices/dpdk/cli.c
@@ -0,0 +1,1296 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/xxhash.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/mpls/packet.h>
+
+#include "dpdk_priv.h"
+
+static clib_error_t *
+pcap_trace_command_fn (vlib_main_t * vm,
+ unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ u8 *filename;
+ u32 max;
+ int matched = 0;
+ clib_error_t *error = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "on"))
+ {
+ if (dm->tx_pcap_enable == 0)
+ {
+ if (dm->pcap_filename == 0)
+ dm->pcap_filename = format (0, "/tmp/vpe.pcap%c", 0);
+
+ memset (&dm->pcap_main, 0, sizeof (dm->pcap_main));
+ dm->pcap_main.file_name = (char *) dm->pcap_filename;
+ dm->pcap_main.n_packets_to_capture = 100;
+ if (dm->pcap_pkts_to_capture)
+ dm->pcap_main.n_packets_to_capture = dm->pcap_pkts_to_capture;
+
+ dm->pcap_main.packet_type = PCAP_PACKET_TYPE_ethernet;
+ dm->tx_pcap_enable = 1;
+ matched = 1;
+ vlib_cli_output (vm, "pcap tx capture on...");
+ }
+ else
+ {
+ vlib_cli_output (vm, "pcap tx capture already on...");
+ }
+ matched = 1;
+ }
+ else if (unformat (input, "off"))
+ {
+ if (dm->tx_pcap_enable)
+ {
+ vlib_cli_output (vm, "captured %d pkts...",
+ dm->pcap_main.n_packets_captured + 1);
+ if (dm->pcap_main.n_packets_captured)
+ {
+ dm->pcap_main.n_packets_to_capture =
+ dm->pcap_main.n_packets_captured;
+ error = pcap_write (&dm->pcap_main);
+ if (error)
+ clib_error_report (error);
+ else
+ vlib_cli_output (vm, "saved to %s...", dm->pcap_filename);
+ }
+ }
+ else
+ {
+ vlib_cli_output (vm, "pcap tx capture already off...");
+ }
+
+ dm->tx_pcap_enable = 0;
+ matched = 1;
+ }
+ else if (unformat (input, "max %d", &max))
+ {
+ dm->pcap_pkts_to_capture = max;
+ matched = 1;
+ }
+
+ else if (unformat (input, "intfc %U",
+ unformat_vnet_sw_interface, dm->vnet_main,
+ &dm->pcap_sw_if_index))
+ matched = 1;
+ else if (unformat (input, "intfc any"))
+ {
+ dm->pcap_sw_if_index = 0;
+ matched = 1;
+ }
+ else if (unformat (input, "file %s", &filename))
+ {
+ u8 *chroot_filename;
+ /* Brain-police user path input */
+ if (strstr ((char *) filename, "..")
+ || index ((char *) filename, '/'))
+ {
+ vlib_cli_output (vm, "illegal characters in filename '%s'",
+ filename);
+ continue;
+ }
+
+ chroot_filename = format (0, "/tmp/%s%c", filename, 0);
+ vec_free (filename);
+
+ if (dm->pcap_filename)
+ vec_free (dm->pcap_filename);
+ vec_add1 (filename, 0);
+ dm->pcap_filename = chroot_filename;
+ matched = 1;
+ }
+ else if (unformat (input, "status"))
+ {
+ if (dm->tx_pcap_enable == 0)
+ {
+ vlib_cli_output (vm, "pcap tx capture is off...");
+ continue;
+ }
+
+ vlib_cli_output (vm, "pcap tx capture: %d of %d pkts...",
+ dm->pcap_main.n_packets_captured,
+ dm->pcap_main.n_packets_to_capture);
+ matched = 1;
+ }
+
+ else
+ break;
+ }
+
+ if (matched == 0)
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (pcap_trace_command, static) = {
+ .path = "pcap tx trace",
+ .short_help =
+ "pcap tx trace on off max <nn> intfc <intfc> file <name> status",
+ .function = pcap_trace_command_fn,
+};
+/* *INDENT-ON* */
+
+
+static clib_error_t *
+show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ struct rte_mempool *rmp;
+ int i;
+
+ for (i = 0; i < vec_len (vm->buffer_main->pktmbuf_pools); i++)
+ {
+ rmp = vm->buffer_main->pktmbuf_pools[i];
+ if (rmp)
+ {
+ unsigned count = rte_mempool_avail_count (rmp);
+ unsigned free_count = rte_mempool_in_use_count (rmp);
+
+ vlib_cli_output (vm,
+ "name=\"%s\" available = %7d allocated = %7d total = %7d\n",
+ rmp->name, (u32) count, (u32) free_count,
+ (u32) (count + free_count));
+ }
+ else
+ {
+ vlib_cli_output (vm, "rte_mempool is NULL (!)\n");
+ }
+ }
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_dpdk_buffer,static) = {
+ .path = "show dpdk buffer",
+ .short_help = "show dpdk buffer state",
+ .function = show_dpdk_buffer,
+ .is_mp_safe = 1,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+test_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ static u32 *allocated_buffers;
+ u32 n_alloc = 0;
+ u32 n_free = 0;
+ u32 first, actual_alloc;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "allocate %d", &n_alloc))
+ ;
+ else if (unformat (input, "free %d", &n_free))
+ ;
+ else
+ break;
+ }
+
+ if (n_free)
+ {
+ if (vec_len (allocated_buffers) < n_free)
+ return clib_error_return (0, "Can't free %d, only %d allocated",
+ n_free, vec_len (allocated_buffers));
+
+ first = vec_len (allocated_buffers) - n_free;
+ vlib_buffer_free (vm, allocated_buffers + first, n_free);
+ _vec_len (allocated_buffers) = first;
+ }
+ if (n_alloc)
+ {
+ first = vec_len (allocated_buffers);
+ vec_validate (allocated_buffers,
+ vec_len (allocated_buffers) + n_alloc - 1);
+
+ actual_alloc = vlib_buffer_alloc (vm, allocated_buffers + first,
+ n_alloc);
+ _vec_len (allocated_buffers) = first + actual_alloc;
+
+ if (actual_alloc < n_alloc)
+ vlib_cli_output (vm, "WARNING: only allocated %d buffers",
+ actual_alloc);
+ }
+
+ vlib_cli_output (vm, "Currently %d buffers allocated",
+ vec_len (allocated_buffers));
+
+ if (allocated_buffers && vec_len (allocated_buffers) == 0)
+ vec_free (allocated_buffers);
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_test_dpdk_buffer,static) = {
+ .path = "test dpdk buffer",
+ .short_help = "test dpdk buffer [allocate <nn>][free <nn>]",
+ .function = test_dpdk_buffer,
+ .is_mp_safe = 1,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+set_dpdk_if_desc (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 nb_rx_desc = (u32) ~ 0;
+ u32 nb_tx_desc = (u32) ~ 0;
+ clib_error_t *rv;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "tx %d", &nb_tx_desc))
+ ;
+ else if (unformat (line_input, "rx %d", &nb_rx_desc))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (hw_if_index == (u32) ~ 0)
+ return clib_error_return (0, "please specify valid interface name");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0)
+ return clib_error_return (0, "number of descriptors can be set only for "
+ "physical devices");
+
+ if ((nb_rx_desc == (u32) ~ 0 || nb_rx_desc == xd->nb_rx_desc) &&
+ (nb_tx_desc == (u32) ~ 0 || nb_tx_desc == xd->nb_tx_desc))
+ return clib_error_return (0, "nothing changed");
+
+ if (nb_rx_desc != (u32) ~ 0)
+ xd->nb_rx_desc = nb_rx_desc;
+
+ if (nb_tx_desc != (u32) ~ 0)
+ xd->nb_tx_desc = nb_tx_desc;
+
+ rv = dpdk_port_setup (dm, xd);
+
+ return rv;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_desc,static) = {
+ .path = "set dpdk interface descriptors",
+ .short_help = "set dpdk interface descriptors <if-name> [rx <n>] [tx <n>]",
+ .function = set_dpdk_if_desc,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+show_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_and_queue_t *dq;
+ int cpu;
+
+ if (tm->n_vlib_mains == 1)
+ vlib_cli_output (vm, "All interfaces are handled by main thread");
+
+ for (cpu = 0; cpu < vec_len (dm->devices_by_cpu); cpu++)
+ {
+ if (vec_len (dm->devices_by_cpu[cpu]))
+ vlib_cli_output (vm, "Thread %u (%s at lcore %u):", cpu,
+ vlib_worker_threads[cpu].name,
+ vlib_worker_threads[cpu].lcore_id);
+
+ /* *INDENT-OFF* */
+ vec_foreach(dq, dm->devices_by_cpu[cpu])
+ {
+ u32 hw_if_index = dm->devices[dq->device].vlib_hw_if_index;
+ vnet_hw_interface_t * hi = vnet_get_hw_interface(dm->vnet_main, hw_if_index);
+ vlib_cli_output(vm, " %v queue %u", hi->name, dq->queue_id);
+ }
+ /* *INDENT-ON* */
+ }
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_dpdk_if_placement,static) = {
+ .path = "show dpdk interface placement",
+ .short_help = "show dpdk interface placement",
+ .function = show_dpdk_if_placement,
+};
+/* *INDENT-ON* */
+
+static int
+dpdk_device_queue_sort (void *a1, void *a2)
+{
+ dpdk_device_and_queue_t *dq1 = a1;
+ dpdk_device_and_queue_t *dq2 = a2;
+
+ if (dq1->device > dq2->device)
+ return 1;
+ else if (dq1->device < dq2->device)
+ return -1;
+ else if (dq1->queue_id > dq2->queue_id)
+ return 1;
+ else if (dq1->queue_id < dq2->queue_id)
+ return -1;
+ else
+ return 0;
+}
+
+static clib_error_t *
+set_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_and_queue_t *dq;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 queue = (u32) 0;
+ u32 cpu = (u32) ~ 0;
+ int i;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "queue %d", &queue))
+ ;
+ else if (unformat (line_input, "thread %d", &cpu))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (hw_if_index == (u32) ~ 0)
+ return clib_error_return (0, "please specify valid interface name");
+
+ if (cpu < dm->input_cpu_first_index ||
+ cpu >= (dm->input_cpu_first_index + dm->input_cpu_count))
+ return clib_error_return (0, "please specify valid thread id");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ for (i = 0; i < vec_len (dm->devices_by_cpu); i++)
+ {
+ /* *INDENT-OFF* */
+ vec_foreach(dq, dm->devices_by_cpu[i])
+ {
+ if (hw_if_index == dm->devices[dq->device].vlib_hw_if_index &&
+ queue == dq->queue_id)
+ {
+ if (cpu == i) /* nothing to do */
+ return 0;
+
+ vec_del1(dm->devices_by_cpu[i], dq - dm->devices_by_cpu[i]);
+ vec_add2(dm->devices_by_cpu[cpu], dq, 1);
+ dq->queue_id = queue;
+ dq->device = xd->device_index;
+ xd->cpu_socket_id_by_queue[queue] =
+ rte_lcore_to_socket_id(vlib_worker_threads[cpu].lcore_id);
+
+ vec_sort_with_function(dm->devices_by_cpu[i],
+ dpdk_device_queue_sort);
+
+ vec_sort_with_function(dm->devices_by_cpu[cpu],
+ dpdk_device_queue_sort);
+
+ if (vec_len(dm->devices_by_cpu[i]) == 0)
+ vlib_node_set_state (vlib_mains[i], dpdk_input_node.index,
+ VLIB_NODE_STATE_DISABLED);
+
+ if (vec_len(dm->devices_by_cpu[cpu]) == 1)
+ vlib_node_set_state (vlib_mains[cpu], dpdk_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+
+ return 0;
+ }
+ }
+ /* *INDENT-ON* */
+ }
+
+ return clib_error_return (0, "not found");
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_placement,static) = {
+ .path = "set dpdk interface placement",
+ .short_help = "set dpdk interface placement <if-name> [queue <n>] thread <n>",
+ .function = set_dpdk_if_placement,
+};
+/* *INDENT-ON* */
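+/* Illustrative usage (interface name is hypothetical); "thread" must fall
+ * inside the dpdk-input worker range validated above, and "queue" defaults
+ * to 0 when omitted:
+ *   set dpdk interface placement GigabitEthernet0/8/0 queue 0 thread 1
+ */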
+
+static clib_error_t *
+show_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_and_queue_t *dq;
+ int cpu;
+
+ if (tm->n_vlib_mains == 1)
+ vlib_cli_output (vm, "All interfaces are handled by main thread");
+
+ for (cpu = 0; cpu < vec_len (dm->devices_by_hqos_cpu); cpu++)
+ {
+ if (vec_len (dm->devices_by_hqos_cpu[cpu]))
+ vlib_cli_output (vm, "Thread %u (%s at lcore %u):", cpu,
+ vlib_worker_threads[cpu].name,
+ vlib_worker_threads[cpu].lcore_id);
+
+ vec_foreach (dq, dm->devices_by_hqos_cpu[cpu])
+ {
+ u32 hw_if_index = dm->devices[dq->device].vlib_hw_if_index;
+ vnet_hw_interface_t *hi =
+ vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ vlib_cli_output (vm, " %v queue %u", hi->name, dq->queue_id);
+ }
+ }
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_dpdk_if_hqos_placement, static) = {
+ .path = "show dpdk interface hqos placement",
+ .short_help = "show dpdk interface hqos placement",
+ .function = show_dpdk_if_hqos_placement,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+set_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_and_queue_t *dq;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 cpu = (u32) ~ 0;
+ int i;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "thread %d", &cpu))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (hw_if_index == (u32) ~ 0)
+ return clib_error_return (0, "please specify valid interface name");
+
+ if (cpu < dm->hqos_cpu_first_index ||
+ cpu >= (dm->hqos_cpu_first_index + dm->hqos_cpu_count))
+ return clib_error_return (0, "please specify valid thread id");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ for (i = 0; i < vec_len (dm->devices_by_hqos_cpu); i++)
+ {
+ vec_foreach (dq, dm->devices_by_hqos_cpu[i])
+ {
+ if (hw_if_index == dm->devices[dq->device].vlib_hw_if_index)
+ {
+ if (cpu == i) /* nothing to do */
+ return 0;
+
+ vec_del1 (dm->devices_by_hqos_cpu[i],
+ dq - dm->devices_by_hqos_cpu[i]);
+ vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+ dq->queue_id = 0;
+ dq->device = xd->device_index;
+
+ vec_sort_with_function (dm->devices_by_hqos_cpu[i],
+ dpdk_device_queue_sort);
+
+ vec_sort_with_function (dm->devices_by_hqos_cpu[cpu],
+ dpdk_device_queue_sort);
+
+ return 0;
+ }
+ }
+ }
+
+ return clib_error_return (0, "not found");
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_placement, static) = {
+ .path = "set dpdk interface hqos placement",
+ .short_help = "set dpdk interface hqos placement <if-name> thread <n>",
+ .function = set_dpdk_if_hqos_placement,
+};
+/* *INDENT-ON* */
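+/* Illustrative usage (interface name is hypothetical); the thread id must
+ * lie in the hqos thread range validated above:
+ *   set dpdk interface hqos placement GigabitEthernet0/8/0 thread 3
+ */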
+
+static clib_error_t *
+set_dpdk_if_hqos_pipe (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 subport_id = (u32) ~ 0;
+ u32 pipe_id = (u32) ~ 0;
+ u32 profile_id = (u32) ~ 0;
+ int rv;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "subport %d", &subport_id))
+ ;
+ else if (unformat (line_input, "pipe %d", &pipe_id))
+ ;
+ else if (unformat (line_input, "profile %d", &profile_id))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (hw_if_index == (u32) ~ 0)
+ return clib_error_return (0, "please specify valid interface name");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ rv =
+ rte_sched_pipe_config (xd->hqos_ht->hqos, subport_id, pipe_id,
+ profile_id);
+ if (rv)
+ return clib_error_return (0, "pipe configuration failed");
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_pipe, static) =
+{
+ .path = "set dpdk interface hqos pipe",
+ .short_help = "set dpdk interface hqos pipe <if-name> subport <n> pipe <n> "
+ "profile <n>",
+ .function = set_dpdk_if_hqos_pipe,
+};
+/* *INDENT-ON* */
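+/* Illustrative usage (interface name is hypothetical): attach pipe profile 0
+ * to pipe 2 of subport 0 via rte_sched_pipe_config():
+ *   set dpdk interface hqos pipe GigabitEthernet0/8/0 subport 0 pipe 2 profile 0
+ */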
+
+static clib_error_t *
+set_dpdk_if_hqos_subport (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 subport_id = (u32) ~ 0;
+ struct rte_sched_subport_params p = {
+ .tb_rate = 1250000000, /* 10GbE */
+ .tb_size = 1000000,
+ .tc_rate = {1250000000, 1250000000, 1250000000, 1250000000},
+ .tc_period = 10,
+ };
+ int rv;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "subport %d", &subport_id))
+ ;
+ else if (unformat (line_input, "rate %d", &p.tb_rate))
+ {
+ p.tc_rate[0] = p.tb_rate;
+ p.tc_rate[1] = p.tb_rate;
+ p.tc_rate[2] = p.tb_rate;
+ p.tc_rate[3] = p.tb_rate;
+ }
+ else if (unformat (line_input, "bktsize %d", &p.tb_size))
+ ;
+ else if (unformat (line_input, "tc0 %d", &p.tc_rate[0]))
+ ;
+ else if (unformat (line_input, "tc1 %d", &p.tc_rate[1]))
+ ;
+ else if (unformat (line_input, "tc2 %d", &p.tc_rate[2]))
+ ;
+ else if (unformat (line_input, "tc3 %d", &p.tc_rate[3]))
+ ;
+ else if (unformat (line_input, "period %d", &p.tc_period))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (hw_if_index == (u32) ~ 0)
+ return clib_error_return (0, "please specify valid interface name");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ rv = rte_sched_subport_config (xd->hqos_ht->hqos, subport_id, &p);
+ if (rv)
+ return clib_error_return (0, "subport configuration failed");
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_subport, static) = {
+ .path = "set dpdk interface hqos subport",
+ .short_help = "set dpdk interface hqos subport <if-name> subport <n> "
+ "[rate <n>] [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] "
+ "[period <n>]",
+ .function = set_dpdk_if_hqos_subport,
+};
+/* *INDENT-ON* */
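+/* Illustrative usage (interface name is hypothetical): the default token
+ * bucket rate above is 1250000000 bytes/sec (10GbE); "rate" overrides it and
+ * also seeds all four per-TC rates, which tc0..tc3 can then refine:
+ *   set dpdk interface hqos subport GigabitEthernet0/8/0 subport 0 rate 125000000
+ */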
+
+static clib_error_t *
+set_dpdk_if_hqos_tctbl (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 tc = (u32) ~ 0;
+ u32 queue = (u32) ~ 0;
+ u32 entry = (u32) ~ 0;
+ u32 val, i;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "entry %d", &entry))
+ ;
+ else if (unformat (line_input, "tc %d", &tc))
+ ;
+ else if (unformat (line_input, "queue %d", &queue))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (hw_if_index == (u32) ~ 0)
+ return clib_error_return (0, "please specify valid interface name");
+ if (entry >= 64)
+ return clib_error_return (0, "invalid entry");
+ if (tc >= RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE)
+ return clib_error_return (0, "invalid traffic class");
+ if (queue >= RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS)
+ return clib_error_return (0, "invalid traffic class");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ /* Detect the set of worker threads */
+ uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ /* Should never happen, shut up Coverity warning */
+ if (p == 0)
+ return clib_error_return (0, "no worker registrations?");
+
+ vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0];
+ int worker_thread_first = tr->first_index;
+ int worker_thread_count = tr->count;
+
+ val = tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue;
+ for (i = 0; i < worker_thread_count; i++)
+ xd->hqos_wt[worker_thread_first + i].hqos_tc_table[entry] = val;
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_tctbl, static) = {
+ .path = "set dpdk interface hqos tctbl",
+ .short_help = "set dpdk interface hqos tctbl <if-name> entry <n> tc <n> queue <n>",
+ .function = set_dpdk_if_hqos_tctbl,
+};
+/* *INDENT-ON* */
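+/* Illustrative usage (interface name is hypothetical): map translation table
+ * entry 5 to traffic class 1, queue 2 on every worker thread; the value
+ * stored is tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue, as computed above:
+ *   set dpdk interface hqos tctbl GigabitEthernet0/8/0 entry 5 tc 1 queue 2
+ */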
+
+static clib_error_t *
+set_dpdk_if_hqos_pktfield (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_main_t *dm = &dpdk_main;
+
+ /* Device specific data */
+ struct rte_eth_dev_info dev_info;
+ dpdk_device_config_t *devconf = 0;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ u32 hw_if_index = (u32) ~ 0;
+
+ /* Detect the set of worker threads */
+ uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ /* Should never happen, shut up Coverity warning */
+ if (p == 0)
+ return clib_error_return (0, "no worker registrations?");
+
+ vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0];
+ int worker_thread_first = tr->first_index;
+ int worker_thread_count = tr->count;
+
+ /* Packet field configuration */
+ u64 mask = (u64) ~ 0;
+ u32 id = (u32) ~ 0;
+ u32 offset = (u32) ~ 0;
+
+ /* HQoS params */
+ u32 n_subports_per_port, n_pipes_per_subport, tctbl_size;
+
+ u32 i;
+
+ /* Parse input arguments */
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "id %d", &id))
+ ;
+ else if (unformat (line_input, "offset %d", &offset))
+ ;
+ else if (unformat (line_input, "mask %llx", &mask))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ /* Get interface */
+ if (hw_if_index == (u32) ~ 0)
+ return clib_error_return (0, "please specify valid interface name");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ rte_eth_dev_info_get (xd->device_index, &dev_info);
+ if (dev_info.pci_dev)
+ { /* bonded interface has no pci info */
+ vlib_pci_addr_t pci_addr;
+
+ pci_addr.domain = dev_info.pci_dev->addr.domain;
+ pci_addr.bus = dev_info.pci_dev->addr.bus;
+ pci_addr.slot = dev_info.pci_dev->addr.devid;
+ pci_addr.function = dev_info.pci_dev->addr.function;
+
+ p =
+ hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+ }
+
+ if (p)
+ devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+ else
+ devconf = &dm->conf->default_devconf;
+
+ if (devconf->hqos_enabled == 0)
+ {
+ vlib_cli_output (vm, "HQoS disabled for this interface");
+ return 0;
+ }
+
+ n_subports_per_port = devconf->hqos.port.n_subports_per_port;
+ n_pipes_per_subport = devconf->hqos.port.n_pipes_per_subport;
+ tctbl_size = RTE_DIM (devconf->hqos.tc_table);
+
+ /* Validate packet field configuration: id, offset and mask */
+ if (id >= 3)
+ return clib_error_return (0, "invalid packet field id");
+
+ switch (id)
+ {
+ case 0:
+ if (dpdk_hqos_validate_mask (mask, n_subports_per_port) != 0)
+ return clib_error_return (0, "invalid subport ID mask "
+ "(n_subports_per_port = %u)",
+ n_subports_per_port);
+ break;
+ case 1:
+ if (dpdk_hqos_validate_mask (mask, n_pipes_per_subport) != 0)
+ return clib_error_return (0, "invalid pipe ID mask "
+ "(n_pipes_per_subport = %u)",
+ n_pipes_per_subport);
+ break;
+ case 2:
+ default:
+ if (dpdk_hqos_validate_mask (mask, tctbl_size) != 0)
+ return clib_error_return (0, "invalid TC table index mask "
+ "(TC table size = %u)", tctbl_size);
+ }
+
+ /* Propagate packet field configuration to all workers */
+ for (i = 0; i < worker_thread_count; i++)
+ switch (id)
+ {
+ case 0:
+ xd->hqos_wt[worker_thread_first + i].hqos_field0_slabpos = offset;
+ xd->hqos_wt[worker_thread_first + i].hqos_field0_slabmask = mask;
+ xd->hqos_wt[worker_thread_first + i].hqos_field0_slabshr =
+ __builtin_ctzll (mask);
+ break;
+ case 1:
+ xd->hqos_wt[worker_thread_first + i].hqos_field1_slabpos = offset;
+ xd->hqos_wt[worker_thread_first + i].hqos_field1_slabmask = mask;
+ xd->hqos_wt[worker_thread_first + i].hqos_field1_slabshr =
+ __builtin_ctzll (mask);
+ break;
+ case 2:
+ default:
+ xd->hqos_wt[worker_thread_first + i].hqos_field2_slabpos = offset;
+ xd->hqos_wt[worker_thread_first + i].hqos_field2_slabmask = mask;
+ xd->hqos_wt[worker_thread_first + i].hqos_field2_slabshr =
+ __builtin_ctzll (mask);
+ }
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_pktfield, static) = {
+ .path = "set dpdk interface hqos pktfield",
+ .short_help = "set dpdk interface hqos pktfield <if-name> id <n> offset <n> "
+ "mask <n>",
+ .function = set_dpdk_if_hqos_pktfield,
+};
+/* *INDENT-ON* */
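+/* Illustrative usage (interface name is hypothetical; offset/mask values are
+ * placeholders that depend on the packet encapsulation). The id selects what
+ * the extracted slab feeds: 0 = subport, 1 = pipe, 2 = TC table index.
+ *   set dpdk interface hqos pktfield GigabitEthernet0/8/0 id 2 offset 0 mask 0x3f
+ */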
+
+static clib_error_t *
+show_dpdk_if_hqos (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ dpdk_device_config_hqos_t *cfg;
+ dpdk_device_hqos_per_hqos_thread_t *ht;
+ dpdk_device_hqos_per_worker_thread_t *wk;
+ u32 *tctbl;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 profile_id, i;
+ struct rte_eth_dev_info dev_info;
+ dpdk_device_config_t *devconf = 0;
+ vlib_thread_registration_t *tr;
+ uword *p = 0;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (hw_if_index == (u32) ~ 0)
+ return clib_error_return (0, "please specify interface name!!");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ rte_eth_dev_info_get (xd->device_index, &dev_info);
+ if (dev_info.pci_dev)
+ { /* bonded interface has no pci info */
+ vlib_pci_addr_t pci_addr;
+
+ pci_addr.domain = dev_info.pci_dev->addr.domain;
+ pci_addr.bus = dev_info.pci_dev->addr.bus;
+ pci_addr.slot = dev_info.pci_dev->addr.devid;
+ pci_addr.function = dev_info.pci_dev->addr.function;
+
+ p =
+ hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+ }
+
+ if (p)
+ devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+ else
+ devconf = &dm->conf->default_devconf;
+
+ if (devconf->hqos_enabled == 0)
+ {
+ vlib_cli_output (vm, "HQoS disabled for this interface");
+ return 0;
+ }
+
+ /* Detect the set of worker threads */
+ p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+
+ /* Should never happen, shut up Coverity warning */
+ if (p == 0)
+ return clib_error_return (0, "no worker registrations?");
+
+ tr = (vlib_thread_registration_t *) p[0];
+
+ cfg = &devconf->hqos;
+ ht = xd->hqos_ht;
+ wk = &xd->hqos_wt[tr->first_index];
+ tctbl = wk->hqos_tc_table;
+
+ vlib_cli_output (vm, " Thread:");
+ vlib_cli_output (vm, " Input SWQ size = %u packets", cfg->swq_size);
+ vlib_cli_output (vm, " Enqueue burst size = %u packets",
+ ht->hqos_burst_enq);
+ vlib_cli_output (vm, " Dequeue burst size = %u packets",
+ ht->hqos_burst_deq);
+
+ vlib_cli_output (vm,
+ " Packet field 0: slab position = %4u, slab bitmask = 0x%016llx",
+ wk->hqos_field0_slabpos, wk->hqos_field0_slabmask);
+ vlib_cli_output (vm,
+ " Packet field 1: slab position = %4u, slab bitmask = 0x%016llx",
+ wk->hqos_field1_slabpos, wk->hqos_field1_slabmask);
+ vlib_cli_output (vm,
+ " Packet field 2: slab position = %4u, slab bitmask = 0x%016llx",
+ wk->hqos_field2_slabpos, wk->hqos_field2_slabmask);
+ vlib_cli_output (vm, " Packet field 2 translation table:");
+ vlib_cli_output (vm, " [ 0 .. 15]: "
+ "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u",
+ tctbl[0], tctbl[1], tctbl[2], tctbl[3],
+ tctbl[4], tctbl[5], tctbl[6], tctbl[7],
+ tctbl[8], tctbl[9], tctbl[10], tctbl[11],
+ tctbl[12], tctbl[13], tctbl[14], tctbl[15]);
+ vlib_cli_output (vm, " [16 .. 31]: "
+ "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u",
+ tctbl[16], tctbl[17], tctbl[18], tctbl[19],
+ tctbl[20], tctbl[21], tctbl[22], tctbl[23],
+ tctbl[24], tctbl[25], tctbl[26], tctbl[27],
+ tctbl[28], tctbl[29], tctbl[30], tctbl[31]);
+ vlib_cli_output (vm, " [32 .. 47]: "
+ "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u",
+ tctbl[32], tctbl[33], tctbl[34], tctbl[35],
+ tctbl[36], tctbl[37], tctbl[38], tctbl[39],
+ tctbl[40], tctbl[41], tctbl[42], tctbl[43],
+ tctbl[44], tctbl[45], tctbl[46], tctbl[47]);
+ vlib_cli_output (vm, " [48 .. 63]: "
+ "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u",
+ tctbl[48], tctbl[49], tctbl[50], tctbl[51],
+ tctbl[52], tctbl[53], tctbl[54], tctbl[55],
+ tctbl[56], tctbl[57], tctbl[58], tctbl[59],
+ tctbl[60], tctbl[61], tctbl[62], tctbl[63]);
+
+ vlib_cli_output (vm, " Port:");
+ vlib_cli_output (vm, " Rate = %u bytes/second", cfg->port.rate);
+ vlib_cli_output (vm, " MTU = %u bytes", cfg->port.mtu);
+ vlib_cli_output (vm, " Frame overhead = %u bytes",
+ cfg->port.frame_overhead);
+ vlib_cli_output (vm, " Number of subports = %u",
+ cfg->port.n_subports_per_port);
+ vlib_cli_output (vm, " Number of pipes per subport = %u",
+ cfg->port.n_pipes_per_subport);
+ vlib_cli_output (vm,
+ " Packet queue size: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u packets",
+ cfg->port.qsize[0], cfg->port.qsize[1], cfg->port.qsize[2],
+ cfg->port.qsize[3]);
+ vlib_cli_output (vm, " Number of pipe profiles = %u",
+ cfg->port.n_pipe_profiles);
+
+ for (profile_id = 0; profile_id < vec_len (cfg->pipe); profile_id++)
+ {
+ vlib_cli_output (vm, " Pipe profile %u:", profile_id);
+ vlib_cli_output (vm, " Rate = %u bytes/second",
+ cfg->pipe[profile_id].tb_rate);
+ vlib_cli_output (vm, " Token bucket size = %u bytes",
+ cfg->pipe[profile_id].tb_size);
+ vlib_cli_output (vm,
+ " Traffic class rate: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u bytes/second",
+ cfg->pipe[profile_id].tc_rate[0],
+ cfg->pipe[profile_id].tc_rate[1],
+ cfg->pipe[profile_id].tc_rate[2],
+ cfg->pipe[profile_id].tc_rate[3]);
+ vlib_cli_output (vm, " TC period = %u milliseconds",
+ cfg->pipe[profile_id].tc_period);
+#ifdef RTE_SCHED_SUBPORT_TC_OV
+ vlib_cli_output (vm, " TC3 oversubscription_weight = %u",
+ cfg->pipe[profile_id].tc_ov_weight);
+#endif
+
+ for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
+ {
+ vlib_cli_output (vm,
+ " TC%u WRR weights: Q0 = %u, Q1 = %u, Q2 = %u, Q3 = %u",
+ i, cfg->pipe[profile_id].wrr_weights[i * 4],
+ cfg->pipe[profile_id].wrr_weights[i * 4 + 1],
+ cfg->pipe[profile_id].wrr_weights[i * 4 + 2],
+ cfg->pipe[profile_id].wrr_weights[i * 4 + 3]);
+ }
+ }
+
+#ifdef RTE_SCHED_RED
+ vlib_cli_output (vm, " Weighted Random Early Detection (WRED):");
+ for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
+ {
+ vlib_cli_output (vm, " TC%u min: G = %u, Y = %u, R = %u", i,
+ cfg->port.red_params[i][e_RTE_METER_GREEN].min_th,
+ cfg->port.red_params[i][e_RTE_METER_YELLOW].min_th,
+ cfg->port.red_params[i][e_RTE_METER_RED].min_th);
+
+ vlib_cli_output (vm, " TC%u max: G = %u, Y = %u, R = %u", i,
+ cfg->port.red_params[i][e_RTE_METER_GREEN].max_th,
+ cfg->port.red_params[i][e_RTE_METER_YELLOW].max_th,
+ cfg->port.red_params[i][e_RTE_METER_RED].max_th);
+
+ vlib_cli_output (vm,
+ " TC%u inverted probability: G = %u, Y = %u, R = %u",
+ i, cfg->port.red_params[i][e_RTE_METER_GREEN].maxp_inv,
+ cfg->port.red_params[i][e_RTE_METER_YELLOW].maxp_inv,
+ cfg->port.red_params[i][e_RTE_METER_RED].maxp_inv);
+
+ vlib_cli_output (vm, " TC%u weight: R = %u, Y = %u, R = %u", i,
+ cfg->port.red_params[i][e_RTE_METER_GREEN].wq_log2,
+ cfg->port.red_params[i][e_RTE_METER_YELLOW].wq_log2,
+ cfg->port.red_params[i][e_RTE_METER_RED].wq_log2);
+ }
+#endif
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_dpdk_if_hqos, static) = {
+ .path = "show dpdk interface hqos",
+ .short_help = "show dpdk interface hqos <if-name>",
+ .function = show_dpdk_if_hqos,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+show_dpdk_hqos_queue_stats (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ dpdk_main_t *dm = &dpdk_main;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 subport = (u32) ~ 0;
+ u32 pipe = (u32) ~ 0;
+ u32 tc = (u32) ~ 0;
+ u32 tc_q = (u32) ~ 0;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ uword *p = 0;
+ struct rte_eth_dev_info dev_info;
+ dpdk_device_config_t *devconf = 0;
+ u32 qindex;
+ struct rte_sched_queue_stats stats;
+ u16 qlen;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+
+ else if (unformat (line_input, "subport %d", &subport))
+ ;
+
+ else if (unformat (line_input, "pipe %d", &pipe))
+ ;
+
+ else if (unformat (line_input, "tc %d", &tc))
+ ;
+
+ else if (unformat (line_input, "tc_q %d", &tc_q))
+ ;
+
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (hw_if_index == (u32) ~ 0)
+ return clib_error_return (0, "please specify interface name!!");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ rte_eth_dev_info_get (xd->device_index, &dev_info);
+ if (dev_info.pci_dev)
+ { /* bonded interface has no pci info */
+ vlib_pci_addr_t pci_addr;
+
+ pci_addr.domain = dev_info.pci_dev->addr.domain;
+ pci_addr.bus = dev_info.pci_dev->addr.bus;
+ pci_addr.slot = dev_info.pci_dev->addr.devid;
+ pci_addr.function = dev_info.pci_dev->addr.function;
+
+ p =
+ hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+ }
+
+ if (p)
+ devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+ else
+ devconf = &dm->conf->default_devconf;
+
+ if (devconf->hqos_enabled == 0)
+ {
+ vlib_cli_output (vm, "HQoS disabled for this interface");
+ return 0;
+ }
+
+ /*
+ * Figure out which queue to query. cf rte_sched_port_qindex. (Not sure why
+ * that method isn't made public by DPDK - how _should_ we get the queue ID?)
+ */
+ qindex = subport * devconf->hqos.port.n_pipes_per_subport + pipe;
+ qindex = qindex * RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE + tc;
+ qindex = qindex * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + tc_q;
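+  /*
+   * Worked example (illustrative, assuming n_pipes_per_subport = 4096 and
+   * the usual 4 traffic classes x 4 queues): subport 0, pipe 1, tc 2,
+   * tc_q 3 yields qindex = ((0 * 4096 + 1) * 4 + 2) * 4 + 3 = 27.
+   */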
+
+ if (rte_sched_queue_read_stats (xd->hqos_ht->hqos, qindex, &stats, &qlen) !=
+ 0)
+ return clib_error_return (0, "failed to read stats");
+
+ vlib_cli_output (vm, "%=24s%=16s", "Stats Parameter", "Value");
+ vlib_cli_output (vm, "%=24s%=16d", "Packets", stats.n_pkts);
+ vlib_cli_output (vm, "%=24s%=16d", "Packets dropped", stats.n_pkts_dropped);
+#ifdef RTE_SCHED_RED
+ vlib_cli_output (vm, "%=24s%=16d", "Packets dropped (RED)",
+ stats.n_pkts_red_dropped);
+#endif
+ vlib_cli_output (vm, "%=24s%=16d", "Bytes", stats.n_bytes);
+ vlib_cli_output (vm, "%=24s%=16d", "Bytes dropped", stats.n_bytes_dropped);
+
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_dpdk_hqos_queue_stats, static) = {
+ .path = "show dpdk hqos queue",
+ .short_help = "show dpdk hqos queue <if-name> subport <subport> pipe <pipe> tc <tc> tc_q <tc_q>",
+ .function = show_dpdk_hqos_queue_stats,
+};
+/* *INDENT-ON* */
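+/* Illustrative usage (interface name is hypothetical): read the counters of
+ * the queue located by the subport/pipe/tc/tc_q arithmetic shown above:
+ *   show dpdk hqos queue GigabitEthernet0/8/0 subport 0 pipe 1 tc 2 tc_q 3
+ */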
+
+clib_error_t *
+dpdk_cli_init (vlib_main_t * vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (dpdk_cli_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/device.c b/src/vnet/devices/dpdk/device.c
new file mode 100644
index 00000000000..b22fbf2e69e
--- /dev/null
+++ b/src/vnet/devices/dpdk/device.c
@@ -0,0 +1,840 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/format.h>
+#include <vlib/unix/cj.h>
+#include <assert.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include "dpdk_priv.h"
+#include <vppinfra/error.h>
+
+#define foreach_dpdk_tx_func_error \
+ _(BAD_RETVAL, "DPDK tx function returned an error") \
+ _(RING_FULL, "Tx packet drops (ring full)") \
+ _(PKT_DROP, "Tx packet drops (dpdk tx failure)") \
+ _(REPL_FAIL, "Tx packet drops (replication failure)")
+
+typedef enum
+{
+#define _(f,s) DPDK_TX_FUNC_ERROR_##f,
+ foreach_dpdk_tx_func_error
+#undef _
+ DPDK_TX_FUNC_N_ERROR,
+} dpdk_tx_func_error_t;
+
+static char *dpdk_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_dpdk_tx_func_error
+#undef _
+};
+
+clib_error_t *
+dpdk_set_mac_address (vnet_hw_interface_t * hi, char *address)
+{
+ int error;
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+
+ error = rte_eth_dev_default_mac_addr_set (xd->device_index,
+ (struct ether_addr *) address);
+
+ if (error)
+ {
+ return clib_error_return (0, "mac address set failed: %d", error);
+ }
+ else
+ {
+ return NULL;
+ }
+}
+
+clib_error_t *
+dpdk_set_mc_filter (vnet_hw_interface_t * hi,
+ struct ether_addr mc_addr_vec[], int naddr)
+{
+ int error;
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+
+ error = rte_eth_dev_set_mc_addr_list (xd->device_index, mc_addr_vec, naddr);
+
+ if (error)
+ {
+ return clib_error_return (0, "mc addr list failed: %d", error);
+ }
+ else
+ {
+ return NULL;
+ }
+}
+
+struct rte_mbuf *
+dpdk_replicate_packet_mb (vlib_buffer_t * b)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ vlib_buffer_main_t *bm = vm->buffer_main;
+ struct rte_mbuf **mbufs = 0, *s, *d;
+ u8 nb_segs;
+ unsigned socket_id = rte_socket_id ();
+ int i;
+
+ ASSERT (bm->pktmbuf_pools[socket_id]);
+ s = rte_mbuf_from_vlib_buffer (b);
+ nb_segs = s->nb_segs;
+ vec_validate (mbufs, nb_segs - 1);
+
+ if (rte_pktmbuf_alloc_bulk (bm->pktmbuf_pools[socket_id], mbufs, nb_segs))
+ {
+ vec_free (mbufs);
+ return 0;
+ }
+
+ d = mbufs[0];
+ d->nb_segs = s->nb_segs;
+ d->data_len = s->data_len;
+ d->pkt_len = s->pkt_len;
+ d->data_off = s->data_off;
+ clib_memcpy (d->buf_addr, s->buf_addr, RTE_PKTMBUF_HEADROOM + s->data_len);
+
+ for (i = 1; i < nb_segs; i++)
+ {
+ d->next = mbufs[i];
+ d = mbufs[i];
+ s = s->next;
+ d->data_len = s->data_len;
+ clib_memcpy (d->buf_addr, s->buf_addr,
+ RTE_PKTMBUF_HEADROOM + s->data_len);
+ }
+
+ d = mbufs[0];
+ vec_free (mbufs);
+ return d;
+}
+
+static void
+dpdk_tx_trace_buffer (dpdk_main_t * dm,
+ vlib_node_runtime_t * node,
+ dpdk_device_t * xd,
+ u16 queue_id, u32 buffer_index, vlib_buffer_t * buffer)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ dpdk_tx_dma_trace_t *t0;
+ struct rte_mbuf *mb;
+
+ mb = rte_mbuf_from_vlib_buffer (buffer);
+
+ t0 = vlib_add_trace (vm, node, buffer, sizeof (t0[0]));
+ t0->queue_index = queue_id;
+ t0->device_index = xd->device_index;
+ t0->buffer_index = buffer_index;
+ clib_memcpy (&t0->mb, mb, sizeof (t0->mb));
+ clib_memcpy (&t0->buffer, buffer,
+ sizeof (buffer[0]) - sizeof (buffer->pre_data));
+ clib_memcpy (t0->buffer.pre_data, buffer->data + buffer->current_data,
+ sizeof (t0->buffer.pre_data));
+}
+
+static_always_inline void
+dpdk_validate_rte_mbuf (vlib_main_t * vm, vlib_buffer_t * b,
+ int maybe_multiseg)
+{
+ struct rte_mbuf *mb, *first_mb, *last_mb;
+
+ /* buffer is coming from non-dpdk source so we need to init
+ rte_mbuf header */
+ if (PREDICT_FALSE ((b->flags & VNET_BUFFER_RTE_MBUF_VALID) == 0))
+ {
+ vlib_buffer_t *b2 = b;
+ last_mb = mb = rte_mbuf_from_vlib_buffer (b2);
+ rte_pktmbuf_reset (mb);
+ while (maybe_multiseg && (b2->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ b2 = vlib_get_buffer (vm, b2->next_buffer);
+ mb = rte_mbuf_from_vlib_buffer (b2);
+ last_mb->next = mb;
+ last_mb = mb;
+ rte_pktmbuf_reset (mb);
+ }
+ }
+
+ first_mb = mb = rte_mbuf_from_vlib_buffer (b);
+ first_mb->nb_segs = 1;
+ mb->data_len = b->current_length;
+ mb->pkt_len = maybe_multiseg ? vlib_buffer_length_in_chain (vm, b) :
+ b->current_length;
+ mb->data_off = VLIB_BUFFER_PRE_DATA_SIZE + b->current_data;
+
+ while (maybe_multiseg && (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ b = vlib_get_buffer (vm, b->next_buffer);
+ mb = rte_mbuf_from_vlib_buffer (b);
+ mb->data_len = b->current_length;
+ mb->pkt_len = b->current_length;
+ mb->data_off = VLIB_BUFFER_PRE_DATA_SIZE + b->current_data;
+ first_mb->nb_segs++;
+ }
+}
+
+/*
+ * This function calls DPDK's tx_burst function to transmit the packets
+ * on the tx_vector. It manages a lock per-device if the device does not
+ * support multiple queues. It returns the number of packets untransmitted
+ * on the tx_vector. If all packets are transmitted (the normal case), the
+ * function returns 0.
+ *
+ * The function assumes there is at least one packet on the tx_vector.
+ */
+static_always_inline
+ u32 tx_burst_vector_internal (vlib_main_t * vm,
+ dpdk_device_t * xd,
+ struct rte_mbuf **tx_vector)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ u32 n_packets;
+ u32 tx_head;
+ u32 tx_tail;
+ u32 n_retry;
+ int rv;
+ int queue_id;
+ tx_ring_hdr_t *ring;
+
+ ring = vec_header (tx_vector, sizeof (*ring));
+
+ n_packets = ring->tx_head - ring->tx_tail;
+
+ tx_head = ring->tx_head % xd->nb_tx_desc;
+
+ /*
+ * Ensure rte_eth_tx_burst is not called with 0 packets, which can lead to
+ * unpredictable results.
+ */
+ ASSERT (n_packets > 0);
+
+ /*
+ * Check for tx_vector overflow. If this fails it is a system configuration
+   * error. The ring should be sized large enough to handle the largest burst
+   * not flowed off by a traffic manager. A larger size also helps performance
+ * a bit because it decreases the probability of having to issue two tx_burst
+ * calls due to a ring wrap.
+ */
+ ASSERT (n_packets < xd->nb_tx_desc);
+ ASSERT (ring->tx_tail == 0);
+
+ n_retry = 16;
+ queue_id = vm->cpu_index;
+
+ do
+ {
+ /* start the burst at the tail */
+ tx_tail = ring->tx_tail % xd->nb_tx_desc;
+
+ /*
+ * This device only supports one TX queue,
+ * and we're running multi-threaded...
+ */
+ if (PREDICT_FALSE (xd->lockp != 0))
+ {
+ queue_id = queue_id % xd->tx_q_used;
+ while (__sync_lock_test_and_set (xd->lockp[queue_id], 1))
+ /* zzzz */
+ queue_id = (queue_id + 1) % xd->tx_q_used;
+ }
+
+ if (PREDICT_FALSE (xd->flags & DPDK_DEVICE_FLAG_HQOS)) /* HQoS ON */
+ {
+ /* no wrap, transmit in one burst */
+ dpdk_device_hqos_per_worker_thread_t *hqos =
+ &xd->hqos_wt[vm->cpu_index];
+
+ ASSERT (hqos->swq != NULL);
+
+ dpdk_hqos_metadata_set (hqos,
+ &tx_vector[tx_tail], tx_head - tx_tail);
+ rv = rte_ring_sp_enqueue_burst (hqos->swq,
+ (void **) &tx_vector[tx_tail],
+ (uint16_t) (tx_head - tx_tail));
+ }
+ else if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_PMD))
+ {
+ /* no wrap, transmit in one burst */
+ rv = rte_eth_tx_burst (xd->device_index,
+ (uint16_t) queue_id,
+ &tx_vector[tx_tail],
+ (uint16_t) (tx_head - tx_tail));
+ }
+ else
+ {
+ ASSERT (0);
+ rv = 0;
+ }
+
+ if (PREDICT_FALSE (xd->lockp != 0))
+ *xd->lockp[queue_id] = 0;
+
+ if (PREDICT_FALSE (rv < 0))
+ {
+ // emit non-fatal message, bump counter
+ vnet_main_t *vnm = dm->vnet_main;
+ vnet_interface_main_t *im = &vnm->interface_main;
+ u32 node_index;
+
+ node_index = vec_elt_at_index (im->hw_interfaces,
+ xd->vlib_hw_if_index)->tx_node_index;
+
+ vlib_error_count (vm, node_index, DPDK_TX_FUNC_ERROR_BAD_RETVAL, 1);
+ clib_warning ("rte_eth_tx_burst[%d]: error %d", xd->device_index,
+ rv);
+ return n_packets; // untransmitted packets
+ }
+ ring->tx_tail += (u16) rv;
+ n_packets -= (uint16_t) rv;
+ }
+ while (rv && n_packets && (n_retry > 0));
+
+ return n_packets;
+}
+
+static_always_inline void
+dpdk_prefetch_buffer_by_index (vlib_main_t * vm, u32 bi)
+{
+ vlib_buffer_t *b;
+ struct rte_mbuf *mb;
+ b = vlib_get_buffer (vm, bi);
+ mb = rte_mbuf_from_vlib_buffer (b);
+ CLIB_PREFETCH (mb, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (b, CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+static_always_inline void
+dpdk_buffer_recycle (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_buffer_t * b, u32 bi, struct rte_mbuf **mbp)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ u32 my_cpu = vm->cpu_index;
+ struct rte_mbuf *mb_new;
+
+  if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0))
+ return;
+
+ mb_new = dpdk_replicate_packet_mb (b);
+ if (PREDICT_FALSE (mb_new == 0))
+ {
+ vlib_error_count (vm, node->node_index,
+ DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
+ b->flags |= VLIB_BUFFER_REPL_FAIL;
+ }
+ else
+ *mbp = mb_new;
+
+ vec_add1 (dm->recycle[my_cpu], bi);
+}
+
+/*
+ * Transmits the packets on the frame to the interface associated with the
+ * node. It first copies packets on the frame to a tx_vector containing the
+ * rte_mbuf pointers. It then passes this vector to tx_burst_vector_internal
+ * which calls the dpdk tx_burst function.
+ */
+static uword
+dpdk_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_interface_output_runtime_t *rd = (void *) node->runtime_data;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, rd->dev_instance);
+ u32 n_packets = f->n_vectors;
+ u32 n_left;
+ u32 *from;
+ struct rte_mbuf **tx_vector;
+ u16 i;
+ u16 nb_tx_desc = xd->nb_tx_desc;
+ int queue_id;
+ u32 my_cpu;
+ u32 tx_pkts = 0;
+ tx_ring_hdr_t *ring;
+ u32 n_on_ring;
+
+ my_cpu = vm->cpu_index;
+
+ queue_id = my_cpu;
+
+ tx_vector = xd->tx_vectors[queue_id];
+ ring = vec_header (tx_vector, sizeof (*ring));
+
+ n_on_ring = ring->tx_head - ring->tx_tail;
+ from = vlib_frame_vector_args (f);
+
+ ASSERT (n_packets <= VLIB_FRAME_SIZE);
+
+ if (PREDICT_FALSE (n_on_ring + n_packets > nb_tx_desc))
+ {
+ /*
+ * Overflowing the ring should never happen.
+ * If it does then drop the whole frame.
+ */
+ vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_RING_FULL,
+ n_packets);
+
+ while (n_packets--)
+ {
+ u32 bi0 = from[n_packets];
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
+ struct rte_mbuf *mb0 = rte_mbuf_from_vlib_buffer (b0);
+ rte_pktmbuf_free (mb0);
+ }
+ return n_on_ring;
+ }
+
+ if (PREDICT_FALSE (dm->tx_pcap_enable))
+ {
+ n_left = n_packets;
+ while (n_left > 0)
+ {
+ u32 bi0 = from[0];
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
+ if (dm->pcap_sw_if_index == 0 ||
+ dm->pcap_sw_if_index == vnet_buffer (b0)->sw_if_index[VLIB_TX])
+ pcap_add_buffer (&dm->pcap_main, vm, bi0, 512);
+ from++;
+ n_left--;
+ }
+ }
+
+ from = vlib_frame_vector_args (f);
+ n_left = n_packets;
+ i = ring->tx_head % nb_tx_desc;
+
+ while (n_left >= 8)
+ {
+ u32 bi0, bi1, bi2, bi3;
+ struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+ vlib_buffer_t *b0, *b1, *b2, *b3;
+ u32 or_flags;
+
+ dpdk_prefetch_buffer_by_index (vm, from[4]);
+ dpdk_prefetch_buffer_by_index (vm, from[5]);
+ dpdk_prefetch_buffer_by_index (vm, from[6]);
+ dpdk_prefetch_buffer_by_index (vm, from[7]);
+
+ bi0 = from[0];
+ bi1 = from[1];
+ bi2 = from[2];
+ bi3 = from[3];
+ from += 4;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+ b2 = vlib_get_buffer (vm, bi2);
+ b3 = vlib_get_buffer (vm, bi3);
+
+ or_flags = b0->flags | b1->flags | b2->flags | b3->flags;
+
+ if (or_flags & VLIB_BUFFER_NEXT_PRESENT)
+ {
+ dpdk_validate_rte_mbuf (vm, b0, 1);
+ dpdk_validate_rte_mbuf (vm, b1, 1);
+ dpdk_validate_rte_mbuf (vm, b2, 1);
+ dpdk_validate_rte_mbuf (vm, b3, 1);
+ }
+ else
+ {
+ dpdk_validate_rte_mbuf (vm, b0, 0);
+ dpdk_validate_rte_mbuf (vm, b1, 0);
+ dpdk_validate_rte_mbuf (vm, b2, 0);
+ dpdk_validate_rte_mbuf (vm, b3, 0);
+ }
+
+ mb0 = rte_mbuf_from_vlib_buffer (b0);
+ mb1 = rte_mbuf_from_vlib_buffer (b1);
+ mb2 = rte_mbuf_from_vlib_buffer (b2);
+ mb3 = rte_mbuf_from_vlib_buffer (b3);
+
+ if (PREDICT_FALSE (or_flags & VLIB_BUFFER_RECYCLE))
+ {
+ dpdk_buffer_recycle (vm, node, b0, bi0, &mb0);
+ dpdk_buffer_recycle (vm, node, b1, bi1, &mb1);
+ dpdk_buffer_recycle (vm, node, b2, bi2, &mb2);
+ dpdk_buffer_recycle (vm, node, b3, bi3, &mb3);
+
+	  /* don't enqueue packets if replication failed, as they must
+	     be sent back for recycling */
+ if (PREDICT_TRUE ((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ tx_vector[i++ % nb_tx_desc] = mb0;
+ if (PREDICT_TRUE ((b1->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ tx_vector[i++ % nb_tx_desc] = mb1;
+ if (PREDICT_TRUE ((b2->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ tx_vector[i++ % nb_tx_desc] = mb2;
+ if (PREDICT_TRUE ((b3->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ tx_vector[i++ % nb_tx_desc] = mb3;
+ }
+ else
+ {
+ if (PREDICT_FALSE (i + 3 >= nb_tx_desc))
+ {
+ tx_vector[i++ % nb_tx_desc] = mb0;
+ tx_vector[i++ % nb_tx_desc] = mb1;
+ tx_vector[i++ % nb_tx_desc] = mb2;
+ tx_vector[i++ % nb_tx_desc] = mb3;
+ i %= nb_tx_desc;
+ }
+ else
+ {
+ tx_vector[i++] = mb0;
+ tx_vector[i++] = mb1;
+ tx_vector[i++] = mb2;
+ tx_vector[i++] = mb3;
+ }
+ }
+
+
+ if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi1, b1);
+ if (b2->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi2, b2);
+ if (b3->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi3, b3);
+ }
+
+ n_left -= 4;
+ }
+ while (n_left > 0)
+ {
+ u32 bi0;
+ struct rte_mbuf *mb0;
+ vlib_buffer_t *b0;
+
+ bi0 = from[0];
+ from++;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ dpdk_validate_rte_mbuf (vm, b0, 1);
+
+ mb0 = rte_mbuf_from_vlib_buffer (b0);
+ dpdk_buffer_recycle (vm, node, b0, bi0, &mb0);
+
+ if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
+
+ if (PREDICT_TRUE ((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ {
+ tx_vector[i % nb_tx_desc] = mb0;
+ i++;
+ }
+ n_left--;
+ }
+
+ /* account for additional packets in the ring */
+ ring->tx_head += n_packets;
+ n_on_ring = ring->tx_head - ring->tx_tail;
+
+ /* transmit as many packets as possible */
+ n_packets = tx_burst_vector_internal (vm, xd, tx_vector);
+
+ /*
+ * tx_pkts is the number of packets successfully transmitted
+ * This is the number originally on ring minus the number remaining on ring
+ */
+ tx_pkts = n_on_ring - n_packets;
+
+ {
+ /* If there is no callback then drop any non-transmitted packets */
+ if (PREDICT_FALSE (n_packets))
+ {
+ vlib_simple_counter_main_t *cm;
+ vnet_main_t *vnm = vnet_get_main ();
+
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_TX_ERROR);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+ n_packets);
+
+ vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_PKT_DROP,
+ n_packets);
+
+ while (n_packets--)
+ rte_pktmbuf_free (tx_vector[ring->tx_tail + n_packets]);
+ }
+
+ /* Reset head/tail to avoid unnecessary wrap */
+ ring->tx_head = 0;
+ ring->tx_tail = 0;
+ }
+
+ /* Recycle replicated buffers */
+ if (PREDICT_FALSE (vec_len (dm->recycle[my_cpu])))
+ {
+ vlib_buffer_free (vm, dm->recycle[my_cpu],
+ vec_len (dm->recycle[my_cpu]));
+ _vec_len (dm->recycle[my_cpu]) = 0;
+ }
+
+ ASSERT (ring->tx_head >= ring->tx_tail);
+
+ return tx_pkts;
+}
+
+static void
+dpdk_clear_hw_interface_counters (u32 instance)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, instance);
+
+ /*
+ * Set the "last_cleared_stats" to the current stats, so that
+ * things appear to clear from a display perspective.
+ */
+ dpdk_update_counters (xd, vlib_time_now (dm->vlib_main));
+
+ clib_memcpy (&xd->last_cleared_stats, &xd->stats, sizeof (xd->stats));
+ clib_memcpy (xd->last_cleared_xstats, xd->xstats,
+ vec_len (xd->last_cleared_xstats) *
+ sizeof (xd->last_cleared_xstats[0]));
+
+}
+
+static clib_error_t *
+dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index);
+ uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, hif->dev_instance);
+ int rv = 0;
+
+ if (is_up)
+ {
+ f64 now = vlib_time_now (dm->vlib_main);
+
+ if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0)
+ rv = rte_eth_dev_start (xd->device_index);
+
+ if (xd->flags & DPDK_DEVICE_FLAG_PROMISC)
+ rte_eth_promiscuous_enable (xd->device_index);
+ else
+ rte_eth_promiscuous_disable (xd->device_index);
+
+ rte_eth_allmulticast_enable (xd->device_index);
+ xd->flags |= DPDK_DEVICE_FLAG_ADMIN_UP;
+ dpdk_update_counters (xd, now);
+ dpdk_update_link_state (xd, now);
+ }
+ else
+ {
+ xd->flags &= ~DPDK_DEVICE_FLAG_ADMIN_UP;
+
+ rte_eth_allmulticast_disable (xd->device_index);
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
+ rte_eth_dev_stop (xd->device_index);
+
+ /* For bonded interface, stop slave links */
+ if (xd->pmd == VNET_DPDK_PMD_BOND)
+ {
+ u8 slink[16];
+ int nlink = rte_eth_bond_slaves_get (xd->device_index, slink, 16);
+ while (nlink >= 1)
+ {
+ u8 dpdk_port = slink[--nlink];
+ rte_eth_dev_stop (dpdk_port);
+ }
+ }
+ }
+
+ if (rv < 0)
+ clib_warning ("rte_eth_dev_%s error: %d", is_up ? "start" : "stop", rv);
+
+ return /* no error */ 0;
+}
+
+/*
+ * Dynamically redirect all pkts from a specific interface
+ * to the specified node
+ */
+static void
+dpdk_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
+ u32 node_index)
+{
+ dpdk_main_t *xm = &dpdk_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ dpdk_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance);
+
+ /* Shut off redirection */
+ if (node_index == ~0)
+ {
+ xd->per_interface_next_index = node_index;
+ return;
+ }
+
+ xd->per_interface_next_index =
+ vlib_node_add_next (xm->vlib_main, dpdk_input_node.index, node_index);
+}
+
+
+static clib_error_t *
+dpdk_subif_add_del_function (vnet_main_t * vnm,
+ u32 hw_if_index,
+ struct vnet_sw_interface_t *st, int is_add)
+{
+ dpdk_main_t *xm = &dpdk_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ dpdk_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance);
+ vnet_sw_interface_t *t = (vnet_sw_interface_t *) st;
+ int r, vlan_offload;
+ u32 prev_subifs = xd->num_subifs;
+ clib_error_t *err = 0;
+
+ if (is_add)
+ xd->num_subifs++;
+ else if (xd->num_subifs)
+ xd->num_subifs--;
+
+ if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0)
+ goto done;
+
+ /* currently we program VLANS only for IXGBE VF and I40E VF */
+ if ((xd->pmd != VNET_DPDK_PMD_IXGBEVF) && (xd->pmd != VNET_DPDK_PMD_I40EVF))
+ goto done;
+
+ if (t->sub.eth.flags.no_tags == 1)
+ goto done;
+
+ if ((t->sub.eth.flags.one_tag != 1) || (t->sub.eth.flags.exact_match != 1))
+ {
+ xd->num_subifs = prev_subifs;
+ err = clib_error_return (0, "unsupported VLAN setup");
+ goto done;
+ }
+
+ vlan_offload = rte_eth_dev_get_vlan_offload (xd->device_index);
+ vlan_offload |= ETH_VLAN_FILTER_OFFLOAD;
+
+ if ((r = rte_eth_dev_set_vlan_offload (xd->device_index, vlan_offload)))
+ {
+ xd->num_subifs = prev_subifs;
+ err = clib_error_return (0, "rte_eth_dev_set_vlan_offload[%d]: err %d",
+ xd->device_index, r);
+ goto done;
+ }
+
+
+ if ((r =
+ rte_eth_dev_vlan_filter (xd->device_index, t->sub.eth.outer_vlan_id,
+ is_add)))
+ {
+ xd->num_subifs = prev_subifs;
+ err = clib_error_return (0, "rte_eth_dev_vlan_filter[%d]: err %d",
+ xd->device_index, r);
+ goto done;
+ }
+
+done:
+ if (xd->num_subifs)
+ xd->flags |= DPDK_DEVICE_FLAG_HAVE_SUBIF;
+ else
+ xd->flags &= ~DPDK_DEVICE_FLAG_HAVE_SUBIF;
+
+ return err;
+}
+
+/* *INDENT-OFF* */
+VNET_DEVICE_CLASS (dpdk_device_class) = {
+ .name = "dpdk",
+ .tx_function = dpdk_interface_tx,
+ .tx_function_n_errors = DPDK_TX_FUNC_N_ERROR,
+ .tx_function_error_strings = dpdk_tx_func_error_strings,
+ .format_device_name = format_dpdk_device_name,
+ .format_device = format_dpdk_device,
+ .format_tx_trace = format_dpdk_tx_dma_trace,
+ .clear_counters = dpdk_clear_hw_interface_counters,
+ .admin_up_down_function = dpdk_interface_admin_up_down,
+ .subif_add_del_function = dpdk_subif_add_del_function,
+ .rx_redirect_to_node = dpdk_set_interface_next_node,
+ .mac_addr_change_function = dpdk_set_mac_address,
+};
+
+VLIB_DEVICE_TX_FUNCTION_MULTIARCH (dpdk_device_class, dpdk_interface_tx)
+/* *INDENT-ON* */
+
+#define UP_DOWN_FLAG_EVENT 1
+
+uword
+admin_up_down_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+ clib_error_t *error = 0;
+ uword event_type;
+ uword *event_data = 0;
+ u32 sw_if_index;
+ u32 flags;
+
+ while (1)
+ {
+ vlib_process_wait_for_event (vm);
+
+ event_type = vlib_process_get_events (vm, &event_data);
+
+ dpdk_main.admin_up_down_in_progress = 1;
+
+ switch (event_type)
+ {
+ case UP_DOWN_FLAG_EVENT:
+ {
+ if (vec_len (event_data) == 2)
+ {
+ sw_if_index = event_data[0];
+ flags = event_data[1];
+ error =
+ vnet_sw_interface_set_flags (vnet_get_main (), sw_if_index,
+ flags);
+ clib_error_report (error);
+ }
+ }
+ break;
+ }
+
+ vec_reset_length (event_data);
+
+ dpdk_main.admin_up_down_in_progress = 0;
+
+ }
+ return 0; /* or not */
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (admin_up_down_process_node,static) = {
+ .function = admin_up_down_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "admin-up-down-process",
+  .process_log2_n_stack_bytes = 17, // 2^17 bytes = 128KB
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/dpdk.h b/src/vnet/devices/dpdk/dpdk.h
new file mode 100644
index 00000000000..d8f378d2b54
--- /dev/null
+++ b/src/vnet/devices/dpdk/dpdk.h
@@ -0,0 +1,534 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_dpdk_h__
+#define __included_dpdk_h__
+
+/* $$$$ We should rename always_inline -> clib_always_inline */
+#undef always_inline
+
+#include <rte_config.h>
+
+#include <rte_common.h>
+#include <rte_dev.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_tailq.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_atomic.h>
+#include <rte_cycles.h>
+#include <rte_prefetch.h>
+#include <rte_lcore.h>
+#include <rte_per_lcore.h>
+#include <rte_branch_prediction.h>
+#include <rte_interrupts.h>
+#include <rte_pci.h>
+#include <rte_random.h>
+#include <rte_debug.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_virtio_net.h>
+#include <rte_version.h>
+#include <rte_eth_bond.h>
+#include <rte_sched.h>
+
+#include <vnet/unix/pcap.h>
+#include <vnet/devices/devices.h>
+
+#if CLIB_DEBUG > 0
+#define always_inline static inline
+#else
+#define always_inline static inline __attribute__ ((__always_inline__))
+#endif
+
+#include <vlib/pci/pci.h>
+
+#define NB_MBUF (16<<10)
+
+extern vnet_device_class_t dpdk_device_class;
+extern vlib_node_registration_t dpdk_input_node;
+extern vlib_node_registration_t handoff_dispatch_node;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0)
+#define foreach_dpdk_pmd \
+ _ ("net_thunderx", THUNDERX) \
+ _ ("net_e1000_em", E1000EM) \
+ _ ("net_e1000_igb", IGB) \
+ _ ("net_e1000_igb_vf", IGBVF) \
+ _ ("net_ixgbe", IXGBE) \
+ _ ("net_ixgbe_vf", IXGBEVF) \
+ _ ("net_i40e", I40E) \
+ _ ("net_i40e_vf", I40EVF) \
+ _ ("net_virtio", VIRTIO) \
+ _ ("net_enic", ENIC) \
+ _ ("net_vmxnet3", VMXNET3) \
+ _ ("net_af_packet", AF_PACKET) \
+ _ ("rte_bond_pmd", BOND) \
+ _ ("net_fm10k", FM10K) \
+ _ ("net_cxgbe", CXGBE) \
+ _ ("net_mlx5", MLX5) \
+ _ ("net_dpaa2", DPAA2)
+#else
+#define foreach_dpdk_pmd \
+ _ ("rte_nicvf_pmd", THUNDERX) \
+ _ ("rte_em_pmd", E1000EM) \
+ _ ("rte_igb_pmd", IGB) \
+ _ ("rte_igbvf_pmd", IGBVF) \
+ _ ("rte_ixgbe_pmd", IXGBE) \
+ _ ("rte_ixgbevf_pmd", IXGBEVF) \
+ _ ("rte_i40e_pmd", I40E) \
+ _ ("rte_i40evf_pmd", I40EVF) \
+ _ ("rte_virtio_pmd", VIRTIO) \
+ _ ("rte_enic_pmd", ENIC) \
+ _ ("rte_vmxnet3_pmd", VMXNET3) \
+ _ ("AF_PACKET PMD", AF_PACKET) \
+ _ ("rte_bond_pmd", BOND) \
+ _ ("rte_pmd_fm10k", FM10K) \
+ _ ("rte_cxgbe_pmd", CXGBE) \
+ _ ("rte_dpaa2_dpni", DPAA2)
+#endif
+
+typedef enum
+{
+ VNET_DPDK_PMD_NONE,
+#define _(s,f) VNET_DPDK_PMD_##f,
+ foreach_dpdk_pmd
+#undef _
+ VNET_DPDK_PMD_UNKNOWN, /* must be last */
+} dpdk_pmd_t;
+
+typedef enum
+{
+ VNET_DPDK_PORT_TYPE_ETH_1G,
+ VNET_DPDK_PORT_TYPE_ETH_10G,
+ VNET_DPDK_PORT_TYPE_ETH_40G,
+ VNET_DPDK_PORT_TYPE_ETH_100G,
+ VNET_DPDK_PORT_TYPE_ETH_BOND,
+ VNET_DPDK_PORT_TYPE_ETH_SWITCH,
+ VNET_DPDK_PORT_TYPE_AF_PACKET,
+ VNET_DPDK_PORT_TYPE_UNKNOWN,
+} dpdk_port_type_t;
+
+/*
+ * The header for the tx_vector in dpdk_device_t.
+ * Head and tail are indexes into the tx_vector and are of type
+ * u64 so they never overflow.
+ */
+typedef struct
+{
+ u64 tx_head;
+ u64 tx_tail;
+} tx_ring_hdr_t;
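+
+/*
+ * Sketch of how device.c uses this header (see tx_burst_vector_internal):
+ * the ring occupancy is simply tx_head - tx_tail in u64 arithmetic, and a
+ * slot in the tx_vector is addressed as tx_head % xd->nb_tx_desc.
+ */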
+
+typedef struct
+{
+ struct rte_ring *swq;
+
+ u64 hqos_field0_slabmask;
+ u32 hqos_field0_slabpos;
+ u32 hqos_field0_slabshr;
+ u64 hqos_field1_slabmask;
+ u32 hqos_field1_slabpos;
+ u32 hqos_field1_slabshr;
+ u64 hqos_field2_slabmask;
+ u32 hqos_field2_slabpos;
+ u32 hqos_field2_slabshr;
+ u32 hqos_tc_table[64];
+} dpdk_device_hqos_per_worker_thread_t;
+
+typedef struct
+{
+ struct rte_ring **swq;
+ struct rte_mbuf **pkts_enq;
+ struct rte_mbuf **pkts_deq;
+ struct rte_sched_port *hqos;
+ u32 hqos_burst_enq;
+ u32 hqos_burst_deq;
+ u32 pkts_enq_len;
+ u32 swq_pos;
+ u32 flush_count;
+} dpdk_device_hqos_per_hqos_thread_t;
+
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ volatile u32 **lockp;
+
+ /* Instance ID */
+ u32 device_index;
+
+ u32 vlib_hw_if_index;
+ u32 vlib_sw_if_index;
+
+ /* next node index if we decide to steal the rx graph arc */
+ u32 per_interface_next_index;
+
+ /* dpdk rte_mbuf rx and tx vectors, VLIB_FRAME_SIZE */
+ struct rte_mbuf ***tx_vectors; /* one per worker thread */
+ struct rte_mbuf ***rx_vectors;
+
+ /* vector of traced contexts, per device */
+ u32 **d_trace_buffers;
+
+ dpdk_pmd_t pmd:8;
+ i8 cpu_socket;
+
+ u16 flags;
+#define DPDK_DEVICE_FLAG_ADMIN_UP (1 << 0)
+#define DPDK_DEVICE_FLAG_PROMISC (1 << 1)
+#define DPDK_DEVICE_FLAG_PMD (1 << 2)
+#define DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE (1 << 3)
+#define DPDK_DEVICE_FLAG_MAYBE_MULTISEG (1 << 4)
+#define DPDK_DEVICE_FLAG_HAVE_SUBIF (1 << 5)
+#define DPDK_DEVICE_FLAG_HQOS (1 << 6)
+
+ u16 nb_tx_desc;
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
+
+ u8 *interface_name_suffix;
+
+ /* number of sub-interfaces */
+ u16 num_subifs;
+
+ /* PMD related */
+ u16 tx_q_used;
+ u16 rx_q_used;
+ u16 nb_rx_desc;
+ u16 *cpu_socket_id_by_queue;
+ struct rte_eth_conf port_conf;
+ struct rte_eth_txconf tx_conf;
+
+ /* HQoS related */
+ dpdk_device_hqos_per_worker_thread_t *hqos_wt;
+ dpdk_device_hqos_per_hqos_thread_t *hqos_ht;
+
+ /* af_packet */
+ u8 af_packet_port_id;
+
+ struct rte_eth_link link;
+ f64 time_last_link_update;
+
+ struct rte_eth_stats stats;
+ struct rte_eth_stats last_stats;
+ struct rte_eth_stats last_cleared_stats;
+ struct rte_eth_xstat *xstats;
+ struct rte_eth_xstat *last_cleared_xstats;
+ f64 time_last_stats_update;
+ dpdk_port_type_t port_type;
+} dpdk_device_t;
+
+#define DPDK_STATS_POLL_INTERVAL (10.0)
+#define DPDK_MIN_STATS_POLL_INTERVAL (0.001) /* 1msec */
+
+#define DPDK_LINK_POLL_INTERVAL (3.0)
+#define DPDK_MIN_LINK_POLL_INTERVAL (0.001) /* 1msec */
+
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+
+ /* total input packet counter */
+ u64 aggregate_rx_packets;
+} dpdk_worker_t;
+
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+
+ /* total input packet counter */
+ u64 aggregate_rx_packets;
+} dpdk_hqos_thread_t;
+
+typedef struct
+{
+ u32 device;
+ u16 queue_id;
+} dpdk_device_and_queue_t;
+
+#ifndef DPDK_HQOS_DBG_BYPASS
+#define DPDK_HQOS_DBG_BYPASS 0
+#endif
+
+#ifndef HQOS_FLUSH_COUNT_THRESHOLD
+#define HQOS_FLUSH_COUNT_THRESHOLD 100000
+#endif
+
+typedef struct dpdk_device_config_hqos_t
+{
+ u32 hqos_thread;
+ u32 hqos_thread_valid;
+
+ u32 swq_size;
+ u32 burst_enq;
+ u32 burst_deq;
+
+ u32 pktfield0_slabpos;
+ u32 pktfield1_slabpos;
+ u32 pktfield2_slabpos;
+ u64 pktfield0_slabmask;
+ u64 pktfield1_slabmask;
+ u64 pktfield2_slabmask;
+ u32 tc_table[64];
+
+ struct rte_sched_port_params port;
+ struct rte_sched_subport_params *subport;
+ struct rte_sched_pipe_params *pipe;
+ uint32_t *pipe_map;
+} dpdk_device_config_hqos_t;
+
+int dpdk_hqos_validate_mask (u64 mask, u32 n);
+void dpdk_device_config_hqos_pipe_profile_default (dpdk_device_config_hqos_t *
+ hqos, u32 pipe_profile_id);
+void dpdk_device_config_hqos_default (dpdk_device_config_hqos_t * hqos);
+clib_error_t *dpdk_port_setup_hqos (dpdk_device_t * xd,
+ dpdk_device_config_hqos_t * hqos);
+void dpdk_hqos_metadata_set (dpdk_device_hqos_per_worker_thread_t * hqos,
+ struct rte_mbuf **pkts, u32 n_pkts);
+
+#define foreach_dpdk_device_config_item \
+ _ (num_rx_queues) \
+ _ (num_tx_queues) \
+ _ (num_rx_desc) \
+ _ (num_tx_desc) \
+ _ (rss_fn)
+
+typedef struct
+{
+ vlib_pci_addr_t pci_addr;
+ u8 is_blacklisted;
+ u8 vlan_strip_offload;
+#define DPDK_DEVICE_VLAN_STRIP_DEFAULT 0
+#define DPDK_DEVICE_VLAN_STRIP_OFF 1
+#define DPDK_DEVICE_VLAN_STRIP_ON 2
+
+#define _(x) uword x;
+ foreach_dpdk_device_config_item
+#undef _
+ clib_bitmap_t * workers;
+ u32 hqos_enabled;
+ dpdk_device_config_hqos_t hqos;
+} dpdk_device_config_t;
+
+typedef struct
+{
+
+ /* Config stuff */
+ u8 **eal_init_args;
+ u8 *eal_init_args_str;
+ u8 *uio_driver_name;
+ u8 no_multi_seg;
+ u8 enable_tcp_udp_checksum;
+
+ /* Required config parameters */
+ u8 coremask_set_manually;
+ u8 nchannels_set_manually;
+ u32 coremask;
+ u32 nchannels;
+ u32 num_mbufs;
+ u8 num_kni; /* while kni_init allows u32, port_id in callback fn is only u8 */
+
+ /*
+ * format interface names ala xxxEthernet%d/%d/%d instead of
+ * xxxEthernet%x/%x/%x.
+ */
+ u8 interface_name_format_decimal;
+
+ /* per-device config */
+ dpdk_device_config_t default_devconf;
+ dpdk_device_config_t *dev_confs;
+ uword *device_config_index_by_pci_addr;
+
+} dpdk_config_main_t;
+
+dpdk_config_main_t dpdk_config_main;
+
+typedef struct
+{
+
+ /* Devices */
+ dpdk_device_t *devices;
+ dpdk_device_and_queue_t **devices_by_cpu;
+ dpdk_device_and_queue_t **devices_by_hqos_cpu;
+
+ /* per-thread recycle lists */
+ u32 **recycle;
+
+ /* buffer flags template, configurable to enable/disable tcp / udp cksum */
+ u32 buffer_flags_template;
+
+ /* vlib buffer free list, must be same size as an rte_mbuf */
+ u32 vlib_buffer_free_list_index;
+
+ /* dpdk worker "threads" */
+ dpdk_worker_t *workers;
+
+ /* dpdk HQoS "threads" */
+ dpdk_hqos_thread_t *hqos_threads;
+
+ /* Ethernet input node index */
+ u32 ethernet_input_node_index;
+
+ /* pcap tracing [only works if (CLIB_DEBUG > 0)] */
+ int tx_pcap_enable;
+ pcap_main_t pcap_main;
+ u8 *pcap_filename;
+ u32 pcap_sw_if_index;
+ u32 pcap_pkts_to_capture;
+
+ /* hashes */
+ uword *dpdk_device_by_kni_port_id;
+ uword *vu_sw_if_index_by_listener_fd;
+ uword *vu_sw_if_index_by_sock_fd;
+ u32 *vu_inactive_interfaces_device_index;
+
+ /*
+ * flag indicating that a posted admin up/down
+ * (via post_sw_interface_set_flags) is in progress
+ */
+ u8 admin_up_down_in_progress;
+
+ u8 use_rss;
+
+ /* which cpus are running dpdk-input */
+ int input_cpu_first_index;
+ int input_cpu_count;
+
+ /* which cpus are running I/O TX */
+ int hqos_cpu_first_index;
+ int hqos_cpu_count;
+
+ /* control interval of dpdk link state and stat polling */
+ f64 link_state_poll_interval;
+ f64 stat_poll_interval;
+
+ /* Sleep for this many MS after each device poll */
+ u32 poll_sleep;
+
+ /* convenience */
+ vlib_main_t *vlib_main;
+ vnet_main_t *vnet_main;
+ dpdk_config_main_t *conf;
+} dpdk_main_t;
+
+dpdk_main_t dpdk_main;
+
+typedef struct
+{
+ u32 buffer_index;
+ u16 device_index;
+ u8 queue_index;
+ struct rte_mbuf mb;
+ /* Copy of VLIB buffer; packet data stored in pre_data. */
+ vlib_buffer_t buffer;
+} dpdk_tx_dma_trace_t;
+
+typedef struct
+{
+ u32 buffer_index;
+ u16 device_index;
+ u16 queue_index;
+ struct rte_mbuf mb;
+ vlib_buffer_t buffer; /* Copy of VLIB buffer; pkt data stored in pre_data. */
+ u8 data[256]; /* First 256 data bytes, used for hexdump */
+} dpdk_rx_dma_trace_t;
+
+void vnet_buffer_needs_dpdk_mb (vlib_buffer_t * b);
+
+clib_error_t *dpdk_set_mac_address (vnet_hw_interface_t * hi, char *address);
+
+clib_error_t *dpdk_set_mc_filter (vnet_hw_interface_t * hi,
+ struct ether_addr mc_addr_vec[], int naddr);
+
+void dpdk_thread_input (dpdk_main_t * dm, dpdk_device_t * xd);
+
+clib_error_t *dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd);
+
+u32 dpdk_interface_tx_vector (vlib_main_t * vm, u32 dev_instance);
+
+struct rte_mbuf *dpdk_replicate_packet_mb (vlib_buffer_t * b);
+struct rte_mbuf *dpdk_zerocopy_replicate_packet_mb (vlib_buffer_t * b);
+
+#define foreach_dpdk_error \
+ _(NONE, "no error") \
+ _(RX_PACKET_ERROR, "Rx packet errors") \
+ _(RX_BAD_FCS, "Rx bad fcs") \
+ _(IP_CHECKSUM_ERROR, "Rx ip checksum errors") \
+ _(RX_ALLOC_FAIL, "rx buf alloc from free list failed") \
+ _(RX_ALLOC_NO_PHYSMEM, "rx buf alloc failed no physmem") \
+ _(RX_ALLOC_DROP_PKTS, "rx packets dropped due to alloc error")
+
+typedef enum
+{
+#define _(f,s) DPDK_ERROR_##f,
+ foreach_dpdk_error
+#undef _
+ DPDK_N_ERROR,
+} dpdk_error_t;
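For illustration only (this snippet is not part of the patch): foreach_dpdk_error
is an X-macro, so the same _(NAME, "description") list that generates the enum
above is normally re-expanded wherever the matching counter-name strings are
needed, along these lines, with dpdk_error_strings being a hypothetical name:

  #define _(n,s) s,
  static char *dpdk_error_strings[] = { foreach_dpdk_error };  /* "no error", "Rx packet errors", ... */
  #undef _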
+
+int dpdk_set_stat_poll_interval (f64 interval);
+int dpdk_set_link_state_poll_interval (f64 interval);
+void dpdk_update_link_state (dpdk_device_t * xd, f64 now);
+void dpdk_device_lock_init (dpdk_device_t * xd);
+void dpdk_device_lock_free (dpdk_device_t * xd);
+
+static inline u64
+vnet_get_aggregate_rx_packets (void)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ u64 sum = 0;
+ dpdk_worker_t *dw;
+
+ vec_foreach (dw, dm->workers) sum += dw->aggregate_rx_packets;
+
+ return sum;
+}
+
+void dpdk_rx_trace (dpdk_main_t * dm,
+ vlib_node_runtime_t * node,
+ dpdk_device_t * xd,
+ u16 queue_id, u32 * buffers, uword n_buffers);
+
+#define EFD_OPERATION_LESS_THAN 0
+#define EFD_OPERATION_GREATER_OR_EQUAL 1
+
+format_function_t format_dpdk_device_name;
+format_function_t format_dpdk_device;
+format_function_t format_dpdk_tx_dma_trace;
+format_function_t format_dpdk_rx_dma_trace;
+format_function_t format_dpdk_rte_mbuf;
+format_function_t format_dpdk_rx_rte_mbuf;
+unformat_function_t unformat_socket_mem;
+clib_error_t *unformat_rss_fn (unformat_input_t * input, uword * rss_fn);
+clib_error_t *unformat_hqos (unformat_input_t * input,
+ dpdk_device_config_hqos_t * hqos);
+
+uword
+admin_up_down_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt, vlib_frame_t * f);
+
+#endif /* __included_dpdk_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/dpdk_priv.h b/src/vnet/devices/dpdk/dpdk_priv.h
new file mode 100644
index 00000000000..0c81dbc3beb
--- /dev/null
+++ b/src/vnet/devices/dpdk/dpdk_priv.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define DPDK_NB_RX_DESC_DEFAULT 1024
+#define DPDK_NB_TX_DESC_DEFAULT 1024
+#define DPDK_NB_RX_DESC_VIRTIO 256
+#define DPDK_NB_TX_DESC_VIRTIO 256
+
+#define I40E_DEV_ID_SFP_XL710 0x1572
+#define I40E_DEV_ID_QSFP_A 0x1583
+#define I40E_DEV_ID_QSFP_B 0x1584
+#define I40E_DEV_ID_QSFP_C 0x1585
+#define I40E_DEV_ID_10G_BASE_T 0x1586
+#define I40E_DEV_ID_VF 0x154C
+
+/* These args appear by themselves */
+#define foreach_eal_double_hyphen_predicate_arg \
+_(no-shconf) \
+_(no-hpet) \
+_(no-huge) \
+_(vmware-tsc-map)
+
+#define foreach_eal_single_hyphen_mandatory_arg \
+_(coremask, c) \
+_(nchannels, n) \
+
+#define foreach_eal_single_hyphen_arg \
+_(blacklist, b) \
+_(mem-alloc-request, m) \
+_(force-ranks, r)
+
+/* These args are preceded by "--" and followed by a single string */
+#define foreach_eal_double_hyphen_arg \
+_(huge-dir) \
+_(proc-type) \
+_(file-prefix) \
+_(vdev)
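These four lists drive how startup-config keywords become the argv handed to
rte_eal_init(). As a rough, hypothetical example (values invented, not taken
from this patch), a config containing "nchannels 4" and "huge-dir
/dev/hugepages" would contribute the EAL arguments "-n 4 --huge-dir
/dev/hugepages", while the predicate args listed above appear bare, e.g.
"--no-huge".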
+
+static inline void
+dpdk_get_xstats (dpdk_device_t * xd)
+{
+ int len;
+ if ((len = rte_eth_xstats_get (xd->device_index, NULL, 0)) > 0)
+ {
+ vec_validate (xd->xstats, len - 1);
+ vec_validate (xd->last_cleared_xstats, len - 1);
+
+ len =
+ rte_eth_xstats_get (xd->device_index, xd->xstats,
+ vec_len (xd->xstats));
+
+ ASSERT (vec_len (xd->xstats) == len);
+ ASSERT (vec_len (xd->last_cleared_xstats) == len);
+
+ _vec_len (xd->xstats) = len;
+ _vec_len (xd->last_cleared_xstats) = len;
+
+ }
+}
+
+
+static inline void
+dpdk_update_counters (dpdk_device_t * xd, f64 now)
+{
+ vlib_simple_counter_main_t *cm;
+ vnet_main_t *vnm = vnet_get_main ();
+ u32 my_cpu = os_get_cpu_number ();
+ u64 rxerrors, last_rxerrors;
+
+ /* only update counters for PMD interfaces */
+ if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0)
+ return;
+
+ xd->time_last_stats_update = now ? now : xd->time_last_stats_update;
+ clib_memcpy (&xd->last_stats, &xd->stats, sizeof (xd->last_stats));
+ rte_eth_stats_get (xd->device_index, &xd->stats);
+
+ /* maybe bump interface rx no buffer counter */
+ if (PREDICT_FALSE (xd->stats.rx_nombuf != xd->last_stats.rx_nombuf))
+ {
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_RX_NO_BUF);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+ xd->stats.rx_nombuf -
+ xd->last_stats.rx_nombuf);
+ }
+
+ /* missed pkt counter */
+ if (PREDICT_FALSE (xd->stats.imissed != xd->last_stats.imissed))
+ {
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_RX_MISS);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+ xd->stats.imissed -
+ xd->last_stats.imissed);
+ }
+ rxerrors = xd->stats.ierrors;
+ last_rxerrors = xd->last_stats.ierrors;
+
+ if (PREDICT_FALSE (rxerrors != last_rxerrors))
+ {
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_RX_ERROR);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+ rxerrors - last_rxerrors);
+ }
+
+ dpdk_get_xstats (xd);
+}
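A small worked example of the delta logic above (numbers invented): if the
previous snapshot had imissed = 100 and rte_eth_stats_get () now reports
imissed = 130, only the difference of 30 is added to the
VNET_INTERFACE_COUNTER_RX_MISS simple counter for this interface, so the vnet
counters accumulate increments rather than mirroring the PMD's absolute totals.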
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/format.c b/src/vnet/devices/dpdk/format.c
new file mode 100644
index 00000000000..ff7c7a5a41c
--- /dev/null
+++ b/src/vnet/devices/dpdk/format.c
@@ -0,0 +1,763 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/format.h>
+#include <vlib/unix/cj.h>
+#include <assert.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include "dpdk_priv.h"
+#include <vppinfra/error.h>
+
+#define foreach_dpdk_counter \
+ _ (tx_frames_ok, opackets) \
+ _ (tx_bytes_ok, obytes) \
+ _ (tx_errors, oerrors) \
+ _ (rx_frames_ok, ipackets) \
+ _ (rx_bytes_ok, ibytes) \
+ _ (rx_errors, ierrors) \
+ _ (rx_missed, imissed) \
+ _ (rx_no_bufs, rx_nombuf)
+
+#define foreach_dpdk_q_counter \
+ _ (rx_frames_ok, q_ipackets) \
+ _ (tx_frames_ok, q_opackets) \
+ _ (rx_bytes_ok, q_ibytes) \
+ _ (tx_bytes_ok, q_obytes) \
+ _ (rx_errors, q_errors)
+
+#define foreach_dpdk_rss_hf \
+ _(ETH_RSS_FRAG_IPV4, "ipv4-frag") \
+ _(ETH_RSS_NONFRAG_IPV4_TCP, "ipv4-tcp") \
+ _(ETH_RSS_NONFRAG_IPV4_UDP, "ipv4-udp") \
+ _(ETH_RSS_NONFRAG_IPV4_SCTP, "ipv4-sctp") \
+ _(ETH_RSS_NONFRAG_IPV4_OTHER, "ipv4-other") \
+ _(ETH_RSS_IPV4, "ipv4") \
+ _(ETH_RSS_IPV6_TCP_EX, "ipv6-tcp-ex") \
+ _(ETH_RSS_IPV6_UDP_EX, "ipv6-udp-ex") \
+ _(ETH_RSS_FRAG_IPV6, "ipv6-frag") \
+ _(ETH_RSS_NONFRAG_IPV6_TCP, "ipv6-tcp") \
+ _(ETH_RSS_NONFRAG_IPV6_UDP, "ipv6-udp") \
+ _(ETH_RSS_NONFRAG_IPV6_SCTP, "ipv6-sctp") \
+ _(ETH_RSS_NONFRAG_IPV6_OTHER, "ipv6-other") \
+ _(ETH_RSS_L2_PAYLOAD, "l2-payload") \
+ _(ETH_RSS_IPV6_EX, "ipv6-ex") \
+ _(ETH_RSS_IPV6, "ipv6")
+
+
+#define foreach_dpdk_rx_offload_caps \
+ _(DEV_RX_OFFLOAD_VLAN_STRIP, "vlan-strip") \
+ _(DEV_RX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \
+ _(DEV_RX_OFFLOAD_UDP_CKSUM , "udp-cksum") \
+ _(DEV_RX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \
+ _(DEV_RX_OFFLOAD_TCP_LRO , "tcp-lro") \
+ _(DEV_RX_OFFLOAD_QINQ_STRIP, "qinq-strip")
+
+#define foreach_dpdk_tx_offload_caps \
+ _(DEV_TX_OFFLOAD_VLAN_INSERT, "vlan-insert") \
+ _(DEV_TX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \
+ _(DEV_TX_OFFLOAD_UDP_CKSUM , "udp-cksum") \
+ _(DEV_TX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \
+ _(DEV_TX_OFFLOAD_SCTP_CKSUM , "sctp-cksum") \
+ _(DEV_TX_OFFLOAD_TCP_TSO , "tcp-tso") \
+ _(DEV_TX_OFFLOAD_UDP_TSO , "udp-tso") \
+ _(DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM, "outer-ipv4-cksum") \
+ _(DEV_TX_OFFLOAD_QINQ_INSERT, "qinq-insert")
+
+#if RTE_VERSION < RTE_VERSION_NUM(16, 11, 0, 0)
+/* New ol_flags bits added in DPDK-16.11 */
+#define PKT_RX_IP_CKSUM_GOOD (1ULL << 7)
+#define PKT_RX_L4_CKSUM_GOOD (1ULL << 8)
+#endif
+
+#define foreach_dpdk_pkt_rx_offload_flag \
+ _ (PKT_RX_VLAN_PKT, "RX packet is an 802.1q VLAN packet") \
+ _ (PKT_RX_RSS_HASH, "RX packet with RSS hash result") \
+ _ (PKT_RX_FDIR, "RX packet with FDIR infos") \
+ _ (PKT_RX_L4_CKSUM_BAD, "L4 cksum of RX pkt. is not OK") \
+ _ (PKT_RX_IP_CKSUM_BAD, "IP cksum of RX pkt. is not OK") \
+ _ (PKT_RX_VLAN_STRIPPED, "RX packet VLAN tag stripped") \
+ _ (PKT_RX_IP_CKSUM_GOOD, "IP cksum of RX pkt. is valid") \
+ _ (PKT_RX_L4_CKSUM_GOOD, "L4 cksum of RX pkt. is valid") \
+ _ (PKT_RX_IEEE1588_PTP, "RX IEEE1588 L2 Ethernet PT Packet") \
+ _ (PKT_RX_IEEE1588_TMST, "RX IEEE1588 L2/L4 timestamped packet") \
+ _ (PKT_RX_QINQ_STRIPPED, "RX packet QinQ tags stripped")
+
+#if RTE_VERSION < RTE_VERSION_NUM(16, 11, 0, 0)
+/* PTYPE added in DPDK-16.11 */
+#define RTE_PTYPE_L2_ETHER_VLAN 0x00000006
+#define RTE_PTYPE_L2_ETHER_QINQ 0x00000007
+#endif
+
+#define foreach_dpdk_pkt_type \
+ _ (L2, ETHER, "Ethernet packet") \
+ _ (L2, ETHER_TIMESYNC, "Ethernet packet for time sync") \
+ _ (L2, ETHER_ARP, "ARP packet") \
+ _ (L2, ETHER_LLDP, "LLDP (Link Layer Discovery Protocol) packet") \
+ _ (L2, ETHER_NSH, "NSH (Network Service Header) packet") \
+ _ (L2, ETHER_VLAN, "VLAN packet") \
+ _ (L2, ETHER_QINQ, "QinQ packet") \
+ _ (L3, IPV4, "IPv4 packet without extension headers") \
+ _ (L3, IPV4_EXT, "IPv4 packet with extension headers") \
+ _ (L3, IPV4_EXT_UNKNOWN, "IPv4 packet with or without extension headers") \
+ _ (L3, IPV6, "IPv6 packet without extension headers") \
+ _ (L3, IPV6_EXT, "IPv6 packet with extension headers") \
+ _ (L3, IPV6_EXT_UNKNOWN, "IPv6 packet with or without extension headers") \
+ _ (L4, TCP, "TCP packet") \
+ _ (L4, UDP, "UDP packet") \
+ _ (L4, FRAG, "Fragmented IP packet") \
+ _ (L4, SCTP, "SCTP (Stream Control Transmission Protocol) packet") \
+ _ (L4, ICMP, "ICMP packet") \
+ _ (L4, NONFRAG, "Non-fragmented IP packet") \
+ _ (TUNNEL, GRE, "GRE tunneling packet") \
+ _ (TUNNEL, VXLAN, "VXLAN tunneling packet") \
+ _ (TUNNEL, NVGRE, "NVGRE Tunneling packet") \
+ _ (TUNNEL, GENEVE, "GENEVE Tunneling packet") \
+ _ (TUNNEL, GRENAT, "Teredo, VXLAN or GRE Tunneling packet") \
+ _ (INNER_L2, ETHER, "Inner Ethernet packet") \
+ _ (INNER_L2, ETHER_VLAN, "Inner Ethernet packet with VLAN") \
+ _ (INNER_L3, IPV4, "Inner IPv4 packet without extension headers") \
+ _ (INNER_L3, IPV4_EXT, "Inner IPv4 packet with extension headers") \
+ _ (INNER_L3, IPV4_EXT_UNKNOWN, "Inner IPv4 packet with or without extension headers") \
+ _ (INNER_L3, IPV6, "Inner IPv6 packet without extension headers") \
+ _ (INNER_L3, IPV6_EXT, "Inner IPv6 packet with extension headers") \
+ _ (INNER_L3, IPV6_EXT_UNKNOWN, "Inner IPv6 packet with or without extension headers") \
+ _ (INNER_L4, TCP, "Inner TCP packet") \
+ _ (INNER_L4, UDP, "Inner UDP packet") \
+ _ (INNER_L4, FRAG, "Inner fragmented IP packet") \
+ _ (INNER_L4, SCTP, "Inner SCTP (Stream Control Transmission Protocol) packet") \
+ _ (INNER_L4, ICMP, "Inner ICMP packet") \
+ _ (INNER_L4, NONFRAG, "Inner non-fragmented IP packet")
+
+#define foreach_dpdk_pkt_tx_offload_flag \
+ _ (PKT_TX_VLAN_PKT, "TX packet is an 802.1q VLAN packet") \
+ _ (PKT_TX_IP_CKSUM, "IP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_TCP_CKSUM, "TCP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_SCTP_CKSUM, "SCTP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_IEEE1588_TMST, "TX IEEE1588 packet to timestamp")
+
+#define foreach_dpdk_pkt_offload_flag \
+ foreach_dpdk_pkt_rx_offload_flag \
+ foreach_dpdk_pkt_tx_offload_flag
+
+u8 *
+format_dpdk_device_name (u8 * s, va_list * args)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ char *devname_format;
+ char *device_name;
+ u32 i = va_arg (*args, u32);
+ struct rte_eth_dev_info dev_info;
+ u8 *ret;
+
+ if (dm->conf->interface_name_format_decimal)
+ devname_format = "%s%d/%d/%d";
+ else
+ devname_format = "%s%x/%x/%x";
+
+ switch (dm->devices[i].port_type)
+ {
+ case VNET_DPDK_PORT_TYPE_ETH_1G:
+ device_name = "GigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_10G:
+ device_name = "TenGigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_40G:
+ device_name = "FortyGigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_100G:
+ device_name = "HundredGigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_BOND:
+ return format (s, "BondEthernet%d", dm->devices[i].device_index);
+
+ case VNET_DPDK_PORT_TYPE_ETH_SWITCH:
+ device_name = "EthernetSwitch";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_AF_PACKET:
+ rte_eth_dev_info_get (i, &dev_info);
+ return format (s, "af_packet%d", dm->devices[i].af_packet_port_id);
+
+ default:
+ case VNET_DPDK_PORT_TYPE_UNKNOWN:
+ device_name = "UnknownEthernet";
+ break;
+ }
+
+ rte_eth_dev_info_get (i, &dev_info);
+
+ if (dev_info.pci_dev)
+ ret = format (s, devname_format, device_name, dev_info.pci_dev->addr.bus,
+ dev_info.pci_dev->addr.devid,
+ dev_info.pci_dev->addr.function);
+ else
+ ret = format (s, "%s%d", device_name, dm->devices[i].device_index);
+
+ if (dm->devices[i].interface_name_suffix)
+ return format (ret, "/%s", dm->devices[i].interface_name_suffix);
+ return ret;
+}
+
+static u8 *
+format_dpdk_device_type (u8 * s, va_list * args)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ char *dev_type;
+ u32 i = va_arg (*args, u32);
+
+ switch (dm->devices[i].pmd)
+ {
+ case VNET_DPDK_PMD_E1000EM:
+ dev_type = "Intel 82540EM (e1000)";
+ break;
+
+ case VNET_DPDK_PMD_IGB:
+ dev_type = "Intel e1000";
+ break;
+
+ case VNET_DPDK_PMD_I40E:
+ dev_type = "Intel X710/XL710 Family";
+ break;
+
+ case VNET_DPDK_PMD_I40EVF:
+ dev_type = "Intel X710/XL710 Family VF";
+ break;
+
+ case VNET_DPDK_PMD_FM10K:
+ dev_type = "Intel FM10000 Family Ethernet Switch";
+ break;
+
+ case VNET_DPDK_PMD_IGBVF:
+ dev_type = "Intel e1000 VF";
+ break;
+
+ case VNET_DPDK_PMD_VIRTIO:
+ dev_type = "Red Hat Virtio";
+ break;
+
+ case VNET_DPDK_PMD_IXGBEVF:
+ dev_type = "Intel 82599 VF";
+ break;
+
+ case VNET_DPDK_PMD_IXGBE:
+ dev_type = "Intel 82599";
+ break;
+
+ case VNET_DPDK_PMD_ENIC:
+ dev_type = "Cisco VIC";
+ break;
+
+ case VNET_DPDK_PMD_CXGBE:
+ dev_type = "Chelsio T4/T5";
+ break;
+
+ case VNET_DPDK_PMD_MLX5:
+ dev_type = "Mellanox ConnectX-4 Family";
+ break;
+
+ case VNET_DPDK_PMD_VMXNET3:
+ dev_type = "VMware VMXNET3";
+ break;
+
+ case VNET_DPDK_PMD_AF_PACKET:
+ dev_type = "af_packet";
+ break;
+
+ case VNET_DPDK_PMD_BOND:
+ dev_type = "Ethernet Bonding";
+ break;
+
+ case VNET_DPDK_PMD_DPAA2:
+ dev_type = "NXP DPAA2 Mac";
+ break;
+
+ default:
+ case VNET_DPDK_PMD_UNKNOWN:
+ dev_type = "### UNKNOWN ###";
+ break;
+ }
+
+ return format (s, dev_type);
+}
+
+static u8 *
+format_dpdk_link_status (u8 * s, va_list * args)
+{
+ dpdk_device_t *xd = va_arg (*args, dpdk_device_t *);
+ struct rte_eth_link *l = &xd->link;
+ vnet_main_t *vnm = vnet_get_main ();
+ vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, xd->vlib_hw_if_index);
+
+ s = format (s, "%s ", l->link_status ? "up" : "down");
+ if (l->link_status)
+ {
+ u32 promisc = rte_eth_promiscuous_get (xd->device_index);
+
+ s = format (s, "%s duplex ", (l->link_duplex == ETH_LINK_FULL_DUPLEX) ?
+ "full" : "half");
+ s = format (s, "speed %u mtu %d %s\n", l->link_speed,
+ hi->max_packet_bytes, promisc ? " promisc" : "");
+ }
+ else
+ s = format (s, "\n");
+
+ return s;
+}
+
+#define _line_len 72
+#define _(v, str) \
+if (bitmap & v) { \
+ if (format_get_indent (s) > next_split ) { \
+ next_split += _line_len; \
+ s = format(s,"\n%U", format_white_space, indent); \
+ } \
+ s = format(s, "%s ", str); \
+}
+
+static u8 *
+format_dpdk_rss_hf_name (u8 * s, va_list * args)
+{
+ u64 bitmap = va_arg (*args, u64);
+ int next_split = _line_len;
+ int indent = format_get_indent (s);
+
+ if (!bitmap)
+ return format (s, "none");
+
+ foreach_dpdk_rss_hf return s;
+}
+
+static u8 *
+format_dpdk_rx_offload_caps (u8 * s, va_list * args)
+{
+ u32 bitmap = va_arg (*args, u32);
+ int next_split = _line_len;
+ int indent = format_get_indent (s);
+
+ if (!bitmap)
+ return format (s, "none");
+
+ foreach_dpdk_rx_offload_caps return s;
+}
+
+static u8 *
+format_dpdk_tx_offload_caps (u8 * s, va_list * args)
+{
+ u32 bitmap = va_arg (*args, u32);
+ int next_split = _line_len;
+ int indent = format_get_indent (s);
+ if (!bitmap)
+ return format (s, "none");
+
+ foreach_dpdk_tx_offload_caps return s;
+}
+
+#undef _line_len
+#undef _
+
+u8 *
+format_dpdk_device (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ int verbose = va_arg (*args, int);
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, dev_instance);
+ uword indent = format_get_indent (s);
+ f64 now = vlib_time_now (dm->vlib_main);
+ struct rte_eth_dev_info di;
+
+ dpdk_update_counters (xd, now);
+ dpdk_update_link_state (xd, now);
+
+ s = format (s, "%U\n%Ucarrier %U",
+ format_dpdk_device_type, xd->device_index,
+ format_white_space, indent + 2, format_dpdk_link_status, xd);
+
+ rte_eth_dev_info_get (xd->device_index, &di);
+
+ if (verbose > 1 && xd->flags & DPDK_DEVICE_FLAG_PMD)
+ {
+ struct rte_pci_device *pci;
+ struct rte_eth_rss_conf rss_conf;
+ int vlan_off;
+ int retval;
+
+ rss_conf.rss_key = 0;
+ retval = rte_eth_dev_rss_hash_conf_get (xd->device_index, &rss_conf);
+ if (retval < 0)
+ clib_warning ("rte_eth_dev_rss_hash_conf_get returned %d", retval);
+ pci = di.pci_dev;
+
+ if (pci)
+ s =
+ format (s,
+ "%Upci id: device %04x:%04x subsystem %04x:%04x\n"
+ "%Upci address: %04x:%02x:%02x.%02x\n",
+ format_white_space, indent + 2, pci->id.vendor_id,
+ pci->id.device_id, pci->id.subsystem_vendor_id,
+ pci->id.subsystem_device_id, format_white_space, indent + 2,
+ pci->addr.domain, pci->addr.bus, pci->addr.devid,
+ pci->addr.function);
+ s =
+ format (s, "%Umax rx packet len: %d\n", format_white_space,
+ indent + 2, di.max_rx_pktlen);
+ s =
+ format (s, "%Umax num of queues: rx %d tx %d\n", format_white_space,
+ indent + 2, di.max_rx_queues, di.max_tx_queues);
+ s =
+ format (s, "%Upromiscuous: unicast %s all-multicast %s\n",
+ format_white_space, indent + 2,
+ rte_eth_promiscuous_get (xd->device_index) ? "on" : "off",
+               rte_eth_allmulticast_get (xd->device_index) ? "on" : "off");
+ vlan_off = rte_eth_dev_get_vlan_offload (xd->device_index);
+ s = format (s, "%Uvlan offload: strip %s filter %s qinq %s\n",
+ format_white_space, indent + 2,
+ vlan_off & ETH_VLAN_STRIP_OFFLOAD ? "on" : "off",
+ vlan_off & ETH_VLAN_FILTER_OFFLOAD ? "on" : "off",
+ vlan_off & ETH_VLAN_EXTEND_OFFLOAD ? "on" : "off");
+ s = format (s, "%Urx offload caps: %U\n",
+ format_white_space, indent + 2,
+ format_dpdk_rx_offload_caps, di.rx_offload_capa);
+ s = format (s, "%Utx offload caps: %U\n",
+ format_white_space, indent + 2,
+ format_dpdk_tx_offload_caps, di.tx_offload_capa);
+ s = format (s, "%Urss active: %U\n"
+ "%Urss supported: %U\n",
+ format_white_space, indent + 2,
+ format_dpdk_rss_hf_name, rss_conf.rss_hf,
+ format_white_space, indent + 2,
+ format_dpdk_rss_hf_name, di.flow_type_rss_offloads);
+ }
+
+ s = format (s, "%Urx queues %d, rx desc %d, tx queues %d, tx desc %d\n",
+ format_white_space, indent + 2,
+ xd->rx_q_used, xd->nb_rx_desc, xd->tx_q_used, xd->nb_tx_desc);
+
+ if (xd->cpu_socket > -1)
+ s = format (s, "%Ucpu socket %d\n",
+ format_white_space, indent + 2, xd->cpu_socket);
+
+ /* $$$ MIB counters */
+ {
+#define _(N, V) \
+ if ((xd->stats.V - xd->last_cleared_stats.V) != 0) { \
+ s = format (s, "\n%U%-40U%16Ld", \
+ format_white_space, indent + 2, \
+ format_c_identifier, #N, \
+ xd->stats.V - xd->last_cleared_stats.V); \
+ } \
+
+ foreach_dpdk_counter
+#undef _
+ }
+
+ u8 *xs = 0;
+ u32 i = 0;
+ struct rte_eth_xstat *xstat, *last_xstat;
+ struct rte_eth_xstat_name *xstat_names = 0;
+ int len = rte_eth_xstats_get_names (xd->device_index, NULL, 0);
+ vec_validate (xstat_names, len - 1);
+ rte_eth_xstats_get_names (xd->device_index, xstat_names, len);
+
+ ASSERT (vec_len (xd->xstats) == vec_len (xd->last_cleared_xstats));
+
+ /* *INDENT-OFF* */
+ vec_foreach_index(i, xd->xstats)
+ {
+ u64 delta = 0;
+ xstat = vec_elt_at_index(xd->xstats, i);
+ last_xstat = vec_elt_at_index(xd->last_cleared_xstats, i);
+
+ delta = xstat->value - last_xstat->value;
+ if (verbose == 2 || (verbose && delta))
+ {
+ /* format_c_identifier doesn't like c strings inside vector */
+ u8 * name = format(0,"%s", xstat_names[i].name);
+ xs = format(xs, "\n%U%-38U%16Ld",
+ format_white_space, indent + 4,
+ format_c_identifier, name, delta);
+ vec_free(name);
+ }
+ }
+ /* *INDENT-ON* */
+
+ vec_free (xstat_names);
+
+ if (xs)
+ {
+ s = format (s, "\n%Uextended stats:%v",
+ format_white_space, indent + 2, xs);
+ vec_free (xs);
+ }
+
+ return s;
+}
+
+u8 *
+format_dpdk_tx_dma_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main ();
+ dpdk_tx_dma_trace_t *t = va_arg (*va, dpdk_tx_dma_trace_t *);
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, t->device_index);
+ uword indent = format_get_indent (s);
+ vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);
+
+ s = format (s, "%U tx queue %d",
+ format_vnet_sw_interface_name, vnm, sw, t->queue_index);
+
+ s = format (s, "\n%Ubuffer 0x%x: %U",
+ format_white_space, indent,
+ t->buffer_index, format_vlib_buffer, &t->buffer);
+
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_ethernet_header_with_length, t->buffer.pre_data,
+ sizeof (t->buffer.pre_data));
+
+ return s;
+}
+
+u8 *
+format_dpdk_rx_dma_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main ();
+ dpdk_rx_dma_trace_t *t = va_arg (*va, dpdk_rx_dma_trace_t *);
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, t->device_index);
+ format_function_t *f;
+ uword indent = format_get_indent (s);
+ vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);
+
+ s = format (s, "%U rx queue %d",
+ format_vnet_sw_interface_name, vnm, sw, t->queue_index);
+
+ s = format (s, "\n%Ubuffer 0x%x: %U",
+ format_white_space, indent,
+ t->buffer_index, format_vlib_buffer, &t->buffer);
+
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ format_dpdk_rte_mbuf, &t->mb, &t->data);
+
+ if (vm->trace_main.verbose)
+ {
+ s = format (s, "\n%UPacket Dump%s", format_white_space, indent + 2,
+ t->mb.data_len > sizeof (t->data) ? " (truncated)" : "");
+ s = format (s, "\n%U%U", format_white_space, indent + 4,
+ format_hexdump, &t->data,
+ t->mb.data_len >
+ sizeof (t->data) ? sizeof (t->data) : t->mb.data_len);
+ }
+ f = node->format_buffer;
+ if (!f)
+ f = format_hex_bytes;
+ s = format (s, "\n%U%U", format_white_space, indent,
+ f, t->buffer.pre_data, sizeof (t->buffer.pre_data));
+
+ return s;
+}
+
+
+static inline u8 *
+format_dpdk_pkt_types (u8 * s, va_list * va)
+{
+ u32 *pkt_types = va_arg (*va, u32 *);
+ uword indent __attribute__ ((unused)) = format_get_indent (s) + 2;
+
+ if (!*pkt_types)
+ return s;
+
+ s = format (s, "Packet Types");
+
+#define _(L, F, S) \
+ if ((*pkt_types & RTE_PTYPE_##L##_MASK) == RTE_PTYPE_##L##_##F) \
+ { \
+ s = format (s, "\n%U%s (0x%04x) %s", format_white_space, indent, \
+ "RTE_PTYPE_" #L "_" #F, RTE_PTYPE_##L##_##F, S); \
+ }
+
+ foreach_dpdk_pkt_type
+#undef _
+ return s;
+}
+
+static inline u8 *
+format_dpdk_pkt_offload_flags (u8 * s, va_list * va)
+{
+ u64 *ol_flags = va_arg (*va, u64 *);
+ uword indent = format_get_indent (s) + 2;
+
+ if (!*ol_flags)
+ return s;
+
+ s = format (s, "Packet Offload Flags");
+
+#define _(F, S) \
+ if (*ol_flags & F) \
+ { \
+ s = format (s, "\n%U%s (0x%04x) %s", \
+ format_white_space, indent, #F, F, S); \
+ }
+
+ foreach_dpdk_pkt_offload_flag
+#undef _
+ return s;
+}
+
+u8 *
+format_dpdk_rte_mbuf_vlan (u8 * s, va_list * va)
+{
+ ethernet_vlan_header_tv_t *vlan_hdr =
+ va_arg (*va, ethernet_vlan_header_tv_t *);
+
+ if (clib_net_to_host_u16 (vlan_hdr->type) == ETHERNET_TYPE_DOT1AD)
+ {
+ s = format (s, "%U 802.1q vlan ",
+ format_ethernet_vlan_tci,
+ clib_net_to_host_u16 (vlan_hdr->priority_cfi_and_id));
+ vlan_hdr++;
+ }
+
+ s = format (s, "%U",
+ format_ethernet_vlan_tci,
+ clib_net_to_host_u16 (vlan_hdr->priority_cfi_and_id));
+
+ return s;
+}
+
+u8 *
+format_dpdk_rte_mbuf (u8 * s, va_list * va)
+{
+ struct rte_mbuf *mb = va_arg (*va, struct rte_mbuf *);
+ ethernet_header_t *eth_hdr = va_arg (*va, ethernet_header_t *);
+ uword indent = format_get_indent (s) + 2;
+
+ s = format (s, "PKT MBUF: port %d, nb_segs %d, pkt_len %d"
+ "\n%Ubuf_len %d, data_len %d, ol_flags 0x%x, data_off %d, phys_addr 0x%x"
+ "\n%Upacket_type 0x%x",
+ mb->port, mb->nb_segs, mb->pkt_len,
+ format_white_space, indent,
+ mb->buf_len, mb->data_len, mb->ol_flags, mb->data_off,
+ mb->buf_physaddr, format_white_space, indent, mb->packet_type);
+
+ if (mb->ol_flags)
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_dpdk_pkt_offload_flags, &mb->ol_flags);
+
+ if ((mb->ol_flags & PKT_RX_VLAN_PKT) &&
+ ((mb->ol_flags & (PKT_RX_VLAN_STRIPPED | PKT_RX_QINQ_STRIPPED)) == 0))
+ {
+ ethernet_vlan_header_tv_t *vlan_hdr =
+ ((ethernet_vlan_header_tv_t *) & (eth_hdr->type));
+ s = format (s, " %U", format_dpdk_rte_mbuf_vlan, vlan_hdr);
+ }
+
+ if (mb->packet_type)
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_dpdk_pkt_types, &mb->packet_type);
+
+ return s;
+}
+
+uword
+unformat_socket_mem (unformat_input_t * input, va_list * va)
+{
+ uword **r = va_arg (*va, uword **);
+ int i = 0;
+ u32 mem;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, ","))
+ hash_set (*r, i, 1024);
+ else if (unformat (input, "%u,", &mem))
+ hash_set (*r, i, mem);
+ else if (unformat (input, "%u", &mem))
+ hash_set (*r, i, mem);
+ else
+ {
+ unformat_put_input (input);
+ goto done;
+ }
+ i++;
+ }
+
+done:
+ return 1;
+}
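Usage sketch (input invented for illustration): for a socket-mem string of
"1024,,512" the loop stores 1024 for NUMA socket 0, falls back to the 1024
default for the empty slot at socket 1, and stores 512 for socket 2, i.e. the
hash maps socket index to the per-socket hugepage memory amount.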
+
+clib_error_t *
+unformat_rss_fn (unformat_input_t * input, uword * rss_fn)
+{
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (0)
+ ;
+#undef _
+#define _(f, s) \
+ else if (unformat (input, s)) \
+ *rss_fn |= f;
+
+ foreach_dpdk_rss_hf
+#undef _
+ else
+ {
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ }
+ return 0;
+}
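Usage sketch: an input of "ipv4-tcp ipv4-udp" ORs ETH_RSS_NONFRAG_IPV4_TCP and
ETH_RSS_NONFRAG_IPV4_UDP into *rss_fn; any token not present in
foreach_dpdk_rss_hf produces the "unknown input" error above.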
+
+clib_error_t *
+unformat_hqos (unformat_input_t * input, dpdk_device_config_hqos_t * hqos)
+{
+ clib_error_t *error = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "hqos-thread %u", &hqos->hqos_thread))
+ hqos->hqos_thread_valid = 1;
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ break;
+ }
+ }
+
+ return error;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/hqos.c b/src/vnet/devices/dpdk/hqos.c
new file mode 100644
index 00000000000..d68bc48f80b
--- /dev/null
+++ b/src/vnet/devices/dpdk/hqos.c
@@ -0,0 +1,775 @@
+/*
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/bitmap.h>
+
+#include <vnet/vnet.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include <vlib/unix/physmem.h>
+#include <vlib/pci/pci.h>
+#include <vlibmemory/api.h>
+#include <vlibmemory/vl_memory_msg_enum.h> /* enumerate all vlib messages */
+
+#define vl_typedefs /* define message structures */
+#include <vlibmemory/vl_memory_api_h.h>
+#undef vl_typedefs
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__)
+#define vl_printfun
+#include <vlibmemory/vl_memory_api_h.h>
+#undef vl_printfun
+
+#include "dpdk_priv.h"
+
+dpdk_main_t dpdk_main;
+
+/***
+ *
+ * HQoS default configuration values
+ *
+ ***/
+
+static dpdk_device_config_hqos_t hqos_params_default = {
+ .hqos_thread_valid = 0,
+
+ .swq_size = 4096,
+ .burst_enq = 256,
+ .burst_deq = 220,
+
+ /*
+ * Packet field to identify the subport.
+ *
+ * Default value: Since only one subport is defined by default (see below:
+ * n_subports_per_port = 1), the subport ID is hardcoded to 0.
+ */
+ .pktfield0_slabpos = 0,
+ .pktfield0_slabmask = 0,
+
+ /*
+ * Packet field to identify the pipe.
+ *
+ * Default value: Assuming Ethernet/IPv4/UDP packets, UDP payload bits 12 .. 23
+ */
+ .pktfield1_slabpos = 40,
+ .pktfield1_slabmask = 0x0000000FFF000000LLU,
+
+ /* Packet field used as index into TC translation table to identify the traffic
+ * class and queue.
+ *
+ * Default value: Assuming Ethernet/IPv4 packets, IPv4 DSCP field
+ */
+ .pktfield2_slabpos = 8,
+ .pktfield2_slabmask = 0x00000000000000FCLLU,
+ .tc_table = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ },
+
+ /* port */
+ .port = {
+ .name = NULL, /* Set at init */
+ .socket = 0, /* Set at init */
+ .rate = 1250000000, /* Assuming 10GbE port */
+ .mtu = 14 + 1500, /* Assuming Ethernet/IPv4 pkt (Ethernet FCS not included) */
+ .frame_overhead = RTE_SCHED_FRAME_OVERHEAD_DEFAULT,
+ .n_subports_per_port = 1,
+ .n_pipes_per_subport = 4096,
+ .qsize = {64, 64, 64, 64},
+ .pipe_profiles = NULL, /* Set at config */
+ .n_pipe_profiles = 1,
+
+#ifdef RTE_SCHED_RED
+ .red_params = {
+ /* Traffic Class 0 Colors Green / Yellow / Red */
+ [0][0] = {.min_th = 48,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [0][1] = {.min_th = 40,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [0][2] = {.min_th = 32,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+
+ /* Traffic Class 1 - Colors Green / Yellow / Red */
+ [1][0] = {.min_th = 48,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [1][1] = {.min_th = 40,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [1][2] = {.min_th = 32,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+
+ /* Traffic Class 2 - Colors Green / Yellow / Red */
+ [2][0] = {.min_th = 48,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [2][1] = {.min_th = 40,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [2][2] = {.min_th = 32,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+
+ /* Traffic Class 3 - Colors Green / Yellow / Red */
+ [3][0] = {.min_th = 48,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [3][1] = {.min_th = 40,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [3][2] = {.min_th = 32,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9}
+ },
+#endif /* RTE_SCHED_RED */
+ },
+};
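With these defaults, the pipe ID (field 1) is taken from the 64-bit big-endian
slab at packet byte offset 40 masked with 0x0000000FFF000000, a 12-bit value
that matches n_pipes_per_subport = 4096, and the DSCP (field 2) comes from byte
15 (the IPv4 TOS byte of an untagged Ethernet/IPv4 packet) masked with 0xFC.
A minimal sketch of the field 1 extraction, equivalent to what the BITFIELD
macro later in this file does (pkt_data is assumed to point at the start of
the Ethernet header):

  u64 slab = *(u64 *) (pkt_data + 40);                                /* bytes 40..47 */
  u64 pipe = (rte_be_to_cpu_64 (slab) & 0x0000000FFF000000ULL) >> 24; /* 12-bit pipe id */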
+
+static struct rte_sched_subport_params hqos_subport_params_default = {
+ .tb_rate = 1250000000, /* 10GbE line rate (measured in bytes/second) */
+ .tb_size = 1000000,
+ .tc_rate = {1250000000, 1250000000, 1250000000, 1250000000},
+ .tc_period = 10,
+};
+
+static struct rte_sched_pipe_params hqos_pipe_params_default = {
+ .tb_rate = 305175, /* 10GbE line rate divided by 4K pipes */
+ .tb_size = 1000000,
+ .tc_rate = {305175, 305175, 305175, 305175},
+ .tc_period = 40,
+#ifdef RTE_SCHED_SUBPORT_TC_OV
+ .tc_ov_weight = 1,
+#endif
+ .wrr_weights = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+};
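The default pipe rates are simply the port rate split evenly across pipes:
1,250,000,000 bytes/s (10GbE) divided by 4096 pipes per subport is roughly
305,175 bytes/s, which is where tb_rate and the per-TC tc_rate values above
come from.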
+
+/***
+ *
+ * HQoS configuration
+ *
+ ***/
+
+int
+dpdk_hqos_validate_mask (u64 mask, u32 n)
+{
+ int count = __builtin_popcountll (mask);
+ int pos_lead = sizeof (u64) * 8 - __builtin_clzll (mask);
+ int pos_trail = __builtin_ctzll (mask);
+ int count_expected = __builtin_popcount (n - 1);
+
+ /* Handle the exceptions */
+ if (n == 0)
+ return -1; /* Error */
+
+ if ((mask == 0) && (n == 1))
+ return 0; /* OK */
+
+ if (((mask == 0) && (n != 1)) || ((mask != 0) && (n == 1)))
+ return -2; /* Error */
+
+ /* Check that mask is contiguous */
+ if ((pos_lead - pos_trail) != count)
+ return -3; /* Error */
+
+ /* Check that mask contains the expected number of bits set */
+ if (count != count_expected)
+ return -4; /* Error */
+
+ return 0; /* OK */
+}
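In other words, the mask must be one contiguous run of exactly log2(n) bits.
For example (values invented, and assuming n is a power of two as the scheduler
configuration expects): 0x0000000FFF000000 with n = 4096 returns 0 (twelve
contiguous bits, and popcount (4095) is also twelve), a split mask such as
0x00000000FF0000F0 with n = 4096 fails the contiguity check and returns -3, and
0xFF with n = 16 returns -4 because sixteen values only need four mask bits.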
+
+void
+dpdk_device_config_hqos_pipe_profile_default (dpdk_device_config_hqos_t *
+ hqos, u32 pipe_profile_id)
+{
+ memcpy (&hqos->pipe[pipe_profile_id], &hqos_pipe_params_default,
+ sizeof (hqos_pipe_params_default));
+}
+
+void
+dpdk_device_config_hqos_default (dpdk_device_config_hqos_t * hqos)
+{
+ struct rte_sched_subport_params *subport_params;
+ struct rte_sched_pipe_params *pipe_params;
+ u32 *pipe_map;
+ u32 i;
+
+ memcpy (hqos, &hqos_params_default, sizeof (hqos_params_default));
+
+ /* pipe */
+ vec_add2 (hqos->pipe, pipe_params, hqos->port.n_pipe_profiles);
+
+ for (i = 0; i < vec_len (hqos->pipe); i++)
+ memcpy (&pipe_params[i],
+ &hqos_pipe_params_default, sizeof (hqos_pipe_params_default));
+
+ hqos->port.pipe_profiles = hqos->pipe;
+
+ /* subport */
+ vec_add2 (hqos->subport, subport_params, hqos->port.n_subports_per_port);
+
+ for (i = 0; i < vec_len (hqos->subport); i++)
+ memcpy (&subport_params[i],
+ &hqos_subport_params_default,
+ sizeof (hqos_subport_params_default));
+
+ /* pipe profile */
+ vec_add2 (hqos->pipe_map,
+ pipe_map,
+ hqos->port.n_subports_per_port * hqos->port.n_pipes_per_subport);
+
+ for (i = 0; i < vec_len (hqos->pipe_map); i++)
+ pipe_map[i] = 0;
+}
+
+/***
+ *
+ * HQoS init
+ *
+ ***/
+
+clib_error_t *
+dpdk_port_setup_hqos (dpdk_device_t * xd, dpdk_device_config_hqos_t * hqos)
+{
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ char name[32];
+ u32 subport_id, i;
+ int rv;
+
+ /* Detect the set of worker threads */
+ int worker_thread_first = 0;
+ int worker_thread_count = 0;
+
+ uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ vlib_thread_registration_t *tr =
+ p ? (vlib_thread_registration_t *) p[0] : 0;
+
+ if (tr && tr->count > 0)
+ {
+ worker_thread_first = tr->first_index;
+ worker_thread_count = tr->count;
+ }
+
+ /* Allocate the per-thread device data array */
+ vec_validate_aligned (xd->hqos_wt, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+ memset (xd->hqos_wt, 0, tm->n_vlib_mains * sizeof (xd->hqos_wt[0]));
+
+ vec_validate_aligned (xd->hqos_ht, 0, CLIB_CACHE_LINE_BYTES);
+ memset (xd->hqos_ht, 0, sizeof (xd->hqos_ht[0]));
+
+ /* Allocate space for one SWQ per worker thread in the I/O TX thread data structure */
+ vec_validate (xd->hqos_ht->swq, worker_thread_count);
+
+ /* SWQ */
+ for (i = 0; i < worker_thread_count + 1; i++)
+ {
+ u32 swq_flags = RING_F_SP_ENQ | RING_F_SC_DEQ;
+
+ snprintf (name, sizeof (name), "SWQ-worker%u-to-device%u", i,
+ xd->device_index);
+ xd->hqos_ht->swq[i] =
+ rte_ring_create (name, hqos->swq_size, xd->cpu_socket, swq_flags);
+ if (xd->hqos_ht->swq[i] == NULL)
+ return clib_error_return (0,
+ "SWQ-worker%u-to-device%u: rte_ring_create err",
+ i, xd->device_index);
+ }
+
+ /*
+ * HQoS
+ */
+
+ /* HQoS port */
+ snprintf (name, sizeof (name), "HQoS%u", xd->device_index);
+ hqos->port.name = strdup (name);
+ if (hqos->port.name == NULL)
+ return clib_error_return (0, "HQoS%u: strdup err", xd->device_index);
+
+ hqos->port.socket = rte_eth_dev_socket_id (xd->device_index);
+ if (hqos->port.socket == SOCKET_ID_ANY)
+ hqos->port.socket = 0;
+
+ xd->hqos_ht->hqos = rte_sched_port_config (&hqos->port);
+ if (xd->hqos_ht->hqos == NULL)
+ return clib_error_return (0, "HQoS%u: rte_sched_port_config err",
+ xd->device_index);
+
+ /* HQoS subport */
+ for (subport_id = 0; subport_id < hqos->port.n_subports_per_port;
+ subport_id++)
+ {
+ u32 pipe_id;
+
+ rv =
+ rte_sched_subport_config (xd->hqos_ht->hqos, subport_id,
+ &hqos->subport[subport_id]);
+ if (rv)
+ return clib_error_return (0,
+ "HQoS%u subport %u: rte_sched_subport_config err (%d)",
+ xd->device_index, subport_id, rv);
+
+ /* HQoS pipe */
+ for (pipe_id = 0; pipe_id < hqos->port.n_pipes_per_subport; pipe_id++)
+ {
+ u32 pos = subport_id * hqos->port.n_pipes_per_subport + pipe_id;
+ u32 profile_id = hqos->pipe_map[pos];
+
+ rv =
+ rte_sched_pipe_config (xd->hqos_ht->hqos, subport_id, pipe_id,
+ profile_id);
+ if (rv)
+ return clib_error_return (0,
+ "HQoS%u subport %u pipe %u: rte_sched_pipe_config err (%d)",
+ xd->device_index, subport_id, pipe_id,
+ rv);
+ }
+ }
+
+ /* Set up per-thread device data for the I/O TX thread */
+ xd->hqos_ht->hqos_burst_enq = hqos->burst_enq;
+ xd->hqos_ht->hqos_burst_deq = hqos->burst_deq;
+ vec_validate (xd->hqos_ht->pkts_enq, 2 * hqos->burst_enq - 1);
+ vec_validate (xd->hqos_ht->pkts_deq, hqos->burst_deq - 1);
+ xd->hqos_ht->pkts_enq_len = 0;
+ xd->hqos_ht->swq_pos = 0;
+ xd->hqos_ht->flush_count = 0;
+
+ /* Set up per-thread device data for each worker thread */
+ for (i = 0; i < worker_thread_count + 1; i++)
+ {
+ u32 tid;
+ if (i)
+ tid = worker_thread_first + (i - 1);
+ else
+ tid = i;
+
+ xd->hqos_wt[tid].swq = xd->hqos_ht->swq[i];
+ xd->hqos_wt[tid].hqos_field0_slabpos = hqos->pktfield0_slabpos;
+ xd->hqos_wt[tid].hqos_field0_slabmask = hqos->pktfield0_slabmask;
+ xd->hqos_wt[tid].hqos_field0_slabshr =
+ __builtin_ctzll (hqos->pktfield0_slabmask);
+ xd->hqos_wt[tid].hqos_field1_slabpos = hqos->pktfield1_slabpos;
+ xd->hqos_wt[tid].hqos_field1_slabmask = hqos->pktfield1_slabmask;
+ xd->hqos_wt[tid].hqos_field1_slabshr =
+ __builtin_ctzll (hqos->pktfield1_slabmask);
+ xd->hqos_wt[tid].hqos_field2_slabpos = hqos->pktfield2_slabpos;
+ xd->hqos_wt[tid].hqos_field2_slabmask = hqos->pktfield2_slabmask;
+ xd->hqos_wt[tid].hqos_field2_slabshr =
+ __builtin_ctzll (hqos->pktfield2_slabmask);
+ memcpy (xd->hqos_wt[tid].hqos_tc_table, hqos->tc_table,
+ sizeof (hqos->tc_table));
+ }
+
+ return 0;
+}
+
+/***
+ *
+ * HQoS run-time
+ *
+ ***/
+/*
+ * dpdk_hqos_thread - Contains the main loop of an HQoS thread.
+ *
+ * w
+ * Information for the current thread
+ */
+static_always_inline void
+dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ u32 cpu_index = vm->cpu_index;
+ u32 dev_pos;
+
+ dev_pos = 0;
+ while (1)
+ {
+ vlib_worker_thread_barrier_check ();
+
+ u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]);
+ if (dev_pos >= n_devs)
+ dev_pos = 0;
+
+ dpdk_device_and_queue_t *dq =
+ vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos);
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device);
+
+ dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht;
+ u32 device_index = xd->device_index;
+ u16 queue_id = dq->queue_id;
+
+ struct rte_mbuf **pkts_enq = hqos->pkts_enq;
+ u32 pkts_enq_len = hqos->pkts_enq_len;
+ u32 swq_pos = hqos->swq_pos;
+ u32 n_swq = vec_len (hqos->swq), i;
+ u32 flush_count = hqos->flush_count;
+
+ for (i = 0; i < n_swq; i++)
+ {
+ /* Get current SWQ for this device */
+ struct rte_ring *swq = hqos->swq[swq_pos];
+
+ /* Read SWQ burst to packet buffer of this device */
+ pkts_enq_len += rte_ring_sc_dequeue_burst (swq,
+ (void **)
+ &pkts_enq[pkts_enq_len],
+ hqos->hqos_burst_enq);
+
+ /* Get next SWQ for this device */
+ swq_pos++;
+ if (swq_pos >= n_swq)
+ swq_pos = 0;
+ hqos->swq_pos = swq_pos;
+
+ /* HWQ TX enqueue when burst available */
+ if (pkts_enq_len >= hqos->hqos_burst_enq)
+ {
+ u32 n_pkts = rte_eth_tx_burst (device_index,
+ (uint16_t) queue_id,
+ pkts_enq,
+ (uint16_t) pkts_enq_len);
+
+ for (; n_pkts < pkts_enq_len; n_pkts++)
+ rte_pktmbuf_free (pkts_enq[n_pkts]);
+
+ pkts_enq_len = 0;
+ flush_count = 0;
+ break;
+ }
+ }
+ if (pkts_enq_len)
+ {
+ flush_count++;
+ if (PREDICT_FALSE (flush_count == HQOS_FLUSH_COUNT_THRESHOLD))
+ {
+ rte_sched_port_enqueue (hqos->hqos, pkts_enq, pkts_enq_len);
+
+ pkts_enq_len = 0;
+ flush_count = 0;
+ }
+ }
+ hqos->pkts_enq_len = pkts_enq_len;
+ hqos->flush_count = flush_count;
+
+ /* Advance to next device */
+ dev_pos++;
+ }
+}
+
+static_always_inline void
+dpdk_hqos_thread_internal (vlib_main_t * vm)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ u32 cpu_index = vm->cpu_index;
+ u32 dev_pos;
+
+ dev_pos = 0;
+ while (1)
+ {
+ vlib_worker_thread_barrier_check ();
+
+ u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]);
+ if (PREDICT_FALSE (n_devs == 0))
+ {
+ dev_pos = 0;
+ continue;
+ }
+ if (dev_pos >= n_devs)
+ dev_pos = 0;
+
+ dpdk_device_and_queue_t *dq =
+ vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos);
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device);
+
+ dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht;
+ u32 device_index = xd->device_index;
+ u16 queue_id = dq->queue_id;
+
+ struct rte_mbuf **pkts_enq = hqos->pkts_enq;
+ struct rte_mbuf **pkts_deq = hqos->pkts_deq;
+ u32 pkts_enq_len = hqos->pkts_enq_len;
+ u32 swq_pos = hqos->swq_pos;
+ u32 n_swq = vec_len (hqos->swq), i;
+ u32 flush_count = hqos->flush_count;
+
+ /*
+ * SWQ dequeue and HQoS enqueue for current device
+ */
+ for (i = 0; i < n_swq; i++)
+ {
+ /* Get current SWQ for this device */
+ struct rte_ring *swq = hqos->swq[swq_pos];
+
+ /* Read SWQ burst to packet buffer of this device */
+ pkts_enq_len += rte_ring_sc_dequeue_burst (swq,
+ (void **)
+ &pkts_enq[pkts_enq_len],
+ hqos->hqos_burst_enq);
+
+ /* Get next SWQ for this device */
+ swq_pos++;
+ if (swq_pos >= n_swq)
+ swq_pos = 0;
+ hqos->swq_pos = swq_pos;
+
+ /* HQoS enqueue when burst available */
+ if (pkts_enq_len >= hqos->hqos_burst_enq)
+ {
+ rte_sched_port_enqueue (hqos->hqos, pkts_enq, pkts_enq_len);
+
+ pkts_enq_len = 0;
+ flush_count = 0;
+ break;
+ }
+ }
+ if (pkts_enq_len)
+ {
+ flush_count++;
+ if (PREDICT_FALSE (flush_count == HQOS_FLUSH_COUNT_THRESHOLD))
+ {
+ rte_sched_port_enqueue (hqos->hqos, pkts_enq, pkts_enq_len);
+
+ pkts_enq_len = 0;
+ flush_count = 0;
+ }
+ }
+ hqos->pkts_enq_len = pkts_enq_len;
+ hqos->flush_count = flush_count;
+
+ /*
+ * HQoS dequeue and HWQ TX enqueue for current device
+ */
+ {
+ u32 pkts_deq_len, n_pkts;
+
+ pkts_deq_len = rte_sched_port_dequeue (hqos->hqos,
+ pkts_deq,
+ hqos->hqos_burst_deq);
+
+ for (n_pkts = 0; n_pkts < pkts_deq_len;)
+ n_pkts += rte_eth_tx_burst (device_index,
+ (uint16_t) queue_id,
+ &pkts_deq[n_pkts],
+ (uint16_t) (pkts_deq_len - n_pkts));
+ }
+
+ /* Advance to next device */
+ dev_pos++;
+ }
+}
+
+void
+dpdk_hqos_thread (vlib_worker_thread_t * w)
+{
+ vlib_main_t *vm;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_main_t *dm = &dpdk_main;
+
+ vm = vlib_get_main ();
+
+ ASSERT (vm->cpu_index == os_get_cpu_number ());
+
+ clib_time_init (&vm->clib_time);
+ clib_mem_set_heap (w->thread_mheap);
+
+ /* Wait until the dpdk init sequence is complete */
+ while (tm->worker_thread_release == 0)
+ vlib_worker_thread_barrier_check ();
+
+ if (vec_len (dm->devices_by_hqos_cpu[vm->cpu_index]) == 0)
+ return
+ clib_error
+ ("current I/O TX thread does not have any devices assigned to it");
+
+ if (DPDK_HQOS_DBG_BYPASS)
+ dpdk_hqos_thread_internal_hqos_dbg_bypass (vm);
+ else
+ dpdk_hqos_thread_internal (vm);
+}
+
+void
+dpdk_hqos_thread_fn (void *arg)
+{
+ vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg;
+ vlib_worker_thread_init (w);
+ dpdk_hqos_thread (w);
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_THREAD (hqos_thread_reg, static) =
+{
+ .name = "hqos-threads",
+ .short_name = "hqos-threads",
+ .function = dpdk_hqos_thread_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * HQoS run-time code to be called by the worker threads
+ */
+#define BITFIELD(byte_array, slab_pos, slab_mask, slab_shr) \
+({ \
+ u64 slab = *((u64 *) &byte_array[slab_pos]); \
+ u64 val = (rte_be_to_cpu_64(slab) & slab_mask) >> slab_shr; \
+ val; \
+})
+
+#define RTE_SCHED_PORT_HIERARCHY(subport, pipe, traffic_class, queue, color) \
+ ((((u64) (queue)) & 0x3) | \
+ ((((u64) (traffic_class)) & 0x3) << 2) | \
+ ((((u64) (color)) & 0x3) << 4) | \
+ ((((u64) (subport)) & 0xFFFF) << 16) | \
+ ((((u64) (pipe)) & 0xFFFFFFFF) << 32))
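Together, BITFIELD and RTE_SCHED_PORT_HIERARCHY turn raw packet bytes into the
64-bit scheduler word that dpdk_hqos_metadata_set stores in each mbuf. A worked
example with invented values (subport 0, pipe 7, and a tc_table entry of 0x0B,
i.e. traffic class 0x0B >> 2 = 2 and queue 0x0B & 0x3 = 3):

  u64 sched = RTE_SCHED_PORT_HIERARCHY (0, 7, 2, 3, 0);
  /* sched == 3 | (2 << 2) | ((u64) 7 << 32): queue and traffic class in the
     low bits, subport in bits 16..31, pipe in the upper 32 bits */
  pkt->hash.sched.lo = sched & 0xFFFFFFFF;
  pkt->hash.sched.hi = sched >> 32;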
+
+void
+dpdk_hqos_metadata_set (dpdk_device_hqos_per_worker_thread_t * hqos,
+ struct rte_mbuf **pkts, u32 n_pkts)
+{
+ u32 i;
+
+ for (i = 0; i < (n_pkts & (~0x3)); i += 4)
+ {
+ struct rte_mbuf *pkt0 = pkts[i];
+ struct rte_mbuf *pkt1 = pkts[i + 1];
+ struct rte_mbuf *pkt2 = pkts[i + 2];
+ struct rte_mbuf *pkt3 = pkts[i + 3];
+
+ u8 *pkt0_data = rte_pktmbuf_mtod (pkt0, u8 *);
+ u8 *pkt1_data = rte_pktmbuf_mtod (pkt1, u8 *);
+ u8 *pkt2_data = rte_pktmbuf_mtod (pkt2, u8 *);
+ u8 *pkt3_data = rte_pktmbuf_mtod (pkt3, u8 *);
+
+ u64 pkt0_subport = BITFIELD (pkt0_data, hqos->hqos_field0_slabpos,
+ hqos->hqos_field0_slabmask,
+ hqos->hqos_field0_slabshr);
+ u64 pkt0_pipe = BITFIELD (pkt0_data, hqos->hqos_field1_slabpos,
+ hqos->hqos_field1_slabmask,
+ hqos->hqos_field1_slabshr);
+ u64 pkt0_dscp = BITFIELD (pkt0_data, hqos->hqos_field2_slabpos,
+ hqos->hqos_field2_slabmask,
+ hqos->hqos_field2_slabshr);
+ u32 pkt0_tc = hqos->hqos_tc_table[pkt0_dscp & 0x3F] >> 2;
+ u32 pkt0_tc_q = hqos->hqos_tc_table[pkt0_dscp & 0x3F] & 0x3;
+
+ u64 pkt1_subport = BITFIELD (pkt1_data, hqos->hqos_field0_slabpos,
+ hqos->hqos_field0_slabmask,
+ hqos->hqos_field0_slabshr);
+ u64 pkt1_pipe = BITFIELD (pkt1_data, hqos->hqos_field1_slabpos,
+ hqos->hqos_field1_slabmask,
+ hqos->hqos_field1_slabshr);
+ u64 pkt1_dscp = BITFIELD (pkt1_data, hqos->hqos_field2_slabpos,
+ hqos->hqos_field2_slabmask,
+ hqos->hqos_field2_slabshr);
+ u32 pkt1_tc = hqos->hqos_tc_table[pkt1_dscp & 0x3F] >> 2;
+ u32 pkt1_tc_q = hqos->hqos_tc_table[pkt1_dscp & 0x3F] & 0x3;
+
+ u64 pkt2_subport = BITFIELD (pkt2_data, hqos->hqos_field0_slabpos,
+ hqos->hqos_field0_slabmask,
+ hqos->hqos_field0_slabshr);
+ u64 pkt2_pipe = BITFIELD (pkt2_data, hqos->hqos_field1_slabpos,
+ hqos->hqos_field1_slabmask,
+ hqos->hqos_field1_slabshr);
+ u64 pkt2_dscp = BITFIELD (pkt2_data, hqos->hqos_field2_slabpos,
+ hqos->hqos_field2_slabmask,
+ hqos->hqos_field2_slabshr);
+ u32 pkt2_tc = hqos->hqos_tc_table[pkt2_dscp & 0x3F] >> 2;
+ u32 pkt2_tc_q = hqos->hqos_tc_table[pkt2_dscp & 0x3F] & 0x3;
+
+ u64 pkt3_subport = BITFIELD (pkt3_data, hqos->hqos_field0_slabpos,
+ hqos->hqos_field0_slabmask,
+ hqos->hqos_field0_slabshr);
+ u64 pkt3_pipe = BITFIELD (pkt3_data, hqos->hqos_field1_slabpos,
+ hqos->hqos_field1_slabmask,
+ hqos->hqos_field1_slabshr);
+ u64 pkt3_dscp = BITFIELD (pkt3_data, hqos->hqos_field2_slabpos,
+ hqos->hqos_field2_slabmask,
+ hqos->hqos_field2_slabshr);
+ u32 pkt3_tc = hqos->hqos_tc_table[pkt3_dscp & 0x3F] >> 2;
+ u32 pkt3_tc_q = hqos->hqos_tc_table[pkt3_dscp & 0x3F] & 0x3;
+
+ u64 pkt0_sched = RTE_SCHED_PORT_HIERARCHY (pkt0_subport,
+ pkt0_pipe,
+ pkt0_tc,
+ pkt0_tc_q,
+ 0);
+ u64 pkt1_sched = RTE_SCHED_PORT_HIERARCHY (pkt1_subport,
+ pkt1_pipe,
+ pkt1_tc,
+ pkt1_tc_q,
+ 0);
+ u64 pkt2_sched = RTE_SCHED_PORT_HIERARCHY (pkt2_subport,
+ pkt2_pipe,
+ pkt2_tc,
+ pkt2_tc_q,
+ 0);
+ u64 pkt3_sched = RTE_SCHED_PORT_HIERARCHY (pkt3_subport,
+ pkt3_pipe,
+ pkt3_tc,
+ pkt3_tc_q,
+ 0);
+
+ pkt0->hash.sched.lo = pkt0_sched & 0xFFFFFFFF;
+ pkt0->hash.sched.hi = pkt0_sched >> 32;
+ pkt1->hash.sched.lo = pkt1_sched & 0xFFFFFFFF;
+ pkt1->hash.sched.hi = pkt1_sched >> 32;
+ pkt2->hash.sched.lo = pkt2_sched & 0xFFFFFFFF;
+ pkt2->hash.sched.hi = pkt2_sched >> 32;
+ pkt3->hash.sched.lo = pkt3_sched & 0xFFFFFFFF;
+ pkt3->hash.sched.hi = pkt3_sched >> 32;
+ }
+
+ for (; i < n_pkts; i++)
+ {
+ struct rte_mbuf *pkt = pkts[i];
+
+ u8 *pkt_data = rte_pktmbuf_mtod (pkt, u8 *);
+
+ u64 pkt_subport = BITFIELD (pkt_data, hqos->hqos_field0_slabpos,
+ hqos->hqos_field0_slabmask,
+ hqos->hqos_field0_slabshr);
+ u64 pkt_pipe = BITFIELD (pkt_data, hqos->hqos_field1_slabpos,
+ hqos->hqos_field1_slabmask,
+ hqos->hqos_field1_slabshr);
+ u64 pkt_dscp = BITFIELD (pkt_data, hqos->hqos_field2_slabpos,
+ hqos->hqos_field2_slabmask,
+ hqos->hqos_field2_slabshr);
+ u32 pkt_tc = hqos->hqos_tc_table[pkt_dscp & 0x3F] >> 2;
+ u32 pkt_tc_q = hqos->hqos_tc_table[pkt_dscp & 0x3F] & 0x3;
+
+ u64 pkt_sched = RTE_SCHED_PORT_HIERARCHY (pkt_subport,
+ pkt_pipe,
+ pkt_tc,
+ pkt_tc_q,
+ 0);
+
+ pkt->hash.sched.lo = pkt_sched & 0xFFFFFFFF;
+ pkt->hash.sched.hi = pkt_sched >> 32;
+ }
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/init.c b/src/vnet/devices/dpdk/init.c
new file mode 100755
index 00000000000..693ca985130
--- /dev/null
+++ b/src/vnet/devices/dpdk/init.c
@@ -0,0 +1,1803 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/bitmap.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vlib/unix/physmem.h>
+#include <vlib/pci/pci.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "dpdk_priv.h"
+
+dpdk_main_t dpdk_main;
+
+/* force linker to link functions used by vlib and declared weak */
+void *vlib_weakly_linked_functions[] = {
+ &rte_pktmbuf_init,
+ &rte_pktmbuf_pool_init,
+};
+
+#define LINK_STATE_ELOGS 0
+
+#define DEFAULT_HUGE_DIR "/run/vpp/hugepages"
+#define VPP_RUN_DIR "/run/vpp"
+
+/* Port configuration, mildly modified Intel app values */
+
+static struct rte_eth_conf port_conf_template = {
+ .rxmode = {
+ .split_hdr_size = 0,
+ .header_split = 0, /**< Header Split disabled */
+ .hw_ip_checksum = 0, /**< IP checksum offload disabled */
+ .hw_vlan_filter = 0, /**< VLAN filtering disabled */
+ .hw_strip_crc = 0, /**< CRC stripped by hardware */
+ },
+ .txmode = {
+ .mq_mode = ETH_MQ_TX_NONE,
+ },
+};
+
+clib_error_t *
+dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ vlib_buffer_main_t *bm = vm->buffer_main;
+ int rv;
+ int j;
+
+ ASSERT (os_get_cpu_number () == 0);
+
+ if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+ {
+ vnet_hw_interface_set_flags (dm->vnet_main, xd->vlib_hw_if_index, 0);
+ rte_eth_dev_stop (xd->device_index);
+ }
+
+ rv = rte_eth_dev_configure (xd->device_index, xd->rx_q_used,
+ xd->tx_q_used, &xd->port_conf);
+
+ if (rv < 0)
+ return clib_error_return (0, "rte_eth_dev_configure[%d]: err %d",
+ xd->device_index, rv);
+
+ /* Set up one TX-queue per worker thread */
+ for (j = 0; j < xd->tx_q_used; j++)
+ {
+ rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc,
+ xd->cpu_socket, &xd->tx_conf);
+
+ /* retry with any other CPU socket */
+ if (rv < 0)
+ rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc,
+ SOCKET_ID_ANY, &xd->tx_conf);
+ if (rv < 0)
+ break;
+ }
+
+ if (rv < 0)
+ return clib_error_return (0, "rte_eth_tx_queue_setup[%d]: err %d",
+ xd->device_index, rv);
+
+ for (j = 0; j < xd->rx_q_used; j++)
+ {
+
+ rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc,
+ xd->cpu_socket, 0,
+ bm->
+ pktmbuf_pools[xd->cpu_socket_id_by_queue
+ [j]]);
+
+ /* retry with any other CPU socket */
+ if (rv < 0)
+ rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc,
+ SOCKET_ID_ANY, 0,
+ bm->
+ pktmbuf_pools[xd->cpu_socket_id_by_queue
+ [j]]);
+ if (rv < 0)
+ return clib_error_return (0, "rte_eth_rx_queue_setup[%d]: err %d",
+ xd->device_index, rv);
+ }
+
+ if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+ {
+ int rv;
+ rv = rte_eth_dev_start (xd->device_index);
+ if (rv < 0)
+ clib_warning ("rte_eth_dev_start %d returned %d",
+ xd->device_index, rv);
+ }
+ return 0;
+}
+
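+/* vnet flag-change callback: applies promiscuous-mode and MTU changes from
+ * the ethernet layer to the underlying DPDK device. */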
+static u32
+dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+ u32 old = 0;
+
+ if (ETHERNET_INTERFACE_FLAG_CONFIG_PROMISC (flags))
+ {
+ old = (xd->flags & DPDK_DEVICE_FLAG_PROMISC) != 0;
+
+ if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL)
+ xd->flags |= DPDK_DEVICE_FLAG_PROMISC;
+ else
+ xd->flags &= ~DPDK_DEVICE_FLAG_PROMISC;
+
+ if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+ {
+ if (xd->flags & DPDK_DEVICE_FLAG_PROMISC)
+ rte_eth_promiscuous_enable (xd->device_index);
+ else
+ rte_eth_promiscuous_disable (xd->device_index);
+ }
+ }
+ else if (ETHERNET_INTERFACE_FLAG_CONFIG_MTU (flags))
+ {
+ /*
+ * DAW-FIXME: The Cisco VIC firmware does not provide an api for a
+ * driver to dynamically change the mtu. If/when the
+ * VIC firmware gets fixed, then this should be removed.
+ */
+ if (xd->pmd == VNET_DPDK_PMD_ENIC)
+ {
+ struct rte_eth_dev_info dev_info;
+
+ /*
+ * Restore mtu to what has been set by CIMC in the firmware cfg.
+ */
+ rte_eth_dev_info_get (xd->device_index, &dev_info);
+ hi->max_packet_bytes = dev_info.max_rx_pktlen;
+
+ vlib_cli_output (vlib_get_main (),
+ "Cisco VIC mtu can only be changed "
+ "using CIMC then rebooting the server!");
+ }
+ else
+ {
+ int rv;
+
+ xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes;
+
+ if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+ rte_eth_dev_stop (xd->device_index);
+
+ rv = rte_eth_dev_configure
+ (xd->device_index, xd->rx_q_used, xd->tx_q_used, &xd->port_conf);
+
+ if (rv < 0)
+ vlib_cli_output (vlib_get_main (),
+ "rte_eth_dev_configure[%d]: err %d",
+ xd->device_index, rv);
+
+ rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes);
+
+ if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+ {
+ int rv = rte_eth_dev_start (xd->device_index);
+ if (rv < 0)
+ clib_warning ("rte_eth_dev_start %d returned %d",
+ xd->device_index, rv);
+ }
+ }
+ }
+ return old;
+}
+
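+/* Allocate one cache-line-aligned lock per TX queue; used when there are
+ * fewer TX queues than threads and queues must therefore be shared. */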
+void
+dpdk_device_lock_init (dpdk_device_t * xd)
+{
+ int q;
+ vec_validate (xd->lockp, xd->tx_q_used - 1);
+ for (q = 0; q < xd->tx_q_used; q++)
+ {
+ xd->lockp[q] = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
+ CLIB_CACHE_LINE_BYTES);
+ memset ((void *) xd->lockp[q], 0, CLIB_CACHE_LINE_BYTES);
+ }
+}
+
+void
+dpdk_device_lock_free (dpdk_device_t * xd)
+{
+ int q;
+
+ for (q = 0; q < vec_len (xd->lockp); q++)
+ clib_mem_free ((void *) xd->lockp[q]);
+ vec_free (xd->lockp);
+ xd->lockp = 0;
+}
+
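+/* Walk all DPDK ports, create the corresponding vnet interfaces, size their
+ * RX/TX queues and assign RX queues to input (and optionally HQoS) threads. */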
+static clib_error_t *
+dpdk_lib_init (dpdk_main_t * dm)
+{
+ u32 nports;
+ u32 nb_desc = 0;
+ int i;
+ clib_error_t *error;
+ vlib_main_t *vm = vlib_get_main ();
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ vnet_sw_interface_t *sw;
+ vnet_hw_interface_t *hi;
+ dpdk_device_t *xd;
+ vlib_pci_addr_t last_pci_addr;
+ u32 last_pci_addr_port = 0;
+ vlib_thread_registration_t *tr, *tr_hqos;
+ uword *p, *p_hqos;
+
+ u32 next_cpu = 0, next_hqos_cpu = 0;
+ u8 af_packet_port_id = 0;
+ last_pci_addr.as_u32 = ~0;
+
+ dm->input_cpu_first_index = 0;
+ dm->input_cpu_count = 1;
+
+ /* find out which cpus will be used for input */
+ p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ tr = p ? (vlib_thread_registration_t *) p[0] : 0;
+
+ if (tr && tr->count > 0)
+ {
+ dm->input_cpu_first_index = tr->first_index;
+ dm->input_cpu_count = tr->count;
+ }
+
+ vec_validate_aligned (dm->devices_by_cpu, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ vec_validate_aligned (dm->workers, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ dm->hqos_cpu_first_index = 0;
+ dm->hqos_cpu_count = 0;
+
+ /* find out which cpus will be used for I/O TX */
+ p_hqos = hash_get_mem (tm->thread_registrations_by_name, "hqos-threads");
+ tr_hqos = p_hqos ? (vlib_thread_registration_t *) p_hqos[0] : 0;
+
+ if (tr_hqos && tr_hqos->count > 0)
+ {
+ dm->hqos_cpu_first_index = tr_hqos->first_index;
+ dm->hqos_cpu_count = tr_hqos->count;
+ }
+
+ vec_validate_aligned (dm->devices_by_hqos_cpu, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ vec_validate_aligned (dm->hqos_threads, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ nports = rte_eth_dev_count ();
+ if (nports < 1)
+ {
+ clib_warning ("DPDK drivers found no ports...");
+ }
+
+ if (CLIB_DEBUG > 0)
+ clib_warning ("DPDK drivers found %d ports...", nports);
+
+ /*
+   * All buffers are allocated from the same rte_mempool.
+ * Thus they all have the same number of data bytes.
+ */
+ dm->vlib_buffer_free_list_index =
+ vlib_buffer_get_or_create_free_list (vm,
+ VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES,
+ "dpdk rx");
+
+ if (dm->conf->enable_tcp_udp_checksum)
+ dm->buffer_flags_template &= ~(IP_BUFFER_L4_CHECKSUM_CORRECT
+ | IP_BUFFER_L4_CHECKSUM_COMPUTED);
+
+ for (i = 0; i < nports; i++)
+ {
+ u8 addr[6];
+ u8 vlan_strip = 0;
+ int j;
+ struct rte_eth_dev_info dev_info;
+ clib_error_t *rv;
+ struct rte_eth_link l;
+ dpdk_device_config_t *devconf = 0;
+ vlib_pci_addr_t pci_addr;
+ uword *p = 0;
+
+ rte_eth_dev_info_get (i, &dev_info);
+ if (dev_info.pci_dev) /* bonded interface has no pci info */
+ {
+ pci_addr.domain = dev_info.pci_dev->addr.domain;
+ pci_addr.bus = dev_info.pci_dev->addr.bus;
+ pci_addr.slot = dev_info.pci_dev->addr.devid;
+ pci_addr.function = dev_info.pci_dev->addr.function;
+ p =
+ hash_get (dm->conf->device_config_index_by_pci_addr,
+ pci_addr.as_u32);
+ }
+
+ if (p)
+ devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+ else
+ devconf = &dm->conf->default_devconf;
+
+ /* Create vnet interface */
+ vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES);
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT;
+ xd->cpu_socket = (i8) rte_eth_dev_socket_id (i);
+
+ /* Handle interface naming for devices with multiple ports sharing same PCI ID */
+ if (dev_info.pci_dev)
+ {
+ struct rte_eth_dev_info di = { 0 };
+ rte_eth_dev_info_get (i + 1, &di);
+ if (di.pci_dev && pci_addr.as_u32 != last_pci_addr.as_u32 &&
+ memcmp (&dev_info.pci_dev->addr, &di.pci_dev->addr,
+ sizeof (struct rte_pci_addr)) == 0)
+ {
+ xd->interface_name_suffix = format (0, "0");
+ last_pci_addr.as_u32 = pci_addr.as_u32;
+ last_pci_addr_port = i;
+ }
+ else if (pci_addr.as_u32 == last_pci_addr.as_u32)
+ {
+ xd->interface_name_suffix =
+ format (0, "%u", i - last_pci_addr_port);
+ }
+ else
+ {
+ last_pci_addr.as_u32 = ~0;
+ }
+ }
+ else
+ last_pci_addr.as_u32 = ~0;
+
+ clib_memcpy (&xd->tx_conf, &dev_info.default_txconf,
+ sizeof (struct rte_eth_txconf));
+ if (dm->conf->no_multi_seg)
+ {
+ xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+ port_conf_template.rxmode.jumbo_frame = 0;
+ }
+ else
+ {
+ xd->tx_conf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS;
+ port_conf_template.rxmode.jumbo_frame = 1;
+ xd->flags |= DPDK_DEVICE_FLAG_MAYBE_MULTISEG;
+ }
+
+ clib_memcpy (&xd->port_conf, &port_conf_template,
+ sizeof (struct rte_eth_conf));
+
+ xd->tx_q_used = clib_min (dev_info.max_tx_queues, tm->n_vlib_mains);
+
+ if (devconf->num_tx_queues > 0
+ && devconf->num_tx_queues < xd->tx_q_used)
+ xd->tx_q_used = clib_min (xd->tx_q_used, devconf->num_tx_queues);
+
+ if (devconf->num_rx_queues > 1 && dm->use_rss == 0)
+ {
+ dm->use_rss = 1;
+ }
+
+ if (devconf->num_rx_queues > 1
+ && dev_info.max_rx_queues >= devconf->num_rx_queues)
+ {
+ xd->rx_q_used = devconf->num_rx_queues;
+ xd->port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+ if (devconf->rss_fn == 0)
+ xd->port_conf.rx_adv_conf.rss_conf.rss_hf =
+ ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP;
+ else
+ xd->port_conf.rx_adv_conf.rss_conf.rss_hf = devconf->rss_fn;
+ }
+ else
+ xd->rx_q_used = 1;
+
+ xd->flags |= DPDK_DEVICE_FLAG_PMD;
+
+ /* workaround for drivers not setting driver_name */
+ if ((!dev_info.driver_name) && (dev_info.pci_dev))
+#if RTE_VERSION < RTE_VERSION_NUM(16, 11, 0, 0)
+ dev_info.driver_name = dev_info.pci_dev->driver->name;
+#else
+ dev_info.driver_name = dev_info.pci_dev->driver->driver.name;
+#endif
+ ASSERT (dev_info.driver_name);
+
+ if (!xd->pmd)
+ {
+
+
+#define _(s,f) else if (dev_info.driver_name && \
+ !strcmp(dev_info.driver_name, s)) \
+ xd->pmd = VNET_DPDK_PMD_##f;
+ if (0)
+ ;
+ foreach_dpdk_pmd
+#undef _
+ else
+ xd->pmd = VNET_DPDK_PMD_UNKNOWN;
+
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT;
+
+ switch (xd->pmd)
+ {
+ /* 1G adapters */
+ case VNET_DPDK_PMD_E1000EM:
+ case VNET_DPDK_PMD_IGB:
+ case VNET_DPDK_PMD_IGBVF:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ break;
+
+ /* 10G adapters */
+ case VNET_DPDK_PMD_IXGBE:
+ case VNET_DPDK_PMD_IXGBEVF:
+ case VNET_DPDK_PMD_THUNDERX:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+ case VNET_DPDK_PMD_DPAA2:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+
+ /* Cisco VIC */
+ case VNET_DPDK_PMD_ENIC:
+ rte_eth_link_get_nowait (i, &l);
+ xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE;
+ if (l.link_speed == 40000)
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ else
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+
+ /* Intel Fortville */
+ case VNET_DPDK_PMD_I40E:
+ case VNET_DPDK_PMD_I40EVF:
+ xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE;
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+
+ switch (dev_info.pci_dev->id.device_id)
+ {
+ case I40E_DEV_ID_10G_BASE_T:
+ case I40E_DEV_ID_SFP_XL710:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+ case I40E_DEV_ID_QSFP_A:
+ case I40E_DEV_ID_QSFP_B:
+ case I40E_DEV_ID_QSFP_C:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ break;
+ case I40E_DEV_ID_VF:
+ rte_eth_link_get_nowait (i, &l);
+ xd->port_type = l.link_speed == 10000 ?
+ VNET_DPDK_PORT_TYPE_ETH_10G : VNET_DPDK_PORT_TYPE_ETH_40G;
+ break;
+ default:
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ }
+ break;
+
+ case VNET_DPDK_PMD_CXGBE:
+ switch (dev_info.pci_dev->id.device_id)
+ {
+ case 0x540d: /* T580-CR */
+ case 0x5410: /* T580-LP-cr */
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ break;
+ case 0x5403: /* T540-CR */
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+ default:
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ }
+ break;
+
+ case VNET_DPDK_PMD_MLX5:
+ {
+ char *pn_100g[] = { "MCX415A-CCAT", "MCX416A-CCAT", 0 };
+ char *pn_40g[] = { "MCX413A-BCAT", "MCX414A-BCAT",
+ "MCX415A-BCAT", "MCX416A-BCAT", "MCX4131A-BCAT", 0
+ };
+ char *pn_10g[] = { "MCX4111A-XCAT", "MCX4121A-XCAT", 0 };
+
+ vlib_pci_device_t *pd = vlib_get_pci_device (&pci_addr);
+ u8 *pn = 0;
+ char **c;
+ int found = 0;
+ pn = format (0, "%U%c",
+ format_vlib_pci_vpd, pd->vpd_r, "PN", 0);
+
+ if (!pn)
+ break;
+
+ c = pn_100g;
+ while (!found && c[0])
+ {
+ if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0)
+ {
+		      xd->port_type = VNET_DPDK_PORT_TYPE_ETH_100G;
+		      found = 1;
+		      break;
+ }
+ c++;
+ }
+
+ c = pn_40g;
+ while (!found && c[0])
+ {
+ if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0)
+ {
+		    xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+		    found = 1;
+		    break;
+ }
+ c++;
+ }
+
+ c = pn_10g;
+ while (!found && c[0])
+ {
+ if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0)
+ {
+		    xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+		    found = 1;
+		    break;
+ }
+ c++;
+ }
+
+ vec_free (pn);
+ }
+
+ break;
+ /* Intel Red Rock Canyon */
+ case VNET_DPDK_PMD_FM10K:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_SWITCH;
+ break;
+
+ /* virtio */
+ case VNET_DPDK_PMD_VIRTIO:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_VIRTIO;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_VIRTIO;
+ break;
+
+ /* vmxnet3 */
+ case VNET_DPDK_PMD_VMXNET3:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+ break;
+
+ case VNET_DPDK_PMD_AF_PACKET:
+ xd->port_type = VNET_DPDK_PORT_TYPE_AF_PACKET;
+ xd->af_packet_port_id = af_packet_port_id++;
+ break;
+
+ case VNET_DPDK_PMD_BOND:
+ xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE;
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_BOND;
+ break;
+
+ default:
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ }
+
+ if (devconf->num_rx_desc)
+ xd->nb_rx_desc = devconf->num_rx_desc;
+
+ if (devconf->num_tx_desc)
+ xd->nb_tx_desc = devconf->num_tx_desc;
+ }
+
+ /*
+ * Ensure default mtu is not > the mtu read from the hardware.
+ * Otherwise rte_eth_dev_configure() will fail and the port will
+ * not be available.
+ */
+ if (ETHERNET_MAX_PACKET_BYTES > dev_info.max_rx_pktlen)
+ {
+ /*
+	   * This device does not support the platform's max frame
+	   * size. Use its advertised mru instead.
+ */
+ xd->port_conf.rxmode.max_rx_pkt_len = dev_info.max_rx_pktlen;
+ }
+ else
+ {
+ xd->port_conf.rxmode.max_rx_pkt_len = ETHERNET_MAX_PACKET_BYTES;
+
+ /*
+ * Some platforms do not account for Ethernet FCS (4 bytes) in
+	   * MTU calculations. To interoperate with them, increase the mru, but only
+ * if the device's settings can support it.
+ */
+ if ((dev_info.max_rx_pktlen >= (ETHERNET_MAX_PACKET_BYTES + 4)) &&
+ xd->port_conf.rxmode.hw_strip_crc)
+ {
+ /*
+ * Allow additional 4 bytes (for Ethernet FCS). These bytes are
+ * stripped by h/w and so will not consume any buffer memory.
+ */
+ xd->port_conf.rxmode.max_rx_pkt_len += 4;
+ }
+ }
+
+ if (xd->pmd == VNET_DPDK_PMD_AF_PACKET)
+ {
+ f64 now = vlib_time_now (vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+ clib_memcpy (addr + 2, &rnd, sizeof (rnd));
+ addr[0] = 2;
+ addr[1] = 0xfe;
+ }
+ else
+ rte_eth_macaddr_get (i, (struct ether_addr *) addr);
+
+ if (xd->tx_q_used < tm->n_vlib_mains)
+ dpdk_device_lock_init (xd);
+
+ xd->device_index = xd - dm->devices;
+ ASSERT (i == xd->device_index);
+ xd->per_interface_next_index = ~0;
+
+ /* assign interface to input thread */
+ dpdk_device_and_queue_t *dq;
+ int q;
+
+ if (devconf->workers)
+ {
+ int i;
+ q = 0;
+ /* *INDENT-OFF* */
+ clib_bitmap_foreach (i, devconf->workers, ({
+ int cpu = dm->input_cpu_first_index + i;
+ unsigned lcore = vlib_worker_threads[cpu].lcore_id;
+ vec_validate(xd->cpu_socket_id_by_queue, q);
+ xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id(lcore);
+ vec_add2(dm->devices_by_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = q++;
+ }));
+ /* *INDENT-ON* */
+ }
+ else
+ for (q = 0; q < xd->rx_q_used; q++)
+ {
+ int cpu = dm->input_cpu_first_index + next_cpu;
+ unsigned lcore = vlib_worker_threads[cpu].lcore_id;
+
+ /*
+ * numa node for worker thread handling this queue
+ * needed for taking buffers from the right mempool
+ */
+ vec_validate (xd->cpu_socket_id_by_queue, q);
+ xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id (lcore);
+
+ /*
+ * construct vector of (device,queue) pairs for each worker thread
+ */
+ vec_add2 (dm->devices_by_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = q;
+
+ next_cpu++;
+ if (next_cpu == dm->input_cpu_count)
+ next_cpu = 0;
+ }
+
+
+ if (devconf->hqos_enabled)
+ {
+ xd->flags |= DPDK_DEVICE_FLAG_HQOS;
+
+ if (devconf->hqos.hqos_thread_valid)
+ {
+ int cpu = dm->hqos_cpu_first_index + devconf->hqos.hqos_thread;
+
+ if (devconf->hqos.hqos_thread >= dm->hqos_cpu_count)
+ return clib_error_return (0, "invalid HQoS thread index");
+
+ vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = 0;
+ }
+ else
+ {
+ int cpu = dm->hqos_cpu_first_index + next_hqos_cpu;
+
+ if (dm->hqos_cpu_count == 0)
+ return clib_error_return (0, "no HQoS threads available");
+
+ vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = 0;
+
+ next_hqos_cpu++;
+ if (next_hqos_cpu == dm->hqos_cpu_count)
+ next_hqos_cpu = 0;
+
+ devconf->hqos.hqos_thread_valid = 1;
+ devconf->hqos.hqos_thread = cpu;
+ }
+ }
+
+ vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j < tm->n_vlib_mains; j++)
+ {
+ vec_validate_ha (xd->tx_vectors[j], xd->nb_tx_desc,
+ sizeof (tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->tx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->rx_vectors, xd->rx_q_used,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j < xd->rx_q_used; j++)
+ {
+ vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE - 1,
+ CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->rx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->d_trace_buffers, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+
+ rv = dpdk_port_setup (dm, xd);
+
+ if (rv)
+ return rv;
+
+ if (devconf->hqos_enabled)
+ {
+ rv = dpdk_port_setup_hqos (xd, &devconf->hqos);
+ if (rv)
+ return rv;
+ }
+
+ /* count the number of descriptors used for this device */
+ nb_desc += xd->nb_rx_desc + xd->nb_tx_desc * xd->tx_q_used;
+
+ error = ethernet_register_interface
+ (dm->vnet_main, dpdk_device_class.index, xd->device_index,
+ /* ethernet address */ addr,
+ &xd->vlib_hw_if_index, dpdk_flag_change);
+ if (error)
+ return error;
+
+ sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+ xd->vlib_sw_if_index = sw->sw_if_index;
+ hi = vnet_get_hw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+
+ /*
+ * DAW-FIXME: The Cisco VIC firmware does not provide an api for a
+ * driver to dynamically change the mtu. If/when the
+ * VIC firmware gets fixed, then this should be removed.
+ */
+ if (xd->pmd == VNET_DPDK_PMD_ENIC)
+ {
+ /*
+ * Initialize mtu to what has been set by CIMC in the firmware cfg.
+ */
+ hi->max_packet_bytes = dev_info.max_rx_pktlen;
+ if (devconf->vlan_strip_offload != DPDK_DEVICE_VLAN_STRIP_OFF)
+ vlan_strip = 1; /* remove vlan tag from VIC port by default */
+ else
+ clib_warning ("VLAN strip disabled for interface\n");
+ }
+ else if (devconf->vlan_strip_offload == DPDK_DEVICE_VLAN_STRIP_ON)
+ vlan_strip = 1;
+
+ if (vlan_strip)
+ {
+ int vlan_off;
+ vlan_off = rte_eth_dev_get_vlan_offload (xd->device_index);
+ vlan_off |= ETH_VLAN_STRIP_OFFLOAD;
+ xd->port_conf.rxmode.hw_vlan_strip = vlan_off;
+ if (rte_eth_dev_set_vlan_offload (xd->device_index, vlan_off) == 0)
+ clib_warning ("VLAN strip enabled for interface\n");
+ else
+ clib_warning ("VLAN strip cannot be supported by interface\n");
+ }
+
+ hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] =
+ xd->port_conf.rxmode.max_rx_pkt_len - sizeof (ethernet_header_t);
+
+ rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes);
+ }
+
+ if (nb_desc > dm->conf->num_mbufs)
+ clib_warning ("%d mbufs allocated but total rx/tx ring size is %d\n",
+ dm->conf->num_mbufs, nb_desc);
+
+ return 0;
+}
+
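+/* Scan the PCI bus and bind supported (or explicitly whitelisted) Ethernet
+ * devices to the configured UIO driver; bind failures blacklist the device. */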
+static void
+dpdk_bind_devices_to_uio (dpdk_config_main_t * conf)
+{
+ vlib_pci_main_t *pm = &pci_main;
+ clib_error_t *error;
+ vlib_pci_device_t *d;
+ u8 *pci_addr = 0;
+ int num_whitelisted = vec_len (conf->dev_confs);
+
+ /* *INDENT-OFF* */
+ pool_foreach (d, pm->pci_devs, ({
+ dpdk_device_config_t * devconf = 0;
+ vec_reset_length (pci_addr);
+ pci_addr = format (pci_addr, "%U%c", format_vlib_pci_addr, &d->bus_address, 0);
+
+ if (d->device_class != PCI_CLASS_NETWORK_ETHERNET)
+ continue;
+
+ if (num_whitelisted)
+ {
+ uword * p = hash_get (conf->device_config_index_by_pci_addr, d->bus_address.as_u32);
+
+ if (!p)
+ continue;
+
+ devconf = pool_elt_at_index (conf->dev_confs, p[0]);
+ }
+
+ /* virtio */
+ if (d->vendor_id == 0x1af4 && d->device_id == 0x1000)
+ ;
+ /* vmxnet3 */
+ else if (d->vendor_id == 0x15ad && d->device_id == 0x07b0)
+ ;
+ /* all Intel devices */
+ else if (d->vendor_id == 0x8086)
+ ;
+ /* Cisco VIC */
+ else if (d->vendor_id == 0x1137 && d->device_id == 0x0043)
+ ;
+ /* Chelsio T4/T5 */
+ else if (d->vendor_id == 0x1425 && (d->device_id & 0xe000) == 0x4000)
+ ;
+ else
+ {
+ clib_warning ("Unsupported Ethernet PCI device 0x%04x:0x%04x found "
+ "at PCI address %s\n", (u16) d->vendor_id, (u16) d->device_id,
+ pci_addr);
+ continue;
+ }
+
+ error = vlib_pci_bind_to_uio (d, (char *) conf->uio_driver_name);
+
+ if (error)
+ {
+ if (devconf == 0)
+ {
+ pool_get (conf->dev_confs, devconf);
+ hash_set (conf->device_config_index_by_pci_addr, d->bus_address.as_u32,
+ devconf - conf->dev_confs);
+ devconf->pci_addr.as_u32 = d->bus_address.as_u32;
+ }
+ devconf->is_blacklisted = 1;
+ clib_error_report (error);
+ }
+ }));
+ /* *INDENT-ON* */
+ vec_free (pci_addr);
+}
+
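+/* Parse a per-device "dev ..." (or "dev default ...") configuration stanza:
+ * queue/descriptor counts, worker placement, RSS, VLAN strip and HQoS. */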
+static clib_error_t *
+dpdk_device_config (dpdk_config_main_t * conf, vlib_pci_addr_t pci_addr,
+ unformat_input_t * input, u8 is_default)
+{
+ clib_error_t *error = 0;
+ uword *p;
+ dpdk_device_config_t *devconf;
+ unformat_input_t sub_input;
+
+ if (is_default)
+ {
+ devconf = &conf->default_devconf;
+ }
+ else
+ {
+ p = hash_get (conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+
+ if (!p)
+ {
+ pool_get (conf->dev_confs, devconf);
+ hash_set (conf->device_config_index_by_pci_addr, pci_addr.as_u32,
+ devconf - conf->dev_confs);
+ }
+ else
+ return clib_error_return (0,
+ "duplicate configuration for PCI address %U",
+ format_vlib_pci_addr, &pci_addr);
+ }
+
+ devconf->pci_addr.as_u32 = pci_addr.as_u32;
+ devconf->hqos_enabled = 0;
+ dpdk_device_config_hqos_default (&devconf->hqos);
+
+ if (!input)
+ return 0;
+
+ unformat_skip_white_space (input);
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "num-rx-queues %u", &devconf->num_rx_queues))
+ ;
+ else if (unformat (input, "num-tx-queues %u", &devconf->num_tx_queues))
+ ;
+ else if (unformat (input, "num-rx-desc %u", &devconf->num_rx_desc))
+ ;
+ else if (unformat (input, "num-tx-desc %u", &devconf->num_tx_desc))
+ ;
+ else if (unformat (input, "workers %U", unformat_bitmap_list,
+ &devconf->workers))
+ ;
+ else
+ if (unformat
+ (input, "rss %U", unformat_vlib_cli_sub_input, &sub_input))
+ {
+ error = unformat_rss_fn (&sub_input, &devconf->rss_fn);
+ if (error)
+ break;
+ }
+ else if (unformat (input, "vlan-strip-offload off"))
+ devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_OFF;
+ else if (unformat (input, "vlan-strip-offload on"))
+ devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_ON;
+ else
+ if (unformat
+ (input, "hqos %U", unformat_vlib_cli_sub_input, &sub_input))
+ {
+ devconf->hqos_enabled = 1;
+ error = unformat_hqos (&sub_input, &devconf->hqos);
+ if (error)
+ break;
+ }
+ else if (unformat (input, "hqos"))
+ {
+ devconf->hqos_enabled = 1;
+ }
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ break;
+ }
+ }
+
+ if (error)
+ return error;
+
+ if (devconf->workers && devconf->num_rx_queues == 0)
+ devconf->num_rx_queues = clib_bitmap_count_set_bits (devconf->workers);
+ else if (devconf->workers &&
+ clib_bitmap_count_set_bits (devconf->workers) !=
+ devconf->num_rx_queues)
+ error =
+ clib_error_return (0,
+			   "%U: number of worker threads must be "
+ "equal to number of rx queues", format_vlib_pci_addr,
+ &pci_addr);
+
+ return error;
+}
+
+static clib_error_t *
+dpdk_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ clib_error_t *error = 0;
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_config_main_t *conf = &dpdk_config_main;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_device_config_t *devconf;
+ vlib_pci_addr_t pci_addr;
+ unformat_input_t sub_input;
+ u8 *s, *tmp = 0;
+ u8 *rte_cmd = 0, *ethname = 0;
+ u32 log_level;
+ int ret, i;
+ int num_whitelisted = 0;
+ u8 no_pci = 0;
+ u8 no_huge = 0;
+ u8 huge_dir = 0;
+ u8 file_prefix = 0;
+ u8 *socket_mem = 0;
+
+ conf->device_config_index_by_pci_addr = hash_create (0, sizeof (uword));
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ /* Prime the pump */
+ if (unformat (input, "no-hugetlb"))
+ {
+ vec_add1 (conf->eal_init_args, (u8 *) "no-huge");
+ no_huge = 1;
+ }
+
+ else if (unformat (input, "enable-tcp-udp-checksum"))
+ conf->enable_tcp_udp_checksum = 1;
+
+ else if (unformat (input, "decimal-interface-names"))
+ conf->interface_name_format_decimal = 1;
+
+ else if (unformat (input, "no-multi-seg"))
+ conf->no_multi_seg = 1;
+
+ else if (unformat (input, "dev default %U", unformat_vlib_cli_sub_input,
+ &sub_input))
+ {
+ error =
+ dpdk_device_config (conf, (vlib_pci_addr_t) (u32) ~ 1, &sub_input,
+ 1);
+
+ if (error)
+ return error;
+ }
+ else
+ if (unformat
+ (input, "dev %U %U", unformat_vlib_pci_addr, &pci_addr,
+ unformat_vlib_cli_sub_input, &sub_input))
+ {
+ error = dpdk_device_config (conf, pci_addr, &sub_input, 0);
+
+ if (error)
+ return error;
+
+ num_whitelisted++;
+ }
+ else if (unformat (input, "dev %U", unformat_vlib_pci_addr, &pci_addr))
+ {
+ error = dpdk_device_config (conf, pci_addr, 0, 0);
+
+ if (error)
+ return error;
+
+ num_whitelisted++;
+ }
+ else if (unformat (input, "num-mbufs %d", &conf->num_mbufs))
+ ;
+ else if (unformat (input, "kni %d", &conf->num_kni))
+ ;
+ else if (unformat (input, "uio-driver %s", &conf->uio_driver_name))
+ ;
+ else if (unformat (input, "socket-mem %s", &socket_mem))
+ ;
+ else if (unformat (input, "no-pci"))
+ {
+ no_pci = 1;
+ tmp = format (0, "--no-pci%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ }
+ else if (unformat (input, "poll-sleep %d", &dm->poll_sleep))
+ ;
+
+#define _(a) \
+ else if (unformat(input, #a)) \
+ { \
+ tmp = format (0, "--%s%c", #a, 0); \
+ vec_add1 (conf->eal_init_args, tmp); \
+ }
+ foreach_eal_double_hyphen_predicate_arg
+#undef _
+#define _(a) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ if (!strncmp(#a, "huge-dir", 8)) \
+ huge_dir = 1; \
+ else if (!strncmp(#a, "file-prefix", 11)) \
+ file_prefix = 1; \
+ tmp = format (0, "--%s%c", #a, 0); \
+ vec_add1 (conf->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (conf->eal_init_args, s); \
+ }
+ foreach_eal_double_hyphen_arg
+#undef _
+#define _(a,b) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ tmp = format (0, "-%s%c", #b, 0); \
+ vec_add1 (conf->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (conf->eal_init_args, s); \
+ }
+ foreach_eal_single_hyphen_arg
+#undef _
+#define _(a,b) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ tmp = format (0, "-%s%c", #b, 0); \
+ vec_add1 (conf->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (conf->eal_init_args, s); \
+ conf->a##_set_manually = 1; \
+ }
+ foreach_eal_single_hyphen_mandatory_arg
+#undef _
+ else if (unformat (input, "default"))
+ ;
+
+ else if (unformat_skip_white_space (input))
+ ;
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ }
+
+ if (!conf->uio_driver_name)
+ conf->uio_driver_name = format (0, "igb_uio%c", 0);
+
+ /*
+ * Use 1G huge pages if available.
+ */
+ if (!no_huge && !huge_dir)
+ {
+ u32 x, *mem_by_socket = 0;
+ uword c = 0;
+ u8 use_1g = 1;
+ u8 use_2m = 1;
+ u8 less_than_1g = 1;
+ int rv;
+
+ umount (DEFAULT_HUGE_DIR);
+
+ /* Process "socket-mem" parameter value */
+ if (vec_len (socket_mem))
+ {
+ unformat_input_t in;
+ unformat_init_vector (&in, socket_mem);
+ while (unformat_check_input (&in) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (&in, "%u,", &x))
+ ;
+ else if (unformat (&in, "%u", &x))
+ ;
+ else if (unformat (&in, ","))
+ x = 0;
+ else
+ break;
+
+ vec_add1 (mem_by_socket, x);
+
+ if (x > 1023)
+ less_than_1g = 0;
+ }
+ /* Note: unformat_free vec_frees(in.buffer), aka socket_mem... */
+ unformat_free (&in);
+ socket_mem = 0;
+ }
+ else
+ {
+ /* *INDENT-OFF* */
+ clib_bitmap_foreach (c, tm->cpu_socket_bitmap, (
+ {
+ vec_validate(mem_by_socket, c);
+ mem_by_socket[c] = 256; /* default per-socket mem */
+ }
+ ));
+ /* *INDENT-ON* */
+ }
+
+      /* check whether enough 1G or 2M huge pages are available for each socket */
+ /* *INDENT-OFF* */
+ clib_bitmap_foreach (c, tm->cpu_socket_bitmap, (
+ {
+ int pages_avail, page_size, mem;
+
+ vec_validate(mem_by_socket, c);
+ mem = mem_by_socket[c];
+
+ page_size = 1024;
+ pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024);
+
+ if (pages_avail < 0 || page_size * pages_avail < mem)
+ use_1g = 0;
+
+ page_size = 2;
+ pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024);
+
+ if (pages_avail < 0 || page_size * pages_avail < mem)
+ use_2m = 0;
+ }));
+ /* *INDENT-ON* */
+
+ if (mem_by_socket == 0)
+ {
+ error = clib_error_return (0, "mem_by_socket NULL");
+ goto done;
+ }
+ _vec_len (mem_by_socket) = c + 1;
+
+ /* regenerate socket_mem string */
+ vec_foreach_index (x, mem_by_socket)
+ socket_mem = format (socket_mem, "%s%u",
+ socket_mem ? "," : "", mem_by_socket[x]);
+ socket_mem = format (socket_mem, "%c", 0);
+
+ vec_free (mem_by_socket);
+
+ rv = mkdir (VPP_RUN_DIR, 0755);
+ if (rv && errno != EEXIST)
+ {
+ error = clib_error_return (0, "mkdir '%s' failed errno %d",
+ VPP_RUN_DIR, errno);
+ goto done;
+ }
+
+ rv = mkdir (DEFAULT_HUGE_DIR, 0755);
+ if (rv && errno != EEXIST)
+ {
+ error = clib_error_return (0, "mkdir '%s' failed errno %d",
+ DEFAULT_HUGE_DIR, errno);
+ goto done;
+ }
+
+ if (use_1g && !(less_than_1g && use_2m))
+ {
+ rv =
+ mount ("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, "pagesize=1G");
+ }
+ else if (use_2m)
+ {
+ rv = mount ("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, NULL);
+ }
+ else
+ {
+ return clib_error_return (0, "not enough free huge pages");
+ }
+
+ if (rv)
+ {
+ error = clib_error_return (0, "mount failed %d", errno);
+ goto done;
+ }
+
+ tmp = format (0, "--huge-dir%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%s%c", DEFAULT_HUGE_DIR, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ if (!file_prefix)
+ {
+ tmp = format (0, "--file-prefix%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "vpp%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ }
+ }
+
+ vec_free (rte_cmd);
+ vec_free (ethname);
+
+ if (error)
+ return error;
+
+ /* I'll bet that -c and -n must be the first and second args... */
+ if (!conf->coremask_set_manually)
+ {
+ vlib_thread_registration_t *tr;
+ uword *coremask = 0;
+ int i;
+
+ /* main thread core */
+ coremask = clib_bitmap_set (coremask, tm->main_lcore, 1);
+
+ for (i = 0; i < vec_len (tm->registrations); i++)
+ {
+ tr = tm->registrations[i];
+ coremask = clib_bitmap_or (coremask, tr->coremask);
+ }
+
+ vec_insert (conf->eal_init_args, 2, 1);
+ conf->eal_init_args[1] = (u8 *) "-c";
+ tmp = format (0, "%U%c", format_bitmap_hex, coremask, 0);
+ conf->eal_init_args[2] = tmp;
+ clib_bitmap_free (coremask);
+ }
+
+ if (!conf->nchannels_set_manually)
+ {
+ vec_insert (conf->eal_init_args, 2, 3);
+ conf->eal_init_args[3] = (u8 *) "-n";
+ tmp = format (0, "%d", conf->nchannels);
+ conf->eal_init_args[4] = tmp;
+ }
+
+ if (no_pci == 0 && geteuid () == 0)
+ dpdk_bind_devices_to_uio (conf);
+
+#define _(x) \
+ if (devconf->x == 0 && conf->default_devconf.x > 0) \
+ devconf->x = conf->default_devconf.x ;
+
+ /* *INDENT-OFF* */
+ pool_foreach (devconf, conf->dev_confs, ({
+
+ /* default per-device config items */
+ foreach_dpdk_device_config_item
+
+ /* add DPDK EAL whitelist/blacklist entry */
+ if (num_whitelisted > 0 && devconf->is_blacklisted == 0)
+ {
+ tmp = format (0, "-w%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ }
+ else if (num_whitelisted == 0 && devconf->is_blacklisted != 0)
+ {
+ tmp = format (0, "-b%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ }
+ }));
+ /* *INDENT-ON* */
+
+#undef _
+
+ /* set master-lcore */
+ tmp = format (0, "--master-lcore%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%u%c", tm->main_lcore, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+
+ /* set socket-mem */
+ tmp = format (0, "--socket-mem%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%s%c", socket_mem, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+
+ /* NULL terminate the "argv" vector, in case of stupidity */
+ vec_add1 (conf->eal_init_args, 0);
+ _vec_len (conf->eal_init_args) -= 1;
+
+ /* Set up DPDK eal and packet mbuf pool early. */
+
+ log_level = (CLIB_DEBUG > 0) ? RTE_LOG_DEBUG : RTE_LOG_NOTICE;
+
+ rte_set_log_level (log_level);
+
+ vm = vlib_get_main ();
+
+  /* make a copy of the args, as rte_eal_init tends to mangle the arg array */
+ for (i = 1; i < vec_len (conf->eal_init_args); i++)
+ conf->eal_init_args_str = format (conf->eal_init_args_str, "%s ",
+ conf->eal_init_args[i]);
+
+ ret =
+ rte_eal_init (vec_len (conf->eal_init_args),
+ (char **) conf->eal_init_args);
+
+ /* lazy umount hugepages */
+ umount2 (DEFAULT_HUGE_DIR, MNT_DETACH);
+
+ if (ret < 0)
+ return clib_error_return (0, "rte_eal_init returned %d", ret);
+
+ /* Dump the physical memory layout prior to creating the mbuf_pool */
+ fprintf (stdout, "DPDK physical memory layout:\n");
+ rte_dump_physmem_layout (stdout);
+
+ /* main thread 1st */
+ error = vlib_buffer_pool_create (vm, conf->num_mbufs, rte_socket_id ());
+ if (error)
+ return error;
+
+ for (i = 0; i < RTE_MAX_LCORE; i++)
+ {
+ error = vlib_buffer_pool_create (vm, conf->num_mbufs,
+ rte_lcore_to_socket_id (i));
+ if (error)
+ return error;
+ }
+
+done:
+ return error;
+}
+
+VLIB_CONFIG_FUNCTION (dpdk_config, "dpdk");
+
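+/* Poll the link state of a PMD device and reflect any admin/speed/duplex
+ * changes into the vnet hardware interface flags. */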
+void
+dpdk_update_link_state (dpdk_device_t * xd, f64 now)
+{
+ vnet_main_t *vnm = vnet_get_main ();
+ struct rte_eth_link prev_link = xd->link;
+ u32 hw_flags = 0;
+ u8 hw_flags_chg = 0;
+
+ /* only update link state for PMD interfaces */
+ if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0)
+ return;
+
+ xd->time_last_link_update = now ? now : xd->time_last_link_update;
+ memset (&xd->link, 0, sizeof (xd->link));
+ rte_eth_link_get_nowait (xd->device_index, &xd->link);
+
+ if (LINK_STATE_ELOGS)
+ {
+ vlib_main_t *vm = vlib_get_main ();
+      ELOG_TYPE_DECLARE (e) = {
+	.format = "update-link-state: sw_if_index %d, admin_up %d, "
+	  "old link_state %d new link_state %d",
+	.format_args = "i4i1i1i1",
+      };
+
+ struct
+ {
+ u32 sw_if_index;
+ u8 admin_up;
+ u8 old_link_state;
+ u8 new_link_state;
+ } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->sw_if_index = xd->vlib_sw_if_index;
+ ed->admin_up = (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0;
+ ed->old_link_state = (u8)
+ vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index);
+ ed->new_link_state = (u8) xd->link.link_status;
+ }
+
+ if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) &&
+ ((xd->link.link_status != 0) ^
+ vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index)))
+ {
+ hw_flags_chg = 1;
+ hw_flags |= (xd->link.link_status ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0);
+ }
+
+ if (hw_flags_chg || (xd->link.link_duplex != prev_link.link_duplex))
+ {
+ hw_flags_chg = 1;
+ switch (xd->link.link_duplex)
+ {
+ case ETH_LINK_HALF_DUPLEX:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_HALF_DUPLEX;
+ break;
+ case ETH_LINK_FULL_DUPLEX:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_FULL_DUPLEX;
+ break;
+ default:
+ break;
+ }
+ }
+ if (hw_flags_chg || (xd->link.link_speed != prev_link.link_speed))
+ {
+ hw_flags_chg = 1;
+ switch (xd->link.link_speed)
+ {
+ case ETH_SPEED_NUM_10M:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10M;
+ break;
+ case ETH_SPEED_NUM_100M:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_100M;
+ break;
+ case ETH_SPEED_NUM_1G:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_1G;
+ break;
+ case ETH_SPEED_NUM_10G:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10G;
+ break;
+ case ETH_SPEED_NUM_40G:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_40G;
+ break;
+ case 0:
+ break;
+ default:
+ clib_warning ("unknown link speed %d", xd->link.link_speed);
+ break;
+ }
+ }
+ if (hw_flags_chg)
+ {
+ if (LINK_STATE_ELOGS)
+ {
+ vlib_main_t *vm = vlib_get_main ();
+
+	  ELOG_TYPE_DECLARE (e) = {
+	    .format = "update-link-state: sw_if_index %d, new flags %d",
+	    .format_args = "i4i4",
+	  };
+
+ struct
+ {
+ u32 sw_if_index;
+ u32 flags;
+ } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->sw_if_index = xd->vlib_sw_if_index;
+ ed->flags = hw_flags;
+ }
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, hw_flags);
+ }
+}
+
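+/* DPDK process node: completes device initialization, releases the worker
+ * threads, performs one-time bond interface fixups, then periodically polls
+ * per-device counters and link state. */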
+static uword
+dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+ clib_error_t *error;
+ vnet_main_t *vnm = vnet_get_main ();
+ dpdk_main_t *dm = &dpdk_main;
+ ethernet_main_t *em = &ethernet_main;
+ dpdk_device_t *xd;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ int i;
+
+ error = dpdk_lib_init (dm);
+
+ /*
+ * Turn on the input node if we found some devices to drive
+ * and we're not running worker threads or i/o threads
+ */
+
+ if (error == 0 && vec_len (dm->devices) > 0)
+ {
+ if (tm->n_vlib_mains == 1)
+ vlib_node_set_state (vm, dpdk_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ else
+ for (i = 0; i < tm->n_vlib_mains; i++)
+ if (vec_len (dm->devices_by_cpu[i]) > 0)
+ vlib_node_set_state (vlib_mains[i], dpdk_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ }
+
+ if (error)
+ clib_error_report (error);
+
+ tm->worker_thread_release = 1;
+
+ f64 now = vlib_time_now (vm);
+ vec_foreach (xd, dm->devices)
+ {
+ dpdk_update_link_state (xd, now);
+ }
+
+ {
+ /*
+ * Extra set up for bond interfaces:
+     * 1. Set up MACs for bond interfaces and their slave links; this was
+     *    done in dpdk_port_setup() but needs to be repeated here to take effect.
+ * 2. Set up info for bond interface related CLI support.
+ */
+ int nports = rte_eth_dev_count ();
+ if (nports > 0)
+ {
+ for (i = 0; i < nports; i++)
+ {
+ struct rte_eth_dev_info dev_info;
+ rte_eth_dev_info_get (i, &dev_info);
+ if (!dev_info.driver_name)
+#if RTE_VERSION < RTE_VERSION_NUM(16, 11, 0, 0)
+ dev_info.driver_name = dev_info.pci_dev->driver->name;
+#else
+ dev_info.driver_name = dev_info.pci_dev->driver->driver.name;
+#endif
+ ASSERT (dev_info.driver_name);
+ if (strncmp (dev_info.driver_name, "rte_bond_pmd", 12) == 0)
+ {
+ u8 addr[6];
+ u8 slink[16];
+ int nlink = rte_eth_bond_slaves_get (i, slink, 16);
+ if (nlink > 0)
+ {
+ vnet_hw_interface_t *bhi;
+ ethernet_interface_t *bei;
+ int rv;
+
+ /* Get MAC of 1st slave link */
+ rte_eth_macaddr_get (slink[0],
+ (struct ether_addr *) addr);
+		      /* Set MAC of bonded interface to that of 1st slave link */
+ rv =
+ rte_eth_bond_mac_address_set (i,
+ (struct ether_addr *)
+ addr);
+ if (rv < 0)
+ clib_warning ("Failed to set MAC address");
+
+ /* Populate MAC of bonded interface in VPP hw tables */
+ bhi =
+ vnet_get_hw_interface (vnm,
+ dm->devices[i].vlib_hw_if_index);
+ bei =
+ pool_elt_at_index (em->interfaces, bhi->hw_instance);
+ clib_memcpy (bhi->hw_address, addr, 6);
+ clib_memcpy (bei->address, addr, 6);
+ /* Init l3 packet size allowed on bonded interface */
+ bhi->max_packet_bytes = ETHERNET_MAX_PACKET_BYTES;
+ bhi->max_l3_packet_bytes[VLIB_RX] =
+ bhi->max_l3_packet_bytes[VLIB_TX] =
+ ETHERNET_MAX_PACKET_BYTES - sizeof (ethernet_header_t);
+ while (nlink >= 1)
+ { /* for all slave links */
+ int slave = slink[--nlink];
+ dpdk_device_t *sdev = &dm->devices[slave];
+ vnet_hw_interface_t *shi;
+ vnet_sw_interface_t *ssi;
+ /* Add MAC to all slave links except the first one */
+ if (nlink)
+ rte_eth_dev_mac_addr_add (slave,
+ (struct ether_addr *)
+ addr, 0);
+ /* Set slaves bitmap for bonded interface */
+ bhi->bond_info =
+ clib_bitmap_set (bhi->bond_info,
+ sdev->vlib_hw_if_index, 1);
+ /* Set slave link flags on slave interface */
+ shi =
+ vnet_get_hw_interface (vnm, sdev->vlib_hw_if_index);
+ ssi =
+ vnet_get_sw_interface (vnm, sdev->vlib_sw_if_index);
+ shi->bond_info = VNET_HW_INTERFACE_BOND_INFO_SLAVE;
+ ssi->flags |= VNET_SW_INTERFACE_FLAG_BOND_SLAVE;
+
+ /* Set l3 packet size allowed as the lowest of slave */
+ if (bhi->max_l3_packet_bytes[VLIB_RX] >
+ shi->max_l3_packet_bytes[VLIB_RX])
+ bhi->max_l3_packet_bytes[VLIB_RX] =
+ bhi->max_l3_packet_bytes[VLIB_TX] =
+ shi->max_l3_packet_bytes[VLIB_RX];
+
+ /* Set max packet size allowed as the lowest of slave */
+ if (bhi->max_packet_bytes > shi->max_packet_bytes)
+ bhi->max_packet_bytes = shi->max_packet_bytes;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ while (1)
+ {
+ /*
+ * check each time through the loop in case intervals are changed
+ */
+ f64 min_wait = dm->link_state_poll_interval < dm->stat_poll_interval ?
+ dm->link_state_poll_interval : dm->stat_poll_interval;
+
+ vlib_process_wait_for_event_or_clock (vm, min_wait);
+
+ if (dm->admin_up_down_in_progress)
+ /* skip the poll if an admin up down is in progress (on any interface) */
+ continue;
+
+ vec_foreach (xd, dm->devices)
+ {
+ f64 now = vlib_time_now (vm);
+ if ((now - xd->time_last_stats_update) >= dm->stat_poll_interval)
+ dpdk_update_counters (xd, now);
+ if ((now - xd->time_last_link_update) >= dm->link_state_poll_interval)
+ dpdk_update_link_state (xd, now);
+
+ }
+ }
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (dpdk_process_node,static) = {
+ .function = dpdk_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "dpdk-process",
+ .process_log2_n_stack_bytes = 17,
+};
+/* *INDENT-ON* */
+
+int
+dpdk_set_stat_poll_interval (f64 interval)
+{
+ if (interval < DPDK_MIN_STATS_POLL_INTERVAL)
+ return (VNET_API_ERROR_INVALID_VALUE);
+
+ dpdk_main.stat_poll_interval = interval;
+
+ return 0;
+}
+
+int
+dpdk_set_link_state_poll_interval (f64 interval)
+{
+ if (interval < DPDK_MIN_LINK_POLL_INTERVAL)
+ return (VNET_API_ERROR_INVALID_VALUE);
+
+ dpdk_main.link_state_poll_interval = interval;
+
+ return 0;
+}
+
+clib_error_t *
+dpdk_init (vlib_main_t * vm)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ vlib_node_t *ei;
+ clib_error_t *error = 0;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+
+ /* verify that structs are cacheline aligned */
+ STATIC_ASSERT (offsetof (dpdk_device_t, cacheline0) == 0,
+ "Cache line marker must be 1st element in dpdk_device_t");
+ STATIC_ASSERT (offsetof (dpdk_device_t, cacheline1) ==
+ CLIB_CACHE_LINE_BYTES,
+ "Data in cache line 0 is bigger than cache line size");
+ STATIC_ASSERT (offsetof (dpdk_worker_t, cacheline0) == 0,
+ "Cache line marker must be 1st element in dpdk_worker_t");
+ STATIC_ASSERT (offsetof (frame_queue_trace_t, cacheline0) == 0,
+ "Cache line marker must be 1st element in frame_queue_trace_t");
+
+ dm->vlib_main = vm;
+ dm->vnet_main = vnet_get_main ();
+ dm->conf = &dpdk_config_main;
+
+ ei = vlib_get_node_by_name (vm, (u8 *) "ethernet-input");
+ if (ei == 0)
+ return clib_error_return (0, "ethernet-input node AWOL");
+
+ dm->ethernet_input_node_index = ei->index;
+
+ dm->conf->nchannels = 4;
+ dm->conf->num_mbufs = dm->conf->num_mbufs ? dm->conf->num_mbufs : NB_MBUF;
+ vec_add1 (dm->conf->eal_init_args, (u8 *) "vnet");
+
+ dm->dpdk_device_by_kni_port_id = hash_create (0, sizeof (uword));
+ dm->vu_sw_if_index_by_listener_fd = hash_create (0, sizeof (uword));
+ dm->vu_sw_if_index_by_sock_fd = hash_create (0, sizeof (uword));
+
+ /* $$$ use n_thread_stacks since it's known-good at this point */
+ vec_validate (dm->recycle, tm->n_thread_stacks - 1);
+
+ /* Default vlib_buffer_t flags, DISABLES tcp/udp checksumming... */
+ dm->buffer_flags_template =
+ (VLIB_BUFFER_TOTAL_LENGTH_VALID | VNET_BUFFER_RTE_MBUF_VALID
+ | IP_BUFFER_L4_CHECKSUM_COMPUTED | IP_BUFFER_L4_CHECKSUM_CORRECT);
+
+ dm->stat_poll_interval = DPDK_STATS_POLL_INTERVAL;
+ dm->link_state_poll_interval = DPDK_LINK_POLL_INTERVAL;
+
+ /* init CLI */
+ if ((error = vlib_call_init_function (vm, dpdk_cli_init)))
+ return error;
+
+ return error;
+}
+
+VLIB_INIT_FUNCTION (dpdk_init);
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/ipsec/cli.c b/src/vnet/devices/dpdk/ipsec/cli.c
new file mode 100644
index 00000000000..3b634e036da
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/cli.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/devices/dpdk/ipsec/ipsec.h>
+
+static void
+dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display)
+{
+ dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ u32 i, skip_master;
+
+ if (detail_display)
+ vlib_cli_output (vm, "worker\t%10s\t%15s\tdir\tdev\tqp\n",
+ "cipher", "auth");
+ else
+ vlib_cli_output (vm, "worker\tcrypto device id(type)\n");
+
+ skip_master = vlib_num_workers () > 0;
+
+ for (i = 0; i < tm->n_vlib_mains; i++)
+ {
+ uword key, data;
+ u32 cpu_index = vlib_mains[i]->cpu_index;
+ crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index];
+ u8 *s = 0;
+
+ if (skip_master)
+ {
+ skip_master = 0;
+ continue;
+ }
+
+ if (!detail_display)
+ {
+ i32 last_cdev = -1;
+ crypto_qp_data_t *qpd;
+
+ s = format (s, "%u\t", cpu_index);
+
+ /* *INDENT-OFF* */
+ vec_foreach (qpd, cwm->qp_data)
+ {
+ u32 dev_id = qpd->dev_id;
+
+ if ((u16) last_cdev != dev_id)
+ {
+ struct rte_cryptodev_info cdev_info;
+
+ rte_cryptodev_info_get (dev_id, &cdev_info);
+
+ s = format(s, "%u(%s)\t", dev_id, cdev_info.feature_flags &
+ RTE_CRYPTODEV_FF_HW_ACCELERATED ? "HW" : "SW");
+ }
+ last_cdev = dev_id;
+ }
+ /* *INDENT-ON* */
+ vlib_cli_output (vm, "%s", s);
+ }
+ else
+ {
+ char cipher_str[15], auth_str[15];
+ struct rte_cryptodev_capabilities cap;
+ crypto_worker_qp_key_t *p_key = (crypto_worker_qp_key_t *) & key;
+ /* *INDENT-OFF* */
+ hash_foreach (key, data, cwm->algo_qp_map,
+ ({
+ cap.op = RTE_CRYPTO_OP_TYPE_SYMMETRIC;
+ cap.sym.xform_type = RTE_CRYPTO_SYM_XFORM_CIPHER;
+ cap.sym.cipher.algo = p_key->cipher_algo;
+ check_algo_is_supported (&cap, cipher_str);
+ cap.op = RTE_CRYPTO_OP_TYPE_SYMMETRIC;
+ cap.sym.xform_type = RTE_CRYPTO_SYM_XFORM_AUTH;
+ cap.sym.auth.algo = p_key->auth_algo;
+ check_algo_is_supported (&cap, auth_str);
+ vlib_cli_output (vm, "%u\t%10s\t%15s\t%3s\t%u\t%u\n",
+ vlib_mains[i]->cpu_index, cipher_str, auth_str,
+ p_key->is_outbound ? "out" : "in",
+ cwm->qp_data[data].dev_id,
+ cwm->qp_data[data].qp_id);
+ }));
+ /* *INDENT-ON* */
+ }
+ }
+}
+
+static clib_error_t *
+lcore_cryptodev_map_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ u16 detail = 0;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "verbose"))
+ detail = 1;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ dpdk_ipsec_show_mapping (vm, detail);
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (lcore_cryptodev_map, static) = {
+ .path = "show crypto device mapping",
+ .short_help =
+    "show crypto device mapping [verbose]",
+ .function = lcore_cryptodev_map_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/ipsec/crypto_node.c b/src/vnet/devices/dpdk/ipsec/crypto_node.c
new file mode 100644
index 00000000000..7b32704ec05
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/crypto_node.c
@@ -0,0 +1,210 @@
+/*
+ *------------------------------------------------------------------
+ * crypto_node.c - DPDK Cryptodev input node
+ *
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ipsec/ipsec.h>
+
+#include <vnet/devices/dpdk/ipsec/ipsec.h>
+
+#define foreach_dpdk_crypto_input_next \
+ _(DROP, "error-drop") \
+ _(ENCRYPT_POST, "dpdk-esp-encrypt-post") \
+ _(DECRYPT_POST, "dpdk-esp-decrypt-post")
+
+typedef enum
+{
+#define _(f,s) DPDK_CRYPTO_INPUT_NEXT_##f,
+ foreach_dpdk_crypto_input_next
+#undef _
+ DPDK_CRYPTO_INPUT_N_NEXT,
+} dpdk_crypto_input_next_t;
+
+#define foreach_dpdk_crypto_input_error \
+ _(DQ_COPS, "Crypto ops dequeued") \
+ _(COP_FAILED, "Crypto op failed")
+
+typedef enum
+{
+#define _(f,s) DPDK_CRYPTO_INPUT_ERROR_##f,
+ foreach_dpdk_crypto_input_error
+#undef _
+ DPDK_CRYPTO_INPUT_N_ERROR,
+} dpdk_crypto_input_error_t;
+
+static char *dpdk_crypto_input_error_strings[] = {
+#define _(n, s) s,
+ foreach_dpdk_crypto_input_error
+#undef _
+};
+
+vlib_node_registration_t dpdk_crypto_input_node;
+
+typedef struct
+{
+ u32 cdev;
+ u32 qp;
+ u32 status;
+ u32 sa_idx;
+ u32 next_index;
+} dpdk_crypto_input_trace_t;
+
+static u8 *
+format_dpdk_crypto_input_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ dpdk_crypto_input_trace_t *t = va_arg (*args, dpdk_crypto_input_trace_t *);
+
+ s = format (s, "dpdk_crypto: cryptodev-id %u queue-pair %u next-index %d",
+ t->cdev, t->qp, t->next_index);
+
+ s = format (s, " status %u sa-idx %u\n", t->status, t->sa_idx);
+
+ return s;
+}
+
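+/* Dequeue completed crypto ops from one queue pair and hand the associated
+ * buffers to the esp encrypt/decrypt post nodes (or to error-drop on failure). */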
+static_always_inline u32
+dpdk_crypto_dequeue (vlib_main_t * vm, vlib_node_runtime_t * node,
+ crypto_qp_data_t * qpd)
+{
+ u32 n_deq, *to_next = 0, next_index, n_cops, def_next_index;
+ struct rte_crypto_op **cops = qpd->cops;
+
+ if (qpd->inflights == 0)
+ return 0;
+
+ if (qpd->is_outbound)
+ def_next_index = DPDK_CRYPTO_INPUT_NEXT_ENCRYPT_POST;
+ else
+ def_next_index = DPDK_CRYPTO_INPUT_NEXT_DECRYPT_POST;
+
+ n_cops = rte_cryptodev_dequeue_burst (qpd->dev_id, qpd->qp_id,
+ cops, VLIB_FRAME_SIZE);
+ n_deq = n_cops;
+ next_index = def_next_index;
+
+ qpd->inflights -= n_cops;
+ ASSERT (qpd->inflights >= 0);
+
+ while (n_cops > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_cops > 0 && n_left_to_next > 0)
+ {
+ u32 bi0, next0;
+ vlib_buffer_t *b0 = 0;
+ struct rte_crypto_op *cop;
+ struct rte_crypto_sym_op *sym_cop;
+
+ cop = cops[0];
+ cops += 1;
+ n_cops -= 1;
+ n_left_to_next -= 1;
+
+ next0 = def_next_index;
+
+ if (PREDICT_FALSE (cop->status != RTE_CRYPTO_OP_STATUS_SUCCESS))
+ {
+ next0 = DPDK_CRYPTO_INPUT_NEXT_DROP;
+ vlib_node_increment_counter (vm, dpdk_crypto_input_node.index,
+ DPDK_CRYPTO_INPUT_ERROR_COP_FAILED,
+ 1);
+ }
+ cop->status = RTE_CRYPTO_OP_STATUS_NOT_PROCESSED;
+
+ sym_cop = (struct rte_crypto_sym_op *) (cop + 1);
+ b0 = vlib_buffer_from_rte_mbuf (sym_cop->m_src);
+ bi0 = vlib_get_buffer_index (vm, b0);
+
+ to_next[0] = bi0;
+ to_next += 1;
+
+ if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vlib_trace_next_frame (vm, node, next0);
+ dpdk_crypto_input_trace_t *tr =
+ vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->cdev = qpd->dev_id;
+ tr->qp = qpd->qp_id;
+ tr->status = cop->status;
+ tr->next_index = next0;
+ tr->sa_idx = vnet_buffer (b0)->ipsec.sad_index;
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ crypto_free_cop (qpd, qpd->cops, n_deq);
+
+ vlib_node_increment_counter (vm, dpdk_crypto_input_node.index,
+ DPDK_CRYPTO_INPUT_ERROR_DQ_COPS, n_deq);
+ return n_deq;
+}
+
+static uword
+dpdk_crypto_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 cpu_index = os_get_cpu_number ();
+ dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+ crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index];
+ crypto_qp_data_t *qpd;
+ u32 n_deq = 0;
+
+ /* *INDENT-OFF* */
+ vec_foreach (qpd, cwm->qp_data)
+ n_deq += dpdk_crypto_dequeue(vm, node, qpd);
+ /* *INDENT-ON* */
+
+ return n_deq;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (dpdk_crypto_input_node) = {
+  .function = dpdk_crypto_input_fn,
+  .name = "dpdk-crypto-input",
+  .format_trace = format_dpdk_crypto_input_trace,
+  .type = VLIB_NODE_TYPE_INPUT,
+  .state = VLIB_NODE_STATE_DISABLED,
+  .n_errors = DPDK_CRYPTO_INPUT_N_ERROR,
+  .error_strings = dpdk_crypto_input_error_strings,
+  .n_next_nodes = DPDK_CRYPTO_INPUT_N_NEXT,
+  .next_nodes =
+  {
+#define _(s,n) [DPDK_CRYPTO_INPUT_NEXT_##s] = n,
+    foreach_dpdk_crypto_input_next
+#undef _
+  },
+};
+/* *INDENT-ON* */
+
+#if DPDK_CRYPTO==1
+VLIB_NODE_FUNCTION_MULTIARCH (dpdk_crypto_input_node, dpdk_crypto_input_fn)
+#endif
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/ipsec/dir.dox b/src/vnet/devices/dpdk/ipsec/dir.dox
new file mode 100644
index 00000000000..ffebfc4d62e
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/dir.dox
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ @dir vnet/devices/dpdk/ipsec
+ @brief IPSec ESP encrypt/decrypt using DPDK Cryptodev API
+*/
diff --git a/src/vnet/devices/dpdk/ipsec/dpdk_crypto_ipsec_doc.md b/src/vnet/devices/dpdk/ipsec/dpdk_crypto_ipsec_doc.md
new file mode 100644
index 00000000000..8089696f4a0
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/dpdk_crypto_ipsec_doc.md
@@ -0,0 +1,73 @@
+# VPP IPSec implementation using DPDK Cryptodev API {#dpdk_crypto_ipsec_doc}
+
+This document describes the VPP IPsec implementation based on the DPDK Cryptodev API and how to use it.
+
+
+## VPP IPsec with DPDK Cryptodev
+
+DPDK Cryptodev is an asynchronous crypto API that supports both Hardware and Software implementations (for more details refer to [DPDK Cryptography Device Library documentation](http://dpdk.org/doc/guides/prog_guide/cryptodev_lib.html)).
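+
+At its core, the API is a pair of burst enqueue/dequeue calls per crypto device queue pair. A minimal sketch of the pattern (illustrative only; `dev_id`, `qp_id`, `ops` and `nb_ops` are assumed to have been prepared elsewhere):
+
+```
+/* Illustrative snippet: submit a burst of crypto operations;
+   the device processes them asynchronously. */
+uint16_t n_enq = rte_cryptodev_enqueue_burst (dev_id, qp_id, ops, nb_ops);
+
+/* Later, typically from a polling input node, collect completed operations. */
+uint16_t n_deq = rte_cryptodev_dequeue_burst (dev_id, qp_id, ops, nb_ops);
+```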
+
+When DPDK Cryptodev support is enabled, the node graph is modified by adding and replacing some of the nodes.
+
+The following nodes are replaced:
+* esp-encrypt -> dpdk-esp-encrypt
+* esp-decrypt -> dpdk-esp-decrypt
+
+The following nodes are added:
+* dpdk-crypto-input : polling input node, dequeuing completed crypto operations from the crypto devices.
+* dpdk-esp-encrypt-post : internal node.
+* dpdk-esp-decrypt-post : internal node.
+
+
+### How to enable VPP IPSec with DPDK Cryptodev support
+
+To enable DPDK Cryptodev support (disabled by default), the following build option is required:
+
+ vpp_uses_dpdk_cryptodev=yes
+
+A couple of ways to achieve this:
+* uncomment/add it in the platforms config (e.g. build-data/platforms/vpp.mk)
+* set the option when building vpp (e.g. make vpp_uses_dpdk_cryptodev=yes build-release)
+
+
+### Crypto Resources allocation
+
+VPP allocates crypto resources on a best-effort basis:
+* Hardware crypto resources are allocated first, then Software ones.
+* If there are not enough crypto resources for all workers, any packet reaching the ESP encrypt/decrypt nodes is dropped and the following warning is displayed:
+
+ 0: dpdk_ipsec_init: not enough cryptodevs for ipsec
+
+
+### Configuration example
+
+No special IPsec configuration is required.
+
+Once DPDK Cryptodev is enabled, the user just needs to provide cryptodevs in the startup.conf.
+
+Example startup.conf:
+
+```
+dpdk {
+ socket-mem 1024,1024
+ num-mbufs 131072
+ dev 0000:81:00.0
+ dev 0000:81:00.1
+ dev 0000:85:01.0
+ dev 0000:85:01.1
+ vdev cryptodev_aesni_mb_pmd,socket_id=1
+ vdev cryptodev_aesni_mb_pmd,socket_id=1
+}
+```
+
+In the above configuration:
+* 0000:85:01.0 and 0000:85:01.1 are crypto BDFs; they require the same driver binding as DPDK Ethernet devices, but they do not support any extra configuration options.
+* Two AESNI-MB Software Cryptodev PMDs are created in NUMA node 1.
+
+For further details refer to [DPDK Crypto Device Driver documentation](http://dpdk.org/doc/guides/cryptodevs/index.html)
+
+### Operational data
+
+The following CLI command displays the Cryptodev/Worker mapping:
+
+ show crypto device mapping [verbose]
diff --git a/src/vnet/devices/dpdk/ipsec/esp.h b/src/vnet/devices/dpdk/ipsec/esp.h
new file mode 100644
index 00000000000..7ef90c49816
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/esp.h
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __DPDK_ESP_H__
+#define __DPDK_ESP_H__
+
+#include <vnet/devices/dpdk/ipsec/ipsec.h>
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/ipsec/esp.h>
+
+typedef struct
+{
+ enum rte_crypto_cipher_algorithm algo;
+ u8 key_len;
+ u8 iv_len;
+} dpdk_esp_crypto_alg_t;
+
+typedef struct
+{
+ enum rte_crypto_auth_algorithm algo;
+ u8 trunc_size;
+} dpdk_esp_integ_alg_t;
+
+typedef struct
+{
+ dpdk_esp_crypto_alg_t *esp_crypto_algs;
+ dpdk_esp_integ_alg_t *esp_integ_algs;
+} dpdk_esp_main_t;
+
+dpdk_esp_main_t dpdk_esp_main;
+
+static_always_inline void
+dpdk_esp_init ()
+{
+ dpdk_esp_main_t *em = &dpdk_esp_main;
+ dpdk_esp_integ_alg_t *i;
+ dpdk_esp_crypto_alg_t *c;
+
+ vec_validate (em->esp_crypto_algs, IPSEC_CRYPTO_N_ALG - 1);
+
+ c = &em->esp_crypto_algs[IPSEC_CRYPTO_ALG_AES_CBC_128];
+ c->algo = RTE_CRYPTO_CIPHER_AES_CBC;
+ c->key_len = 16;
+ c->iv_len = 16;
+
+ c = &em->esp_crypto_algs[IPSEC_CRYPTO_ALG_AES_CBC_192];
+ c->algo = RTE_CRYPTO_CIPHER_AES_CBC;
+ c->key_len = 24;
+ c->iv_len = 16;
+
+ c = &em->esp_crypto_algs[IPSEC_CRYPTO_ALG_AES_CBC_256];
+ c->algo = RTE_CRYPTO_CIPHER_AES_CBC;
+ c->key_len = 32;
+ c->iv_len = 16;
+
+ c = &em->esp_crypto_algs[IPSEC_CRYPTO_ALG_AES_GCM_128];
+ c->algo = RTE_CRYPTO_CIPHER_AES_GCM;
+ c->key_len = 16;
+ c->iv_len = 8;
+
+ vec_validate (em->esp_integ_algs, IPSEC_INTEG_N_ALG - 1);
+
+ i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA1_96];
+ i->algo = RTE_CRYPTO_AUTH_SHA1_HMAC;
+ i->trunc_size = 12;
+
+ i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA_256_96];
+ i->algo = RTE_CRYPTO_AUTH_SHA256_HMAC;
+ i->trunc_size = 12;
+
+ i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA_256_128];
+ i->algo = RTE_CRYPTO_AUTH_SHA256_HMAC;
+ i->trunc_size = 16;
+
+ i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA_384_192];
+ i->algo = RTE_CRYPTO_AUTH_SHA384_HMAC;
+ i->trunc_size = 24;
+
+ i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA_512_256];
+ i->algo = RTE_CRYPTO_AUTH_SHA512_HMAC;
+ i->trunc_size = 32;
+
+ i = &em->esp_integ_algs[IPSEC_INTEG_ALG_AES_GCM_128];
+ i->algo = RTE_CRYPTO_AUTH_AES_GCM;
+ i->trunc_size = 16;
+}
+
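+/*
+ * Allocate (is_add) or free (!is_add) the inbound and outbound session slots
+ * for the given SA on every worker, skipping the main thread when workers
+ * are configured.
+ */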
+static_always_inline int
+add_del_sa_sess (u32 sa_index, u8 is_add)
+{
+ dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+ crypto_worker_main_t *cwm;
+ u8 skip_master = vlib_num_workers () > 0;
+
+ /* *INDENT-OFF* */
+ vec_foreach (cwm, dcm->workers_main)
+ {
+ crypto_sa_session_t *sa_sess;
+ u8 is_outbound;
+
+ if (skip_master)
+ {
+ skip_master = 0;
+ continue;
+ }
+
+ for (is_outbound = 0; is_outbound < 2; is_outbound++)
+ {
+ if (is_add)
+ {
+ pool_get (cwm->sa_sess_d[is_outbound], sa_sess);
+ }
+ else
+ {
+ u8 dev_id;
+
+ sa_sess = pool_elt_at_index (cwm->sa_sess_d[is_outbound], sa_index);
+ dev_id = cwm->qp_data[sa_sess->qp_index].dev_id;
+
+ if (!sa_sess->sess)
+ continue;
+
+ if (rte_cryptodev_sym_session_free(dev_id, sa_sess->sess))
+ {
+ clib_warning("failed to free session");
+ return -1;
+ }
+ memset(sa_sess, 0, sizeof(sa_sess[0]));
+ }
+ }
+ }
+  /* *INDENT-ON* */
+
+ return 0;
+}
+
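+/* Map a VPP IPsec cipher algorithm to the corresponding DPDK cipher
+ * transform; returns -1 if the algorithm is not supported. */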
+static_always_inline int
+translate_crypto_algo(ipsec_crypto_alg_t crypto_algo,
+ struct rte_crypto_sym_xform *cipher_xform)
+{
+ switch (crypto_algo)
+ {
+ case IPSEC_CRYPTO_ALG_NONE:
+ cipher_xform->cipher.algo = RTE_CRYPTO_CIPHER_NULL;
+ break;
+ case IPSEC_CRYPTO_ALG_AES_CBC_128:
+ case IPSEC_CRYPTO_ALG_AES_CBC_192:
+ case IPSEC_CRYPTO_ALG_AES_CBC_256:
+ cipher_xform->cipher.algo = RTE_CRYPTO_CIPHER_AES_CBC;
+ break;
+ case IPSEC_CRYPTO_ALG_AES_GCM_128:
+ cipher_xform->cipher.algo = RTE_CRYPTO_CIPHER_AES_GCM;
+ break;
+ default:
+ return -1;
+ }
+
+ cipher_xform->type = RTE_CRYPTO_SYM_XFORM_CIPHER;
+
+ return 0;
+}
+
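+/* Map a VPP IPsec integrity algorithm to the corresponding DPDK auth
+ * transform, including digest length (and AAD length for AES-GCM with ESN);
+ * returns -1 if the algorithm is not supported. */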
+static_always_inline int
+translate_integ_algo(ipsec_integ_alg_t integ_alg,
+ struct rte_crypto_sym_xform *auth_xform, int use_esn)
+{
+ switch (integ_alg) {
+ case IPSEC_INTEG_ALG_NONE:
+ auth_xform->auth.algo = RTE_CRYPTO_AUTH_NULL;
+ auth_xform->auth.digest_length = 0;
+ break;
+ case IPSEC_INTEG_ALG_SHA1_96:
+ auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA1_HMAC;
+ auth_xform->auth.digest_length = 12;
+ break;
+ case IPSEC_INTEG_ALG_SHA_256_96:
+ auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA256_HMAC;
+ auth_xform->auth.digest_length = 12;
+ break;
+ case IPSEC_INTEG_ALG_SHA_256_128:
+ auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA256_HMAC;
+ auth_xform->auth.digest_length = 16;
+ break;
+ case IPSEC_INTEG_ALG_SHA_384_192:
+ auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA384_HMAC;
+ auth_xform->auth.digest_length = 24;
+ break;
+ case IPSEC_INTEG_ALG_SHA_512_256:
+ auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA512_HMAC;
+ auth_xform->auth.digest_length = 32;
+ break;
+ case IPSEC_INTEG_ALG_AES_GCM_128:
+ auth_xform->auth.algo = RTE_CRYPTO_AUTH_AES_GCM;
+ auth_xform->auth.digest_length = 16;
+ auth_xform->auth.add_auth_data_length = use_esn? 12 : 8;
+ break;
+ default:
+ return -1;
+ }
+
+ auth_xform->type = RTE_CRYPTO_SYM_XFORM_AUTH;
+
+ return 0;
+}
+
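+/*
+ * Build the cipher and auth transforms for the SA, look up the worker queue
+ * pair that serves this algorithm pair and direction, and create the
+ * cryptodev symmetric session (cipher->auth chain for outbound, auth->cipher
+ * for inbound).
+ */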
+static_always_inline int
+create_sym_sess(ipsec_sa_t *sa, crypto_sa_session_t *sa_sess, u8 is_outbound)
+{
+ u32 cpu_index = os_get_cpu_number();
+ dpdk_crypto_main_t * dcm = &dpdk_crypto_main;
+ crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index];
+ struct rte_crypto_sym_xform cipher_xform = {0};
+ struct rte_crypto_sym_xform auth_xform = {0};
+ struct rte_crypto_sym_xform *xfs;
+ uword key = 0, *data;
+ crypto_worker_qp_key_t *p_key = (crypto_worker_qp_key_t *)&key;
+
+ if (sa->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128)
+ {
+ sa->crypto_key_len -= 4;
+ clib_memcpy(&sa->salt, &sa->crypto_key[sa->crypto_key_len], 4);
+ }
+ else
+ {
+ sa->salt = (u32) rand();
+ }
+
+ cipher_xform.type = RTE_CRYPTO_SYM_XFORM_CIPHER;
+ cipher_xform.cipher.key.data = sa->crypto_key;
+ cipher_xform.cipher.key.length = sa->crypto_key_len;
+
+ auth_xform.type = RTE_CRYPTO_SYM_XFORM_AUTH;
+ auth_xform.auth.key.data = sa->integ_key;
+ auth_xform.auth.key.length = sa->integ_key_len;
+
+ if (translate_crypto_algo(sa->crypto_alg, &cipher_xform) < 0)
+ return -1;
+ p_key->cipher_algo = cipher_xform.cipher.algo;
+
+ if (translate_integ_algo(sa->integ_alg, &auth_xform, sa->use_esn) < 0)
+ return -1;
+ p_key->auth_algo = auth_xform.auth.algo;
+
+ if (is_outbound)
+ {
+ cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_ENCRYPT;
+ auth_xform.auth.op = RTE_CRYPTO_AUTH_OP_GENERATE;
+ cipher_xform.next = &auth_xform;
+ xfs = &cipher_xform;
+ }
+ else
+ {
+ cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_DECRYPT;
+ auth_xform.auth.op = RTE_CRYPTO_AUTH_OP_VERIFY;
+ auth_xform.next = &cipher_xform;
+ xfs = &auth_xform;
+ }
+
+ p_key->is_outbound = is_outbound;
+
+ data = hash_get(cwm->algo_qp_map, key);
+ if (!data)
+ return -1;
+
+ sa_sess->sess =
+ rte_cryptodev_sym_session_create(cwm->qp_data[*data].dev_id, xfs);
+
+ if (!sa_sess->sess)
+ return -1;
+
+ sa_sess->qp_index = (u8)*data;
+
+ return 0;
+}
+
+#endif /* __DPDK_ESP_H__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/ipsec/esp_decrypt.c b/src/vnet/devices/dpdk/ipsec/esp_decrypt.c
new file mode 100644
index 00000000000..89ab9f9bc43
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/esp_decrypt.c
@@ -0,0 +1,583 @@
+/*
+ * esp_decrypt.c : IPSec ESP Decrypt node using DPDK Cryptodev
+ *
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/devices/dpdk/ipsec/ipsec.h>
+#include <vnet/devices/dpdk/ipsec/esp.h>
+
+#define foreach_esp_decrypt_next \
+_(DROP, "error-drop") \
+_(IP4_INPUT, "ip4-input") \
+_(IP6_INPUT, "ip6-input")
+
+#define _(v, s) ESP_DECRYPT_NEXT_##v,
+typedef enum {
+ foreach_esp_decrypt_next
+#undef _
+ ESP_DECRYPT_N_NEXT,
+} esp_decrypt_next_t;
+
+#define foreach_esp_decrypt_error \
+ _(RX_PKTS, "ESP pkts received") \
+ _(DECRYPTION_FAILED, "ESP decryption failed") \
+ _(REPLAY, "SA replayed packet") \
+ _(NOT_IP, "Not IP packet (dropped)") \
+ _(ENQ_FAIL, "Enqueue failed (buffer full)") \
+ _(NO_CRYPTODEV, "Cryptodev not configured") \
+ _(BAD_LEN, "Invalid ciphertext length") \
+ _(UNSUPPORTED, "Cipher/Auth not supported")
+
+
+typedef enum {
+#define _(sym,str) ESP_DECRYPT_ERROR_##sym,
+ foreach_esp_decrypt_error
+#undef _
+ ESP_DECRYPT_N_ERROR,
+} esp_decrypt_error_t;
+
+static char * esp_decrypt_error_strings[] = {
+#define _(sym,string) string,
+ foreach_esp_decrypt_error
+#undef _
+};
+
+vlib_node_registration_t dpdk_esp_decrypt_node;
+
+typedef struct {
+ ipsec_crypto_alg_t crypto_alg;
+ ipsec_integ_alg_t integ_alg;
+} esp_decrypt_trace_t;
+
+/* packet trace format function */
+static u8 * format_esp_decrypt_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ esp_decrypt_trace_t * t = va_arg (*args, esp_decrypt_trace_t *);
+
+ s = format (s, "esp: crypto %U integrity %U",
+ format_ipsec_crypto_alg, t->crypto_alg,
+ format_ipsec_integ_alg, t->integ_alg);
+ return s;
+}
+
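+/*
+ * Per packet: verify anti-replay, lazily create the inbound cryptodev
+ * session, fill in the symmetric crypto op (cipher + auth) and convert the
+ * vlib buffer to an mbuf. Ops are enqueued to the crypto queue pairs at the
+ * end of the frame; completed ops are dequeued by dpdk-crypto-input and
+ * finished in dpdk-esp-decrypt-post.
+ */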
+static uword
+dpdk_esp_decrypt_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, *from, *to_next, next_index;
+ ipsec_main_t *im = &ipsec_main;
+ u32 cpu_index = os_get_cpu_number();
+ dpdk_crypto_main_t * dcm = &dpdk_crypto_main;
+ dpdk_esp_main_t * em = &dpdk_esp_main;
+ u32 i;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ if (PREDICT_FALSE(!dcm->workers_main))
+ {
+ vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+ ESP_DECRYPT_ERROR_NO_CRYPTODEV, n_left_from);
+ vlib_buffer_free(vm, from, n_left_from);
+ return n_left_from;
+ }
+
+ crypto_worker_main_t *cwm = vec_elt_at_index(dcm->workers_main, cpu_index);
+ u32 n_qps = vec_len(cwm->qp_data);
+ struct rte_crypto_op ** cops_to_enq[n_qps];
+ u32 n_cop_qp[n_qps], * bi_to_enq[n_qps];
+
+ for (i = 0; i < n_qps; i++)
+ {
+ bi_to_enq[i] = cwm->qp_data[i].bi;
+ cops_to_enq[i] = cwm->qp_data[i].cops;
+ }
+
+ memset(n_cop_qp, 0, n_qps * sizeof(u32));
+
+ crypto_alloc_cops();
+
+ next_index = ESP_DECRYPT_NEXT_DROP;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0, sa_index0 = ~0, seq, icv_size, iv_size;
+ vlib_buffer_t * b0;
+ esp_header_t * esp0;
+ ipsec_sa_t * sa0;
+ struct rte_mbuf * mb0 = 0;
+ const int BLOCK_SIZE = 16;
+ crypto_sa_session_t * sa_sess;
+ void * sess;
+ u16 qp_index;
+ struct rte_crypto_op * cop = 0;
+
+ bi0 = from[0];
+ from += 1;
+ n_left_from -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ esp0 = vlib_buffer_get_current (b0);
+
+ sa_index0 = vnet_buffer(b0)->ipsec.sad_index;
+ sa0 = pool_elt_at_index (im->sad, sa_index0);
+
+ seq = clib_host_to_net_u32(esp0->seq);
+
+ /* anti-replay check */
+ if (sa0->use_anti_replay)
+ {
+ int rv = 0;
+
+ if (PREDICT_TRUE(sa0->use_esn))
+ rv = esp_replay_check_esn(sa0, seq);
+ else
+ rv = esp_replay_check(sa0, seq);
+
+ if (PREDICT_FALSE(rv))
+ {
+ clib_warning ("anti-replay SPI %u seq %u", sa0->spi, seq);
+ vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+ ESP_DECRYPT_ERROR_REPLAY, 1);
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ goto trace;
+ }
+ }
+
+ if (PREDICT_FALSE(sa0->integ_alg == IPSEC_INTEG_ALG_NONE) ||
+ PREDICT_FALSE(sa0->crypto_alg == IPSEC_CRYPTO_ALG_NONE))
+ {
+ clib_warning ("SPI %u : only cipher + auth supported", sa0->spi);
+ vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+ ESP_DECRYPT_ERROR_UNSUPPORTED, 1);
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ goto trace;
+ }
+
+ sa_sess = pool_elt_at_index(cwm->sa_sess_d[0], sa_index0);
+
+ if (PREDICT_FALSE(!sa_sess->sess))
+ {
+ int ret = create_sym_sess(sa0, sa_sess, 0);
+ ASSERT(ret == 0);
+ }
+
+ sess = sa_sess->sess;
+ qp_index = sa_sess->qp_index;
+
+ ASSERT (vec_len (vec_elt (cwm->qp_data, qp_index).free_cops) > 0);
+ cop = vec_pop (vec_elt (cwm->qp_data, qp_index).free_cops);
+ ASSERT (cop->status == RTE_CRYPTO_OP_STATUS_NOT_PROCESSED);
+
+ cops_to_enq[qp_index][0] = cop;
+ cops_to_enq[qp_index] += 1;
+ n_cop_qp[qp_index] += 1;
+ bi_to_enq[qp_index][0] = bi0;
+ bi_to_enq[qp_index] += 1;
+
+ rte_crypto_op_attach_sym_session(cop, sess);
+
+ icv_size = em->esp_integ_algs[sa0->integ_alg].trunc_size;
+ iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len;
+
+ /* Convert vlib buffer to mbuf */
+ mb0 = rte_mbuf_from_vlib_buffer(b0);
+ mb0->data_len = b0->current_length;
+ mb0->pkt_len = b0->current_length;
+ mb0->data_off = RTE_PKTMBUF_HEADROOM + b0->current_data;
+
+ /* Outer IP header has already been stripped */
+ u16 payload_len = rte_pktmbuf_pkt_len(mb0) - sizeof (esp_header_t) -
+ iv_size - icv_size;
+
+ if ((payload_len & (BLOCK_SIZE - 1)) || (payload_len <= 0))
+ {
+ clib_warning ("payload %u not multiple of %d\n",
+ payload_len, BLOCK_SIZE);
+ vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+ ESP_DECRYPT_ERROR_BAD_LEN, 1);
+ vec_add (vec_elt (cwm->qp_data, qp_index).free_cops, &cop, 1);
+ bi_to_enq[qp_index] -= 1;
+ cops_to_enq[qp_index] -= 1;
+ n_cop_qp[qp_index] -= 1;
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ goto trace;
+ }
+
+ struct rte_crypto_sym_op *sym_cop = (struct rte_crypto_sym_op *)(cop + 1);
+
+ sym_cop->m_src = mb0;
+ sym_cop->cipher.data.offset = sizeof (esp_header_t) + iv_size;
+ sym_cop->cipher.data.length = payload_len;
+
+ u8 *iv = rte_pktmbuf_mtod_offset(mb0, void*, sizeof (esp_header_t));
+ dpdk_cop_priv_t * priv = (dpdk_cop_priv_t *)(sym_cop + 1);
+
+ if (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128)
+ {
+ dpdk_gcm_cnt_blk *icb = &priv->cb;
+ icb->salt = sa0->salt;
+ clib_memcpy(icb->iv, iv, 8);
+ icb->cnt = clib_host_to_net_u32(1);
+ sym_cop->cipher.iv.data = (u8 *)icb;
+ sym_cop->cipher.iv.phys_addr = cop->phys_addr +
+ (uintptr_t)icb - (uintptr_t)cop;
+ sym_cop->cipher.iv.length = 16;
+
+ u8 *aad = priv->aad;
+ clib_memcpy(aad, iv - sizeof(esp_header_t), 8);
+ sym_cop->auth.aad.data = aad;
+ sym_cop->auth.aad.phys_addr = cop->phys_addr +
+ (uintptr_t)aad - (uintptr_t)cop;
+ if (sa0->use_esn)
+ {
+ *((u32*)&aad[8]) = sa0->seq_hi;
+ sym_cop->auth.aad.length = 12;
+ }
+ else
+ {
+ sym_cop->auth.aad.length = 8;
+ }
+
+ sym_cop->auth.digest.data = rte_pktmbuf_mtod_offset(mb0, void*,
+ rte_pktmbuf_pkt_len(mb0) - icv_size);
+ sym_cop->auth.digest.phys_addr = rte_pktmbuf_mtophys_offset(mb0,
+ rte_pktmbuf_pkt_len(mb0) - icv_size);
+ sym_cop->auth.digest.length = icv_size;
+
+ }
+ else
+ {
+ sym_cop->cipher.iv.data = rte_pktmbuf_mtod_offset(mb0, void*,
+ sizeof (esp_header_t));
+ sym_cop->cipher.iv.phys_addr = rte_pktmbuf_mtophys_offset(mb0,
+ sizeof (esp_header_t));
+ sym_cop->cipher.iv.length = iv_size;
+
+ if (sa0->use_esn)
+ {
+ dpdk_cop_priv_t* priv = (dpdk_cop_priv_t*) (sym_cop + 1);
+ u8* payload_end = rte_pktmbuf_mtod_offset(
+ mb0, u8*, sizeof(esp_header_t) + iv_size + payload_len);
+
+ clib_memcpy (priv->icv, payload_end, icv_size);
+ *((u32*) payload_end) = sa0->seq_hi;
+ sym_cop->auth.data.offset = 0;
+ sym_cop->auth.data.length = sizeof(esp_header_t) + iv_size
+ + payload_len + sizeof(sa0->seq_hi);
+ sym_cop->auth.digest.data = priv->icv;
+ sym_cop->auth.digest.phys_addr = cop->phys_addr
+ + (uintptr_t) priv->icv - (uintptr_t) cop;
+ sym_cop->auth.digest.length = icv_size;
+ }
+ else
+ {
+ sym_cop->auth.data.offset = 0;
+ sym_cop->auth.data.length = sizeof(esp_header_t) +
+ iv_size + payload_len;
+
+ sym_cop->auth.digest.data = rte_pktmbuf_mtod_offset(mb0, void*,
+ rte_pktmbuf_pkt_len(mb0) - icv_size);
+ sym_cop->auth.digest.phys_addr = rte_pktmbuf_mtophys_offset(mb0,
+ rte_pktmbuf_pkt_len(mb0) - icv_size);
+ sym_cop->auth.digest.length = icv_size;
+ }
+ }
+
+trace:
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ esp_decrypt_trace_t *tr = vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->crypto_alg = sa0->crypto_alg;
+ tr->integ_alg = sa0->integ_alg;
+ }
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+ ESP_DECRYPT_ERROR_RX_PKTS,
+ from_frame->n_vectors);
+ crypto_qp_data_t *qpd;
+ /* *INDENT-OFF* */
+ vec_foreach_index (i, cwm->qp_data)
+ {
+ u32 enq;
+
+ qpd = vec_elt_at_index(cwm->qp_data, i);
+ enq = rte_cryptodev_enqueue_burst(qpd->dev_id, qpd->qp_id,
+ qpd->cops, n_cop_qp[i]);
+ qpd->inflights += enq;
+
+ if (PREDICT_FALSE(enq < n_cop_qp[i]))
+ {
+ crypto_free_cop (qpd, &qpd->cops[enq], n_cop_qp[i] - enq);
+ vlib_buffer_free (vm, &qpd->bi[enq], n_cop_qp[i] - enq);
+
+ vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+ ESP_DECRYPT_ERROR_ENQ_FAIL,
+ n_cop_qp[i] - enq);
+ }
+ }
+ /* *INDENT-ON* */
+
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (dpdk_esp_decrypt_node) = {
+ .function = dpdk_esp_decrypt_node_fn,
+ .name = "dpdk-esp-decrypt",
+ .vector_size = sizeof (u32),
+ .format_trace = format_esp_decrypt_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(esp_decrypt_error_strings),
+ .error_strings = esp_decrypt_error_strings,
+
+ .n_next_nodes = ESP_DECRYPT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [ESP_DECRYPT_NEXT_##s] = n,
+ foreach_esp_decrypt_next
+#undef _
+ },
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (dpdk_esp_decrypt_node, dpdk_esp_decrypt_node_fn)
+
+/*
+ * Decrypt Post Node
+ */
+
+#define foreach_esp_decrypt_post_error \
+ _(PKTS, "ESP post pkts")
+
+typedef enum {
+#define _(sym,str) ESP_DECRYPT_POST_ERROR_##sym,
+ foreach_esp_decrypt_post_error
+#undef _
+ ESP_DECRYPT_POST_N_ERROR,
+} esp_decrypt_post_error_t;
+
+static char * esp_decrypt_post_error_strings[] = {
+#define _(sym,string) string,
+ foreach_esp_decrypt_post_error
+#undef _
+};
+
+vlib_node_registration_t dpdk_esp_decrypt_post_node;
+
+static u8 * format_esp_decrypt_post_trace (u8 * s, va_list * args)
+{
+ return s;
+}
+
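+/*
+ * Runs after the cryptodev has decrypted the packet: advance the anti-replay
+ * window, strip the ESP header, IV, padding and ICV, then hand the inner
+ * packet to ip4/ip6-input (tunnel mode) or rebuild the IP header (transport
+ * mode).
+ */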
+static uword
+dpdk_esp_decrypt_post_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, *from, *to_next = 0, next_index;
+ ipsec_sa_t * sa0;
+ u32 sa_index0 = ~0;
+ ipsec_main_t *im = &ipsec_main;
+ dpdk_esp_main_t *em = &dpdk_esp_main;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ esp_footer_t * f0;
+ u32 bi0, next0, icv_size, iv_size;
+ vlib_buffer_t * b0 = 0;
+ ip4_header_t *ih4 = 0, *oh4 = 0;
+ ip6_header_t *ih6 = 0, *oh6 = 0;
+ u8 tunnel_mode = 1;
+ u8 transport_ip6 = 0;
+
+ next0 = ESP_DECRYPT_NEXT_DROP;
+
+ bi0 = from[0];
+ from += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ sa_index0 = vnet_buffer(b0)->ipsec.sad_index;
+ sa0 = pool_elt_at_index (im->sad, sa_index0);
+
+ to_next[0] = bi0;
+ to_next += 1;
+
+ icv_size = em->esp_integ_algs[sa0->integ_alg].trunc_size;
+ iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len;
+
+ if (sa0->use_anti_replay)
+ {
+ esp_header_t * esp0 = vlib_buffer_get_current (b0);
+ u32 seq;
+ seq = clib_host_to_net_u32(esp0->seq);
+ if (PREDICT_TRUE(sa0->use_esn))
+ esp_replay_advance_esn(sa0, seq);
+ else
+ esp_replay_advance(sa0, seq);
+ }
+
+ ih4 = (ip4_header_t *) (b0->data + sizeof(ethernet_header_t));
+ vlib_buffer_advance (b0, sizeof (esp_header_t) + iv_size);
+
+ b0->current_length -= (icv_size + 2);
+ b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ f0 = (esp_footer_t *) ((u8 *) vlib_buffer_get_current (b0) +
+ b0->current_length);
+ b0->current_length -= f0->pad_length;
+
+ /* transport mode */
+ if (PREDICT_FALSE(!sa0->is_tunnel && !sa0->is_tunnel_ip6))
+ {
+ tunnel_mode = 0;
+
+ if (PREDICT_TRUE((ih4->ip_version_and_header_length & 0xF0) != 0x40))
+ {
+ if (PREDICT_TRUE((ih4->ip_version_and_header_length & 0xF0) == 0x60))
+ transport_ip6 = 1;
+ else
+ {
+ clib_warning("next header: 0x%x", f0->next_header);
+ vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+ ESP_DECRYPT_ERROR_NOT_IP, 1);
+ goto trace;
+ }
+ }
+ }
+
+ if (PREDICT_TRUE (tunnel_mode))
+ {
+ if (PREDICT_TRUE(f0->next_header == IP_PROTOCOL_IP_IN_IP))
+ next0 = ESP_DECRYPT_NEXT_IP4_INPUT;
+ else if (f0->next_header == IP_PROTOCOL_IPV6)
+ next0 = ESP_DECRYPT_NEXT_IP6_INPUT;
+ else
+ {
+ clib_warning("next header: 0x%x", f0->next_header);
+ vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+ ESP_DECRYPT_ERROR_DECRYPTION_FAILED,
+ 1);
+ goto trace;
+ }
+ }
+ /* transport mode */
+ else
+ {
+ if (PREDICT_FALSE(transport_ip6))
+ {
+ ih6 = (ip6_header_t *) (b0->data + sizeof(ethernet_header_t));
+ vlib_buffer_advance (b0, -sizeof(ip6_header_t));
+ oh6 = vlib_buffer_get_current (b0);
+ memmove(oh6, ih6, sizeof(ip6_header_t));
+
+ next0 = ESP_DECRYPT_NEXT_IP6_INPUT;
+ oh6->protocol = f0->next_header;
+ oh6->payload_length =
+ clib_host_to_net_u16 (
+ vlib_buffer_length_in_chain(vm, b0) -
+ sizeof (ip6_header_t));
+ }
+ else
+ {
+ vlib_buffer_advance (b0, -sizeof(ip4_header_t));
+ oh4 = vlib_buffer_get_current (b0);
+ memmove(oh4, ih4, sizeof(ip4_header_t));
+
+ next0 = ESP_DECRYPT_NEXT_IP4_INPUT;
+ oh4->ip_version_and_header_length = 0x45;
+ oh4->fragment_id = 0;
+ oh4->flags_and_fragment_offset = 0;
+ oh4->protocol = f0->next_header;
+ oh4->length = clib_host_to_net_u16 (
+ vlib_buffer_length_in_chain (vm, b0));
+ oh4->checksum = ip4_header_checksum (oh4);
+ }
+ }
+
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32)~0;
+
+trace:
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ esp_decrypt_trace_t *tr = vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->crypto_alg = sa0->crypto_alg;
+ tr->integ_alg = sa0->integ_alg;
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next, bi0, next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, dpdk_esp_decrypt_post_node.index,
+ ESP_DECRYPT_POST_ERROR_PKTS,
+ from_frame->n_vectors);
+
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (dpdk_esp_decrypt_post_node) = {
+ .function = dpdk_esp_decrypt_post_node_fn,
+ .name = "dpdk-esp-decrypt-post",
+ .vector_size = sizeof (u32),
+ .format_trace = format_esp_decrypt_post_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(esp_decrypt_post_error_strings),
+ .error_strings = esp_decrypt_post_error_strings,
+
+ .n_next_nodes = ESP_DECRYPT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [ESP_DECRYPT_NEXT_##s] = n,
+ foreach_esp_decrypt_next
+#undef _
+ },
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (dpdk_esp_decrypt_post_node, dpdk_esp_decrypt_post_node_fn)
diff --git a/src/vnet/devices/dpdk/ipsec/esp_encrypt.c b/src/vnet/devices/dpdk/ipsec/esp_encrypt.c
new file mode 100644
index 00000000000..10bb4616eef
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/esp_encrypt.c
@@ -0,0 +1,598 @@
+/*
+ * esp_encrypt.c : IPSec ESP encrypt node using DPDK Cryptodev
+ *
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/devices/dpdk/ipsec/ipsec.h>
+#include <vnet/devices/dpdk/ipsec/esp.h>
+
+#define foreach_esp_encrypt_next \
+_(DROP, "error-drop") \
+_(IP4_LOOKUP, "ip4-lookup") \
+_(IP6_LOOKUP, "ip6-lookup") \
+_(INTERFACE_OUTPUT, "interface-output")
+
+#define _(v, s) ESP_ENCRYPT_NEXT_##v,
+typedef enum
+{
+ foreach_esp_encrypt_next
+#undef _
+ ESP_ENCRYPT_N_NEXT,
+} esp_encrypt_next_t;
+
+#define foreach_esp_encrypt_error \
+ _(RX_PKTS, "ESP pkts received") \
+ _(SEQ_CYCLED, "sequence number cycled") \
+ _(ENQ_FAIL, "Enqueue failed (buffer full)") \
+ _(NO_CRYPTODEV, "Cryptodev not configured") \
+ _(UNSUPPORTED, "Cipher/Auth not supported")
+
+
+typedef enum
+{
+#define _(sym,str) ESP_ENCRYPT_ERROR_##sym,
+ foreach_esp_encrypt_error
+#undef _
+ ESP_ENCRYPT_N_ERROR,
+} esp_encrypt_error_t;
+
+static char *esp_encrypt_error_strings[] = {
+#define _(sym,string) string,
+ foreach_esp_encrypt_error
+#undef _
+};
+
+vlib_node_registration_t dpdk_esp_encrypt_node;
+
+typedef struct
+{
+ u32 spi;
+ u32 seq;
+ ipsec_crypto_alg_t crypto_alg;
+ ipsec_integ_alg_t integ_alg;
+} esp_encrypt_trace_t;
+
+/* packet trace format function */
+static u8 *
+format_esp_encrypt_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ esp_encrypt_trace_t *t = va_arg (*args, esp_encrypt_trace_t *);
+
+ s = format (s, "esp: spi %u seq %u crypto %U integrity %U",
+ t->spi, t->seq,
+ format_ipsec_crypto_alg, t->crypto_alg,
+ format_ipsec_integ_alg, t->integ_alg);
+ return s;
+}
+
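+/*
+ * Per packet: advance the SA sequence number, lazily create the outbound
+ * cryptodev session, build the outer IP/ESP headers (tunnel mode) or insert
+ * the ESP header (transport mode), pad the payload to the cipher block size
+ * and fill in the symmetric crypto op. The chosen next node is stashed in
+ * the buffer for dpdk-esp-encrypt-post.
+ */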
+static uword
+dpdk_esp_encrypt_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, *from, *to_next, next_index;
+ ipsec_main_t *im = &ipsec_main;
+ u32 cpu_index = os_get_cpu_number ();
+ dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+ dpdk_esp_main_t *em = &dpdk_esp_main;
+ u32 i;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ if (PREDICT_FALSE (!dcm->workers_main))
+ {
+ /* Likely there are not enough cryptodevs, so drop frame */
+ vlib_node_increment_counter (vm, dpdk_esp_encrypt_node.index,
+ ESP_ENCRYPT_ERROR_NO_CRYPTODEV,
+ n_left_from);
+ vlib_buffer_free (vm, from, n_left_from);
+ return n_left_from;
+ }
+
+ crypto_worker_main_t *cwm = vec_elt_at_index (dcm->workers_main, cpu_index);
+ u32 n_qps = vec_len (cwm->qp_data);
+ struct rte_crypto_op **cops_to_enq[n_qps];
+ u32 n_cop_qp[n_qps], *bi_to_enq[n_qps];
+
+ for (i = 0; i < n_qps; i++)
+ {
+ bi_to_enq[i] = cwm->qp_data[i].bi;
+ cops_to_enq[i] = cwm->qp_data[i].cops;
+ }
+
+ memset (n_cop_qp, 0, n_qps * sizeof (u32));
+
+ crypto_alloc_cops ();
+
+ next_index = ESP_ENCRYPT_NEXT_DROP;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0, next0;
+ vlib_buffer_t *b0 = 0;
+ u32 sa_index0;
+ ipsec_sa_t *sa0;
+ ip4_and_esp_header_t *ih0, *oh0 = 0;
+ ip6_and_esp_header_t *ih6_0, *oh6_0 = 0;
+ struct rte_mbuf *mb0 = 0;
+ esp_footer_t *f0;
+ u8 is_ipv6;
+ u8 ip_hdr_size;
+ u8 next_hdr_type;
+ u8 transport_mode = 0;
+ const int BLOCK_SIZE = 16;
+ u32 iv_size;
+ u16 orig_sz;
+ crypto_sa_session_t *sa_sess;
+ void *sess;
+ struct rte_crypto_op *cop = 0;
+ u16 qp_index;
+
+ bi0 = from[0];
+ from += 1;
+ n_left_from -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ sa_index0 = vnet_buffer (b0)->ipsec.sad_index;
+ sa0 = pool_elt_at_index (im->sad, sa_index0);
+
+ if (PREDICT_FALSE (esp_seq_advance (sa0)))
+ {
+ clib_warning ("sequence number counter has cycled SPI %u",
+ sa0->spi);
+ vlib_node_increment_counter (vm, dpdk_esp_encrypt_node.index,
+ ESP_ENCRYPT_ERROR_SEQ_CYCLED, 1);
+ //TODO: rekey SA
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ goto trace;
+ }
+
+ sa_sess = pool_elt_at_index (cwm->sa_sess_d[1], sa_index0);
+ if (PREDICT_FALSE (!sa_sess->sess))
+ {
+ int ret = create_sym_sess (sa0, sa_sess, 1);
+ ASSERT (ret == 0);
+ }
+
+ qp_index = sa_sess->qp_index;
+ sess = sa_sess->sess;
+
+ ASSERT (vec_len (vec_elt (cwm->qp_data, qp_index).free_cops) > 0);
+ cop = vec_pop (vec_elt (cwm->qp_data, qp_index).free_cops);
+ ASSERT (cop->status == RTE_CRYPTO_OP_STATUS_NOT_PROCESSED);
+
+ cops_to_enq[qp_index][0] = cop;
+ cops_to_enq[qp_index] += 1;
+ n_cop_qp[qp_index] += 1;
+ bi_to_enq[qp_index][0] = bi0;
+ bi_to_enq[qp_index] += 1;
+
+ ssize_t adv;
+ iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len;
+ ih0 = vlib_buffer_get_current (b0);
+ orig_sz = b0->current_length;
+ is_ipv6 = (ih0->ip4.ip_version_and_header_length & 0xF0) == 0x60;
+ /* is ipv6 */
+ if (PREDICT_TRUE (sa0->is_tunnel))
+ {
+ if (PREDICT_TRUE (!is_ipv6))
+ adv = -sizeof (ip4_and_esp_header_t);
+ else
+ adv = -sizeof (ip6_and_esp_header_t);
+ }
+ else
+ {
+ adv = -sizeof (esp_header_t);
+ if (PREDICT_TRUE (!is_ipv6))
+ orig_sz -= sizeof (ip4_header_t);
+ else
+ orig_sz -= sizeof (ip6_header_t);
+ }
+
+ /*transport mode save the eth header before it is overwritten */
+ if (PREDICT_FALSE (!sa0->is_tunnel))
+ {
+ ethernet_header_t *ieh0 = (ethernet_header_t *)
+ ((u8 *) vlib_buffer_get_current (b0) -
+ sizeof (ethernet_header_t));
+ ethernet_header_t *oeh0 =
+ (ethernet_header_t *) ((u8 *) ieh0 + (adv - iv_size));
+ clib_memcpy (oeh0, ieh0, sizeof (ethernet_header_t));
+ }
+
+ vlib_buffer_advance (b0, adv - iv_size);
+
+ /* XXX IP6/ip4 and IP4/IP6 not supported, only IP4/IP4 and IP6/IP6 */
+
+ /* is ipv6 */
+ if (PREDICT_FALSE (is_ipv6))
+ {
+ ih6_0 = (ip6_and_esp_header_t *) ih0;
+ ip_hdr_size = sizeof (ip6_header_t);
+ oh6_0 = vlib_buffer_get_current (b0);
+
+ if (PREDICT_TRUE (sa0->is_tunnel))
+ {
+ next_hdr_type = IP_PROTOCOL_IPV6;
+ oh6_0->ip6.ip_version_traffic_class_and_flow_label =
+ ih6_0->ip6.ip_version_traffic_class_and_flow_label;
+ }
+ else
+ {
+ next_hdr_type = ih6_0->ip6.protocol;
+ memmove (oh6_0, ih6_0, sizeof (ip6_header_t));
+ }
+
+ oh6_0->ip6.protocol = IP_PROTOCOL_IPSEC_ESP;
+ oh6_0->ip6.hop_limit = 254;
+ oh6_0->esp.spi = clib_net_to_host_u32 (sa0->spi);
+ oh6_0->esp.seq = clib_net_to_host_u32 (sa0->seq);
+ }
+ else
+ {
+ ip_hdr_size = sizeof (ip4_header_t);
+ oh0 = vlib_buffer_get_current (b0);
+
+ if (PREDICT_TRUE (sa0->is_tunnel))
+ {
+ next_hdr_type = IP_PROTOCOL_IP_IN_IP;
+ oh0->ip4.tos = ih0->ip4.tos;
+ }
+ else
+ {
+ next_hdr_type = ih0->ip4.protocol;
+ memmove (oh0, ih0, sizeof (ip4_header_t));
+ }
+
+ oh0->ip4.ip_version_and_header_length = 0x45;
+ oh0->ip4.fragment_id = 0;
+ oh0->ip4.flags_and_fragment_offset = 0;
+ oh0->ip4.ttl = 254;
+ oh0->ip4.protocol = IP_PROTOCOL_IPSEC_ESP;
+ oh0->esp.spi = clib_net_to_host_u32 (sa0->spi);
+ oh0->esp.seq = clib_net_to_host_u32 (sa0->seq);
+ }
+
+ if (PREDICT_TRUE (sa0->is_tunnel && !sa0->is_tunnel_ip6))
+ {
+ oh0->ip4.src_address.as_u32 = sa0->tunnel_src_addr.ip4.as_u32;
+ oh0->ip4.dst_address.as_u32 = sa0->tunnel_dst_addr.ip4.as_u32;
+
+ /* in tunnel mode send it back to FIB */
+ next0 = ESP_ENCRYPT_NEXT_IP4_LOOKUP;
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ }
+ else if (sa0->is_tunnel && sa0->is_tunnel_ip6)
+ {
+ oh6_0->ip6.src_address.as_u64[0] =
+ sa0->tunnel_src_addr.ip6.as_u64[0];
+ oh6_0->ip6.src_address.as_u64[1] =
+ sa0->tunnel_src_addr.ip6.as_u64[1];
+ oh6_0->ip6.dst_address.as_u64[0] =
+ sa0->tunnel_dst_addr.ip6.as_u64[0];
+ oh6_0->ip6.dst_address.as_u64[1] =
+ sa0->tunnel_dst_addr.ip6.as_u64[1];
+
+ /* in tunnel mode send it back to FIB */
+ next0 = ESP_ENCRYPT_NEXT_IP6_LOOKUP;
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ }
+ else
+ {
+ next0 = ESP_ENCRYPT_NEXT_INTERFACE_OUTPUT;
+ transport_mode = 1;
+ }
+
+ ASSERT (sa0->crypto_alg < IPSEC_CRYPTO_N_ALG);
+ ASSERT (sa0->crypto_alg != IPSEC_CRYPTO_ALG_NONE);
+
+ int blocks = 1 + (orig_sz + 1) / BLOCK_SIZE;
+
+ /* pad packet in input buffer */
+ u8 pad_bytes = BLOCK_SIZE * blocks - 2 - orig_sz;
+ u8 i;
+ u8 *padding = vlib_buffer_get_current (b0) + b0->current_length;
+
+ for (i = 0; i < pad_bytes; ++i)
+ padding[i] = i + 1;
+
+ f0 = vlib_buffer_get_current (b0) + b0->current_length + pad_bytes;
+ f0->pad_length = pad_bytes;
+ f0->next_header = next_hdr_type;
+ b0->current_length += pad_bytes + 2 +
+ em->esp_integ_algs[sa0->integ_alg].trunc_size;
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] =
+ vnet_buffer (b0)->sw_if_index[VLIB_RX];
+ b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ struct rte_crypto_sym_op *sym_cop;
+ sym_cop = (struct rte_crypto_sym_op *) (cop + 1);
+
+ dpdk_cop_priv_t *priv = (dpdk_cop_priv_t *) (sym_cop + 1);
+
+ vnet_buffer (b0)->unused[0] = next0;
+
+ mb0 = rte_mbuf_from_vlib_buffer (b0);
+ mb0->data_len = b0->current_length;
+ mb0->pkt_len = b0->current_length;
+ mb0->data_off = RTE_PKTMBUF_HEADROOM + b0->current_data;
+
+ rte_crypto_op_attach_sym_session (cop, sess);
+
+ sym_cop->m_src = mb0;
+
+ dpdk_gcm_cnt_blk *icb = &priv->cb;
+ icb->salt = sa0->salt;
+ icb->iv[0] = sa0->seq;
+ icb->iv[1] = sa0->seq_hi;
+
+ if (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128)
+ {
+ icb->cnt = clib_host_to_net_u32 (1);
+ clib_memcpy (vlib_buffer_get_current (b0) + ip_hdr_size +
+ sizeof (esp_header_t), icb->iv, 8);
+ sym_cop->cipher.data.offset =
+ ip_hdr_size + sizeof (esp_header_t) + iv_size;
+ sym_cop->cipher.data.length = BLOCK_SIZE * blocks;
+ sym_cop->cipher.iv.length = 16;
+ }
+ else
+ {
+ sym_cop->cipher.data.offset =
+ ip_hdr_size + sizeof (esp_header_t);
+ sym_cop->cipher.data.length = BLOCK_SIZE * blocks + iv_size;
+ sym_cop->cipher.iv.length = iv_size;
+ }
+
+ sym_cop->cipher.iv.data = (u8 *) icb;
+ sym_cop->cipher.iv.phys_addr = cop->phys_addr + (uintptr_t) icb
+ - (uintptr_t) cop;
+
+
+ ASSERT (sa0->integ_alg < IPSEC_INTEG_N_ALG);
+ ASSERT (sa0->integ_alg != IPSEC_INTEG_ALG_NONE);
+
+ if (PREDICT_FALSE (sa0->integ_alg == IPSEC_INTEG_ALG_AES_GCM_128))
+ {
+ u8 *aad = priv->aad;
+ clib_memcpy (aad, vlib_buffer_get_current (b0) + ip_hdr_size,
+ 8);
+ sym_cop->auth.aad.data = aad;
+ sym_cop->auth.aad.phys_addr = cop->phys_addr +
+ (uintptr_t) aad - (uintptr_t) cop;
+
+ if (PREDICT_FALSE (sa0->use_esn))
+ {
+ *((u32 *) & aad[8]) = sa0->seq_hi;
+ sym_cop->auth.aad.length = 12;
+ }
+ else
+ {
+ sym_cop->auth.aad.length = 8;
+ }
+ }
+ else
+ {
+ sym_cop->auth.data.offset = ip_hdr_size;
+ sym_cop->auth.data.length = b0->current_length - ip_hdr_size
+ - em->esp_integ_algs[sa0->integ_alg].trunc_size;
+
+ if (PREDICT_FALSE (sa0->use_esn))
+ {
+ u8 *payload_end =
+ vlib_buffer_get_current (b0) + b0->current_length;
+ *((u32 *) payload_end) = sa0->seq_hi;
+ sym_cop->auth.data.length += sizeof (sa0->seq_hi);
+ }
+ }
+ sym_cop->auth.digest.data = vlib_buffer_get_current (b0) +
+ b0->current_length -
+ em->esp_integ_algs[sa0->integ_alg].trunc_size;
+ sym_cop->auth.digest.phys_addr = rte_pktmbuf_mtophys_offset (mb0,
+ b0->current_length
+ -
+ em->esp_integ_algs
+ [sa0->integ_alg].trunc_size);
+ sym_cop->auth.digest.length =
+ em->esp_integ_algs[sa0->integ_alg].trunc_size;
+
+
+ if (PREDICT_FALSE (is_ipv6))
+ {
+ oh6_0->ip6.payload_length =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) -
+ sizeof (ip6_header_t));
+ }
+ else
+ {
+ oh0->ip4.length =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+ oh0->ip4.checksum = ip4_header_checksum (&oh0->ip4);
+ }
+
+ if (transport_mode)
+ vlib_buffer_advance (b0, -sizeof (ethernet_header_t));
+
+ trace:
+ if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ esp_encrypt_trace_t *tr =
+ vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->spi = sa0->spi;
+ tr->seq = sa0->seq - 1;
+ tr->crypto_alg = sa0->crypto_alg;
+ tr->integ_alg = sa0->integ_alg;
+ }
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, dpdk_esp_encrypt_node.index,
+ ESP_ENCRYPT_ERROR_RX_PKTS,
+ from_frame->n_vectors);
+ crypto_qp_data_t *qpd;
+ /* *INDENT-OFF* */
+ vec_foreach_index (i, cwm->qp_data)
+ {
+ u32 enq;
+
+ qpd = vec_elt_at_index(cwm->qp_data, i);
+ enq = rte_cryptodev_enqueue_burst(qpd->dev_id, qpd->qp_id,
+ qpd->cops, n_cop_qp[i]);
+ qpd->inflights += enq;
+
+ if (PREDICT_FALSE(enq < n_cop_qp[i]))
+ {
+ crypto_free_cop (qpd, &qpd->cops[enq], n_cop_qp[i] - enq);
+ vlib_buffer_free (vm, &qpd->bi[enq], n_cop_qp[i] - enq);
+
+ vlib_node_increment_counter (vm, dpdk_esp_encrypt_node.index,
+ ESP_ENCRYPT_ERROR_ENQ_FAIL,
+ n_cop_qp[i] - enq);
+ }
+ }
+ /* *INDENT-ON* */
+
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (dpdk_esp_encrypt_node) =
+{
+ .function = dpdk_esp_encrypt_node_fn,.name = "dpdk-esp-encrypt",.flags =
+ VLIB_NODE_FLAG_IS_OUTPUT,.vector_size = sizeof (u32),.format_trace =
+ format_esp_encrypt_trace,.n_errors =
+ ARRAY_LEN (esp_encrypt_error_strings),.error_strings =
+ esp_encrypt_error_strings,.n_next_nodes = 1,.next_nodes =
+ {
+ [ESP_ENCRYPT_NEXT_DROP] = "error-drop",}
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (dpdk_esp_encrypt_node, dpdk_esp_encrypt_node_fn)
+/*
+ * ESP Encrypt Post Node
+ */
+#define foreach_esp_encrypt_post_error \
+ _(PKTS, "ESP post pkts")
+ typedef enum
+ {
+#define _(sym,str) ESP_ENCRYPT_POST_ERROR_##sym,
+ foreach_esp_encrypt_post_error
+#undef _
+ ESP_ENCRYPT_POST_N_ERROR,
+ } esp_encrypt_post_error_t;
+
+ static char *esp_encrypt_post_error_strings[] = {
+#define _(sym,string) string,
+ foreach_esp_encrypt_post_error
+#undef _
+ };
+
+vlib_node_registration_t dpdk_esp_encrypt_post_node;
+
+static u8 *
+format_esp_encrypt_post_trace (u8 * s, va_list * args)
+{
+ return s;
+}
+
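+/*
+ * Runs after the cryptodev has encrypted the packet: forward each buffer to
+ * the next node recorded by dpdk-esp-encrypt.
+ */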
+static uword
+dpdk_esp_encrypt_post_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, *from, *to_next = 0, next_index;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0, next0;
+ vlib_buffer_t *b0 = 0;
+
+ bi0 = from[0];
+ from += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ to_next[0] = bi0;
+ to_next += 1;
+
+ next0 = vnet_buffer (b0)->unused[0];
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next, bi0,
+ next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, dpdk_esp_encrypt_post_node.index,
+ ESP_ENCRYPT_POST_ERROR_PKTS,
+ from_frame->n_vectors);
+
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (dpdk_esp_encrypt_post_node) =
+{
+ .function = dpdk_esp_encrypt_post_node_fn,.name =
+ "dpdk-esp-encrypt-post",.vector_size = sizeof (u32),.format_trace =
+ format_esp_encrypt_post_trace,.type = VLIB_NODE_TYPE_INTERNAL,.n_errors =
+ ARRAY_LEN (esp_encrypt_post_error_strings),.error_strings =
+ esp_encrypt_post_error_strings,.n_next_nodes =
+ ESP_ENCRYPT_N_NEXT,.next_nodes =
+ {
+#define _(s,n) [ESP_ENCRYPT_NEXT_##s] = n,
+ foreach_esp_encrypt_next
+#undef _
+ }
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (dpdk_esp_encrypt_post_node,
+ dpdk_esp_encrypt_post_node_fn)
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/ipsec/ipsec.c b/src/vnet/devices/dpdk/ipsec/ipsec.c
new file mode 100644
index 00000000000..de253f02636
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/ipsec.c
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/api_errno.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/devices/dpdk/ipsec/ipsec.h>
+#include <vnet/devices/dpdk/ipsec/esp.h>
+#include <vnet/ipsec/ipsec.h>
+
+#define DPDK_CRYPTO_NB_OBJS 2048
+#define DPDK_CRYPTO_CACHE_SIZE 512
+#define DPDK_CRYPTO_PRIV_SIZE 128
+#define DPDK_CRYPTO_N_QUEUE_DESC 512
+#define DPDK_CRYPTO_NB_COPS (1024 * 4)
+
+/*
+ * return:
+ * -1: update failed
+ * 0: already exist
+ * 1: mapped
+ */
+static int
+update_qp_data (crypto_worker_main_t * cwm,
+ u8 cdev_id, u16 qp_id, u8 is_outbound, u16 * idx)
+{
+ crypto_qp_data_t *qpd;
+
+ /* *INDENT-OFF* */
+ vec_foreach_index (*idx, cwm->qp_data)
+ {
+ qpd = vec_elt_at_index(cwm->qp_data, *idx);
+
+ if (qpd->dev_id == cdev_id && qpd->qp_id == qp_id &&
+ qpd->is_outbound == is_outbound)
+ return 0;
+ }
+ /* *INDENT-ON* */
+
+ vec_add2 (cwm->qp_data, qpd, 1);
+
+ qpd->dev_id = cdev_id;
+ qpd->qp_id = qp_id;
+ qpd->is_outbound = is_outbound;
+
+ return 1;
+}
+
+/*
+ * return:
+ * -1: error
+ * 0: already exist
+ * 1: mapped
+ */
+static int
+add_mapping (crypto_worker_main_t * cwm,
+ u8 cdev_id, u16 qp, u8 is_outbound,
+ const struct rte_cryptodev_capabilities *cipher_cap,
+ const struct rte_cryptodev_capabilities *auth_cap)
+{
+ int mapped;
+ u16 qp_index;
+ uword key = 0, data, *ret;
+ crypto_worker_qp_key_t *p_key = (crypto_worker_qp_key_t *) & key;
+
+ p_key->cipher_algo = (u8) cipher_cap->sym.cipher.algo;
+ p_key->auth_algo = (u8) auth_cap->sym.auth.algo;
+ p_key->is_outbound = is_outbound;
+
+ ret = hash_get (cwm->algo_qp_map, key);
+ if (ret)
+ return 0;
+
+ mapped = update_qp_data (cwm, cdev_id, qp, is_outbound, &qp_index);
+ if (mapped < 0)
+ return -1;
+
+ data = (uword) qp_index;
+
+ ret = hash_set (cwm->algo_qp_map, key, data);
+ if (!ret)
+ rte_panic ("Failed to insert hash table\n");
+
+ return mapped;
+}
+
+/*
+ * return:
+ * 0: already exist
+ * 1: mapped
+ */
+static int
+add_cdev_mapping (crypto_worker_main_t * cwm,
+ struct rte_cryptodev_info *dev_info, u8 cdev_id,
+ u16 qp, u8 is_outbound)
+{
+ const struct rte_cryptodev_capabilities *i, *j;
+ u32 mapped = 0;
+
+ for (i = dev_info->capabilities; i->op != RTE_CRYPTO_OP_TYPE_UNDEFINED; i++)
+ {
+ if (i->sym.xform_type != RTE_CRYPTO_SYM_XFORM_CIPHER)
+ continue;
+
+ if (check_algo_is_supported (i, NULL) != 0)
+ continue;
+
+ for (j = dev_info->capabilities; j->op != RTE_CRYPTO_OP_TYPE_UNDEFINED;
+ j++)
+ {
+ int status = 0;
+
+ if (j->sym.xform_type != RTE_CRYPTO_SYM_XFORM_AUTH)
+ continue;
+
+ if (check_algo_is_supported (j, NULL) != 0)
+ continue;
+
+ status = add_mapping (cwm, cdev_id, qp, is_outbound, i, j);
+ if (status == 1)
+ mapped += 1;
+ if (status < 0)
+ return status;
+ }
+ }
+
+ return mapped;
+}
+
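+/*
+ * Each worker (or the main thread when no workers are configured) needs one
+ * inbound and one outbound queue pair; return 0 if the chaining-capable
+ * cryptodevs provide enough queue pairs in total, -1 otherwise.
+ */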
+static int
+check_cryptodev_queues ()
+{
+ u32 n_qs = 0;
+ u8 cdev_id;
+ u32 n_req_qs = 2;
+
+ if (vlib_num_workers () > 0)
+ n_req_qs = vlib_num_workers () * 2;
+
+ for (cdev_id = 0; cdev_id < rte_cryptodev_count (); cdev_id++)
+ {
+ struct rte_cryptodev_info cdev_info;
+
+ rte_cryptodev_info_get (cdev_id, &cdev_info);
+
+ if (!
+ (cdev_info.feature_flags & RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING))
+ continue;
+
+ n_qs += cdev_info.max_nb_queue_pairs;
+ }
+
+ if (n_qs >= n_req_qs)
+ return 0;
+ else
+ return -1;
+}
+
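+/*
+ * Map cryptodev queue pairs to workers (hardware devices first), configure
+ * each usable device and its queue pairs, create the per-socket crypto op
+ * pool and put dpdk-crypto-input into polling state.
+ */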
+static clib_error_t *
+dpdk_ipsec_init (vlib_main_t * vm)
+{
+ dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ struct rte_cryptodev_config dev_conf;
+ struct rte_cryptodev_qp_conf qp_conf;
+ struct rte_cryptodev_info cdev_info;
+ struct rte_mempool *rmp;
+ i32 dev_id, ret;
+ u32 i, skip_master;
+
+ if (check_cryptodev_queues () < 0)
+ return clib_error_return (0, "not enough cryptodevs for ipsec");
+
+ vec_alloc (dcm->workers_main, tm->n_vlib_mains);
+ _vec_len (dcm->workers_main) = tm->n_vlib_mains;
+
+ fprintf (stdout, "DPDK Cryptodevs info:\n");
+ fprintf (stdout, "dev_id\tn_qp\tnb_obj\tcache_size\n");
+ /* HW cryptodevs have higher dev_id, use HW first */
+ for (dev_id = rte_cryptodev_count () - 1; dev_id >= 0; dev_id--)
+ {
+ u16 max_nb_qp, qp = 0;
+ skip_master = vlib_num_workers () > 0;
+
+ rte_cryptodev_info_get (dev_id, &cdev_info);
+
+ if (!
+ (cdev_info.feature_flags & RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING))
+ continue;
+
+ max_nb_qp = cdev_info.max_nb_queue_pairs;
+
+ for (i = 0; i < tm->n_vlib_mains; i++)
+ {
+ u8 is_outbound;
+ crypto_worker_main_t *cwm;
+ uword *map;
+
+ if (skip_master)
+ {
+ skip_master = 0;
+ continue;
+ }
+
+ cwm = vec_elt_at_index (dcm->workers_main, i);
+ map = cwm->algo_qp_map;
+
+ if (!map)
+ {
+ map = hash_create (0, sizeof (crypto_worker_qp_key_t));
+ if (!map)
+ return clib_error_return (0, "unable to create hash table "
+ "for worker %u",
+ vlib_mains[i]->cpu_index);
+ cwm->algo_qp_map = map;
+ }
+
+ for (is_outbound = 0; is_outbound < 2 && qp < max_nb_qp;
+ is_outbound++)
+ {
+ int mapped = add_cdev_mapping (cwm, &cdev_info,
+ dev_id, qp, is_outbound);
+ if (mapped > 0)
+ qp++;
+
+ if (mapped < 0)
+ return clib_error_return (0,
+ "too many queues for one worker");
+ }
+ }
+
+ if (qp == 0)
+ continue;
+
+ dev_conf.socket_id = rte_cryptodev_socket_id (dev_id);
+ dev_conf.nb_queue_pairs = cdev_info.max_nb_queue_pairs;
+ dev_conf.session_mp.nb_objs = DPDK_CRYPTO_NB_OBJS;
+ dev_conf.session_mp.cache_size = DPDK_CRYPTO_CACHE_SIZE;
+
+ ret = rte_cryptodev_configure (dev_id, &dev_conf);
+ if (ret < 0)
+ return clib_error_return (0, "cryptodev %u config error", dev_id);
+
+ qp_conf.nb_descriptors = DPDK_CRYPTO_N_QUEUE_DESC;
+ for (qp = 0; qp < dev_conf.nb_queue_pairs; qp++)
+ {
+ ret = rte_cryptodev_queue_pair_setup (dev_id, qp, &qp_conf,
+ dev_conf.socket_id);
+ if (ret < 0)
+ return clib_error_return (0, "cryptodev %u qp %u setup error",
+ dev_id, qp);
+ }
+ fprintf (stdout, "%u\t%u\t%u\t%u\n", dev_id, dev_conf.nb_queue_pairs,
+ DPDK_CRYPTO_NB_OBJS, DPDK_CRYPTO_CACHE_SIZE);
+ }
+
+ u32 socket_id = rte_socket_id ();
+
+ vec_validate_aligned (dcm->cop_pools, socket_id, CLIB_CACHE_LINE_BYTES);
+
+ /* pool already exists, nothing to do */
+ if (dcm->cop_pools[socket_id])
+ return 0;
+
+ u8 *pool_name = format (0, "crypto_op_pool_socket%u%c", socket_id, 0);
+
+ rmp = rte_crypto_op_pool_create ((char *) pool_name,
+ RTE_CRYPTO_OP_TYPE_SYMMETRIC,
+ DPDK_CRYPTO_NB_COPS *
+ (1 + vlib_num_workers ()),
+ DPDK_CRYPTO_CACHE_SIZE,
+ DPDK_CRYPTO_PRIV_SIZE, socket_id);
+ vec_free (pool_name);
+
+ if (!rmp)
+ return clib_error_return (0, "failed to allocate mempool on socket %u",
+ socket_id);
+ dcm->cop_pools[socket_id] = rmp;
+
+ dpdk_esp_init ();
+
+ if (vec_len (vlib_mains) == 0)
+ vlib_node_set_state (&vlib_global_main, dpdk_crypto_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ else
+ for (i = 1; i < tm->n_vlib_mains; i++)
+ vlib_node_set_state (vlib_mains[i], dpdk_crypto_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+
+ return 0;
+}
+
+VLIB_MAIN_LOOP_ENTER_FUNCTION (dpdk_ipsec_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/ipsec/ipsec.h b/src/vnet/devices/dpdk/ipsec/ipsec.h
new file mode 100644
index 00000000000..e6c7498c0d3
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/ipsec.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __DPDK_IPSEC_H__
+#define __DPDK_IPSEC_H__
+
+#include <vnet/vnet.h>
+
+#undef always_inline
+#include <rte_crypto.h>
+#include <rte_cryptodev.h>
+
+#if CLIB_DEBUG > 0
+#define always_inline static inline
+#else
+#define always_inline static inline __attribute__ ((__always_inline__))
+#endif
+
+
+#define MAX_QP_PER_LCORE 16
+
+typedef struct
+{
+ u32 salt;
+ u32 iv[2];
+ u32 cnt;
+} dpdk_gcm_cnt_blk;
+
+typedef struct
+{
+ dpdk_gcm_cnt_blk cb;
+ union
+ {
+ u8 aad[12];
+ u8 icv[64];
+ };
+} dpdk_cop_priv_t;
+
+typedef struct
+{
+ u8 cipher_algo;
+ u8 auth_algo;
+ u8 is_outbound;
+} crypto_worker_qp_key_t;
+
+typedef struct
+{
+ u16 dev_id;
+ u16 qp_id;
+ u16 is_outbound;
+ i16 inflights;
+ u32 bi[VLIB_FRAME_SIZE];
+ struct rte_crypto_op *cops[VLIB_FRAME_SIZE];
+ struct rte_crypto_op **free_cops;
+} crypto_qp_data_t;
+
+typedef struct
+{
+ u8 qp_index;
+ void *sess;
+} crypto_sa_session_t;
+
+typedef struct
+{
+ crypto_sa_session_t *sa_sess_d[2];
+ crypto_qp_data_t *qp_data;
+ uword *algo_qp_map;
+} crypto_worker_main_t;
+
+typedef struct
+{
+ struct rte_mempool **cop_pools;
+ crypto_worker_main_t *workers_main;
+} dpdk_crypto_main_t;
+
+dpdk_crypto_main_t dpdk_crypto_main;
+
+extern vlib_node_registration_t dpdk_crypto_input_node;
+
+#define CRYPTO_N_FREE_COPS (VLIB_FRAME_SIZE * 3)
+
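+/*
+ * Top up each queue pair's free crypto op cache from the per-socket mempool
+ * so that at least a full frame worth of ops is available.
+ */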
+static_always_inline void
+crypto_alloc_cops ()
+{
+ dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+ u32 cpu_index = os_get_cpu_number ();
+ crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index];
+ unsigned socket_id = rte_socket_id ();
+ crypto_qp_data_t *qpd;
+
+ /* *INDENT-OFF* */
+ vec_foreach (qpd, cwm->qp_data)
+ {
+ u32 l = vec_len (qpd->free_cops);
+
+ if (PREDICT_FALSE (l < VLIB_FRAME_SIZE))
+ {
+ u32 n_alloc;
+
+ if (PREDICT_FALSE (!qpd->free_cops))
+ vec_alloc (qpd->free_cops, CRYPTO_N_FREE_COPS);
+
+ n_alloc = rte_crypto_op_bulk_alloc (dcm->cop_pools[socket_id],
+ RTE_CRYPTO_OP_TYPE_SYMMETRIC,
+ &qpd->free_cops[l],
+ CRYPTO_N_FREE_COPS - l - 1);
+
+ _vec_len (qpd->free_cops) = l + n_alloc;
+ }
+ }
+ /* *INDENT-ON* */
+}
+
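+/*
+ * Return n crypto ops to the queue pair's free cache; if the cache would
+ * overflow, flush a frame worth of ops back to the mempool first.
+ */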
+static_always_inline void
+crypto_free_cop (crypto_qp_data_t * qpd, struct rte_crypto_op **cops, u32 n)
+{
+ u32 l = vec_len (qpd->free_cops);
+
+ if (l + n >= CRYPTO_N_FREE_COPS)
+ {
+ l -= VLIB_FRAME_SIZE;
+ rte_mempool_put_bulk (cops[0]->mempool,
+ (void **) &qpd->free_cops[l], VLIB_FRAME_SIZE);
+ }
+ clib_memcpy (&qpd->free_cops[l], cops, sizeof (*cops) * n);
+
+ _vec_len (qpd->free_cops) = l + n;
+}
+
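+/*
+ * Return 0 if the device capability describes one of the supported cipher or
+ * auth algorithms (optionally copying its printable name), -1 otherwise.
+ */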
+static_always_inline int
+check_algo_is_supported (const struct rte_cryptodev_capabilities *cap,
+ char *name)
+{
+ struct
+ {
+ uint8_t cipher_algo;
+ enum rte_crypto_sym_xform_type type;
+ union
+ {
+ enum rte_crypto_auth_algorithm auth;
+ enum rte_crypto_cipher_algorithm cipher;
+ };
+ char *name;
+ } supported_algo[] =
+ {
+ {
+ .type = RTE_CRYPTO_SYM_XFORM_CIPHER,.cipher =
+ RTE_CRYPTO_CIPHER_NULL,.name = "NULL"},
+ {
+ .type = RTE_CRYPTO_SYM_XFORM_CIPHER,.cipher =
+ RTE_CRYPTO_CIPHER_AES_CBC,.name = "AES_CBC"},
+ {
+ .type = RTE_CRYPTO_SYM_XFORM_CIPHER,.cipher =
+ RTE_CRYPTO_CIPHER_AES_CTR,.name = "AES_CTR"},
+ {
+ .type = RTE_CRYPTO_SYM_XFORM_CIPHER,.cipher =
+ RTE_CRYPTO_CIPHER_3DES_CBC,.name = "3DES-CBC"},
+ {
+ .type = RTE_CRYPTO_SYM_XFORM_CIPHER,.auth =
+ RTE_CRYPTO_CIPHER_AES_GCM,.name = "AES-GCM"},
+ {
+ .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth =
+ RTE_CRYPTO_AUTH_SHA1_HMAC,.name = "HMAC-SHA1"},
+ {
+ .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth =
+ RTE_CRYPTO_AUTH_SHA256_HMAC,.name = "HMAC-SHA256"},
+ {
+ .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth =
+ RTE_CRYPTO_AUTH_SHA384_HMAC,.name = "HMAC-SHA384"},
+ {
+ .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth =
+ RTE_CRYPTO_AUTH_SHA512_HMAC,.name = "HMAC-SHA512"},
+ {
+ .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth =
+ RTE_CRYPTO_AUTH_AES_XCBC_MAC,.name = "AES-XCBC-MAC"},
+ {
+ .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth =
+ RTE_CRYPTO_AUTH_AES_GCM,.name = "AES-GCM"},
+ {
+ /* tail */
+ .type = RTE_CRYPTO_SYM_XFORM_NOT_SPECIFIED},};
+ uint32_t i = 0;
+
+ if (cap->op != RTE_CRYPTO_OP_TYPE_SYMMETRIC)
+ return -1;
+
+ while (supported_algo[i].type != RTE_CRYPTO_SYM_XFORM_NOT_SPECIFIED)
+ {
+ if (cap->sym.xform_type == supported_algo[i].type)
+ {
+ if ((cap->sym.xform_type == RTE_CRYPTO_SYM_XFORM_CIPHER &&
+ cap->sym.cipher.algo == supported_algo[i].cipher) ||
+ (cap->sym.xform_type == RTE_CRYPTO_SYM_XFORM_AUTH &&
+ cap->sym.auth.algo == supported_algo[i].auth))
+ {
+ if (name)
+ strcpy (name, supported_algo[i].name);
+ return 0;
+ }
+ }
+
+ i++;
+ }
+
+ return -1;
+}
+
+#endif /* __DPDK_IPSEC_H__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/node.c b/src/vnet/devices/dpdk/node.c
new file mode 100644
index 00000000000..e541cdbcbd2
--- /dev/null
+++ b/src/vnet/devices/dpdk/node.c
@@ -0,0 +1,687 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/xxhash.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/mpls/packet.h>
+#include <vnet/handoff.h>
+#include <vnet/devices/devices.h>
+#include <vnet/feature/feature.h>
+
+#include "dpdk_priv.h"
+
+static char *dpdk_error_strings[] = {
+#define _(n,s) s,
+ foreach_dpdk_error
+#undef _
+};
+
+always_inline int
+vlib_buffer_is_ip4 (vlib_buffer_t * b)
+{
+ ethernet_header_t *h = (ethernet_header_t *) b->data;
+ return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP4));
+}
+
+always_inline int
+vlib_buffer_is_ip6 (vlib_buffer_t * b)
+{
+ ethernet_header_t *h = (ethernet_header_t *) b->data;
+ return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP6));
+}
+
+always_inline int
+vlib_buffer_is_mpls (vlib_buffer_t * b)
+{
+ ethernet_header_t *h = (ethernet_header_t *) b->data;
+ return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS_UNICAST));
+}
+
+#if RTE_VERSION < RTE_VERSION_NUM(16, 11, 0, 0)
+/* New ol_flags bits added in DPDK-16.11 */
+#define PKT_RX_IP_CKSUM_GOOD (1ULL << 7)
+#endif
+
+always_inline u32
+dpdk_rx_next_from_etype (struct rte_mbuf * mb, vlib_buffer_t * b0)
+{
+ if (PREDICT_TRUE (vlib_buffer_is_ip4 (b0)))
+ if (PREDICT_TRUE ((mb->ol_flags & PKT_RX_IP_CKSUM_GOOD) != 0))
+ return VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT;
+ else
+ return VNET_DEVICE_INPUT_NEXT_IP4_INPUT;
+ else if (PREDICT_TRUE (vlib_buffer_is_ip6 (b0)))
+ return VNET_DEVICE_INPUT_NEXT_IP6_INPUT;
+ else if (PREDICT_TRUE (vlib_buffer_is_mpls (b0)))
+ return VNET_DEVICE_INPUT_NEXT_MPLS_INPUT;
+ else
+ return VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+}
+
+always_inline int
+dpdk_mbuf_is_vlan (struct rte_mbuf *mb)
+{
+#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0)
+ return (mb->packet_type & RTE_PTYPE_L2_ETHER_VLAN) ==
+ RTE_PTYPE_L2_ETHER_VLAN;
+#else
+ return
+ (mb->ol_flags &
+ (PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED | PKT_RX_QINQ_STRIPPED)) ==
+ PKT_RX_VLAN_PKT;
+#endif
+}
+
+always_inline int
+dpdk_mbuf_is_ip4 (struct rte_mbuf *mb)
+{
+ return RTE_ETH_IS_IPV4_HDR (mb->packet_type) != 0;
+}
+
+always_inline int
+dpdk_mbuf_is_ip6 (struct rte_mbuf *mb)
+{
+ return RTE_ETH_IS_IPV6_HDR (mb->packet_type) != 0;
+}
+
+always_inline u32
+dpdk_rx_next_from_mb (struct rte_mbuf * mb, vlib_buffer_t * b0)
+{
+ if (PREDICT_FALSE (dpdk_mbuf_is_vlan (mb)))
+ return VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+ else if (PREDICT_TRUE (dpdk_mbuf_is_ip4 (mb)))
+ return VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT;
+ else if (PREDICT_TRUE (dpdk_mbuf_is_ip6 (mb)))
+ return VNET_DEVICE_INPUT_NEXT_IP6_INPUT;
+ else if (PREDICT_TRUE (vlib_buffer_is_mpls (b0)))
+ return VNET_DEVICE_INPUT_NEXT_MPLS_INPUT;
+ else
+ return dpdk_rx_next_from_etype (mb, b0);
+}
+
+always_inline void
+dpdk_rx_error_from_mb (struct rte_mbuf *mb, u32 * next, u8 * error)
+{
+ if (mb->ol_flags & PKT_RX_IP_CKSUM_BAD)
+ {
+ *error = DPDK_ERROR_IP_CHECKSUM_ERROR;
+ *next = VNET_DEVICE_INPUT_NEXT_DROP;
+ }
+ else
+ *error = DPDK_ERROR_NONE;
+}
+
+void
+dpdk_rx_trace (dpdk_main_t * dm,
+ vlib_node_runtime_t * node,
+ dpdk_device_t * xd,
+ u16 queue_id, u32 * buffers, uword n_buffers)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ u32 *b, n_left;
+ u32 next0;
+
+ n_left = n_buffers;
+ b = buffers;
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t *b0;
+ dpdk_rx_dma_trace_t *t0;
+ struct rte_mbuf *mb;
+ u8 error0;
+
+ bi0 = b[0];
+ n_left -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ mb = rte_mbuf_from_vlib_buffer (b0);
+
+ if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
+ next0 = xd->per_interface_next_index;
+ else if (PREDICT_TRUE
+ ((xd->flags & DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE) != 0))
+ next0 = dpdk_rx_next_from_mb (mb, b0);
+ else
+ next0 = dpdk_rx_next_from_etype (mb, b0);
+
+ dpdk_rx_error_from_mb (mb, &next0, &error0);
+
+ vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->queue_index = queue_id;
+ t0->device_index = xd->device_index;
+ t0->buffer_index = bi0;
+
+ clib_memcpy (&t0->mb, mb, sizeof (t0->mb));
+ clib_memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
+ clib_memcpy (t0->buffer.pre_data, b0->data,
+ sizeof (t0->buffer.pre_data));
+ clib_memcpy (&t0->data, mb->buf_addr + mb->data_off, sizeof (t0->data));
+
+ b += 1;
+ }
+}
+
+static inline u32
+dpdk_rx_burst (dpdk_main_t * dm, dpdk_device_t * xd, u16 queue_id)
+{
+ u32 n_buffers;
+ u32 n_left;
+ u32 n_this_chunk;
+
+ n_left = VLIB_FRAME_SIZE;
+ n_buffers = 0;
+
+ if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_PMD))
+ {
+ while (n_left)
+ {
+ n_this_chunk = rte_eth_rx_burst (xd->device_index, queue_id,
+ xd->rx_vectors[queue_id] +
+ n_buffers, n_left);
+ n_buffers += n_this_chunk;
+ n_left -= n_this_chunk;
+
+ /* Empirically, DPDK r1.8 produces vectors w/ 32 or fewer elts */
+ if (n_this_chunk < 32)
+ break;
+ }
+ }
+ else
+ {
+ ASSERT (0);
+ }
+
+ return n_buffers;
+}
+
+
+static_always_inline void
+dpdk_process_subseq_segs (vlib_main_t * vm, vlib_buffer_t * b,
+ struct rte_mbuf *mb, vlib_buffer_free_list_t * fl)
+{
+ u8 nb_seg = 1;
+ struct rte_mbuf *mb_seg = 0;
+ vlib_buffer_t *b_seg, *b_chain = 0;
+ mb_seg = mb->next;
+ b_chain = b;
+
+ while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
+ {
+ ASSERT (mb_seg != 0);
+
+ b_seg = vlib_buffer_from_rte_mbuf (mb_seg);
+ vlib_buffer_init_for_free_list (b_seg, fl);
+
+ ASSERT ((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
+ ASSERT (b_seg->current_data == 0);
+
+ /*
+ * The driver (e.g. virtio) may not put the packet data at the start
+ * of the segment, so don't assume b_seg->current_data == 0 is correct.
+ */
+ b_seg->current_data =
+ (mb_seg->buf_addr + mb_seg->data_off) - (void *) b_seg->data;
+
+ b_seg->current_length = mb_seg->data_len;
+ b->total_length_not_including_first_buffer += mb_seg->data_len;
+
+ b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
+
+ b_chain = b_seg;
+ mb_seg = mb_seg->next;
+ nb_seg++;
+ }
+}
+
+static_always_inline void
+dpdk_prefetch_buffer (struct rte_mbuf *mb)
+{
+ vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb);
+ CLIB_PREFETCH (mb, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (b, CLIB_CACHE_LINE_BYTES, STORE);
+}
+
+/*
+ * This function is used when there are no worker threads.
+ * The main thread performs IO and forwards the packets.
+ */
+static_always_inline u32
+dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd,
+ vlib_node_runtime_t * node, u32 cpu_index, u16 queue_id)
+{
+ u32 n_buffers;
+ u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+ u32 n_left_to_next, *to_next;
+ u32 mb_index;
+ vlib_main_t *vm = vlib_get_main ();
+ uword n_rx_bytes = 0;
+ u32 n_trace, trace_cnt __attribute__ ((unused));
+ vlib_buffer_free_list_t *fl;
+ u32 buffer_flags_template;
+
+ if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0)
+ return 0;
+
+ n_buffers = dpdk_rx_burst (dm, xd, queue_id);
+
+ if (n_buffers == 0)
+ {
+ return 0;
+ }
+
+ buffer_flags_template = dm->buffer_flags_template;
+
+ vec_reset_length (xd->d_trace_buffers[cpu_index]);
+ trace_cnt = n_trace = vlib_get_trace_count (vm, node);
+
+ if (n_trace > 0)
+ {
+ u32 n = clib_min (n_trace, n_buffers);
+ mb_index = 0;
+
+ while (n--)
+ {
+ struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index++];
+ vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb);
+ vec_add1 (xd->d_trace_buffers[cpu_index],
+ vlib_get_buffer_index (vm, b));
+ }
+ }
+
+ fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ mb_index = 0;
+
+ while (n_buffers > 0)
+ {
+ vlib_buffer_t *b0, *b1, *b2, *b3;
+ u32 bi0, next0, l3_offset0;
+ u32 bi1, next1, l3_offset1;
+ u32 bi2, next2, l3_offset2;
+ u32 bi3, next3, l3_offset3;
+ u8 error0, error1, error2, error3;
+ u64 or_ol_flags;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_buffers > 8 && n_left_to_next > 4)
+ {
+ struct rte_mbuf *mb0 = xd->rx_vectors[queue_id][mb_index];
+ struct rte_mbuf *mb1 = xd->rx_vectors[queue_id][mb_index + 1];
+ struct rte_mbuf *mb2 = xd->rx_vectors[queue_id][mb_index + 2];
+ struct rte_mbuf *mb3 = xd->rx_vectors[queue_id][mb_index + 3];
+
+ dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 4]);
+ dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 5]);
+ dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 6]);
+ dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 7]);
+
+ if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG)
+ {
+ if (PREDICT_FALSE (mb0->nb_segs > 1))
+ dpdk_prefetch_buffer (mb0->next);
+ if (PREDICT_FALSE (mb1->nb_segs > 1))
+ dpdk_prefetch_buffer (mb1->next);
+ if (PREDICT_FALSE (mb2->nb_segs > 1))
+ dpdk_prefetch_buffer (mb2->next);
+ if (PREDICT_FALSE (mb3->nb_segs > 1))
+ dpdk_prefetch_buffer (mb3->next);
+ }
+
+ ASSERT (mb0);
+ ASSERT (mb1);
+ ASSERT (mb2);
+ ASSERT (mb3);
+
+ or_ol_flags = (mb0->ol_flags | mb1->ol_flags |
+ mb2->ol_flags | mb3->ol_flags);
+ b0 = vlib_buffer_from_rte_mbuf (mb0);
+ b1 = vlib_buffer_from_rte_mbuf (mb1);
+ b2 = vlib_buffer_from_rte_mbuf (mb2);
+ b3 = vlib_buffer_from_rte_mbuf (mb3);
+
+ vlib_buffer_init_for_free_list (b0, fl);
+ vlib_buffer_init_for_free_list (b1, fl);
+ vlib_buffer_init_for_free_list (b2, fl);
+ vlib_buffer_init_for_free_list (b3, fl);
+
+ bi0 = vlib_get_buffer_index (vm, b0);
+ bi1 = vlib_get_buffer_index (vm, b1);
+ bi2 = vlib_get_buffer_index (vm, b2);
+ bi3 = vlib_get_buffer_index (vm, b3);
+
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ to_next[2] = bi2;
+ to_next[3] = bi3;
+ to_next += 4;
+ n_left_to_next -= 4;
+
+ if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
+ {
+ next0 = next1 = next2 = next3 = xd->per_interface_next_index;
+ }
+ else if (PREDICT_TRUE
+ ((xd->flags & DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE) != 0))
+ {
+ next0 = dpdk_rx_next_from_mb (mb0, b0);
+ next1 = dpdk_rx_next_from_mb (mb1, b1);
+ next2 = dpdk_rx_next_from_mb (mb2, b2);
+ next3 = dpdk_rx_next_from_mb (mb3, b3);
+ }
+ else
+ {
+ next0 = dpdk_rx_next_from_etype (mb0, b0);
+ next1 = dpdk_rx_next_from_etype (mb1, b1);
+ next2 = dpdk_rx_next_from_etype (mb2, b2);
+ next3 = dpdk_rx_next_from_etype (mb3, b3);
+ }
+
+ if (PREDICT_FALSE (or_ol_flags & PKT_RX_IP_CKSUM_BAD))
+ {
+ dpdk_rx_error_from_mb (mb0, &next0, &error0);
+ dpdk_rx_error_from_mb (mb1, &next1, &error1);
+ dpdk_rx_error_from_mb (mb2, &next2, &error2);
+ dpdk_rx_error_from_mb (mb3, &next3, &error3);
+ b0->error = node->errors[error0];
+ b1->error = node->errors[error1];
+ b2->error = node->errors[error2];
+ b3->error = node->errors[error3];
+ }
+ else
+ {
+ b0->error = b1->error = node->errors[DPDK_ERROR_NONE];
+ b2->error = b3->error = node->errors[DPDK_ERROR_NONE];
+ }
+
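+	  /* device_input_next_node_advance[] gives the number of L2 header
+	   * bytes to skip for the chosen next node (zero when dispatching to
+	   * ethernet-input). mb->data_off is an offset from the mbuf
+	   * buf_addr; subtracting RTE_PKTMBUF_HEADROOM translates it into a
+	   * vlib_buffer_t current_data offset. */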
+ l3_offset0 = device_input_next_node_advance[next0];
+ l3_offset1 = device_input_next_node_advance[next1];
+ l3_offset2 = device_input_next_node_advance[next2];
+ l3_offset3 = device_input_next_node_advance[next3];
+
+ b0->current_data = l3_offset0 + mb0->data_off;
+ b1->current_data = l3_offset1 + mb1->data_off;
+ b2->current_data = l3_offset2 + mb2->data_off;
+ b3->current_data = l3_offset3 + mb3->data_off;
+
+ b0->current_data -= RTE_PKTMBUF_HEADROOM;
+ b1->current_data -= RTE_PKTMBUF_HEADROOM;
+ b2->current_data -= RTE_PKTMBUF_HEADROOM;
+ b3->current_data -= RTE_PKTMBUF_HEADROOM;
+
+ b0->current_length = mb0->data_len - l3_offset0;
+ b1->current_length = mb1->data_len - l3_offset1;
+ b2->current_length = mb2->data_len - l3_offset2;
+ b3->current_length = mb3->data_len - l3_offset3;
+
+ b0->flags = buffer_flags_template;
+ b1->flags = buffer_flags_template;
+ b2->flags = buffer_flags_template;
+ b3->flags = buffer_flags_template;
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer (b1)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer (b2)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer (b3)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ vnet_buffer (b2)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ vnet_buffer (b3)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+
+ n_rx_bytes += mb0->pkt_len;
+ n_rx_bytes += mb1->pkt_len;
+ n_rx_bytes += mb2->pkt_len;
+ n_rx_bytes += mb3->pkt_len;
+
+ /* Process subsequent segments of multi-segment packets */
+ if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG)
+ {
+ dpdk_process_subseq_segs (vm, b0, mb0, fl);
+ dpdk_process_subseq_segs (vm, b1, mb1, fl);
+ dpdk_process_subseq_segs (vm, b2, mb2, fl);
+ dpdk_process_subseq_segs (vm, b3, mb3, fl);
+ }
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b2);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b3);
+
+ /* Do we have any driver RX features configured on the interface? */
+ vnet_feature_start_device_input_x4 (xd->vlib_sw_if_index,
+ &next0, &next1, &next2, &next3,
+ b0, b1, b2, b3,
+ l3_offset0, l3_offset1,
+ l3_offset2, l3_offset3);
+
+ vlib_validate_buffer_enqueue_x4 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, bi2, bi3,
+ next0, next1, next2, next3);
+ n_buffers -= 4;
+ mb_index += 4;
+ }
+ while (n_buffers > 0 && n_left_to_next > 0)
+ {
+ struct rte_mbuf *mb0 = xd->rx_vectors[queue_id][mb_index];
+
+ ASSERT (mb0);
+
+ b0 = vlib_buffer_from_rte_mbuf (mb0);
+
+ /* Prefetch one next segment if it exists. */
+ if (PREDICT_FALSE (mb0->nb_segs > 1))
+ dpdk_prefetch_buffer (mb0->next);
+
+ vlib_buffer_init_for_free_list (b0, fl);
+
+ bi0 = vlib_get_buffer_index (vm, b0);
+
+ to_next[0] = bi0;
+ to_next++;
+ n_left_to_next--;
+
+ if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
+ next0 = xd->per_interface_next_index;
+ else if (PREDICT_TRUE
+ ((xd->flags & DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE) != 0))
+ next0 = dpdk_rx_next_from_mb (mb0, b0);
+ else
+ next0 = dpdk_rx_next_from_etype (mb0, b0);
+
+ dpdk_rx_error_from_mb (mb0, &next0, &error0);
+ b0->error = node->errors[error0];
+
+ l3_offset0 = device_input_next_node_advance[next0];
+
+ b0->current_data = l3_offset0;
+ b0->current_data += mb0->data_off - RTE_PKTMBUF_HEADROOM;
+ b0->current_length = mb0->data_len - l3_offset0;
+
+ b0->flags = buffer_flags_template;
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ n_rx_bytes += mb0->pkt_len;
+
+ /* Process subsequent segments of multi-segment packets */
+ dpdk_process_subseq_segs (vm, b0, mb0, fl);
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
+
+ /* Do we have any driver RX features configured on the interface? */
+ vnet_feature_start_device_input_x1 (xd->vlib_sw_if_index, &next0,
+ b0, l3_offset0);
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ n_buffers--;
+ mb_index++;
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[cpu_index]) > 0))
+ {
+ dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers[cpu_index],
+ vec_len (xd->d_trace_buffers[cpu_index]));
+ vlib_set_trace_count (vm, node, n_trace -
+ vec_len (xd->d_trace_buffers[cpu_index]));
+ }
+
+ vlib_increment_combined_counter
+ (vnet_get_main ()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes);
+
+ dpdk_worker_t *dw = vec_elt_at_index (dm->workers, cpu_index);
+ dw->aggregate_rx_packets += mb_index;
+
+ return mb_index;
+}
+
+static inline void
+poll_rate_limit (dpdk_main_t * dm)
+{
+ /* Limit the poll rate by sleeping for N msec between polls */
+ if (PREDICT_FALSE (dm->poll_sleep != 0))
+ {
+ struct timespec ts, tsrem;
+
+ ts.tv_sec = 0;
+ ts.tv_nsec = 1000 * 1000 * dm->poll_sleep; /* 1ms */
+
+ while (nanosleep (&ts, &tsrem) < 0)
+ {
+ ts = tsrem;
+ }
+ }
+}
+
+/** \brief Main DPDK input node
+ @node dpdk-input
+
+ This is the main DPDK input node: across each assigned interface,
+ call rte_eth_rx_burst(...) or similar to obtain a vector of
+ packets to process. Handle early packet discard. Derive @c
+    vlib_buffer_t metadata from <code>struct rte_mbuf</code> metadata.
+    Depending on the resulting metadata, adjust <code>b->current_data,
+    b->current_length</code> and dispatch directly to
+    ip4-input-no-checksum or ip6-input. Trace the packet if required.
+
+ @param vm vlib_main_t corresponding to the current thread
+ @param node vlib_node_runtime_t
+ @param f vlib_frame_t input-node, not used.
+
+ @par Graph mechanics: buffer metadata, next index usage
+
+ @em Uses:
+ - <code>struct rte_mbuf mb->ol_flags</code>
+ - PKT_RX_IP_CKSUM_BAD
+ - <code> RTE_ETH_IS_xxx_HDR(mb->packet_type) </code>
+ - packet classification result
+
+ @em Sets:
+ - <code>b->error</code> if the packet is to be dropped immediately
+ - <code>b->current_data, b->current_length</code>
+ - adjusted as needed to skip the L2 header in direct-dispatch cases
+ - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code>
+ - rx interface sw_if_index
+ - <code>vnet_buffer(b)->sw_if_index[VLIB_TX] = ~0</code>
+ - required by ipX-lookup
+ - <code>b->flags</code>
+ - to indicate multi-segment pkts (VLIB_BUFFER_NEXT_PRESENT), etc.
+
+ <em>Next Nodes:</em>
+ - Static arcs to: error-drop, ethernet-input,
+ ip4-input-no-checksum, ip6-input, mpls-input
+ - per-interface redirection, controlled by
+ <code>xd->per_interface_next_index</code>
+*/
+
+static uword
+dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd;
+ uword n_rx_packets = 0;
+ dpdk_device_and_queue_t *dq;
+ u32 cpu_index = os_get_cpu_number ();
+
+ /*
+ * Poll all devices on this cpu for input/interrupts.
+ */
+ /* *INDENT-OFF* */
+ vec_foreach (dq, dm->devices_by_cpu[cpu_index])
+ {
+ xd = vec_elt_at_index(dm->devices, dq->device);
+ n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id);
+ }
+ /* *INDENT-ON* */
+
+ poll_rate_limit (dm);
+
+ return n_rx_packets;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (dpdk_input_node) = {
+ .function = dpdk_input,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .name = "dpdk-input",
+ .sibling_of = "device-input",
+
+ /* Will be enabled if/when hardware is detected. */
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ .format_buffer = format_ethernet_header_with_length,
+ .format_trace = format_dpdk_rx_dma_trace,
+
+ .n_errors = DPDK_N_ERROR,
+ .error_strings = dpdk_error_strings,
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (dpdk_input_node, dpdk_input);
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/qos_doc.md b/src/vnet/devices/dpdk/qos_doc.md
new file mode 100644
index 00000000000..9bd0659d616
--- /dev/null
+++ b/src/vnet/devices/dpdk/qos_doc.md
@@ -0,0 +1,404 @@
+# QoS Hierarchical Scheduler {#qos_doc}
+
+The Quality-of-Service (QoS) scheduler performs egress-traffic management by
+prioritizing the transmission of packets belonging to different service types
+and subscribers based on their Service Level Agreements (SLAs). The QoS
+scheduler can be enabled on one or more NIC output interfaces, depending upon
+the requirement.
+
+
+## Overview
+
+The QoS scheduler supports a number of scheduling and shaping levels which
+together construct a hierarchical tree. The first level in the hierarchy is
+the port (i.e. the physical interface), which constitutes the root node of the
+tree. The next level is the subport, which represents a group of
+users/subscribers. The individual user/subscriber is represented by a pipe at
+the next level. Each user can carry different traffic types, distinguished by
+specific loss-rate, jitter, and latency requirements. These traffic types are
+represented at the next level in the form of different traffic classes. The
+last level contains a number of queues, which are grouped together to host the
+packets of a specific traffic class.
+
+The QoS scheduler implementation requires flow classification, enqueue and
+dequeue operations. Flow classification is a mandatory stage for HQoS, where
+incoming packets are classified by mapping packet field information to a
+5-tuple (HQoS subport, pipe, traffic class, queue within traffic class, and
+color) and storing that information in the mbuf sched field. The enqueue
+operation uses this information to determine the queue for storing the packet;
+at this stage, if the specific queue is full, QoS drops the packet. The
+dequeue operation consists of scheduling the packet based on its length and
+available credits, and handing over the scheduled packet to the output
+interface.
+
+For more information on the QoS scheduler, please refer to the DPDK
+Programmer's Guide: http://dpdk.org/doc/guides/prog_guide/qos_framework.html
+
+
+### QoS Scheduler Parameters
+
+The following illustrates the default HQoS configuration for each 10GbE output
+port:
+
+Single subport (subport 0):
+ - Subport rate set to 100% of port rate
+ - Each of the 4 traffic classes has rate set to 100% of port rate
+
+4K pipes per subport 0 (pipes 0 .. 4095) with identical configuration:
+ - Pipe rate set to 1/4K of port rate
+ - Each of the 4 traffic classes has rate set to 100% of pipe rate
+ - Within each traffic class, the byte-level WRR weights for the 4 queues are set to 1:1:1:1
+
+
+#### Port configuration
+
+```
+port {
+ rate 1250000000 /* Assuming 10GbE port */
+ frame_overhead 24 /* Overhead fields per Ethernet frame:
+ * 7B (Preamble) +
+ * 1B (Start of Frame Delimiter (SFD)) +
+ * 4B (Frame Check Sequence (FCS)) +
+ * 12B (Inter Frame Gap (IFG))
+ */
+ mtu 1522 /* Assuming Ethernet/IPv4 pkt (FCS not included) */
+ n_subports_per_port 1 /* Number of subports per output interface */
+ n_pipes_per_subport 4096 /* Number of pipes (users/subscribers) */
+ queue_sizes 64 64 64 64 /* Packet queue size for each traffic class.
+ * All queues within the same pipe traffic class
+ * have the same size. Queues from different
+ * pipes serving the same traffic class have
+ * the same size. */
+}
+```
+
+
+#### Subport configuration
+
+```
+subport 0 {
+ tb_rate 1250000000 /* Subport level token bucket rate (bytes per second) */
+ tb_size 1000000 /* Subport level token bucket size (bytes) */
+ tc0_rate 1250000000 /* Subport level token bucket rate for traffic class 0 (bytes per second) */
+ tc1_rate 1250000000 /* Subport level token bucket rate for traffic class 1 (bytes per second) */
+ tc2_rate 1250000000 /* Subport level token bucket rate for traffic class 2 (bytes per second) */
+ tc3_rate 1250000000 /* Subport level token bucket rate for traffic class 3 (bytes per second) */
+ tc_period 10 /* Time interval for refilling the token bucket associated with traffic class (Milliseconds) */
+ pipe 0 4095 profile 0 /* pipes (users/subscribers) configured with pipe profile 0 */
+}
+```
+
+
+#### Pipe configuration
+
+```
+pipe_profile 0 {
+ tb_rate 305175 /* Pipe level token bucket rate (bytes per second) */
+ tb_size 1000000 /* Pipe level token bucket size (bytes) */
+ tc0_rate 305175 /* Pipe level token bucket rate for traffic class 0 (bytes per second) */
+ tc1_rate 305175 /* Pipe level token bucket rate for traffic class 1 (bytes per second) */
+ tc2_rate 305175 /* Pipe level token bucket rate for traffic class 2 (bytes per second) */
+ tc3_rate 305175 /* Pipe level token bucket rate for traffic class 3 (bytes per second) */
+ tc_period 40 /* Time interval for refilling the token bucket associated with traffic class at pipe level (Milliseconds) */
+ tc3_oversubscription_weight 1 /* Weight traffic class 3 oversubscription */
+ tc0_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 0 */
+ tc1_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 1 */
+ tc2_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 2 */
+ tc3_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 3 */
+}
+```
+
+
+#### Random Early Detection (RED) parameters per traffic class and color (Green / Yellow / Red)
+
+```
+red {
+ tc0_wred_min 48 40 32 /* Minimum threshold for traffic class 0 queue (min_th) in number of packets */
+ tc0_wred_max 64 64 64 /* Maximum threshold for traffic class 0 queue (max_th) in number of packets */
+ tc0_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 0 queue (maxp = 1 / maxp_inv) */
+ tc0_wred_weight 9 9 9 /* Traffic Class 0 queue weight */
+ tc1_wred_min 48 40 32 /* Minimum threshold for traffic class 1 queue (min_th) in number of packets */
+ tc1_wred_max 64 64 64 /* Maximum threshold for traffic class 1 queue (max_th) in number of packets */
+ tc1_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 1 queue (maxp = 1 / maxp_inv) */
+ tc1_wred_weight 9 9 9 /* Traffic Class 1 queue weight */
+ tc2_wred_min 48 40 32 /* Minimum threshold for traffic class 2 queue (min_th) in number of packets */
+ tc2_wred_max 64 64 64 /* Maximum threshold for traffic class 2 queue (max_th) in number of packets */
+ tc2_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 2 queue (maxp = 1 / maxp_inv) */
+ tc2_wred_weight 9 9 9 /* Traffic Class 2 queue weight */
+ tc3_wred_min 48 40 32 /* Minimum threshold for traffic class 3 queue (min_th) in number of packets */
+ tc3_wred_max 64 64 64 /* Maximum threshold for traffic class 3 queue (max_th) in number of packets */
+ tc3_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 3 queue (maxp = 1 / maxp_inv) */
+ tc3_wred_weight 9 9 9 /* Traffic Class 3 queue weight */
+}
+```
+
+
+### DPDK QoS Scheduler Integration in VPP
+
+The Hierarchical Quality-of-Service (HQoS) scheduler object can be seen as
+part of the logical NIC output interface. To enable HQoS on a specific output
+interface, the vpp startup.conf file has to be configured accordingly. The
+output interface that requires HQoS should have the "hqos" parameter specified
+in the dpdk section. Another optional parameter, "hqos-thread", can be used to
+associate the output interface with a specific HQoS thread. In the cpu section
+of the config file, "corelist-hqos-threads" is introduced to assign logical
+cpu cores to run the HQoS threads. An HQoS thread can run multiple HQoS
+objects, each associated with a different output interface. Instead of writing
+packets to the NIC TX queue directly, worker threads write the packets to
+software queues. The HQoS threads read the software queues, enqueue the
+packets to HQoS objects, dequeue packets from the HQoS objects, and write them
+to the NIC output interfaces. The worker threads need to be able to send
+packets to any output interface; therefore, each HQoS object associated with a
+NIC output interface should have as many software queues as there are worker
+threads.
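+
+The resulting data path can be pictured with the simplified, illustrative
+sketch below: one HQoS thread drains the per-worker software queues into its
+scheduler object and hands the scheduled packets to the NIC TX queue. This is
+not the actual hqos.c code; the software-queue type (rte_ring), the burst size
+and the port/queue identifiers are assumptions, and the signatures follow
+DPDK 16.11.
+
+```
+#include <rte_mbuf.h>
+#include <rte_ring.h>
+#include <rte_sched.h>
+#include <rte_ethdev.h>
+
+#define HQOS_BURST 32
+
+/* One iteration of an HQoS thread servicing a single output interface.
+ * swq[] are the per-worker software queues, sched is the rte_sched port,
+ * port_id/queue_id identify the NIC TX queue. */
+static void
+hqos_thread_iteration (struct rte_ring **swq, int n_workers,
+                       struct rte_sched_port *sched,
+                       uint8_t port_id, uint16_t queue_id)
+{
+  struct rte_mbuf *pkts[HQOS_BURST];
+  unsigned n;
+  int i, n_deq;
+
+  /* Drain each worker's software queue into the scheduler. */
+  for (i = 0; i < n_workers; i++)
+    {
+      n = rte_ring_sc_dequeue_burst (swq[i], (void **) pkts, HQOS_BURST);
+      if (n)
+        rte_sched_port_enqueue (sched, pkts, n);
+    }
+
+  /* Let the scheduler pick packets and hand them to the NIC. */
+  n_deq = rte_sched_port_dequeue (sched, pkts, HQOS_BURST);
+  if (n_deq > 0)
+    rte_eth_tx_burst (port_id, queue_id, pkts, (uint16_t) n_deq);
+}
+```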
+
+The following illustrates a sample startup configuration file with 4 worker
+threads feeding 2 HQoS threads, each handling the QoS scheduler for one output
+interface.
+
+```
+dpdk {
+ socket-mem 16384,16384
+
+ dev 0000:02:00.0 {
+ num-rx-queues 2
+ hqos
+ }
+ dev 0000:06:00.0 {
+ num-rx-queues 2
+ hqos
+ }
+
+ num-mbufs 1000000
+}
+
+cpu {
+ main-core 0
+ corelist-workers 1, 2, 3, 4
+ corelist-hqos-threads 5, 6
+}
+```
+
+
+### QoS scheduler CLI Commands
+
+Each QoS scheduler instance is initialised with the default parameters
+required to configure the HQoS port, subport, pipe and queues. Some of the
+parameters can be reconfigured at run time through CLI commands.
+
+
+#### Configuration
+
+The following commands can be used to configure QoS scheduler parameters.
+
+The command below can be used to set the subport level parameters such as
+token bucket rate (bytes per second), token bucket size (bytes), traffic
+class rates (bytes per second) and token update period (milliseconds).
+
+```
+set dpdk interface hqos subport <if-name> subport <n> [rate <n>]
+ [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] [period <n>]
+```
+
+For setting the pipe profile, the following command can be used.
+
+```
+set dpdk interface hqos pipe <if-name> subport <n> pipe <n> profile <n>
+```
+
+To assign a QoS scheduler instance to a specific thread, the following command
+can be used.
+
+```
+set dpdk interface hqos placement <if-name> thread <n>
+```
+
+The command below is used to set the packet fields required for classifying
+the incoming packet. As a result of the classification process, packet field
+information will be mapped to a 5-tuple (subport, pipe, traffic class, queue
+within traffic class, color) and stored in the packet mbuf.
+
+```
+set dpdk interface hqos pktfield <if-name> id <n> offset <n> mask <n>
+```
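+
+For example, the following hypothetical invocation re-uses the packet field 2
+values from the default configuration shown by the show command further below,
+assuming that offset and mask correspond to the reported slab position and
+slab bitmask (the interface name is illustrative only):
+
+```
+set dpdk interface hqos pktfield TenGigabitEthernet2/0/0 id 2 offset 8 mask 0x00000000000000fc
+```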
+
+The DSCP table entries used for identifying the traffic class and queue can be set using the command below:
+
+```
+set dpdk interface hqos tctbl <if-name> entry <n> tc <n> queue <n>
+```
+
+
+#### Show Command
+
+The QoS scheduler configuration can be displayed using the command below.
+
+```
+ vpp# show dpdk interface hqos TenGigabitEthernet2/0/0
+ Thread:
+ Input SWQ size = 4096 packets
+ Enqueue burst size = 256 packets
+ Dequeue burst size = 220 packets
+ Packet field 0: slab position = 0, slab bitmask = 0x0000000000000000
+ Packet field 1: slab position = 40, slab bitmask = 0x0000000fff000000
+ Packet field 2: slab position = 8, slab bitmask = 0x00000000000000fc
+ Packet field 2 translation table:
+ [ 0 .. 15]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ [16 .. 31]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ [32 .. 47]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ [48 .. 63]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ Port:
+ Rate = 1250000000 bytes/second
+ MTU = 1514 bytes
+ Frame overhead = 24 bytes
+ Number of subports = 1
+ Number of pipes per subport = 4096
+ Packet queue size: TC0 = 64, TC1 = 64, TC2 = 64, TC3 = 64 packets
+ Number of pipe profiles = 1
+ Pipe profile 0:
+ Rate = 305175 bytes/second
+ Token bucket size = 1000000 bytes
+ Traffic class rate: TC0 = 305175, TC1 = 305175, TC2 = 305175, TC3 = 305175 bytes/second
+ TC period = 40 milliseconds
+ TC0 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+ TC1 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+ TC2 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+ TC3 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+```
+
+The QoS scheduler placement over the logical cpu cores can be displayed using
+the command below.
+
+```
+ vpp# show dpdk interface hqos placement
+ Thread 5 (vpp_hqos-threads_0 at lcore 5):
+ TenGigabitEthernet2/0/0 queue 0
+ Thread 6 (vpp_hqos-threads_1 at lcore 6):
+ TenGigabitEthernet4/0/1 queue 0
+```
+
+
+### QoS Scheduler Binary APIs
+
+This section explains the available binary APIs for configuring QoS scheduler
+parameters at run time.
+
+The following API can be used to set the pipe profile of a pipe that belongs
+to a given subport:
+
+```
+sw_interface_set_dpdk_hqos_pipe rx <intfc> | sw_if_index <id>
+ subport <subport-id> pipe <pipe-id> profile <profile-id>
+```
+
+The data structures used to set the pipe profile parameters are as follows:
+
+```
+ /** \\brief DPDK interface HQoS pipe profile set request
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param sw_if_index - the interface
+ @param subport - subport ID
+ @param pipe - pipe ID within its subport
+ @param profile - pipe profile ID
+ */
+ define sw_interface_set_dpdk_hqos_pipe {
+ u32 client_index;
+ u32 context;
+ u32 sw_if_index;
+ u32 subport;
+ u32 pipe;
+ u32 profile;
+ };
+
+ /** \\brief DPDK interface HQoS pipe profile set reply
+ @param context - sender context, to match reply w/ request
+ @param retval - request return code
+ */
+ define sw_interface_set_dpdk_hqos_pipe_reply {
+ u32 context;
+ i32 retval;
+ };
+```
+
+The following API can be used to set the subport level parameters, for
+example token bucket rate (bytes per second), token bucket size (bytes),
+traffic class rate (bytes per second) and token update period.
+
+```
+sw_interface_set_dpdk_hqos_subport rx <intfc> | sw_if_index <id>
+ subport <subport-id> [rate <n>] [bktsize <n>]
+ [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] [period <n>]
+```
+
+The data structures used to set the subport level parameters are as follows:
+
+```
+ /** \\brief DPDK interface HQoS subport parameters set request
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param sw_if_index - the interface
+ @param subport - subport ID
+ @param tb_rate - subport token bucket rate (measured in bytes/second)
+ @param tb_size - subport token bucket size (measured in credits)
+ @param tc_rate - subport traffic class 0 .. 3 rates (measured in bytes/second)
+ @param tc_period - enforcement period for rates (measured in milliseconds)
+ */
+ define sw_interface_set_dpdk_hqos_subport {
+ u32 client_index;
+ u32 context;
+ u32 sw_if_index;
+ u32 subport;
+ u32 tb_rate;
+ u32 tb_size;
+ u32 tc_rate[4];
+ u32 tc_period;
+ };
+
+ /** \\brief DPDK interface HQoS subport parameters set reply
+ @param context - sender context, to match reply w/ request
+ @param retval - request return code
+ */
+ define sw_interface_set_dpdk_hqos_subport_reply {
+ u32 context;
+ i32 retval;
+ };
+```
+
+The following API can be used to set a DSCP table entry. The DSCP table has
+64 entries that map the packet DSCP field onto a traffic class and HQoS
+input queue.
+
+```
+sw_interface_set_dpdk_hqos_tctbl rx <intfc> | sw_if_index <id>
+ entry <n> tc <n> queue <n>
+```
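+
+For example, a hypothetical entry mapping DSCP 46 (Expedited Forwarding) to
+traffic class 0, queue 0 (the interface name and the chosen mapping are for
+illustration only):
+
+```
+set dpdk interface hqos tctbl TenGigabitEthernet2/0/0 entry 46 tc 0 queue 0
+```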
+
+The data structures used for setting DSCP table entries are given below.
+
+```
+ /** \\brief DPDK interface HQoS tctbl entry set request
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param sw_if_index - the interface
+ @param entry - entry index ID
+ @param tc - traffic class (0 .. 3)
+ @param queue - traffic class queue (0 .. 3)
+ */
+ define sw_interface_set_dpdk_hqos_tctbl {
+ u32 client_index;
+ u32 context;
+ u32 sw_if_index;
+ u32 entry;
+ u32 tc;
+ u32 queue;
+ };
+
+ /** \\brief DPDK interface HQoS tctbl entry set reply
+ @param context - sender context, to match reply w/ request
+ @param retval - request return code
+ */
+ define sw_interface_set_dpdk_hqos_tctbl_reply {
+ u32 context;
+ i32 retval;
+ };
+```
diff --git a/src/vnet/devices/netmap/cli.c b/src/vnet/devices/netmap/cli.c
new file mode 100644
index 00000000000..68695250506
--- /dev/null
+++ b/src/vnet/devices/netmap/cli.c
@@ -0,0 +1,146 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+#include <stdint.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <vnet/devices/netmap/net_netmap.h>
+#include <vnet/devices/netmap/netmap.h>
+
+static clib_error_t *
+netmap_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ u8 *host_if_name = NULL;
+ u8 hwaddr[6];
+ u8 *hw_addr_ptr = 0;
+ int r;
+ u8 is_pipe = 0;
+ u8 is_master = 0;
+ u32 sw_if_index = ~0;
+
+ /* Get a line of input. */
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "name %s", &host_if_name))
+ ;
+ else
+ if (unformat
+ (line_input, "hw-addr %U", unformat_ethernet_address, hwaddr))
+ hw_addr_ptr = hwaddr;
+ else if (unformat (line_input, "pipe"))
+ is_pipe = 1;
+ else if (unformat (line_input, "master"))
+ is_master = 1;
+ else if (unformat (line_input, "slave"))
+ is_master = 0;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ if (host_if_name == NULL)
+ return clib_error_return (0, "missing host interface name");
+
+ r =
+ netmap_create_if (vm, host_if_name, hw_addr_ptr, is_pipe, is_master,
+ &sw_if_index);
+
+ if (r == VNET_API_ERROR_SYSCALL_ERROR_1)
+ return clib_error_return (0, "%s (errno %d)", strerror (errno), errno);
+
+ if (r == VNET_API_ERROR_INVALID_INTERFACE)
+ return clib_error_return (0, "Invalid interface name");
+
+ if (r == VNET_API_ERROR_SUBIF_ALREADY_EXISTS)
+ return clib_error_return (0, "Interface already exists");
+
+ vlib_cli_output (vm, "%U\n", format_vnet_sw_if_index_name, vnet_get_main (),
+ sw_if_index);
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (netmap_create_command, static) = {
+ .path = "create netmap",
+ .short_help = "create netmap name [<intf name>|valeXXX:YYY] "
+ "[hw-addr <mac>] [pipe] [master|slave]",
+ .function = netmap_create_command_fn,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+netmap_delete_command_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ u8 *host_if_name = NULL;
+
+ /* Get a line of input. */
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "name %s", &host_if_name))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ if (host_if_name == NULL)
+ return clib_error_return (0, "missing host interface name");
+
+ netmap_delete_if (vm, host_if_name);
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (netmap_delete_command, static) = {
+ .path = "delete netmap",
+ .short_help = "delete netmap name <interface name>",
+ .function = netmap_delete_command_fn,
+};
+/* *INDENT-ON* */
+
+clib_error_t *
+netmap_cli_init (vlib_main_t * vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (netmap_cli_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/netmap/device.c b/src/vnet/devices/netmap/device.c
new file mode 100644
index 00000000000..2152824f733
--- /dev/null
+++ b/src/vnet/devices/netmap/device.c
@@ -0,0 +1,261 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <stdint.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <vnet/devices/netmap/net_netmap.h>
+#include <vnet/devices/netmap/netmap.h>
+
+#define foreach_netmap_tx_func_error \
+_(NO_FREE_SLOTS, "no free tx slots") \
+_(PENDING_MSGS, "pending msgs in tx ring")
+
+typedef enum
+{
+#define _(f,s) NETMAP_TX_ERROR_##f,
+ foreach_netmap_tx_func_error
+#undef _
+ NETMAP_TX_N_ERROR,
+} netmap_tx_func_error_t;
+
+static char *netmap_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_netmap_tx_func_error
+#undef _
+};
+
+
+static u8 *
+format_netmap_device_name (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+ netmap_main_t *apm = &netmap_main;
+ netmap_if_t *nif = pool_elt_at_index (apm->interfaces, i);
+
+ s = format (s, "netmap-%s", nif->host_if_name);
+ return s;
+}
+
+static u8 *
+format_netmap_device (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ int verbose = va_arg (*args, int);
+ netmap_main_t *nm = &netmap_main;
+ netmap_if_t *nif = vec_elt_at_index (nm->interfaces, dev_instance);
+ uword indent = format_get_indent (s);
+
+ s = format (s, "NETMAP interface");
+ if (verbose)
+ {
+ s = format (s, "\n%U version %d flags 0x%x"
+ "\n%U region %u memsize 0x%x offset 0x%x"
+ "\n%U tx_slots %u rx_slots %u tx_rings %u rx_rings %u",
+ format_white_space, indent + 2,
+ nif->req->nr_version,
+ nif->req->nr_flags,
+ format_white_space, indent + 2,
+ nif->mem_region,
+ nif->req->nr_memsize,
+ nif->req->nr_offset,
+ format_white_space, indent + 2,
+ nif->req->nr_tx_slots,
+ nif->req->nr_rx_slots,
+ nif->req->nr_tx_rings, nif->req->nr_rx_rings);
+ }
+ return s;
+}
+
+static u8 *
+format_netmap_tx_trace (u8 * s, va_list * args)
+{
+ s = format (s, "Unimplemented...");
+ return s;
+}
+
+static uword
+netmap_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * frame)
+{
+ netmap_main_t *nm = &netmap_main;
+ u32 *buffers = vlib_frame_args (frame);
+ u32 n_left = frame->n_vectors;
+ f64 const time_constant = 1e3;
+ vnet_interface_output_runtime_t *rd = (void *) node->runtime_data;
+ netmap_if_t *nif = pool_elt_at_index (nm->interfaces, rd->dev_instance);
+ int cur_ring;
+
+ if (PREDICT_FALSE (nif->lockp != 0))
+ {
+ while (__sync_lock_test_and_set (nif->lockp, 1))
+ ;
+ }
+
+ cur_ring = nif->first_tx_ring;
+
+ while (n_left && cur_ring <= nif->last_tx_ring)
+ {
+ struct netmap_ring *ring = NETMAP_TXRING (nif->nifp, cur_ring);
+ int n_free_slots = nm_ring_space (ring);
+ uint cur = ring->cur;
+
+ if (nm_tx_pending (ring))
+ {
+ if (ioctl (nif->fd, NIOCTXSYNC, NULL) < 0)
+ clib_unix_warning ("NIOCTXSYNC");
+ clib_cpu_time_wait (time_constant);
+
+ if (nm_tx_pending (ring) && !n_free_slots)
+ {
+ cur_ring++;
+ continue;
+ }
+ }
+
+ while (n_left && n_free_slots)
+ {
+ vlib_buffer_t *b0 = 0;
+ u32 bi = buffers[0];
+ u32 len;
+ u32 offset = 0;
+ buffers++;
+
+ struct netmap_slot *slot = &ring->slot[cur];
+
+ do
+ {
+ b0 = vlib_get_buffer (vm, bi);
+ len = b0->current_length;
+ /* memcpy */
+ clib_memcpy ((u8 *) NETMAP_BUF (ring, slot->buf_idx) + offset,
+ vlib_buffer_get_current (b0), len);
+ offset += len;
+ }
+ while ((bi = b0->next_buffer));
+
+ slot->len = offset;
+ cur = (cur + 1) % ring->num_slots;
+ n_free_slots--;
+ n_left--;
+ }
+ CLIB_MEMORY_BARRIER ();
+ ring->head = ring->cur = cur;
+ }
+
+ if (n_left < frame->n_vectors)
+ ioctl (nif->fd, NIOCTXSYNC, NULL);
+
+ if (PREDICT_FALSE (nif->lockp != 0))
+ *nif->lockp = 0;
+
+ if (n_left)
+ vlib_error_count (vm, node->node_index,
+ (n_left ==
+ frame->n_vectors ? NETMAP_TX_ERROR_PENDING_MSGS :
+ NETMAP_TX_ERROR_NO_FREE_SLOTS), n_left);
+
+ vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
+ return frame->n_vectors;
+}
+
+static void
+netmap_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
+ u32 node_index)
+{
+ netmap_main_t *apm = &netmap_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ netmap_if_t *nif = pool_elt_at_index (apm->interfaces, hw->dev_instance);
+
+ /* Shut off redirection */
+ if (node_index == ~0)
+ {
+ nif->per_interface_next_index = node_index;
+ return;
+ }
+
+ nif->per_interface_next_index =
+ vlib_node_add_next (vlib_get_main (), netmap_input_node.index,
+ node_index);
+}
+
+static void
+netmap_clear_hw_interface_counters (u32 instance)
+{
+ /* Nothing for now */
+}
+
+static clib_error_t *
+netmap_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ netmap_main_t *apm = &netmap_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ netmap_if_t *nif = pool_elt_at_index (apm->interfaces, hw->dev_instance);
+ u32 hw_flags;
+
+ nif->is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+
+ if (nif->is_admin_up)
+ hw_flags = VNET_HW_INTERFACE_FLAG_LINK_UP;
+ else
+ hw_flags = 0;
+
+ vnet_hw_interface_set_flags (vnm, hw_if_index, hw_flags);
+
+ return 0;
+}
+
+static clib_error_t *
+netmap_subif_add_del_function (vnet_main_t * vnm,
+ u32 hw_if_index,
+ struct vnet_sw_interface_t *st, int is_add)
+{
+ /* Nothing for now */
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VNET_DEVICE_CLASS (netmap_device_class) = {
+ .name = "netmap",
+ .tx_function = netmap_interface_tx,
+ .format_device_name = format_netmap_device_name,
+ .format_device = format_netmap_device,
+ .format_tx_trace = format_netmap_tx_trace,
+ .tx_function_n_errors = NETMAP_TX_N_ERROR,
+ .tx_function_error_strings = netmap_tx_func_error_strings,
+ .rx_redirect_to_node = netmap_set_interface_next_node,
+ .clear_counters = netmap_clear_hw_interface_counters,
+ .admin_up_down_function = netmap_interface_admin_up_down,
+ .subif_add_del_function = netmap_subif_add_del_function,
+};
+
+VLIB_DEVICE_TX_FUNCTION_MULTIARCH(netmap_device_class,
+ netmap_interface_tx)
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/netmap/net_netmap.h b/src/vnet/devices/netmap/net_netmap.h
new file mode 100644
index 00000000000..fd4253b7c0c
--- /dev/null
+++ b/src/vnet/devices/netmap/net_netmap.h
@@ -0,0 +1,650 @@
+/*
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD: head/sys/net/netmap.h 251139 2013-05-30 14:07:14Z luigi $
+ *
+ * Definitions of constants and the structures used by the netmap
+ * framework, for the part visible to both kernel and userspace.
+ * Detailed info on netmap is available with "man netmap" or at
+ *
+ * http://info.iet.unipi.it/~luigi/netmap/
+ *
+ * This API is also used to communicate with the VALE software switch
+ */
+
+#ifndef _NET_NETMAP_H_
+#define _NET_NETMAP_H_
+
+#define NETMAP_API 11 /* current API version */
+
+#define NETMAP_MIN_API 11 /* min and max versions accepted */
+#define NETMAP_MAX_API 15
+/*
+ * Some fields should be cache-aligned to reduce contention.
+ * The alignment is architecture and OS dependent, but rather than
+ * digging into OS headers to find the exact value we use an estimate
+ * that should cover most architectures.
+ */
+#define NM_CACHE_ALIGN 128
+
+/*
+ * --- Netmap data structures ---
+ *
+ * The userspace data structures used by netmap are shown below.
+ * They are allocated by the kernel and mmap()ed by userspace threads.
+ * Pointers are implemented as memory offsets or indexes,
+ * so that they can be easily dereferenced in kernel and userspace.
+
+ KERNEL (opaque, obviously)
+
+ ====================================================================
+ |
+ USERSPACE | struct netmap_ring
+ +---->+---------------+
+ / | head,cur,tail |
+ struct netmap_if (nifp, 1 per fd) / | buf_ofs |
+ +---------------+ / | other fields |
+ | ni_tx_rings | / +===============+
+ | ni_rx_rings | / | buf_idx, len | slot[0]
+ | | / | flags, ptr |
+ | | / +---------------+
+ +===============+ / | buf_idx, len | slot[1]
+ | txring_ofs[0] | (rel.to nifp)--' | flags, ptr |
+ | txring_ofs[1] | +---------------+
+ (tx+1 entries) (num_slots entries)
+ | txring_ofs[t] | | buf_idx, len | slot[n-1]
+ +---------------+ | flags, ptr |
+ | rxring_ofs[0] | +---------------+
+ | rxring_ofs[1] |
+ (rx+1 entries)
+ | rxring_ofs[r] |
+ +---------------+
+
+ * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to
+ * a file descriptor, the mmap()ed region contains a (logically readonly)
+ * struct netmap_if pointing to struct netmap_ring's.
+ *
+ * There is one netmap_ring per physical NIC ring, plus one tx/rx ring
+ * pair attached to the host stack (this pair is unused for non-NIC ports).
+ *
+ * All physical/host stack ports share the same memory region,
+ * so that zero-copy can be implemented between them.
+ * VALE switch ports instead have separate memory regions.
+ *
+ * The netmap_ring is the userspace-visible replica of the NIC ring.
+ * Each slot has the index of a buffer (MTU-sized and residing in the
+ * mmapped region), its length and some flags. An extra 64-bit pointer
+ * is provided for user-supplied buffers in the tx path.
+ *
+ * In user space, the buffer address is computed as
+ * (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE
+ *
+ * Added in NETMAP_API 11:
+ *
+ * + NIOCREGIF can request the allocation of extra spare buffers from
+ * the same memory pool. The desired number of buffers must be in
+ * nr_arg3. The ioctl may return fewer buffers, depending on memory
+ * availability. nr_arg3 will return the actual value, and, once
+ * mapped, nifp->ni_bufs_head will be the index of the first buffer.
+ *
+ * The buffers are linked to each other using the first uint32_t
+ * as the index. On close, ni_bufs_head must point to the list of
+ * buffers to be released.
+ *
+ * + NIOCREGIF can request space for extra rings (and buffers)
+ * allocated in the same memory space. The number of extra rings
+ * is in nr_arg1, and is advisory. This is a no-op on NICs where
+ * the size of the memory space is fixed.
+ *
+ * + NIOCREGIF can attach to PIPE rings sharing the same memory
+ * space with a parent device. The ifname indicates the parent device,
+ * which must already exist. Flags in nr_flags indicate if we want to
+ * bind the master or slave side, the index (from nr_ringid)
+ * is just a cookie and does not need to be sequential.
+ *
+ * + NIOCREGIF can also attach to 'monitor' rings that replicate
+ * the content of specific rings, also from the same memory space.
+ *
+ * Extra flags in nr_flags support the above functions.
+ * Application libraries may use the following naming scheme:
+ * netmap:foo all NIC ring pairs
+ * netmap:foo^ only host ring pair
+ * netmap:foo+ all NIC ring + host ring pairs
+ * netmap:foo-k the k-th NIC ring pair
+ * netmap:foo{k PIPE ring pair k, master side
+ * netmap:foo}k PIPE ring pair k, slave side
+ */
+
+/*
+ * struct netmap_slot is a buffer descriptor
+ */
+struct netmap_slot {
+ uint32_t buf_idx; /* buffer index */
+ uint16_t len; /* length for this slot */
+ uint16_t flags; /* buf changed, etc. */
+ uint64_t ptr; /* pointer for indirect buffers */
+};
+
+/*
+ * The following flags control how the slot is used
+ */
+
+#define NS_BUF_CHANGED 0x0001 /* buf_idx changed */
+ /*
+ * must be set whenever buf_idx is changed (as it might be
+ * necessary to recompute the physical address and mapping)
+ *
+ * It is also set by the kernel whenever the buf_idx is
+ * changed internally (e.g., by pipes). Applications may
+ * use this information to know when they can reuse the
+ * contents of previously prepared buffers.
+ */
+
+#define NS_REPORT 0x0002 /* ask the hardware to report results */
+ /*
+ * Request notification when slot is used by the hardware.
+ * Normally transmit completions are handled lazily and
+ * may be unreported. This flag lets us know when a slot
+ * has been sent (e.g. to terminate the sender).
+ */
+
+#define NS_FORWARD 0x0004 /* pass packet 'forward' */
+ /*
+ * (Only for physical ports, rx rings with NR_FORWARD set).
+ * Slot released to the kernel (i.e. before ring->head) with
+ * this flag set are passed to the peer ring (host/NIC),
+ * thus restoring the host-NIC connection for these slots.
+ * This supports efficient traffic monitoring or firewalling.
+ */
+
+#define NS_NO_LEARN 0x0008 /* disable bridge learning */
+ /*
+ * On a VALE switch, do not 'learn' the source port for
+ * this buffer.
+ */
+
+#define NS_INDIRECT 0x0010 /* userspace buffer */
+ /*
+ * (VALE tx rings only) data is in a userspace buffer,
+ * whose address is in the 'ptr' field in the slot.
+ */
+
+#define NS_MOREFRAG 0x0020 /* packet has more fragments */
+ /*
+ * (VALE ports only)
+ * Set on all but the last slot of a multi-segment packet.
+ * The 'len' field refers to the individual fragment.
+ */
+
+#define NS_PORT_SHIFT 8
+#define NS_PORT_MASK (0xff << NS_PORT_SHIFT)
+ /*
+ * The high 8 bits of the flag, if not zero, indicate the
+ * destination port for the VALE switch, overriding
+ * the lookup table.
+ */
+
+#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff)
+ /*
+ * (VALE rx rings only) the high 8 bits
+ * are the number of fragments.
+ */
+
+
+/*
+ * struct netmap_ring
+ *
+ * Netmap representation of a TX or RX ring (also known as "queue").
+ * This is a queue implemented as a fixed-size circular array.
+ * At the software level the important fields are: head, cur, tail.
+ *
+ * In TX rings:
+ *
+ * head first slot available for transmission.
+ * cur wakeup point. select() and poll() will unblock
+ * when 'tail' moves past 'cur'
+ * tail (readonly) first slot reserved to the kernel
+ *
+ * [head .. tail-1] can be used for new packets to send;
+ * 'head' and 'cur' must be incremented as slots are filled
+ * with new packets to be sent;
+ * 'cur' can be moved further ahead if we need more space
+ * for new transmissions. XXX todo (2014-03-12)
+ *
+ * In RX rings:
+ *
+ * head first valid received packet
+ * cur wakeup point. select() and poll() will unblock
+ * when 'tail' moves past 'cur'
+ * tail (readonly) first slot reserved to the kernel
+ *
+ * [head .. tail-1] contain received packets;
+ * 'head' and 'cur' must be incremented as slots are consumed
+ * and can be returned to the kernel;
+ * 'cur' can be moved further ahead if we want to wait for
+ * new packets without returning the previous ones.
+ *
+ * DATA OWNERSHIP/LOCKING:
+ * The netmap_ring, and all slots and buffers in the range
+ * [head .. tail-1] are owned by the user program;
+ * the kernel only accesses them during a netmap system call
+ * and in the user thread context.
+ *
+ * Other slots and buffers are reserved for use by the kernel
+ */
+struct netmap_ring {
+ /*
+ * buf_ofs is meant to be used through macros.
+ * It contains the offset of the buffer region from this
+ * descriptor.
+ */
+ const int64_t buf_ofs;
+ const uint32_t num_slots; /* number of slots in the ring. */
+ const uint32_t nr_buf_size;
+ const uint16_t ringid;
+ const uint16_t dir; /* 0: tx, 1: rx */
+
+ uint32_t head; /* (u) first user slot */
+ uint32_t cur; /* (u) wakeup point */
+ uint32_t tail; /* (k) first kernel slot */
+
+ uint32_t flags;
+
+ struct timeval ts; /* (k) time of last *sync() */
+
+ /* opaque room for a mutex or similar object */
+#if !defined(_WIN32) || defined(__CYGWIN__)
+ uint8_t __attribute__((__aligned__(NM_CACHE_ALIGN))) sem[128];
+#else
+ uint8_t __declspec(align(NM_CACHE_ALIGN)) sem[128];
+#endif
+
+ /* the slots follow. This struct has variable size */
+ struct netmap_slot slot[0]; /* array of slots. */
+};
+
+
+/*
+ * RING FLAGS
+ */
+#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */
+ /*
+ * updates the 'ts' field on each netmap syscall. This saves
+	 * a separate gettimeofday(), and is not much worse than
+ * software timestamps generated in the interrupt handler.
+ */
+
+#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */
+ /*
+ * Enables the NS_FORWARD slot flag for the ring.
+ */
+
+
+/*
+ * Netmap representation of an interface and its queue(s).
+ * This is initialized by the kernel when binding a file
+ * descriptor to a port, and should be considered as readonly
+ * by user programs. The kernel never uses it.
+ *
+ * There is one netmap_if for each file descriptor on which we want
+ * to select/poll.
+ * select/poll operates on one or all pairs depending on the value of
+ * nmr_queueid passed on the ioctl.
+ */
+struct netmap_if {
+ char ni_name[IFNAMSIZ]; /* name of the interface. */
+ const uint32_t ni_version; /* API version, currently unused */
+ const uint32_t ni_flags; /* properties */
+#define NI_PRIV_MEM 0x1 /* private memory region */
+
+ /*
+ * The number of packet rings available in netmap mode.
+ * Physical NICs can have different numbers of tx and rx rings.
+ * Physical NICs also have a 'host' ring pair.
+ * Additionally, clients can request additional ring pairs to
+ * be used for internal communication.
+ */
+ const uint32_t ni_tx_rings; /* number of HW tx rings */
+ const uint32_t ni_rx_rings; /* number of HW rx rings */
+
+ uint32_t ni_bufs_head; /* head index for extra bufs */
+ uint32_t ni_spare1[5];
+ /*
+ * The following array contains the offset of each netmap ring
+ * from this structure, in the following order:
+ * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings;
+	 * NIC rx rings (ni_rx_rings); host rx ring (1); extra rx rings.
+ *
+ * The area is filled up by the kernel on NIOCREGIF,
+ * and then only read by userspace code.
+ */
+ const ssize_t ring_ofs[0];
+};
+
+
+#ifndef NIOCREGIF
+/*
+ * ioctl names and related fields
+ *
+ * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
+ * whose identity is set in NIOCREGIF through nr_ringid.
+ * These are non blocking and take no argument.
+ *
+ * NIOCGINFO takes a struct ifreq, the interface name is the input,
+ * the outputs are number of queues and number of descriptor
+ * for each queue (useful to set number of threads etc.).
+ * The info returned is only advisory and may change before
+ * the interface is bound to a file descriptor.
+ *
+ * NIOCREGIF takes an interface name within a struct nmre,
+ * and activates netmap mode on the interface (if possible).
+ *
+ * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we
+ * can pass it down to other NIC-related ioctls.
+ *
+ * The actual argument (struct nmreq) has a number of options to request
+ * different functions.
+ * The following are used in NIOCREGIF when nr_cmd == 0:
+ *
+ * nr_name (in)
+ * The name of the port (em0, valeXXX:YYY, etc.)
+ * limited to IFNAMSIZ for backward compatibility.
+ *
+ * nr_version (in/out)
+ * Must match NETMAP_API as used in the kernel, error otherwise.
+ * Always returns the desired value on output.
+ *
+ * nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings (in/out)
+ * On input, non-zero values may be used to reconfigure the port
+ * according to the requested values, but this is not guaranteed.
+ * On output the actual values in use are reported.
+ *
+ * nr_ringid (in)
+ * Indicates how rings should be bound to the file descriptors.
+ * If nr_flags != 0, then the low bits (in NETMAP_RING_MASK)
+ * are used to indicate the ring number, and nr_flags specifies
+ * the actual rings to bind. NETMAP_NO_TX_POLL is unaffected.
+ *
+ * NOTE: THE FOLLOWING (nr_flags == 0) IS DEPRECATED:
+ * If nr_flags == 0, NETMAP_HW_RING and NETMAP_SW_RING control
+ * the binding as follows:
+ * 0 (default) binds all physical rings
+ * NETMAP_HW_RING | ring number binds a single ring pair
+ * NETMAP_SW_RING binds only the host tx/rx rings
+ *
+ * NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push
+ * packets on tx rings only if POLLOUT is set.
+ * The default is to push any pending packet.
+ *
+ * NETMAP_DO_RX_POLL can be OR-ed to make select()/poll() release
+ * packets on rx rings also when POLLIN is NOT set.
+ * The default is to touch the rx ring only with POLLIN.
+ * Note that this is the opposite of TX because it
+ * reflects the common usage.
+ *
+ * NOTE: NETMAP_PRIV_MEM IS DEPRECATED, use nr_arg2 instead.
+ * NETMAP_PRIV_MEM is set on return for ports that do not use
+ * the global memory allocator.
+ * This information is not significant and applications
+ * should look at the region id in nr_arg2
+ *
+ * nr_flags is the recommended mode to indicate which rings should
+ * be bound to a file descriptor. Values are NR_REG_*
+ *
+ * nr_arg1 (in) The number of extra rings to be reserved.
+ * Especially when allocating a VALE port the system only
+ * allocates the amount of memory needed for the port.
+ * If more shared memory rings are desired (e.g. for pipes),
+ * the first invocation for the same basename/allocator
+ * should specify a suitable number. Memory cannot be
+ * extended after the first allocation without closing
+ * all ports on the same region.
+ *
+ * nr_arg2 (in/out) The identity of the memory region used.
+ * On input, 0 means the system decides autonomously,
+ * other values may try to select a specific region.
+ * On return the actual value is reported.
+ * Region '1' is the global allocator, normally shared
+ * by all interfaces. Other values are private regions.
+ * If two ports use the same region, zero-copy is possible.
+ *
+ * nr_arg3 (in/out) number of extra buffers to be allocated.
+ *
+ *
+ *
+ * nr_cmd (in) if non-zero indicates a special command:
+ * NETMAP_BDG_ATTACH and nr_name = vale*:ifname
+ * attaches the NIC to the switch; nr_ringid specifies
+ * which rings to use. Used by vale-ctl -a ...
+ * nr_arg1 = NETMAP_BDG_HOST also attaches the host port
+ * as in vale-ctl -h ...
+ *
+ * NETMAP_BDG_DETACH and nr_name = vale*:ifname
+ * disconnects a previously attached NIC.
+ * Used by vale-ctl -d ...
+ *
+ * NETMAP_BDG_LIST
+ * list the configuration of VALE switches.
+ *
+ * NETMAP_BDG_VNET_HDR
+ * Set the virtio-net header length used by the client
+ * of a VALE switch port.
+ *
+ * NETMAP_BDG_NEWIF
+ * create a persistent VALE port with name nr_name.
+ * Used by vale-ctl -n ...
+ *
+ * NETMAP_BDG_DELIF
+ * delete a persistent VALE port. Used by vale-ctl -d ...
+ *
+ * nr_arg1, nr_arg2, nr_arg3 (in/out) command specific
+ *
+ *
+ *
+ */
+
+
+/*
+ * struct nmreq overlays a struct ifreq (just the name)
+ */
+struct nmreq {
+ char nr_name[IFNAMSIZ];
+ uint32_t nr_version; /* API version */
+ uint32_t nr_offset; /* nifp offset in the shared region */
+ uint32_t nr_memsize; /* size of the shared region */
+ uint32_t nr_tx_slots; /* slots in tx rings */
+ uint32_t nr_rx_slots; /* slots in rx rings */
+ uint16_t nr_tx_rings; /* number of tx rings */
+ uint16_t nr_rx_rings; /* number of rx rings */
+
+ uint16_t nr_ringid; /* ring(s) we care about */
+#define NETMAP_HW_RING 0x4000 /* single NIC ring pair */
+#define NETMAP_SW_RING 0x2000 /* only host ring pair */
+
+#define NETMAP_RING_MASK 0x0fff /* the ring number */
+
+#define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */
+
+#define NETMAP_DO_RX_POLL 0x8000 /* DO automatic rxsync on poll */
+
+ uint16_t nr_cmd;
+#define NETMAP_BDG_ATTACH 1 /* attach the NIC */
+#define NETMAP_BDG_DETACH 2 /* detach the NIC */
+#define NETMAP_BDG_REGOPS 3 /* register bridge callbacks */
+#define NETMAP_BDG_LIST 4 /* get bridge's info */
+#define NETMAP_BDG_VNET_HDR 5 /* set the port virtio-net-hdr length */
+#define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */
+#define NETMAP_BDG_NEWIF 6 /* create a virtual port */
+#define NETMAP_BDG_DELIF 7 /* destroy a virtual port */
+#define NETMAP_PT_HOST_CREATE 8 /* create ptnetmap kthreads */
+#define NETMAP_PT_HOST_DELETE 9 /* delete ptnetmap kthreads */
+#define NETMAP_BDG_POLLING_ON 10 /* start polling kthread */
+#define NETMAP_BDG_POLLING_OFF 11 /* stop polling kthread */
+#define NETMAP_VNET_HDR_GET 12 /* get the port virtio-net-hdr length */
+ uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */
+#define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */
+
+ uint16_t nr_arg2;
+ uint32_t nr_arg3; /* req. extra buffers in NIOCREGIF */
+ uint32_t nr_flags;
+ /* various modes, extends nr_ringid */
+ uint32_t spare2[1];
+};
+
+#define NR_REG_MASK 0xf /* values for nr_flags */
+enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */
+ NR_REG_ALL_NIC = 1,
+ NR_REG_SW = 2,
+ NR_REG_NIC_SW = 3,
+ NR_REG_ONE_NIC = 4,
+ NR_REG_PIPE_MASTER = 5,
+ NR_REG_PIPE_SLAVE = 6,
+};
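To make the registration flow described above concrete, a minimal user-space sketch; it assumes the usual <fcntl.h>, <string.h>, <sys/ioctl.h> and <sys/mman.h> includes plus NETMAP_API from earlier in this header, omits most error handling, and example_open_port is a hypothetical name. netmap_create_if() in netmap.c later in this patch follows the same sequence:

    static struct netmap_if *
    example_open_port (const char *name, int *fd_out)
    {
      struct nmreq req;
      void *mem;
      int fd = open ("/dev/netmap", O_RDWR);

      memset (&req, 0, sizeof (req));
      strncpy (req.nr_name, name, sizeof (req.nr_name) - 1);
      req.nr_version = NETMAP_API;      /* must match the kernel API version */
      req.nr_flags = NR_REG_ALL_NIC;    /* bind all hardware ring pairs */
      if (ioctl (fd, NIOCREGIF, &req) < 0)
        return 0;                       /* registration failed */

      /* the kernel has filled nr_memsize and nr_offset */
      mem = mmap (NULL, req.nr_memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      *fd_out = fd;
      return (struct netmap_if *) ((char *) mem + req.nr_offset);
    }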
+/* monitor uses the NR_REG to select the rings to monitor */
+#define NR_MONITOR_TX 0x100
+#define NR_MONITOR_RX 0x200
+#define NR_ZCOPY_MON 0x400
+/* request exclusive access to the selected rings */
+#define NR_EXCLUSIVE 0x800
+/* request ptnetmap host support */
+#define NR_PASSTHROUGH_HOST NR_PTNETMAP_HOST /* deprecated */
+#define NR_PTNETMAP_HOST 0x1000
+#define NR_RX_RINGS_ONLY 0x2000
+#define NR_TX_RINGS_ONLY 0x4000
+/* Applications set this flag if they are able to deal with virtio-net headers,
+ * that is send/receive frames that start with a virtio-net header.
+ * If not set, NIOCREGIF will fail with netmap ports that require applications
+ * to use those headers. If the flag is set, the application can use the
+ * NETMAP_VNET_HDR_GET command to figure out the header length. */
+#define NR_ACCEPT_VNET_HDR 0x8000
+
+
+/*
+ * Windows does not have _IOWR(). _IO(), _IOW() and _IOR() are defined
+ * in ws2def.h, but it is not clear they are in the form we need.
+ * XXX so we redefine them in a way that is convenient to use
+ * for DeviceIoControl signatures.
+ */
+#ifdef _WIN32
+#undef _IO // ws2def.h
+#define _WIN_NM_IOCTL_TYPE 40000
+#define _IO(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \
+ METHOD_BUFFERED, FILE_ANY_ACCESS )
+#define _IO_direct(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \
+ METHOD_OUT_DIRECT, FILE_ANY_ACCESS )
+
+#define _IOWR(_c, _n, _s) _IO(_c, _n)
+
+/* We have some internal sysctl in addition to the externally visible ones */
+#define NETMAP_MMAP _IO_direct('i', 160) // note METHOD_OUT_DIRECT
+#define NETMAP_POLL _IO('i', 162)
+
+/* and also two setsockopt for sysctl emulation */
+#define NETMAP_SETSOCKOPT _IO('i', 140)
+#define NETMAP_GETSOCKOPT _IO('i', 141)
+
+
+//These linknames are for the Netmap Core Driver
+#define NETMAP_NT_DEVICE_NAME L"\\Device\\NETMAP"
+#define NETMAP_DOS_DEVICE_NAME L"\\DosDevices\\netmap"
+
+//Definition of a structure used to pass a virtual address within an IOCTL
+typedef struct _MEMORY_ENTRY {
+ PVOID pUsermodeVirtualAddress;
+} MEMORY_ENTRY, *PMEMORY_ENTRY;
+
+typedef struct _POLL_REQUEST_DATA {
+ int events;
+ int timeout;
+ int revents;
+} POLL_REQUEST_DATA;
+
+#endif /* _WIN32 */
+
+/*
+ * FreeBSD uses the size value embedded in the _IOWR to determine
+ * how much to copy in/out. So we need it to match the actual
+ * data structure we pass. We put some spares in the structure
+ * to ease compatibility with other versions
+ */
+#define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */
+#define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */
+#define NIOCTXSYNC _IO('i', 148) /* sync tx queues */
+#define NIOCRXSYNC _IO('i', 149) /* sync rx queues */
+#define NIOCCONFIG _IOWR('i',150, struct nm_ifreq) /* for ext. modules */
+#endif /* !NIOCREGIF */
+
+
+/*
+ * Helper functions for kernel and userspace
+ */
+
+/*
+ * check if space is available in the ring.
+ */
+static inline int
+nm_ring_empty(struct netmap_ring *ring)
+{
+ return (ring->cur == ring->tail);
+}
+
+/*
+ * Opaque structure that is passed to an external kernel
+ * module via ioctl(fd, NIOCCONFIG, req) for a user-owned
+ * bridge port (at this point ephemeral VALE interface).
+ */
+#define NM_IFRDATA_LEN 256
+struct nm_ifreq {
+ char nifr_name[IFNAMSIZ];
+ char data[NM_IFRDATA_LEN];
+};
+
+/*
+ * netmap kernel thread configuration
+ */
+/* bhyve/vmm.ko MSIX parameters for IOCTL */
+struct ptn_vmm_ioctl_msix {
+ uint64_t msg;
+ uint64_t addr;
+};
+
+/* IOCTL parameters */
+struct nm_kth_ioctl {
+ u_long com;
+ /* TODO: use union */
+ union {
+ struct ptn_vmm_ioctl_msix msix;
+ } data;
+};
+
+/* Configuration of a ptnetmap ring */
+struct ptnet_ring_cfg {
+ uint64_t ioeventfd; /* eventfd in linux, tsleep() parameter in FreeBSD */
+ uint64_t irqfd; /* eventfd in linux, ioctl fd in FreeBSD */
+ struct nm_kth_ioctl ioctl; /* ioctl parameter to send irq (only used in bhyve/FreeBSD) */
+};
+#endif /* _NET_NETMAP_H_ */
diff --git a/src/vnet/devices/netmap/netmap.api b/src/vnet/devices/netmap/netmap.api
new file mode 100644
index 00000000000..377ccffda4c
--- /dev/null
+++ b/src/vnet/devices/netmap/netmap.api
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2015-2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** \brief Create netmap
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param netmap_if_name - interface name
+ @param hw_addr - interface MAC
+ @param use_random_hw_addr - use random generated MAC
+ @param is_pipe - is pipe
+ @param is_master - 0=slave, 1=master
+*/
+define netmap_create
+{
+ u32 client_index;
+ u32 context;
+
+ u8 netmap_if_name[64];
+ u8 hw_addr[6];
+ u8 use_random_hw_addr;
+ u8 is_pipe;
+ u8 is_master;
+};
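vppapigen generates a matching C type (vl_api_netmap_create_t, used by the handler in netmap_api.c later in this patch); a minimal sketch of filling a request, with message allocation, client_index/context and transport elided and the VALE port name purely illustrative:

    vl_api_netmap_create_t mp;

    memset (&mp, 0, sizeof (mp));
    strncpy ((char *) mp.netmap_if_name, "vale0:1", sizeof (mp.netmap_if_name) - 1);
    mp.use_random_hw_addr = 1;  /* let the data plane pick a MAC */
    mp.is_pipe = 0;
    mp.is_master = 0;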
+
+/** \brief Create netmap response
+ @param context - sender context, to match reply w/ request
+ @param retval - return value for request
+*/
+define netmap_create_reply
+{
+ u32 context;
+ i32 retval;
+};
+
+/** \brief Delete netmap
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param netmap_if_name - interface name
+*/
+define netmap_delete
+{
+ u32 client_index;
+ u32 context;
+
+ u8 netmap_if_name[64];
+};
+
+/** \brief Delete netmap response
+ @param context - sender context, to match reply w/ request
+ @param retval - return value for request
+*/
+define netmap_delete_reply
+{
+ u32 context;
+ i32 retval;
+};
+
+/*
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/netmap/netmap.c b/src/vnet/devices/netmap/netmap.c
new file mode 100644
index 00000000000..3bdb442dda2
--- /dev/null
+++ b/src/vnet/devices/netmap/netmap.c
@@ -0,0 +1,316 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <stdint.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <vnet/devices/netmap/net_netmap.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/netmap/netmap.h>
+
+static u32
+netmap_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi,
+ u32 flags)
+{
+ /* nothing for now */
+ return 0;
+}
+
+static clib_error_t *
+netmap_fd_read_ready (unix_file_t * uf)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ netmap_main_t *nm = &netmap_main;
+ u32 idx = uf->private_data;
+
+ nm->pending_input_bitmap =
+ clib_bitmap_set (nm->pending_input_bitmap, idx, 1);
+
+ /* Schedule the rx node */
+ vlib_node_set_interrupt_pending (vm, netmap_input_node.index);
+
+ return 0;
+}
+
+static void
+close_netmap_if (netmap_main_t * nm, netmap_if_t * nif)
+{
+ if (nif->unix_file_index != ~0)
+ {
+ unix_file_del (&unix_main, unix_main.file_pool + nif->unix_file_index);
+ nif->unix_file_index = ~0;
+ }
+ else if (nif->fd > -1)
+ close (nif->fd);
+
+ if (nif->mem_region)
+ {
+ netmap_mem_region_t *reg = &nm->mem_regions[nif->mem_region];
+ if (--reg->refcnt == 0)
+ {
+ munmap (reg->mem, reg->region_size);
+ reg->region_size = 0;
+ }
+ }
+
+
+ mhash_unset (&nm->if_index_by_host_if_name, nif->host_if_name,
+ &nif->if_index);
+ vec_free (nif->host_if_name);
+ vec_free (nif->req);
+
+ memset (nif, 0, sizeof (*nif));
+ pool_put (nm->interfaces, nif);
+}
+
+int
+netmap_worker_thread_enable ()
+{
+ /* if worker threads are enabled, switch to polling mode */
+ foreach_vlib_main ((
+ {
+ vlib_node_set_state (this_vlib_main,
+ netmap_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ }));
+
+ return 0;
+}
+
+int
+netmap_worker_thread_disable ()
+{
+ foreach_vlib_main ((
+ {
+ vlib_node_set_state (this_vlib_main,
+ netmap_input_node.index,
+ VLIB_NODE_STATE_INTERRUPT);
+ }));
+
+ return 0;
+}
+
+int
+netmap_create_if (vlib_main_t * vm, u8 * if_name, u8 * hw_addr_set,
+ u8 is_pipe, u8 is_master, u32 * sw_if_index)
+{
+ netmap_main_t *nm = &netmap_main;
+ int ret = 0;
+ netmap_if_t *nif = 0;
+ u8 hw_addr[6];
+ clib_error_t *error = 0;
+ vnet_sw_interface_t *sw;
+ vnet_main_t *vnm = vnet_get_main ();
+ uword *p;
+ struct nmreq *req = 0;
+ netmap_mem_region_t *reg;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ int fd;
+
+ p = mhash_get (&nm->if_index_by_host_if_name, if_name);
+ if (p)
+ return VNET_API_ERROR_SUBIF_ALREADY_EXISTS;
+
+ fd = open ("/dev/netmap", O_RDWR);
+ if (fd < 0)
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+
+ pool_get (nm->interfaces, nif);
+ nif->if_index = nif - nm->interfaces;
+ nif->fd = fd;
+ nif->unix_file_index = ~0;
+
+ vec_validate (req, 0);
+ nif->req = req;
+ req->nr_version = NETMAP_API;
+ req->nr_flags = NR_REG_ALL_NIC;
+
+ if (is_pipe)
+ req->nr_flags = is_master ? NR_REG_PIPE_MASTER : NR_REG_PIPE_SLAVE;
+ else
+ req->nr_flags = NR_REG_ALL_NIC;
+
+ req->nr_flags |= NR_ACCEPT_VNET_HDR;
+ snprintf (req->nr_name, IFNAMSIZ, "%s", if_name);
+ req->nr_name[IFNAMSIZ - 1] = 0;
+
+ if (ioctl (nif->fd, NIOCREGIF, req))
+ {
+ ret = VNET_API_ERROR_NOT_CONNECTED;
+ goto error;
+ }
+
+ nif->mem_region = req->nr_arg2;
+ vec_validate (nm->mem_regions, nif->mem_region);
+ reg = &nm->mem_regions[nif->mem_region];
+ if (reg->region_size == 0)
+ {
+ reg->mem = mmap (NULL, req->nr_memsize, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ clib_warning ("mem %p", reg->mem);
+ if (reg->mem == MAP_FAILED)
+ {
+ ret = VNET_API_ERROR_NOT_CONNECTED;
+ goto error;
+ }
+ reg->region_size = req->nr_memsize;
+ }
+ reg->refcnt++;
+
+ nif->nifp = NETMAP_IF (reg->mem, req->nr_offset);
+ nif->first_rx_ring = 0;
+ nif->last_rx_ring = 0;
+ nif->first_tx_ring = 0;
+ nif->last_tx_ring = 0;
+ nif->host_if_name = if_name;
+ nif->per_interface_next_index = ~0;
+
+ if (tm->n_vlib_mains > 1)
+ {
+ nif->lockp = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
+ CLIB_CACHE_LINE_BYTES);
+ memset ((void *) nif->lockp, 0, CLIB_CACHE_LINE_BYTES);
+ }
+
+ {
+ unix_file_t template = { 0 };
+ template.read_function = netmap_fd_read_ready;
+ template.file_descriptor = nif->fd;
+ template.private_data = nif->if_index;
+ nif->unix_file_index = unix_file_add (&unix_main, &template);
+ }
+
+ /*use configured or generate random MAC address */
+ if (hw_addr_set)
+ memcpy (hw_addr, hw_addr_set, 6);
+ else
+ {
+ f64 now = vlib_time_now (vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+
+ memcpy (hw_addr + 2, &rnd, sizeof (rnd));
+ hw_addr[0] = 2;
+ hw_addr[1] = 0xfe;
+ }
+
+ error = ethernet_register_interface (vnm, netmap_device_class.index,
+ nif->if_index, hw_addr,
+ &nif->hw_if_index,
+ netmap_eth_flag_change);
+
+ if (error)
+ {
+ clib_error_report (error);
+ ret = VNET_API_ERROR_SYSCALL_ERROR_1;
+ goto error;
+ }
+
+ sw = vnet_get_hw_sw_interface (vnm, nif->hw_if_index);
+ nif->sw_if_index = sw->sw_if_index;
+
+ mhash_set_mem (&nm->if_index_by_host_if_name, if_name, &nif->if_index, 0);
+
+ if (sw_if_index)
+ *sw_if_index = nif->sw_if_index;
+
+ if (tm->n_vlib_mains > 1 && pool_elts (nm->interfaces) == 1)
+ netmap_worker_thread_enable ();
+
+ return 0;
+
+error:
+ close_netmap_if (nm, nif);
+ return ret;
+}
+
+int
+netmap_delete_if (vlib_main_t * vm, u8 * host_if_name)
+{
+ vnet_main_t *vnm = vnet_get_main ();
+ netmap_main_t *nm = &netmap_main;
+ netmap_if_t *nif;
+ uword *p;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+
+ p = mhash_get (&nm->if_index_by_host_if_name, host_if_name);
+ if (p == NULL)
+ {
+ clib_warning ("Host interface %s does not exist", host_if_name);
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+ }
+ nif = pool_elt_at_index (nm->interfaces, p[0]);
+
+ /* bring down the interface */
+ vnet_hw_interface_set_flags (vnm, nif->hw_if_index, 0);
+
+ ethernet_delete_interface (vnm, nif->hw_if_index);
+
+ close_netmap_if (nm, nif);
+
+ if (tm->n_vlib_mains > 1 && pool_elts (nm->interfaces) == 0)
+ netmap_worker_thread_disable ();
+
+ return 0;
+}
+
+static clib_error_t *
+netmap_init (vlib_main_t * vm)
+{
+ netmap_main_t *nm = &netmap_main;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ vlib_thread_registration_t *tr;
+ uword *p;
+
+ memset (nm, 0, sizeof (netmap_main_t));
+
+ nm->input_cpu_first_index = 0;
+ nm->input_cpu_count = 1;
+
+ /* find out which cpus will be used for input */
+ p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ tr = p ? (vlib_thread_registration_t *) p[0] : 0;
+
+ if (tr && tr->count > 0)
+ {
+ nm->input_cpu_first_index = tr->first_index;
+ nm->input_cpu_count = tr->count;
+ }
+
+ mhash_init_vec_string (&nm->if_index_by_host_if_name, sizeof (uword));
+
+ vec_validate_aligned (nm->rx_buffers, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (netmap_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/netmap/netmap.h b/src/vnet/devices/netmap/netmap.h
new file mode 100644
index 00000000000..39a94043c3c
--- /dev/null
+++ b/src/vnet/devices/netmap/netmap.h
@@ -0,0 +1,164 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+/*
+ * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ volatile u32 *lockp;
+ u8 *host_if_name;
+ uword if_index;
+ u32 hw_if_index;
+ u32 sw_if_index;
+ u32 unix_file_index;
+
+ u32 per_interface_next_index;
+ u8 is_admin_up;
+
+ /* netmap */
+ struct nmreq *req;
+ u16 mem_region;
+ int fd;
+ struct netmap_if *nifp;
+ u16 first_tx_ring;
+ u16 last_tx_ring;
+ u16 first_rx_ring;
+ u16 last_rx_ring;
+
+} netmap_if_t;
+
+typedef struct
+{
+ char *mem;
+ u32 region_size;
+ int refcnt;
+} netmap_mem_region_t;
+
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ netmap_if_t *interfaces;
+
+ /* bitmap of pending rx interfaces */
+ uword *pending_input_bitmap;
+
+ /* rx buffer cache */
+ u32 **rx_buffers;
+
+ /* hash of host interface names */
+ mhash_t if_index_by_host_if_name;
+
+ /* vector of memory regions */
+ netmap_mem_region_t *mem_regions;
+
+ /* first cpu index */
+ u32 input_cpu_first_index;
+
+ /* total cpu count */
+ u32 input_cpu_count;
+} netmap_main_t;
+
+netmap_main_t netmap_main;
+extern vnet_device_class_t netmap_device_class;
+extern vlib_node_registration_t netmap_input_node;
+
+int netmap_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set,
+ u8 is_pipe, u8 is_master, u32 * sw_if_index);
+int netmap_delete_if (vlib_main_t * vm, u8 * host_if_name);
+
+
+/* Macros and helper functions from sys/net/netmap_user.h */
+
+#ifdef _NET_NETMAP_H_
+
+#define _NETMAP_OFFSET(type, ptr, offset) \
+ ((type)(void *)((char *)(ptr) + (offset)))
+
+#define NETMAP_IF(_base, _ofs) _NETMAP_OFFSET(struct netmap_if *, _base, _ofs)
+
+#define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \
+ nifp, (nifp)->ring_ofs[index] )
+
+#define NETMAP_RXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \
+ nifp, (nifp)->ring_ofs[index + (nifp)->ni_tx_rings + 1] )
+
+#define NETMAP_BUF(ring, index) \
+ ((char *)(ring) + (ring)->buf_ofs + ((index)*(ring)->nr_buf_size))
+
+#define NETMAP_BUF_IDX(ring, buf) \
+ ( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \
+ (ring)->nr_buf_size )
+
+static inline uint32_t
+nm_ring_next (struct netmap_ring *ring, uint32_t i)
+{
+ return (PREDICT_FALSE (i + 1 == ring->num_slots) ? 0 : i + 1);
+}
+
+
+/*
+ * Return 1 if we have pending transmissions in the tx ring.
+ * When everything is complete ring->head = ring->tail + 1 (modulo ring size)
+ */
+static inline int
+nm_tx_pending (struct netmap_ring *ring)
+{
+ return nm_ring_next (ring, ring->tail) != ring->head;
+}
+
+static inline uint32_t
+nm_ring_space (struct netmap_ring *ring)
+{
+ int ret = ring->tail - ring->cur;
+ if (ret < 0)
+ ret += ring->num_slots;
+ return ret;
+}
+#endif
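Taken together, these helpers give the canonical receive loop; a minimal sketch, assuming 'nifp' is a registered interface whose first rx ring the kernel has already filled (netmap_device_input_fn in node.c later in this patch follows the same pattern, adding buffer copies and a final NIOCRXSYNC):

    static inline void
    example_drain_first_rx_ring (struct netmap_if *nifp)
    {
      struct netmap_ring *ring = NETMAP_RXRING (nifp, 0);
      u32 cur = ring->cur;
      u32 n = nm_ring_space (ring);     /* slots the kernel handed to us */

      while (n--)
        {
          struct netmap_slot *slot = &ring->slot[cur];
          char *data = NETMAP_BUF (ring, slot->buf_idx);
          (void) data;                  /* consume slot->len bytes at 'data' here */
          cur = nm_ring_next (ring, cur);
        }
      ring->head = ring->cur = cur;     /* hand the consumed slots back to the kernel */
    }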
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/netmap/netmap_api.c b/src/vnet/devices/netmap/netmap_api.c
new file mode 100644
index 00000000000..9a393b1fda4
--- /dev/null
+++ b/src/vnet/devices/netmap/netmap_api.c
@@ -0,0 +1,137 @@
+/*
+ *------------------------------------------------------------------
+ * netmap_api.c - netmap api
+ *
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vnet/vnet.h>
+#include <vlibmemory/api.h>
+
+#include <vnet/interface.h>
+#include <vnet/api_errno.h>
+#include <vnet/devices/netmap/netmap.h>
+
+#include <vnet/vnet_msg_enum.h>
+
+#define vl_typedefs /* define message structures */
+#include <vnet/vnet_all_api_h.h>
+#undef vl_typedefs
+
+#define vl_endianfun /* define message structures */
+#include <vnet/vnet_all_api_h.h>
+#undef vl_endianfun
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__)
+#define vl_printfun
+#include <vnet/vnet_all_api_h.h>
+#undef vl_printfun
+
+#include <vlibapi/api_helper_macros.h>
+
+#define foreach_vpe_api_msg \
+_(NETMAP_CREATE, netmap_create) \
+_(NETMAP_DELETE, netmap_delete) \
+
+static void
+vl_api_netmap_create_t_handler (vl_api_netmap_create_t * mp)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ vl_api_netmap_create_reply_t *rmp;
+ int rv = 0;
+ u8 *if_name = NULL;
+
+ if_name = format (0, "%s", mp->netmap_if_name);
+ vec_add1 (if_name, 0);
+
+ rv =
+ netmap_create_if (vm, if_name, mp->use_random_hw_addr ? 0 : mp->hw_addr,
+ mp->is_pipe, mp->is_master, 0);
+
+ vec_free (if_name);
+
+ REPLY_MACRO (VL_API_NETMAP_CREATE_REPLY);
+}
+
+static void
+vl_api_netmap_delete_t_handler (vl_api_netmap_delete_t * mp)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ vl_api_netmap_delete_reply_t *rmp;
+ int rv = 0;
+ u8 *if_name = NULL;
+
+ if_name = format (0, "%s", mp->netmap_if_name);
+ vec_add1 (if_name, 0);
+
+ rv = netmap_delete_if (vm, if_name);
+
+ vec_free (if_name);
+
+ REPLY_MACRO (VL_API_NETMAP_DELETE_REPLY);
+}
+
+/*
+ * netmap_api_hookup
+ * Add vpe's API message handlers to the table.
+ * vlib has already mapped shared memory and
+ * added the client registration handlers.
+ * See .../vlib-api/vlibmemory/memclnt_vlib.c:memclnt_process()
+ */
+#define vl_msg_name_crc_list
+#include <vnet/vnet_all_api_h.h>
+#undef vl_msg_name_crc_list
+
+static void
+setup_message_id_table (api_main_t * am)
+{
+#define _(id,n,crc) vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id);
+ foreach_vl_msg_name_crc_netmap;
+#undef _
+}
+
+static clib_error_t *
+netmap_api_hookup (vlib_main_t * vm)
+{
+ api_main_t *am = &api_main;
+
+#define _(N,n) \
+ vl_msg_api_set_handlers(VL_API_##N, #n, \
+ vl_api_##n##_t_handler, \
+ vl_noop_handler, \
+ vl_api_##n##_t_endian, \
+ vl_api_##n##_t_print, \
+ sizeof(vl_api_##n##_t), 1);
+ foreach_vpe_api_msg;
+#undef _
+
+ /*
+ * Set up the (msg_name, crc, message-id) table
+ */
+ setup_message_id_table (am);
+
+ return 0;
+}
+
+VLIB_API_INIT_FUNCTION (netmap_api_hookup);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/netmap/node.c b/src/vnet/devices/netmap/node.c
new file mode 100644
index 00000000000..19895e4754a
--- /dev/null
+++ b/src/vnet/devices/netmap/node.c
@@ -0,0 +1,300 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <stdint.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/devices.h>
+#include <vnet/feature/feature.h>
+
+#include <vnet/devices/netmap/net_netmap.h>
+#include <vnet/devices/netmap/netmap.h>
+
+#define foreach_netmap_input_error
+
+typedef enum
+{
+#define _(f,s) NETMAP_INPUT_ERROR_##f,
+ foreach_netmap_input_error
+#undef _
+ NETMAP_INPUT_N_ERROR,
+} netmap_input_error_t;
+
+static char *netmap_input_error_strings[] = {
+#define _(n,s) s,
+ foreach_netmap_input_error
+#undef _
+};
+
+typedef struct
+{
+ u32 next_index;
+ u32 hw_if_index;
+ struct netmap_slot slot;
+} netmap_input_trace_t;
+
+static u8 *
+format_netmap_input_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ netmap_input_trace_t *t = va_arg (*args, netmap_input_trace_t *);
+ uword indent = format_get_indent (s);
+
+ s = format (s, "netmap: hw_if_index %d next-index %d",
+ t->hw_if_index, t->next_index);
+ s = format (s, "\n%Uslot: flags 0x%x len %u buf_idx %u",
+ format_white_space, indent + 2,
+ t->slot.flags, t->slot.len, t->slot.buf_idx);
+ return s;
+}
+
+always_inline void
+buffer_add_to_chain (vlib_main_t * vm, u32 bi, u32 first_bi, u32 prev_bi)
+{
+ vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+ vlib_buffer_t *first_b = vlib_get_buffer (vm, first_bi);
+ vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_bi);
+
+ /* update first buffer */
+ first_b->total_length_not_including_first_buffer += b->current_length;
+
+ /* update previous buffer */
+ prev_b->next_buffer = bi;
+ prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+
+ /* update current buffer */
+ b->next_buffer = 0;
+}
+
+always_inline uword
+netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * frame, netmap_if_t * nif)
+{
+ u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+ uword n_trace = vlib_get_trace_count (vm, node);
+ netmap_main_t *nm = &netmap_main;
+ u32 n_rx_packets = 0;
+ u32 n_rx_bytes = 0;
+ u32 *to_next = 0;
+ u32 n_free_bufs;
+ struct netmap_ring *ring;
+ int cur_ring;
+ u32 cpu_index = os_get_cpu_number ();
+ u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm,
+ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ if (nif->per_interface_next_index != ~0)
+ next_index = nif->per_interface_next_index;
+
+ n_free_bufs = vec_len (nm->rx_buffers[cpu_index]);
+ if (PREDICT_FALSE (n_free_bufs < VLIB_FRAME_SIZE))
+ {
+ vec_validate (nm->rx_buffers[cpu_index],
+ VLIB_FRAME_SIZE + n_free_bufs - 1);
+ n_free_bufs +=
+ vlib_buffer_alloc (vm, &nm->rx_buffers[cpu_index][n_free_bufs],
+ VLIB_FRAME_SIZE);
+ _vec_len (nm->rx_buffers[cpu_index]) = n_free_bufs;
+ }
+
+ cur_ring = nif->first_rx_ring;
+ while (cur_ring <= nif->last_rx_ring && n_free_bufs)
+ {
+ int r = 0;
+ u32 cur_slot_index;
+ ring = NETMAP_RXRING (nif->nifp, cur_ring);
+ r = nm_ring_space (ring);
+
+ if (!r)
+ {
+ cur_ring++;
+ continue;
+ }
+
+ if (r > n_free_bufs)
+ r = n_free_bufs;
+
+ cur_slot_index = ring->cur;
+ while (r)
+ {
+ u32 n_left_to_next;
+ u32 next0 = next_index;
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (r && n_left_to_next)
+ {
+ vlib_buffer_t *first_b0 = 0;
+ u32 offset = 0;
+ u32 bi0 = 0, first_bi0 = 0, prev_bi0;
+ u32 next_slot_index = (cur_slot_index + 1) % ring->num_slots;
+ u32 next2_slot_index = (cur_slot_index + 2) % ring->num_slots;
+ struct netmap_slot *slot = &ring->slot[cur_slot_index];
+ u32 data_len = slot->len;
+
+ /* prefetch 2 slots in advance */
+ CLIB_PREFETCH (&ring->slot[next2_slot_index],
+ CLIB_CACHE_LINE_BYTES, LOAD);
+ /* prefetch start of next packet */
+ CLIB_PREFETCH (NETMAP_BUF
+ (ring, ring->slot[next_slot_index].buf_idx),
+ CLIB_CACHE_LINE_BYTES, LOAD);
+
+ while (data_len && n_free_bufs)
+ {
+ vlib_buffer_t *b0;
+ /* grab free buffer */
+ u32 last_empty_buffer =
+ vec_len (nm->rx_buffers[cpu_index]) - 1;
+ prev_bi0 = bi0;
+ bi0 = nm->rx_buffers[cpu_index][last_empty_buffer];
+ b0 = vlib_get_buffer (vm, bi0);
+ _vec_len (nm->rx_buffers[cpu_index]) = last_empty_buffer;
+ n_free_bufs--;
+
+ /* copy data */
+ u32 bytes_to_copy =
+ data_len > n_buffer_bytes ? n_buffer_bytes : data_len;
+ b0->current_data = 0;
+ clib_memcpy (vlib_buffer_get_current (b0),
+ (u8 *) NETMAP_BUF (ring,
+ slot->buf_idx) + offset,
+ bytes_to_copy);
+
+ /* fill buffer header */
+ b0->current_length = bytes_to_copy;
+
+ if (offset == 0)
+ {
+ b0->total_length_not_including_first_buffer = 0;
+ b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] =
+ nif->sw_if_index;
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ first_bi0 = bi0;
+ first_b0 = vlib_get_buffer (vm, first_bi0);
+ }
+ else
+ buffer_add_to_chain (vm, bi0, first_bi0, prev_bi0);
+
+ offset += bytes_to_copy;
+ data_len -= bytes_to_copy;
+ }
+
+ /* trace */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (first_b0);
+ if (PREDICT_FALSE (n_trace > 0))
+ {
+ if (PREDICT_TRUE (first_b0 != 0))
+ {
+ netmap_input_trace_t *tr;
+ vlib_trace_buffer (vm, node, next0, first_b0,
+ /* follow_chain */ 0);
+ vlib_set_trace_count (vm, node, --n_trace);
+ tr = vlib_add_trace (vm, node, first_b0, sizeof (*tr));
+ tr->next_index = next0;
+ tr->hw_if_index = nif->hw_if_index;
+ memcpy (&tr->slot, slot, sizeof (struct netmap_slot));
+ }
+ }
+
+ /* redirect if feature path enabled */
+ vnet_feature_start_device_input_x1 (nif->sw_if_index, &next0,
+ first_b0, 0);
+
+ /* enqueue and take next packet */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, first_bi0,
+ next0);
+
+ /* next packet */
+ n_rx_packets++;
+ n_rx_bytes += slot->len;
+ to_next[0] = first_bi0;
+ to_next += 1;
+ n_left_to_next--;
+ cur_slot_index = next_slot_index;
+
+ r--;
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ ring->head = ring->cur = cur_slot_index;
+ cur_ring++;
+ }
+
+ if (n_rx_packets)
+ ioctl (nif->fd, NIOCRXSYNC, NULL);
+
+ vlib_increment_combined_counter
+ (vnet_get_main ()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ os_get_cpu_number (), nif->hw_if_index, n_rx_packets, n_rx_bytes);
+
+ return n_rx_packets;
+}
+
+static uword
+netmap_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ int i;
+ u32 n_rx_packets = 0;
+ u32 cpu_index = os_get_cpu_number ();
+ netmap_main_t *nm = &netmap_main;
+ netmap_if_t *nmi;
+
+ for (i = 0; i < vec_len (nm->interfaces); i++)
+ {
+ nmi = vec_elt_at_index (nm->interfaces, i);
+ if (nmi->is_admin_up &&
+ (i % nm->input_cpu_count) ==
+ (cpu_index - nm->input_cpu_first_index))
+ n_rx_packets += netmap_device_input_fn (vm, node, frame, nmi);
+ }
+
+ return n_rx_packets;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (netmap_input_node) = {
+ .function = netmap_input_fn,
+ .name = "netmap-input",
+ .sibling_of = "device-input",
+ .format_trace = format_netmap_input_trace,
+ .type = VLIB_NODE_TYPE_INPUT,
+ /* default state is INTERRUPT mode, switch to POLLING if worker threads are enabled */
+ .state = VLIB_NODE_STATE_INTERRUPT,
+ .n_errors = NETMAP_INPUT_N_ERROR,
+ .error_strings = netmap_input_error_strings,
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (netmap_input_node, netmap_input_fn)
+/* *INDENT-ON* */
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/nic/ixge.c b/src/vnet/devices/nic/ixge.c
new file mode 100644
index 00000000000..d4c4c6b7414
--- /dev/null
+++ b/src/vnet/devices/nic/ixge.c
@@ -0,0 +1,2938 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * WARNING!
+ * This driver is not intended for production use and it is unsupported.
+ * It is provided for educational use only.
+ * Please use supported DPDK driver instead.
+ */
+
+#if __x86_64__
+#include <vppinfra/vector.h>
+
+#ifndef CLIB_HAVE_VEC128
+#warning HACK: ixge driver wont really work, missing u32x4
+typedef unsigned long long u32x4;
+#endif
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/vnet.h>
+#include <vnet/devices/nic/ixge.h>
+#include <vnet/ethernet/ethernet.h>
+
+#define IXGE_ALWAYS_POLL 0
+
+#define EVENT_SET_FLAGS 0
+#define IXGE_HWBP_RACE_ELOG 0
+
+#define PCI_VENDOR_ID_INTEL 0x8086
+
+/* 10 GIG E (XGE) PHY IEEE 802.3 clause 45 definitions. */
+#define XGE_PHY_DEV_TYPE_PMA_PMD 1
+#define XGE_PHY_DEV_TYPE_PHY_XS 4
+#define XGE_PHY_ID1 0x2
+#define XGE_PHY_ID2 0x3
+#define XGE_PHY_CONTROL 0x0
+#define XGE_PHY_CONTROL_RESET (1 << 15)
+
+ixge_main_t ixge_main;
+static vlib_node_registration_t ixge_input_node;
+static vlib_node_registration_t ixge_process_node;
+
+static void
+ixge_semaphore_get (ixge_device_t * xd)
+{
+ ixge_main_t *xm = &ixge_main;
+ vlib_main_t *vm = xm->vlib_main;
+ ixge_regs_t *r = xd->regs;
+ u32 i;
+
+ i = 0;
+ while (!(r->software_semaphore & (1 << 0)))
+ {
+ if (i > 0)
+ vlib_process_suspend (vm, 100e-6);
+ i++;
+ }
+ do
+ {
+ r->software_semaphore |= 1 << 1;
+ }
+ while (!(r->software_semaphore & (1 << 1)));
+}
+
+static void
+ixge_semaphore_release (ixge_device_t * xd)
+{
+ ixge_regs_t *r = xd->regs;
+ r->software_semaphore &= ~3;
+}
+
+static void
+ixge_software_firmware_sync (ixge_device_t * xd, u32 sw_mask)
+{
+ ixge_main_t *xm = &ixge_main;
+ vlib_main_t *vm = xm->vlib_main;
+ ixge_regs_t *r = xd->regs;
+ u32 fw_mask = sw_mask << 5;
+ u32 m, done = 0;
+
+ while (!done)
+ {
+ ixge_semaphore_get (xd);
+ m = r->software_firmware_sync;
+ done = (m & fw_mask) == 0;
+ if (done)
+ r->software_firmware_sync = m | sw_mask;
+ ixge_semaphore_release (xd);
+ if (!done)
+ vlib_process_suspend (vm, 10e-3);
+ }
+}
+
+static void
+ixge_software_firmware_sync_release (ixge_device_t * xd, u32 sw_mask)
+{
+ ixge_regs_t *r = xd->regs;
+ ixge_semaphore_get (xd);
+ r->software_firmware_sync &= ~sw_mask;
+ ixge_semaphore_release (xd);
+}
+
+u32
+ixge_read_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index,
+ u32 v, u32 is_read)
+{
+ ixge_regs_t *r = xd->regs;
+ const u32 busy_bit = 1 << 30;
+ u32 x;
+
+ ASSERT (xd->phy_index < 2);
+ ixge_software_firmware_sync (xd, 1 << (1 + xd->phy_index));
+
+ ASSERT (reg_index < (1 << 16));
+ ASSERT (dev_type < (1 << 5));
+ if (!is_read)
+ r->xge_mac.phy_data = v;
+
+ /* Address cycle. */
+  x = reg_index
+    | (dev_type << 16)
+    | (xd->phys[xd->phy_index].mdio_address << 21);
+ r->xge_mac.phy_command = x | busy_bit;
+ /* Busy wait timed to take 28e-6 secs. No suspend. */
+ while (r->xge_mac.phy_command & busy_bit)
+ ;
+
+ r->xge_mac.phy_command = x | ((is_read ? 2 : 1) << 26) | busy_bit;
+ while (r->xge_mac.phy_command & busy_bit)
+ ;
+
+ if (is_read)
+ v = r->xge_mac.phy_data >> 16;
+
+ ixge_software_firmware_sync_release (xd, 1 << (1 + xd->phy_index));
+
+ return v;
+}
+
+static u32
+ixge_read_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index)
+{
+ return ixge_read_write_phy_reg (xd, dev_type, reg_index, 0, /* is_read */
+ 1);
+}
+
+static void
+ixge_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index, u32 v)
+{
+ (void) ixge_read_write_phy_reg (xd, dev_type, reg_index, v, /* is_read */
+ 0);
+}
+
+static void
+ixge_i2c_put_bits (i2c_bus_t * b, int scl, int sda)
+{
+ ixge_main_t *xm = &ixge_main;
+ ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data);
+ u32 v;
+
+ v = 0;
+ v |= (sda != 0) << 3;
+ v |= (scl != 0) << 1;
+ xd->regs->i2c_control = v;
+}
+
+static void
+ixge_i2c_get_bits (i2c_bus_t * b, int *scl, int *sda)
+{
+ ixge_main_t *xm = &ixge_main;
+ ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data);
+ u32 v;
+
+ v = xd->regs->i2c_control;
+ *sda = (v & (1 << 2)) != 0;
+ *scl = (v & (1 << 0)) != 0;
+}
+
+static u16
+ixge_read_eeprom (ixge_device_t * xd, u32 address)
+{
+ ixge_regs_t *r = xd->regs;
+ u32 v;
+ r->eeprom_read = (( /* start bit */ (1 << 0)) | (address << 2));
+ /* Wait for done bit. */
+ while (!((v = r->eeprom_read) & (1 << 1)))
+ ;
+ return v >> 16;
+}
+
+static void
+ixge_sfp_enable_disable_laser (ixge_device_t * xd, uword enable)
+{
+ u32 tx_disable_bit = 1 << 3;
+ if (enable)
+ xd->regs->sdp_control &= ~tx_disable_bit;
+ else
+ xd->regs->sdp_control |= tx_disable_bit;
+}
+
+static void
+ixge_sfp_enable_disable_10g (ixge_device_t * xd, uword enable)
+{
+ u32 is_10g_bit = 1 << 5;
+ if (enable)
+ xd->regs->sdp_control |= is_10g_bit;
+ else
+ xd->regs->sdp_control &= ~is_10g_bit;
+}
+
+static clib_error_t *
+ixge_sfp_phy_init_from_eeprom (ixge_device_t * xd, u16 sfp_type)
+{
+ u16 a, id, reg_values_addr = 0;
+
+ a = ixge_read_eeprom (xd, 0x2b);
+ if (a == 0 || a == 0xffff)
+ return clib_error_create ("no init sequence in eeprom");
+
+ while (1)
+ {
+ id = ixge_read_eeprom (xd, ++a);
+ if (id == 0xffff)
+ break;
+ reg_values_addr = ixge_read_eeprom (xd, ++a);
+ if (id == sfp_type)
+ break;
+ }
+ if (id != sfp_type)
+ return clib_error_create ("failed to find id 0x%x", sfp_type);
+
+ ixge_software_firmware_sync (xd, 1 << 3);
+ while (1)
+ {
+ u16 v = ixge_read_eeprom (xd, ++reg_values_addr);
+ if (v == 0xffff)
+ break;
+ xd->regs->core_analog_config = v;
+ }
+ ixge_software_firmware_sync_release (xd, 1 << 3);
+
+ /* Make sure laser is off. We'll turn on the laser when
+ the interface is brought up. */
+ ixge_sfp_enable_disable_laser (xd, /* enable */ 0);
+ ixge_sfp_enable_disable_10g (xd, /* is_10g */ 1);
+
+ return 0;
+}
+
+static void
+ixge_sfp_device_up_down (ixge_device_t * xd, uword is_up)
+{
+ u32 v;
+
+ if (is_up)
+ {
+ /* pma/pmd 10g serial SFI. */
+ xd->regs->xge_mac.auto_negotiation_control2 &= ~(3 << 16);
+ xd->regs->xge_mac.auto_negotiation_control2 |= 2 << 16;
+
+ v = xd->regs->xge_mac.auto_negotiation_control;
+ v &= ~(7 << 13);
+ v |= (0 << 13);
+ /* Restart autoneg. */
+ v |= (1 << 12);
+ xd->regs->xge_mac.auto_negotiation_control = v;
+
+ while (!(xd->regs->xge_mac.link_partner_ability[0] & 0xf0000))
+ ;
+
+ v = xd->regs->xge_mac.auto_negotiation_control;
+
+ /* link mode 10g sfi serdes */
+ v &= ~(7 << 13);
+ v |= (3 << 13);
+
+ /* Restart autoneg. */
+ v |= (1 << 12);
+ xd->regs->xge_mac.auto_negotiation_control = v;
+
+ xd->regs->xge_mac.link_status;
+ }
+
+ ixge_sfp_enable_disable_laser (xd, /* enable */ is_up);
+
+ /* Give time for link partner to notice that we're up. */
+ if (is_up && vlib_in_process_context (vlib_get_main ()))
+ {
+ vlib_process_suspend (vlib_get_main (), 300e-3);
+ }
+}
+
+always_inline ixge_dma_regs_t *
+get_dma_regs (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 qi)
+{
+ ixge_regs_t *r = xd->regs;
+ ASSERT (qi < 128);
+ if (rt == VLIB_RX)
+ return qi < 64 ? &r->rx_dma0[qi] : &r->rx_dma1[qi - 64];
+ else
+ return &r->tx_dma[qi];
+}
+
+static clib_error_t *
+ixge_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index);
+ uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+ ixge_main_t *xm = &ixge_main;
+ ixge_device_t *xd = vec_elt_at_index (xm->devices, hif->dev_instance);
+ ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0);
+
+ if (is_up)
+ {
+ xd->regs->rx_enable |= 1;
+ xd->regs->tx_dma_control |= 1;
+ dr->control |= 1 << 25;
+ while (!(dr->control & (1 << 25)))
+ ;
+ }
+ else
+ {
+ xd->regs->rx_enable &= ~1;
+ xd->regs->tx_dma_control &= ~1;
+ }
+
+ ixge_sfp_device_up_down (xd, is_up);
+
+ return /* no error */ 0;
+}
+
+static void
+ixge_sfp_phy_init (ixge_device_t * xd)
+{
+ ixge_phy_t *phy = xd->phys + xd->phy_index;
+ i2c_bus_t *ib = &xd->i2c_bus;
+
+ ib->private_data = xd->device_index;
+ ib->put_bits = ixge_i2c_put_bits;
+ ib->get_bits = ixge_i2c_get_bits;
+ vlib_i2c_init (ib);
+
+ vlib_i2c_read_eeprom (ib, 0x50, 0, 128, (u8 *) & xd->sfp_eeprom);
+
+ if (vlib_i2c_bus_timed_out (ib) || !sfp_eeprom_is_valid (&xd->sfp_eeprom))
+ xd->sfp_eeprom.id = SFP_ID_unknown;
+ else
+ {
+ /* FIXME 5 => SR/LR eeprom ID. */
+ clib_error_t *e =
+ ixge_sfp_phy_init_from_eeprom (xd, 5 + xd->pci_function);
+ if (e)
+ clib_error_report (e);
+ }
+
+ phy->mdio_address = ~0;
+}
+
+static void
+ixge_phy_init (ixge_device_t * xd)
+{
+ ixge_main_t *xm = &ixge_main;
+ vlib_main_t *vm = xm->vlib_main;
+ ixge_phy_t *phy = xd->phys + xd->phy_index;
+
+ switch (xd->device_id)
+ {
+ case IXGE_82599_sfp:
+ case IXGE_82599_sfp_em:
+ case IXGE_82599_sfp_fcoe:
+ /* others? */
+ return ixge_sfp_phy_init (xd);
+
+ default:
+ break;
+ }
+
+ /* Probe address of phy. */
+ {
+ u32 i, v;
+
+ phy->mdio_address = ~0;
+ for (i = 0; i < 32; i++)
+ {
+ phy->mdio_address = i;
+ v = ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1);
+ if (v != 0xffff && v != 0)
+ break;
+ }
+
+ /* No PHY found? */
+ if (i >= 32)
+ return;
+ }
+
+ phy->id =
+ ((ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1) << 16) |
+ ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID2));
+
+ {
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .function = (char *) __FUNCTION__,.format =
+ "ixge %d, phy id 0x%d mdio address %d",.format_args = "i4i4i4",};
+ struct
+ {
+ u32 instance, id, address;
+ } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->instance = xd->device_index;
+ ed->id = phy->id;
+ ed->address = phy->mdio_address;
+ }
+
+ /* Reset phy. */
+ ixge_write_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL,
+ XGE_PHY_CONTROL_RESET);
+
+ /* Wait for self-clearing reset bit to clear. */
+ do
+ {
+ vlib_process_suspend (vm, 1e-3);
+ }
+ while (ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL) &
+ XGE_PHY_CONTROL_RESET);
+}
+
+static u8 *
+format_ixge_rx_from_hw_descriptor (u8 * s, va_list * va)
+{
+ ixge_rx_from_hw_descriptor_t *d =
+ va_arg (*va, ixge_rx_from_hw_descriptor_t *);
+ u32 s0 = d->status[0], s2 = d->status[2];
+ u32 is_ip4, is_ip6, is_ip, is_tcp, is_udp;
+ uword indent = format_get_indent (s);
+
+ s = format (s, "%s-owned",
+ (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE) ? "sw" :
+ "hw");
+ s =
+ format (s, ", length this descriptor %d, l3 offset %d",
+ d->n_packet_bytes_this_descriptor,
+ IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s0));
+ if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET)
+ s = format (s, ", end-of-packet");
+
+ s = format (s, "\n%U", format_white_space, indent);
+
+ if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR)
+ s = format (s, "layer2 error");
+
+ if (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2)
+ {
+ s = format (s, "layer 2 type %d", (s0 & 0x1f));
+ return s;
+ }
+
+ if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN)
+ s = format (s, "vlan header 0x%x\n%U", d->vlan_tag,
+ format_white_space, indent);
+
+ if ((is_ip4 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4)))
+ {
+ s = format (s, "ip4%s",
+ (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT) ? " options" :
+ "");
+ if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED)
+ s = format (s, " checksum %s",
+ (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) ?
+ "bad" : "ok");
+ }
+ if ((is_ip6 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6)))
+ s = format (s, "ip6%s",
+ (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT) ? " extended" :
+ "");
+ is_tcp = is_udp = 0;
+ if ((is_ip = (is_ip4 | is_ip6)))
+ {
+ is_tcp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP) != 0;
+ is_udp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP) != 0;
+ if (is_tcp)
+ s = format (s, ", tcp");
+ if (is_udp)
+ s = format (s, ", udp");
+ }
+
+ if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED)
+ s = format (s, ", tcp checksum %s",
+ (s2 & IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR) ? "bad" :
+ "ok");
+ if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)
+ s = format (s, ", udp checksum %s",
+ (s2 & IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR) ? "bad" :
+ "ok");
+
+ return s;
+}
+
+static u8 *
+format_ixge_tx_descriptor (u8 * s, va_list * va)
+{
+ ixge_tx_descriptor_t *d = va_arg (*va, ixge_tx_descriptor_t *);
+ u32 s0 = d->status0, s1 = d->status1;
+ uword indent = format_get_indent (s);
+ u32 v;
+
+ s = format (s, "buffer 0x%Lx, %d packet bytes, %d bytes this buffer",
+ d->buffer_address, s1 >> 14, d->n_bytes_this_buffer);
+
+ s = format (s, "\n%U", format_white_space, indent);
+
+ if ((v = (s0 >> 0) & 3))
+ s = format (s, "reserved 0x%x, ", v);
+
+ if ((v = (s0 >> 2) & 3))
+ s = format (s, "mac 0x%x, ", v);
+
+ if ((v = (s0 >> 4) & 0xf) != 3)
+ s = format (s, "type 0x%x, ", v);
+
+ s = format (s, "%s%s%s%s%s%s%s%s",
+ (s0 & (1 << 8)) ? "eop, " : "",
+ (s0 & (1 << 9)) ? "insert-fcs, " : "",
+ (s0 & (1 << 10)) ? "reserved26, " : "",
+ (s0 & (1 << 11)) ? "report-status, " : "",
+ (s0 & (1 << 12)) ? "reserved28, " : "",
+ (s0 & (1 << 13)) ? "is-advanced, " : "",
+ (s0 & (1 << 14)) ? "vlan-enable, " : "",
+ (s0 & (1 << 15)) ? "tx-segmentation, " : "");
+
+ if ((v = s1 & 0xf) != 0)
+ s = format (s, "status 0x%x, ", v);
+
+ if ((v = (s1 >> 4) & 0xf))
+ s = format (s, "context 0x%x, ", v);
+
+ if ((v = (s1 >> 8) & 0x3f))
+ s = format (s, "options 0x%x, ", v);
+
+ return s;
+}
+
+typedef struct
+{
+ ixge_descriptor_t before, after;
+
+ u32 buffer_index;
+
+ u16 device_index;
+
+ u8 queue_index;
+
+ u8 is_start_of_packet;
+
+ /* Copy of VLIB buffer; packet data stored in pre_data. */
+ vlib_buffer_t buffer;
+} ixge_rx_dma_trace_t;
+
+static u8 *
+format_ixge_rx_dma_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ vlib_node_t *node = va_arg (*va, vlib_node_t *);
+ vnet_main_t *vnm = vnet_get_main ();
+ ixge_rx_dma_trace_t *t = va_arg (*va, ixge_rx_dma_trace_t *);
+ ixge_main_t *xm = &ixge_main;
+ ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index);
+ format_function_t *f;
+ uword indent = format_get_indent (s);
+
+ {
+ vnet_sw_interface_t *sw =
+ vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);
+ s =
+ format (s, "%U rx queue %d", format_vnet_sw_interface_name, vnm, sw,
+ t->queue_index);
+ }
+
+ s = format (s, "\n%Ubefore: %U",
+ format_white_space, indent,
+ format_ixge_rx_from_hw_descriptor, &t->before);
+ s = format (s, "\n%Uafter : head/tail address 0x%Lx/0x%Lx",
+ format_white_space, indent,
+ t->after.rx_to_hw.head_address, t->after.rx_to_hw.tail_address);
+
+ s = format (s, "\n%Ubuffer 0x%x: %U",
+ format_white_space, indent,
+ t->buffer_index, format_vlib_buffer, &t->buffer);
+
+ s = format (s, "\n%U", format_white_space, indent);
+
+ f = node->format_buffer;
+ if (!f || !t->is_start_of_packet)
+ f = format_hex_bytes;
+ s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data));
+
+ return s;
+}
+
+#define foreach_ixge_error \
+ _ (none, "no error") \
+ _ (tx_full_drops, "tx ring full drops") \
+ _ (ip4_checksum_error, "ip4 checksum errors") \
+ _ (rx_alloc_fail, "rx buf alloc from free list failed") \
+ _ (rx_alloc_no_physmem, "rx buf alloc failed no physmem")
+
+typedef enum
+{
+#define _(f,s) IXGE_ERROR_##f,
+ foreach_ixge_error
+#undef _
+ IXGE_N_ERROR,
+} ixge_error_t;
+
+always_inline void
+ixge_rx_next_and_error_from_status_x1 (ixge_device_t * xd,
+ u32 s00, u32 s02,
+ u8 * next0, u8 * error0, u32 * flags0)
+{
+ u8 is0_ip4, is0_ip6, n0, e0;
+ u32 f0;
+
+ e0 = IXGE_ERROR_none;
+ n0 = IXGE_RX_NEXT_ETHERNET_INPUT;
+
+ is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED;
+ n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0;
+
+ e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR)
+ ? IXGE_ERROR_ip4_checksum_error : e0);
+
+ is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6;
+ n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0;
+
+ n0 = (xd->per_interface_next_index != ~0) ?
+ xd->per_interface_next_index : n0;
+
+ /* Check for error. */
+ n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0;
+
+ f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED
+ | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED))
+ ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0);
+
+ f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR
+ | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR))
+ ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT);
+
+ *error0 = e0;
+ *next0 = n0;
+ *flags0 = f0;
+}
+
+always_inline void
+ixge_rx_next_and_error_from_status_x2 (ixge_device_t * xd,
+ u32 s00, u32 s02,
+ u32 s10, u32 s12,
+ u8 * next0, u8 * error0, u32 * flags0,
+ u8 * next1, u8 * error1, u32 * flags1)
+{
+ u8 is0_ip4, is0_ip6, n0, e0;
+ u8 is1_ip4, is1_ip6, n1, e1;
+ u32 f0, f1;
+
+ e0 = e1 = IXGE_ERROR_none;
+ n0 = n1 = IXGE_RX_NEXT_IP4_INPUT;
+
+ is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED;
+ is1_ip4 = s12 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED;
+
+ n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0;
+ n1 = is1_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n1;
+
+ e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR)
+ ? IXGE_ERROR_ip4_checksum_error : e0);
+ e1 = (is1_ip4 && (s12 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR)
+ ? IXGE_ERROR_ip4_checksum_error : e1);
+
+ is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6;
+ is1_ip6 = s10 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6;
+
+ n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0;
+ n1 = is1_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n1;
+
+ n0 = (xd->per_interface_next_index != ~0) ?
+ xd->per_interface_next_index : n0;
+ n1 = (xd->per_interface_next_index != ~0) ?
+ xd->per_interface_next_index : n1;
+
+ /* Check for error. */
+ n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0;
+ n1 = e1 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n1;
+
+ *error0 = e0;
+ *error1 = e1;
+
+ *next0 = n0;
+ *next1 = n1;
+
+ f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED
+ | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED))
+ ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0);
+ f1 = ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED
+ | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED))
+ ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0);
+
+ f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR
+ | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR))
+ ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT);
+ f1 |= ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR
+ | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR))
+ ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT);
+
+ *flags0 = f0;
+ *flags1 = f1;
+}
+
+static void
+ixge_rx_trace (ixge_main_t * xm,
+ ixge_device_t * xd,
+ ixge_dma_queue_t * dq,
+ ixge_descriptor_t * before_descriptors,
+ u32 * before_buffers,
+ ixge_descriptor_t * after_descriptors, uword n_descriptors)
+{
+ vlib_main_t *vm = xm->vlib_main;
+ vlib_node_runtime_t *node = dq->rx.node;
+ ixge_rx_from_hw_descriptor_t *bd;
+ ixge_rx_to_hw_descriptor_t *ad;
+ u32 *b, n_left, is_sop, next_index_sop;
+
+ n_left = n_descriptors;
+ b = before_buffers;
+ bd = &before_descriptors->rx_from_hw;
+ ad = &after_descriptors->rx_to_hw;
+ is_sop = dq->rx.is_start_of_packet;
+ next_index_sop = dq->rx.saved_start_of_packet_next_index;
+
+ while (n_left >= 2)
+ {
+ u32 bi0, bi1, flags0, flags1;
+ vlib_buffer_t *b0, *b1;
+ ixge_rx_dma_trace_t *t0, *t1;
+ u8 next0, error0, next1, error1;
+
+ bi0 = b[0];
+ bi1 = b[1];
+ n_left -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ ixge_rx_next_and_error_from_status_x2 (xd,
+ bd[0].status[0], bd[0].status[2],
+ bd[1].status[0], bd[1].status[2],
+ &next0, &error0, &flags0,
+ &next1, &error1, &flags1);
+
+ next_index_sop = is_sop ? next0 : next_index_sop;
+ vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0);
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->is_start_of_packet = is_sop;
+ is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0;
+
+ next_index_sop = is_sop ? next1 : next_index_sop;
+ vlib_trace_buffer (vm, node, next_index_sop, b1, /* follow_chain */ 0);
+ t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
+ t1->is_start_of_packet = is_sop;
+ is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0;
+
+ t0->queue_index = dq->queue_index;
+ t1->queue_index = dq->queue_index;
+ t0->device_index = xd->device_index;
+ t1->device_index = xd->device_index;
+ t0->before.rx_from_hw = bd[0];
+ t1->before.rx_from_hw = bd[1];
+ t0->after.rx_to_hw = ad[0];
+ t1->after.rx_to_hw = ad[1];
+ t0->buffer_index = bi0;
+ t1->buffer_index = bi1;
+ memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
+      memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b1->pre_data));
+ memcpy (t0->buffer.pre_data, b0->data + b0->current_data,
+ sizeof (t0->buffer.pre_data));
+ memcpy (t1->buffer.pre_data, b1->data + b1->current_data,
+ sizeof (t1->buffer.pre_data));
+
+ b += 2;
+ bd += 2;
+ ad += 2;
+ }
+
+ while (n_left >= 1)
+ {
+ u32 bi0, flags0;
+ vlib_buffer_t *b0;
+ ixge_rx_dma_trace_t *t0;
+ u8 next0, error0;
+
+ bi0 = b[0];
+ n_left -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ ixge_rx_next_and_error_from_status_x1 (xd,
+ bd[0].status[0], bd[0].status[2],
+ &next0, &error0, &flags0);
+
+ next_index_sop = is_sop ? next0 : next_index_sop;
+ vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0);
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->is_start_of_packet = is_sop;
+ is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0;
+
+ t0->queue_index = dq->queue_index;
+ t0->device_index = xd->device_index;
+ t0->before.rx_from_hw = bd[0];
+ t0->after.rx_to_hw = ad[0];
+ t0->buffer_index = bi0;
+ memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
+ memcpy (t0->buffer.pre_data, b0->data + b0->current_data,
+ sizeof (t0->buffer.pre_data));
+
+ b += 1;
+ bd += 1;
+ ad += 1;
+ }
+}
+
+typedef struct
+{
+ ixge_tx_descriptor_t descriptor;
+
+ u32 buffer_index;
+
+ u16 device_index;
+
+ u8 queue_index;
+
+ u8 is_start_of_packet;
+
+ /* Copy of VLIB buffer; packet data stored in pre_data. */
+ vlib_buffer_t buffer;
+} ixge_tx_dma_trace_t;
+
+static u8 *
+format_ixge_tx_dma_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ ixge_tx_dma_trace_t *t = va_arg (*va, ixge_tx_dma_trace_t *);
+ vnet_main_t *vnm = vnet_get_main ();
+ ixge_main_t *xm = &ixge_main;
+ ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index);
+ format_function_t *f;
+ uword indent = format_get_indent (s);
+
+ {
+ vnet_sw_interface_t *sw =
+ vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);
+ s =
+ format (s, "%U tx queue %d", format_vnet_sw_interface_name, vnm, sw,
+ t->queue_index);
+ }
+
+ s = format (s, "\n%Udescriptor: %U",
+ format_white_space, indent,
+ format_ixge_tx_descriptor, &t->descriptor);
+
+ s = format (s, "\n%Ubuffer 0x%x: %U",
+ format_white_space, indent,
+ t->buffer_index, format_vlib_buffer, &t->buffer);
+
+ s = format (s, "\n%U", format_white_space, indent);
+
+ f = format_ethernet_header_with_length;
+ if (!f || !t->is_start_of_packet)
+ f = format_hex_bytes;
+ s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data));
+
+ return s;
+}
+
+typedef struct
+{
+ vlib_node_runtime_t *node;
+
+ u32 is_start_of_packet;
+
+ u32 n_bytes_in_packet;
+
+ ixge_tx_descriptor_t *start_of_packet_descriptor;
+} ixge_tx_state_t;
+
+static void
+ixge_tx_trace (ixge_main_t * xm,
+ ixge_device_t * xd,
+ ixge_dma_queue_t * dq,
+ ixge_tx_state_t * tx_state,
+ ixge_tx_descriptor_t * descriptors,
+ u32 * buffers, uword n_descriptors)
+{
+ vlib_main_t *vm = xm->vlib_main;
+ vlib_node_runtime_t *node = tx_state->node;
+ ixge_tx_descriptor_t *d;
+ u32 *b, n_left, is_sop;
+
+ n_left = n_descriptors;
+ b = buffers;
+ d = descriptors;
+ is_sop = tx_state->is_start_of_packet;
+
+ while (n_left >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t *b0, *b1;
+ ixge_tx_dma_trace_t *t0, *t1;
+
+ bi0 = b[0];
+ bi1 = b[1];
+ n_left -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->is_start_of_packet = is_sop;
+ is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0;
+
+ t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
+ t1->is_start_of_packet = is_sop;
+ is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0;
+
+ t0->queue_index = dq->queue_index;
+ t1->queue_index = dq->queue_index;
+ t0->device_index = xd->device_index;
+ t1->device_index = xd->device_index;
+ t0->descriptor = d[0];
+ t1->descriptor = d[1];
+ t0->buffer_index = bi0;
+ t1->buffer_index = bi1;
+ memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
+      memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b1->pre_data));
+ memcpy (t0->buffer.pre_data, b0->data + b0->current_data,
+ sizeof (t0->buffer.pre_data));
+ memcpy (t1->buffer.pre_data, b1->data + b1->current_data,
+ sizeof (t1->buffer.pre_data));
+
+ b += 2;
+ d += 2;
+ }
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t *b0;
+ ixge_tx_dma_trace_t *t0;
+
+ bi0 = b[0];
+ n_left -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->is_start_of_packet = is_sop;
+ is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0;
+
+ t0->queue_index = dq->queue_index;
+ t0->device_index = xd->device_index;
+ t0->descriptor = d[0];
+ t0->buffer_index = bi0;
+ memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
+ memcpy (t0->buffer.pre_data, b0->data + b0->current_data,
+ sizeof (t0->buffer.pre_data));
+
+ b += 1;
+ d += 1;
+ }
+}
+
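+/* Descriptor ring index arithmetic modulo q->n_descriptors.  For example,
+   assuming a 1024-entry ring, ixge_ring_sub (q, 1000, 8) == 32 and
+   ixge_ring_add (q, 1000, 32) == 8. */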
+always_inline uword
+ixge_ring_sub (ixge_dma_queue_t * q, u32 i0, u32 i1)
+{
+ i32 d = i1 - i0;
+ ASSERT (i0 < q->n_descriptors);
+ ASSERT (i1 < q->n_descriptors);
+ return d < 0 ? q->n_descriptors + d : d;
+}
+
+always_inline uword
+ixge_ring_add (ixge_dma_queue_t * q, u32 i0, u32 i1)
+{
+ u32 d = i0 + i1;
+ ASSERT (i0 < q->n_descriptors);
+ ASSERT (i1 < q->n_descriptors);
+ d -= d >= q->n_descriptors ? q->n_descriptors : 0;
+ return d;
+}
+
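+/* Debug check: a TX descriptor on the ring should still carry the template
+   status bits programmed at init time (compared under the template mask),
+   i.e. only the per-packet fields may differ. */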
+always_inline uword
+ixge_tx_descriptor_matches_template (ixge_main_t * xm,
+ ixge_tx_descriptor_t * d)
+{
+ u32 cmp;
+
+ cmp = ((d->status0 & xm->tx_descriptor_template_mask.status0)
+ ^ xm->tx_descriptor_template.status0);
+ if (cmp)
+ return 0;
+ cmp = ((d->status1 & xm->tx_descriptor_template_mask.status1)
+ ^ xm->tx_descriptor_template.status1);
+ if (cmp)
+ return 0;
+
+ return 1;
+}
+
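+/* Fill TX descriptors for up to n_descriptors buffers starting at
+   start_descriptor_index; the caller guarantees this range does not wrap
+   the ring.  Buffer indices displaced from the ring are appended to
+   xm->tx_buffers_pending_free, and the start-of-packet descriptor's
+   status1 is updated with the running packet byte count. */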
+static uword
+ixge_tx_no_wrap (ixge_main_t * xm,
+ ixge_device_t * xd,
+ ixge_dma_queue_t * dq,
+ u32 * buffers,
+ u32 start_descriptor_index,
+ u32 n_descriptors, ixge_tx_state_t * tx_state)
+{
+ vlib_main_t *vm = xm->vlib_main;
+ ixge_tx_descriptor_t *d, *d_sop;
+ u32 n_left = n_descriptors;
+ u32 *to_free = vec_end (xm->tx_buffers_pending_free);
+ u32 *to_tx =
+ vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index);
+ u32 is_sop = tx_state->is_start_of_packet;
+ u32 len_sop = tx_state->n_bytes_in_packet;
+ u16 template_status = xm->tx_descriptor_template.status0;
+ u32 descriptor_prefetch_rotor = 0;
+
+ ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors);
+ d = &dq->descriptors[start_descriptor_index].tx;
+ d_sop = is_sop ? d : tx_state->start_of_packet_descriptor;
+
+ while (n_left >= 4)
+ {
+ vlib_buffer_t *b0, *b1;
+ u32 bi0, fi0, len0;
+ u32 bi1, fi1, len1;
+ u8 is_eop0, is_eop1;
+
+ /* Prefetch next iteration. */
+ vlib_prefetch_buffer_with_index (vm, buffers[2], LOAD);
+ vlib_prefetch_buffer_with_index (vm, buffers[3], LOAD);
+
+ if ((descriptor_prefetch_rotor & 0x3) == 0)
+ CLIB_PREFETCH (d + 4, CLIB_CACHE_LINE_BYTES, STORE);
+
+ descriptor_prefetch_rotor += 2;
+
+ bi0 = buffers[0];
+ bi1 = buffers[1];
+
+ to_free[0] = fi0 = to_tx[0];
+ to_tx[0] = bi0;
+ to_free += fi0 != 0;
+
+ to_free[0] = fi1 = to_tx[1];
+ to_tx[1] = bi1;
+ to_free += fi1 != 0;
+
+ buffers += 2;
+ n_left -= 2;
+ to_tx += 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0;
+ is_eop1 = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0;
+
+ len0 = b0->current_length;
+ len1 = b1->current_length;
+
+ ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0));
+ ASSERT (ixge_tx_descriptor_matches_template (xm, d + 1));
+
+ d[0].buffer_address =
+ vlib_get_buffer_data_physical_address (vm, bi0) + b0->current_data;
+ d[1].buffer_address =
+ vlib_get_buffer_data_physical_address (vm, bi1) + b1->current_data;
+
+ d[0].n_bytes_this_buffer = len0;
+ d[1].n_bytes_this_buffer = len1;
+
+ d[0].status0 =
+ template_status | (is_eop0 <<
+ IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET);
+ d[1].status0 =
+ template_status | (is_eop1 <<
+ IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET);
+
+ len_sop = (is_sop ? 0 : len_sop) + len0;
+ d_sop[0].status1 =
+ IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop);
+ d += 1;
+ d_sop = is_eop0 ? d : d_sop;
+
+ is_sop = is_eop0;
+
+ len_sop = (is_sop ? 0 : len_sop) + len1;
+ d_sop[0].status1 =
+ IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop);
+ d += 1;
+ d_sop = is_eop1 ? d : d_sop;
+
+ is_sop = is_eop1;
+ }
+
+ while (n_left > 0)
+ {
+ vlib_buffer_t *b0;
+ u32 bi0, fi0, len0;
+ u8 is_eop0;
+
+ bi0 = buffers[0];
+
+ to_free[0] = fi0 = to_tx[0];
+ to_tx[0] = bi0;
+ to_free += fi0 != 0;
+
+ buffers += 1;
+ n_left -= 1;
+ to_tx += 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0;
+
+ len0 = b0->current_length;
+
+ ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0));
+
+ d[0].buffer_address =
+ vlib_get_buffer_data_physical_address (vm, bi0) + b0->current_data;
+
+ d[0].n_bytes_this_buffer = len0;
+
+ d[0].status0 =
+ template_status | (is_eop0 <<
+ IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET);
+
+ len_sop = (is_sop ? 0 : len_sop) + len0;
+ d_sop[0].status1 =
+ IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop);
+ d += 1;
+ d_sop = is_eop0 ? d : d_sop;
+
+ is_sop = is_eop0;
+ }
+
+ if (tx_state->node->flags & VLIB_NODE_FLAG_TRACE)
+ {
+ to_tx =
+ vec_elt_at_index (dq->descriptor_buffer_indices,
+ start_descriptor_index);
+ ixge_tx_trace (xm, xd, dq, tx_state,
+ &dq->descriptors[start_descriptor_index].tx, to_tx,
+ n_descriptors);
+ }
+
+ _vec_len (xm->tx_buffers_pending_free) =
+ to_free - xm->tx_buffers_pending_free;
+
+ /* When we are done d_sop can point to end of ring. Wrap it if so. */
+ {
+ ixge_tx_descriptor_t *d_start = &dq->descriptors[0].tx;
+
+ ASSERT (d_sop - d_start <= dq->n_descriptors);
+ d_sop = d_sop - d_start == dq->n_descriptors ? d_start : d_sop;
+ }
+
+ tx_state->is_start_of_packet = is_sop;
+ tx_state->start_of_packet_descriptor = d_sop;
+ tx_state->n_bytes_in_packet = len_sop;
+
+ return n_descriptors;
+}
+
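+/* Device TX function: refresh the head index from the hardware write-back,
+   compute free ring space, drop whole packets from the tail of the frame
+   if the ring is too full (counted as tx_full_drops and elog'd), fill
+   descriptors in at most two no-wrap chunks, request a status write-back
+   on the last descriptor and hand the new tail to hardware. */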
+static uword
+ixge_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ ixge_main_t *xm = &ixge_main;
+ vnet_interface_output_runtime_t *rd = (void *) node->runtime_data;
+ ixge_device_t *xd = vec_elt_at_index (xm->devices, rd->dev_instance);
+ ixge_dma_queue_t *dq;
+ u32 *from, n_left_tx, n_descriptors_to_tx, n_tail_drop;
+ u32 queue_index = 0; /* fixme parameter */
+ ixge_tx_state_t tx_state;
+
+ tx_state.node = node;
+ tx_state.is_start_of_packet = 1;
+ tx_state.start_of_packet_descriptor = 0;
+ tx_state.n_bytes_in_packet = 0;
+
+ from = vlib_frame_vector_args (f);
+
+ dq = vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index);
+
+ dq->head_index = dq->tx.head_index_write_back[0];
+
+  /* Since head == tail means the ring is empty, we can send up to dq->n_descriptors - 1. */
+ n_left_tx = dq->n_descriptors - 1;
+ n_left_tx -= ixge_ring_sub (dq, dq->head_index, dq->tail_index);
+
+ _vec_len (xm->tx_buffers_pending_free) = 0;
+
+ n_descriptors_to_tx = f->n_vectors;
+ n_tail_drop = 0;
+ if (PREDICT_FALSE (n_descriptors_to_tx > n_left_tx))
+ {
+ i32 i, n_ok, i_eop, i_sop;
+
+ i_sop = i_eop = ~0;
+ for (i = n_left_tx - 1; i >= 0; i--)
+ {
+ vlib_buffer_t *b = vlib_get_buffer (vm, from[i]);
+ if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ if (i_sop != ~0 && i_eop != ~0)
+ break;
+ i_eop = i;
+ i_sop = i + 1;
+ }
+ }
+ if (i == 0)
+ n_ok = 0;
+ else
+ n_ok = i_eop + 1;
+
+ {
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .function = (char *) __FUNCTION__,.format =
+ "ixge %d, ring full to tx %d head %d tail %d",.format_args =
+ "i2i2i2i2",};
+ struct
+ {
+ u16 instance, to_tx, head, tail;
+ } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->instance = xd->device_index;
+ ed->to_tx = n_descriptors_to_tx;
+ ed->head = dq->head_index;
+ ed->tail = dq->tail_index;
+ }
+
+ if (n_ok < n_descriptors_to_tx)
+ {
+ n_tail_drop = n_descriptors_to_tx - n_ok;
+ vec_add (xm->tx_buffers_pending_free, from + n_ok, n_tail_drop);
+ vlib_error_count (vm, ixge_input_node.index,
+ IXGE_ERROR_tx_full_drops, n_tail_drop);
+ }
+
+ n_descriptors_to_tx = n_ok;
+ }
+
+ dq->tx.n_buffers_on_ring += n_descriptors_to_tx;
+
+ /* Process from tail to end of descriptor ring. */
+ if (n_descriptors_to_tx > 0 && dq->tail_index < dq->n_descriptors)
+ {
+ u32 n =
+ clib_min (dq->n_descriptors - dq->tail_index, n_descriptors_to_tx);
+ n = ixge_tx_no_wrap (xm, xd, dq, from, dq->tail_index, n, &tx_state);
+ from += n;
+ n_descriptors_to_tx -= n;
+ dq->tail_index += n;
+ ASSERT (dq->tail_index <= dq->n_descriptors);
+ if (dq->tail_index == dq->n_descriptors)
+ dq->tail_index = 0;
+ }
+
+ if (n_descriptors_to_tx > 0)
+ {
+ u32 n =
+ ixge_tx_no_wrap (xm, xd, dq, from, 0, n_descriptors_to_tx, &tx_state);
+ from += n;
+ ASSERT (n == n_descriptors_to_tx);
+ dq->tail_index += n;
+ ASSERT (dq->tail_index <= dq->n_descriptors);
+ if (dq->tail_index == dq->n_descriptors)
+ dq->tail_index = 0;
+ }
+
+ /* We should only get full packets. */
+ ASSERT (tx_state.is_start_of_packet);
+
+ /* Report status when last descriptor is done. */
+ {
+ u32 i = dq->tail_index == 0 ? dq->n_descriptors - 1 : dq->tail_index - 1;
+ ixge_tx_descriptor_t *d = &dq->descriptors[i].tx;
+ d->status0 |= IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS;
+ }
+
+ /* Give new descriptors to hardware. */
+ {
+ ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_TX, queue_index);
+
+ CLIB_MEMORY_BARRIER ();
+
+ dr->tail_index = dq->tail_index;
+ }
+
+ /* Free any buffers that are done. */
+ {
+ u32 n = _vec_len (xm->tx_buffers_pending_free);
+ if (n > 0)
+ {
+ vlib_buffer_free_no_next (vm, xm->tx_buffers_pending_free, n);
+ _vec_len (xm->tx_buffers_pending_free) = 0;
+ ASSERT (dq->tx.n_buffers_on_ring >= n);
+ dq->tx.n_buffers_on_ring -= (n - n_tail_drop);
+ }
+ }
+
+ return f->n_vectors;
+}
+
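+/* Process up to n_descriptors RX descriptors starting at
+   start_descriptor_index (no ring wrap).  Each software-owned descriptor's
+   buffer is chained into its packet and enqueued to the next node
+   (speculatively, against the cached next index), and a fresh buffer from
+   xm->rx_buffers_to_add is written back into the descriptor for hardware
+   to reuse.  Stops early at the first descriptor still owned by hardware;
+   returns the number of complete packets enqueued. */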
+static uword
+ixge_rx_queue_no_wrap (ixge_main_t * xm,
+ ixge_device_t * xd,
+ ixge_dma_queue_t * dq,
+ u32 start_descriptor_index, u32 n_descriptors)
+{
+ vlib_main_t *vm = xm->vlib_main;
+ vlib_node_runtime_t *node = dq->rx.node;
+ ixge_descriptor_t *d;
+ static ixge_descriptor_t *d_trace_save;
+ static u32 *d_trace_buffers;
+ u32 n_descriptors_left = n_descriptors;
+ u32 *to_rx =
+ vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index);
+ u32 *to_add;
+ u32 bi_sop = dq->rx.saved_start_of_packet_buffer_index;
+ u32 bi_last = dq->rx.saved_last_buffer_index;
+ u32 next_index_sop = dq->rx.saved_start_of_packet_next_index;
+ u32 is_sop = dq->rx.is_start_of_packet;
+ u32 next_index, n_left_to_next, *to_next;
+ u32 n_packets = 0;
+ u32 n_bytes = 0;
+ u32 n_trace = vlib_get_trace_count (vm, node);
+ vlib_buffer_t *b_last, b_dummy;
+
+ ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors);
+ d = &dq->descriptors[start_descriptor_index];
+
+ b_last = bi_last != ~0 ? vlib_get_buffer (vm, bi_last) : &b_dummy;
+ next_index = dq->rx.next_index;
+
+ if (n_trace > 0)
+ {
+ u32 n = clib_min (n_trace, n_descriptors);
+ if (d_trace_save)
+ {
+ _vec_len (d_trace_save) = 0;
+ _vec_len (d_trace_buffers) = 0;
+ }
+ vec_add (d_trace_save, (ixge_descriptor_t *) d, n);
+ vec_add (d_trace_buffers, to_rx, n);
+ }
+
+ {
+ uword l = vec_len (xm->rx_buffers_to_add);
+
+ if (l < n_descriptors_left)
+ {
+ u32 n_to_alloc = 2 * dq->n_descriptors - l;
+ u32 n_allocated;
+
+ vec_resize (xm->rx_buffers_to_add, n_to_alloc);
+
+ _vec_len (xm->rx_buffers_to_add) = l;
+ n_allocated = vlib_buffer_alloc_from_free_list
+ (vm, xm->rx_buffers_to_add + l, n_to_alloc,
+ xm->vlib_buffer_free_list_index);
+ _vec_len (xm->rx_buffers_to_add) += n_allocated;
+
+ /* Handle transient allocation failure */
+ if (PREDICT_FALSE (l + n_allocated <= n_descriptors_left))
+ {
+ if (n_allocated == 0)
+ vlib_error_count (vm, ixge_input_node.index,
+ IXGE_ERROR_rx_alloc_no_physmem, 1);
+ else
+ vlib_error_count (vm, ixge_input_node.index,
+ IXGE_ERROR_rx_alloc_fail, 1);
+
+ n_descriptors_left = l + n_allocated;
+ }
+ n_descriptors = n_descriptors_left;
+ }
+
+ /* Add buffers from end of vector going backwards. */
+ to_add = vec_end (xm->rx_buffers_to_add) - 1;
+ }
+
+ while (n_descriptors_left > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_descriptors_left >= 4 && n_left_to_next >= 2)
+ {
+ vlib_buffer_t *b0, *b1;
+ u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0;
+ u32 bi1, fi1, len1, l3_offset1, s21, s01, flags1;
+ u8 is_eop0, error0, next0;
+ u8 is_eop1, error1, next1;
+ ixge_descriptor_t d0, d1;
+
+ vlib_prefetch_buffer_with_index (vm, to_rx[2], STORE);
+ vlib_prefetch_buffer_with_index (vm, to_rx[3], STORE);
+
+ CLIB_PREFETCH (d + 2, 32, STORE);
+
+ d0.as_u32x4 = d[0].as_u32x4;
+ d1.as_u32x4 = d[1].as_u32x4;
+
+ s20 = d0.rx_from_hw.status[2];
+ s21 = d1.rx_from_hw.status[2];
+
+ s00 = d0.rx_from_hw.status[0];
+ s01 = d1.rx_from_hw.status[0];
+
+ if (!
+ ((s20 & s21) & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE))
+ goto found_hw_owned_descriptor_x2;
+
+ bi0 = to_rx[0];
+ bi1 = to_rx[1];
+
+ ASSERT (to_add - 1 >= xm->rx_buffers_to_add);
+ fi0 = to_add[0];
+ fi1 = to_add[-1];
+
+ to_rx[0] = fi0;
+ to_rx[1] = fi1;
+ to_rx += 2;
+ to_add -= 2;
+
+ ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED ==
+ vlib_buffer_is_known (vm, bi0));
+ ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED ==
+ vlib_buffer_is_known (vm, bi1));
+ ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED ==
+ vlib_buffer_is_known (vm, fi0));
+ ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED ==
+ vlib_buffer_is_known (vm, fi1));
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1);
+
+ CLIB_PREFETCH (b0->data, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (b1->data, CLIB_CACHE_LINE_BYTES, LOAD);
+
+ is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0;
+ is_eop1 = (s21 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0;
+
+ ixge_rx_next_and_error_from_status_x2 (xd, s00, s20, s01, s21,
+ &next0, &error0, &flags0,
+ &next1, &error1, &flags1);
+
+ next0 = is_sop ? next0 : next_index_sop;
+ next1 = is_eop0 ? next1 : next0;
+ next_index_sop = next1;
+
+ b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT);
+ b1->flags |= flags1 | (!is_eop1 << VLIB_BUFFER_LOG2_NEXT_PRESENT);
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer (b1)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+
+ b0->error = node->errors[error0];
+ b1->error = node->errors[error1];
+
+ len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor;
+ len1 = d1.rx_from_hw.n_packet_bytes_this_descriptor;
+ n_bytes += len0 + len1;
+ n_packets += is_eop0 + is_eop1;
+
+ /* Give new buffers to hardware. */
+ d0.rx_to_hw.tail_address =
+ vlib_get_buffer_data_physical_address (vm, fi0);
+ d1.rx_to_hw.tail_address =
+ vlib_get_buffer_data_physical_address (vm, fi1);
+ d0.rx_to_hw.head_address = d[0].rx_to_hw.tail_address;
+ d1.rx_to_hw.head_address = d[1].rx_to_hw.tail_address;
+ d[0].as_u32x4 = d0.as_u32x4;
+ d[1].as_u32x4 = d1.as_u32x4;
+
+ d += 2;
+ n_descriptors_left -= 2;
+
+ /* Point to either l2 or l3 header depending on next. */
+ l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT))
+ ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0;
+ l3_offset1 = (is_eop0 && (next1 != IXGE_RX_NEXT_ETHERNET_INPUT))
+ ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s01) : 0;
+
+ b0->current_length = len0 - l3_offset0;
+ b1->current_length = len1 - l3_offset1;
+ b0->current_data = l3_offset0;
+ b1->current_data = l3_offset1;
+
+ b_last->next_buffer = is_sop ? ~0 : bi0;
+ b0->next_buffer = is_eop0 ? ~0 : bi1;
+ bi_last = bi1;
+ b_last = b1;
+
+ if (CLIB_DEBUG > 0)
+ {
+ u32 bi_sop0 = is_sop ? bi0 : bi_sop;
+ u32 bi_sop1 = is_eop0 ? bi1 : bi_sop0;
+
+ if (is_eop0)
+ {
+ u8 *msg = vlib_validate_buffer (vm, bi_sop0,
+ /* follow_buffer_next */ 1);
+ ASSERT (!msg);
+ }
+ if (is_eop1)
+ {
+ u8 *msg = vlib_validate_buffer (vm, bi_sop1,
+ /* follow_buffer_next */ 1);
+ ASSERT (!msg);
+ }
+ }
+ if (0) /* "Dave" version */
+ {
+ u32 bi_sop0 = is_sop ? bi0 : bi_sop;
+ u32 bi_sop1 = is_eop0 ? bi1 : bi_sop0;
+
+ if (is_eop0)
+ {
+ to_next[0] = bi_sop0;
+ to_next++;
+ n_left_to_next--;
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi_sop0, next0);
+ }
+ if (is_eop1)
+ {
+ to_next[0] = bi_sop1;
+ to_next++;
+ n_left_to_next--;
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi_sop1, next1);
+ }
+ is_sop = is_eop1;
+ bi_sop = bi_sop1;
+ }
+ if (1) /* "Eliot" version */
+ {
+ /* Speculatively enqueue to cached next. */
+ u8 saved_is_sop = is_sop;
+ u32 bi_sop_save = bi_sop;
+
+ bi_sop = saved_is_sop ? bi0 : bi_sop;
+ to_next[0] = bi_sop;
+ to_next += is_eop0;
+ n_left_to_next -= is_eop0;
+
+ bi_sop = is_eop0 ? bi1 : bi_sop;
+ to_next[0] = bi_sop;
+ to_next += is_eop1;
+ n_left_to_next -= is_eop1;
+
+ is_sop = is_eop1;
+
+ if (PREDICT_FALSE
+ (!(next0 == next_index && next1 == next_index)))
+ {
+ /* Undo speculation. */
+ to_next -= is_eop0 + is_eop1;
+ n_left_to_next += is_eop0 + is_eop1;
+
+ /* Re-do both descriptors being careful about where we enqueue. */
+ bi_sop = saved_is_sop ? bi0 : bi_sop_save;
+ if (is_eop0)
+ {
+ if (next0 != next_index)
+ vlib_set_next_frame_buffer (vm, node, next0, bi_sop);
+ else
+ {
+ to_next[0] = bi_sop;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ bi_sop = is_eop0 ? bi1 : bi_sop;
+ if (is_eop1)
+ {
+ if (next1 != next_index)
+ vlib_set_next_frame_buffer (vm, node, next1, bi_sop);
+ else
+ {
+ to_next[0] = bi_sop;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ /* Switch cached next index when next for both packets is the same. */
+ if (is_eop0 && is_eop1 && next0 == next1)
+ {
+ vlib_put_next_frame (vm, node, next_index,
+ n_left_to_next);
+ next_index = next0;
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+ }
+ }
+ }
+ }
+
+ /* Bail out of dual loop and proceed with single loop. */
+ found_hw_owned_descriptor_x2:
+
+ while (n_descriptors_left > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t *b0;
+ u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0;
+ u8 is_eop0, error0, next0;
+ ixge_descriptor_t d0;
+
+ d0.as_u32x4 = d[0].as_u32x4;
+
+ s20 = d0.rx_from_hw.status[2];
+ s00 = d0.rx_from_hw.status[0];
+
+ if (!(s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE))
+ goto found_hw_owned_descriptor_x1;
+
+ bi0 = to_rx[0];
+ ASSERT (to_add >= xm->rx_buffers_to_add);
+ fi0 = to_add[0];
+
+ to_rx[0] = fi0;
+ to_rx += 1;
+ to_add -= 1;
+
+ ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED ==
+ vlib_buffer_is_known (vm, bi0));
+ ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED ==
+ vlib_buffer_is_known (vm, fi0));
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
+
+ is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0;
+ ixge_rx_next_and_error_from_status_x1
+ (xd, s00, s20, &next0, &error0, &flags0);
+
+ next0 = is_sop ? next0 : next_index_sop;
+ next_index_sop = next0;
+
+ b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT);
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+
+ b0->error = node->errors[error0];
+
+ len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor;
+ n_bytes += len0;
+ n_packets += is_eop0;
+
+ /* Give new buffer to hardware. */
+ d0.rx_to_hw.tail_address =
+ vlib_get_buffer_data_physical_address (vm, fi0);
+ d0.rx_to_hw.head_address = d0.rx_to_hw.tail_address;
+ d[0].as_u32x4 = d0.as_u32x4;
+
+ d += 1;
+ n_descriptors_left -= 1;
+
+ /* Point to either l2 or l3 header depending on next. */
+ l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT))
+ ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0;
+ b0->current_length = len0 - l3_offset0;
+ b0->current_data = l3_offset0;
+
+ b_last->next_buffer = is_sop ? ~0 : bi0;
+ bi_last = bi0;
+ b_last = b0;
+
+ bi_sop = is_sop ? bi0 : bi_sop;
+
+ if (CLIB_DEBUG > 0 && is_eop0)
+ {
+ u8 *msg =
+ vlib_validate_buffer (vm, bi_sop, /* follow_buffer_next */ 1);
+ ASSERT (!msg);
+ }
+
+ if (0) /* "Dave" version */
+ {
+ if (is_eop0)
+ {
+ to_next[0] = bi_sop;
+ to_next++;
+ n_left_to_next--;
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi_sop, next0);
+ }
+ }
+ if (1) /* "Eliot" version */
+ {
+ if (PREDICT_TRUE (next0 == next_index))
+ {
+ to_next[0] = bi_sop;
+ to_next += is_eop0;
+ n_left_to_next -= is_eop0;
+ }
+ else
+ {
+ if (next0 != next_index && is_eop0)
+ vlib_set_next_frame_buffer (vm, node, next0, bi_sop);
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ next_index = next0;
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+ }
+ }
+ is_sop = is_eop0;
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+found_hw_owned_descriptor_x1:
+ if (n_descriptors_left > 0)
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+
+ _vec_len (xm->rx_buffers_to_add) = (to_add + 1) - xm->rx_buffers_to_add;
+
+ {
+ u32 n_done = n_descriptors - n_descriptors_left;
+
+ if (n_trace > 0 && n_done > 0)
+ {
+ u32 n = clib_min (n_trace, n_done);
+ ixge_rx_trace (xm, xd, dq,
+ d_trace_save,
+ d_trace_buffers,
+ &dq->descriptors[start_descriptor_index], n);
+ vlib_set_trace_count (vm, node, n_trace - n);
+ }
+ if (d_trace_save)
+ {
+ _vec_len (d_trace_save) = 0;
+ _vec_len (d_trace_buffers) = 0;
+ }
+
+ /* Don't keep a reference to b_last if we don't have to.
+       Otherwise we could overwrite a next_buffer pointer after a packet
+       has already been enqueued. */
+ if (is_sop)
+ {
+ b_last->next_buffer = ~0;
+ bi_last = ~0;
+ }
+
+ dq->rx.n_descriptors_done_this_call = n_done;
+ dq->rx.n_descriptors_done_total += n_done;
+ dq->rx.is_start_of_packet = is_sop;
+ dq->rx.saved_start_of_packet_buffer_index = bi_sop;
+ dq->rx.saved_last_buffer_index = bi_last;
+ dq->rx.saved_start_of_packet_next_index = next_index_sop;
+ dq->rx.next_index = next_index;
+ dq->rx.n_bytes += n_bytes;
+
+ return n_packets;
+ }
+}
+
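+/* Per-queue RX entry point: compare the hardware head index with the
+   software head, process the difference in at most two no-wrap chunks,
+   give the tail back to hardware and bump the interface RX counters. */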
+static uword
+ixge_rx_queue (ixge_main_t * xm,
+ ixge_device_t * xd,
+ vlib_node_runtime_t * node, u32 queue_index)
+{
+ ixge_dma_queue_t *dq =
+ vec_elt_at_index (xd->dma_queues[VLIB_RX], queue_index);
+ ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, dq->queue_index);
+ uword n_packets = 0;
+ u32 hw_head_index, sw_head_index;
+
+ /* One time initialization. */
+ if (!dq->rx.node)
+ {
+ dq->rx.node = node;
+ dq->rx.is_start_of_packet = 1;
+ dq->rx.saved_start_of_packet_buffer_index = ~0;
+ dq->rx.saved_last_buffer_index = ~0;
+ }
+
+ dq->rx.next_index = node->cached_next_index;
+
+ dq->rx.n_descriptors_done_total = 0;
+ dq->rx.n_descriptors_done_this_call = 0;
+ dq->rx.n_bytes = 0;
+
+ /* Fetch head from hardware and compare to where we think we are. */
+ hw_head_index = dr->head_index;
+ sw_head_index = dq->head_index;
+
+ if (hw_head_index == sw_head_index)
+ goto done;
+
+ if (hw_head_index < sw_head_index)
+ {
+ u32 n_tried = dq->n_descriptors - sw_head_index;
+ n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried);
+ sw_head_index =
+ ixge_ring_add (dq, sw_head_index,
+ dq->rx.n_descriptors_done_this_call);
+
+ if (dq->rx.n_descriptors_done_this_call != n_tried)
+ goto done;
+ }
+ if (hw_head_index >= sw_head_index)
+ {
+ u32 n_tried = hw_head_index - sw_head_index;
+ n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried);
+ sw_head_index =
+ ixge_ring_add (dq, sw_head_index,
+ dq->rx.n_descriptors_done_this_call);
+ }
+
+done:
+ dq->head_index = sw_head_index;
+ dq->tail_index =
+ ixge_ring_add (dq, dq->tail_index, dq->rx.n_descriptors_done_total);
+
+ /* Give tail back to hardware. */
+ CLIB_MEMORY_BARRIER ();
+
+ dr->tail_index = dq->tail_index;
+
+ vlib_increment_combined_counter (vnet_main.
+ interface_main.combined_sw_if_counters +
+ VNET_INTERFACE_COUNTER_RX,
+ 0 /* cpu_index */ ,
+ xd->vlib_sw_if_index, n_packets,
+ dq->rx.n_bytes);
+
+ return n_packets;
+}
+
+static void
+ixge_interrupt (ixge_main_t * xm, ixge_device_t * xd, u32 i)
+{
+ vlib_main_t *vm = xm->vlib_main;
+ ixge_regs_t *r = xd->regs;
+
+ if (i != 20)
+ {
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .function = (char *) __FUNCTION__,.format =
+ "ixge %d, %s",.format_args = "i1t1",.n_enum_strings =
+ 16,.enum_strings =
+ {
+ "flow director",
+ "rx miss",
+ "pci exception",
+ "mailbox",
+ "link status change",
+ "linksec key exchange",
+ "manageability event",
+ "reserved23",
+ "sdp0",
+ "sdp1",
+ "sdp2",
+ "sdp3",
+ "ecc", "descriptor handler error", "tcp timer", "other",},};
+ struct
+ {
+ u8 instance;
+ u8 index;
+ } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->instance = xd->device_index;
+ ed->index = i - 16;
+ }
+ else
+ {
+ u32 v = r->xge_mac.link_status;
+ uword is_up = (v & (1 << 30)) != 0;
+
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .function = (char *) __FUNCTION__,.format =
+ "ixge %d, link status change 0x%x",.format_args = "i4i4",};
+ struct
+ {
+ u32 instance, link_status;
+ } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->instance = xd->device_index;
+ ed->link_status = v;
+ xd->link_status_at_last_link_change = v;
+
+ vlib_process_signal_event (vm, ixge_process_node.index,
+ EVENT_SET_FLAGS,
+ ((is_up << 31) | xd->vlib_hw_if_index));
+ }
+}
+
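+/* Copy the non-zero buffer indices from b[] to t[], zeroing b[] as we go;
+   returns the number of indices copied.  Used to gather the buffers behind
+   descriptors that hardware has finished transmitting. */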
+always_inline u32
+clean_block (u32 * b, u32 * t, u32 n_left)
+{
+ u32 *t0 = t;
+
+ while (n_left >= 4)
+ {
+ u32 bi0, bi1, bi2, bi3;
+
+ t[0] = bi0 = b[0];
+ b[0] = 0;
+ t += bi0 != 0;
+
+ t[0] = bi1 = b[1];
+ b[1] = 0;
+ t += bi1 != 0;
+
+ t[0] = bi2 = b[2];
+ b[2] = 0;
+ t += bi2 != 0;
+
+ t[0] = bi3 = b[3];
+ b[3] = 0;
+ t += bi3 != 0;
+
+ b += 4;
+ n_left -= 4;
+ }
+
+ while (n_left > 0)
+ {
+ u32 bi0;
+
+ t[0] = bi0 = b[0];
+ b[0] = 0;
+ t += bi0 != 0;
+ b += 1;
+ n_left -= 1;
+ }
+
+ return t - t0;
+}
+
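+/* TX completion: wait out the head write-back race (elog'd when
+   IXGE_HWBP_RACE_ELOG is set), compute how many descriptors hardware has
+   returned since the last cleanup and free the corresponding buffers. */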
+static void
+ixge_tx_queue (ixge_main_t * xm, ixge_device_t * xd, u32 queue_index)
+{
+ vlib_main_t *vm = xm->vlib_main;
+ ixge_dma_queue_t *dq =
+ vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index);
+ u32 n_clean, *b, *t, *t0;
+ i32 n_hw_owned_descriptors;
+ i32 first_to_clean, last_to_clean;
+ u64 hwbp_race = 0;
+
+ /* Handle case where head write back pointer update
+ * arrives after the interrupt during high PCI bus loads.
+ */
+ while ((dq->head_index == dq->tx.head_index_write_back[0]) &&
+ dq->tx.n_buffers_on_ring && (dq->head_index != dq->tail_index))
+ {
+ hwbp_race++;
+ if (IXGE_HWBP_RACE_ELOG && (hwbp_race == 1))
+ {
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .function = (char *) __FUNCTION__,.format =
+ "ixge %d tx head index race: head %4d, tail %4d, buffs %4d",.format_args
+ = "i4i4i4i4",};
+ struct
+ {
+ u32 instance, head_index, tail_index, n_buffers_on_ring;
+ } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->instance = xd->device_index;
+ ed->head_index = dq->head_index;
+ ed->tail_index = dq->tail_index;
+ ed->n_buffers_on_ring = dq->tx.n_buffers_on_ring;
+ }
+ }
+
+ dq->head_index = dq->tx.head_index_write_back[0];
+ n_hw_owned_descriptors = ixge_ring_sub (dq, dq->head_index, dq->tail_index);
+ ASSERT (dq->tx.n_buffers_on_ring >= n_hw_owned_descriptors);
+ n_clean = dq->tx.n_buffers_on_ring - n_hw_owned_descriptors;
+
+ if (IXGE_HWBP_RACE_ELOG && hwbp_race)
+ {
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .function = (char *) __FUNCTION__,.format =
+ "ixge %d tx head index race: head %4d, hw_owned %4d, n_clean %4d, retries %d",.format_args
+ = "i4i4i4i4i4",};
+ struct
+ {
+ u32 instance, head_index, n_hw_owned_descriptors, n_clean, retries;
+ } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->instance = xd->device_index;
+ ed->head_index = dq->head_index;
+ ed->n_hw_owned_descriptors = n_hw_owned_descriptors;
+ ed->n_clean = n_clean;
+ ed->retries = hwbp_race;
+ }
+
+ /*
+ * This function used to wait until hardware owned zero descriptors.
+ * At high PPS rates, that doesn't happen until the TX ring is
+ * completely full of descriptors which need to be cleaned up.
+ * That, in turn, causes TX ring-full drops and/or long RX service
+ * interruptions.
+ */
+ if (n_clean == 0)
+ return;
+
+ /* Clean the n_clean descriptors prior to the reported hardware head */
+ last_to_clean = dq->head_index - 1;
+ last_to_clean = (last_to_clean < 0) ? last_to_clean + dq->n_descriptors :
+ last_to_clean;
+
+ first_to_clean = (last_to_clean) - (n_clean - 1);
+ first_to_clean = (first_to_clean < 0) ? first_to_clean + dq->n_descriptors :
+ first_to_clean;
+
+ vec_resize (xm->tx_buffers_pending_free, dq->n_descriptors - 1);
+ t0 = t = xm->tx_buffers_pending_free;
+ b = dq->descriptor_buffer_indices + first_to_clean;
+
+ /* Wrap case: clean from first to end, then start to last */
+ if (first_to_clean > last_to_clean)
+ {
+ t += clean_block (b, t, (dq->n_descriptors - 1) - first_to_clean);
+ first_to_clean = 0;
+ b = dq->descriptor_buffer_indices;
+ }
+
+ /* Typical case: clean from first to last */
+ if (first_to_clean <= last_to_clean)
+ t += clean_block (b, t, (last_to_clean - first_to_clean) + 1);
+
+ if (t > t0)
+ {
+ u32 n = t - t0;
+ vlib_buffer_free_no_next (vm, t0, n);
+ ASSERT (dq->tx.n_buffers_on_ring >= n);
+ dq->tx.n_buffers_on_ring -= n;
+ _vec_len (xm->tx_buffers_pending_free) = 0;
+ }
+}
+
+/* RX queue interrupts 0 thru 7; TX 8 thru 15. */
+always_inline uword
+ixge_interrupt_is_rx_queue (uword i)
+{
+ return i < 8;
+}
+
+always_inline uword
+ixge_interrupt_is_tx_queue (uword i)
+{
+ return i >= 8 && i < 16;
+}
+
+always_inline uword
+ixge_tx_queue_to_interrupt (uword i)
+{
+ return 8 + i;
+}
+
+always_inline uword
+ixge_rx_queue_to_interrupt (uword i)
+{
+ return 0 + i;
+}
+
+always_inline uword
+ixge_interrupt_rx_queue (uword i)
+{
+ ASSERT (ixge_interrupt_is_rx_queue (i));
+ return i - 0;
+}
+
+always_inline uword
+ixge_interrupt_tx_queue (uword i)
+{
+ ASSERT (ixge_interrupt_is_tx_queue (i));
+ return i - 8;
+}
+
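+/* Read and acknowledge the interrupt status, then dispatch each set bit:
+   bits 0-7 are RX queues, bits 8-15 are TX queues, higher bits go to
+   ixge_interrupt () (bit 20 being link status change). */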
+static uword
+ixge_device_input (ixge_main_t * xm,
+ ixge_device_t * xd, vlib_node_runtime_t * node)
+{
+ ixge_regs_t *r = xd->regs;
+ u32 i, s;
+ uword n_rx_packets = 0;
+
+ s = r->interrupt.status_write_1_to_set;
+ if (s)
+ r->interrupt.status_write_1_to_clear = s;
+
+ /* *INDENT-OFF* */
+ foreach_set_bit (i, s, ({
+ if (ixge_interrupt_is_rx_queue (i))
+ n_rx_packets += ixge_rx_queue (xm, xd, node, ixge_interrupt_rx_queue (i));
+
+ else if (ixge_interrupt_is_tx_queue (i))
+ ixge_tx_queue (xm, xd, ixge_interrupt_tx_queue (i));
+
+ else
+ ixge_interrupt (xm, xd, i);
+ }));
+ /* *INDENT-ON* */
+
+ return n_rx_packets;
+}
+
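+/* Input node function.  In interrupt mode, service only the devices
+   flagged in runtime_data[0] by the PCI interrupt handler and re-arm
+   their interrupts; in polling mode, service every device. */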
+static uword
+ixge_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ ixge_main_t *xm = &ixge_main;
+ ixge_device_t *xd;
+ uword n_rx_packets = 0;
+
+ if (node->state == VLIB_NODE_STATE_INTERRUPT)
+ {
+ uword i;
+
+ /* Loop over devices with interrupts. */
+ /* *INDENT-OFF* */
+ foreach_set_bit (i, node->runtime_data[0], ({
+ xd = vec_elt_at_index (xm->devices, i);
+ n_rx_packets += ixge_device_input (xm, xd, node);
+
+ /* Re-enable interrupts since we're going to stay in interrupt mode. */
+ if (! (node->flags & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE))
+ xd->regs->interrupt.enable_write_1_to_set = ~0;
+ }));
+ /* *INDENT-ON* */
+
+ /* Clear mask of devices with pending interrupts. */
+ node->runtime_data[0] = 0;
+ }
+ else
+ {
+ /* Poll all devices for input/interrupts. */
+ vec_foreach (xd, xm->devices)
+ {
+ n_rx_packets += ixge_device_input (xm, xd, node);
+
+ /* Re-enable interrupts when switching out of polling mode. */
+ if (node->flags &
+ VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE)
+ xd->regs->interrupt.enable_write_1_to_set = ~0;
+ }
+ }
+
+ return n_rx_packets;
+}
+
+static char *ixge_error_strings[] = {
+#define _(n,s) s,
+ foreach_ixge_error
+#undef _
+};
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (ixge_input_node, static) = {
+ .function = ixge_input,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .name = "ixge-input",
+
+ /* Will be enabled if/when hardware is detected. */
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ .format_buffer = format_ethernet_header_with_length,
+ .format_trace = format_ixge_rx_dma_trace,
+
+ .n_errors = IXGE_N_ERROR,
+ .error_strings = ixge_error_strings,
+
+ .n_next_nodes = IXGE_RX_N_NEXT,
+ .next_nodes = {
+ [IXGE_RX_NEXT_DROP] = "error-drop",
+ [IXGE_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ [IXGE_RX_NEXT_IP4_INPUT] = "ip4-input",
+ [IXGE_RX_NEXT_IP6_INPUT] = "ip6-input",
+ },
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH_CLONE (ixge_input)
+CLIB_MULTIARCH_SELECT_FN (ixge_input)
+/* *INDENT-ON* */
+
+static u8 *
+format_ixge_device_name (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+ ixge_main_t *xm = &ixge_main;
+ ixge_device_t *xd = vec_elt_at_index (xm->devices, i);
+ return format (s, "TenGigabitEthernet%U",
+ format_vlib_pci_handle, &xd->pci_device.bus_address);
+}
+
+#define IXGE_COUNTER_IS_64_BIT (1 << 0)
+#define IXGE_COUNTER_NOT_CLEAR_ON_READ (1 << 1)
+
+static u8 ixge_counter_flags[] = {
+#define _(a,f) 0,
+#define _64(a,f) IXGE_COUNTER_IS_64_BIT,
+ foreach_ixge_counter
+#undef _
+#undef _64
+};
+
+static void
+ixge_update_counters (ixge_device_t * xd)
+{
+  /* Counter register offsets, stored in u32 (word) units. */
+ static u32 reg_offsets[] = {
+#define _(a,f) (a) / sizeof (u32),
+#define _64(a,f) _(a,f)
+ foreach_ixge_counter
+#undef _
+#undef _64
+ };
+ volatile u32 *r = (volatile u32 *) xd->regs;
+ int i;
+
+ for (i = 0; i < ARRAY_LEN (xd->counters); i++)
+ {
+ u32 o = reg_offsets[i];
+ xd->counters[i] += r[o];
+ if (ixge_counter_flags[i] & IXGE_COUNTER_NOT_CLEAR_ON_READ)
+ r[o] = 0;
+ if (ixge_counter_flags[i] & IXGE_COUNTER_IS_64_BIT)
+ xd->counters[i] += (u64) r[o + 1] << (u64) 32;
+ }
+}
+
+static u8 *
+format_ixge_device_id (u8 * s, va_list * args)
+{
+ u32 device_id = va_arg (*args, u32);
+ char *t = 0;
+ switch (device_id)
+ {
+#define _(f,n) case n: t = #f; break;
+ foreach_ixge_pci_device_id;
+#undef _
+ default:
+ t = 0;
+ break;
+ }
+ if (t == 0)
+ s = format (s, "unknown 0x%x", device_id);
+ else
+ s = format (s, "%s", t);
+ return s;
+}
+
+static u8 *
+format_ixge_link_status (u8 * s, va_list * args)
+{
+ ixge_device_t *xd = va_arg (*args, ixge_device_t *);
+ u32 v = xd->link_status_at_last_link_change;
+
+ s = format (s, "%s", (v & (1 << 30)) ? "up" : "down");
+
+ {
+ char *modes[] = {
+ "1g", "10g parallel", "10g serial", "autoneg",
+ };
+ char *speeds[] = {
+ "unknown", "100m", "1g", "10g",
+ };
+ s = format (s, ", mode %s, speed %s",
+ modes[(v >> 26) & 3], speeds[(v >> 28) & 3]);
+ }
+
+ return s;
+}
+
+static u8 *
+format_ixge_device (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ CLIB_UNUSED (int verbose) = va_arg (*args, int);
+ ixge_main_t *xm = &ixge_main;
+ ixge_device_t *xd = vec_elt_at_index (xm->devices, dev_instance);
+ ixge_phy_t *phy = xd->phys + xd->phy_index;
+ uword indent = format_get_indent (s);
+
+ ixge_update_counters (xd);
+ xd->link_status_at_last_link_change = xd->regs->xge_mac.link_status;
+
+ s = format (s, "Intel 8259X: id %U\n%Ulink %U",
+ format_ixge_device_id, xd->device_id,
+ format_white_space, indent + 2, format_ixge_link_status, xd);
+
+ {
+
+ s = format (s, "\n%UPCIe %U", format_white_space, indent + 2,
+ format_vlib_pci_link_speed, &xd->pci_device);
+ }
+
+ s = format (s, "\n%U", format_white_space, indent + 2);
+ if (phy->mdio_address != ~0)
+ s = format (s, "PHY address %d, id 0x%x", phy->mdio_address, phy->id);
+ else if (xd->sfp_eeprom.id == SFP_ID_sfp)
+ s = format (s, "SFP %U", format_sfp_eeprom, &xd->sfp_eeprom);
+ else
+ s = format (s, "PHY not found");
+
+ /* FIXME */
+ {
+ ixge_dma_queue_t *dq = vec_elt_at_index (xd->dma_queues[VLIB_RX], 0);
+ ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0);
+ u32 hw_head_index = dr->head_index;
+ u32 sw_head_index = dq->head_index;
+ u32 nitems;
+
+ nitems = ixge_ring_sub (dq, hw_head_index, sw_head_index);
+ s = format (s, "\n%U%d unprocessed, %d total buffers on rx queue 0 ring",
+ format_white_space, indent + 2, nitems, dq->n_descriptors);
+
+ s = format (s, "\n%U%d buffers in driver rx cache",
+ format_white_space, indent + 2,
+ vec_len (xm->rx_buffers_to_add));
+
+ s = format (s, "\n%U%d buffers on tx queue 0 ring",
+ format_white_space, indent + 2,
+ xd->dma_queues[VLIB_TX][0].tx.n_buffers_on_ring);
+ }
+ {
+ u32 i;
+ u64 v;
+ static char *names[] = {
+#define _(a,f) #f,
+#define _64(a,f) _(a,f)
+ foreach_ixge_counter
+#undef _
+#undef _64
+ };
+
+ for (i = 0; i < ARRAY_LEN (names); i++)
+ {
+ v = xd->counters[i] - xd->counters_last_clear[i];
+ if (v != 0)
+ s = format (s, "\n%U%-40U%16Ld",
+ format_white_space, indent + 2,
+ format_c_identifier, names[i], v);
+ }
+ }
+
+ return s;
+}
+
+static void
+ixge_clear_hw_interface_counters (u32 instance)
+{
+ ixge_main_t *xm = &ixge_main;
+ ixge_device_t *xd = vec_elt_at_index (xm->devices, instance);
+ ixge_update_counters (xd);
+ memcpy (xd->counters_last_clear, xd->counters, sizeof (xd->counters));
+}
+
+/*
+ * Dynamically redirect all pkts from a specific interface
+ * to the specified node
+ */
+static void
+ixge_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
+ u32 node_index)
+{
+ ixge_main_t *xm = &ixge_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ ixge_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance);
+
+ /* Shut off redirection */
+ if (node_index == ~0)
+ {
+ xd->per_interface_next_index = node_index;
+ return;
+ }
+
+ xd->per_interface_next_index =
+ vlib_node_add_next (xm->vlib_main, ixge_input_node.index, node_index);
+}
+
+
+/* *INDENT-OFF* */
+VNET_DEVICE_CLASS (ixge_device_class) = {
+ .name = "ixge",
+ .tx_function = ixge_interface_tx,
+ .format_device_name = format_ixge_device_name,
+ .format_device = format_ixge_device,
+ .format_tx_trace = format_ixge_tx_dma_trace,
+ .clear_counters = ixge_clear_hw_interface_counters,
+ .admin_up_down_function = ixge_interface_admin_up_down,
+ .rx_redirect_to_node = ixge_set_interface_next_node,
+ .flatten_output_chains = 1,
+};
+/* *INDENT-ON* */
+
+#define IXGE_N_BYTES_IN_RX_BUFFER (2048) // DAW-HACK: Set Rx buffer size so all packets < ETH_MTU_SIZE fit in the buffer (i.e. sop & eop for all descriptors).
+
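+/* Allocate and program one RX or TX descriptor ring: physmem-backed,
+   128-byte aligned descriptors, pre-posted RX buffers (or TX descriptors
+   initialized from the template), the TX head index write-back area, and
+   the per-queue DMA registers. */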
+static clib_error_t *
+ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index)
+{
+ ixge_main_t *xm = &ixge_main;
+ vlib_main_t *vm = xm->vlib_main;
+ ixge_dma_queue_t *dq;
+ clib_error_t *error = 0;
+
+ vec_validate (xd->dma_queues[rt], queue_index);
+ dq = vec_elt_at_index (xd->dma_queues[rt], queue_index);
+
+ if (!xm->n_descriptors_per_cache_line)
+ xm->n_descriptors_per_cache_line =
+ CLIB_CACHE_LINE_BYTES / sizeof (dq->descriptors[0]);
+
+ if (!xm->n_bytes_in_rx_buffer)
+ xm->n_bytes_in_rx_buffer = IXGE_N_BYTES_IN_RX_BUFFER;
+ xm->n_bytes_in_rx_buffer = round_pow2 (xm->n_bytes_in_rx_buffer, 1024);
+ if (!xm->vlib_buffer_free_list_index)
+ {
+ xm->vlib_buffer_free_list_index =
+ vlib_buffer_get_or_create_free_list (vm, xm->n_bytes_in_rx_buffer,
+ "ixge rx");
+ ASSERT (xm->vlib_buffer_free_list_index != 0);
+ }
+
+ if (!xm->n_descriptors[rt])
+ xm->n_descriptors[rt] = 4 * VLIB_FRAME_SIZE;
+
+ dq->queue_index = queue_index;
+ dq->n_descriptors =
+ round_pow2 (xm->n_descriptors[rt], xm->n_descriptors_per_cache_line);
+ dq->head_index = dq->tail_index = 0;
+
+ dq->descriptors = vlib_physmem_alloc_aligned (vm, &error,
+ dq->n_descriptors *
+ sizeof (dq->descriptors[0]),
+ 128 /* per chip spec */ );
+ if (error)
+ return error;
+
+ memset (dq->descriptors, 0,
+ dq->n_descriptors * sizeof (dq->descriptors[0]));
+ vec_resize (dq->descriptor_buffer_indices, dq->n_descriptors);
+
+ if (rt == VLIB_RX)
+ {
+ u32 n_alloc, i;
+
+ n_alloc = vlib_buffer_alloc_from_free_list
+ (vm, dq->descriptor_buffer_indices,
+ vec_len (dq->descriptor_buffer_indices),
+ xm->vlib_buffer_free_list_index);
+ ASSERT (n_alloc == vec_len (dq->descriptor_buffer_indices));
+ for (i = 0; i < n_alloc; i++)
+ {
+ vlib_buffer_t *b =
+ vlib_get_buffer (vm, dq->descriptor_buffer_indices[i]);
+ dq->descriptors[i].rx_to_hw.tail_address =
+ vlib_physmem_virtual_to_physical (vm, b->data);
+ }
+ }
+ else
+ {
+ u32 i;
+
+ dq->tx.head_index_write_back =
+ vlib_physmem_alloc (vm, &error, CLIB_CACHE_LINE_BYTES);
+
+ for (i = 0; i < dq->n_descriptors; i++)
+ dq->descriptors[i].tx = xm->tx_descriptor_template;
+
+ vec_validate (xm->tx_buffers_pending_free, dq->n_descriptors - 1);
+ }
+
+ {
+ ixge_dma_regs_t *dr = get_dma_regs (xd, rt, queue_index);
+ u64 a;
+
+ a = vlib_physmem_virtual_to_physical (vm, dq->descriptors);
+ dr->descriptor_address[0] = a & 0xFFFFFFFF;
+ dr->descriptor_address[1] = a >> (u64) 32;
+ dr->n_descriptor_bytes = dq->n_descriptors * sizeof (dq->descriptors[0]);
+ dq->head_index = dq->tail_index = 0;
+
+ if (rt == VLIB_RX)
+ {
+ ASSERT ((xm->n_bytes_in_rx_buffer / 1024) < 32);
+ dr->rx_split_control =
+ ( /* buffer size */ ((xm->n_bytes_in_rx_buffer / 1024) << 0)
+ | ( /* lo free descriptor threshold (units of 64 descriptors) */
+ (1 << 22)) | ( /* descriptor type: advanced one buffer */
+ (1 << 25)) | ( /* drop if no descriptors available */
+ (1 << 28)));
+
+ /* Give hardware all but last 16 cache lines' worth of descriptors. */
+ dq->tail_index = dq->n_descriptors -
+ 16 * xm->n_descriptors_per_cache_line;
+ }
+ else
+ {
+        /* Make sure it's initialized before hardware can get to it. */
+ dq->tx.head_index_write_back[0] = dq->head_index;
+
+ a =
+ vlib_physmem_virtual_to_physical (vm, dq->tx.head_index_write_back);
+ dr->tx.head_index_write_back_address[0] = /* enable bit */ 1 | a;
+ dr->tx.head_index_write_back_address[1] = (u64) a >> (u64) 32;
+ }
+
+    /* DMA on the 82599 does not work with bit [13] (rx data write relaxed
+       ordering) or the undocumented bit [12] set. */
+ if (rt == VLIB_RX)
+ dr->dca_control &= ~((1 << 13) | (1 << 12));
+
+ CLIB_MEMORY_BARRIER ();
+
+ if (rt == VLIB_TX)
+ {
+ xd->regs->tx_dma_control |= (1 << 0);
+ dr->control |= ((32 << 0) /* prefetch threshold */
+ | (64 << 8) /* host threshold */
+ | (0 << 16) /* writeback threshold */ );
+ }
+
+ /* Enable this queue and wait for hardware to initialize
+ before adding to tail. */
+ if (rt == VLIB_TX)
+ {
+ dr->control |= 1 << 25;
+ while (!(dr->control & (1 << 25)))
+ ;
+ }
+
+ /* Set head/tail indices and enable DMA. */
+ dr->head_index = dq->head_index;
+ dr->tail_index = dq->tail_index;
+ }
+
+ return error;
+}
+
+static u32
+ixge_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags)
+{
+ ixge_device_t *xd;
+ ixge_regs_t *r;
+ u32 old;
+ ixge_main_t *xm = &ixge_main;
+
+ xd = vec_elt_at_index (xm->devices, hw->dev_instance);
+ r = xd->regs;
+
+ old = r->filter_control;
+
+ if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL)
+ r->filter_control = old | (1 << 9) /* unicast promiscuous */ ;
+ else
+ r->filter_control = old & ~(1 << 9);
+
+ return old;
+}
+
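+/* One-time bring-up, run from the process node: reset each chip, init the
+   PHY, register an ethernet interface with the MAC address read from the
+   receive-address registers, create RX and TX queue 0, map both queues to
+   interrupt bits 0 and 8, accept multicast/broadcast, enable jumbo frames
+   and (unless always polling) enable interrupts. */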
+static void
+ixge_device_init (ixge_main_t * xm)
+{
+ vnet_main_t *vnm = vnet_get_main ();
+ ixge_device_t *xd;
+
+ /* Reset chip(s). */
+ vec_foreach (xd, xm->devices)
+ {
+ ixge_regs_t *r = xd->regs;
+ const u32 reset_bit = (1 << 26) | (1 << 3);
+
+ r->control |= reset_bit;
+
+ /* No need to suspend. Timed to take ~1e-6 secs */
+ while (r->control & reset_bit)
+ ;
+
+ /* Software loaded. */
+ r->extended_control |= (1 << 28);
+
+ ixge_phy_init (xd);
+
+ /* Register ethernet interface. */
+ {
+ u8 addr8[6];
+ u32 i, addr32[2];
+ clib_error_t *error;
+
+ addr32[0] = r->rx_ethernet_address0[0][0];
+ addr32[1] = r->rx_ethernet_address0[0][1];
+ for (i = 0; i < 6; i++)
+ addr8[i] = addr32[i / 4] >> ((i % 4) * 8);
+
+ error = ethernet_register_interface
+ (vnm, ixge_device_class.index, xd->device_index,
+ /* ethernet address */ addr8,
+ &xd->vlib_hw_if_index, ixge_flag_change);
+ if (error)
+ clib_error_report (error);
+ }
+
+ {
+ vnet_sw_interface_t *sw =
+ vnet_get_hw_sw_interface (vnm, xd->vlib_hw_if_index);
+ xd->vlib_sw_if_index = sw->sw_if_index;
+ }
+
+ ixge_dma_init (xd, VLIB_RX, /* queue_index */ 0);
+
+ xm->n_descriptors[VLIB_TX] = 20 * VLIB_FRAME_SIZE;
+
+ ixge_dma_init (xd, VLIB_TX, /* queue_index */ 0);
+
+ /* RX/TX queue 0 gets mapped to interrupt bits 0 & 8. */
+ r->interrupt.queue_mapping[0] = (( /* valid bit */ (1 << 7) |
+ ixge_rx_queue_to_interrupt (0)) << 0);
+
+ r->interrupt.queue_mapping[0] |= (( /* valid bit */ (1 << 7) |
+ ixge_tx_queue_to_interrupt (0)) << 8);
+
+    /* No use in getting too many interrupts.
+       Limit them to at most one per 3/4 of a ring's worth of minimum-sized
+       packets at line rate.  Left disabled since the kernel/vlib main loop
+       already provides an adequate interrupt limiting scheme. */
+ if (0)
+ {
+ f64 line_rate_max_pps =
+ 10e9 / (8 * (64 + /* interframe padding */ 20));
+ ixge_throttle_queue_interrupt (r, 0,
+ .75 * xm->n_descriptors[VLIB_RX] /
+ line_rate_max_pps);
+ }
+
+ /* Accept all multicast and broadcast packets. Should really add them
+ to the dst_ethernet_address register array. */
+ r->filter_control |= (1 << 10) | (1 << 8);
+
+ /* Enable frames up to size in mac frame size register. */
+ r->xge_mac.control |= 1 << 2;
+ r->xge_mac.rx_max_frame_size = (9216 + 14) << 16;
+
+ /* Enable all interrupts. */
+ if (!IXGE_ALWAYS_POLL)
+ r->interrupt.enable_write_1_to_set = ~0;
+ }
+}
+
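+/* Process node: performs device init, debounces link-change interrupts
+   for 1 ms before pushing the new link state to vnet, and polls the
+   hardware counters every 30 seconds so the 36-bit counters cannot wrap
+   unnoticed. */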
+static uword
+ixge_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+ vnet_main_t *vnm = vnet_get_main ();
+ ixge_main_t *xm = &ixge_main;
+ ixge_device_t *xd;
+ uword event_type, *event_data = 0;
+ f64 timeout, link_debounce_deadline;
+
+ ixge_device_init (xm);
+
+ /* Clear all counters. */
+ vec_foreach (xd, xm->devices)
+ {
+ ixge_update_counters (xd);
+ memset (xd->counters, 0, sizeof (xd->counters));
+ }
+
+ timeout = 30.0;
+ link_debounce_deadline = 1e70;
+
+ while (1)
+ {
+ /* 36 bit stat counters could overflow in ~50 secs.
+ We poll every 30 secs to be conservative. */
+ vlib_process_wait_for_event_or_clock (vm, timeout);
+
+ event_type = vlib_process_get_events (vm, &event_data);
+
+ switch (event_type)
+ {
+ case EVENT_SET_FLAGS:
+ /* 1 ms */
+ link_debounce_deadline = vlib_time_now (vm) + 1e-3;
+ timeout = 1e-3;
+ break;
+
+ case ~0:
+ /* No events found: timer expired. */
+ if (vlib_time_now (vm) > link_debounce_deadline)
+ {
+ vec_foreach (xd, xm->devices)
+ {
+ ixge_regs_t *r = xd->regs;
+ u32 v = r->xge_mac.link_status;
+ uword is_up = (v & (1 << 30)) != 0;
+
+ vnet_hw_interface_set_flags
+ (vnm, xd->vlib_hw_if_index,
+ is_up ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0);
+ }
+ link_debounce_deadline = 1e70;
+ timeout = 30.0;
+ }
+ break;
+
+ default:
+ ASSERT (0);
+ }
+
+ if (event_data)
+ _vec_len (event_data) = 0;
+
+ /* Query stats every 30 secs. */
+ {
+ f64 now = vlib_time_now (vm);
+ if (now - xm->time_last_stats_update > 30)
+ {
+ xm->time_last_stats_update = now;
+ vec_foreach (xd, xm->devices) ixge_update_counters (xd);
+ }
+ }
+ }
+
+ return 0;
+}
+
+static vlib_node_registration_t ixge_process_node = {
+ .function = ixge_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "ixge-process",
+};
+
+clib_error_t *
+ixge_init (vlib_main_t * vm)
+{
+ ixge_main_t *xm = &ixge_main;
+ clib_error_t *error;
+
+ xm->vlib_main = vm;
+ memset (&xm->tx_descriptor_template, 0,
+ sizeof (xm->tx_descriptor_template));
+ memset (&xm->tx_descriptor_template_mask, 0,
+ sizeof (xm->tx_descriptor_template_mask));
+ xm->tx_descriptor_template.status0 =
+ (IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED |
+ IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED |
+ IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS);
+ xm->tx_descriptor_template_mask.status0 = 0xffff;
+ xm->tx_descriptor_template_mask.status1 = 0x00003fff;
+
+ xm->tx_descriptor_template_mask.status0 &=
+ ~(IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET
+ | IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS);
+ xm->tx_descriptor_template_mask.status1 &=
+ ~(IXGE_TX_DESCRIPTOR_STATUS1_DONE);
+
+ error = vlib_call_init_function (vm, pci_bus_init);
+
+ return error;
+}
+
+VLIB_INIT_FUNCTION (ixge_init);
+
+
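+/* PCI interrupt handler: mark the input node pending and record which
+   device interrupted in the node's runtime-data bitmap. */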
+static void
+ixge_pci_intr_handler (vlib_pci_device_t * dev)
+{
+ ixge_main_t *xm = &ixge_main;
+ vlib_main_t *vm = xm->vlib_main;
+
+ vlib_node_set_interrupt_pending (vm, ixge_input_node.index);
+
+ /* Let node know which device is interrupting. */
+ {
+ vlib_node_runtime_t *rt =
+ vlib_node_get_runtime (vm, ixge_input_node.index);
+ rt->runtime_data[0] |= 1 << dev->private_data;
+ }
+}
+
+static clib_error_t *
+ixge_pci_init (vlib_main_t * vm, vlib_pci_device_t * dev)
+{
+ ixge_main_t *xm = &ixge_main;
+ clib_error_t *error;
+ void *r;
+ ixge_device_t *xd;
+
+ /* Device found: make sure we have dma memory. */
+ if (unix_physmem_is_fake (vm))
+ return clib_error_return (0, "no physical memory available");
+
+ error = vlib_pci_map_resource (dev, 0, &r);
+ if (error)
+ return error;
+
+ vec_add2 (xm->devices, xd, 1);
+
+ if (vec_len (xm->devices) == 1)
+ {
+ ixge_input_node.function = ixge_input_multiarch_select ();
+ }
+
+ xd->pci_device = dev[0];
+ xd->device_id = xd->pci_device.config0.header.device_id;
+ xd->regs = r;
+ xd->device_index = xd - xm->devices;
+ xd->pci_function = dev->bus_address.function;
+ xd->per_interface_next_index = ~0;
+
+
+ /* Chip found so enable node. */
+ {
+ vlib_node_set_state (vm, ixge_input_node.index,
+ (IXGE_ALWAYS_POLL
+ ? VLIB_NODE_STATE_POLLING
+ : VLIB_NODE_STATE_INTERRUPT));
+
+ dev->private_data = xd->device_index;
+ }
+
+ if (vec_len (xm->devices) == 1)
+ {
+ vlib_register_node (vm, &ixge_process_node);
+ xm->process_node_index = ixge_process_node.index;
+ }
+
+ error = vlib_pci_bus_master_enable (dev);
+
+ if (error)
+ return error;
+
+ return vlib_pci_intr_enable (dev);
+}
+
+/* *INDENT-OFF* */
+PCI_REGISTER_DEVICE (ixge_pci_device_registration,static) = {
+ .init_function = ixge_pci_init,
+ .interrupt_handler = ixge_pci_intr_handler,
+ .supported_devices = {
+#define _(t,i) { .vendor_id = PCI_VENDOR_ID_INTEL, .device_id = i, },
+ foreach_ixge_pci_device_id
+#undef _
+ { 0 },
+ },
+};
+/* *INDENT-ON* */
+
+void
+ixge_set_next_node (ixge_rx_next_t next, char *name)
+{
+ vlib_node_registration_t *r = &ixge_input_node;
+
+ switch (next)
+ {
+ case IXGE_RX_NEXT_IP4_INPUT:
+ case IXGE_RX_NEXT_IP6_INPUT:
+ case IXGE_RX_NEXT_ETHERNET_INPUT:
+ r->next_nodes[next] = name;
+ break;
+
+ default:
+ clib_warning ("%s: illegal next %d\n", __FUNCTION__, next);
+ break;
+ }
+}
+#endif
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/nic/ixge.h b/src/vnet/devices/nic/ixge.h
new file mode 100644
index 00000000000..a8e652dcdab
--- /dev/null
+++ b/src/vnet/devices/nic/ixge.h
@@ -0,0 +1,1293 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_ixge_h
+#define included_ixge_h
+
+#include <vnet/vnet.h>
+#include <vlib/pci/pci.h>
+#include <vlib/i2c.h>
+#include <vnet/devices/nic/sfp.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/ip6_packet.h>
+
+typedef volatile struct
+{
+ /* [31:7] 128 byte aligned. */
+ u32 descriptor_address[2];
+ u32 n_descriptor_bytes;
+
+ /* [5] rx/tx descriptor dca enable
+ [6] rx packet head dca enable
+ [7] rx packet tail dca enable
+ [9] rx/tx descriptor relaxed order
+ [11] rx/tx descriptor write back relaxed order
+ [13] rx/tx data write/read relaxed order
+ [15] rx head data write relaxed order
+ [31:24] apic id for cpu's cache. */
+ u32 dca_control;
+
+ u32 head_index;
+
+ /* [4:0] tail buffer size (in 1k byte units)
+ [13:8] head buffer size (in 64 byte units)
+ [24:22] lo free descriptors threshold (units of 64 descriptors)
+ [27:25] descriptor type 0 = legacy, 1 = advanced one buffer (e.g. tail),
+ 2 = advanced header splitting (head + tail), 5 = advanced header
+ splitting (head only).
+ [28] drop if no descriptors available. */
+ u32 rx_split_control;
+
+ u32 tail_index;
+ CLIB_PAD_FROM_TO (0x1c, 0x28);
+
+ /* [7:0] rx/tx prefetch threshold
+ [15:8] rx/tx host threshold
+ [24:16] rx/tx write back threshold
+ [25] rx/tx enable
+ [26] tx descriptor writeback flush
+ [30] rx strip vlan enable */
+ u32 control;
+
+ u32 rx_coallesce_control;
+
+ union
+ {
+ struct
+ {
+ /* packets bytes lo hi */
+ u32 stats[3];
+
+ u32 unused;
+ } rx;
+
+ struct
+ {
+ u32 unused[2];
+
+ /* [0] enables head write back. */
+ u32 head_index_write_back_address[2];
+ } tx;
+ };
+} ixge_dma_regs_t;
+
+/* Only advanced descriptors are supported. */
+typedef struct
+{
+ u64 tail_address;
+ u64 head_address;
+} ixge_rx_to_hw_descriptor_t;
+
+typedef struct
+{
+ u32 status[3];
+ u16 n_packet_bytes_this_descriptor;
+ u16 vlan_tag;
+} ixge_rx_from_hw_descriptor_t;
+
+#define IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2 (1 << (4 + 11))
+/* Valid if not layer2. */
+#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4 (1 << (4 + 0))
+#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT (1 << (4 + 1))
+#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6 (1 << (4 + 2))
+#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT (1 << (4 + 3))
+#define IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP (1 << (4 + 4))
+#define IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP (1 << (4 + 5))
+#define IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET(s) (((s) >> 21) & 0x3ff)
+
+#define IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE (1 << (0 + 0))
+#define IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET (1 << (0 + 1))
+#define IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN (1 << (0 + 3))
+#define IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED (1 << (0 + 4))
+#define IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED (1 << (0 + 5))
+#define IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED (1 << (0 + 6))
+#define IXGE_RX_DESCRIPTOR_STATUS2_NOT_UNICAST (1 << (0 + 7))
+#define IXGE_RX_DESCRIPTOR_STATUS2_IS_DOUBLE_VLAN (1 << (0 + 9))
+#define IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR (1 << (0 + 10))
+#define IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR (1 << (20 + 9))
+#define IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR (1 << (20 + 10))
+#define IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR (1 << (20 + 11))
+
+/* For layer2 packets stats0 bottom 3 bits give ether type index from filter. */
+#define IXGE_RX_DESCRIPTOR_STATUS0_LAYER2_ETHERNET_TYPE(s) ((s) & 7)
+
+typedef struct
+{
+ u64 buffer_address;
+ u16 n_bytes_this_buffer;
+ u16 status0;
+ u32 status1;
+#define IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED (3 << 4)
+#define IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED (1 << (8 + 5))
+#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS (8 + 3)
+#define IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS)
+#define IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS (1 << (8 + 1))
+#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET (8 + 0)
+#define IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET)
+#define IXGE_TX_DESCRIPTOR_STATUS1_DONE (1 << 0)
+#define IXGE_TX_DESCRIPTOR_STATUS1_CONTEXT(i) (/* valid */ (1 << 7) | ((i) << 4))
+#define IXGE_TX_DESCRIPTOR_STATUS1_IPSEC_OFFLOAD (1 << (8 + 2))
+#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_TCP_UDP_CHECKSUM (1 << (8 + 1))
+#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_IP4_CHECKSUM (1 << (8 + 0))
+#define IXGE_TX_DESCRIPTOR_STATUS0_N_BYTES_THIS_BUFFER(l) ((l) << 0)
+#define IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET(l) ((l) << 14)
+} ixge_tx_descriptor_t;
+
+typedef struct
+{
+ struct
+ {
+ u8 checksum_start_offset;
+ u8 checksum_insert_offset;
+ u16 checksum_end_offset;
+ } ip, tcp;
+ u32 status0;
+
+ u8 status1;
+
+ /* Byte offset after UDP/TCP header. */
+ u8 payload_offset;
+
+ u16 max_tcp_segment_size;
+} __attribute__ ((packed)) ixge_tx_context_descriptor_t;
+
+typedef union
+{
+ ixge_rx_to_hw_descriptor_t rx_to_hw;
+ ixge_rx_from_hw_descriptor_t rx_from_hw;
+ ixge_tx_descriptor_t tx;
+ u32x4 as_u32x4;
+} ixge_descriptor_t;
+
+typedef volatile struct
+{
+ /* [2] pcie master disable
+ [3] mac reset
+ [26] global device reset */
+ u32 control;
+ u32 control_alias;
+ /* [3:2] device id (0 or 1 for dual port chips)
+ [7] link is up
+ [17:10] num vfs
+ [18] io active
+ [19] pcie master enable status */
+ u32 status_read_only;
+ CLIB_PAD_FROM_TO (0xc, 0x18);
+ /* [14] pf reset done
+ [17] relaxed ordering disable
+ [26] extended vlan enable
+ [28] driver loaded */
+ u32 extended_control;
+ CLIB_PAD_FROM_TO (0x1c, 0x20);
+
+ /* software definable pins.
+ sdp_data [7:0]
+ sdp_is_output [15:8]
+ sdp_is_native [23:16]
+ sdp_function [31:24].
+ */
+ u32 sdp_control;
+ CLIB_PAD_FROM_TO (0x24, 0x28);
+
+ /* [0] i2c clock in
+ [1] i2c clock out
+ [2] i2c data in
+ [3] i2c data out */
+ u32 i2c_control;
+ CLIB_PAD_FROM_TO (0x2c, 0x4c);
+ u32 tcp_timer;
+
+ CLIB_PAD_FROM_TO (0x50, 0x200);
+
+ u32 led_control;
+
+ CLIB_PAD_FROM_TO (0x204, 0x600);
+ u32 core_spare;
+ CLIB_PAD_FROM_TO (0x604, 0x700);
+
+ struct
+ {
+ u32 vflr_events_clear[4];
+ u32 mailbox_interrupt_status[4];
+ u32 mailbox_interrupt_enable[4];
+ CLIB_PAD_FROM_TO (0x730, 0x800);
+ } pf_foo;
+
+ struct
+ {
+ u32 status_write_1_to_clear;
+ CLIB_PAD_FROM_TO (0x804, 0x808);
+ u32 status_write_1_to_set;
+ CLIB_PAD_FROM_TO (0x80c, 0x810);
+ u32 status_auto_clear_enable;
+ CLIB_PAD_FROM_TO (0x814, 0x820);
+
+ /* [11:3] minimum inter-interrupt interval
+ (2e-6 units; 20e-6 units for fast ethernet).
+ [15] low-latency interrupt moderation enable
+ [20:16] low-latency interrupt credit
+ [27:21] interval counter
+ [31] write disable for credit and counter (write only). */
+ u32 throttle0[24];
+
+ u32 enable_write_1_to_set;
+ CLIB_PAD_FROM_TO (0x884, 0x888);
+ u32 enable_write_1_to_clear;
+ CLIB_PAD_FROM_TO (0x88c, 0x890);
+ u32 enable_auto_clear;
+ u32 msi_to_eitr_select;
+ /* [3:0] spd 0-3 interrupt detection enable
+ [4] msi-x enable
+ [5] other clear disable (makes other bits in status not clear on read)
+ etc. */
+ u32 control;
+ CLIB_PAD_FROM_TO (0x89c, 0x900);
+
+ /* Defines interrupt mapping for 128 rx + 128 tx queues.
+ 64 x 4 8 bit entries.
+ For register [i]:
+ [5:0] bit in interrupt status for rx queue 2*i + 0
+ [7] valid bit
+ [13:8] bit for tx queue 2*i + 0
+ [15] valid bit
+ similar for rx 2*i + 1 and tx 2*i + 1. */
+ u32 queue_mapping[64];
+
+ /* tcp timer [7:0] and other interrupts [15:8] */
+ u32 misc_mapping;
+ CLIB_PAD_FROM_TO (0xa04, 0xa90);
+
+ /* 64 interrupts determined by mappings. */
+ u32 status1_write_1_to_clear[4];
+ u32 enable1_write_1_to_set[4];
+ u32 enable1_write_1_to_clear[4];
+ CLIB_PAD_FROM_TO (0xac0, 0xad0);
+ u32 status1_enable_auto_clear[4];
+ CLIB_PAD_FROM_TO (0xae0, 0x1000);
+ } interrupt;
+
+ ixge_dma_regs_t rx_dma0[64];
+
+ CLIB_PAD_FROM_TO (0x2000, 0x2140);
+ u32 dcb_rx_packet_plane_t4_config[8];
+ u32 dcb_rx_packet_plane_t4_status[8];
+ CLIB_PAD_FROM_TO (0x2180, 0x2300);
+
+ /* reg i defines mapping for 4 rx queues starting at 4*i + 0. */
+ u32 rx_queue_stats_mapping[32];
+ u32 rx_queue_stats_control;
+
+ CLIB_PAD_FROM_TO (0x2384, 0x2410);
+ u32 fc_user_descriptor_ptr[2];
+ u32 fc_buffer_control;
+ CLIB_PAD_FROM_TO (0x241c, 0x2420);
+ u32 fc_rx_dma;
+ CLIB_PAD_FROM_TO (0x2424, 0x2430);
+ u32 dcb_packet_plane_control;
+ CLIB_PAD_FROM_TO (0x2434, 0x2f00);
+
+ u32 rx_dma_control;
+ u32 pf_queue_drop_enable;
+ CLIB_PAD_FROM_TO (0x2f08, 0x2f20);
+ u32 rx_dma_descriptor_cache_config;
+ CLIB_PAD_FROM_TO (0x2f24, 0x3000);
+
+ /* 1 bit. */
+ u32 rx_enable;
+ CLIB_PAD_FROM_TO (0x3004, 0x3008);
+ /* [15:0] ether type (little endian)
+ [31:16] opcode (big endian) */
+ u32 flow_control_control;
+ CLIB_PAD_FROM_TO (0x300c, 0x3020);
+ /* 3 bit traffic class for each of 8 priorities. */
+ u32 rx_priority_to_traffic_class;
+ CLIB_PAD_FROM_TO (0x3024, 0x3028);
+ u32 rx_coallesce_data_buffer_control;
+ CLIB_PAD_FROM_TO (0x302c, 0x3190);
+ u32 rx_packet_buffer_flush_detect;
+ CLIB_PAD_FROM_TO (0x3194, 0x3200);
+ u32 flow_control_tx_timers[4]; /* 2 timer values */
+ CLIB_PAD_FROM_TO (0x3210, 0x3220);
+ u32 flow_control_rx_threshold_lo[8];
+ CLIB_PAD_FROM_TO (0x3240, 0x3260);
+ u32 flow_control_rx_threshold_hi[8];
+ CLIB_PAD_FROM_TO (0x3280, 0x32a0);
+ u32 flow_control_refresh_threshold;
+ CLIB_PAD_FROM_TO (0x32a4, 0x3c00);
+ /* For each of 8 traffic classes (units of bytes). */
+ u32 rx_packet_buffer_size[8];
+ CLIB_PAD_FROM_TO (0x3c20, 0x3d00);
+ u32 flow_control_config;
+ CLIB_PAD_FROM_TO (0x3d04, 0x4200);
+
+ struct
+ {
+ u32 pcs_config;
+ CLIB_PAD_FROM_TO (0x4204, 0x4208);
+ u32 link_control;
+ u32 link_status;
+ u32 pcs_debug[2];
+ u32 auto_negotiation;
+ u32 link_partner_ability;
+ u32 auto_negotiation_tx_next_page;
+ u32 auto_negotiation_link_partner_next_page;
+ CLIB_PAD_FROM_TO (0x4228, 0x4240);
+ } gige_mac;
+
+ struct
+ {
+ /* [0] tx crc enable
+ [2] enable frames up to max frame size register [31:16]
+ [10] pad frames < 64 bytes if specified by user
+ [15] loopback enable
+ [16] mdc hi speed
+ [17] turn off mdc between mdio packets */
+ u32 control;
+
+ /* [5] rx symbol error (all bits clear on read)
+ [6] rx illegal symbol
+ [7] rx idle error
+ [8] rx local fault
+ [9] rx remote fault */
+ u32 status;
+
+ u32 pause_and_pace_control;
+ CLIB_PAD_FROM_TO (0x424c, 0x425c);
+ u32 phy_command;
+ u32 phy_data;
+ CLIB_PAD_FROM_TO (0x4264, 0x4268);
+
+ /* [31:16] max frame size in bytes. */
+ u32 rx_max_frame_size;
+ CLIB_PAD_FROM_TO (0x426c, 0x4288);
+
+ /* [0]
+ [2] pcs receive link up? (latch lo)
+ [7] local fault
+ [1]
+ [0] pcs 10g base r capable
+ [1] pcs 10g base x capable
+ [2] pcs 10g base w capable
+ [10] rx local fault
+ [11] tx local fault
+ [15:14] 2 => device present at this address (else not present) */
+ u32 xgxs_status[2];
+
+ u32 base_x_pcs_status;
+
+ /* [0] pass unrecognized flow control frames
+ [1] discard pause frames
+ [2] rx priority flow control enable (only in dcb mode)
+ [3] rx flow control enable. */
+ u32 flow_control;
+
+ /* [3:0] tx lanes change polarity
+ [7:4] rx lanes change polarity
+ [11:8] swizzle tx lanes
+ [15:12] swizzle rx lanes
+ 4 x 2 bit tx lane swap
+ 4 x 2 bit rx lane swap. */
+ u32 serdes_control;
+
+ u32 fifo_control;
+
+ /* [0] force link up
+ [1] autoneg ack2 bit to transmit
+ [6:2] autoneg selector field to transmit
+ [8:7] 10g pma/pmd type 0 => xaui, 1 kx4, 2 cx4
+ [9] 1g pma/pmd type 0 => sfi, 1 => kx/bx
+ [10] disable 10g on without main power
+ [11] restart autoneg on transition to dx power state
+ [12] restart autoneg
+ [15:13] link mode:
+ 0 => 1g no autoneg
+ 1 => 10g kx4 parallel link no autoneg
+ 2 => 1g bx autoneg
+ 3 => 10g sfi serdes
+ 4 => kx4/kx/kr
+ 5 => xgmii 1g/100m
+ 6 => kx4/kx/kr 1g an
+ 7 kx4/kx/kr sgmii.
+ [16] kr support
+ [17] fec requested
+ [18] fec ability
+ etc. */
+ u32 auto_negotiation_control;
+
+ /* [0] signal detect 1g/100m
+ [1] fec signal detect
+ [2] 10g serial pcs fec block lock
+ [3] 10g serial high error rate
+ [4] 10g serial pcs block lock
+ [5] kx/kx4/kr autoneg next page received
+ [6] kx/kx4/kr backplane autoneg next page received
+ [7] link status clear to read
+ [11:8] 10g signal detect (4 lanes) (for serial just lane 0)
+ [12] 10g serial signal detect
+ [16:13] 10g parallel lane sync status
+ [17] 10g parallel align status
+ [18] 1g sync status
+ [19] kx/kx4/kr backplane autoneg is idle
+ [20] 1g autoneg enabled
+ [21] 1g pcs enabled for sgmii
+ [22] 10g xgxs enabled
+ [23] 10g serial fec enabled (forward error correction)
+ [24] 10g kr pcs enabled
+ [25] sgmii enabled
+ [27:26] mac link mode
+ 0 => 1g
+ 1 => 10g parallel
+ 2 => 10g serial
+ 3 => autoneg
+ [29:28] link speed
+ 1 => 100m
+ 2 => 1g
+ 3 => 10g
+ [30] link is up
+ [31] kx/kx4/kr backplane autoneg completed successfully. */
+ u32 link_status;
+
+ /* [17:16] pma/pmd for 10g serial
+ 0 => kr, 2 => sfi
+ [18] disable dme pages */
+ u32 auto_negotiation_control2;
+
+ CLIB_PAD_FROM_TO (0x42ac, 0x42b0);
+ u32 link_partner_ability[2];
+ CLIB_PAD_FROM_TO (0x42b8, 0x42d0);
+ u32 manageability_control;
+ u32 link_partner_next_page[2];
+ CLIB_PAD_FROM_TO (0x42dc, 0x42e0);
+ u32 kr_pcs_control;
+ u32 kr_pcs_status;
+ u32 fec_status[2];
+ CLIB_PAD_FROM_TO (0x42f0, 0x4314);
+ u32 sgmii_control;
+ CLIB_PAD_FROM_TO (0x4318, 0x4324);
+ u32 link_status2;
+ CLIB_PAD_FROM_TO (0x4328, 0x4900);
+ } xge_mac;
+
+ u32 tx_dcb_control;
+ u32 tx_dcb_descriptor_plane_queue_select;
+ u32 tx_dcb_descriptor_plane_t1_config;
+ u32 tx_dcb_descriptor_plane_t1_status;
+ CLIB_PAD_FROM_TO (0x4910, 0x4950);
+
+ /* For each TC in units of 1k bytes. */
+ u32 tx_packet_buffer_thresholds[8];
+ CLIB_PAD_FROM_TO (0x4970, 0x4980);
+ struct
+ {
+ u32 mmw;
+ u32 config;
+ u32 status;
+ u32 rate_drift;
+ } dcb_tx_rate_scheduler;
+ CLIB_PAD_FROM_TO (0x4990, 0x4a80);
+ u32 tx_dma_control;
+ CLIB_PAD_FROM_TO (0x4a84, 0x4a88);
+ u32 tx_dma_tcp_flags_control[2];
+ CLIB_PAD_FROM_TO (0x4a90, 0x4b00);
+ u32 pf_mailbox[64];
+ CLIB_PAD_FROM_TO (0x4c00, 0x5000);
+
+ /* RX */
+ u32 checksum_control;
+ CLIB_PAD_FROM_TO (0x5004, 0x5008);
+ u32 rx_filter_control;
+ CLIB_PAD_FROM_TO (0x500c, 0x5010);
+ u32 management_vlan_tag[8];
+ u32 management_udp_tcp_ports[8];
+ CLIB_PAD_FROM_TO (0x5050, 0x5078);
+ /* little endian. */
+ u32 extended_vlan_ether_type;
+ CLIB_PAD_FROM_TO (0x507c, 0x5080);
+ /* [1] store/dma bad packets
+ [8] accept all multicast
+ [9] accept all unicast
+ [10] accept all broadcast. */
+ u32 filter_control;
+ CLIB_PAD_FROM_TO (0x5084, 0x5088);
+ /* [15:0] vlan ethernet type (0x8100) little endian
+ [28] cfi bit expected
+ [29] drop packets with unexpected cfi bit
+ [30] vlan filter enable. */
+ u32 vlan_control;
+ CLIB_PAD_FROM_TO (0x508c, 0x5090);
+ /* [1:0] hi bit of ethernet address for 12 bit index into multicast table
+ 0 => 47, 1 => 46, 2 => 45, 3 => 43.
+ [2] enable multicast filter
+ */
+ u32 multicast_control;
+ CLIB_PAD_FROM_TO (0x5094, 0x5100);
+ u32 fcoe_rx_control;
+ CLIB_PAD_FROM_TO (0x5104, 0x5108);
+ u32 fc_flt_context;
+ CLIB_PAD_FROM_TO (0x510c, 0x5110);
+ u32 fc_filter_control;
+ CLIB_PAD_FROM_TO (0x5114, 0x5120);
+ u32 rx_message_type_lo;
+ CLIB_PAD_FROM_TO (0x5124, 0x5128);
+ /* [15:0] ethernet type (little endian)
+ [18:16] match priority in vlan tag
+ [19] priority match enable
+ [25:20] virtualization pool
+ [26] pool enable
+ [27] is fcoe
+ [30] ieee 1588 timestamp enable
+ [31] filter enable.
+ (See ethernet_type_queue_select.) */
+ u32 ethernet_type_queue_filter[8];
+ CLIB_PAD_FROM_TO (0x5148, 0x5160);
+ /* [7:0] l2 ethernet type and
+ [15:8] l2 ethernet type or */
+ u32 management_decision_filters1[8];
+ u32 vf_vm_tx_switch_loopback_enable[2];
+ u32 rx_time_sync_control;
+ CLIB_PAD_FROM_TO (0x518c, 0x5190);
+ u32 management_ethernet_type_filters[4];
+ u32 rx_timestamp_attributes_lo;
+ u32 rx_timestamp_hi;
+ u32 rx_timestamp_attributes_hi;
+ CLIB_PAD_FROM_TO (0x51ac, 0x51b0);
+ u32 pf_virtual_control;
+ CLIB_PAD_FROM_TO (0x51b4, 0x51d8);
+ u32 fc_offset_parameter;
+ CLIB_PAD_FROM_TO (0x51dc, 0x51e0);
+ u32 vf_rx_enable[2];
+ u32 rx_timestamp_lo;
+ CLIB_PAD_FROM_TO (0x51ec, 0x5200);
+ /* 12 bits determined by multicast_control
+ lookup bits in this vector. */
+ u32 multicast_enable[128];
+
+ /* [0] ethernet address [31:0]
+ [1] [15:0] ethernet address [47:32]
+ [31] valid bit.
+ Index 0 is read from eeprom after reset. */
+ u32 rx_ethernet_address0[16][2];
+
+ CLIB_PAD_FROM_TO (0x5480, 0x5800);
+ u32 wake_up_control;
+ CLIB_PAD_FROM_TO (0x5804, 0x5808);
+ u32 wake_up_filter_control;
+ CLIB_PAD_FROM_TO (0x580c, 0x5818);
+ u32 multiple_rx_queue_command_82598;
+ CLIB_PAD_FROM_TO (0x581c, 0x5820);
+ u32 management_control;
+ u32 management_filter_control;
+ CLIB_PAD_FROM_TO (0x5828, 0x5838);
+ u32 wake_up_ip4_address_valid;
+ CLIB_PAD_FROM_TO (0x583c, 0x5840);
+ u32 wake_up_ip4_address_table[4];
+ u32 management_control_to_host;
+ CLIB_PAD_FROM_TO (0x5854, 0x5880);
+ u32 wake_up_ip6_address_table[4];
+
+ /* unicast_and broadcast_and vlan_and ip_address_and
+ etc. */
+ u32 management_decision_filters[8];
+
+ u32 management_ip4_or_ip6_address_filters[4][4];
+ CLIB_PAD_FROM_TO (0x58f0, 0x5900);
+ u32 wake_up_packet_length;
+ CLIB_PAD_FROM_TO (0x5904, 0x5910);
+ u32 management_ethernet_address_filters[4][2];
+ CLIB_PAD_FROM_TO (0x5930, 0x5a00);
+ u32 wake_up_packet_memory[32];
+ CLIB_PAD_FROM_TO (0x5a80, 0x5c00);
+ u32 redirection_table_82598[32];
+ u32 rss_random_keys_82598[10];
+ CLIB_PAD_FROM_TO (0x5ca8, 0x6000);
+
+ ixge_dma_regs_t tx_dma[128];
+
+ u32 pf_vm_vlan_insert[64];
+ u32 tx_dma_tcp_max_alloc_size_requests;
+ CLIB_PAD_FROM_TO (0x8104, 0x8110);
+ u32 vf_tx_enable[2];
+ CLIB_PAD_FROM_TO (0x8118, 0x8120);
+ /* [0] dcb mode enable
+ [1] virtualization mode enable
+ [3:2] number of tcs/qs per pool. */
+ u32 multiple_tx_queues_command;
+ CLIB_PAD_FROM_TO (0x8124, 0x8200);
+ u32 pf_vf_anti_spoof[8];
+ u32 pf_dma_tx_switch_control;
+ CLIB_PAD_FROM_TO (0x8224, 0x82e0);
+ u32 tx_strict_low_latency_queues[4];
+ CLIB_PAD_FROM_TO (0x82f0, 0x8600);
+ u32 tx_queue_stats_mapping_82599[32];
+ u32 tx_queue_packet_counts[32];
+ u32 tx_queue_byte_counts[32][2];
+
+ struct
+ {
+ u32 control;
+ u32 status;
+ u32 buffer_almost_full;
+ CLIB_PAD_FROM_TO (0x880c, 0x8810);
+ u32 buffer_min_ifg;
+ CLIB_PAD_FROM_TO (0x8814, 0x8900);
+ } tx_security;
+
+ struct
+ {
+ u32 index;
+ u32 salt;
+ u32 key[4];
+ CLIB_PAD_FROM_TO (0x8918, 0x8a00);
+ } tx_ipsec;
+
+ struct
+ {
+ u32 capabilities;
+ u32 control;
+ u32 tx_sci[2];
+ u32 sa;
+ u32 sa_pn[2];
+ u32 key[2][4];
+ /* untagged packets, encrypted packets, protected packets,
+ encrypted bytes, protected bytes */
+ u32 stats[5];
+ CLIB_PAD_FROM_TO (0x8a50, 0x8c00);
+ } tx_link_security;
+
+ struct
+ {
+ u32 control;
+ u32 timestamp_value[2];
+ u32 system_time[2];
+ u32 increment_attributes;
+ u32 time_adjustment_offset[2];
+ u32 aux_control;
+ u32 target_time[2][2];
+ CLIB_PAD_FROM_TO (0x8c34, 0x8c3c);
+ u32 aux_time_stamp[2][2];
+ CLIB_PAD_FROM_TO (0x8c4c, 0x8d00);
+ } tx_timesync;
+
+ struct
+ {
+ u32 control;
+ u32 status;
+ CLIB_PAD_FROM_TO (0x8d08, 0x8e00);
+ } rx_security;
+
+ struct
+ {
+ u32 index;
+ u32 ip_address[4];
+ u32 spi;
+ u32 ip_index;
+ u32 key[4];
+ u32 salt;
+ u32 mode;
+ CLIB_PAD_FROM_TO (0x8e34, 0x8f00);
+ } rx_ipsec;
+
+ struct
+ {
+ u32 capabilities;
+ u32 control;
+ u32 sci[2];
+ u32 sa[2];
+ u32 sa_pn[2];
+ u32 key[2][4];
+ /* see datasheet */
+ u32 stats[17];
+ CLIB_PAD_FROM_TO (0x8f84, 0x9000);
+ } rx_link_security;
+
+ /* 4 wake up, 2 management, 2 wake up. */
+ u32 flexible_filters[8][16][4];
+ CLIB_PAD_FROM_TO (0x9800, 0xa000);
+
+ /* 4096 bits. */
+ u32 vlan_filter[128];
+
+ /* [0] ethernet address [31:0]
+ [1] [15:0] ethernet address [47:32]
+ [31] valid bit.
+ Index 0 is read from eeprom after reset. */
+ u32 rx_ethernet_address1[128][2];
+
+ /* select one of 64 pools for each rx address. */
+ u32 rx_ethernet_address_pool_select[128][2];
+ CLIB_PAD_FROM_TO (0xaa00, 0xc800);
+ u32 tx_priority_to_traffic_class;
+ CLIB_PAD_FROM_TO (0xc804, 0xcc00);
+
+ /* In bytes units of 1k. Total packet buffer is 160k. */
+ u32 tx_packet_buffer_size[8];
+
+ CLIB_PAD_FROM_TO (0xcc20, 0xcd10);
+ u32 tx_manageability_tc_mapping;
+ CLIB_PAD_FROM_TO (0xcd14, 0xcd20);
+ u32 dcb_tx_packet_plane_t2_config[8];
+ u32 dcb_tx_packet_plane_t2_status[8];
+ CLIB_PAD_FROM_TO (0xcd60, 0xce00);
+
+ u32 tx_flow_control_status;
+ CLIB_PAD_FROM_TO (0xce04, 0xd000);
+
+ ixge_dma_regs_t rx_dma1[64];
+
+ struct
+ {
+ /* Bigendian ip4 src/dst address. */
+ u32 src_address[128];
+ u32 dst_address[128];
+
+ /* TCP/UDP ports [15:0] src [31:16] dst; bigendian. */
+ u32 tcp_udp_port[128];
+
+ /* [1:0] protocol tcp, udp, sctp, other
+ [4:2] match priority (highest wins)
+ [13:8] pool
+ [25] src address match disable
+ [26] dst address match disable
+ [27] src port match disable
+ [28] dst port match disable
+ [29] protocol match disable
+ [30] pool match disable
+ [31] enable. */
+ u32 control[128];
+
+ /* [12] size bypass
+ [19:13] must be 0x80
+ [20] low-latency interrupt
+ [27:21] rx queue. */
+ u32 interrupt[128];
+ } ip4_filters;
+
+ CLIB_PAD_FROM_TO (0xea00, 0xeb00);
+ /* 4 bit rss output index indexed by 7 bit hash.
+ 128 8 bit fields = 32 registers. */
+ u32 redirection_table_82599[32];
+
+ u32 rss_random_key_82599[10];
+ CLIB_PAD_FROM_TO (0xeba8, 0xec00);
+ /* [15:0] reserved
+ [22:16] rx queue index
+ [29] low-latency interrupt on match
+ [31] enable */
+ u32 ethernet_type_queue_select[8];
+ CLIB_PAD_FROM_TO (0xec20, 0xec30);
+ u32 syn_packet_queue_filter;
+ CLIB_PAD_FROM_TO (0xec34, 0xec60);
+ u32 immediate_interrupt_rx_vlan_priority;
+ CLIB_PAD_FROM_TO (0xec64, 0xec70);
+ u32 rss_queues_per_traffic_class;
+ CLIB_PAD_FROM_TO (0xec74, 0xec90);
+ u32 lli_size_threshold;
+ CLIB_PAD_FROM_TO (0xec94, 0xed00);
+
+ struct
+ {
+ u32 control;
+ CLIB_PAD_FROM_TO (0xed04, 0xed10);
+ u32 table[8];
+ CLIB_PAD_FROM_TO (0xed30, 0xee00);
+ } fcoe_redirection;
+
+ struct
+ {
+ /* [1:0] packet buffer allocation 0 => disabled, else 64k*2^(f-1)
+ [3] packet buffer initialization done
+ [4] perfetch match mode
+ [5] report status in rss field of rx descriptors
+ [7] report status always
+ [14:8] drop queue
+ [20:16] flex 2 byte packet offset (units of 2 bytes)
+ [27:24] max linked list length
+ [31:28] full threshold. */
+ u32 control;
+ CLIB_PAD_FROM_TO (0xee04, 0xee0c);
+
+ u32 data[8];
+
+ /* [1:0] 0 => no action, 1 => add, 2 => remove, 3 => query.
+ [2] valid filter found by query command
+ [3] filter update override
+ [4] ip6 address table
+ [6:5] l4 protocol reserved, udp, tcp, sctp
+ [7] is ip6
+ [8] clear head/tail
+ [9] packet drop action
+ [10] matched packet generates low-latency interrupt
+ [11] last in linked list
+ [12] collision
+ [15] rx queue enable
+ [22:16] rx queue
+ [29:24] pool. */
+ u32 command;
+
+ CLIB_PAD_FROM_TO (0xee30, 0xee3c);
+ /* ip4 dst/src address, tcp ports, udp ports.
+ set bits mean bit is ignored. */
+ u32 ip4_masks[4];
+ u32 filter_length;
+ u32 usage_stats;
+ u32 failed_usage_stats;
+ u32 filters_match_stats;
+ u32 filters_miss_stats;
+ CLIB_PAD_FROM_TO (0xee60, 0xee68);
+ /* Lookup, signature. */
+ u32 hash_keys[2];
+ /* [15:0] ip6 src address 1 bit per byte
+ [31:16] ip6 dst address. */
+ u32 ip6_mask;
+ /* [0] vlan id
+ [1] vlan priority
+ [2] pool
+ [3] ip protocol
+ [4] flex
+ [5] dst ip6. */
+ u32 other_mask;
+ CLIB_PAD_FROM_TO (0xee78, 0xf000);
+ } flow_director;
+
+ struct
+ {
+ u32 l2_control[64];
+ u32 vlan_pool_filter[64];
+ u32 vlan_pool_filter_bitmap[128];
+ u32 dst_ethernet_address[128];
+ u32 mirror_rule[4];
+ u32 mirror_rule_vlan[8];
+ u32 mirror_rule_pool[8];
+ CLIB_PAD_FROM_TO (0xf650, 0x10010);
+ } pf_bar;
+
+ u32 eeprom_flash_control;
+ /* [0] start
+ [1] done
+ [15:2] address
+ [31:16] read data. */
+ u32 eeprom_read;
+ CLIB_PAD_FROM_TO (0x10018, 0x1001c);
+ u32 flash_access;
+ CLIB_PAD_FROM_TO (0x10020, 0x10114);
+ u32 flash_data;
+ u32 flash_control;
+ u32 flash_read_data;
+ CLIB_PAD_FROM_TO (0x10120, 0x1013c);
+ u32 flash_opcode;
+ u32 software_semaphore;
+ CLIB_PAD_FROM_TO (0x10144, 0x10148);
+ u32 firmware_semaphore;
+ CLIB_PAD_FROM_TO (0x1014c, 0x10160);
+ u32 software_firmware_sync;
+ CLIB_PAD_FROM_TO (0x10164, 0x10200);
+ u32 general_rx_control;
+ CLIB_PAD_FROM_TO (0x10204, 0x11000);
+
+ struct
+ {
+ u32 control;
+ CLIB_PAD_FROM_TO (0x11004, 0x11010);
+ /* [3:0] enable counters
+ [7:4] leaky bucket counter mode
+ [29] reset
+ [30] stop
+ [31] start. */
+ u32 counter_control;
+ /* [7:0],[15:8],[23:16],[31:24] event for counters 0-3.
+ event codes:
+ 0x0 bad tlp
+ 0x10 reqs that reached timeout
+ etc. */
+ u32 counter_event;
+ CLIB_PAD_FROM_TO (0x11018, 0x11020);
+ u32 counters_clear_on_read[4];
+ u32 counter_config[4];
+ struct
+ {
+ u32 address;
+ u32 data;
+ } indirect_access;
+ CLIB_PAD_FROM_TO (0x11048, 0x11050);
+ u32 extended_control;
+ CLIB_PAD_FROM_TO (0x11054, 0x11064);
+ u32 mirrored_revision_id;
+ CLIB_PAD_FROM_TO (0x11068, 0x11070);
+ u32 dca_requester_id_information;
+
+ /* [0] global disable
+ [4:1] mode: 0 => legacy, 1 => dca 1.0. */
+ u32 dca_control;
+ CLIB_PAD_FROM_TO (0x11078, 0x110b0);
+ /* [0] pci completion abort
+ [1] unsupported i/o address
+ [2] wrong byte enable
+ [3] pci timeout */
+ u32 pcie_interrupt_status;
+ CLIB_PAD_FROM_TO (0x110b4, 0x110b8);
+ u32 pcie_interrupt_enable;
+ CLIB_PAD_FROM_TO (0x110bc, 0x110c0);
+ u32 msi_x_pba_clear[8];
+ CLIB_PAD_FROM_TO (0x110e0, 0x12300);
+ } pcie;
+
+ u32 interrupt_throttle1[128 - 24];
+ CLIB_PAD_FROM_TO (0x124a0, 0x14f00);
+
+ u32 core_analog_config;
+ CLIB_PAD_FROM_TO (0x14f04, 0x14f10);
+ u32 core_common_config;
+ CLIB_PAD_FROM_TO (0x14f14, 0x15f14);
+
+ u32 link_sec_software_firmware_interface;
+} ixge_regs_t;
+
+typedef union
+{
+ struct
+ {
+ /* Addresses bigendian. */
+ union
+ {
+ struct
+ {
+ ip6_address_t src_address;
+ u32 unused[1];
+ } ip6;
+ struct
+ {
+ u32 unused[3];
+ ip4_address_t src_address, dst_address;
+ } ip4;
+ };
+
+ /* [15:0] src port (little endian).
+ [31:16] dst port. */
+ u32 tcp_udp_ports;
+
+ /* [15:0] vlan (cfi bit set to 0).
+ [31:16] flex bytes. bigendian. */
+ u32 vlan_and_flex_word;
+
+ /* [14:0] hash
+ [15] bucket valid
+ [31:16] signature (signature filers)/sw-index (perfect match). */
+ u32 hash;
+ };
+
+ u32 as_u32[8];
+} ixge_flow_director_key_t;
+
+always_inline void
+ixge_throttle_queue_interrupt (ixge_regs_t * r,
+ u32 queue_interrupt_index,
+ f64 inter_interrupt_interval_in_secs)
+{
+ volatile u32 *tr =
+ (queue_interrupt_index < ARRAY_LEN (r->interrupt.throttle0)
+ ? &r->interrupt.throttle0[queue_interrupt_index]
+ : &r->interrupt_throttle1[queue_interrupt_index - ARRAY_LEN (r->interrupt.throttle0)]);
+ ASSERT (queue_interrupt_index < 128);
+ u32 v;
+ i32 i, mask = (1 << 9) - 1;
+
+ i = flt_round_nearest (inter_interrupt_interval_in_secs / 2e-6);
+ i = i < 1 ? 1 : i;
+ i = i >= mask ? mask : i;
+
+ v = tr[0];
+ v &= ~(mask << 3);
+ v |= i << 3;
+ tr[0] = v;
+}
+
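+/* Illustrative sketch: requesting roughly one interrupt every 10
+   microseconds on queue interrupt 0 stores flt_round_nearest (10e-6 / 2e-6)
+   = 5 into bits [11:3] of the matching throttle register.  Here xd is
+   assumed to be an ixge_device_t pointer with its registers mapped:
+
+     ixge_throttle_queue_interrupt (xd->regs, 0, 10e-6);
+*/
+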
+#define foreach_ixge_counter \
+ _ (0x40d0, rx_total_packets) \
+ _64 (0x40c0, rx_total_bytes) \
+ _ (0x41b0, rx_good_packets_before_filtering) \
+ _64 (0x41b4, rx_good_bytes_before_filtering) \
+ _ (0x2f50, rx_dma_good_packets) \
+ _64 (0x2f54, rx_dma_good_bytes) \
+ _ (0x2f5c, rx_dma_duplicated_good_packets) \
+ _64 (0x2f60, rx_dma_duplicated_good_bytes) \
+ _ (0x2f68, rx_dma_good_loopback_packets) \
+ _64 (0x2f6c, rx_dma_good_loopback_bytes) \
+ _ (0x2f74, rx_dma_good_duplicated_loopback_packets) \
+ _64 (0x2f78, rx_dma_good_duplicated_loopback_bytes) \
+ _ (0x4074, rx_good_packets) \
+ _64 (0x4088, rx_good_bytes) \
+ _ (0x407c, rx_multicast_packets) \
+ _ (0x4078, rx_broadcast_packets) \
+ _ (0x405c, rx_64_byte_packets) \
+ _ (0x4060, rx_65_127_byte_packets) \
+ _ (0x4064, rx_128_255_byte_packets) \
+ _ (0x4068, rx_256_511_byte_packets) \
+ _ (0x406c, rx_512_1023_byte_packets) \
+ _ (0x4070, rx_gt_1023_byte_packets) \
+ _ (0x4000, rx_crc_errors) \
+ _ (0x4120, rx_ip_checksum_errors) \
+ _ (0x4004, rx_illegal_symbol_errors) \
+ _ (0x4008, rx_error_symbol_errors) \
+ _ (0x4034, rx_mac_local_faults) \
+ _ (0x4038, rx_mac_remote_faults) \
+ _ (0x4040, rx_length_errors) \
+ _ (0x41a4, rx_xons) \
+ _ (0x41a8, rx_xoffs) \
+ _ (0x40a4, rx_undersize_packets) \
+ _ (0x40a8, rx_fragments) \
+ _ (0x40ac, rx_oversize_packets) \
+ _ (0x40b0, rx_jabbers) \
+ _ (0x40b4, rx_management_packets) \
+ _ (0x40b8, rx_management_drops) \
+ _ (0x3fa0, rx_missed_packets_pool_0) \
+ _ (0x40d4, tx_total_packets) \
+ _ (0x4080, tx_good_packets) \
+ _64 (0x4090, tx_good_bytes) \
+ _ (0x40f0, tx_multicast_packets) \
+ _ (0x40f4, tx_broadcast_packets) \
+ _ (0x87a0, tx_dma_good_packets) \
+ _64 (0x87a4, tx_dma_good_bytes) \
+ _ (0x40d8, tx_64_byte_packets) \
+ _ (0x40dc, tx_65_127_byte_packets) \
+ _ (0x40e0, tx_128_255_byte_packets) \
+ _ (0x40e4, tx_256_511_byte_packets) \
+ _ (0x40e8, tx_512_1023_byte_packets) \
+ _ (0x40ec, tx_gt_1023_byte_packets) \
+ _ (0x4010, tx_undersize_drops) \
+ _ (0x8780, switch_security_violation_packets) \
+ _ (0x5118, fc_crc_errors) \
+ _ (0x241c, fc_rx_drops) \
+ _ (0x2424, fc_last_error_count) \
+ _ (0x2428, fcoe_rx_packets) \
+ _ (0x242c, fcoe_rx_dwords) \
+ _ (0x8784, fcoe_tx_packets) \
+ _ (0x8788, fcoe_tx_dwords) \
+ _ (0x1030, queue_0_rx_count) \
+ _ (0x1430, queue_0_drop_count) \
+ _ (0x1070, queue_1_rx_count) \
+ _ (0x1470, queue_1_drop_count) \
+ _ (0x10b0, queue_2_rx_count) \
+ _ (0x14b0, queue_2_drop_count) \
+ _ (0x10f0, queue_3_rx_count) \
+ _ (0x14f0, queue_3_drop_count) \
+ _ (0x1130, queue_4_rx_count) \
+ _ (0x1530, queue_4_drop_count) \
+ _ (0x1170, queue_5_rx_count) \
+ _ (0x1570, queue_5_drop_count) \
+ _ (0x11b0, queue_6_rx_count) \
+ _ (0x15b0, queue_6_drop_count) \
+ _ (0x11f0, queue_7_rx_count) \
+ _ (0x15f0, queue_7_drop_count) \
+ _ (0x1230, queue_8_rx_count) \
+ _ (0x1630, queue_8_drop_count) \
+ _ (0x1270, queue_9_rx_count) \
+ _ (0x1670, queue_9_drop_count)
+
+typedef enum
+{
+#define _(a,f) IXGE_COUNTER_##f,
+#define _64(a,f) _(a,f)
+ foreach_ixge_counter
+#undef _
+#undef _64
+ IXGE_N_COUNTER,
+} ixge_counter_type_t;
+
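+/* Illustrative sketch of reading one of the counters listed above, assuming
+   the register BAR is mapped at `regs` and that for _64 counters the high
+   dword follows the low dword at offset + 4:
+
+     static inline u64
+     ixge_read_counter_sketch (volatile void *regs, u32 offset, int is_64)
+     {
+       volatile u32 *r = (volatile u32 *) ((volatile u8 *) regs + offset);
+       u64 v = r[0];
+       if (is_64)
+         v |= (u64) r[1] << 32;
+       return v;
+     }
+*/
+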
+typedef struct
+{
+ u32 mdio_address;
+
+ /* 32 bit ID read from ID registers. */
+ u32 id;
+} ixge_phy_t;
+
+typedef struct
+{
+ /* Cache aligned descriptors. */
+ ixge_descriptor_t *descriptors;
+
+ /* Number of descriptors in table. */
+ u32 n_descriptors;
+
+ /* Software head and tail pointers into descriptor ring. */
+ u32 head_index, tail_index;
+
+ /* Index into dma_queues vector. */
+ u32 queue_index;
+
+ /* Buffer indices corresponding to each active descriptor. */
+ u32 *descriptor_buffer_indices;
+
+ union
+ {
+ struct
+ {
+ u32 *volatile head_index_write_back;
+
+ u32 n_buffers_on_ring;
+ } tx;
+
+ struct
+ {
+ /* Buffer indices to use to replenish each descriptor. */
+ u32 *replenish_buffer_indices;
+
+ vlib_node_runtime_t *node;
+ u32 next_index;
+
+ u32 saved_start_of_packet_buffer_index;
+
+ u32 saved_start_of_packet_next_index;
+ u32 saved_last_buffer_index;
+
+ u32 is_start_of_packet;
+
+ u32 n_descriptors_done_total;
+
+ u32 n_descriptors_done_this_call;
+
+ u32 n_bytes;
+ } rx;
+ };
+} ixge_dma_queue_t;
+
+#define foreach_ixge_pci_device_id \
+ _ (82598, 0x10b6) \
+ _ (82598_bx, 0x1508) \
+ _ (82598af_dual_port, 0x10c6) \
+ _ (82598af_single_port, 0x10c7) \
+ _ (82598at, 0x10c8) \
+ _ (82598at2, 0x150b) \
+ _ (82598eb_sfp_lom, 0x10db) \
+ _ (82598eb_cx4, 0x10dd) \
+ _ (82598_cx4_dual_port, 0x10ec) \
+ _ (82598_da_dual_port, 0x10f1) \
+ _ (82598_sr_dual_port_em, 0x10e1) \
+ _ (82598eb_xf_lr, 0x10f4) \
+ _ (82599_kx4, 0x10f7) \
+ _ (82599_kx4_mezz, 0x1514) \
+ _ (82599_kr, 0x1517) \
+ _ (82599_combo_backplane, 0x10f8) \
+ _ (82599_cx4, 0x10f9) \
+ _ (82599_sfp, 0x10fb) \
+ _ (82599_backplane_fcoe, 0x152a) \
+ _ (82599_sfp_fcoe, 0x1529) \
+ _ (82599_sfp_em, 0x1507) \
+ _ (82599_xaui_lom, 0x10fc) \
+ _ (82599_t3_lom, 0x151c) \
+ _ (x540t, 0x1528)
+
+typedef enum
+{
+#define _(f,n) IXGE_##f = n,
+ foreach_ixge_pci_device_id
+#undef _
+} ixge_pci_device_id_t;
+
+typedef struct
+{
+ /* registers */
+ ixge_regs_t *regs;
+
+ /* Specific next index when using dynamic redirection */
+ u32 per_interface_next_index;
+
+ /* PCI bus info. */
+ vlib_pci_device_t pci_device;
+
+ /* From PCI config space header. */
+ ixge_pci_device_id_t device_id;
+
+ u16 device_index;
+
+ /* 0 or 1. */
+ u16 pci_function;
+
+ /* VLIB interface for this instance. */
+ u32 vlib_hw_if_index, vlib_sw_if_index;
+
+ ixge_dma_queue_t *dma_queues[VLIB_N_RX_TX];
+
+ /* Phy index (0 or 1) and address on MDI bus. */
+ u32 phy_index;
+ ixge_phy_t phys[2];
+
+ /* Value of link_status register at last link change. */
+ u32 link_status_at_last_link_change;
+
+ i2c_bus_t i2c_bus;
+ sfp_eeprom_t sfp_eeprom;
+
+ /* Counters. */
+ u64 counters[IXGE_N_COUNTER], counters_last_clear[IXGE_N_COUNTER];
+} ixge_device_t;
+
+typedef struct
+{
+ vlib_main_t *vlib_main;
+
+ /* Vector of devices. */
+ ixge_device_t *devices;
+
+ /* Descriptor ring sizes. */
+ u32 n_descriptors[VLIB_N_RX_TX];
+
+ /* RX buffer size. Must be at least 1k; will be rounded to
+ next largest 1k size. */
+ u32 n_bytes_in_rx_buffer;
+
+ u32 n_descriptors_per_cache_line;
+
+ u32 vlib_buffer_free_list_index;
+
+ u32 process_node_index;
+
+ /* Template and mask for initializing/validating TX descriptors. */
+ ixge_tx_descriptor_t tx_descriptor_template, tx_descriptor_template_mask;
+
+ /* Vector of buffers for which TX is done and can be freed. */
+ u32 *tx_buffers_pending_free;
+
+ u32 *rx_buffers_to_add;
+
+ f64 time_last_stats_update;
+} ixge_main_t;
+
+ixge_main_t ixge_main;
+vnet_device_class_t ixge_device_class;
+
+typedef enum
+{
+ IXGE_RX_NEXT_IP4_INPUT,
+ IXGE_RX_NEXT_IP6_INPUT,
+ IXGE_RX_NEXT_ETHERNET_INPUT,
+ IXGE_RX_NEXT_DROP,
+ IXGE_RX_N_NEXT,
+} ixge_rx_next_t;
+
+void ixge_set_next_node (ixge_rx_next_t, char *);
+
+#endif /* included_ixge_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/nic/sfp.c b/src/vnet/devices/nic/sfp.c
new file mode 100644
index 00000000000..9e9c008dc15
--- /dev/null
+++ b/src/vnet/devices/nic/sfp.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/devices/nic/sfp.h>
+
+static u8 *
+format_space_terminated (u8 * s, va_list * args)
+{
+ u32 l = va_arg (*args, u32);
+ u8 *v = va_arg (*args, u8 *);
+ u8 *p;
+
+ for (p = v + l - 1; p >= v && p[0] == ' '; p--)
+ ;
+ vec_add (s, v, clib_min (p - v + 1, l));
+ return s;
+}
+
+static u8 *
+format_sfp_id (u8 * s, va_list * args)
+{
+ u32 id = va_arg (*args, u32);
+ char *t = 0;
+ switch (id)
+ {
+#define _(f) case SFP_ID_##f: t = #f; break;
+ foreach_sfp_id
+#undef _
+ default:
+ return format (s, "unknown 0x%x", id);
+ }
+ return format (s, "%s", t);
+}
+
+static u8 *
+format_sfp_compatibility (u8 * s, va_list * args)
+{
+ u32 c = va_arg (*args, u32);
+ char *t = 0;
+ switch (c)
+ {
+#define _(a,b,f) case SFP_COMPATIBILITY_##f: t = #f; break;
+ foreach_sfp_compatibility
+#undef _
+ default:
+ return format (s, "unknown 0x%x", c);
+ }
+ return format (s, "%s", t);
+}
+
+u32
+sfp_is_comatible (sfp_eeprom_t * e, sfp_compatibility_t c)
+{
+ static struct
+ {
+ u8 byte, bit;
+ } t[] =
+ {
+#define _(a,b,f) { .byte = a, .bit = b, },
+ foreach_sfp_compatibility
+#undef _
+ };
+
+ ASSERT (c < ARRAY_LEN (t));
+ return (e->compatibility[t[c].byte] & (1 << t[c].bit)) != 0;
+}
+
+u8 *
+format_sfp_eeprom (u8 * s, va_list * args)
+{
+ sfp_eeprom_t *e = va_arg (*args, sfp_eeprom_t *);
+ uword indent = format_get_indent (s);
+ int i;
+
+ if (e->id != SFP_ID_sfp)
+ s = format (s, "id %U, ", format_sfp_id, e->id);
+
+ s = format (s, "compatibility:");
+ for (i = 0; i < SFP_N_COMPATIBILITY; i++)
+ if (sfp_is_comatible (e, i))
+ s = format (s, " %U", format_sfp_compatibility, i);
+
+ s = format (s, "\n%Uvendor: %U, part %U",
+ format_white_space, indent,
+ format_space_terminated, sizeof (e->vendor_name),
+ e->vendor_name, format_space_terminated,
+ sizeof (e->vendor_part_number), e->vendor_part_number);
+ s =
+ format (s, "\n%Urevision: %U, serial: %U, date code: %U",
+ format_white_space, indent, format_space_terminated,
+ sizeof (e->vendor_revision), e->vendor_revision,
+ format_space_terminated, sizeof (e->vendor_serial_number),
+ e->vendor_serial_number, format_space_terminated,
+ sizeof (e->vendor_date_code), e->vendor_date_code);
+
+ return s;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/nic/sfp.h b/src/vnet/devices/nic/sfp.h
new file mode 100644
index 00000000000..a1ac7997a44
--- /dev/null
+++ b/src/vnet/devices/nic/sfp.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_vnet_optics_sfp_h
+#define included_vnet_optics_sfp_h
+
+#include <vppinfra/format.h>
+
+#define foreach_sfp_id \
+ _ (unknown) \
+ _ (gbic) \
+ _ (on_motherboard) \
+ _ (sfp)
+
+typedef enum
+{
+#define _(f) SFP_ID_##f,
+ foreach_sfp_id
+#undef _
+} sfp_id_t;
+
+typedef struct
+{
+ u8 id;
+ u8 extended_id;
+ u8 connector_type;
+ u8 compatibility[8];
+ u8 encoding;
+ u8 nominal_bit_rate_100mbits_per_sec;
+ u8 reserved13;
+ u8 link_length[5];
+ u8 reserved19;
+ u8 vendor_name[16];
+ u8 reserved36;
+ u8 vendor_oui[3];
+ u8 vendor_part_number[16];
+ u8 vendor_revision[4];
+ /* 16 bit value network byte order. */
+ u8 laser_wavelength_in_nm[2];
+ u8 reserved62;
+ u8 checksum_0_to_62;
+
+ u8 options[2];
+ u8 max_bit_rate_margin_percent;
+ u8 min_bit_rate_margin_percent;
+ u8 vendor_serial_number[16];
+ u8 vendor_date_code[8];
+ u8 reserved92[3];
+ u8 checksum_63_to_94;
+ u8 vendor_specific[32];
+ u8 reserved128[384];
+
+ /* Vendor specific data follows. */
+ u8 vendor_specific1[0];
+} sfp_eeprom_t;
+
+always_inline uword
+sfp_eeprom_is_valid (sfp_eeprom_t * e)
+{
+ int i;
+ u8 sum = 0;
+ for (i = 0; i < 63; i++)
+ sum += ((u8 *) e)[i];
+ return sum == e->checksum_0_to_62;
+}
+
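+/* Illustrative usage sketch: given a raw EEPROM image read over the i2c
+   bus (the eeprom_image buffer below is hypothetical), validate the
+   checksum and print the module contents with format_sfp_eeprom:
+
+     u8 eeprom_image[512];
+     sfp_eeprom_t *e = (sfp_eeprom_t *) eeprom_image;
+
+     if (sfp_eeprom_is_valid (e))
+       fformat (stdout, "%U\n", format_sfp_eeprom, e);
+*/
+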
+/* _ (byte_index, bit_index, name) */
+#define foreach_sfp_compatibility \
+ _ (0, 4, 10g_base_sr) \
+ _ (0, 5, 10g_base_lr) \
+ _ (1, 2, oc48_long_reach) \
+ _ (1, 1, oc48_intermediate_reach) \
+ _ (1, 0, oc48_short_reach) \
+ _ (2, 6, oc12_long_reach) \
+ _ (2, 5, oc12_intermediate_reach) \
+ _ (2, 4, oc12_short_reach) \
+ _ (2, 2, oc3_long_reach) \
+ _ (2, 1, oc3_intermediate_reach) \
+ _ (2, 0, oc3_short_reach) \
+ _ (3, 3, 1g_base_t) \
+ _ (3, 2, 1g_base_cx) \
+ _ (3, 1, 1g_base_lx) \
+ _ (3, 0, 1g_base_sx)
+
+typedef enum
+{
+#define _(a,b,f) SFP_COMPATIBILITY_##f,
+ foreach_sfp_compatibility
+#undef _
+ SFP_N_COMPATIBILITY,
+} sfp_compatibility_t;
+
+u32 sfp_is_comatible (sfp_eeprom_t * e, sfp_compatibility_t c);
+
+format_function_t format_sfp_eeprom;
+
+#endif /* included_vnet_optics_sfp_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/ssvm/node.c b/src/vnet/devices/ssvm/node.c
new file mode 100644
index 00000000000..3a695b1d8c0
--- /dev/null
+++ b/src/vnet/devices/ssvm/node.c
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "ssvm_eth.h"
+
+vlib_node_registration_t ssvm_eth_input_node;
+
+typedef struct
+{
+ u32 next_index;
+ u32 sw_if_index;
+} ssvm_eth_input_trace_t;
+
+/* packet trace format function */
+static u8 *
+format_ssvm_eth_input_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ssvm_eth_input_trace_t *t = va_arg (*args, ssvm_eth_input_trace_t *);
+
+ s = format (s, "SSVM_ETH_INPUT: sw_if_index %d, next index %d",
+ t->sw_if_index, t->next_index);
+ return s;
+}
+
+vlib_node_registration_t ssvm_eth_input_node;
+
+#define foreach_ssvm_eth_input_error \
+_(NO_BUFFERS, "Rx packet drops (no buffers)")
+
+typedef enum
+{
+#define _(sym,str) SSVM_ETH_INPUT_ERROR_##sym,
+ foreach_ssvm_eth_input_error
+#undef _
+ SSVM_ETH_INPUT_N_ERROR,
+} ssvm_eth_input_error_t;
+
+static char *ssvm_eth_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ssvm_eth_input_error
+#undef _
+};
+
+typedef enum
+{
+ SSVM_ETH_INPUT_NEXT_DROP,
+ SSVM_ETH_INPUT_NEXT_ETHERNET_INPUT,
+ SSVM_ETH_INPUT_NEXT_IP4_INPUT,
+ SSVM_ETH_INPUT_NEXT_IP6_INPUT,
+ SSVM_ETH_INPUT_NEXT_MPLS_INPUT,
+ SSVM_ETH_INPUT_N_NEXT,
+} ssvm_eth_input_next_t;
+
+static inline uword
+ssvm_eth_device_input (ssvm_eth_main_t * em,
+ ssvm_private_t * intfc, vlib_node_runtime_t * node)
+{
+ ssvm_shared_header_t *sh = intfc->sh;
+ vlib_main_t *vm = em->vlib_main;
+ unix_shared_memory_queue_t *q;
+ ssvm_eth_queue_elt_t *elt, *elts;
+ u32 elt_index;
+ u32 my_pid = intfc->my_pid;
+ int rx_queue_index;
+ u32 n_to_alloc = VLIB_FRAME_SIZE * 2;
+ u32 n_allocated, n_present_in_cache;
+ u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+ vlib_buffer_free_list_t *fl;
+ u32 n_left_to_next, *to_next;
+ u32 next0;
+ u32 n_buffers;
+ u32 n_available;
+ u32 bi0, saved_bi0;
+ vlib_buffer_t *b0, *prev;
+ u32 saved_cache_size = 0;
+ ethernet_header_t *eh0;
+ u16 type0;
+ u32 n_rx_bytes = 0, l3_offset0;
+ u32 cpu_index = os_get_cpu_number ();
+ u32 trace_cnt __attribute__ ((unused)) = vlib_get_trace_count (vm, node);
+ volatile u32 *lock;
+ u32 *elt_indices;
+ uword n_trace = vlib_get_trace_count (vm, node);
+
+ /* Either side down? buh-bye... */
+ if (pointer_to_uword (sh->opaque[MASTER_ADMIN_STATE_INDEX]) == 0 ||
+ pointer_to_uword (sh->opaque[SLAVE_ADMIN_STATE_INDEX]) == 0)
+ return 0;
+
+ if (intfc->i_am_master)
+ q = (unix_shared_memory_queue_t *) (sh->opaque[TO_MASTER_Q_INDEX]);
+ else
+ q = (unix_shared_memory_queue_t *) (sh->opaque[TO_SLAVE_Q_INDEX]);
+
+ /* Nothing to do? */
+ if (q->cursize == 0)
+ return 0;
+
+ fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ vec_reset_length (intfc->rx_queue);
+
+ lock = (u32 *) q;
+ while (__sync_lock_test_and_set (lock, 1))
+ ;
+ while (q->cursize > 0)
+ {
+ unix_shared_memory_queue_sub_raw (q, (u8 *) & elt_index);
+ ASSERT (elt_index < 2048);
+ vec_add1 (intfc->rx_queue, elt_index);
+ }
+ CLIB_MEMORY_BARRIER ();
+ *lock = 0;
+
+ n_present_in_cache = vec_len (em->buffer_cache);
+
+ if (vec_len (em->buffer_cache) < vec_len (intfc->rx_queue) * 2)
+ {
+ vec_validate (em->buffer_cache,
+ n_to_alloc + vec_len (em->buffer_cache) - 1);
+ n_allocated =
+ vlib_buffer_alloc (vm, &em->buffer_cache[n_present_in_cache],
+ n_to_alloc);
+
+ n_present_in_cache += n_allocated;
+ _vec_len (em->buffer_cache) = n_present_in_cache;
+ }
+
+ elts = (ssvm_eth_queue_elt_t *) (sh->opaque[CHUNK_POOL_INDEX]);
+
+ n_buffers = vec_len (intfc->rx_queue);
+ rx_queue_index = 0;
+
+ while (n_buffers > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_buffers > 0 && n_left_to_next > 0)
+ {
+ elt = elts + intfc->rx_queue[rx_queue_index];
+
+ saved_cache_size = n_present_in_cache;
+ if (PREDICT_FALSE (saved_cache_size == 0))
+ {
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ goto out;
+ }
+ saved_bi0 = bi0 = em->buffer_cache[--n_present_in_cache];
+ b0 = vlib_get_buffer (vm, bi0);
+ prev = 0;
+
+ while (1)
+ {
+ vlib_buffer_init_for_free_list (b0, fl);
+
+ b0->current_data = elt->current_data_hint;
+ b0->current_length = elt->length_this_buffer;
+ b0->total_length_not_including_first_buffer =
+ elt->total_length_not_including_first_buffer;
+
+ clib_memcpy (b0->data + b0->current_data, elt->data,
+ b0->current_length);
+
+ if (PREDICT_FALSE (prev != 0))
+ prev->next_buffer = bi0;
+
+ if (PREDICT_FALSE (elt->flags & SSVM_BUFFER_NEXT_PRESENT))
+ {
+ prev = b0;
+ if (PREDICT_FALSE (n_present_in_cache == 0))
+ {
+ vlib_put_next_frame (vm, node, next_index,
+ n_left_to_next);
+ goto out;
+ }
+ bi0 = em->buffer_cache[--n_present_in_cache];
+ b0 = vlib_get_buffer (vm, bi0);
+ }
+ else
+ break;
+ }
+
+ saved_cache_size = n_present_in_cache;
+
+ to_next[0] = saved_bi0;
+ to_next++;
+ n_left_to_next--;
+
+ b0 = vlib_get_buffer (vm, saved_bi0);
+ eh0 = vlib_buffer_get_current (b0);
+
+ type0 = clib_net_to_host_u16 (eh0->type);
+
+ next0 = SSVM_ETH_INPUT_NEXT_ETHERNET_INPUT;
+
+ if (type0 == ETHERNET_TYPE_IP4)
+ next0 = SSVM_ETH_INPUT_NEXT_IP4_INPUT;
+ else if (type0 == ETHERNET_TYPE_IP6)
+ next0 = SSVM_ETH_INPUT_NEXT_IP6_INPUT;
+ else if (type0 == ETHERNET_TYPE_MPLS_UNICAST)
+ next0 = SSVM_ETH_INPUT_NEXT_MPLS_INPUT;
+
+ l3_offset0 = ((next0 == SSVM_ETH_INPUT_NEXT_IP4_INPUT ||
+ next0 == SSVM_ETH_INPUT_NEXT_IP6_INPUT ||
+ next0 == SSVM_ETH_INPUT_NEXT_MPLS_INPUT) ?
+ sizeof (ethernet_header_t) : 0);
+
+ n_rx_bytes += b0->current_length
+ + b0->total_length_not_including_first_buffer;
+
+ b0->current_data += l3_offset0;
+ b0->current_length -= l3_offset0;
+ b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = intfc->vlib_hw_if_index;
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
+
+ if (PREDICT_FALSE (n_trace > 0))
+ {
+ ssvm_eth_input_trace_t *tr;
+
+ vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 1);
+ vlib_set_trace_count (vm, node, --n_trace);
+
+ tr = vlib_add_trace (vm, node, b0, sizeof (*tr));
+
+ tr->next_index = next0;
+ tr->sw_if_index = intfc->vlib_hw_if_index;
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ n_buffers--;
+ rx_queue_index++;
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+out:
+ if (em->buffer_cache)
+ _vec_len (em->buffer_cache) = saved_cache_size;
+ else
+ ASSERT (saved_cache_size == 0);
+
+ ssvm_lock (sh, my_pid, 2);
+
+ ASSERT (vec_len (intfc->rx_queue) > 0);
+
+ n_available = (u32) pointer_to_uword (sh->opaque[CHUNK_POOL_NFREE]);
+ elt_indices = (u32 *) (sh->opaque[CHUNK_POOL_FREELIST_INDEX]);
+
+ clib_memcpy (&elt_indices[n_available], intfc->rx_queue,
+ vec_len (intfc->rx_queue) * sizeof (u32));
+
+ n_available += vec_len (intfc->rx_queue);
+ sh->opaque[CHUNK_POOL_NFREE] = uword_to_pointer (n_available, void *);
+
+ ssvm_unlock (sh);
+
+ vlib_error_count (vm, node->node_index, SSVM_ETH_INPUT_ERROR_NO_BUFFERS,
+ n_buffers);
+
+ vlib_increment_combined_counter
+ (vnet_get_main ()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX, cpu_index,
+ intfc->vlib_hw_if_index, rx_queue_index, n_rx_bytes);
+
+ return rx_queue_index;
+}
+
+static uword
+ssvm_eth_input_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * frame)
+{
+ ssvm_eth_main_t *em = &ssvm_eth_main;
+ ssvm_private_t *intfc;
+ uword n_rx_packets = 0;
+
+ vec_foreach (intfc, em->intfcs)
+ {
+ n_rx_packets += ssvm_eth_device_input (em, intfc, node);
+ }
+
+ return n_rx_packets;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (ssvm_eth_input_node) = {
+ .function = ssvm_eth_input_node_fn,
+ .name = "ssvm_eth_input",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ssvm_eth_input_trace,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ .n_errors = ARRAY_LEN(ssvm_eth_input_error_strings),
+ .error_strings = ssvm_eth_input_error_strings,
+
+ .n_next_nodes = SSVM_ETH_INPUT_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [SSVM_ETH_INPUT_NEXT_DROP] = "error-drop",
+ [SSVM_ETH_INPUT_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ [SSVM_ETH_INPUT_NEXT_IP4_INPUT] = "ip4-input",
+ [SSVM_ETH_INPUT_NEXT_IP6_INPUT] = "ip6-input",
+ [SSVM_ETH_INPUT_NEXT_MPLS_INPUT] = "mpls-input",
+ },
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (ssvm_eth_input_node, ssvm_eth_input_node_fn)
+/* *INDENT-ON* */
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/ssvm/ssvm_eth.c b/src/vnet/devices/ssvm/ssvm_eth.c
new file mode 100644
index 00000000000..db4fafa9a14
--- /dev/null
+++ b/src/vnet/devices/ssvm/ssvm_eth.c
@@ -0,0 +1,491 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "ssvm_eth.h"
+
+ssvm_eth_main_t ssvm_eth_main;
+
+#define foreach_ssvm_eth_tx_func_error \
+_(RING_FULL, "Tx packet drops (ring full)") \
+_(NO_BUFFERS, "Tx packet drops (no buffers)") \
+_(ADMIN_DOWN, "Tx packet drops (admin down)")
+
+typedef enum
+{
+#define _(f,s) SSVM_ETH_TX_ERROR_##f,
+ foreach_ssvm_eth_tx_func_error
+#undef _
+ SSVM_ETH_TX_N_ERROR,
+} ssvm_eth_tx_func_error_t;
+
+static u32 ssvm_eth_flag_change (vnet_main_t * vnm,
+ vnet_hw_interface_t * hi, u32 flags);
+
+int
+ssvm_eth_create (ssvm_eth_main_t * em, u8 * name, int is_master)
+{
+ ssvm_private_t *intfc;
+ void *oldheap;
+ clib_error_t *e;
+ unix_shared_memory_queue_t *q;
+ ssvm_shared_header_t *sh;
+ ssvm_eth_queue_elt_t *elts;
+ u32 *elt_indices;
+ u8 enet_addr[6];
+ int i, rv;
+
+ vec_add2 (em->intfcs, intfc, 1);
+
+ intfc->ssvm_size = em->segment_size;
+ intfc->i_am_master = 1;
+ intfc->name = name;
+ intfc->my_pid = getpid ();
+ if (is_master == 0)
+ {
+ rv = ssvm_slave_init (intfc, 20 /* timeout in seconds */ );
+ if (rv < 0)
+ return rv;
+ goto create_vnet_interface;
+ }
+
+ intfc->requested_va = em->next_base_va;
+ em->next_base_va += em->segment_size;
+ rv = ssvm_master_init (intfc, intfc - em->intfcs /* master index */ );
+
+ if (rv < 0)
+ return rv;
+
+ /* OK, segment created, set up queues and so forth. */
+
+ sh = intfc->sh;
+ oldheap = ssvm_push_heap (sh);
+
+ q = unix_shared_memory_queue_init (em->queue_elts, sizeof (u32),
+ 0 /* consumer pid not interesting */ ,
+ 0 /* signal not sent */ );
+ sh->opaque[TO_MASTER_Q_INDEX] = (void *) q;
+ q = unix_shared_memory_queue_init (em->queue_elts, sizeof (u32),
+ 0 /* consumer pid not interesting */ ,
+ 0 /* signal not sent */ );
+ sh->opaque[TO_SLAVE_Q_INDEX] = (void *) q;
+
+ /*
+ * Preallocate the requested number of buffer chunks
+ * There must be a better way to do this, etc.
+ * Add some slop to avoid pool reallocation, which will not go well
+ */
+ elts = 0;
+ elt_indices = 0;
+
+ vec_validate_aligned (elts, em->nbuffers - 1, CLIB_CACHE_LINE_BYTES);
+ vec_validate_aligned (elt_indices, em->nbuffers - 1, CLIB_CACHE_LINE_BYTES);
+
+ for (i = 0; i < em->nbuffers; i++)
+ elt_indices[i] = i;
+
+ sh->opaque[CHUNK_POOL_INDEX] = (void *) elts;
+ sh->opaque[CHUNK_POOL_FREELIST_INDEX] = (void *) elt_indices;
+ sh->opaque[CHUNK_POOL_NFREE] = (void *) (uword) em->nbuffers;
+
+ ssvm_pop_heap (oldheap);
+
+create_vnet_interface:
+
+ sh = intfc->sh;
+
+ memset (enet_addr, 0, sizeof (enet_addr));
+ enet_addr[0] = 2;
+ enet_addr[1] = 0xFE;
+ enet_addr[2] = is_master;
+ enet_addr[5] = sh->master_index;
+
+ e = ethernet_register_interface
+ (em->vnet_main, ssvm_eth_device_class.index, intfc - em->intfcs,
+ /* ethernet address */ enet_addr,
+ &intfc->vlib_hw_if_index, ssvm_eth_flag_change);
+
+ if (e)
+ {
+ clib_error_report (e);
+ /* $$$$ unmap offending region? */
+ return VNET_API_ERROR_INVALID_INTERFACE;
+ }
+
+ /* Declare link up */
+ vnet_hw_interface_set_flags (em->vnet_main, intfc->vlib_hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP);
+
+ /* Let the games begin... */
+ if (is_master)
+ sh->ready = 1;
+ return 0;
+}
+
+static clib_error_t *
+ssvm_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ u8 *name;
+ int is_master = 1;
+ int i, rv;
+ ssvm_eth_main_t *em = &ssvm_eth_main;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "base-va %llx", &em->next_base_va))
+ ;
+ else if (unformat (input, "segment-size %lld", &em->segment_size))
+ em->segment_size = 1ULL << (max_log2 (em->segment_size));
+ else if (unformat (input, "nbuffers %lld", &em->nbuffers))
+ ;
+ else if (unformat (input, "queue-elts %lld", &em->queue_elts))
+ ;
+ else if (unformat (input, "slave"))
+ is_master = 0;
+ else if (unformat (input, "%s", &name))
+ vec_add1 (em->names, name);
+ else
+ break;
+ }
+
+ /* No configured instances, we're done... */
+ if (vec_len (em->names) == 0)
+ return 0;
+
+ for (i = 0; i < vec_len (em->names); i++)
+ {
+ rv = ssvm_eth_create (em, em->names[i], is_master);
+ if (rv < 0)
+ return clib_error_return (0, "ssvm_eth_create '%s' failed, error %d",
+ em->names[i], rv);
+ }
+
+ vlib_node_set_state (vm, ssvm_eth_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+
+ return 0;
+}
+
+VLIB_CONFIG_FUNCTION (ssvm_config, "ssvm_eth");
+
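+/* Illustrative configuration sketch: a stanza like the one below, placed in
+   the vpp startup configuration, creates one master-side ssvm_eth interface.
+   The name "intfc0" and the values are illustrative; segment-size is rounded
+   up to a power of two by the parser above, and adding the "slave" keyword
+   would make this process the slave side instead.
+
+     ssvm_eth {
+       base-va 0x600000000
+       segment-size 0x1000000
+       nbuffers 2048
+       queue-elts 512
+       intfc0
+     }
+*/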
+
+static clib_error_t *
+ssvm_eth_init (vlib_main_t * vm)
+{
+ ssvm_eth_main_t *em = &ssvm_eth_main;
+
+ if (((sizeof (ssvm_eth_queue_elt_t) / CLIB_CACHE_LINE_BYTES)
+ * CLIB_CACHE_LINE_BYTES) != sizeof (ssvm_eth_queue_elt_t))
+ clib_warning ("ssvm_eth_queue_elt_t size %d not a multiple of %d",
+ sizeof (ssvm_eth_queue_elt_t), CLIB_CACHE_LINE_BYTES);
+
+ em->vlib_main = vm;
+ em->vnet_main = vnet_get_main ();
+ em->elog_main = &vm->elog_main;
+
+ /* default config param values... */
+
+ em->next_base_va = 0x600000000ULL;
+ /*
+ * Two full superframes in each direction (256 x 2 x 2 x 2048 bytes)
+ * come to 2mb; default to 8mb so there is plenty of headroom.
+ */
+ em->segment_size = 8 << 20;
+ em->nbuffers = 1024;
+ em->queue_elts = 512;
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ssvm_eth_init);
+
+static char *ssvm_eth_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_ssvm_eth_tx_func_error
+#undef _
+};
+
+static u8 *
+format_ssvm_eth_device_name (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+
+ s = format (s, "ssvmEthernet%d", i);
+ return s;
+}
+
+static u8 *
+format_ssvm_eth_device (u8 * s, va_list * args)
+{
+ s = format (s, "SSVM Ethernet");
+ return s;
+}
+
+static u8 *
+format_ssvm_eth_tx_trace (u8 * s, va_list * args)
+{
+ s = format (s, "Unimplemented...");
+ return s;
+}
+
+
+static uword
+ssvm_eth_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ ssvm_eth_main_t *em = &ssvm_eth_main;
+ vnet_interface_output_runtime_t *rd = (void *) node->runtime_data;
+ ssvm_private_t *intfc = vec_elt_at_index (em->intfcs, rd->dev_instance);
+ ssvm_shared_header_t *sh = intfc->sh;
+ unix_shared_memory_queue_t *q;
+ u32 *from;
+ u32 n_left;
+ ssvm_eth_queue_elt_t *elts, *elt, *prev_elt;
+ u32 my_pid = intfc->my_pid;
+ vlib_buffer_t *b0;
+ u32 bi0;
+ u32 size_this_buffer;
+ u32 chunks_this_buffer;
+ u8 i_am_master = intfc->i_am_master;
+ u32 elt_index;
+ int is_ring_full, interface_down;
+ int i;
+ volatile u32 *queue_lock;
+ u32 n_to_alloc = VLIB_FRAME_SIZE;
+ u32 n_allocated, n_present_in_cache, n_available;
+ u32 *elt_indices;
+
+ if (i_am_master)
+ q = (unix_shared_memory_queue_t *) sh->opaque[TO_SLAVE_Q_INDEX];
+ else
+ q = (unix_shared_memory_queue_t *) sh->opaque[TO_MASTER_Q_INDEX];
+
+ queue_lock = (u32 *) q;
+
+ from = vlib_frame_vector_args (f);
+ n_left = f->n_vectors;
+ is_ring_full = 0;
+ interface_down = 0;
+
+ n_present_in_cache = vec_len (em->chunk_cache);
+
+ /* admin / link up/down check */
+ if (sh->opaque[MASTER_ADMIN_STATE_INDEX] == 0 ||
+ sh->opaque[SLAVE_ADMIN_STATE_INDEX] == 0)
+ {
+ interface_down = 1;
+ goto out;
+ }
+
+ ssvm_lock (sh, my_pid, 1);
+
+ elts = (ssvm_eth_queue_elt_t *) (sh->opaque[CHUNK_POOL_INDEX]);
+ elt_indices = (u32 *) (sh->opaque[CHUNK_POOL_FREELIST_INDEX]);
+ n_available = (u32) pointer_to_uword (sh->opaque[CHUNK_POOL_NFREE]);
+
+ if (n_present_in_cache < n_left * 2)
+ {
+ vec_validate (em->chunk_cache, n_to_alloc + n_present_in_cache - 1);
+
+ n_allocated = n_to_alloc < n_available ? n_to_alloc : n_available;
+
+ if (PREDICT_TRUE (n_allocated > 0))
+ {
+ clib_memcpy (&em->chunk_cache[n_present_in_cache],
+ &elt_indices[n_available - n_allocated],
+ sizeof (u32) * n_allocated);
+ }
+
+ n_present_in_cache += n_allocated;
+ n_available -= n_allocated;
+ sh->opaque[CHUNK_POOL_NFREE] = uword_to_pointer (n_available, void *);
+ _vec_len (em->chunk_cache) = n_present_in_cache;
+ }
+
+ ssvm_unlock (sh);
+
+ while (n_left)
+ {
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
+
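+      /*
+       * A packet occupying a vlib buffer chain is copied into one or
+       * more fixed-size shared-memory chunks; round the chain length up
+       * to the number of chunks required.
+       */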
+ size_this_buffer = vlib_buffer_length_in_chain (vm, b0);
+ chunks_this_buffer = (size_this_buffer + (SSVM_BUFFER_SIZE - 1))
+ / SSVM_BUFFER_SIZE;
+
+ /* If we're not going to be able to enqueue the buffer, tail drop. */
+ if (q->cursize >= q->maxsize)
+ {
+ is_ring_full = 1;
+ break;
+ }
+
+ prev_elt = 0;
+ elt_index = ~0;
+ for (i = 0; i < chunks_this_buffer; i++)
+ {
+ if (PREDICT_FALSE (n_present_in_cache == 0))
+ goto out;
+
+ elt_index = em->chunk_cache[--n_present_in_cache];
+ elt = elts + elt_index;
+
+ elt->type = SSVM_PACKET_TYPE;
+ elt->flags = 0;
+ elt->total_length_not_including_first_buffer =
+ b0->total_length_not_including_first_buffer;
+ elt->length_this_buffer = b0->current_length;
+ elt->current_data_hint = b0->current_data;
+ elt->owner = !i_am_master;
+ elt->tag = 1;
+
+ clib_memcpy (elt->data, b0->data + b0->current_data,
+ b0->current_length);
+
+ if (PREDICT_FALSE (prev_elt != 0))
+ prev_elt->next_index = elt - elts;
+
+ if (PREDICT_FALSE (i < (chunks_this_buffer - 1)))
+ {
+ elt->flags = SSVM_BUFFER_NEXT_PRESENT;
+ ASSERT (b0->flags & VLIB_BUFFER_NEXT_PRESENT);
+ b0 = vlib_get_buffer (vm, b0->next_buffer);
+ }
+ prev_elt = elt;
+ }
+
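+      /*
+       * Publish the head chunk index to the peer: spin on the word at
+       * the start of the shared-memory queue as a lightweight lock, add
+       * the element, then release after a full memory barrier.
+       */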
+ while (__sync_lock_test_and_set (queue_lock, 1))
+ ;
+
+ unix_shared_memory_queue_add_raw (q, (u8 *) & elt_index);
+ CLIB_MEMORY_BARRIER ();
+ *queue_lock = 0;
+
+ from++;
+ n_left--;
+ }
+
+out:
+ if (PREDICT_FALSE (n_left))
+ {
+ if (is_ring_full)
+ vlib_error_count (vm, node->node_index, SSVM_ETH_TX_ERROR_RING_FULL,
+ n_left);
+ else if (interface_down)
+ vlib_error_count (vm, node->node_index, SSVM_ETH_TX_ERROR_ADMIN_DOWN,
+ n_left);
+ else
+ vlib_error_count (vm, node->node_index, SSVM_ETH_TX_ERROR_NO_BUFFERS,
+ n_left);
+
+ vlib_buffer_free (vm, from, n_left);
+ }
+ else
+ vlib_buffer_free (vm, vlib_frame_vector_args (f), f->n_vectors);
+
+ if (PREDICT_TRUE (vec_len (em->chunk_cache)))
+ _vec_len (em->chunk_cache) = n_present_in_cache;
+
+ return f->n_vectors;
+}
+
+static void
+ssvm_eth_clear_hw_interface_counters (u32 instance)
+{
+ /* Nothing for now */
+}
+
+static clib_error_t *
+ssvm_eth_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index,
+ u32 flags)
+{
+ vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index);
+ uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+ ssvm_eth_main_t *em = &ssvm_eth_main;
+ ssvm_private_t *intfc = vec_elt_at_index (em->intfcs, hif->dev_instance);
+ ssvm_shared_header_t *sh;
+
+ /* publish link-state in shared-memory, to discourage buffer-wasting */
+ sh = intfc->sh;
+ if (intfc->i_am_master)
+ sh->opaque[MASTER_ADMIN_STATE_INDEX] = (void *) is_up;
+ else
+ sh->opaque[SLAVE_ADMIN_STATE_INDEX] = (void *) is_up;
+
+ return 0;
+}
+
+static clib_error_t *
+ssvm_eth_subif_add_del_function (vnet_main_t * vnm,
+ u32 hw_if_index,
+ struct vnet_sw_interface_t *st, int is_add)
+{
+ /* Nothing for now */
+ return 0;
+}
+
+/*
+ * Dynamically redirect all pkts from a specific interface
+ * to the specified node
+ */
+static void
+ssvm_eth_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
+ u32 node_index)
+{
+ ssvm_eth_main_t *em = &ssvm_eth_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ ssvm_private_t *intfc = pool_elt_at_index (em->intfcs, hw->dev_instance);
+
+ /* Shut off redirection */
+ if (node_index == ~0)
+ {
+ intfc->per_interface_next_index = node_index;
+ return;
+ }
+
+ intfc->per_interface_next_index =
+ vlib_node_add_next (em->vlib_main, ssvm_eth_input_node.index, node_index);
+}
+
+static u32
+ssvm_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags)
+{
+ /* nothing for now */
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VNET_DEVICE_CLASS (ssvm_eth_device_class) = {
+ .name = "ssvm-eth",
+ .tx_function = ssvm_eth_interface_tx,
+ .tx_function_n_errors = SSVM_ETH_TX_N_ERROR,
+ .tx_function_error_strings = ssvm_eth_tx_func_error_strings,
+ .format_device_name = format_ssvm_eth_device_name,
+ .format_device = format_ssvm_eth_device,
+ .format_tx_trace = format_ssvm_eth_tx_trace,
+ .clear_counters = ssvm_eth_clear_hw_interface_counters,
+ .admin_up_down_function = ssvm_eth_interface_admin_up_down,
+ .subif_add_del_function = ssvm_eth_subif_add_del_function,
+ .rx_redirect_to_node = ssvm_eth_set_interface_next_node,
+};
+
+VLIB_DEVICE_TX_FUNCTION_MULTIARCH (ssvm_eth_device_class,
+ ssvm_eth_interface_tx)
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/ssvm/ssvm_eth.h b/src/vnet/devices/ssvm/ssvm_eth.h
new file mode 100644
index 00000000000..f877df3cd33
--- /dev/null
+++ b/src/vnet/devices/ssvm/ssvm_eth.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_ssvm_eth_h__
+#define __included_ssvm_eth_h__
+
+#include <vnet/vnet.h>
+
+#include <vppinfra/elog.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/elog.h>
+#include <vlib/vlib.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/devices.h>
+#include <vnet/ip/ip.h>
+#include <vnet/pg/pg.h>
+#include <vlibmemory/unix_shared_memory_queue.h>
+
+#include <svm/ssvm.h>
+
+extern vnet_device_class_t ssvm_eth_device_class;
+extern vlib_node_registration_t ssvm_eth_input_node;
+
+#define SSVM_BUFFER_SIZE \
+ (VLIB_BUFFER_DATA_SIZE + VLIB_BUFFER_PRE_DATA_SIZE)
+#define SSVM_PACKET_TYPE 1
+
+typedef struct
+{
+ /* Type of queue element */
+ u8 type;
+ u8 flags;
+#define SSVM_BUFFER_NEXT_PRESENT (1<<0)
+ u8 owner;
+ u8 tag;
+ i16 current_data_hint;
+ u16 length_this_buffer;
+ u16 total_length_not_including_first_buffer;
+ u16 pad;
+ u32 next_index;
+ /* offset 16 */
+ u8 data[SSVM_BUFFER_SIZE];
+ /* pad to an even multiple of 64 octets */
+ u8 pad2[CLIB_CACHE_LINE_BYTES - 16];
+} ssvm_eth_queue_elt_t;
+
+typedef struct
+{
+ /* vector of point-to-point connections */
+ ssvm_private_t *intfcs;
+
+ u32 *buffer_cache;
+ u32 *chunk_cache;
+
+ /* Configurable parameters */
+ /* base address for next placement */
+ u64 next_base_va;
+ u64 segment_size;
+ u64 nbuffers;
+ u64 queue_elts;
+
+ /* Segment names */
+ u8 **names;
+
+ /* convenience */
+ vlib_main_t *vlib_main;
+ vnet_main_t *vnet_main;
+ elog_main_t *elog_main;
+} ssvm_eth_main_t;
+
+ssvm_eth_main_t ssvm_eth_main;
+
+typedef enum
+{
+ CHUNK_POOL_FREELIST_INDEX = 0,
+ CHUNK_POOL_INDEX,
+ CHUNK_POOL_NFREE,
+ TO_MASTER_Q_INDEX,
+ TO_SLAVE_Q_INDEX,
+ MASTER_ADMIN_STATE_INDEX,
+ SLAVE_ADMIN_STATE_INDEX,
+} ssvm_eth_opaque_index_t;
+
+/*
+ * debug scaffolding.
+ */
+static inline void
+ssvm_eth_validate_freelists (int need_lock)
+{
+#if CLIB_DEBUG > 0
+ ssvm_eth_main_t *em = &ssvm_eth_main;
+ ssvm_private_t *intfc;
+ ssvm_shared_header_t *sh;
+ u32 *elt_indices;
+ u32 n_available;
+ int i;
+
+ for (i = 0; i < vec_len (em->intfcs); i++)
+ {
+ intfc = em->intfcs + i;
+ sh = intfc->sh;
+ u32 my_pid = intfc->my_pid;
+
+ if (need_lock)
+ ssvm_lock (sh, my_pid, 15);
+
+ elt_indices = (u32 *) (sh->opaque[CHUNK_POOL_FREELIST_INDEX]);
+ n_available = (u32) (uword) (sh->opaque[CHUNK_POOL_NFREE]);
+
+ for (i = 0; i < n_available; i++)
+ ASSERT (elt_indices[i] < 2048);
+
+ if (need_lock)
+ ssvm_unlock (sh);
+ }
+#endif
+}
+
+#endif /* __included_ssvm_eth_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/virtio/dir.dox b/src/vnet/devices/virtio/dir.dox
new file mode 100644
index 00000000000..50150799e62
--- /dev/null
+++ b/src/vnet/devices/virtio/dir.dox
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Doxygen directory documentation */
+
+/**
+@dir
+@brief vHost User Interface Implementation.
+
+This directory contains the source code for vHost User driver.
+
+*/
+/*? %%clicmd:group_label vHost User %% ?*/
+/*? %%syscfg:group_label vHost User %% ?*/
diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c
new file mode 100644
index 00000000000..bde8106c501
--- /dev/null
+++ b/src/vnet/devices/virtio/vhost-user.c
@@ -0,0 +1,3314 @@
+/*
+ *------------------------------------------------------------------
+ * vhost.c - vhost-user
+ *
+ * Copyright (c) 2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <fcntl.h> /* for open */
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h> /* for iovec */
+#include <netinet/in.h>
+#include <sys/vfs.h>
+
+#include <linux/if_arp.h>
+#include <linux/if_tun.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+
+#include <vnet/ip/ip.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/devices.h>
+#include <vnet/feature/feature.h>
+
+#include <vnet/devices/virtio/vhost-user.h>
+
+/**
+ * @file
+ * @brief vHost User Device Driver.
+ *
+ * This file contains the source code for vHost User interface.
+ */
+
+
+#define VHOST_USER_DEBUG_SOCKET 0
+#define VHOST_DEBUG_VQ 0
+
+#if VHOST_USER_DEBUG_SOCKET == 1
+#define DBG_SOCK(args...) clib_warning(args);
+#else
+#define DBG_SOCK(args...)
+#endif
+
+#if VHOST_DEBUG_VQ == 1
+#define DBG_VQ(args...) clib_warning(args);
+#else
+#define DBG_VQ(args...)
+#endif
+
+/*
+ * When an RX queue is down but active, received packets
+ * must be discarded. This value controls up to how many
+ * packets will be discarded during each round.
+ */
+#define VHOST_USER_DOWN_DISCARD_COUNT 256
+
+/*
+ * When the number of available buffers gets under this threshold,
+ * RX node will start discarding packets.
+ */
+#define VHOST_USER_RX_BUFFER_STARVATION 32
+
+/*
+ * On the receive side, the host should free descriptors as soon
+ * as possible in order to avoid TX drop in the VM.
+ * This value controls how many copy operations are batched up
+ * before they are all executed and the descriptors are handed back to
+ * the guest.
+ * The value 64 was obtained by testing (48 and 128 were not as good).
+ */
+#define VHOST_USER_RX_COPY_THRESHOLD 64
+
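+/*
+ * Resolve a unix_file_t pool index to its file descriptor,
+ * or -1 when the index is unset (~0).
+ */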
+#define UNIX_GET_FD(unixfd_idx) \
+ (unixfd_idx != ~0) ? \
+ pool_elt_at_index (unix_main.file_pool, \
+ unixfd_idx)->file_descriptor : -1;
+
+#define foreach_virtio_trace_flags \
+ _ (SIMPLE_CHAINED, 0, "Simple descriptor chaining") \
+ _ (SINGLE_DESC, 1, "Single descriptor packet") \
+ _ (INDIRECT, 2, "Indirect descriptor") \
+ _ (MAP_ERROR, 4, "Memory mapping error")
+
+typedef enum
+{
+#define _(n,i,s) VIRTIO_TRACE_F_##n,
+ foreach_virtio_trace_flags
+#undef _
+} virtio_trace_flag_t;
+
+vlib_node_registration_t vhost_user_input_node;
+
+#define foreach_vhost_user_tx_func_error \
+ _(NONE, "no error") \
+ _(NOT_READY, "vhost vring not ready") \
+ _(DOWN, "vhost interface is down") \
+ _(PKT_DROP_NOBUF, "tx packet drops (no available descriptors)") \
+ _(PKT_DROP_NOMRG, "tx packet drops (cannot merge descriptors)") \
+ _(MMAP_FAIL, "mmap failure") \
+ _(INDIRECT_OVERFLOW, "indirect descriptor table overflow")
+
+typedef enum
+{
+#define _(f,s) VHOST_USER_TX_FUNC_ERROR_##f,
+ foreach_vhost_user_tx_func_error
+#undef _
+ VHOST_USER_TX_FUNC_N_ERROR,
+} vhost_user_tx_func_error_t;
+
+static char *vhost_user_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_vhost_user_tx_func_error
+#undef _
+};
+
+#define foreach_vhost_user_input_func_error \
+ _(NO_ERROR, "no error") \
+ _(NO_BUFFER, "no available buffer") \
+ _(MMAP_FAIL, "mmap failure") \
+ _(INDIRECT_OVERFLOW, "indirect descriptor overflows table") \
+ _(UNDERSIZED_FRAME, "undersized ethernet frame received (< 14 bytes)") \
+ _(FULL_RX_QUEUE, "full rx queue (possible driver tx drop)")
+
+typedef enum
+{
+#define _(f,s) VHOST_USER_INPUT_FUNC_ERROR_##f,
+ foreach_vhost_user_input_func_error
+#undef _
+ VHOST_USER_INPUT_FUNC_N_ERROR,
+} vhost_user_input_func_error_t;
+
+static char *vhost_user_input_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_vhost_user_input_func_error
+#undef _
+};
+
+/* *INDENT-OFF* */
+static vhost_user_main_t vhost_user_main = {
+ .mtu_bytes = 1518,
+};
+
+VNET_HW_INTERFACE_CLASS (vhost_interface_class, static) = {
+ .name = "vhost-user",
+};
+/* *INDENT-ON* */
+
+static u8 *
+format_vhost_user_interface_name (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+ u32 show_dev_instance = ~0;
+ vhost_user_main_t *vum = &vhost_user_main;
+
+ if (i < vec_len (vum->show_dev_instance_by_real_dev_instance))
+ show_dev_instance = vum->show_dev_instance_by_real_dev_instance[i];
+
+ if (show_dev_instance != ~0)
+ i = show_dev_instance;
+
+ s = format (s, "VirtualEthernet0/0/%d", i);
+ return s;
+}
+
+static int
+vhost_user_name_renumber (vnet_hw_interface_t * hi, u32 new_dev_instance)
+{
+ // FIXME: check if the new dev instance is already used
+ vhost_user_main_t *vum = &vhost_user_main;
+ vec_validate_init_empty (vum->show_dev_instance_by_real_dev_instance,
+ hi->dev_instance, ~0);
+
+ vum->show_dev_instance_by_real_dev_instance[hi->dev_instance] =
+ new_dev_instance;
+
+ DBG_SOCK ("renumbered vhost-user interface dev_instance %d to %d",
+ hi->dev_instance, new_dev_instance);
+
+ return 0;
+}
+
+static_always_inline void *
+map_guest_mem (vhost_user_intf_t * vui, uword addr, u32 * hint)
+{
+ int i = *hint;
+ if (PREDICT_TRUE ((vui->regions[i].guest_phys_addr <= addr) &&
+ ((vui->regions[i].guest_phys_addr +
+ vui->regions[i].memory_size) > addr)))
+ {
+ return (void *) (vui->region_mmap_addr[i] + addr -
+ vui->regions[i].guest_phys_addr);
+ }
+#if __SSE4_2__
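+  /*
+   * Vectorized region lookup, used when the hinted region missed:
+   * compare addr against the [lo, hi) bounds of all regions, two
+   * regions per 128-bit compare, merge the per-pair results into a
+   * single mask and take the first matching region index via ctz.
+   */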
+ __m128i rl, rh, al, ah, r;
+ al = _mm_set1_epi64x (addr + 1);
+ ah = _mm_set1_epi64x (addr);
+
+ rl = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_lo[0]);
+ rl = _mm_cmpgt_epi64 (al, rl);
+ rh = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_hi[0]);
+ rh = _mm_cmpgt_epi64 (rh, ah);
+ r = _mm_and_si128 (rl, rh);
+
+ rl = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_lo[2]);
+ rl = _mm_cmpgt_epi64 (al, rl);
+ rh = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_hi[2]);
+ rh = _mm_cmpgt_epi64 (rh, ah);
+ r = _mm_blend_epi16 (r, _mm_and_si128 (rl, rh), 0x22);
+
+ rl = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_lo[4]);
+ rl = _mm_cmpgt_epi64 (al, rl);
+ rh = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_hi[4]);
+ rh = _mm_cmpgt_epi64 (rh, ah);
+ r = _mm_blend_epi16 (r, _mm_and_si128 (rl, rh), 0x44);
+
+ rl = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_lo[6]);
+ rl = _mm_cmpgt_epi64 (al, rl);
+ rh = _mm_loadu_si128 ((__m128i *) & vui->region_guest_addr_hi[6]);
+ rh = _mm_cmpgt_epi64 (rh, ah);
+ r = _mm_blend_epi16 (r, _mm_and_si128 (rl, rh), 0x88);
+
+ r = _mm_shuffle_epi8 (r, _mm_set_epi64x (0, 0x0e060c040a020800));
+ i = __builtin_ctzll (_mm_movemask_epi8 (r));
+
+ if (i < vui->nregions)
+ {
+ *hint = i;
+ return (void *) (vui->region_mmap_addr[i] + addr -
+ vui->regions[i].guest_phys_addr);
+ }
+
+#else
+ for (i = 0; i < vui->nregions; i++)
+ {
+ if ((vui->regions[i].guest_phys_addr <= addr) &&
+ ((vui->regions[i].guest_phys_addr + vui->regions[i].memory_size) >
+ addr))
+ {
+ *hint = i;
+ return (void *) (vui->region_mmap_addr[i] + addr -
+ vui->regions[i].guest_phys_addr);
+ }
+ }
+#endif
+ DBG_VQ ("failed to map guest mem addr %llx", addr);
+ *hint = 0;
+ return 0;
+}
+
+static inline void *
+map_user_mem (vhost_user_intf_t * vui, uword addr)
+{
+ int i;
+ for (i = 0; i < vui->nregions; i++)
+ {
+ if ((vui->regions[i].userspace_addr <= addr) &&
+ ((vui->regions[i].userspace_addr + vui->regions[i].memory_size) >
+ addr))
+ {
+ return (void *) (vui->region_mmap_addr[i] + addr -
+ vui->regions[i].userspace_addr);
+ }
+ }
+ return 0;
+}
+
+static long
+get_huge_page_size (int fd)
+{
+ struct statfs s;
+ fstatfs (fd, &s);
+ return s.f_bsize;
+}
+
+static void
+unmap_all_mem_regions (vhost_user_intf_t * vui)
+{
+ int i, r;
+ for (i = 0; i < vui->nregions; i++)
+ {
+ if (vui->region_mmap_addr[i] != (void *) -1)
+ {
+
+ long page_sz = get_huge_page_size (vui->region_mmap_fd[i]);
+
+ ssize_t map_sz = (vui->regions[i].memory_size +
+ vui->regions[i].mmap_offset +
+ page_sz) & ~(page_sz - 1);
+
+ r =
+ munmap (vui->region_mmap_addr[i] - vui->regions[i].mmap_offset,
+ map_sz);
+
+ DBG_SOCK
+ ("unmap memory region %d addr 0x%lx len 0x%lx page_sz 0x%x", i,
+ vui->region_mmap_addr[i], map_sz, page_sz);
+
+ vui->region_mmap_addr[i] = (void *) -1;
+
+ if (r == -1)
+ {
+ clib_warning ("failed to unmap memory region (errno %d)",
+ errno);
+ }
+ close (vui->region_mmap_fd[i]);
+ }
+ }
+ vui->nregions = 0;
+}
+
+static void
+vhost_user_tx_thread_placement (vhost_user_intf_t * vui)
+{
+ //Let's try to assign one queue to each thread
+ u32 qid = 0;
+ u32 cpu_index = 0;
+ vui->use_tx_spinlock = 0;
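+  /*
+   * Hand out one enabled RX vring per vlib thread. When there are fewer
+   * usable vrings than threads, the assignment wraps around and several
+   * threads share a vring, so the per-queue TX spinlock gets enabled.
+   * When no vring is usable at all, every thread falls back to qid 0.
+   */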
+ while (1)
+ {
+ for (qid = 0; qid < VHOST_VRING_MAX_N / 2; qid++)
+ {
+ vhost_user_vring_t *rxvq = &vui->vrings[VHOST_VRING_IDX_RX (qid)];
+ if (!rxvq->started || !rxvq->enabled)
+ continue;
+
+ vui->per_cpu_tx_qid[cpu_index] = qid;
+ cpu_index++;
+ if (cpu_index == vlib_get_thread_main ()->n_vlib_mains)
+ return;
+ }
+ //We need to loop, meaning the spinlock has to be used
+ vui->use_tx_spinlock = 1;
+ if (cpu_index == 0)
+ {
+ //Could not find a single valid one
+ for (cpu_index = 0;
+ cpu_index < vlib_get_thread_main ()->n_vlib_mains; cpu_index++)
+ {
+ vui->per_cpu_tx_qid[cpu_index] = 0;
+ }
+ return;
+ }
+ }
+}
+
+static void
+vhost_user_rx_thread_placement ()
+{
+ vhost_user_main_t *vum = &vhost_user_main;
+ vhost_user_intf_t *vui;
+ vhost_cpu_t *vhc;
+ u32 *workers = 0;
+
+  //Let's list all worker cpu indexes
+ u32 i;
+ for (i = vum->input_cpu_first_index;
+ i < vum->input_cpu_first_index + vum->input_cpu_count; i++)
+ {
+ vlib_node_set_state (vlib_mains ? vlib_mains[i] : &vlib_global_main,
+ vhost_user_input_node.index,
+ VLIB_NODE_STATE_DISABLED);
+ vec_add1 (workers, i);
+ }
+
+ vec_foreach (vhc, vum->cpus)
+ {
+ vec_reset_length (vhc->rx_queues);
+ }
+
+ i = 0;
+ vhost_iface_and_queue_t iaq;
+ /* *INDENT-OFF* */
+ pool_foreach (vui, vum->vhost_user_interfaces, {
+ u32 *vui_workers = vec_len (vui->workers) ? vui->workers : workers;
+ u32 qid;
+ for (qid = 0; qid < VHOST_VRING_MAX_N / 2; qid++)
+ {
+ vhost_user_vring_t *txvq =
+ &vui->vrings[VHOST_VRING_IDX_TX (qid)];
+ if (!txvq->started)
+ continue;
+
+ i %= vec_len (vui_workers);
+ u32 cpu_index = vui_workers[i];
+ i++;
+ vhc = &vum->cpus[cpu_index];
+
+ iaq.qid = qid;
+ iaq.vhost_iface_index = vui - vum->vhost_user_interfaces;
+ vec_add1 (vhc->rx_queues, iaq);
+ vlib_node_set_state (vlib_mains ? vlib_mains[cpu_index] :
+ &vlib_global_main, vhost_user_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ }
+ });
+ /* *INDENT-ON* */
+}
+
+static int
+vhost_user_thread_placement (u32 sw_if_index, u32 worker_thread_index, u8 del)
+{
+ vhost_user_main_t *vum = &vhost_user_main;
+ vhost_user_intf_t *vui;
+ vnet_hw_interface_t *hw;
+
+ if (worker_thread_index < vum->input_cpu_first_index ||
+ worker_thread_index >=
+ vum->input_cpu_first_index + vum->input_cpu_count)
+ return -1;
+
+ if (!(hw = vnet_get_sup_hw_interface (vnet_get_main (), sw_if_index)))
+ return -2;
+
+ vui = pool_elt_at_index (vum->vhost_user_interfaces, hw->dev_instance);
+ u32 found = ~0, *w;
+ vec_foreach (w, vui->workers)
+ {
+ if (*w == worker_thread_index)
+ {
+ found = w - vui->workers;
+ break;
+ }
+ }
+
+ if (del)
+ {
+ if (found == ~0)
+ return -3;
+ vec_del1 (vui->workers, found);
+ }
+ else if (found == ~0)
+ {
+ vec_add1 (vui->workers, worker_thread_index);
+ }
+
+ vhost_user_rx_thread_placement ();
+ return 0;
+}
+
+/** @brief Returns whether at least one TX and one RX vring are enabled */
+int
+vhost_user_intf_ready (vhost_user_intf_t * vui)
+{
+ int i, found[2] = { }; //RX + TX
+
+ for (i = 0; i < VHOST_VRING_MAX_N; i++)
+ if (vui->vrings[i].started && vui->vrings[i].enabled)
+ found[i & 1] = 1;
+
+ return found[0] && found[1];
+}
+
+static void
+vhost_user_update_iface_state (vhost_user_intf_t * vui)
+{
+ /* if we have pointers to descriptor table, go up */
+ int is_up = vhost_user_intf_ready (vui);
+ if (is_up != vui->is_up)
+ {
+ DBG_SOCK ("interface %d %s", vui->sw_if_index,
+ is_up ? "ready" : "down");
+ vnet_hw_interface_set_flags (vnet_get_main (), vui->hw_if_index,
+ is_up ? VNET_HW_INTERFACE_FLAG_LINK_UP :
+ 0);
+ vui->is_up = is_up;
+ }
+ vhost_user_rx_thread_placement ();
+ vhost_user_tx_thread_placement (vui);
+}
+
+static clib_error_t *
+vhost_user_callfd_read_ready (unix_file_t * uf)
+{
+ __attribute__ ((unused)) int n;
+ u8 buff[8];
+ n = read (uf->file_descriptor, ((char *) &buff), 8);
+ return 0;
+}
+
+static clib_error_t *
+vhost_user_kickfd_read_ready (unix_file_t * uf)
+{
+ __attribute__ ((unused)) int n;
+ u8 buff[8];
+ vhost_user_intf_t *vui =
+ pool_elt_at_index (vhost_user_main.vhost_user_interfaces,
+ uf->private_data >> 8);
+ u32 qid = uf->private_data & 0xff;
+ n = read (uf->file_descriptor, ((char *) &buff), 8);
+ DBG_SOCK ("if %d KICK queue %d", uf->private_data >> 8, qid);
+
+ vlib_worker_thread_barrier_sync (vlib_get_main ());
+ vui->vrings[qid].started = 1;
+ vhost_user_update_iface_state (vui);
+ vlib_worker_thread_barrier_release (vlib_get_main ());
+ return 0;
+}
+
+/**
+ * @brief Try once to lock the vring
+ * @return 0 on success, non-zero on failure.
+ */
+static inline int
+vhost_user_vring_try_lock (vhost_user_intf_t * vui, u32 qid)
+{
+ return __sync_lock_test_and_set (vui->vring_locks[qid], 1);
+}
+
+/**
+ * @brief Spin until the vring is successfully locked
+ */
+static inline void
+vhost_user_vring_lock (vhost_user_intf_t * vui, u32 qid)
+{
+ while (vhost_user_vring_try_lock (vui, qid))
+ ;
+}
+
+/**
+ * @brief Unlock the vring lock
+ */
+static inline void
+vhost_user_vring_unlock (vhost_user_intf_t * vui, u32 qid)
+{
+ *vui->vring_locks[qid] = 0;
+}
+
+static inline void
+vhost_user_vring_init (vhost_user_intf_t * vui, u32 qid)
+{
+ vhost_user_vring_t *vring = &vui->vrings[qid];
+ memset (vring, 0, sizeof (*vring));
+ vring->kickfd_idx = ~0;
+ vring->callfd_idx = ~0;
+ vring->errfd = -1;
+
+ /*
+ * We have a bug with some qemu 2.5, and this may be a fix.
+   * Feels like interpreting holy text, but this is from vhost-user.txt.
+ * "
+ * One queue pair is enabled initially. More queues are enabled
+ * dynamically, by sending message VHOST_USER_SET_VRING_ENABLE.
+ * "
+ * Don't know who's right, but this is what DPDK does.
+ */
+ if (qid == 0 || qid == 1)
+ vring->enabled = 1;
+}
+
+static inline void
+vhost_user_vring_close (vhost_user_intf_t * vui, u32 qid)
+{
+ vhost_user_vring_t *vring = &vui->vrings[qid];
+ if (vring->kickfd_idx != ~0)
+ {
+ unix_file_t *uf = pool_elt_at_index (unix_main.file_pool,
+ vring->kickfd_idx);
+ unix_file_del (&unix_main, uf);
+ vring->kickfd_idx = ~0;
+ }
+ if (vring->callfd_idx != ~0)
+ {
+ unix_file_t *uf = pool_elt_at_index (unix_main.file_pool,
+ vring->callfd_idx);
+ unix_file_del (&unix_main, uf);
+ vring->callfd_idx = ~0;
+ }
+ if (vring->errfd != -1)
+ close (vring->errfd);
+ vhost_user_vring_init (vui, qid);
+}
+
+static inline void
+vhost_user_if_disconnect (vhost_user_intf_t * vui)
+{
+ vnet_main_t *vnm = vnet_get_main ();
+ int q;
+
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0);
+
+ if (vui->unix_file_index != ~0)
+ {
+ unix_file_del (&unix_main, unix_main.file_pool + vui->unix_file_index);
+ vui->unix_file_index = ~0;
+ }
+
+ vui->is_up = 0;
+
+ for (q = 0; q < VHOST_VRING_MAX_N; q++)
+ vhost_user_vring_close (vui, q);
+
+ unmap_all_mem_regions (vui);
+ DBG_SOCK ("interface ifindex %d disconnected", vui->sw_if_index);
+}
+
+#define VHOST_LOG_PAGE 0x1000
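+/*
+ * Dirty-page logging: mark every 4kB guest page touched in
+ * [addr, addr + len) in the shared log bitmap (one bit per page),
+ * so the driver (e.g. QEMU during live migration) can track which
+ * memory was modified. Only active when VHOST_F_LOG_ALL is negotiated.
+ */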
+static_always_inline void
+vhost_user_log_dirty_pages_2 (vhost_user_intf_t * vui,
+ u64 addr, u64 len, u8 is_host_address)
+{
+ if (PREDICT_TRUE (vui->log_base_addr == 0
+ || !(vui->features & (1 << FEAT_VHOST_F_LOG_ALL))))
+ {
+ return;
+ }
+ if (is_host_address)
+ {
+ addr = (u64) map_user_mem (vui, (uword) addr);
+ }
+ if (PREDICT_FALSE ((addr + len - 1) / VHOST_LOG_PAGE / 8 >= vui->log_size))
+ {
+ DBG_SOCK ("vhost_user_log_dirty_pages(): out of range\n");
+ return;
+ }
+
+ CLIB_MEMORY_BARRIER ();
+ u64 page = addr / VHOST_LOG_PAGE;
+ while (page * VHOST_LOG_PAGE < addr + len)
+ {
+ ((u8 *) vui->log_base_addr)[page / 8] |= 1 << page % 8;
+ page++;
+ }
+}
+
+static_always_inline void
+vhost_user_log_dirty_pages (vhost_user_intf_t * vui, u64 addr, u64 len)
+{
+ vhost_user_log_dirty_pages_2 (vui, addr, len, 0);
+}
+
+#define vhost_user_log_dirty_ring(vui, vq, member) \
+ if (PREDICT_FALSE(vq->log_used)) { \
+ vhost_user_log_dirty_pages(vui, vq->log_guest_addr + STRUCT_OFFSET_OF(vring_used_t, member), \
+ sizeof(vq->used->member)); \
+ }
+
+static clib_error_t *
+vhost_user_socket_read (unix_file_t * uf)
+{
+ int n, i;
+ int fd, number_of_fds = 0;
+ int fds[VHOST_MEMORY_MAX_NREGIONS];
+ vhost_user_msg_t msg;
+ struct msghdr mh;
+ struct iovec iov[1];
+ vhost_user_main_t *vum = &vhost_user_main;
+ vhost_user_intf_t *vui;
+ struct cmsghdr *cmsg;
+ u8 q;
+ unix_file_t template = { 0 };
+ vnet_main_t *vnm = vnet_get_main ();
+
+ vui = pool_elt_at_index (vum->vhost_user_interfaces, uf->private_data);
+
+ char control[CMSG_SPACE (VHOST_MEMORY_MAX_NREGIONS * sizeof (int))];
+
+ memset (&mh, 0, sizeof (mh));
+ memset (control, 0, sizeof (control));
+
+ for (i = 0; i < VHOST_MEMORY_MAX_NREGIONS; i++)
+ fds[i] = -1;
+
+ /* set the payload */
+ iov[0].iov_base = (void *) &msg;
+ iov[0].iov_len = VHOST_USER_MSG_HDR_SZ;
+
+ mh.msg_iov = iov;
+ mh.msg_iovlen = 1;
+ mh.msg_control = control;
+ mh.msg_controllen = sizeof (control);
+
+ n = recvmsg (uf->file_descriptor, &mh, 0);
+
+ /* Stop workers to avoid end of the world */
+ vlib_worker_thread_barrier_sync (vlib_get_main ());
+
+ if (n != VHOST_USER_MSG_HDR_SZ)
+ {
+ if (n == -1)
+ {
+ DBG_SOCK ("recvmsg returned error %d %s", errno, strerror (errno));
+ }
+ else
+ {
+ DBG_SOCK ("n (%d) != VHOST_USER_MSG_HDR_SZ (%d)",
+ n, VHOST_USER_MSG_HDR_SZ);
+ }
+ goto close_socket;
+ }
+
+ if (mh.msg_flags & MSG_CTRUNC)
+ {
+ DBG_SOCK ("MSG_CTRUNC is set");
+ goto close_socket;
+ }
+
+ cmsg = CMSG_FIRSTHDR (&mh);
+
+ if (cmsg && (cmsg->cmsg_len > 0) && (cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS) &&
+ (cmsg->cmsg_len - CMSG_LEN (0) <=
+ VHOST_MEMORY_MAX_NREGIONS * sizeof (int)))
+ {
+ number_of_fds = (cmsg->cmsg_len - CMSG_LEN (0)) / sizeof (int);
+ clib_memcpy (fds, CMSG_DATA (cmsg), number_of_fds * sizeof (int));
+ }
+
+ /* version 1, no reply bit set */
+ if ((msg.flags & 7) != 1)
+ {
+ DBG_SOCK ("malformed message received. closing socket");
+ goto close_socket;
+ }
+
+ {
+ int rv;
+ rv =
+ read (uf->file_descriptor, ((char *) &msg) + VHOST_USER_MSG_HDR_SZ,
+ msg.size);
+ if (rv < 0)
+ {
+ DBG_SOCK ("read failed %s", strerror (errno));
+ goto close_socket;
+ }
+ else if (rv != msg.size)
+ {
+ DBG_SOCK ("message too short (read %dB should be %dB)", rv, msg.size);
+ goto close_socket;
+ }
+ }
+
+ switch (msg.request)
+ {
+ case VHOST_USER_GET_FEATURES:
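+      /* bit 2 of the header flags marks this message as carrying a reply payload */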
+ msg.flags |= 4;
+ msg.u64 = (1ULL << FEAT_VIRTIO_NET_F_MRG_RXBUF) |
+ (1ULL << FEAT_VIRTIO_NET_F_CTRL_VQ) |
+ (1ULL << FEAT_VIRTIO_F_ANY_LAYOUT) |
+ (1ULL << FEAT_VIRTIO_F_INDIRECT_DESC) |
+ (1ULL << FEAT_VHOST_F_LOG_ALL) |
+ (1ULL << FEAT_VIRTIO_NET_F_GUEST_ANNOUNCE) |
+ (1ULL << FEAT_VIRTIO_NET_F_MQ) |
+ (1ULL << FEAT_VHOST_USER_F_PROTOCOL_FEATURES) |
+ (1ULL << FEAT_VIRTIO_F_VERSION_1);
+ msg.u64 &= vui->feature_mask;
+ msg.size = sizeof (msg.u64);
+ DBG_SOCK ("if %d msg VHOST_USER_GET_FEATURES - reply 0x%016llx",
+ vui->hw_if_index, msg.u64);
+ break;
+
+ case VHOST_USER_SET_FEATURES:
+ DBG_SOCK ("if %d msg VHOST_USER_SET_FEATURES features 0x%016llx",
+ vui->hw_if_index, msg.u64);
+
+ vui->features = msg.u64;
+
+ if (vui->features &
+ ((1 << FEAT_VIRTIO_NET_F_MRG_RXBUF) |
+ (1ULL << FEAT_VIRTIO_F_VERSION_1)))
+ vui->virtio_net_hdr_sz = 12;
+ else
+ vui->virtio_net_hdr_sz = 10;
+
+ vui->is_any_layout =
+ (vui->features & (1 << FEAT_VIRTIO_F_ANY_LAYOUT)) ? 1 : 0;
+
+ ASSERT (vui->virtio_net_hdr_sz < VLIB_BUFFER_PRE_DATA_SIZE);
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0);
+ vui->is_up = 0;
+
+ /*for (q = 0; q < VHOST_VRING_MAX_N; q++)
+ vhost_user_vring_close(&vui->vrings[q]); */
+
+ break;
+
+ case VHOST_USER_SET_MEM_TABLE:
+ DBG_SOCK ("if %d msg VHOST_USER_SET_MEM_TABLE nregions %d",
+ vui->hw_if_index, msg.memory.nregions);
+
+ if ((msg.memory.nregions < 1) ||
+ (msg.memory.nregions > VHOST_MEMORY_MAX_NREGIONS))
+ {
+
+ DBG_SOCK ("number of mem regions must be between 1 and %i",
+ VHOST_MEMORY_MAX_NREGIONS);
+
+ goto close_socket;
+ }
+
+ if (msg.memory.nregions != number_of_fds)
+ {
+	  DBG_SOCK ("each memory region must have an FD");
+ goto close_socket;
+ }
+ unmap_all_mem_regions (vui);
+ for (i = 0; i < msg.memory.nregions; i++)
+ {
+ clib_memcpy (&(vui->regions[i]), &msg.memory.regions[i],
+ sizeof (vhost_user_memory_region_t));
+
+ long page_sz = get_huge_page_size (fds[i]);
+
+	  /* align size to the page size reported by the backing fd */
+ ssize_t map_sz = (vui->regions[i].memory_size +
+ vui->regions[i].mmap_offset +
+ page_sz) & ~(page_sz - 1);
+
+ vui->region_mmap_addr[i] = mmap (0, map_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fds[i], 0);
+ vui->region_guest_addr_lo[i] = vui->regions[i].guest_phys_addr;
+ vui->region_guest_addr_hi[i] = vui->regions[i].guest_phys_addr +
+ vui->regions[i].memory_size;
+
+ DBG_SOCK
+ ("map memory region %d addr 0 len 0x%lx fd %d mapped 0x%lx "
+ "page_sz 0x%x", i, map_sz, fds[i], vui->region_mmap_addr[i],
+ page_sz);
+
+ if (vui->region_mmap_addr[i] == MAP_FAILED)
+ {
+ clib_warning ("failed to map memory. errno is %d", errno);
+ goto close_socket;
+ }
+ vui->region_mmap_addr[i] += vui->regions[i].mmap_offset;
+ vui->region_mmap_fd[i] = fds[i];
+ }
+ vui->nregions = msg.memory.nregions;
+ break;
+
+ case VHOST_USER_SET_VRING_NUM:
+ DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_NUM idx %d num %d",
+ vui->hw_if_index, msg.state.index, msg.state.num);
+
+ if ((msg.state.num > 32768) || /* maximum ring size is 32768 */
+ (msg.state.num == 0) || /* it cannot be zero */
+ ((msg.state.num - 1) & msg.state.num)) /* must be power of 2 */
+ goto close_socket;
+ vui->vrings[msg.state.index].qsz = msg.state.num;
+ break;
+
+ case VHOST_USER_SET_VRING_ADDR:
+ DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_ADDR idx %d",
+ vui->hw_if_index, msg.state.index);
+
+ if (msg.state.index >= VHOST_VRING_MAX_N)
+ {
+ DBG_SOCK ("invalid vring index VHOST_USER_SET_VRING_ADDR:"
+ " %d >= %d", msg.state.index, VHOST_VRING_MAX_N);
+ goto close_socket;
+ }
+
+ if (msg.size < sizeof (msg.addr))
+ {
+ DBG_SOCK ("vhost message is too short (%d < %d)",
+ msg.size, sizeof (msg.addr));
+ goto close_socket;
+ }
+
+ vui->vrings[msg.state.index].desc = (vring_desc_t *)
+ map_user_mem (vui, msg.addr.desc_user_addr);
+ vui->vrings[msg.state.index].used = (vring_used_t *)
+ map_user_mem (vui, msg.addr.used_user_addr);
+ vui->vrings[msg.state.index].avail = (vring_avail_t *)
+ map_user_mem (vui, msg.addr.avail_user_addr);
+
+ if ((vui->vrings[msg.state.index].desc == NULL) ||
+ (vui->vrings[msg.state.index].used == NULL) ||
+ (vui->vrings[msg.state.index].avail == NULL))
+ {
+ DBG_SOCK ("failed to map user memory for hw_if_index %d",
+ vui->hw_if_index);
+ goto close_socket;
+ }
+
+ vui->vrings[msg.state.index].log_guest_addr = msg.addr.log_guest_addr;
+ vui->vrings[msg.state.index].log_used =
+ (msg.addr.flags & (1 << VHOST_VRING_F_LOG)) ? 1 : 0;
+
+ /* Spec says: If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated,
+ the ring is initialized in an enabled state. */
+ if (!(vui->features & (1 << FEAT_VHOST_USER_F_PROTOCOL_FEATURES)))
+ {
+ vui->vrings[msg.state.index].enabled = 1;
+ }
+
+ vui->vrings[msg.state.index].last_used_idx =
+ vui->vrings[msg.state.index].last_avail_idx =
+ vui->vrings[msg.state.index].used->idx;
+
+ /* tell driver that we don't want interrupts */
+ vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY;
+ break;
+
+ case VHOST_USER_SET_OWNER:
+ DBG_SOCK ("if %d msg VHOST_USER_SET_OWNER", vui->hw_if_index);
+ break;
+
+ case VHOST_USER_RESET_OWNER:
+ DBG_SOCK ("if %d msg VHOST_USER_RESET_OWNER", vui->hw_if_index);
+ break;
+
+ case VHOST_USER_SET_VRING_CALL:
+ DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_CALL u64 %d",
+ vui->hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
+ /* if there is old fd, delete and close it */
+ if (vui->vrings[q].callfd_idx != ~0)
+ {
+ unix_file_t *uf = pool_elt_at_index (unix_main.file_pool,
+ vui->vrings[q].callfd_idx);
+ unix_file_del (&unix_main, uf);
+ vui->vrings[q].callfd_idx = ~0;
+ }
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ {
+ DBG_SOCK ("More than one fd received !");
+ goto close_socket;
+ }
+
+ template.read_function = vhost_user_callfd_read_ready;
+ template.file_descriptor = fds[0];
+ template.private_data =
+ ((vui - vhost_user_main.vhost_user_interfaces) << 8) + q;
+ vui->vrings[q].callfd_idx = unix_file_add (&unix_main, &template);
+ }
+ else
+ vui->vrings[q].callfd_idx = ~0;
+ break;
+
+ case VHOST_USER_SET_VRING_KICK:
+ DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_KICK u64 %d",
+ vui->hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
+ if (vui->vrings[q].kickfd_idx != ~0)
+ {
+ unix_file_t *uf = pool_elt_at_index (unix_main.file_pool,
+ vui->vrings[q].kickfd_idx);
+ unix_file_del (&unix_main, uf);
+ vui->vrings[q].kickfd_idx = ~0;
+ }
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ {
+ DBG_SOCK ("More than one fd received !");
+ goto close_socket;
+ }
+
+ template.read_function = vhost_user_kickfd_read_ready;
+ template.file_descriptor = fds[0];
+ template.private_data =
+ (((uword) (vui - vhost_user_main.vhost_user_interfaces)) << 8) +
+ q;
+ vui->vrings[q].kickfd_idx = unix_file_add (&unix_main, &template);
+ }
+ else
+ {
+ //When no kickfd is set, the queue is initialized as started
+ vui->vrings[q].kickfd_idx = ~0;
+ vui->vrings[q].started = 1;
+ }
+
+ break;
+
+ case VHOST_USER_SET_VRING_ERR:
+ DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_ERR u64 %d",
+ vui->hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
+ if (vui->vrings[q].errfd != -1)
+ close (vui->vrings[q].errfd);
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ goto close_socket;
+
+ vui->vrings[q].errfd = fds[0];
+ }
+ else
+ vui->vrings[q].errfd = -1;
+
+ break;
+
+ case VHOST_USER_SET_VRING_BASE:
+ DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d",
+ vui->hw_if_index, msg.state.index, msg.state.num);
+
+ vui->vrings[msg.state.index].last_avail_idx = msg.state.num;
+ break;
+
+ case VHOST_USER_GET_VRING_BASE:
+ DBG_SOCK ("if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d",
+ vui->hw_if_index, msg.state.index, msg.state.num);
+
+ if (msg.state.index >= VHOST_VRING_MAX_N)
+ {
+ DBG_SOCK ("invalid vring index VHOST_USER_GET_VRING_BASE:"
+ " %d >= %d", msg.state.index, VHOST_VRING_MAX_N);
+ goto close_socket;
+ }
+
+ /* Spec says: Client must [...] stop ring upon receiving VHOST_USER_GET_VRING_BASE. */
+ vhost_user_vring_close (vui, msg.state.index);
+
+ msg.state.num = vui->vrings[msg.state.index].last_avail_idx;
+ msg.flags |= 4;
+ msg.size = sizeof (msg.state);
+ break;
+
+ case VHOST_USER_NONE:
+ DBG_SOCK ("if %d msg VHOST_USER_NONE", vui->hw_if_index);
+
+ break;
+
+ case VHOST_USER_SET_LOG_BASE:
+ {
+ DBG_SOCK ("if %d msg VHOST_USER_SET_LOG_BASE", vui->hw_if_index);
+
+ if (msg.size != sizeof (msg.log))
+ {
+ DBG_SOCK
+ ("invalid msg size for VHOST_USER_SET_LOG_BASE: %d instead of %d",
+ msg.size, sizeof (msg.log));
+ goto close_socket;
+ }
+
+ if (!
+ (vui->protocol_features & (1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD)))
+ {
+ DBG_SOCK
+ ("VHOST_USER_PROTOCOL_F_LOG_SHMFD not set but VHOST_USER_SET_LOG_BASE received");
+ goto close_socket;
+ }
+
+ fd = fds[0];
+	/* align size to the page size reported by the backing fd */
+ long page_sz = get_huge_page_size (fd);
+ ssize_t map_sz =
+ (msg.log.size + msg.log.offset + page_sz) & ~(page_sz - 1);
+
+ vui->log_base_addr = mmap (0, map_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+
+ DBG_SOCK
+ ("map log region addr 0 len 0x%lx off 0x%lx fd %d mapped 0x%lx",
+ map_sz, msg.log.offset, fd, vui->log_base_addr);
+
+ if (vui->log_base_addr == MAP_FAILED)
+ {
+ clib_warning ("failed to map memory. errno is %d", errno);
+ goto close_socket;
+ }
+
+ vui->log_base_addr += msg.log.offset;
+ vui->log_size = msg.log.size;
+
+ msg.flags |= 4;
+ msg.size = sizeof (msg.u64);
+
+ break;
+ }
+
+ case VHOST_USER_SET_LOG_FD:
+ DBG_SOCK ("if %d msg VHOST_USER_SET_LOG_FD", vui->hw_if_index);
+
+ break;
+
+ case VHOST_USER_GET_PROTOCOL_FEATURES:
+ DBG_SOCK ("if %d msg VHOST_USER_GET_PROTOCOL_FEATURES",
+ vui->hw_if_index);
+
+ msg.flags |= 4;
+ msg.u64 = (1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |
+ (1 << VHOST_USER_PROTOCOL_F_MQ);
+ msg.size = sizeof (msg.u64);
+ break;
+
+ case VHOST_USER_SET_PROTOCOL_FEATURES:
+ DBG_SOCK ("if %d msg VHOST_USER_SET_PROTOCOL_FEATURES features 0x%lx",
+ vui->hw_if_index, msg.u64);
+
+ vui->protocol_features = msg.u64;
+
+ break;
+
+ case VHOST_USER_GET_QUEUE_NUM:
+ DBG_SOCK ("if %d msg VHOST_USER_GET_QUEUE_NUM", vui->hw_if_index);
+ msg.flags |= 4;
+ msg.u64 = VHOST_VRING_MAX_N;
+ msg.size = sizeof (msg.u64);
+ break;
+
+ case VHOST_USER_SET_VRING_ENABLE:
+ DBG_SOCK ("if %d VHOST_USER_SET_VRING_ENABLE: %s queue %d",
+ vui->hw_if_index, msg.state.num ? "enable" : "disable",
+ msg.state.index);
+ if (msg.state.index >= VHOST_VRING_MAX_N)
+ {
+ DBG_SOCK ("invalid vring index VHOST_USER_SET_VRING_ENABLE:"
+ " %d >= %d", msg.state.index, VHOST_VRING_MAX_N);
+ goto close_socket;
+ }
+
+ vui->vrings[msg.state.index].enabled = msg.state.num;
+ break;
+
+ default:
+ DBG_SOCK ("unknown vhost-user message %d received. closing socket",
+ msg.request);
+ goto close_socket;
+ }
+
+ /* if we need to reply */
+ if (msg.flags & 4)
+ {
+ n =
+ send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0);
+ if (n != (msg.size + VHOST_USER_MSG_HDR_SZ))
+ {
+ DBG_SOCK ("could not send message response");
+ goto close_socket;
+ }
+ }
+
+ vhost_user_update_iface_state (vui);
+ vlib_worker_thread_barrier_release (vlib_get_main ());
+ return 0;
+
+close_socket:
+ vhost_user_if_disconnect (vui);
+ vhost_user_update_iface_state (vui);
+ vlib_worker_thread_barrier_release (vlib_get_main ());
+ return 0;
+}
+
+static clib_error_t *
+vhost_user_socket_error (unix_file_t * uf)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ vhost_user_main_t *vum = &vhost_user_main;
+ vhost_user_intf_t *vui =
+ pool_elt_at_index (vum->vhost_user_interfaces, uf->private_data);
+
+ DBG_SOCK ("socket error on if %d", vui->sw_if_index);
+ vlib_worker_thread_barrier_sync (vm);
+ vhost_user_if_disconnect (vui);
+ vhost_user_rx_thread_placement ();
+ vlib_worker_thread_barrier_release (vm);
+ return 0;
+}
+
+static clib_error_t *
+vhost_user_socksvr_accept_ready (unix_file_t * uf)
+{
+ int client_fd, client_len;
+ struct sockaddr_un client;
+ unix_file_t template = { 0 };
+ vhost_user_main_t *vum = &vhost_user_main;
+ vhost_user_intf_t *vui;
+
+ vui = pool_elt_at_index (vum->vhost_user_interfaces, uf->private_data);
+
+ client_len = sizeof (client);
+ client_fd = accept (uf->file_descriptor,
+ (struct sockaddr *) &client,
+ (socklen_t *) & client_len);
+
+ if (client_fd < 0)
+ return clib_error_return_unix (0, "accept");
+
+ DBG_SOCK ("New client socket for vhost interface %d", vui->sw_if_index);
+ template.read_function = vhost_user_socket_read;
+ template.error_function = vhost_user_socket_error;
+ template.file_descriptor = client_fd;
+ template.private_data = vui - vhost_user_main.vhost_user_interfaces;
+ vui->unix_file_index = unix_file_add (&unix_main, &template);
+ return 0;
+}
+
+static clib_error_t *
+vhost_user_init (vlib_main_t * vm)
+{
+ clib_error_t *error;
+ vhost_user_main_t *vum = &vhost_user_main;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ vlib_thread_registration_t *tr;
+ uword *p;
+
+ error = vlib_call_init_function (vm, ip4_init);
+ if (error)
+ return error;
+
+ vum->coalesce_frames = 32;
+ vum->coalesce_time = 1e-3;
+
+ vec_validate (vum->cpus, tm->n_vlib_mains - 1);
+
+ vhost_cpu_t *cpu;
+ vec_foreach (cpu, vum->cpus)
+ {
+ /* This is actually not necessary as validate already zeroes it
+ * Just keeping the loop here for later because I am lazy. */
+ cpu->rx_buffers_len = 0;
+ }
+
+ /* find out which cpus will be used for input */
+ vum->input_cpu_first_index = 0;
+ vum->input_cpu_count = 1;
+ p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ tr = p ? (vlib_thread_registration_t *) p[0] : 0;
+
+ if (tr && tr->count > 0)
+ {
+ vum->input_cpu_first_index = tr->first_index;
+ vum->input_cpu_count = tr->count;
+ }
+
+ vum->random = random_default_seed ();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (vhost_user_init);
+
+static clib_error_t *
+vhost_user_exit (vlib_main_t * vm)
+{
+ /* TODO cleanup */
+ return 0;
+}
+
+VLIB_MAIN_LOOP_EXIT_FUNCTION (vhost_user_exit);
+
+static u8 *
+format_vhost_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main ();
+ vhost_user_main_t *vum = &vhost_user_main;
+ vhost_trace_t *t = va_arg (*va, vhost_trace_t *);
+ vhost_user_intf_t *vui = pool_elt_at_index (vum->vhost_user_interfaces,
+ t->device_index);
+
+ vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, vui->sw_if_index);
+
+ uword indent = format_get_indent (s);
+
+ s = format (s, "%U %U queue %d\n", format_white_space, indent,
+ format_vnet_sw_interface_name, vnm, sw, t->qid);
+
+ s = format (s, "%U virtio flags:\n", format_white_space, indent);
+#define _(n,i,st) \
+ if (t->virtio_ring_flags & (1 << VIRTIO_TRACE_F_##n)) \
+ s = format (s, "%U %s %s\n", format_white_space, indent, #n, st);
+ foreach_virtio_trace_flags
+#undef _
+ s = format (s, "%U virtio_net_hdr first_desc_len %u\n",
+ format_white_space, indent, t->first_desc_len);
+
+ s = format (s, "%U flags 0x%02x gso_type %u\n",
+ format_white_space, indent,
+ t->hdr.hdr.flags, t->hdr.hdr.gso_type);
+
+ if (vui->virtio_net_hdr_sz == 12)
+ s = format (s, "%U num_buff %u",
+ format_white_space, indent, t->hdr.num_buffers);
+
+ return s;
+}
+
+void
+vhost_user_rx_trace (vhost_trace_t * t,
+ vhost_user_intf_t * vui, u16 qid,
+ vlib_buffer_t * b, vhost_user_vring_t * txvq)
+{
+ vhost_user_main_t *vum = &vhost_user_main;
+ u32 qsz_mask = txvq->qsz - 1;
+ u32 last_avail_idx = txvq->last_avail_idx;
+ u32 desc_current = txvq->avail->ring[last_avail_idx & qsz_mask];
+ vring_desc_t *hdr_desc = 0;
+ virtio_net_hdr_mrg_rxbuf_t *hdr;
+ u32 hint = 0;
+
+ memset (t, 0, sizeof (*t));
+ t->device_index = vui - vum->vhost_user_interfaces;
+ t->qid = qid;
+
+ hdr_desc = &txvq->desc[desc_current];
+ if (txvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT)
+ {
+ t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_INDIRECT;
+ /* Header is the first here */
+ hdr_desc = map_guest_mem (vui, txvq->desc[desc_current].addr, &hint);
+ }
+ if (txvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT)
+ {
+ t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SIMPLE_CHAINED;
+ }
+ if (!(txvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT) &&
+ !(txvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT))
+ {
+ t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SINGLE_DESC;
+ }
+
+ t->first_desc_len = hdr_desc ? hdr_desc->len : 0;
+
+ if (!hdr_desc || !(hdr = map_guest_mem (vui, hdr_desc->addr, &hint)))
+ {
+ t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_MAP_ERROR;
+ }
+ else
+ {
+ u32 len = vui->virtio_net_hdr_sz;
+ memcpy (&t->hdr, hdr, len > hdr_desc->len ? hdr_desc->len : len);
+ }
+}
+
+static inline void
+vhost_user_send_call (vlib_main_t * vm, vhost_user_vring_t * vq)
+{
+ vhost_user_main_t *vum = &vhost_user_main;
+ u64 x = 1;
+ int fd = UNIX_GET_FD (vq->callfd_idx);
+ int rv __attribute__ ((unused));
+ /* TODO: pay attention to rv */
+ rv = write (fd, &x, sizeof (x));
+ vq->n_since_last_int = 0;
+ vq->int_deadline = vlib_time_now (vm) + vum->coalesce_time;
+}
+
+static_always_inline u32
+vhost_user_input_copy (vhost_user_intf_t * vui, vhost_copy_t * cpy,
+ u16 copy_len, u32 * map_hint)
+{
+ void *src0, *src1, *src2, *src3;
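+  /*
+   * Software-pipelined copy: map and prefetch the guest sources two
+   * entries ahead while executing the current pair of copies, then
+   * finish any remainder one copy at a time.
+   */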
+ if (PREDICT_TRUE (copy_len >= 4))
+ {
+ if (PREDICT_FALSE (!(src2 = map_guest_mem (vui, cpy[0].src, map_hint))))
+ return 1;
+ if (PREDICT_FALSE (!(src3 = map_guest_mem (vui, cpy[1].src, map_hint))))
+ return 1;
+
+ while (PREDICT_TRUE (copy_len >= 4))
+ {
+ src0 = src2;
+ src1 = src3;
+
+ if (PREDICT_FALSE
+ (!(src2 = map_guest_mem (vui, cpy[2].src, map_hint))))
+ return 1;
+ if (PREDICT_FALSE
+ (!(src3 = map_guest_mem (vui, cpy[3].src, map_hint))))
+ return 1;
+
+ CLIB_PREFETCH (src2, 64, LOAD);
+ CLIB_PREFETCH (src3, 64, LOAD);
+
+ clib_memcpy ((void *) cpy[0].dst, src0, cpy[0].len);
+ clib_memcpy ((void *) cpy[1].dst, src1, cpy[1].len);
+ copy_len -= 2;
+ cpy += 2;
+ }
+ }
+ while (copy_len)
+ {
+ if (PREDICT_FALSE (!(src0 = map_guest_mem (vui, cpy->src, map_hint))))
+ return 1;
+ clib_memcpy ((void *) cpy->dst, src0, cpy->len);
+ copy_len -= 1;
+ cpy += 1;
+ }
+ return 0;
+}
+
+/**
+ * Try to discard packets from the tx ring (VPP RX path).
+ * Returns the number of discarded packets.
+ */
+u32
+vhost_user_rx_discard_packet (vlib_main_t * vm,
+ vhost_user_intf_t * vui,
+ vhost_user_vring_t * txvq, u32 discard_max)
+{
+ /*
+ * On the RX side, each packet corresponds to one descriptor
+   * (this holds whether the descriptor is simple, chained, or indirect).
+ * Therefore, discarding a packet is like discarding a descriptor.
+ */
+ u32 discarded_packets = 0;
+ u32 avail_idx = txvq->avail->idx;
+ u16 qsz_mask = txvq->qsz - 1;
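+  /*
+   * Each discarded descriptor chain is returned to the guest right away
+   * as a zero-length used-ring entry.
+   */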
+ while (discarded_packets != discard_max)
+ {
+ if (avail_idx == txvq->last_avail_idx)
+ goto out;
+
+ u16 desc_chain_head =
+ txvq->avail->ring[txvq->last_avail_idx & qsz_mask];
+ txvq->last_avail_idx++;
+ txvq->used->ring[txvq->last_used_idx & qsz_mask].id = desc_chain_head;
+ txvq->used->ring[txvq->last_used_idx & qsz_mask].len = 0;
+ vhost_user_log_dirty_ring (vui, txvq,
+ ring[txvq->last_used_idx & qsz_mask]);
+ txvq->last_used_idx++;
+ discarded_packets++;
+ }
+
+out:
+ CLIB_MEMORY_BARRIER ();
+ txvq->used->idx = txvq->last_used_idx;
+ vhost_user_log_dirty_ring (vui, txvq, idx);
+ return discarded_packets;
+}
+
+/*
+ * In case of overflow, we need to rewind the array of allocated buffers.
+ */
+static void
+vhost_user_input_rewind_buffers (vlib_main_t * vm,
+ vhost_cpu_t * cpu, vlib_buffer_t * b_head)
+{
+ u32 bi_current = cpu->rx_buffers[cpu->rx_buffers_len];
+ vlib_buffer_t *b_current = vlib_get_buffer (vm, bi_current);
+ b_current->current_length = 0;
+ b_current->flags = 0;
+ while (b_current != b_head)
+ {
+ cpu->rx_buffers_len++;
+ bi_current = cpu->rx_buffers[cpu->rx_buffers_len];
+ b_current = vlib_get_buffer (vm, bi_current);
+ b_current->current_length = 0;
+ b_current->flags = 0;
+ }
+}
+
+static u32
+vhost_user_if_input (vlib_main_t * vm,
+ vhost_user_main_t * vum,
+ vhost_user_intf_t * vui,
+ u16 qid, vlib_node_runtime_t * node)
+{
+ vhost_user_vring_t *txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)];
+ u16 n_rx_packets = 0;
+ u32 n_rx_bytes = 0;
+ u16 n_left;
+ u32 n_left_to_next, *to_next;
+ u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+ u32 n_trace = vlib_get_trace_count (vm, node);
+ u16 qsz_mask;
+ u32 map_hint = 0;
+ u16 cpu_index = os_get_cpu_number ();
+ u16 copy_len = 0;
+
+ {
+ /* do we have pending interrupts ? */
+ vhost_user_vring_t *rxvq = &vui->vrings[VHOST_VRING_IDX_RX (qid)];
+ f64 now = vlib_time_now (vm);
+
+ if ((txvq->n_since_last_int) && (txvq->int_deadline < now))
+ vhost_user_send_call (vm, txvq);
+
+ if ((rxvq->n_since_last_int) && (rxvq->int_deadline < now))
+ vhost_user_send_call (vm, rxvq);
+ }
+
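+  /*
+   * Only bit 0 (VRING_AVAIL_F_NO_INTERRUPT) of avail->flags is defined;
+   * treat any other bit as a sign of a bogus ring and do nothing.
+   */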
+ if (PREDICT_FALSE (txvq->avail->flags & 0xFFFE))
+ return 0;
+
+ n_left = (u16) (txvq->avail->idx - txvq->last_avail_idx);
+
+ /* nothing to do */
+ if (PREDICT_FALSE (n_left == 0))
+ return 0;
+
+ if (PREDICT_FALSE (!vui->admin_up || !(txvq->enabled)))
+ {
+ /*
+ * Discard input packet if interface is admin down or vring is not
+ * enabled.
+ * "For example, for a networking device, in the disabled state
+ * client must not supply any new RX packets, but must process
+ * and discard any TX packets."
+ */
+ vhost_user_rx_discard_packet (vm, vui, txvq,
+ VHOST_USER_DOWN_DISCARD_COUNT);
+ return 0;
+ }
+
+ if (PREDICT_FALSE (n_left == txvq->qsz))
+ {
+ /*
+ * Informational error logging when VPP is not
+ * receiving packets fast enough.
+ */
+ vlib_error_count (vm, node->node_index,
+ VHOST_USER_INPUT_FUNC_ERROR_FULL_RX_QUEUE, 1);
+ }
+
+ qsz_mask = txvq->qsz - 1;
+
+ if (n_left > VLIB_FRAME_SIZE)
+ n_left = VLIB_FRAME_SIZE;
+
+ /*
+ * For small packets (<2kB), we will not need more than one vlib buffer
+   * per packet. In case packets are bigger, we will just yield at some
+   * point in the loop and come back later. This is not an issue since,
+   * for big packets, the processing cost really comes from the memory copy.
+ */
+ if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len < n_left + 1))
+ {
+ u32 curr_len = vum->cpus[cpu_index].rx_buffers_len;
+ vum->cpus[cpu_index].rx_buffers_len +=
+ vlib_buffer_alloc_from_free_list (vm,
+ vum->cpus[cpu_index].rx_buffers +
+ curr_len,
+ VHOST_USER_RX_BUFFERS_N - curr_len,
+ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ if (PREDICT_FALSE
+ (vum->cpus[cpu_index].rx_buffers_len <
+ VHOST_USER_RX_BUFFER_STARVATION))
+ {
+ /* In case of buffer starvation, discard some packets from the queue
+ * and log the event.
+ * We keep doing best effort for the remaining packets. */
+ u32 flush = (n_left + 1 > vum->cpus[cpu_index].rx_buffers_len) ?
+ n_left + 1 - vum->cpus[cpu_index].rx_buffers_len : 1;
+ flush = vhost_user_rx_discard_packet (vm, vui, txvq, flush);
+
+ n_left -= flush;
+ vlib_increment_simple_counter (vnet_main.
+ interface_main.sw_if_counters +
+ VNET_INTERFACE_COUNTER_DROP,
+ os_get_cpu_number (),
+ vui->sw_if_index, flush);
+
+ vlib_error_count (vm, vhost_user_input_node.index,
+ VHOST_USER_INPUT_FUNC_ERROR_NO_BUFFER, flush);
+ }
+ }
+
+ while (n_left > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t *b_head, *b_current;
+ u32 bi_current;
+ u16 desc_current;
+ u32 desc_data_offset;
+ vring_desc_t *desc_table = txvq->desc;
+
+ if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len <= 1))
+ {
+ /* Not enough rx_buffers
+	       * Note: We yield at 1 so we don't need to do an additional
+ * check for the next buffer prefetch.
+ */
+ n_left = 0;
+ break;
+ }
+
+ desc_current = txvq->avail->ring[txvq->last_avail_idx & qsz_mask];
+ vum->cpus[cpu_index].rx_buffers_len--;
+ bi_current = (vum->cpus[cpu_index].rx_buffers)
+ [vum->cpus[cpu_index].rx_buffers_len];
+ b_head = b_current = vlib_get_buffer (vm, bi_current);
+ to_next[0] = bi_current; //We do that now so we can forget about bi_current
+ to_next++;
+ n_left_to_next--;
+
+ vlib_prefetch_buffer_with_index (vm,
+ (vum->cpus[cpu_index].rx_buffers)
+ [vum->cpus[cpu_index].
+ rx_buffers_len - 1], LOAD);
+
+ /* Just preset the used descriptor id and length for later */
+ txvq->used->ring[txvq->last_used_idx & qsz_mask].id = desc_current;
+ txvq->used->ring[txvq->last_used_idx & qsz_mask].len = 0;
+ vhost_user_log_dirty_ring (vui, txvq,
+ ring[txvq->last_used_idx & qsz_mask]);
+
+ /* The buffer should already be initialized */
+ b_head->total_length_not_including_first_buffer = 0;
+ b_head->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ if (PREDICT_FALSE (n_trace))
+ {
+ //TODO: next_index is not exactly known at that point
+ vlib_trace_buffer (vm, node, next_index, b_head,
+ /* follow_chain */ 0);
+ vhost_trace_t *t0 =
+ vlib_add_trace (vm, node, b_head, sizeof (t0[0]));
+ vhost_user_rx_trace (t0, vui, qid, b_head, txvq);
+ n_trace--;
+ vlib_set_trace_count (vm, node, n_trace);
+ }
+
+	  /* This depends on the setup but is very consistent,
+	   * so the CPU branch predictor should do a pretty good job
+	   * of optimizing the decision. */
+ if (txvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT)
+ {
+ desc_table = map_guest_mem (vui, txvq->desc[desc_current].addr,
+ &map_hint);
+ desc_current = 0;
+ if (PREDICT_FALSE (desc_table == 0))
+ {
+		      //FIXME: Handle the error by shutting down the queue
+ goto out;
+ }
+ }
+
+ if (PREDICT_TRUE (vui->is_any_layout) ||
+ (!(desc_table[desc_current].flags & VIRTQ_DESC_F_NEXT)))
+ {
+ /* ANYLAYOUT or single buffer */
+ desc_data_offset = vui->virtio_net_hdr_sz;
+ }
+ else
+ {
+ /* CSR case without ANYLAYOUT, skip 1st buffer */
+ desc_data_offset = desc_table[desc_current].len;
+ }
+
+ while (1)
+ {
+ /* Get more input if necessary. Or end of packet. */
+ if (desc_data_offset == desc_table[desc_current].len)
+ {
+ if (PREDICT_FALSE (desc_table[desc_current].flags &
+ VIRTQ_DESC_F_NEXT))
+ {
+ desc_current = desc_table[desc_current].next;
+ desc_data_offset = 0;
+ }
+ else
+ {
+ goto out;
+ }
+ }
+
+ /* Get more output if necessary. Or end of packet. */
+ if (PREDICT_FALSE
+ (b_current->current_length == VLIB_BUFFER_DATA_SIZE))
+ {
+ if (PREDICT_FALSE
+ (vum->cpus[cpu_index].rx_buffers_len == 0))
+ {
+ /*
+		       * Check whether any buffers are left.
+ * If not, just rewind the used buffers and stop.
+ * Note: Scheduled copies are not cancelled. This is
+ * not an issue as they would still be valid. Useless,
+ * but valid.
+ */
+ vhost_user_input_rewind_buffers (vm,
+ &vum->cpus[cpu_index],
+ b_head);
+ n_left = 0;
+ goto stop;
+ }
+
+ /* Get next output */
+ vum->cpus[cpu_index].rx_buffers_len--;
+ u32 bi_next =
+ (vum->cpus[cpu_index].rx_buffers)[vum->cpus
+ [cpu_index].rx_buffers_len];
+ b_current->next_buffer = bi_next;
+ b_current->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ bi_current = bi_next;
+ b_current = vlib_get_buffer (vm, bi_current);
+ }
+
+ /* Prepare a copy order executed later for the data */
+ vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len];
+ copy_len++;
+ u32 desc_data_l =
+ desc_table[desc_current].len - desc_data_offset;
+ cpy->len = VLIB_BUFFER_DATA_SIZE - b_current->current_length;
+ cpy->len = (cpy->len > desc_data_l) ? desc_data_l : cpy->len;
+ cpy->dst = (uword) vlib_buffer_get_current (b_current);
+ cpy->src = desc_table[desc_current].addr + desc_data_offset;
+
+ desc_data_offset += cpy->len;
+
+ b_current->current_length += cpy->len;
+ b_head->total_length_not_including_first_buffer += cpy->len;
+ }
+
+ out:
+ CLIB_PREFETCH (&n_left, sizeof (n_left), LOAD);
+
+ n_rx_bytes += b_head->total_length_not_including_first_buffer;
+ n_rx_packets++;
+
+ b_head->total_length_not_including_first_buffer -=
+ b_head->current_length;
+
+ /* consume the descriptor and return it as used */
+ txvq->last_avail_idx++;
+ txvq->last_used_idx++;
+
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b_head);
+
+ vnet_buffer (b_head)->sw_if_index[VLIB_RX] = vui->sw_if_index;
+ vnet_buffer (b_head)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ b_head->error = 0;
+
+ {
+ u32 next0 = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+
+ /* redirect if feature path enabled */
+ vnet_feature_start_device_input_x1 (vui->sw_if_index, &next0,
+ b_head, 0);
+
+ u32 bi = to_next[-1]; //Cannot use to_next[-1] in the macro
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi, next0);
+ }
+
+ n_left--;
+
+	  /*
+	   * Although separating memory copies from virtio ring parsing
+	   * is beneficial, the copies are performed from time to time in
+	   * order to return used descriptors to the driver and to keep the
+	   * copy array from overflowing.
+	   */
+ if (PREDICT_FALSE (copy_len >= VHOST_USER_RX_COPY_THRESHOLD))
+ {
+ if (PREDICT_FALSE
+ (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy,
+ copy_len, &map_hint)))
+ {
+ clib_warning
+ ("Memory mapping error on interface hw_if_index=%d "
+ "(Shutting down - Switch interface down and up to restart)",
+ vui->hw_if_index);
+ vui->admin_up = 0;
+ copy_len = 0;
+ break;
+ }
+ copy_len = 0;
+
+ /* give buffers back to driver */
+ CLIB_MEMORY_BARRIER ();
+ txvq->used->idx = txvq->last_used_idx;
+ vhost_user_log_dirty_ring (vui, txvq, idx);
+ }
+ }
+ stop:
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ /* Do the memory copies */
+ if (PREDICT_FALSE
+ (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy,
+ copy_len, &map_hint)))
+ {
+ clib_warning ("Memory mapping error on interface hw_if_index=%d "
+ "(Shutting down - Switch interface down and up to restart)",
+ vui->hw_if_index);
+ vui->admin_up = 0;
+ }
+
+ /* give buffers back to driver */
+ CLIB_MEMORY_BARRIER ();
+ txvq->used->idx = txvq->last_used_idx;
+ vhost_user_log_dirty_ring (vui, txvq, idx);
+
+ /* interrupt (call) handling */
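+  /* Bit 0 of avail->flags is VRING_AVAIL_F_NO_INTERRUPT: only signal the
+   * guest through the call file descriptor when interrupt suppression
+   * was not requested. */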
+ if ((txvq->callfd_idx != ~0) && !(txvq->avail->flags & 1))
+ {
+ txvq->n_since_last_int += n_rx_packets;
+
+ if (txvq->n_since_last_int > vum->coalesce_frames)
+ vhost_user_send_call (vm, txvq);
+ }
+
+ /* increase rx counters */
+ vlib_increment_combined_counter
+ (vnet_main.interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ os_get_cpu_number (), vui->sw_if_index, n_rx_packets, n_rx_bytes);
+
+ return n_rx_packets;
+}
+
+static uword
+vhost_user_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ vhost_user_main_t *vum = &vhost_user_main;
+ uword n_rx_packets = 0;
+ u32 cpu_index = os_get_cpu_number ();
+
+
+ vhost_iface_and_queue_t *vhiq;
+ vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues)
+ {
+ vhost_user_intf_t *vui =
+ &vum->vhost_user_interfaces[vhiq->vhost_iface_index];
+ n_rx_packets += vhost_user_if_input (vm, vum, vui, vhiq->qid, node);
+ }
+
+ return n_rx_packets;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (vhost_user_input_node) = {
+ .function = vhost_user_input,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .name = "vhost-user-input",
+ .sibling_of = "device-input",
+
+ /* Will be enabled if/when hardware is detected. */
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ .format_buffer = format_ethernet_header_with_length,
+ .format_trace = format_vhost_trace,
+
+ .n_errors = VHOST_USER_INPUT_FUNC_N_ERROR,
+ .error_strings = vhost_user_input_func_error_strings,
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (vhost_user_input_node, vhost_user_input)
+/* *INDENT-ON* */
+
+
+void
+vhost_user_tx_trace (vhost_trace_t * t,
+ vhost_user_intf_t * vui, u16 qid,
+ vlib_buffer_t * b, vhost_user_vring_t * rxvq)
+{
+ vhost_user_main_t *vum = &vhost_user_main;
+ u32 qsz_mask = rxvq->qsz - 1;
+ u32 last_avail_idx = rxvq->last_avail_idx;
+ u32 desc_current = rxvq->avail->ring[last_avail_idx & qsz_mask];
+ vring_desc_t *hdr_desc = 0;
+ u32 hint = 0;
+
+ memset (t, 0, sizeof (*t));
+ t->device_index = vui - vum->vhost_user_interfaces;
+ t->qid = qid;
+
+ hdr_desc = &rxvq->desc[desc_current];
+ if (rxvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT)
+ {
+ t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_INDIRECT;
+ /* Header is the first here */
+ hdr_desc = map_guest_mem (vui, rxvq->desc[desc_current].addr, &hint);
+ }
+ if (rxvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT)
+ {
+ t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SIMPLE_CHAINED;
+ }
+ if (!(rxvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT) &&
+ !(rxvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT))
+ {
+ t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SINGLE_DESC;
+ }
+
+ t->first_desc_len = hdr_desc ? hdr_desc->len : 0;
+}
+
+static_always_inline u32
+vhost_user_tx_copy (vhost_user_intf_t * vui, vhost_copy_t * cpy,
+ u16 copy_len, u32 * map_hint)
+{
+ void *dst0, *dst1, *dst2, *dst3;
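+  /* Guest mappings for entries n+2 and n+3 are resolved while the copies
+   * for entries n and n+1 are performed, so the map_guest_mem() cost
+   * overlaps with the memcpy work. */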
+ if (PREDICT_TRUE (copy_len >= 4))
+ {
+ if (PREDICT_FALSE (!(dst2 = map_guest_mem (vui, cpy[0].dst, map_hint))))
+ return 1;
+ if (PREDICT_FALSE (!(dst3 = map_guest_mem (vui, cpy[1].dst, map_hint))))
+ return 1;
+ while (PREDICT_TRUE (copy_len >= 4))
+ {
+ dst0 = dst2;
+ dst1 = dst3;
+
+ if (PREDICT_FALSE
+ (!(dst2 = map_guest_mem (vui, cpy[2].dst, map_hint))))
+ return 1;
+ if (PREDICT_FALSE
+ (!(dst3 = map_guest_mem (vui, cpy[3].dst, map_hint))))
+ return 1;
+
+ CLIB_PREFETCH ((void *) cpy[2].src, 64, LOAD);
+ CLIB_PREFETCH ((void *) cpy[3].src, 64, LOAD);
+
+ clib_memcpy (dst0, (void *) cpy[0].src, cpy[0].len);
+ clib_memcpy (dst1, (void *) cpy[1].src, cpy[1].len);
+
+ vhost_user_log_dirty_pages_2 (vui, cpy[0].dst, cpy[0].len, 1);
+ vhost_user_log_dirty_pages_2 (vui, cpy[1].dst, cpy[1].len, 1);
+ copy_len -= 2;
+ cpy += 2;
+ }
+ }
+ while (copy_len)
+ {
+ if (PREDICT_FALSE (!(dst0 = map_guest_mem (vui, cpy->dst, map_hint))))
+ return 1;
+ clib_memcpy (dst0, (void *) cpy->src, cpy->len);
+ vhost_user_log_dirty_pages_2 (vui, cpy->dst, cpy->len, 1);
+ copy_len -= 1;
+ cpy += 1;
+ }
+ return 0;
+}
+
+
+static uword
+vhost_user_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * frame)
+{
+ u32 *buffers = vlib_frame_args (frame);
+ u32 n_left = frame->n_vectors;
+ vhost_user_main_t *vum = &vhost_user_main;
+ vnet_interface_output_runtime_t *rd = (void *) node->runtime_data;
+ vhost_user_intf_t *vui =
+ pool_elt_at_index (vum->vhost_user_interfaces, rd->dev_instance);
+ u32 qid = ~0;
+ vhost_user_vring_t *rxvq;
+ u16 qsz_mask;
+ u8 error;
+ u32 cpu_index = os_get_cpu_number ();
+ u32 map_hint = 0;
+ u8 retry = 8;
+ u16 copy_len;
+ u16 tx_headers_len;
+
+ if (PREDICT_FALSE (!vui->admin_up))
+ {
+ error = VHOST_USER_TX_FUNC_ERROR_DOWN;
+ goto done3;
+ }
+
+ if (PREDICT_FALSE (!vui->is_up))
+ {
+ error = VHOST_USER_TX_FUNC_ERROR_NOT_READY;
+ goto done3;
+ }
+
+ qid =
+ VHOST_VRING_IDX_RX (*vec_elt_at_index
+ (vui->per_cpu_tx_qid, os_get_cpu_number ()));
+ rxvq = &vui->vrings[qid];
+ if (PREDICT_FALSE (vui->use_tx_spinlock))
+ vhost_user_vring_lock (vui, qid);
+
+ qsz_mask = rxvq->qsz - 1; /* qsz is always power of 2 */
+
+retry:
+ error = VHOST_USER_TX_FUNC_ERROR_NONE;
+ tx_headers_len = 0;
+ copy_len = 0;
+ while (n_left > 0)
+ {
+ vlib_buffer_t *b0, *current_b0;
+ u16 desc_head, desc_index, desc_len;
+ vring_desc_t *desc_table;
+ uword buffer_map_addr;
+ u32 buffer_len;
+ u16 bytes_left;
+
+ if (PREDICT_TRUE (n_left > 1))
+ vlib_prefetch_buffer_with_index (vm, buffers[1], LOAD);
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+
+ if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vum->cpus[cpu_index].current_trace =
+ vlib_add_trace (vm, node, b0,
+ sizeof (*vum->cpus[cpu_index].current_trace));
+ vhost_user_tx_trace (vum->cpus[cpu_index].current_trace,
+ vui, qid / 2, b0, rxvq);
+ }
+
+ if (PREDICT_FALSE (rxvq->last_avail_idx == rxvq->avail->idx))
+ {
+ error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF;
+ goto done;
+ }
+
+ desc_table = rxvq->desc;
+ desc_head = desc_index =
+ rxvq->avail->ring[rxvq->last_avail_idx & qsz_mask];
+
+      /* Go deeper in case of an indirect descriptor.
+       * No known driver provides indirect descriptors for RX. */
+ if (PREDICT_FALSE (rxvq->desc[desc_head].flags & VIRTQ_DESC_F_INDIRECT))
+ {
+ if (PREDICT_FALSE
+ (rxvq->desc[desc_head].len < sizeof (vring_desc_t)))
+ {
+ error = VHOST_USER_TX_FUNC_ERROR_INDIRECT_OVERFLOW;
+ goto done;
+ }
+ if (PREDICT_FALSE
+ (!(desc_table =
+ map_guest_mem (vui, rxvq->desc[desc_index].addr,
+ &map_hint))))
+ {
+ error = VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL;
+ goto done;
+ }
+ desc_index = 0;
+ }
+
+ desc_len = vui->virtio_net_hdr_sz;
+ buffer_map_addr = desc_table[desc_index].addr;
+ buffer_len = desc_table[desc_index].len;
+
+ {
+ // Get a header from the header array
+ virtio_net_hdr_mrg_rxbuf_t *hdr =
+ &vum->cpus[cpu_index].tx_headers[tx_headers_len];
+ tx_headers_len++;
+ hdr->hdr.flags = 0;
+ hdr->hdr.gso_type = 0;
+ hdr->num_buffers = 1; //This is local, no need to check
+
+ // Prepare a copy order executed later for the header
+ vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len];
+ copy_len++;
+ cpy->len = vui->virtio_net_hdr_sz;
+ cpy->dst = buffer_map_addr;
+ cpy->src = (uword) hdr;
+ }
+
+ buffer_map_addr += vui->virtio_net_hdr_sz;
+ buffer_len -= vui->virtio_net_hdr_sz;
+ bytes_left = b0->current_length;
+ current_b0 = b0;
+ while (1)
+ {
+ if (buffer_len == 0)
+ { //Get new output
+ if (desc_table[desc_index].flags & VIRTQ_DESC_F_NEXT)
+ {
+ //Next one is chained
+ desc_index = desc_table[desc_index].next;
+ buffer_map_addr = desc_table[desc_index].addr;
+ buffer_len = desc_table[desc_index].len;
+ }
+ else if (vui->virtio_net_hdr_sz == 12) //MRG is available
+ {
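+		  /* A 12-byte virtio net header means VIRTIO_NET_F_MRG_RXBUF
+		   * was negotiated, so a packet may span several descriptor
+		   * chains; num_buffers in the header counts them. */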
+ virtio_net_hdr_mrg_rxbuf_t *hdr =
+ &vum->cpus[cpu_index].tx_headers[tx_headers_len - 1];
+
+ //Move from available to used buffer
+ rxvq->used->ring[rxvq->last_used_idx & qsz_mask].id =
+ desc_head;
+ rxvq->used->ring[rxvq->last_used_idx & qsz_mask].len =
+ desc_len;
+ vhost_user_log_dirty_ring (vui, rxvq,
+ ring[rxvq->last_used_idx &
+ qsz_mask]);
+
+ rxvq->last_avail_idx++;
+ rxvq->last_used_idx++;
+ hdr->num_buffers++;
+ desc_len = 0;
+
+ if (PREDICT_FALSE
+ (rxvq->last_avail_idx == rxvq->avail->idx))
+ {
+ //Dequeue queued descriptors for this packet
+ rxvq->last_used_idx -= hdr->num_buffers - 1;
+ rxvq->last_avail_idx -= hdr->num_buffers - 1;
+ error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF;
+ goto done;
+ }
+
+ desc_table = rxvq->desc;
+ desc_head = desc_index =
+ rxvq->avail->ring[rxvq->last_avail_idx & qsz_mask];
+ if (PREDICT_FALSE
+ (rxvq->desc[desc_head].flags & VIRTQ_DESC_F_INDIRECT))
+ {
+		      //It is seriously unlikely that a driver will put an
+		      //indirect descriptor after a non-indirect descriptor.
+ if (PREDICT_FALSE
+ (rxvq->desc[desc_head].len < sizeof (vring_desc_t)))
+ {
+ error = VHOST_USER_TX_FUNC_ERROR_INDIRECT_OVERFLOW;
+ goto done;
+ }
+ if (PREDICT_FALSE
+ (!(desc_table =
+ map_guest_mem (vui,
+ rxvq->desc[desc_index].addr,
+ &map_hint))))
+ {
+ error = VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL;
+ goto done;
+ }
+ desc_index = 0;
+ }
+ buffer_map_addr = desc_table[desc_index].addr;
+ buffer_len = desc_table[desc_index].len;
+ }
+ else
+ {
+ error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOMRG;
+ goto done;
+ }
+ }
+
+ {
+ vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len];
+ copy_len++;
+ cpy->len = bytes_left;
+ cpy->len = (cpy->len > buffer_len) ? buffer_len : cpy->len;
+ cpy->dst = buffer_map_addr;
+ cpy->src = (uword) vlib_buffer_get_current (current_b0) +
+ current_b0->current_length - bytes_left;
+
+ bytes_left -= cpy->len;
+ buffer_len -= cpy->len;
+ buffer_map_addr += cpy->len;
+ desc_len += cpy->len;
+
+ CLIB_PREFETCH (&rxvq->desc, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ // Check if vlib buffer has more data. If not, get more or break.
+ if (PREDICT_TRUE (!bytes_left))
+ {
+ if (PREDICT_FALSE
+ (current_b0->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ current_b0 = vlib_get_buffer (vm, current_b0->next_buffer);
+ bytes_left = current_b0->current_length;
+ }
+ else
+ {
+ //End of packet
+ break;
+ }
+ }
+ }
+
+ //Move from available to used ring
+ rxvq->used->ring[rxvq->last_used_idx & qsz_mask].id = desc_head;
+ rxvq->used->ring[rxvq->last_used_idx & qsz_mask].len = desc_len;
+ vhost_user_log_dirty_ring (vui, rxvq,
+ ring[rxvq->last_used_idx & qsz_mask]);
+ rxvq->last_avail_idx++;
+ rxvq->last_used_idx++;
+
+ if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vum->cpus[cpu_index].current_trace->hdr =
+ vum->cpus[cpu_index].tx_headers[tx_headers_len - 1];
+ }
+
+ n_left--; //At the end for error counting when 'goto done' is invoked
+ buffers++;
+ }
+
+done:
+ //Do the memory copies
+ if (PREDICT_FALSE
+ (vhost_user_tx_copy (vui, vum->cpus[cpu_index].copy,
+ copy_len, &map_hint)))
+ {
+ clib_warning ("Memory mapping error on interface hw_if_index=%d "
+ "(Shutting down - Switch interface down and up to restart)",
+ vui->hw_if_index);
+ vui->admin_up = 0;
+ }
+
+ CLIB_MEMORY_BARRIER ();
+ rxvq->used->idx = rxvq->last_used_idx;
+ vhost_user_log_dirty_ring (vui, rxvq, idx);
+
+  /*
+   * When n_left is set, error is always set to something too.
+   * In case the error is due to a lack of remaining buffers, we go back up
+   * and retry.
+   * The idea is that it is better to waste some time on packets
+   * that have already been processed than to drop them and fetch
+   * more fresh packets with a good likelihood that they will be dropped too.
+   * This technique also gives the VM driver more time to pick up packets.
+   * In case the traffic flows from physical to virtual interfaces, this
+   * technique ends up leveraging the physical NIC buffers in order to
+   * absorb the VM's CPU jitter.
+   */
+ if (n_left && (error == VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF) && retry)
+ {
+ retry--;
+ goto retry;
+ }
+
+ /* interrupt (call) handling */
+ if ((rxvq->callfd_idx != ~0) && !(rxvq->avail->flags & 1))
+ {
+ rxvq->n_since_last_int += frame->n_vectors - n_left;
+
+ if (rxvq->n_since_last_int > vum->coalesce_frames)
+ vhost_user_send_call (vm, rxvq);
+ }
+
+ vhost_user_vring_unlock (vui, qid);
+
+done3:
+ if (PREDICT_FALSE (n_left && error != VHOST_USER_TX_FUNC_ERROR_NONE))
+ {
+ vlib_error_count (vm, node->node_index, error, n_left);
+ vlib_increment_simple_counter
+ (vnet_main.interface_main.sw_if_counters
+ + VNET_INTERFACE_COUNTER_DROP,
+ os_get_cpu_number (), vui->sw_if_index, n_left);
+ }
+
+ vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
+ return frame->n_vectors;
+}
+
+static clib_error_t *
+vhost_user_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index,
+ u32 flags)
+{
+ vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index);
+ uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+ vhost_user_main_t *vum = &vhost_user_main;
+ vhost_user_intf_t *vui =
+ pool_elt_at_index (vum->vhost_user_interfaces, hif->dev_instance);
+
+ vui->admin_up = is_up;
+
+ if (is_up)
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP);
+
+ return /* no error */ 0;
+}
+
+/* *INDENT-OFF* */
+VNET_DEVICE_CLASS (vhost_user_dev_class,static) = {
+ .name = "vhost-user",
+ .tx_function = vhost_user_tx,
+ .tx_function_n_errors = VHOST_USER_TX_FUNC_N_ERROR,
+ .tx_function_error_strings = vhost_user_tx_func_error_strings,
+ .format_device_name = format_vhost_user_interface_name,
+ .name_renumber = vhost_user_name_renumber,
+ .admin_up_down_function = vhost_user_interface_admin_up_down,
+ .format_tx_trace = format_vhost_trace,
+};
+
+VLIB_DEVICE_TX_FUNCTION_MULTIARCH (vhost_user_dev_class,
+ vhost_user_tx)
+/* *INDENT-ON* */
+
+static uword
+vhost_user_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+ vhost_user_main_t *vum = &vhost_user_main;
+ vhost_user_intf_t *vui;
+ struct sockaddr_un sun;
+ int sockfd;
+ unix_file_t template = { 0 };
+ f64 timeout = 3153600000.0 /* 100 years */ ;
+ uword *event_data = 0;
+
+ sockfd = socket (AF_UNIX, SOCK_STREAM, 0);
+ sun.sun_family = AF_UNIX;
+ template.read_function = vhost_user_socket_read;
+ template.error_function = vhost_user_socket_error;
+
+ if (sockfd < 0)
+ return 0;
+
+ while (1)
+ {
+ vlib_process_wait_for_event_or_clock (vm, timeout);
+ vlib_process_get_events (vm, &event_data);
+ vec_reset_length (event_data);
+
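+      /* After the first wakeup, poll every 3 seconds to retry client
+       * connections and to check that connected sockets are still alive. */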
+ timeout = 3.0;
+
+ /* *INDENT-OFF* */
+ pool_foreach (vui, vum->vhost_user_interfaces, {
+
+ if (vui->unix_server_index == ~0) { //Nothing to do for server sockets
+ if (vui->unix_file_index == ~0)
+ {
+ /* try to connect */
+ strncpy (sun.sun_path, (char *) vui->sock_filename,
+ sizeof (sun.sun_path) - 1);
+
+ if (connect (sockfd, (struct sockaddr *) &sun,
+ sizeof (struct sockaddr_un)) == 0)
+ {
+ vui->sock_errno = 0;
+ template.file_descriptor = sockfd;
+ template.private_data =
+ vui - vhost_user_main.vhost_user_interfaces;
+ vui->unix_file_index = unix_file_add (&unix_main, &template);
+
+ //Re-open for next connect
+ if ((sockfd = socket (AF_UNIX, SOCK_STREAM, 0)) < 0) {
+ clib_warning("Critical: Could not open unix socket");
+ return 0;
+ }
+ }
+ else
+ {
+ vui->sock_errno = errno;
+ }
+ }
+ else
+ {
+ /* check if socket is alive */
+ int error = 0;
+ socklen_t len = sizeof (error);
+ int fd = UNIX_GET_FD(vui->unix_file_index);
+ int retval =
+ getsockopt (fd, SOL_SOCKET, SO_ERROR, &error, &len);
+
+ if (retval)
+ {
+ DBG_SOCK ("getsockopt returned %d", retval);
+ vhost_user_if_disconnect (vui);
+ }
+ }
+ }
+ });
+ /* *INDENT-ON* */
+ }
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (vhost_user_process_node,static) = {
+ .function = vhost_user_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "vhost-user-process",
+};
+/* *INDENT-ON* */
+
+/**
+ * Disables and resets the interface structure.
+ * It can then be either initialized again, or removed from the set of
+ * used interfaces.
+ */
+static void
+vhost_user_term_if (vhost_user_intf_t * vui)
+{
+ // Delete configured thread pinning
+ vec_reset_length (vui->workers);
+ // disconnect interface sockets
+ vhost_user_if_disconnect (vui);
+ vhost_user_update_iface_state (vui);
+
+ if (vui->unix_server_index != ~0)
+ {
+ //Close server socket
+ unix_file_t *uf = pool_elt_at_index (unix_main.file_pool,
+ vui->unix_server_index);
+ unix_file_del (&unix_main, uf);
+ vui->unix_server_index = ~0;
+ }
+}
+
+int
+vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index)
+{
+ vhost_user_main_t *vum = &vhost_user_main;
+ vhost_user_intf_t *vui;
+ int rv = 0;
+ vnet_hw_interface_t *hwif;
+
+ if (!(hwif = vnet_get_sup_hw_interface (vnm, sw_if_index)) ||
+ hwif->dev_class_index != vhost_user_dev_class.index)
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ DBG_SOCK ("Deleting vhost-user interface %s (instance %d)",
+ hwif->name, hwif->dev_instance);
+
+ vui = pool_elt_at_index (vum->vhost_user_interfaces, hwif->dev_instance);
+
+ // Disable and reset interface
+ vhost_user_term_if (vui);
+
+ // Back to pool
+ pool_put (vum->vhost_user_interfaces, vui);
+
+ // Reset renumbered iface
+ if (hwif->dev_instance <
+ vec_len (vum->show_dev_instance_by_real_dev_instance))
+ vum->show_dev_instance_by_real_dev_instance[hwif->dev_instance] = ~0;
+
+ // Delete ethernet interface
+ ethernet_delete_interface (vnm, vui->hw_if_index);
+ return rv;
+}
+
+/**
+ * Open server unix socket on specified sock_filename.
+ */
+static int
+vhost_user_init_server_sock (const char *sock_filename, int *sock_fd)
+{
+ int rv = 0;
+ struct sockaddr_un un = { };
+ int fd;
+ /* create listening socket */
+ if ((fd = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+
+ un.sun_family = AF_UNIX;
+ strncpy ((char *) un.sun_path, (char *) sock_filename,
+ sizeof (un.sun_path) - 1);
+
+ /* remove if exists */
+ unlink ((char *) sock_filename);
+
+ if (bind (fd, (struct sockaddr *) &un, sizeof (un)) == -1)
+ {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_2;
+ goto error;
+ }
+
+ if (listen (fd, 1) == -1)
+ {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_3;
+ goto error;
+ }
+
+ *sock_fd = fd;
+ return 0;
+
+error:
+ close (fd);
+ return rv;
+}
+
+/**
+ * Create ethernet interface for vhost user interface.
+ */
+static void
+vhost_user_create_ethernet (vnet_main_t * vnm, vlib_main_t * vm,
+ vhost_user_intf_t * vui, u8 * hwaddress)
+{
+ vhost_user_main_t *vum = &vhost_user_main;
+ u8 hwaddr[6];
+ clib_error_t *error;
+
+ /* create hw and sw interface */
+ if (hwaddress)
+ {
+ clib_memcpy (hwaddr, hwaddress, 6);
+ }
+ else
+ {
+ random_u32 (&vum->random);
+ clib_memcpy (hwaddr + 2, &vum->random, sizeof (vum->random));
+ hwaddr[0] = 2;
+ hwaddr[1] = 0xfe;
+ }
+
+ error = ethernet_register_interface
+ (vnm,
+ vhost_user_dev_class.index,
+ vui - vum->vhost_user_interfaces /* device instance */ ,
+ hwaddr /* ethernet address */ ,
+ &vui->hw_if_index, 0 /* flag change */ );
+
+ if (error)
+ clib_error_report (error);
+
+ vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, vui->hw_if_index);
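+  /* Default the maximum L3 packet size to 9000 bytes so that jumbo
+   * frames are accepted out of the box. */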
+ hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] = 9000;
+}
+
+/*
+ * Initialize vui with specified attributes
+ */
+static void
+vhost_user_vui_init (vnet_main_t * vnm,
+ vhost_user_intf_t * vui,
+ int server_sock_fd,
+ const char *sock_filename,
+ u64 feature_mask, u32 * sw_if_index)
+{
+ vnet_sw_interface_t *sw;
+ sw = vnet_get_hw_sw_interface (vnm, vui->hw_if_index);
+ int q;
+
+ if (server_sock_fd != -1)
+ {
+ unix_file_t template = { 0 };
+ template.read_function = vhost_user_socksvr_accept_ready;
+ template.file_descriptor = server_sock_fd;
+      template.private_data = vui - vhost_user_main.vhost_user_interfaces;	//interface index in pool
+ vui->unix_server_index = unix_file_add (&unix_main, &template);
+ }
+ else
+ {
+ vui->unix_server_index = ~0;
+ }
+
+ vui->sw_if_index = sw->sw_if_index;
+ strncpy (vui->sock_filename, sock_filename,
+ ARRAY_LEN (vui->sock_filename) - 1);
+ vui->sock_errno = 0;
+ vui->is_up = 0;
+ vui->feature_mask = feature_mask;
+ vui->unix_file_index = ~0;
+ vui->log_base_addr = 0;
+
+ for (q = 0; q < VHOST_VRING_MAX_N; q++)
+ vhost_user_vring_init (vui, q);
+
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0);
+
+ if (sw_if_index)
+ *sw_if_index = vui->sw_if_index;
+
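+  /* Allocate one cache-line-sized, cache-line-aligned spinlock per vring
+   * so that locks taken by different workers do not share a cache line. */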
+ for (q = 0; q < VHOST_VRING_MAX_N; q++)
+ {
+ vui->vring_locks[q] = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
+ CLIB_CACHE_LINE_BYTES);
+ memset ((void *) vui->vring_locks[q], 0, CLIB_CACHE_LINE_BYTES);
+ }
+
+ vec_validate (vui->per_cpu_tx_qid,
+ vlib_get_thread_main ()->n_vlib_mains - 1);
+ vhost_user_tx_thread_placement (vui);
+}
+
+int
+vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm,
+ const char *sock_filename,
+ u8 is_server,
+ u32 * sw_if_index,
+ u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance, u8 * hwaddr)
+{
+ vhost_user_intf_t *vui = NULL;
+ u32 sw_if_idx = ~0;
+ int rv = 0;
+ int server_sock_fd = -1;
+
+ if (is_server)
+ {
+ if ((rv =
+ vhost_user_init_server_sock (sock_filename, &server_sock_fd)) != 0)
+ {
+ return rv;
+ }
+ }
+
+ pool_get (vhost_user_main.vhost_user_interfaces, vui);
+
+ vhost_user_create_ethernet (vnm, vm, vui, hwaddr);
+ vhost_user_vui_init (vnm, vui, server_sock_fd, sock_filename,
+ feature_mask, &sw_if_idx);
+
+ if (renumber)
+ vnet_interface_name_renumber (sw_if_idx, custom_dev_instance);
+
+ if (sw_if_index)
+ *sw_if_index = sw_if_idx;
+
+ // Process node must connect
+ vlib_process_signal_event (vm, vhost_user_process_node.index, 0, 0);
+ return rv;
+}
+
+int
+vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm,
+ const char *sock_filename,
+ u8 is_server,
+ u32 sw_if_index,
+ u64 feature_mask, u8 renumber, u32 custom_dev_instance)
+{
+ vhost_user_main_t *vum = &vhost_user_main;
+ vhost_user_intf_t *vui = NULL;
+ u32 sw_if_idx = ~0;
+ int server_sock_fd = -1;
+ int rv = 0;
+ vnet_hw_interface_t *hwif;
+
+ if (!(hwif = vnet_get_sup_hw_interface (vnm, sw_if_index)) ||
+ hwif->dev_class_index != vhost_user_dev_class.index)
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, hwif->dev_instance);
+
+ // First try to open server socket
+ if (is_server)
+ if ((rv = vhost_user_init_server_sock (sock_filename,
+ &server_sock_fd)) != 0)
+ return rv;
+
+ vhost_user_term_if (vui);
+ vhost_user_vui_init (vnm, vui, server_sock_fd,
+ sock_filename, feature_mask, &sw_if_idx);
+
+ if (renumber)
+ vnet_interface_name_renumber (sw_if_idx, custom_dev_instance);
+
+ // Process node must connect
+ vlib_process_signal_event (vm, vhost_user_process_node.index, 0, 0);
+ return rv;
+}
+
+clib_error_t *
+vhost_user_connect_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ u8 *sock_filename = NULL;
+ u32 sw_if_index;
+ u8 is_server = 0;
+ u64 feature_mask = (u64) ~ (0ULL);
+ u8 renumber = 0;
+ u32 custom_dev_instance = ~0;
+ u8 hwaddr[6];
+ u8 *hw = NULL;
+
+ /* Get a line of input. */
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "socket %s", &sock_filename))
+ ;
+ else if (unformat (line_input, "server"))
+ is_server = 1;
+ else if (unformat (line_input, "feature-mask 0x%llx", &feature_mask))
+ ;
+ else
+ if (unformat
+ (line_input, "hwaddr %U", unformat_ethernet_address, hwaddr))
+ hw = hwaddr;
+ else if (unformat (line_input, "renumber %d", &custom_dev_instance))
+ {
+ renumber = 1;
+ }
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ vnet_main_t *vnm = vnet_get_main ();
+
+ int rv;
+ if ((rv = vhost_user_create_if (vnm, vm, (char *) sock_filename,
+ is_server, &sw_if_index, feature_mask,
+ renumber, custom_dev_instance, hw)))
+ {
+ vec_free (sock_filename);
+ return clib_error_return (0, "vhost_user_create_if returned %d", rv);
+ }
+
+ vec_free (sock_filename);
+ vlib_cli_output (vm, "%U\n", format_vnet_sw_if_index_name, vnet_get_main (),
+ sw_if_index);
+ return 0;
+}
+
+clib_error_t *
+vhost_user_delete_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ u32 sw_if_index = ~0;
+ vnet_main_t *vnm = vnet_get_main ();
+
+ /* Get a line of input. */
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "sw_if_index %d", &sw_if_index))
+ ;
+ else if (unformat
+ (line_input, "%U", unformat_vnet_sw_interface, vnm,
+ &sw_if_index))
+ {
+ vnet_hw_interface_t *hwif =
+ vnet_get_sup_hw_interface (vnm, sw_if_index);
+ if (hwif == NULL ||
+ vhost_user_dev_class.index != hwif->dev_class_index)
+ return clib_error_return (0, "Not a vhost interface");
+ }
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+ vhost_user_delete_if (vnm, vm, sw_if_index);
+ return 0;
+}
+
+int
+vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm,
+ vhost_user_intf_details_t ** out_vuids)
+{
+ int rv = 0;
+ vhost_user_main_t *vum = &vhost_user_main;
+ vhost_user_intf_t *vui;
+ vhost_user_intf_details_t *r_vuids = NULL;
+ vhost_user_intf_details_t *vuid = NULL;
+ u32 *hw_if_indices = 0;
+ vnet_hw_interface_t *hi;
+ u8 *s = NULL;
+ int i;
+
+ if (!out_vuids)
+ return -1;
+
+ pool_foreach (vui, vum->vhost_user_interfaces,
+ vec_add1 (hw_if_indices, vui->hw_if_index);
+ );
+
+ for (i = 0; i < vec_len (hw_if_indices); i++)
+ {
+ hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
+ vui = pool_elt_at_index (vum->vhost_user_interfaces, hi->dev_instance);
+
+ vec_add2 (r_vuids, vuid, 1);
+ vuid->sw_if_index = vui->sw_if_index;
+ vuid->virtio_net_hdr_sz = vui->virtio_net_hdr_sz;
+ vuid->features = vui->features;
+ vuid->num_regions = vui->nregions;
+ vuid->sock_errno = vui->sock_errno;
+ strncpy ((char *) vuid->sock_filename, (char *) vui->sock_filename,
+ ARRAY_LEN (vuid->sock_filename) - 1);
+
+ s = format (s, "%v%c", hi->name, 0);
+
+ strncpy ((char *) vuid->if_name, (char *) s,
+ ARRAY_LEN (vuid->if_name) - 1);
+ _vec_len (s) = 0;
+ }
+
+ vec_free (s);
+ vec_free (hw_if_indices);
+
+ *out_vuids = r_vuids;
+
+ return rv;
+}
+
+clib_error_t *
+show_vhost_user_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ clib_error_t *error = 0;
+ vnet_main_t *vnm = vnet_get_main ();
+ vhost_user_main_t *vum = &vhost_user_main;
+ vhost_user_intf_t *vui;
+ u32 hw_if_index, *hw_if_indices = 0;
+ vnet_hw_interface_t *hi;
+ vhost_cpu_t *vhc;
+ vhost_iface_and_queue_t *vhiq;
+ u32 ci;
+
+ int i, j, q;
+ int show_descr = 0;
+ struct feat_struct
+ {
+ u8 bit;
+ char *str;
+ };
+ struct feat_struct *feat_entry;
+
+ static struct feat_struct feat_array[] = {
+#define _(s,b) { .str = #s, .bit = b, },
+ foreach_virtio_net_feature
+#undef _
+ {.str = NULL}
+ };
+
+#define foreach_protocol_feature \
+ _(VHOST_USER_PROTOCOL_F_MQ) \
+ _(VHOST_USER_PROTOCOL_F_LOG_SHMFD)
+
+ static struct feat_struct proto_feat_array[] = {
+#define _(s) { .str = #s, .bit = s},
+ foreach_protocol_feature
+#undef _
+ {.str = NULL}
+ };
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index))
+ {
+ vec_add1 (hw_if_indices, hw_if_index);
+ }
+ else if (unformat (input, "descriptors") || unformat (input, "desc"))
+ show_descr = 1;
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ }
+ if (vec_len (hw_if_indices) == 0)
+ {
+ pool_foreach (vui, vum->vhost_user_interfaces,
+ vec_add1 (hw_if_indices, vui->hw_if_index);
+ );
+ }
+ vlib_cli_output (vm, "Virtio vhost-user interfaces");
+ vlib_cli_output (vm, "Global:\n coalesce frames %d time %e",
+ vum->coalesce_frames, vum->coalesce_time);
+
+ for (i = 0; i < vec_len (hw_if_indices); i++)
+ {
+ hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
+ vui = pool_elt_at_index (vum->vhost_user_interfaces, hi->dev_instance);
+ vlib_cli_output (vm, "Interface: %s (ifindex %d)",
+ hi->name, hw_if_indices[i]);
+
+ vlib_cli_output (vm, "virtio_net_hdr_sz %d\n"
+ " features mask (0x%llx): \n"
+ " features (0x%llx): \n",
+ vui->virtio_net_hdr_sz, vui->feature_mask,
+ vui->features);
+
+ feat_entry = (struct feat_struct *) &feat_array;
+ while (feat_entry->str)
+ {
+ if (vui->features & (1ULL << feat_entry->bit))
+ vlib_cli_output (vm, " %s (%d)", feat_entry->str,
+ feat_entry->bit);
+ feat_entry++;
+ }
+
+ vlib_cli_output (vm, " protocol features (0x%llx)",
+ vui->protocol_features);
+ feat_entry = (struct feat_struct *) &proto_feat_array;
+ while (feat_entry->str)
+ {
+ if (vui->protocol_features & (1ULL << feat_entry->bit))
+ vlib_cli_output (vm, " %s (%d)", feat_entry->str,
+ feat_entry->bit);
+ feat_entry++;
+ }
+
+ vlib_cli_output (vm, "\n");
+
+ vlib_cli_output (vm, " socket filename %s type %s errno \"%s\"\n\n",
+ vui->sock_filename,
+ (vui->unix_server_index != ~0) ? "server" : "client",
+ strerror (vui->sock_errno));
+
+ vlib_cli_output (vm, " rx placement: ");
+ vec_foreach (vhc, vum->cpus)
+ {
+ vec_foreach (vhiq, vhc->rx_queues)
+ {
+ if (vhiq->vhost_iface_index == vui - vum->vhost_user_interfaces)
+ vlib_cli_output (vm, " thread %d on vring %d\n",
+ vhc - vum->cpus, VHOST_VRING_IDX_TX (vhiq->qid));
+ }
+ }
+
+ vlib_cli_output (vm, " tx placement: %s\n",
+ vui->use_tx_spinlock ? "spin-lock" : "lock-free");
+
+ vec_foreach_index (ci, vui->per_cpu_tx_qid)
+ {
+ vlib_cli_output (vm, " thread %d on vring %d\n", ci,
+ VHOST_VRING_IDX_RX (vui->per_cpu_tx_qid[ci]));
+ }
+
+ vlib_cli_output (vm, "\n");
+
+ vlib_cli_output (vm, " Memory regions (total %d)\n", vui->nregions);
+
+ if (vui->nregions)
+ {
+ vlib_cli_output (vm,
+ " region fd guest_phys_addr memory_size userspace_addr mmap_offset mmap_addr\n");
+ vlib_cli_output (vm,
+ " ====== ===== ================== ================== ================== ================== ==================\n");
+ }
+ for (j = 0; j < vui->nregions; j++)
+ {
+ vlib_cli_output (vm,
+ " %d %-5d 0x%016lx 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n",
+ j, vui->region_mmap_fd[j],
+ vui->regions[j].guest_phys_addr,
+ vui->regions[j].memory_size,
+ vui->regions[j].userspace_addr,
+ vui->regions[j].mmap_offset,
+ pointer_to_uword (vui->region_mmap_addr[j]));
+ }
+ for (q = 0; q < VHOST_VRING_MAX_N; q++)
+ {
+ if (!vui->vrings[q].started)
+ continue;
+
+ vlib_cli_output (vm, "\n Virtqueue %d (%s%s)\n", q,
+ (q & 1) ? "RX" : "TX",
+ vui->vrings[q].enabled ? "" : " disabled");
+
+ vlib_cli_output (vm,
+ " qsz %d last_avail_idx %d last_used_idx %d\n",
+ vui->vrings[q].qsz, vui->vrings[q].last_avail_idx,
+ vui->vrings[q].last_used_idx);
+
+ if (vui->vrings[q].avail && vui->vrings[q].used)
+ vlib_cli_output (vm,
+ " avail.flags %x avail.idx %d used.flags %x used.idx %d\n",
+ vui->vrings[q].avail->flags,
+ vui->vrings[q].avail->idx,
+ vui->vrings[q].used->flags,
+ vui->vrings[q].used->idx);
+
+ int kickfd = UNIX_GET_FD (vui->vrings[q].kickfd_idx);
+ int callfd = UNIX_GET_FD (vui->vrings[q].callfd_idx);
+ vlib_cli_output (vm, " kickfd %d callfd %d errfd %d\n",
+ kickfd, callfd, vui->vrings[q].errfd);
+
+ if (show_descr)
+ {
+ vlib_cli_output (vm, "\n descriptor table:\n");
+ vlib_cli_output (vm,
+ " id addr len flags next user_addr\n");
+ vlib_cli_output (vm,
+ " ===== ================== ===== ====== ===== ==================\n");
+ for (j = 0; j < vui->vrings[q].qsz; j++)
+ {
+ u32 mem_hint = 0;
+ vlib_cli_output (vm,
+ " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n",
+ j, vui->vrings[q].desc[j].addr,
+ vui->vrings[q].desc[j].len,
+ vui->vrings[q].desc[j].flags,
+ vui->vrings[q].desc[j].next,
+ pointer_to_uword (map_guest_mem
+ (vui,
+ vui->vrings[q].desc[j].
+ addr, &mem_hint)));
+ }
+ }
+ }
+ vlib_cli_output (vm, "\n");
+ }
+done:
+ vec_free (hw_if_indices);
+ return error;
+}
+
+/*
+ * CLI functions
+ */
+
+/*?
+ * Create a vHost User interface. Once created, a new virtual interface
+ * will exist with the name '<em>VirtualEthernet0/0/x</em>', where '<em>x</em>'
+ * is the next free index.
+ *
+ * There are several parameters associated with a vHost interface:
+ *
+ * - <b>socket <socket-filename></b> - Name of the linux socket used by QEMU/VM and
+ * VPP to manage the vHost interface. If the socket does not already exist, VPP
+ * will create it.
+ *
+ * - <b>server</b> - Optional flag to indicate that VPP should be the server for the
+ * linux socket. If not provided, VPP will be the client.
+ *
+ * - <b>feature-mask <hex></b> - Optional virtio/vhost feature set negotiated at
+ * startup. By default, all supported features will be advertised. Otherwise,
+ * provide the set of features desired.
+ * - 0x000008000 (15) - VIRTIO_NET_F_MRG_RXBUF
+ * - 0x000020000 (17) - VIRTIO_NET_F_CTRL_VQ
+ * - 0x000200000 (21) - VIRTIO_NET_F_GUEST_ANNOUNCE
+ * - 0x000400000 (22) - VIRTIO_NET_F_MQ
+ * - 0x004000000 (26) - VHOST_F_LOG_ALL
+ * - 0x008000000 (27) - VIRTIO_F_ANY_LAYOUT
+ * - 0x010000000 (28) - VIRTIO_F_INDIRECT_DESC
+ * - 0x040000000 (30) - VHOST_USER_F_PROTOCOL_FEATURES
+ * - 0x100000000 (32) - VIRTIO_F_VERSION_1
+ *
+ * - <b>hwaddr <mac-addr></b> - Optional ethernet address, can be in either
+ * X:X:X:X:X:X unix or X.X.X cisco format.
+ *
+ * - <b>renumber <dev_instance></b> - Optional parameter which allows the instance
+ * in the name to be specified. If the instance already exists, the name will be
+ * used anyway and multiple instances will have the same name. Use with caution.
+ *
+ * @cliexpar
+ * Example of how to create a vhost interface with VPP as the client and all features enabled:
+ * @cliexstart{create vhost-user socket /tmp/vhost1.sock}
+ * VirtualEthernet0/0/0
+ * @cliexend
+ * Example of how to create a vhost interface with VPP as the server and with just
+ * multiple queues enabled:
+ * @cliexstart{create vhost-user socket /tmp/vhost2.sock server feature-mask 0x40400000}
+ * VirtualEthernet0/0/1
+ * @cliexend
+ * Once the vHost interface is created, enable the interface using:
+ * @cliexcmd{set interface state VirtualEthernet0/0/0 up}
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (vhost_user_connect_command, static) = {
+ .path = "create vhost-user",
+ .short_help = "create vhost-user socket <socket-filename> [server] [feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>]",
+ .function = vhost_user_connect_command_fn,
+};
+/* *INDENT-ON* */
+
+/*?
+ * Delete a vHost User interface using the interface name or the
+ * software interface index. Use the '<em>show interfaces</em>'
+ * command to determine the software interface index. On deletion,
+ * the linux socket will not be deleted.
+ *
+ * @cliexpar
+ * Example of how to delete a vhost interface by name:
+ * @cliexcmd{delete vhost-user VirtualEthernet0/0/1}
+ * Example of how to delete a vhost interface by software interface index:
+ * @cliexcmd{delete vhost-user sw_if_index 1}
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (vhost_user_delete_command, static) = {
+ .path = "delete vhost-user",
+ .short_help = "delete vhost-user {<interface> | sw_if_index <sw_idx>}",
+ .function = vhost_user_delete_command_fn,
+};
+
+/*?
+ * Display the attributes of a single vHost User interface (provide interface
+ * name), multiple vHost User interfaces (provide a list of interface names separated
+ * by spaces) or all vHost User interfaces (omit an interface name to display all
+ * vHost interfaces).
+ *
+ * @cliexpar
+ * @parblock
+ * Example of how to display a vhost interface:
+ * @cliexstart{show vhost-user VirtualEthernet0/0/0}
+ * Virtio vhost-user interfaces
+ * Global:
+ * coalesce frames 32 time 1e-3
+ * Interface: VirtualEthernet0/0/0 (ifindex 1)
+ * virtio_net_hdr_sz 12
+ * features mask (0xffffffffffffffff):
+ * features (0x50408000):
+ * VIRTIO_NET_F_MRG_RXBUF (15)
+ * VIRTIO_NET_F_MQ (22)
+ * VIRTIO_F_INDIRECT_DESC (28)
+ * VHOST_USER_F_PROTOCOL_FEATURES (30)
+ * protocol features (0x3)
+ * VHOST_USER_PROTOCOL_F_MQ (0)
+ * VHOST_USER_PROTOCOL_F_LOG_SHMFD (1)
+ *
+ * socket filename /tmp/vhost1.sock type client errno "Success"
+ *
+ * rx placement:
+ * thread 1 on vring 1
+ * thread 1 on vring 5
+ * thread 2 on vring 3
+ * thread 2 on vring 7
+ * tx placement: spin-lock
+ * thread 0 on vring 0
+ * thread 1 on vring 2
+ * thread 2 on vring 0
+ *
+ * Memory regions (total 2)
+ * region fd guest_phys_addr memory_size userspace_addr mmap_offset mmap_addr
+ * ====== ===== ================== ================== ================== ================== ==================
+ * 0 60 0x0000000000000000 0x00000000000a0000 0x00002aaaaac00000 0x0000000000000000 0x00002aab2b400000
+ * 1 61 0x00000000000c0000 0x000000003ff40000 0x00002aaaaacc0000 0x00000000000c0000 0x00002aababcc0000
+ *
+ * Virtqueue 0 (TX)
+ * qsz 256 last_avail_idx 0 last_used_idx 0
+ * avail.flags 1 avail.idx 128 used.flags 1 used.idx 0
+ * kickfd 62 callfd 64 errfd -1
+ *
+ * Virtqueue 1 (RX)
+ * qsz 256 last_avail_idx 0 last_used_idx 0
+ * avail.flags 1 avail.idx 0 used.flags 1 used.idx 0
+ * kickfd 65 callfd 66 errfd -1
+ *
+ * Virtqueue 2 (TX)
+ * qsz 256 last_avail_idx 0 last_used_idx 0
+ * avail.flags 1 avail.idx 128 used.flags 1 used.idx 0
+ * kickfd 63 callfd 70 errfd -1
+ *
+ * Virtqueue 3 (RX)
+ * qsz 256 last_avail_idx 0 last_used_idx 0
+ * avail.flags 1 avail.idx 0 used.flags 1 used.idx 0
+ * kickfd 72 callfd 74 errfd -1
+ *
+ * Virtqueue 4 (TX disabled)
+ * qsz 256 last_avail_idx 0 last_used_idx 0
+ * avail.flags 1 avail.idx 0 used.flags 1 used.idx 0
+ * kickfd 76 callfd 78 errfd -1
+ *
+ * Virtqueue 5 (RX disabled)
+ * qsz 256 last_avail_idx 0 last_used_idx 0
+ * avail.flags 1 avail.idx 0 used.flags 1 used.idx 0
+ * kickfd 80 callfd 82 errfd -1
+ *
+ * Virtqueue 6 (TX disabled)
+ * qsz 256 last_avail_idx 0 last_used_idx 0
+ * avail.flags 1 avail.idx 0 used.flags 1 used.idx 0
+ * kickfd 84 callfd 86 errfd -1
+ *
+ * Virtqueue 7 (RX disabled)
+ * qsz 256 last_avail_idx 0 last_used_idx 0
+ * avail.flags 1 avail.idx 0 used.flags 1 used.idx 0
+ * kickfd 88 callfd 90 errfd -1
+ *
+ * @cliexend
+ *
+ * The optional '<em>descriptors</em>' parameter will display the same output as
+ * the previous example but will include the descriptor table for each queue.
+ * The output is truncated below:
+ * @cliexstart{show vhost-user VirtualEthernet0/0/0 descriptors}
+ * Virtio vhost-user interfaces
+ * Global:
+ * coalesce frames 32 time 1e-3
+ * Interface: VirtualEthernet0/0/0 (ifindex 1)
+ * virtio_net_hdr_sz 12
+ * features mask (0xffffffffffffffff):
+ * features (0x50408000):
+ * VIRTIO_NET_F_MRG_RXBUF (15)
+ * VIRTIO_NET_F_MQ (22)
+ * :
+ * Virtqueue 0 (TX)
+ * qsz 256 last_avail_idx 0 last_used_idx 0
+ * avail.flags 1 avail.idx 128 used.flags 1 used.idx 0
+ * kickfd 62 callfd 64 errfd -1
+ *
+ * descriptor table:
+ * id addr len flags next user_addr
+ * ===== ================== ===== ====== ===== ==================
+ * 0 0x0000000010b6e974 2060 0x0002 1 0x00002aabbc76e974
+ * 1 0x0000000010b6e034 2060 0x0002 2 0x00002aabbc76e034
+ * 2 0x0000000010b6d6f4 2060 0x0002 3 0x00002aabbc76d6f4
+ * 3 0x0000000010b6cdb4 2060 0x0002 4 0x00002aabbc76cdb4
+ * 4 0x0000000010b6c474 2060 0x0002 5 0x00002aabbc76c474
+ * 5 0x0000000010b6bb34 2060 0x0002 6 0x00002aabbc76bb34
+ * 6 0x0000000010b6b1f4 2060 0x0002 7 0x00002aabbc76b1f4
+ * 7 0x0000000010b6a8b4 2060 0x0002 8 0x00002aabbc76a8b4
+ * 8 0x0000000010b69f74 2060 0x0002 9 0x00002aabbc769f74
+ * 9 0x0000000010b69634 2060 0x0002 10 0x00002aabbc769634
+ * 10 0x0000000010b68cf4 2060 0x0002 11 0x00002aabbc768cf4
+ * :
+ * 249 0x0000000000000000 0 0x0000 250 0x00002aab2b400000
+ * 250 0x0000000000000000 0 0x0000 251 0x00002aab2b400000
+ * 251 0x0000000000000000 0 0x0000 252 0x00002aab2b400000
+ * 252 0x0000000000000000 0 0x0000 253 0x00002aab2b400000
+ * 253 0x0000000000000000 0 0x0000 254 0x00002aab2b400000
+ * 254 0x0000000000000000 0 0x0000 255 0x00002aab2b400000
+ * 255 0x0000000000000000 0 0x0000 32768 0x00002aab2b400000
+ *
+ * Virtqueue 1 (RX)
+ * qsz 256 last_avail_idx 0 last_used_idx 0
+ * :
+ * @cliexend
+ * @endparblock
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_vhost_user_command, static) = {
+ .path = "show vhost-user",
+ .short_help = "show vhost-user [<interface> [<interface> [..]]] [descriptors]",
+ .function = show_vhost_user_command_fn,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+vhost_user_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ vhost_user_main_t *vum = &vhost_user_main;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "coalesce-frames %d", &vum->coalesce_frames))
+ ;
+ else if (unformat (input, "coalesce-time %f", &vum->coalesce_time))
+ ;
+ else if (unformat (input, "dont-dump-memory"))
+ vum->dont_dump_vhost_user_memory = 1;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ return 0;
+}
+
+/* vhost-user { ... } configuration. */
+VLIB_CONFIG_FUNCTION (vhost_user_config, "vhost-user");
+
+void
+vhost_user_unmap_all (void)
+{
+ vhost_user_main_t *vum = &vhost_user_main;
+ vhost_user_intf_t *vui;
+
+ if (vum->dont_dump_vhost_user_memory)
+ {
+ pool_foreach (vui, vum->vhost_user_interfaces,
+ unmap_all_mem_regions (vui);
+ );
+ }
+}
+
+static clib_error_t *
+vhost_thread_command_fn (vlib_main_t * vm,
+ unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ u32 worker_thread_index;
+ u32 sw_if_index;
+ u8 del = 0;
+ int rv;
+
+ /* Get a line of input. */
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ if (!unformat
+ (line_input, "%U %d", unformat_vnet_sw_interface, vnet_get_main (),
+ &sw_if_index, &worker_thread_index))
+ {
+ unformat_free (line_input);
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ if (unformat (line_input, "del"))
+ del = 1;
+
+ if ((rv =
+ vhost_user_thread_placement (sw_if_index, worker_thread_index, del)))
+ return clib_error_return (0, "vhost_user_thread_placement returned %d",
+ rv);
+ return 0;
+}
+
+
+/*?
+ * This command is used to move the RX processing for the given
+ * interface to the provided thread. If the '<em>del</em>' option is used,
+ * the forced thread assignment is removed and the thread assignment is
+ * recomputed automatically. Use '<em>show vhost-user <interface></em>'
+ * to see the thread assignment.
+ *
+ * @cliexpar
+ * Example of how to move the RX processing for a given interface to a given thread:
+ * @cliexcmd{vhost thread VirtualEthernet0/0/0 1}
+ * Example of how to remove the forced thread assignment for a given interface:
+ * @cliexcmd{vhost thread VirtualEthernet0/0/0 1 del}
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (vhost_user_thread_command, static) = {
+ .path = "vhost thread",
+ .short_help = "vhost thread <iface> <worker-index> [del]",
+ .function = vhost_thread_command_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/virtio/vhost-user.h b/src/vnet/devices/virtio/vhost-user.h
new file mode 100644
index 00000000000..3083b614016
--- /dev/null
+++ b/src/vnet/devices/virtio/vhost-user.h
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __VIRTIO_VHOST_USER_H__
+#define __VIRTIO_VHOST_USER_H__
+/* vhost-user data structures */
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+#define VHOST_USER_MSG_HDR_SZ 12
+#define VHOST_VRING_MAX_SIZE 32768
+#define VHOST_VRING_MAX_N 16 //8TX + 8RX
+#define VHOST_VRING_IDX_RX(qid) (2*qid)
+#define VHOST_VRING_IDX_TX(qid) (2*qid + 1)
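+/* Queue pair qid maps to two vrings: 2*qid is the guest RX ring (VPP
+ * transmits into it) and 2*qid+1 is the guest TX ring (VPP receives
+ * from it). */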
+
+#define VIRTQ_DESC_F_NEXT 1
+#define VIRTQ_DESC_F_INDIRECT 4
+#define VHOST_USER_REPLY_MASK (0x1 << 2)
+
+#define VHOST_USER_PROTOCOL_F_MQ 0
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1
+#define VHOST_VRING_F_LOG 0
+
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD))
+
+/* If multiqueue is provided by the host, then we support it. */
+#define VIRTIO_NET_CTRL_MQ 4
+#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET 0
+#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN 1
+#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX 0x8000
+
+#define VRING_USED_F_NO_NOTIFY 1
+
+#define foreach_virtio_net_feature \
+ _ (VIRTIO_NET_F_MRG_RXBUF, 15) \
+ _ (VIRTIO_NET_F_CTRL_VQ, 17) \
+ _ (VIRTIO_NET_F_GUEST_ANNOUNCE, 21) \
+ _ (VIRTIO_NET_F_MQ, 22) \
+ _ (VHOST_F_LOG_ALL, 26) \
+ _ (VIRTIO_F_ANY_LAYOUT, 27) \
+ _ (VIRTIO_F_INDIRECT_DESC, 28) \
+ _ (VHOST_USER_F_PROTOCOL_FEATURES, 30) \
+ _ (VIRTIO_F_VERSION_1, 32)
+
+
+typedef enum
+{
+#define _(f,n) FEAT_##f = (n),
+ foreach_virtio_net_feature
+#undef _
+} virtio_net_feature_t;
+
+int vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm,
+ const char *sock_filename, u8 is_server,
+ u32 * sw_if_index, u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance, u8 * hwaddr);
+int vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm,
+ const char *sock_filename, u8 is_server,
+ u32 sw_if_index, u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance);
+int vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm,
+ u32 sw_if_index);
+
+/* *INDENT-OFF* */
+typedef struct vhost_user_memory_region
+{
+ u64 guest_phys_addr;
+ u64 memory_size;
+ u64 userspace_addr;
+ u64 mmap_offset;
+} __attribute ((packed)) vhost_user_memory_region_t;
+
+typedef struct vhost_user_memory
+{
+ u32 nregions;
+ u32 padding;
+ vhost_user_memory_region_t regions[VHOST_MEMORY_MAX_NREGIONS];
+} __attribute ((packed)) vhost_user_memory_t;
+
+typedef struct
+{
+ u32 index, num;
+} __attribute ((packed)) vhost_vring_state_t;
+
+typedef struct
+{
+ u32 index, flags;
+ u64 desc_user_addr, used_user_addr, avail_user_addr, log_guest_addr;
+} __attribute ((packed)) vhost_vring_addr_t;
+
+typedef struct vhost_user_log
+{
+ u64 size;
+ u64 offset;
+} __attribute ((packed)) vhost_user_log_t;
+
+typedef enum vhost_user_req
+{
+ VHOST_USER_NONE = 0,
+ VHOST_USER_GET_FEATURES = 1,
+ VHOST_USER_SET_FEATURES = 2,
+ VHOST_USER_SET_OWNER = 3,
+ VHOST_USER_RESET_OWNER = 4,
+ VHOST_USER_SET_MEM_TABLE = 5,
+ VHOST_USER_SET_LOG_BASE = 6,
+ VHOST_USER_SET_LOG_FD = 7,
+ VHOST_USER_SET_VRING_NUM = 8,
+ VHOST_USER_SET_VRING_ADDR = 9,
+ VHOST_USER_SET_VRING_BASE = 10,
+ VHOST_USER_GET_VRING_BASE = 11,
+ VHOST_USER_SET_VRING_KICK = 12,
+ VHOST_USER_SET_VRING_CALL = 13,
+ VHOST_USER_SET_VRING_ERR = 14,
+ VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+ VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+ VHOST_USER_GET_QUEUE_NUM = 17,
+ VHOST_USER_SET_VRING_ENABLE = 18,
+ VHOST_USER_MAX
+} vhost_user_req_t;
+
+// vring_desc I/O buffer descriptor
+typedef struct
+{
+ uint64_t addr; // packet data buffer address
+ uint32_t len; // packet data buffer size
+ uint16_t flags; // (see below)
+ uint16_t next; // optional index next descriptor in chain
+} __attribute ((packed)) vring_desc_t;
+
+typedef struct
+{
+ uint16_t flags;
+ volatile uint16_t idx;
+ uint16_t ring[VHOST_VRING_MAX_SIZE];
+} __attribute ((packed)) vring_avail_t;
+
+typedef struct
+{
+ uint16_t flags;
+ uint16_t idx;
+ struct /* vring_used_elem */
+ {
+ uint32_t id;
+ uint32_t len;
+ } ring[VHOST_VRING_MAX_SIZE];
+} __attribute ((packed)) vring_used_t;
+
+typedef struct
+{
+ u8 flags;
+ u8 gso_type;
+ u16 hdr_len;
+ u16 gso_size;
+ u16 csum_start;
+ u16 csum_offset;
+} __attribute ((packed)) virtio_net_hdr_t;
+
+typedef struct {
+ virtio_net_hdr_t hdr;
+ u16 num_buffers;
+} __attribute ((packed)) virtio_net_hdr_mrg_rxbuf_t;
+
+typedef struct vhost_user_msg {
+ vhost_user_req_t request;
+ u32 flags;
+ u32 size;
+ union
+ {
+ u64 u64;
+ vhost_vring_state_t state;
+ vhost_vring_addr_t addr;
+ vhost_user_memory_t memory;
+ vhost_user_log_t log;
+ };
+} __attribute ((packed)) vhost_user_msg_t;
+/* *INDENT-ON* */
+
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ u16 qsz;
+ u16 last_avail_idx;
+ u16 last_used_idx;
+ u16 n_since_last_int;
+ vring_desc_t *desc;
+ vring_avail_t *avail;
+ vring_used_t *used;
+ f64 int_deadline;
+ u8 started;
+ u8 enabled;
+ u8 log_used;
+  //Put non-runtime fields in a different cache line
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
+ int errfd;
+ u32 callfd_idx;
+ u32 kickfd_idx;
+ u64 log_guest_addr;
+} vhost_user_vring_t;
+
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ u32 is_up;
+ u32 admin_up;
+ u32 unix_server_index;
+ u32 unix_file_index;
+ char sock_filename[256];
+ int sock_errno;
+ u32 hw_if_index, sw_if_index;
+
+ //Feature negotiation
+ u64 features;
+ u64 feature_mask;
+ u64 protocol_features;
+
+ //Memory region information
+ u32 nregions;
+ vhost_user_memory_region_t regions[VHOST_MEMORY_MAX_NREGIONS];
+ void *region_mmap_addr[VHOST_MEMORY_MAX_NREGIONS];
+ u64 region_guest_addr_lo[VHOST_MEMORY_MAX_NREGIONS];
+ u64 region_guest_addr_hi[VHOST_MEMORY_MAX_NREGIONS];
+ u32 region_mmap_fd[VHOST_MEMORY_MAX_NREGIONS];
+
+ //Virtual rings
+ vhost_user_vring_t vrings[VHOST_VRING_MAX_N];
+ volatile u32 *vring_locks[VHOST_VRING_MAX_N];
+
+ int virtio_net_hdr_sz;
+ int is_any_layout;
+
+ void *log_base_addr;
+ u64 log_size;
+
+ /* Whether to use spinlock or per_cpu_tx_qid assignment */
+ u8 use_tx_spinlock;
+ u16 *per_cpu_tx_qid;
+
+ /* Vector of workers for this interface */
+ u32 *workers;
+} vhost_user_intf_t;
+
+typedef struct
+{
+ u16 vhost_iface_index;
+ u16 qid;
+} vhost_iface_and_queue_t;
+
+typedef struct
+{
+ uword dst;
+ uword src;
+ u32 len;
+} vhost_copy_t;
+
+typedef struct
+{
+  u16 qid; /**< The interface queue index (Not the virtio vring idx) */
+  u16 device_index; /**< The device index */
+  u32 virtio_ring_flags; /**< Runtime queue flags */
+  u16 first_desc_len; /**< Length of the first data descriptor */
+  virtio_net_hdr_mrg_rxbuf_t hdr; /**< Virtio header */
+} vhost_trace_t;
+
+
+#define VHOST_USER_RX_BUFFERS_N (2 * VLIB_FRAME_SIZE + 2)
+#define VHOST_USER_COPY_ARRAY_N (4 * VLIB_FRAME_SIZE)
+
+typedef struct
+{
+ vhost_iface_and_queue_t *rx_queues;
+ u32 rx_buffers_len;
+ u32 rx_buffers[VHOST_USER_RX_BUFFERS_N];
+
+ virtio_net_hdr_mrg_rxbuf_t tx_headers[VLIB_FRAME_SIZE];
+ vhost_copy_t copy[VHOST_USER_COPY_ARRAY_N];
+
+ /* This is here so it doesn't end-up
+ * using stack or registers. */
+ vhost_trace_t *current_trace;
+} vhost_cpu_t;
+
+typedef struct
+{
+ u32 mtu_bytes;
+ vhost_user_intf_t *vhost_user_interfaces;
+ u32 *show_dev_instance_by_real_dev_instance;
+ u32 coalesce_frames;
+ f64 coalesce_time;
+ int dont_dump_vhost_user_memory;
+
+ /** first cpu index */
+ u32 input_cpu_first_index;
+
+ /** total cpu count */
+ u32 input_cpu_count;
+
+ /** Per-CPU data for vhost-user */
+ vhost_cpu_t *cpus;
+
+ /** Pseudo random iterator */
+ u32 random;
+} vhost_user_main_t;
+
+typedef struct
+{
+ u8 if_name[64];
+ u32 sw_if_index;
+ u32 virtio_net_hdr_sz;
+ u64 features;
+ u8 is_server;
+ u8 sock_filename[256];
+ u32 num_regions;
+ int sock_errno;
+} vhost_user_intf_details_t;
+
+int vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm,
+ vhost_user_intf_details_t ** out_vuids);
+
+// CLI commands to be used from dpdk
+clib_error_t *vhost_user_connect_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd);
+clib_error_t *vhost_user_delete_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd);
+clib_error_t *show_vhost_user_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd);
+
+#endif
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/virtio/vhost_user.api b/src/vnet/devices/virtio/vhost_user.api
new file mode 100644
index 00000000000..21e42298361
--- /dev/null
+++ b/src/vnet/devices/virtio/vhost_user.api
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2015-2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** \brief vhost-user interface create request
+ @param client_index - opaque cookie to identify the sender
+ @param is_server - our side is socket server
+ @param sock_filename - unix socket filename, used to speak with frontend
+ @param use_custom_mac - enable or disable the use of the provided hardware address
+ @param mac_address - hardware address to use if 'use_custom_mac' is set
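+    @param renumber - if set, use custom_dev_instance to renumber the interface
+    @param custom_dev_instance - custom device instance number, used when renumber is set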
+*/
+define create_vhost_user_if
+{
+ u32 client_index;
+ u32 context;
+ u8 is_server;
+ u8 sock_filename[256];
+ u8 renumber;
+ u32 custom_dev_instance;
+ u8 use_custom_mac;
+ u8 mac_address[6];
+ u8 tag[64];
+};
+
+/** \brief vhost-user interface create response
+ @param context - sender context, to match reply w/ request
+ @param retval - return code for the request
+ @param sw_if_index - interface the operation is applied to
+*/
+define create_vhost_user_if_reply
+{
+ u32 context;
+ i32 retval;
+ u32 sw_if_index;
+};
+
+/** \brief vhost-user interface modify request
+ @param client_index - opaque cookie to identify the sender
+ @param is_server - our side is socket server
+ @param sock_filename - unix socket filename, used to speak with frontend
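+    @param sw_if_index - interface to modify
+    @param renumber - if set, use custom_dev_instance to renumber the interface
+    @param custom_dev_instance - custom device instance number, used when renumber is set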
+*/
+define modify_vhost_user_if
+{
+ u32 client_index;
+ u32 context;
+ u32 sw_if_index;
+ u8 is_server;
+ u8 sock_filename[256];
+ u8 renumber;
+ u32 custom_dev_instance;
+};
+
+/** \brief vhost-user interface modify response
+ @param context - sender context, to match reply w/ request
+ @param retval - return code for the request
+*/
+define modify_vhost_user_if_reply
+{
+ u32 context;
+ i32 retval;
+};
+
+/** \brief vhost-user interface delete request
+ @param client_index - opaque cookie to identify the sender
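+    @param sw_if_index - interface to delete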
+*/
+define delete_vhost_user_if
+{
+ u32 client_index;
+ u32 context;
+ u32 sw_if_index;
+};
+
+/** \brief vhost-user interface delete response
+ @param context - sender context, to match reply w/ request
+ @param retval - return code for the request
+*/
+define delete_vhost_user_if_reply
+{
+ u32 context;
+ i32 retval;
+};
+
+/** \brief Vhost-user interface details structure
+ @param context - sender context, to match reply w/ request
+ @param sw_if_index - index of the interface
+ @param interface_name - name of interface
+ @param virtio_net_hdr_sz - net header size
+ @param features - interface features
+ @param is_server - our side is socket server
+ @param sock_filename - socket filename
+ @param num_regions - number of used memory regions
+ @param sock_errno - current socket errno, if any
+*/
+define sw_interface_vhost_user_details
+{
+ u32 context;
+ u32 sw_if_index;
+ u8 interface_name[64];
+ u32 virtio_net_hdr_sz;
+ u64 features;
+ u8 is_server;
+ u8 sock_filename[256];
+ u32 num_regions;
+ i32 sock_errno;
+};
+
+define sw_interface_vhost_user_dump
+{
+ u32 client_index;
+ u32 context;
+};
+/*
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
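For illustration only, a minimal client-side sketch of filling in the create_vhost_user_if request defined above. The field names come from the .api definition; the allocation call and byte-order handling follow the pattern used by the handlers in vhost_user_api.c below, while the socket path and the final send step are assumptions:

  vl_api_create_vhost_user_if_t *mp = vl_msg_api_alloc (sizeof (*mp));
  memset (mp, 0, sizeof (*mp));
  mp->_vl_msg_id = ntohs (VL_API_CREATE_VHOST_USER_IF);
  mp->is_server = 1;            /* we own the unix socket */
  strncpy ((char *) mp->sock_filename, "/tmp/vhost-user-1.sock",
           sizeof (mp->sock_filename) - 1);
  mp->use_custom_mac = 0;       /* let VPP pick a hardware address */
  /* set mp->client_index and queue the message on the shared-memory
     API queue as usual */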
diff --git a/src/vnet/devices/virtio/vhost_user_api.c b/src/vnet/devices/virtio/vhost_user_api.c
new file mode 100644
index 00000000000..dd517c26f55
--- /dev/null
+++ b/src/vnet/devices/virtio/vhost_user_api.c
@@ -0,0 +1,262 @@
+/*
+ *------------------------------------------------------------------
+ * vhost_user_api.c - vhost-user API
+ *
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vnet/vnet.h>
+#include <vlibmemory/api.h>
+
+#include <vnet/interface.h>
+#include <vnet/api_errno.h>
+#include <vnet/devices/virtio/vhost-user.h>
+
+#include <vnet/vnet_msg_enum.h>
+
+#define vl_typedefs /* define message structures */
+#include <vnet/vnet_all_api_h.h>
+#undef vl_typedefs
+
+#define vl_endianfun /* define endian-swap functions */
+#include <vnet/vnet_all_api_h.h>
+#undef vl_endianfun
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__)
+#define vl_printfun
+#include <vnet/vnet_all_api_h.h>
+#undef vl_printfun
+
+#include <vlibapi/api_helper_macros.h>
+
+#define foreach_vpe_api_msg \
+_(CREATE_VHOST_USER_IF, create_vhost_user_if) \
+_(MODIFY_VHOST_USER_IF, modify_vhost_user_if) \
+_(DELETE_VHOST_USER_IF, delete_vhost_user_if) \
+_(SW_INTERFACE_VHOST_USER_DUMP, sw_interface_vhost_user_dump) \
+_(SW_INTERFACE_VHOST_USER_DETAILS, sw_interface_vhost_user_details)
+
+/*
+ * WARNING: replicated here pending completion of the api refactor
+ */
+static void
+send_sw_interface_flags_deleted (vpe_api_main_t * am,
+ unix_shared_memory_queue_t * q,
+ u32 sw_if_index)
+{
+ vl_api_sw_interface_set_flags_t *mp;
+
+ mp = vl_msg_api_alloc (sizeof (*mp));
+ memset (mp, 0, sizeof (*mp));
+ mp->_vl_msg_id = ntohs (VL_API_SW_INTERFACE_SET_FLAGS);
+ mp->sw_if_index = ntohl (sw_if_index);
+
+ mp->admin_up_down = 0;
+ mp->link_up_down = 0;
+ mp->deleted = 1;
+ vl_msg_api_send_shmem (q, (u8 *) & mp);
+}
+
+static void
+vl_api_create_vhost_user_if_t_handler (vl_api_create_vhost_user_if_t * mp)
+{
+ int rv = 0;
+ vl_api_create_vhost_user_if_reply_t *rmp;
+ u32 sw_if_index = (u32) ~ 0;
+ vnet_main_t *vnm = vnet_get_main ();
+ vlib_main_t *vm = vlib_get_main ();
+
+ rv = vhost_user_create_if (vnm, vm, (char *) mp->sock_filename,
+ mp->is_server, &sw_if_index, (u64) ~ 0,
+ mp->renumber, ntohl (mp->custom_dev_instance),
+ (mp->use_custom_mac) ? mp->mac_address : NULL);
+
+ /* Remember an interface tag for the new interface */
+ if (rv == 0)
+ {
+ /* If a tag was supplied... */
+ if (mp->tag[0])
+ {
+ /* Make sure it's a proper C-string */
+ mp->tag[ARRAY_LEN (mp->tag) - 1] = 0;
+ u8 *tag = format (0, "%s%c", mp->tag, 0);
+ vnet_set_sw_interface_tag (vnm, tag, sw_if_index);
+ }
+ }
+
+ /* *INDENT-OFF* */
+ REPLY_MACRO2(VL_API_CREATE_VHOST_USER_IF_REPLY,
+ ({
+ rmp->sw_if_index = ntohl (sw_if_index);
+ }));
+ /* *INDENT-ON* */
+}
+
+static void
+vl_api_modify_vhost_user_if_t_handler (vl_api_modify_vhost_user_if_t * mp)
+{
+ int rv = 0;
+ vl_api_modify_vhost_user_if_reply_t *rmp;
+ u32 sw_if_index = ntohl (mp->sw_if_index);
+
+ vnet_main_t *vnm = vnet_get_main ();
+ vlib_main_t *vm = vlib_get_main ();
+
+ rv = vhost_user_modify_if (vnm, vm, (char *) mp->sock_filename,
+ mp->is_server, sw_if_index, (u64) ~ 0,
+ mp->renumber, ntohl (mp->custom_dev_instance));
+
+ REPLY_MACRO (VL_API_MODIFY_VHOST_USER_IF_REPLY);
+}
+
+static void
+vl_api_delete_vhost_user_if_t_handler (vl_api_delete_vhost_user_if_t * mp)
+{
+ int rv = 0;
+ vl_api_delete_vhost_user_if_reply_t *rmp;
+ vpe_api_main_t *vam = &vpe_api_main;
+ u32 sw_if_index = ntohl (mp->sw_if_index);
+
+ vnet_main_t *vnm = vnet_get_main ();
+ vlib_main_t *vm = vlib_get_main ();
+
+ rv = vhost_user_delete_if (vnm, vm, sw_if_index);
+
+ REPLY_MACRO (VL_API_DELETE_VHOST_USER_IF_REPLY);
+ if (!rv)
+ {
+ unix_shared_memory_queue_t *q =
+ vl_api_client_index_to_input_queue (mp->client_index);
+ if (!q)
+ return;
+
+ vnet_clear_sw_interface_tag (vnm, sw_if_index);
+ send_sw_interface_flags_deleted (vam, q, sw_if_index);
+ }
+}
+
+static void
+ vl_api_sw_interface_vhost_user_details_t_handler
+ (vl_api_sw_interface_vhost_user_details_t * mp)
+{
+ clib_warning ("BUG");
+}
+
+static void
+send_sw_interface_vhost_user_details (vpe_api_main_t * am,
+ unix_shared_memory_queue_t * q,
+ vhost_user_intf_details_t * vui,
+ u32 context)
+{
+ vl_api_sw_interface_vhost_user_details_t *mp;
+
+ mp = vl_msg_api_alloc (sizeof (*mp));
+ memset (mp, 0, sizeof (*mp));
+ mp->_vl_msg_id = ntohs (VL_API_SW_INTERFACE_VHOST_USER_DETAILS);
+ mp->sw_if_index = ntohl (vui->sw_if_index);
+ mp->virtio_net_hdr_sz = ntohl (vui->virtio_net_hdr_sz);
+ mp->features = clib_net_to_host_u64 (vui->features);
+ mp->is_server = vui->is_server;
+ mp->num_regions = ntohl (vui->num_regions);
+ mp->sock_errno = ntohl (vui->sock_errno);
+ mp->context = context;
+
+ strncpy ((char *) mp->sock_filename,
+ (char *) vui->sock_filename, ARRAY_LEN (mp->sock_filename) - 1);
+ strncpy ((char *) mp->interface_name,
+ (char *) vui->if_name, ARRAY_LEN (mp->interface_name) - 1);
+
+ vl_msg_api_send_shmem (q, (u8 *) & mp);
+}
+
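+/*
+ * Dump/details exchange: a single sw_interface_vhost_user_dump request
+ * causes one sw_interface_vhost_user_details message to be sent back on
+ * the client's input queue for every interface reported by
+ * vhost_user_dump_ifs(), each carrying the caller's context so the
+ * replies can be matched to the request.
+ */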
+static void
+ vl_api_sw_interface_vhost_user_dump_t_handler
+ (vl_api_sw_interface_vhost_user_dump_t * mp)
+{
+ int rv = 0;
+ vpe_api_main_t *am = &vpe_api_main;
+ vnet_main_t *vnm = vnet_get_main ();
+ vlib_main_t *vm = vlib_get_main ();
+ vhost_user_intf_details_t *ifaces = NULL;
+ vhost_user_intf_details_t *vuid = NULL;
+ unix_shared_memory_queue_t *q;
+
+ q = vl_api_client_index_to_input_queue (mp->client_index);
+ if (q == 0)
+ return;
+
+ rv = vhost_user_dump_ifs (vnm, vm, &ifaces);
+ if (rv)
+ return;
+
+ vec_foreach (vuid, ifaces)
+ {
+ send_sw_interface_vhost_user_details (am, q, vuid, mp->context);
+ }
+ vec_free (ifaces);
+}
+
+/*
+ * vhost_user_api_hookup
+ * Add vpe's API message handlers to the table.
+ * vlib has already mapped shared memory and
+ * added the client registration handlers.
+ * See .../vlib-api/vlibmemory/memclnt_vlib.c:memclnt_process()
+ */
+#define vl_msg_name_crc_list
+#include <vnet/vnet_all_api_h.h>
+#undef vl_msg_name_crc_list
+
+static void
+setup_message_id_table (api_main_t * am)
+{
+#define _(id,n,crc) vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id);
+ foreach_vl_msg_name_crc_vhost_user;
+#undef _
+}
+
+static clib_error_t *
+vhost_user_api_hookup (vlib_main_t * vm)
+{
+ api_main_t *am = &api_main;
+
+#define _(N,n) \
+ vl_msg_api_set_handlers(VL_API_##N, #n, \
+ vl_api_##n##_t_handler, \
+ vl_noop_handler, \
+ vl_api_##n##_t_endian, \
+ vl_api_##n##_t_print, \
+ sizeof(vl_api_##n##_t), 1);
+ foreach_vpe_api_msg;
+#undef _
+
+ /*
+ * Set up the (msg_name, crc, message-id) table
+ */
+ setup_message_id_table (am);
+
+ return 0;
+}
+
+VLIB_API_INIT_FUNCTION (vhost_user_api_hookup);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
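For reference, a sketch of what the registration loop in vhost_user_api_hookup() expands to for the first entry of foreach_vpe_api_msg; this is just the mechanical expansion of the "_(N,n)" macro above, shown to illustrate the pattern:

  vl_msg_api_set_handlers (VL_API_CREATE_VHOST_USER_IF,
                           "create_vhost_user_if",
                           vl_api_create_vhost_user_if_t_handler,
                           vl_noop_handler,
                           vl_api_create_vhost_user_if_t_endian,
                           vl_api_create_vhost_user_if_t_print,
                           sizeof (vl_api_create_vhost_user_if_t), 1);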