summary refs log tree commit diff stats
path: root/src/vnet/devices/dpdk
diff options
context:
space:
mode:
author Damjan Marion <damarion@cisco.com> 2016-12-19 23:05:39 +0100
committer Damjan Marion <damarion@cisco.com> 2016-12-28 12:25:14 +0100
commit 7cd468a3d7dee7d6c92f69a0bb7061ae208ec727 (patch)
tree 5de62f8dbd3a752f5a676ca600e43d2652d1ff1a /src/vnet/devices/dpdk
parent 696f1adec0df3b8f161862566dd9c86174302658 (diff)
Reorganize source tree to use single autotools instance
Change-Id: I7b51f88292e057c6443b12224486f2d0c9f8ae23 Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'src/vnet/devices/dpdk')
-rw-r--r-- src/vnet/devices/dpdk/cli.c 1296
-rw-r--r-- src/vnet/devices/dpdk/device.c 840
-rw-r--r-- src/vnet/devices/dpdk/dpdk.h 534
-rw-r--r-- src/vnet/devices/dpdk/dpdk_priv.h 132
-rw-r--r-- src/vnet/devices/dpdk/format.c 763
-rw-r--r-- src/vnet/devices/dpdk/hqos.c 775
-rwxr-xr-x src/vnet/devices/dpdk/init.c 1803
-rw-r--r-- src/vnet/devices/dpdk/ipsec/cli.c 141
-rw-r--r-- src/vnet/devices/dpdk/ipsec/crypto_node.c 210
-rw-r--r-- src/vnet/devices/dpdk/ipsec/dir.dox 18
-rw-r--r-- src/vnet/devices/dpdk/ipsec/dpdk_crypto_ipsec_doc.md 73
-rw-r--r-- src/vnet/devices/dpdk/ipsec/esp.h 295
-rw-r--r-- src/vnet/devices/dpdk/ipsec/esp_decrypt.c 583
-rw-r--r-- src/vnet/devices/dpdk/ipsec/esp_encrypt.c 598
-rw-r--r-- src/vnet/devices/dpdk/ipsec/ipsec.c 313
-rw-r--r-- src/vnet/devices/dpdk/ipsec/ipsec.h 227
-rw-r--r-- src/vnet/devices/dpdk/node.c 687
-rw-r--r-- src/vnet/devices/dpdk/qos_doc.md 404
18 files changed, 9692 insertions, 0 deletions
diff --git a/src/vnet/devices/dpdk/cli.c b/src/vnet/devices/dpdk/cli.c
new file mode 100644
index 00000000000..538a00fd975
--- /dev/null
+++ b/src/vnet/devices/dpdk/cli.c
@@ -0,0 +1,1296 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/xxhash.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/mpls/packet.h>
+
+#include "dpdk_priv.h"
+
+/*
+ * CLI handler for "pcap tx trace".
+ *
+ * Consumes any sequence of the following keywords from input:
+ *   on            - start a tx capture (reports "already on" if running)
+ *   off           - stop the capture and, if packets were captured,
+ *                   write them out with pcap_write()
+ *   max <nn>      - packet limit picked up by the next "on" (default 100)
+ *   intfc <name>  - store the interface's sw_if_index in pcap_sw_if_index
+ *   intfc any     - reset pcap_sw_if_index to 0
+ *   file <name>   - capture file name; always placed under /tmp, names
+ *                   containing ".." or '/' are rejected
+ *   status        - report capture progress
+ *
+ * Returns 0 if at least one keyword was handled, otherwise an
+ * "unknown input" error.
+ */
+static clib_error_t *
+pcap_trace_command_fn (vlib_main_t * vm,
+                       unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  dpdk_main_t *dm = &dpdk_main;
+  u8 *filename = 0;
+  u32 max;
+  int matched = 0;
+  clib_error_t *error = 0;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "on"))
+        {
+          if (dm->tx_pcap_enable == 0)
+            {
+              if (dm->pcap_filename == 0)
+                dm->pcap_filename = format (0, "/tmp/vpe.pcap%c", 0);
+
+              memset (&dm->pcap_main, 0, sizeof (dm->pcap_main));
+              dm->pcap_main.file_name = (char *) dm->pcap_filename;
+              /* Default limit, overridden by a previous "max <nn>" */
+              dm->pcap_main.n_packets_to_capture = 100;
+              if (dm->pcap_pkts_to_capture)
+                dm->pcap_main.n_packets_to_capture = dm->pcap_pkts_to_capture;
+
+              dm->pcap_main.packet_type = PCAP_PACKET_TYPE_ethernet;
+              dm->tx_pcap_enable = 1;
+              vlib_cli_output (vm, "pcap tx capture on...");
+            }
+          else
+            {
+              vlib_cli_output (vm, "pcap tx capture already on...");
+            }
+          matched = 1;
+        }
+      else if (unformat (input, "off"))
+        {
+          if (dm->tx_pcap_enable)
+            {
+              /* Report the real count; "status" prints the same counter */
+              vlib_cli_output (vm, "captured %d pkts...",
+                               dm->pcap_main.n_packets_captured);
+              if (dm->pcap_main.n_packets_captured)
+                {
+                  /* Set the capture target to what was actually captured
+                     before writing the file out. */
+                  dm->pcap_main.n_packets_to_capture =
+                    dm->pcap_main.n_packets_captured;
+                  error = pcap_write (&dm->pcap_main);
+                  if (error)
+                    clib_error_report (error);
+                  else
+                    vlib_cli_output (vm, "saved to %s...", dm->pcap_filename);
+                }
+            }
+          else
+            {
+              vlib_cli_output (vm, "pcap tx capture already off...");
+            }
+
+          dm->tx_pcap_enable = 0;
+          matched = 1;
+        }
+      else if (unformat (input, "max %d", &max))
+        {
+          dm->pcap_pkts_to_capture = max;
+          matched = 1;
+        }
+
+      else if (unformat (input, "intfc %U",
+                         unformat_vnet_sw_interface, dm->vnet_main,
+                         &dm->pcap_sw_if_index))
+        matched = 1;
+      else if (unformat (input, "intfc any"))
+        {
+          dm->pcap_sw_if_index = 0;
+          matched = 1;
+        }
+      else if (unformat (input, "file %s", &filename))
+        {
+          u8 *chroot_filename;
+          /* Brain-police user path input */
+          if (strstr ((char *) filename, "..")
+              || strchr ((char *) filename, '/'))
+            {
+              vlib_cli_output (vm, "illegal characters in filename '%s'",
+                               filename);
+              /* Rejected name is not kept anywhere: release it */
+              vec_free (filename);
+              continue;
+            }
+
+          chroot_filename = format (0, "/tmp/%s%c", filename, 0);
+          vec_free (filename);
+
+          if (dm->pcap_filename)
+            {
+              /* A running capture still points at the old name; retarget
+                 it before the old vector is freed. */
+              if (dm->pcap_main.file_name == (char *) dm->pcap_filename)
+                dm->pcap_main.file_name = (char *) chroot_filename;
+              vec_free (dm->pcap_filename);
+            }
+          dm->pcap_filename = chroot_filename;
+          matched = 1;
+        }
+      else if (unformat (input, "status"))
+        {
+          if (dm->tx_pcap_enable == 0)
+            vlib_cli_output (vm, "pcap tx capture is off...");
+          else
+            vlib_cli_output (vm, "pcap tx capture: %d of %d pkts...",
+                             dm->pcap_main.n_packets_captured,
+                             dm->pcap_main.n_packets_to_capture);
+          /* "status" was handled whether or not a capture is running */
+          matched = 1;
+        }
+
+      else
+        break;
+    }
+
+  if (matched == 0)
+    return clib_error_return (0, "unknown input `%U'",
+                              format_unformat_error, input);
+
+  return 0;
+}
+
+/* Registers the "pcap tx trace" CLI path with pcap_trace_command_fn as
+ * its handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (pcap_trace_command, static) = {
+ .path = "pcap tx trace",
+ .short_help =
+ "pcap tx trace on off max <nn> intfc <intfc> file <name> status",
+ .function = pcap_trace_command_fn,
+};
+/* *INDENT-ON* */
+
+
+/*
+ * CLI handler for "show dpdk buffer".
+ * Walks vm->buffer_main->pktmbuf_pools and prints, for each non-NULL
+ * mempool, its name plus the available, in-use and total object counts.
+ * NULL slots are reported rather than skipped. Always returns 0.
+ */
+static clib_error_t *
+show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input,
+                  vlib_cli_command_t * cmd)
+{
+  int pool_index;
+
+  for (pool_index = 0;
+       pool_index < vec_len (vm->buffer_main->pktmbuf_pools); pool_index++)
+    {
+      struct rte_mempool *pool = vm->buffer_main->pktmbuf_pools[pool_index];
+      unsigned n_avail, n_in_use;
+
+      if (pool == 0)
+        {
+          vlib_cli_output (vm, "rte_mempool is NULL (!)\n");
+          continue;
+        }
+
+      n_avail = rte_mempool_avail_count (pool);
+      n_in_use = rte_mempool_in_use_count (pool);
+
+      vlib_cli_output (vm,
+                       "name=\"%s\" available = %7d allocated = %7d total = %7d\n",
+                       pool->name, (u32) n_avail, (u32) n_in_use,
+                       (u32) (n_avail + n_in_use));
+    }
+
+  return 0;
+}
+
+/* Registers the "show dpdk buffer" CLI path with show_dpdk_buffer as its
+ * handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_dpdk_bufferr,static) = {
+ .path = "show dpdk buffer",
+ .short_help = "show dpdk buffer state",
+ .function = show_dpdk_buffer,
+ .is_mp_safe = 1,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for "test dpdk buffer [allocate <nn>][free <nn>]".
+ * Buffer indices obtained here are remembered in a function-static vector
+ * across invocations. A "free" request is served first (from the tail of
+ * that vector), then an "allocate" request appends to it. Asking to free
+ * more buffers than are held yields an error; a short allocation only
+ * produces a warning.
+ */
+static clib_error_t *
+test_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input,
+                  vlib_cli_command_t * cmd)
+{
+  static u32 *held_buffers;
+  u32 n_alloc = 0;
+  u32 n_free = 0;
+  u32 base, n_got;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (!unformat (input, "allocate %d", &n_alloc)
+          && !unformat (input, "free %d", &n_free))
+        break;
+    }
+
+  if (n_free)
+    {
+      if (vec_len (held_buffers) < n_free)
+        return clib_error_return (0, "Can't free %d, only %d allocated",
+                                  n_free, vec_len (held_buffers));
+
+      /* Release the last n_free entries and shrink the vector */
+      base = vec_len (held_buffers) - n_free;
+      vlib_buffer_free (vm, held_buffers + base, n_free);
+      _vec_len (held_buffers) = base;
+    }
+
+  if (n_alloc)
+    {
+      base = vec_len (held_buffers);
+      /* Make room for the full request, then trim to what was obtained */
+      vec_validate (held_buffers, vec_len (held_buffers) + n_alloc - 1);
+
+      n_got = vlib_buffer_alloc (vm, held_buffers + base, n_alloc);
+      _vec_len (held_buffers) = base + n_got;
+
+      if (n_got < n_alloc)
+        vlib_cli_output (vm, "WARNING: only allocated %d buffers", n_got);
+    }
+
+  vlib_cli_output (vm, "Currently %d buffers allocated",
+                   vec_len (held_buffers));
+
+  /* Drop the backing vector once nothing is held any more */
+  if (held_buffers && vec_len (held_buffers) == 0)
+    vec_free (held_buffers);
+
+  return 0;
+}
+
+/* Registers the "test dpdk buffer" CLI path with test_dpdk_buffer as its
+ * handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_test_dpdk_buffer,static) = {
+ .path = "test dpdk buffer",
+ .short_help = "test dpdk buffer [allocate <nn>][free <nn>]",
+ .function = test_dpdk_buffer,
+ .is_mp_safe = 1,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for "set dpdk interface descriptors <if-name> [rx <n>] [tx <n>]".
+ * Updates the rx and/or tx descriptor counts stored in the device and
+ * re-runs dpdk_port_setup() for it.
+ *
+ * Returns an error when the line cannot be parsed, no interface is named,
+ * the device lacks DPDK_DEVICE_FLAG_PMD, or neither value would change;
+ * otherwise returns whatever dpdk_port_setup() returns.
+ */
+static clib_error_t *
+set_dpdk_if_desc (vlib_main_t * vm, unformat_input_t * input,
+                  vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  dpdk_main_t *dm = &dpdk_main;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 nb_rx_desc = (u32) ~ 0;
+  u32 nb_tx_desc = (u32) ~ 0;
+  clib_error_t *rv;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "tx %d", &nb_tx_desc))
+        ;
+      else if (unformat (line_input, "rx %d", &nb_rx_desc))
+        ;
+      else
+        {
+          /* Build the message first (it reads line_input), then release
+             the line buffer so the error path does not leak it. */
+          rv = clib_error_return (0, "parse error: '%U'",
+                                  format_unformat_error, line_input);
+          unformat_free (line_input);
+          return rv;
+        }
+    }
+
+  unformat_free (line_input);
+
+  if (hw_if_index == (u32) ~ 0)
+    return clib_error_return (0, "please specify valid interface name");
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0)
+    return clib_error_return (0, "number of descriptors can be set only for "
+                              "physical devices");
+
+  /* (u32) ~0 means "not given on the command line" */
+  if ((nb_rx_desc == (u32) ~ 0 || nb_rx_desc == xd->nb_rx_desc) &&
+      (nb_tx_desc == (u32) ~ 0 || nb_tx_desc == xd->nb_tx_desc))
+    return clib_error_return (0, "nothing changed");
+
+  if (nb_rx_desc != (u32) ~ 0)
+    xd->nb_rx_desc = nb_rx_desc;
+
+  if (nb_tx_desc != (u32) ~ 0)
+    xd->nb_tx_desc = nb_tx_desc;
+
+  rv = dpdk_port_setup (dm, xd);
+
+  return rv;
+}
+
+/* Registers the "set dpdk interface descriptors" CLI path with
+ * set_dpdk_if_desc as its handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_desc,static) = {
+ .path = "set dpdk interface descriptors",
+ .short_help = "set dpdk interface descriptors <if-name> [rx <n>] [tx <n>]",
+ .function = set_dpdk_if_desc,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for "show dpdk interface placement".
+ * For each entry of dm->devices_by_cpu, prints a thread header (only when
+ * that entry is non-empty) followed by one line per interface queue
+ * listed in it. Always returns 0.
+ */
+static clib_error_t *
+show_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input,
+                        vlib_cli_command_t * cmd)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  dpdk_main_t *dm = &dpdk_main;
+  int thread_index;
+
+  if (tm->n_vlib_mains == 1)
+    vlib_cli_output (vm, "All interfaces are handled by main thread");
+
+  for (thread_index = 0; thread_index < vec_len (dm->devices_by_cpu);
+       thread_index++)
+    {
+      dpdk_device_and_queue_t *queues = dm->devices_by_cpu[thread_index];
+      uword n_queues = vec_len (queues);
+      uword k;
+
+      if (n_queues)
+        vlib_cli_output (vm, "Thread %u (%s at lcore %u):", thread_index,
+                         vlib_worker_threads[thread_index].name,
+                         vlib_worker_threads[thread_index].lcore_id);
+
+      for (k = 0; k < n_queues; k++)
+        {
+          dpdk_device_and_queue_t *dq = &queues[k];
+          u32 hw_if_index = dm->devices[dq->device].vlib_hw_if_index;
+          vnet_hw_interface_t *hi =
+            vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+
+          vlib_cli_output (vm, " %v queue %u", hi->name, dq->queue_id);
+        }
+    }
+
+  return 0;
+}
+
+/* Registers the "show dpdk interface placement" CLI path with
+ * show_dpdk_if_placement as its handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_dpdk_if_placement,static) = {
+ .path = "show dpdk interface placement",
+ .short_help = "show dpdk interface placement",
+ .function = show_dpdk_if_placement,
+};
+/* *INDENT-ON* */
+
+/*
+ * Comparison callback for vec_sort_with_function() over
+ * dpdk_device_and_queue_t elements: orders by device first and by
+ * queue_id second. Returns -1, 0 or 1.
+ */
+static int
+dpdk_device_queue_sort (void *a1, void *a2)
+{
+  dpdk_device_and_queue_t *lhs = a1;
+  dpdk_device_and_queue_t *rhs = a2;
+
+  if (lhs->device != rhs->device)
+    return (lhs->device > rhs->device) ? 1 : -1;
+
+  if (lhs->queue_id != rhs->queue_id)
+    return (lhs->queue_id > rhs->queue_id) ? 1 : -1;
+
+  return 0;
+}
+
+/*
+ * CLI handler for
+ * "set dpdk interface placement <if-name> [queue <n>] thread <n>".
+ * Finds the (interface, queue) entry in dm->devices_by_cpu, moves it to
+ * the entry of the requested thread, re-sorts both per-thread vectors
+ * and updates the dpdk-input node state on both threads: disabled when
+ * the source thread has no queues left, polling when the destination
+ * thread just received its first queue.
+ *
+ * queue defaults to 0. The thread must lie within
+ * [input_cpu_first_index, input_cpu_first_index + input_cpu_count).
+ * Returns 0 on success (including "already placed there"), otherwise an
+ * error.
+ */
+static clib_error_t *
+set_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input,
+                       vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  dpdk_main_t *dm = &dpdk_main;
+  dpdk_device_and_queue_t *dq;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  clib_error_t *error;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 queue = (u32) 0;
+  u32 cpu = (u32) ~ 0;
+  int i;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "queue %d", &queue))
+        ;
+      else if (unformat (line_input, "thread %d", &cpu))
+        ;
+      else
+        {
+          /* Build the message first (it reads line_input), then release
+             the line buffer so the error path does not leak it. */
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          unformat_free (line_input);
+          return error;
+        }
+    }
+
+  unformat_free (line_input);
+
+  if (hw_if_index == (u32) ~ 0)
+    return clib_error_return (0, "please specify valid interface name");
+
+  /* An omitted thread leaves cpu at ~0, which fails this range check */
+  if (cpu < dm->input_cpu_first_index ||
+      cpu >= (dm->input_cpu_first_index + dm->input_cpu_count))
+    return clib_error_return (0, "please specify valid thread id");
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  for (i = 0; i < vec_len (dm->devices_by_cpu); i++)
+    {
+      /* *INDENT-OFF* */
+      vec_foreach(dq, dm->devices_by_cpu[i])
+        {
+          if (hw_if_index == dm->devices[dq->device].vlib_hw_if_index &&
+              queue == dq->queue_id)
+            {
+              if (cpu == i) /* nothing to do */
+                return 0;
+
+              /* Move the entry; dq is re-pointed at the new slot by
+                 vec_add2 and the function returns before the iteration
+                 over the modified vector would continue. */
+              vec_del1(dm->devices_by_cpu[i], dq - dm->devices_by_cpu[i]);
+              vec_add2(dm->devices_by_cpu[cpu], dq, 1);
+              dq->queue_id = queue;
+              dq->device = xd->device_index;
+              xd->cpu_socket_id_by_queue[queue] =
+                rte_lcore_to_socket_id(vlib_worker_threads[cpu].lcore_id);
+
+              vec_sort_with_function(dm->devices_by_cpu[i],
+                                     dpdk_device_queue_sort);
+
+              vec_sort_with_function(dm->devices_by_cpu[cpu],
+                                     dpdk_device_queue_sort);
+
+              if (vec_len(dm->devices_by_cpu[i]) == 0)
+                vlib_node_set_state (vlib_mains[i], dpdk_input_node.index,
+                                     VLIB_NODE_STATE_DISABLED);
+
+              if (vec_len(dm->devices_by_cpu[cpu]) == 1)
+                vlib_node_set_state (vlib_mains[cpu], dpdk_input_node.index,
+                                     VLIB_NODE_STATE_POLLING);
+
+              return 0;
+            }
+        }
+      /* *INDENT-ON* */
+    }
+
+  return clib_error_return (0, "not found");
+}
+
+/* Registers the "set dpdk interface placement" CLI path with
+ * set_dpdk_if_placement as its handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_placement,static) = {
+ .path = "set dpdk interface placement",
+ .short_help = "set dpdk interface placement <if-name> [queue <n>] thread <n>",
+ .function = set_dpdk_if_placement,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for "show dpdk interface hqos placement".
+ * For each entry of dm->devices_by_hqos_cpu, prints a thread header (only
+ * when that entry is non-empty) followed by one line per interface queue
+ * listed in it. Always returns 0.
+ */
+static clib_error_t *
+show_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input,
+                             vlib_cli_command_t * cmd)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  dpdk_main_t *dm = &dpdk_main;
+  int thread_index;
+
+  if (tm->n_vlib_mains == 1)
+    vlib_cli_output (vm, "All interfaces are handled by main thread");
+
+  for (thread_index = 0; thread_index < vec_len (dm->devices_by_hqos_cpu);
+       thread_index++)
+    {
+      dpdk_device_and_queue_t *queues =
+        dm->devices_by_hqos_cpu[thread_index];
+      uword n_queues = vec_len (queues);
+      uword k;
+
+      if (n_queues)
+        vlib_cli_output (vm, "Thread %u (%s at lcore %u):", thread_index,
+                         vlib_worker_threads[thread_index].name,
+                         vlib_worker_threads[thread_index].lcore_id);
+
+      for (k = 0; k < n_queues; k++)
+        {
+          dpdk_device_and_queue_t *dq = &queues[k];
+          u32 hw_if_index = dm->devices[dq->device].vlib_hw_if_index;
+          vnet_hw_interface_t *hi =
+            vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+
+          vlib_cli_output (vm, " %v queue %u", hi->name, dq->queue_id);
+        }
+    }
+
+  return 0;
+}
+
+/* Registers the "show dpdk interface hqos placement" CLI path with
+ * show_dpdk_if_hqos_placement as its handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_dpdk_if_hqos_placement, static) = {
+ .path = "show dpdk interface hqos placement",
+ .short_help = "show dpdk interface hqos placement",
+ .function = show_dpdk_if_hqos_placement,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for "set dpdk interface hqos placement <if-name> thread <n>".
+ * Finds the interface's entry in dm->devices_by_hqos_cpu, moves it to the
+ * entry of the requested thread (with queue_id 0) and re-sorts both
+ * per-thread vectors.
+ *
+ * The thread must lie within
+ * [hqos_cpu_first_index, hqos_cpu_first_index + hqos_cpu_count).
+ * Returns 0 on success (including "already placed there"), otherwise an
+ * error.
+ */
+static clib_error_t *
+set_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input,
+                            vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  dpdk_main_t *dm = &dpdk_main;
+  dpdk_device_and_queue_t *dq;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  clib_error_t *error;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 cpu = (u32) ~ 0;
+  int i;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "thread %d", &cpu))
+        ;
+      else
+        {
+          /* Build the message first (it reads line_input), then release
+             the line buffer so the error path does not leak it. */
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          unformat_free (line_input);
+          return error;
+        }
+    }
+
+  unformat_free (line_input);
+
+  if (hw_if_index == (u32) ~ 0)
+    return clib_error_return (0, "please specify valid interface name");
+
+  /* An omitted thread leaves cpu at ~0, which fails this range check */
+  if (cpu < dm->hqos_cpu_first_index ||
+      cpu >= (dm->hqos_cpu_first_index + dm->hqos_cpu_count))
+    return clib_error_return (0, "please specify valid thread id");
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  for (i = 0; i < vec_len (dm->devices_by_hqos_cpu); i++)
+    {
+      vec_foreach (dq, dm->devices_by_hqos_cpu[i])
+      {
+        if (hw_if_index == dm->devices[dq->device].vlib_hw_if_index)
+          {
+            if (cpu == i)       /* nothing to do */
+              return 0;
+
+            /* Move the entry; the function returns before the iteration
+               over the modified vector would continue. */
+            vec_del1 (dm->devices_by_hqos_cpu[i],
+                      dq - dm->devices_by_hqos_cpu[i]);
+            vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+            dq->queue_id = 0;
+            dq->device = xd->device_index;
+
+            vec_sort_with_function (dm->devices_by_hqos_cpu[i],
+                                    dpdk_device_queue_sort);
+
+            vec_sort_with_function (dm->devices_by_hqos_cpu[cpu],
+                                    dpdk_device_queue_sort);
+
+            return 0;
+          }
+      }
+    }
+
+  return clib_error_return (0, "not found");
+}
+
+/* Registers the "set dpdk interface hqos placement" CLI path with
+ * set_dpdk_if_hqos_placement as its handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_placement, static) = {
+ .path = "set dpdk interface hqos placement",
+ .short_help = "set dpdk interface hqos placement <if-name> thread <n>",
+ .function = set_dpdk_if_hqos_placement,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for
+ * "set dpdk interface hqos pipe <if-name> subport <n> pipe <n> profile <n>".
+ * Passes the subport, pipe and profile ids to rte_sched_pipe_config() for
+ * the interface's HQoS scheduler. Ids not given on the command line are
+ * passed as (u32) ~0.
+ *
+ * Returns 0 on success, otherwise an error.
+ */
+static clib_error_t *
+set_dpdk_if_hqos_pipe (vlib_main_t * vm, unformat_input_t * input,
+                       vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  dpdk_main_t *dm = &dpdk_main;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  clib_error_t *error;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 subport_id = (u32) ~ 0;
+  u32 pipe_id = (u32) ~ 0;
+  u32 profile_id = (u32) ~ 0;
+  int rv;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "subport %d", &subport_id))
+        ;
+      else if (unformat (line_input, "pipe %d", &pipe_id))
+        ;
+      else if (unformat (line_input, "profile %d", &profile_id))
+        ;
+      else
+        {
+          /* Build the message first (it reads line_input), then release
+             the line buffer so the error path does not leak it. */
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          unformat_free (line_input);
+          return error;
+        }
+    }
+
+  unformat_free (line_input);
+
+  if (hw_if_index == (u32) ~ 0)
+    return clib_error_return (0, "please specify valid interface name");
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  /* NOTE(review): assumes hqos_ht stays NULL when HQoS was never set up
+     for this device - confirm against the port setup code. */
+  if (xd->hqos_ht == 0)
+    return clib_error_return (0, "HQoS not configured for this interface");
+
+  rv =
+    rte_sched_pipe_config (xd->hqos_ht->hqos, subport_id, pipe_id,
+                           profile_id);
+  if (rv)
+    return clib_error_return (0, "pipe configuration failed");
+
+  return 0;
+}
+
+/* Registers the "set dpdk interface hqos pipe" CLI path with
+ * set_dpdk_if_hqos_pipe as its handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_pipe, static) =
+{
+ .path = "set dpdk interface hqos pipe",
+ .short_help = "set dpdk interface hqos pipe <if-name> subport <n> pipe <n> "
+ "profile <n>",
+ .function = set_dpdk_if_hqos_pipe,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for
+ * "set dpdk interface hqos subport <if-name> subport <n> [rate <n>]
+ *  [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] [period <n>]".
+ * Builds an rte_sched_subport_params from the defaults below plus any
+ * overrides on the command line and hands it to
+ * rte_sched_subport_config(). "rate" also resets all four traffic class
+ * rates to the same value, so per-class overrides must follow it.
+ *
+ * Returns 0 on success, otherwise an error.
+ */
+static clib_error_t *
+set_dpdk_if_hqos_subport (vlib_main_t * vm, unformat_input_t * input,
+                          vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  dpdk_main_t *dm = &dpdk_main;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  clib_error_t *error;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 subport_id = (u32) ~ 0;
+  struct rte_sched_subport_params p = {
+    .tb_rate = 1250000000,      /* 10GbE */
+    .tb_size = 1000000,
+    .tc_rate = {1250000000, 1250000000, 1250000000, 1250000000},
+    .tc_period = 10,
+  };
+  int rv;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "subport %d", &subport_id))
+        ;
+      else if (unformat (line_input, "rate %d", &p.tb_rate))
+        {
+          p.tc_rate[0] = p.tb_rate;
+          p.tc_rate[1] = p.tb_rate;
+          p.tc_rate[2] = p.tb_rate;
+          p.tc_rate[3] = p.tb_rate;
+        }
+      else if (unformat (line_input, "bktsize %d", &p.tb_size))
+        ;
+      else if (unformat (line_input, "tc0 %d", &p.tc_rate[0]))
+        ;
+      else if (unformat (line_input, "tc1 %d", &p.tc_rate[1]))
+        ;
+      else if (unformat (line_input, "tc2 %d", &p.tc_rate[2]))
+        ;
+      else if (unformat (line_input, "tc3 %d", &p.tc_rate[3]))
+        ;
+      else if (unformat (line_input, "period %d", &p.tc_period))
+        ;
+      else
+        {
+          /* Build the message first (it reads line_input), then release
+             the line buffer so the error path does not leak it. */
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          unformat_free (line_input);
+          return error;
+        }
+    }
+
+  unformat_free (line_input);
+
+  if (hw_if_index == (u32) ~ 0)
+    return clib_error_return (0, "please specify valid interface name");
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  /* NOTE(review): assumes hqos_ht stays NULL when HQoS was never set up
+     for this device - confirm against the port setup code. */
+  if (xd->hqos_ht == 0)
+    return clib_error_return (0, "HQoS not configured for this interface");
+
+  rv = rte_sched_subport_config (xd->hqos_ht->hqos, subport_id, &p);
+  if (rv)
+    return clib_error_return (0, "subport configuration failed");
+
+  return 0;
+}
+
+/* Registers the "set dpdk interface hqos subport" CLI path with
+ * set_dpdk_if_hqos_subport as its handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_subport, static) = {
+ .path = "set dpdk interface hqos subport",
+ .short_help = "set dpdk interface hqos subport <if-name> subport <n> "
+ "[rate <n>] [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] "
+ "[period <n>]",
+ .function = set_dpdk_if_hqos_subport,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for
+ * "set dpdk interface hqos tctbl <if-name> entry <n> tc <n> queue <n>".
+ * Writes tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue into slot
+ * "entry" of the hqos_tc_table of every worker thread of the device.
+ *
+ * entry must be < 64, tc < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE and
+ * queue < RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS; omitted values stay at ~0
+ * and are rejected by those checks. Returns 0 on success, otherwise an
+ * error.
+ */
+static clib_error_t *
+set_dpdk_if_hqos_tctbl (vlib_main_t * vm, unformat_input_t * input,
+                        vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  dpdk_main_t *dm = &dpdk_main;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  clib_error_t *error;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 tc = (u32) ~ 0;
+  u32 queue = (u32) ~ 0;
+  u32 entry = (u32) ~ 0;
+  u32 val, i;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "entry %d", &entry))
+        ;
+      else if (unformat (line_input, "tc %d", &tc))
+        ;
+      else if (unformat (line_input, "queue %d", &queue))
+        ;
+      else
+        {
+          /* Build the message first (it reads line_input), then release
+             the line buffer so the error path does not leak it. */
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          unformat_free (line_input);
+          return error;
+        }
+    }
+
+  unformat_free (line_input);
+
+  if (hw_if_index == (u32) ~ 0)
+    return clib_error_return (0, "please specify valid interface name");
+  if (entry >= 64)
+    return clib_error_return (0, "invalid entry");
+  if (tc >= RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE)
+    return clib_error_return (0, "invalid traffic class");
+  if (queue >= RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS)
+    return clib_error_return (0, "invalid queue");
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  /* NOTE(review): assumes hqos_wt stays NULL when HQoS was never set up
+     for this device - confirm against the port setup code. */
+  if (xd->hqos_wt == 0)
+    return clib_error_return (0, "HQoS not configured for this interface");
+
+  /* Detect the set of worker threads */
+  uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+  /* Should never happen, shut up Coverity warning */
+  if (p == 0)
+    return clib_error_return (0, "no worker registrations?");
+
+  vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0];
+  int worker_thread_first = tr->first_index;
+  int worker_thread_count = tr->count;
+
+  /* Same translation-table value is replicated to every worker */
+  val = tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue;
+  for (i = 0; i < worker_thread_count; i++)
+    xd->hqos_wt[worker_thread_first + i].hqos_tc_table[entry] = val;
+
+  return 0;
+}
+
+/* Registers the "set dpdk interface hqos tctbl" CLI path with
+ * set_dpdk_if_hqos_tctbl as its handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_tctbl, static) = {
+ .path = "set dpdk interface hqos tctbl",
+ .short_help = "set dpdk interface hqos tctbl <if-name> entry <n> tc <n> queue <n>",
+ .function = set_dpdk_if_hqos_tctbl,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for
+ * "set dpdk interface hqos pktfield <if-name> id <n> offset <n> mask <n>".
+ * Stores the slab position (offset), slab mask and the derived shift
+ * (count of trailing zero bits of the mask) for packet field 0, 1 or 2
+ * in the per-worker HQoS state of every worker thread of the device.
+ *
+ * The mask is checked with dpdk_hqos_validate_mask() against the limit
+ * that goes with the field id: n_subports_per_port (id 0),
+ * n_pipes_per_subport (id 1) or the TC table size (id 2).
+ *
+ * Prints a message and returns 0 when HQoS is disabled in the device
+ * configuration; returns an error on parse or validation failure.
+ */
+static clib_error_t *
+set_dpdk_if_hqos_pktfield (vlib_main_t * vm, unformat_input_t * input,
+                           vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  dpdk_main_t *dm = &dpdk_main;
+
+  /* Device specific data */
+  struct rte_eth_dev_info dev_info;
+  dpdk_device_config_t *devconf = 0;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  clib_error_t *error;
+  u32 hw_if_index = (u32) ~ 0;
+
+  /* Detect the set of worker threads */
+  uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+  /* Should never happen, shut up Coverity warning */
+  if (p == 0)
+    return clib_error_return (0, "no worker registrations?");
+
+  vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0];
+  int worker_thread_first = tr->first_index;
+  int worker_thread_count = tr->count;
+
+  /* Packet field configuration */
+  u64 mask = (u64) ~ 0;
+  u32 id = (u32) ~ 0;
+  u32 offset = (u32) ~ 0;
+
+  /* HQoS params */
+  u32 n_subports_per_port, n_pipes_per_subport, tctbl_size;
+
+  u32 i;
+
+  /* Parse input arguments */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "id %d", &id))
+        ;
+      else if (unformat (line_input, "offset %d", &offset))
+        ;
+      else if (unformat (line_input, "mask %llx", &mask))
+        ;
+      else
+        {
+          /* Build the message first (it reads line_input), then release
+             the line buffer so the error path does not leak it. */
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          unformat_free (line_input);
+          return error;
+        }
+    }
+
+  unformat_free (line_input);
+
+  /* Get interface */
+  if (hw_if_index == (u32) ~ 0)
+    return clib_error_return (0, "please specify valid interface name");
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  /* p still holds the "workers" registration lookup at this point.
+     Clear it so a device without PCI info falls through to the default
+     device config instead of using that pointer as a pool index. */
+  p = 0;
+
+  rte_eth_dev_info_get (xd->device_index, &dev_info);
+  if (dev_info.pci_dev)
+    {                           /* bonded interface has no pci info */
+      vlib_pci_addr_t pci_addr;
+
+      pci_addr.domain = dev_info.pci_dev->addr.domain;
+      pci_addr.bus = dev_info.pci_dev->addr.bus;
+      pci_addr.slot = dev_info.pci_dev->addr.devid;
+      pci_addr.function = dev_info.pci_dev->addr.function;
+
+      p =
+        hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+    }
+
+  if (p)
+    devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+  else
+    devconf = &dm->conf->default_devconf;
+
+  if (devconf->hqos_enabled == 0)
+    {
+      vlib_cli_output (vm, "HQoS disabled for this interface");
+      return 0;
+    }
+
+  n_subports_per_port = devconf->hqos.port.n_subports_per_port;
+  n_pipes_per_subport = devconf->hqos.port.n_pipes_per_subport;
+  tctbl_size = RTE_DIM (devconf->hqos.tc_table);
+
+  /* Validate packet field configuration: id, offset and mask */
+  if (id >= 3)
+    return clib_error_return (0, "invalid packet field id");
+
+  switch (id)
+    {
+    case 0:
+      if (dpdk_hqos_validate_mask (mask, n_subports_per_port) != 0)
+        return clib_error_return (0, "invalid subport ID mask "
+                                  "(n_subports_per_port = %u)",
+                                  n_subports_per_port);
+      break;
+    case 1:
+      if (dpdk_hqos_validate_mask (mask, n_pipes_per_subport) != 0)
+        return clib_error_return (0, "invalid pipe ID mask "
+                                  "(n_pipes_per_subport = %u)",
+                                  n_pipes_per_subport);
+      break;
+    case 2:
+    default:
+      if (dpdk_hqos_validate_mask (mask, tctbl_size) != 0)
+        return clib_error_return (0, "invalid TC table index mask "
+                                  "(TC table size = %u)", tctbl_size);
+    }
+
+  /* Propagate packet field configuration to all workers */
+  for (i = 0; i < worker_thread_count; i++)
+    switch (id)
+      {
+      case 0:
+        xd->hqos_wt[worker_thread_first + i].hqos_field0_slabpos = offset;
+        xd->hqos_wt[worker_thread_first + i].hqos_field0_slabmask = mask;
+        xd->hqos_wt[worker_thread_first + i].hqos_field0_slabshr =
+          __builtin_ctzll (mask);
+        break;
+      case 1:
+        xd->hqos_wt[worker_thread_first + i].hqos_field1_slabpos = offset;
+        xd->hqos_wt[worker_thread_first + i].hqos_field1_slabmask = mask;
+        xd->hqos_wt[worker_thread_first + i].hqos_field1_slabshr =
+          __builtin_ctzll (mask);
+        break;
+      case 2:
+      default:
+        xd->hqos_wt[worker_thread_first + i].hqos_field2_slabpos = offset;
+        xd->hqos_wt[worker_thread_first + i].hqos_field2_slabmask = mask;
+        xd->hqos_wt[worker_thread_first + i].hqos_field2_slabshr =
+          __builtin_ctzll (mask);
+      }
+
+  return 0;
+}
+
+/* Registers the "set dpdk interface hqos pktfield" CLI path with
+ * set_dpdk_if_hqos_pktfield as its handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_pktfield, static) = {
+ .path = "set dpdk interface hqos pktfield",
+ .short_help = "set dpdk interface hqos pktfield <if-name> id <n> offset <n> "
+ "mask <n>",
+ .function = set_dpdk_if_hqos_pktfield,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+show_dpdk_if_hqos (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ dpdk_device_config_hqos_t *cfg;
+ dpdk_device_hqos_per_hqos_thread_t *ht;
+ dpdk_device_hqos_per_worker_thread_t *wk;
+ u32 *tctbl;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 profile_id, i;
+ struct rte_eth_dev_info dev_info;
+ dpdk_device_config_t *devconf = 0;
+ vlib_thread_registration_t *tr;
+ uword *p = 0;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (hw_if_index == (u32) ~ 0)
+ return clib_error_return (0, "please specify interface name!!");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ rte_eth_dev_info_get (xd->device_index, &dev_info);
+ if (dev_info.pci_dev)
+ { /* bonded interface has no pci info */
+ vlib_pci_addr_t pci_addr;
+
+ pci_addr.domain = dev_info.pci_dev->addr.domain;
+ pci_addr.bus = dev_info.pci_dev->addr.bus;
+ pci_addr.slot = dev_info.pci_dev->addr.devid;
+ pci_addr.function = dev_info.pci_dev->addr.function;
+
+ p =
+ hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+ }
+
+ if (p)
+ devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+ else
+ devconf = &dm->conf->default_devconf;
+
+ if (devconf->hqos_enabled == 0)
+ {
+ vlib_cli_output (vm, "HQoS disabled for this interface");
+ return 0;
+ }
+
+ /* Detect the set of worker threads */
+ p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+
+ /* Should never happen, shut up Coverity warning */
+ if (p == 0)
+ return clib_error_return (0, "no worker registrations?");
+
+ tr = (vlib_thread_registration_t *) p[0];
+
+ cfg = &devconf->hqos;
+ ht = xd->hqos_ht;
+ wk = &xd->hqos_wt[tr->first_index];
+ tctbl = wk->hqos_tc_table;
+
+ vlib_cli_output (vm, " Thread:");
+ vlib_cli_output (vm, " Input SWQ size = %u packets", cfg->swq_size);
+ vlib_cli_output (vm, " Enqueue burst size = %u packets",
+ ht->hqos_burst_enq);
+ vlib_cli_output (vm, " Dequeue burst size = %u packets",
+ ht->hqos_burst_deq);
+
+ vlib_cli_output (vm,
+ " Packet field 0: slab position = %4u, slab bitmask = 0x%016llx",
+ wk->hqos_field0_slabpos, wk->hqos_field0_slabmask);
+ vlib_cli_output (vm,
+ " Packet field 1: slab position = %4u, slab bitmask = 0x%016llx",
+ wk->hqos_field1_slabpos, wk->hqos_field1_slabmask);
+ vlib_cli_output (vm,
+ " Packet field 2: slab position = %4u, slab bitmask = 0x%016llx",
+ wk->hqos_field2_slabpos, wk->hqos_field2_slabmask);
+ vlib_cli_output (vm, " Packet field 2 translation table:");
+ vlib_cli_output (vm, " [ 0 .. 15]: "
+ "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u",
+ tctbl[0], tctbl[1], tctbl[2], tctbl[3],
+ tctbl[4], tctbl[5], tctbl[6], tctbl[7],
+ tctbl[8], tctbl[9], tctbl[10], tctbl[11],
+ tctbl[12], tctbl[13], tctbl[14], tctbl[15]);
+ vlib_cli_output (vm, " [16 .. 31]: "
+ "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u",
+ tctbl[16], tctbl[17], tctbl[18], tctbl[19],
+ tctbl[20], tctbl[21], tctbl[22], tctbl[23],
+ tctbl[24], tctbl[25], tctbl[26], tctbl[27],
+ tctbl[28], tctbl[29], tctbl[30], tctbl[31]);
+ vlib_cli_output (vm, " [32 .. 47]: "
+ "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u",
+ tctbl[32], tctbl[33], tctbl[34], tctbl[35],
+ tctbl[36], tctbl[37], tctbl[38], tctbl[39],
+ tctbl[40], tctbl[41], tctbl[42], tctbl[43],
+ tctbl[44], tctbl[45], tctbl[46], tctbl[47]);
+ vlib_cli_output (vm, " [48 .. 63]: "
+ "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u",
+ tctbl[48], tctbl[49], tctbl[50], tctbl[51],
+ tctbl[52], tctbl[53], tctbl[54], tctbl[55],
+ tctbl[56], tctbl[57], tctbl[58], tctbl[59],
+ tctbl[60], tctbl[61], tctbl[62], tctbl[63]);
+
+ vlib_cli_output (vm, " Port:");
+ vlib_cli_output (vm, " Rate = %u bytes/second", cfg->port.rate);
+ vlib_cli_output (vm, " MTU = %u bytes", cfg->port.mtu);
+ vlib_cli_output (vm, " Frame overhead = %u bytes",
+ cfg->port.frame_overhead);
+ vlib_cli_output (vm, " Number of subports = %u",
+ cfg->port.n_subports_per_port);
+ vlib_cli_output (vm, " Number of pipes per subport = %u",
+ cfg->port.n_pipes_per_subport);
+ vlib_cli_output (vm,
+ " Packet queue size: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u packets",
+ cfg->port.qsize[0], cfg->port.qsize[1], cfg->port.qsize[2],
+ cfg->port.qsize[3]);
+ vlib_cli_output (vm, " Number of pipe profiles = %u",
+ cfg->port.n_pipe_profiles);
+
+ for (profile_id = 0; profile_id < vec_len (cfg->pipe); profile_id++)
+ {
+ vlib_cli_output (vm, " Pipe profile %u:", profile_id);
+ vlib_cli_output (vm, " Rate = %u bytes/second",
+ cfg->pipe[profile_id].tb_rate);
+ vlib_cli_output (vm, " Token bucket size = %u bytes",
+ cfg->pipe[profile_id].tb_size);
+ vlib_cli_output (vm,
+ " Traffic class rate: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u bytes/second",
+ cfg->pipe[profile_id].tc_rate[0],
+ cfg->pipe[profile_id].tc_rate[1],
+ cfg->pipe[profile_id].tc_rate[2],
+ cfg->pipe[profile_id].tc_rate[3]);
+ vlib_cli_output (vm, " TC period = %u milliseconds",
+ cfg->pipe[profile_id].tc_period);
+#ifdef RTE_SCHED_SUBPORT_TC_OV
+ vlib_cli_output (vm, " TC3 oversubscription_weight = %u",
+ cfg->pipe[profile_id].tc_ov_weight);
+#endif
+
+ for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
+ {
+ vlib_cli_output (vm,
+ " TC%u WRR weights: Q0 = %u, Q1 = %u, Q2 = %u, Q3 = %u",
+ i, cfg->pipe[profile_id].wrr_weights[i * 4],
+ cfg->pipe[profile_id].wrr_weights[i * 4 + 1],
+ cfg->pipe[profile_id].wrr_weights[i * 4 + 2],
+ cfg->pipe[profile_id].wrr_weights[i * 4 + 3]);
+ }
+ }
+
+#ifdef RTE_SCHED_RED
+ vlib_cli_output (vm, " Weighted Random Early Detection (WRED):");
+ for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
+ {
+ vlib_cli_output (vm, " TC%u min: G = %u, Y = %u, R = %u", i,
+ cfg->port.red_params[i][e_RTE_METER_GREEN].min_th,
+ cfg->port.red_params[i][e_RTE_METER_YELLOW].min_th,
+ cfg->port.red_params[i][e_RTE_METER_RED].min_th);
+
+ vlib_cli_output (vm, " TC%u max: G = %u, Y = %u, R = %u", i,
+ cfg->port.red_params[i][e_RTE_METER_GREEN].max_th,
+ cfg->port.red_params[i][e_RTE_METER_YELLOW].max_th,
+ cfg->port.red_params[i][e_RTE_METER_RED].max_th);
+
+ vlib_cli_output (vm,
+ " TC%u inverted probability: G = %u, Y = %u, R = %u",
+ i, cfg->port.red_params[i][e_RTE_METER_GREEN].maxp_inv,
+ cfg->port.red_params[i][e_RTE_METER_YELLOW].maxp_inv,
+ cfg->port.red_params[i][e_RTE_METER_RED].maxp_inv);
+
+ vlib_cli_output (vm, " TC%u weight: R = %u, Y = %u, R = %u", i,
+ cfg->port.red_params[i][e_RTE_METER_GREEN].wq_log2,
+ cfg->port.red_params[i][e_RTE_METER_YELLOW].wq_log2,
+ cfg->port.red_params[i][e_RTE_METER_RED].wq_log2);
+ }
+#endif
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+/*
+ * CLI registration: the command path "show dpdk interface hqos" is handled
+ * by show_dpdk_if_hqos; short_help documents the expected <if-name> argument.
+ */
+VLIB_CLI_COMMAND (cmd_show_dpdk_if_hqos, static) = {
+ .path = "show dpdk interface hqos",
+ .short_help = "show dpdk interface hqos <if-name>",
+ .function = show_dpdk_if_hqos,
+};
+
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for "show dpdk hqos queue": print the scheduler statistics of
+ * a single HQoS queue.
+ *
+ * The queue is selected by interface name plus "subport", "pipe", "tc" and
+ * "tc_q" (queue within the traffic class); all of them are required.
+ *
+ * Returns 0 on success, and also when HQoS is disabled on the interface
+ * (that case is only reported on the CLI). Returns a clib error when the
+ * input cannot be parsed, when a selector is missing or out of range for
+ * the interface's HQoS configuration, or when DPDK fails to read the stats.
+ */
+static clib_error_t *
+show_dpdk_hqos_queue_stats (vlib_main_t * vm, unformat_input_t * input,
+                            vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  dpdk_main_t *dm = &dpdk_main;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 subport = (u32) ~ 0;
+  u32 pipe = (u32) ~ 0;
+  u32 tc = (u32) ~ 0;
+  u32 tc_q = (u32) ~ 0;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  uword *p = 0;
+  struct rte_eth_dev_info dev_info;
+  dpdk_device_config_t *devconf = 0;
+  u32 qindex;
+  struct rte_sched_queue_stats stats;
+  u16 qlen;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "subport %d", &subport))
+        ;
+      else if (unformat (line_input, "pipe %d", &pipe))
+        ;
+      else if (unformat (line_input, "tc %d", &tc))
+        ;
+      else if (unformat (line_input, "tc_q %d", &tc_q))
+        ;
+      else
+        {
+          /*
+           * Format the message before releasing line_input (the message
+           * is built from it), then free it so this path does not leak.
+           */
+          clib_error_t *error =
+            clib_error_return (0, "parse error: '%U'",
+                               format_unformat_error, line_input);
+          unformat_free (line_input);
+          return error;
+        }
+    }
+
+  unformat_free (line_input);
+
+  if (hw_if_index == (u32) ~ 0)
+    return clib_error_return (0, "please specify interface name!!");
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  rte_eth_dev_info_get (xd->device_index, &dev_info);
+  if (dev_info.pci_dev)
+    {                           /* bonded interface has no pci info */
+      vlib_pci_addr_t pci_addr;
+
+      pci_addr.domain = dev_info.pci_dev->addr.domain;
+      pci_addr.bus = dev_info.pci_dev->addr.bus;
+      pci_addr.slot = dev_info.pci_dev->addr.devid;
+      pci_addr.function = dev_info.pci_dev->addr.function;
+
+      p =
+        hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+    }
+
+  /* Per-device configuration if one exists, the default one otherwise */
+  if (p)
+    devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+  else
+    devconf = &dm->conf->default_devconf;
+
+  if (devconf->hqos_enabled == 0)
+    {
+      vlib_cli_output (vm, "HQoS disabled for this interface");
+      return 0;
+    }
+
+  /*
+   * All four selectors are needed to name a queue. Left at ~0 they would
+   * feed the wrapping u32 arithmetic below and could alias another queue.
+   */
+  if (subport == (u32) ~ 0 || pipe == (u32) ~ 0 || tc == (u32) ~ 0
+      || tc_q == (u32) ~ 0)
+    return clib_error_return (0, "please specify subport, pipe, tc and tc_q");
+
+  /* Out-of-range selectors would silently address a different queue */
+  if (subport >= devconf->hqos.port.n_subports_per_port
+      || pipe >= devconf->hqos.port.n_pipes_per_subport
+      || tc >= RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE
+      || tc_q >= RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS)
+    return clib_error_return (0, "subport/pipe/tc/tc_q out of range");
+
+  /*
+   * Figure out which queue to query. cf rte_sched_port_qindex. (Not sure why
+   * that method isn't made public by DPDK - how _should_ we get the queue ID?)
+   */
+  qindex = subport * devconf->hqos.port.n_pipes_per_subport + pipe;
+  qindex = qindex * RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE + tc;
+  qindex = qindex * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + tc_q;
+
+  if (rte_sched_queue_read_stats (xd->hqos_ht->hqos, qindex, &stats, &qlen) !=
+      0)
+    return clib_error_return (0, "failed to read stats");
+
+  vlib_cli_output (vm, "%=24s%=16s", "Stats Parameter", "Value");
+  vlib_cli_output (vm, "%=24s%=16d", "Packets", stats.n_pkts);
+  vlib_cli_output (vm, "%=24s%=16d", "Packets dropped", stats.n_pkts_dropped);
+#ifdef RTE_SCHED_RED
+  vlib_cli_output (vm, "%=24s%=16d", "Packets dropped (RED)",
+                   stats.n_pkts_red_dropped);
+#endif
+  vlib_cli_output (vm, "%=24s%=16d", "Bytes", stats.n_bytes);
+  vlib_cli_output (vm, "%=24s%=16d", "Bytes dropped", stats.n_bytes_dropped);
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+/*
+ * CLI registration: the command path "show dpdk hqos queue" is handled by
+ * show_dpdk_hqos_queue_stats; short_help lists the selectors it parses.
+ */
+VLIB_CLI_COMMAND (cmd_show_dpdk_hqos_queue_stats, static) = {
+ .path = "show dpdk hqos queue",
+ .short_help = "show dpdk hqos queue <if-name> subport <subport> pipe <pipe> tc <tc> tc_q <tc_q>",
+ .function = show_dpdk_hqos_queue_stats,
+};
+/* *INDENT-ON* */
+
+/*
+ * Init function for this file. It performs no work and always returns 0
+ * (no error); it is registered through VLIB_INIT_FUNCTION below.
+ */
+clib_error_t *
+dpdk_cli_init (vlib_main_t * vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (dpdk_cli_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/device.c b/src/vnet/devices/dpdk/device.c
new file mode 100644
index 00000000000..b22fbf2e69e
--- /dev/null
+++ b/src/vnet/devices/dpdk/device.c
@@ -0,0 +1,840 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/format.h>
+#include <vlib/unix/cj.h>
+#include <assert.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include "dpdk_priv.h"
+#include <vppinfra/error.h>
+
+/*
+ * X-macro listing the TX-function error counters as _(symbol suffix, text).
+ * It is expanded twice below: once into the dpdk_tx_func_error_t enum and
+ * once into the string table, so both always have the same entries in the
+ * same order.
+ */
+#define foreach_dpdk_tx_func_error \
+ _(BAD_RETVAL, "DPDK tx function returned an error") \
+ _(RING_FULL, "Tx packet drops (ring full)") \
+ _(PKT_DROP, "Tx packet drops (dpdk tx failure)") \
+ _(REPL_FAIL, "Tx packet drops (replication failure)")
+
+/* One DPDK_TX_FUNC_ERROR_<suffix> per entry, followed by the entry count */
+typedef enum
+{
+#define _(f,s) DPDK_TX_FUNC_ERROR_##f,
+ foreach_dpdk_tx_func_error
+#undef _
+ DPDK_TX_FUNC_N_ERROR,
+} dpdk_tx_func_error_t;
+
+/* Human-readable text for each error, indexed by dpdk_tx_func_error_t */
+static char *dpdk_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_dpdk_tx_func_error
+#undef _
+};
+
+/*
+ * Program 'address' as the default MAC address of the DPDK port that backs
+ * hardware interface 'hi'.
+ *
+ * Returns NULL on success, or a clib error carrying the DPDK return code.
+ */
+clib_error_t *
+dpdk_set_mac_address (vnet_hw_interface_t * hi, char *address)
+{
+  dpdk_device_t *xd = vec_elt_at_index (dpdk_main.devices, hi->dev_instance);
+  int rc = rte_eth_dev_default_mac_addr_set (xd->device_index,
+                                             (struct ether_addr *) address);
+
+  if (rc == 0)
+    return NULL;
+
+  return clib_error_return (0, "mac address set failed: %d", rc);
+}
+
+/*
+ * Install the 'naddr' multicast addresses in mc_addr_vec as the multicast
+ * address list of the DPDK port that backs hardware interface 'hi'.
+ *
+ * Returns NULL on success, or a clib error carrying the DPDK return code.
+ */
+clib_error_t *
+dpdk_set_mc_filter (vnet_hw_interface_t * hi,
+                    struct ether_addr mc_addr_vec[], int naddr)
+{
+  dpdk_device_t *xd = vec_elt_at_index (dpdk_main.devices, hi->dev_instance);
+  int rc = rte_eth_dev_set_mc_addr_list (xd->device_index, mc_addr_vec,
+                                         naddr);
+
+  if (rc == 0)
+    return NULL;
+
+  return clib_error_return (0, "mc addr list failed: %d", rc);
+}
+
+/*
+ * Make a deep copy of the rte_mbuf chain that sits in front of vlib buffer
+ * 'b'. The copy is allocated from the mbuf pool of the calling thread's
+ * socket and carries one segment per source segment.
+ *
+ * Returns the head of the new chain, or 0 when the bulk allocation fails.
+ */
+struct rte_mbuf *
+dpdk_replicate_packet_mb (vlib_buffer_t * b)
+{
+  vlib_main_t *vm = vlib_get_main ();
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  struct rte_mbuf **mbufs = 0, *s, *d;
+  u8 nb_segs;
+  unsigned socket_id = rte_socket_id ();
+  int i;
+
+  ASSERT (bm->pktmbuf_pools[socket_id]);
+  s = rte_mbuf_from_vlib_buffer (b);
+  nb_segs = s->nb_segs;
+  vec_validate (mbufs, nb_segs - 1);
+
+  if (rte_pktmbuf_alloc_bulk (bm->pktmbuf_pools[socket_id], mbufs, nb_segs))
+    {
+      vec_free (mbufs);
+      return 0;
+    }
+
+  /* Head segment: it also carries the chain-wide nb_segs and pkt_len */
+  d = mbufs[0];
+  d->nb_segs = s->nb_segs;
+  d->data_len = s->data_len;
+  d->pkt_len = s->pkt_len;
+  d->data_off = s->data_off;
+  /*
+   * Copy from the start of the buffer up to the end of the packet data.
+   * The length is derived from data_off rather than a fixed headroom so
+   * the whole payload is copied wherever the data starts in the buffer.
+   */
+  clib_memcpy (d->buf_addr, s->buf_addr, s->data_off + s->data_len);
+
+  for (i = 1; i < nb_segs; i++)
+    {
+      d->next = mbufs[i];
+      d = mbufs[i];
+      s = s->next;
+      d->data_len = s->data_len;
+      /* keep the data at the same offset as in the source segment */
+      d->data_off = s->data_off;
+      clib_memcpy (d->buf_addr, s->buf_addr, s->data_off + s->data_len);
+    }
+
+  /* The scratch vector only held the segment pointers; the chain survives */
+  d = mbufs[0];
+  vec_free (mbufs);
+  return d;
+}
+
+/*
+ * Add a TX trace record for 'buffer': queue, device and buffer index, a
+ * copy of the rte_mbuf header, a copy of the vlib buffer header (without
+ * its pre_data area) and the first bytes of packet data, which are stored
+ * in the record's pre_data area.
+ */
+static void
+dpdk_tx_trace_buffer (dpdk_main_t * dm,
+                      vlib_node_runtime_t * node,
+                      dpdk_device_t * xd,
+                      u16 queue_id, u32 buffer_index, vlib_buffer_t * buffer)
+{
+  vlib_main_t *vm = vlib_get_main ();
+  struct rte_mbuf *mbuf = rte_mbuf_from_vlib_buffer (buffer);
+  dpdk_tx_dma_trace_t *tr;
+
+  tr = vlib_add_trace (vm, node, buffer, sizeof (*tr));
+
+  tr->queue_index = queue_id;
+  tr->device_index = xd->device_index;
+  tr->buffer_index = buffer_index;
+
+  /* snapshot of the mbuf header */
+  clib_memcpy (&tr->mb, mbuf, sizeof (tr->mb));
+
+  /* snapshot of the vlib buffer header, excluding pre_data */
+  clib_memcpy (&tr->buffer, buffer,
+               sizeof (buffer[0]) - sizeof (buffer->pre_data));
+
+  /* start of the current packet data goes into the record's pre_data */
+  clib_memcpy (tr->buffer.pre_data, buffer->data + buffer->current_data,
+               sizeof (tr->buffer.pre_data));
+}
+
+/*
+ * Fill in the rte_mbuf header(s) belonging to vlib buffer 'b' from the vlib
+ * buffer fields (current_length, current_data, chain length). It is called
+ * by dpdk_interface_tx before the mbuf is placed on the tx vector.
+ *
+ * maybe_multiseg: when non-zero the VLIB_BUFFER_NEXT_PRESENT chain is
+ * followed; when zero only the first buffer is considered.
+ */
+static_always_inline void
+dpdk_validate_rte_mbuf (vlib_main_t * vm, vlib_buffer_t * b,
+ int maybe_multiseg)
+{
+ struct rte_mbuf *mb, *first_mb, *last_mb;
+
+ /* buffer is coming from non-dpdk source so we need to init
+ rte_mbuf header */
+ if (PREDICT_FALSE ((b->flags & VNET_BUFFER_RTE_MBUF_VALID) == 0))
+ {
+ vlib_buffer_t *b2 = b;
+ last_mb = mb = rte_mbuf_from_vlib_buffer (b2);
+ rte_pktmbuf_reset (mb);
+ /* link each following mbuf to its predecessor, then reset it */
+ while (maybe_multiseg && (b2->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ b2 = vlib_get_buffer (vm, b2->next_buffer);
+ mb = rte_mbuf_from_vlib_buffer (b2);
+ last_mb->next = mb;
+ last_mb = mb;
+ rte_pktmbuf_reset (mb);
+ }
+ }
+
+ /* first segment: pkt_len covers the whole chain in the multiseg case */
+ first_mb = mb = rte_mbuf_from_vlib_buffer (b);
+ first_mb->nb_segs = 1;
+ mb->data_len = b->current_length;
+ mb->pkt_len = maybe_multiseg ? vlib_buffer_length_in_chain (vm, b) :
+ b->current_length;
+ mb->data_off = VLIB_BUFFER_PRE_DATA_SIZE + b->current_data;
+
+ /*
+ * remaining segments: set per-segment length and offset and count them in
+ * first_mb->nb_segs. This loop does not touch mb->next; linking is only
+ * done in the reset path above.
+ */
+ while (maybe_multiseg && (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ b = vlib_get_buffer (vm, b->next_buffer);
+ mb = rte_mbuf_from_vlib_buffer (b);
+ mb->data_len = b->current_length;
+ mb->pkt_len = b->current_length;
+ mb->data_off = VLIB_BUFFER_PRE_DATA_SIZE + b->current_data;
+ first_mb->nb_segs++;
+ }
+}
+
+/*
+ * This function calls the dpdk's tx_burst function to transmit the packets
+ * on the tx_vector. It manages a lock per-device if the device does not
+ * support multiple queues. It returns the number of packets untransmitted
+ * on the tx_vector. If all packets are transmitted (the normal case), the
+ * function returns 0.
+ *
+ * When DPDK_DEVICE_FLAG_HQOS is set on the device the packets are not sent
+ * to the NIC directly; they are enqueued on the calling thread's HQoS
+ * software ring (hqos->swq) instead.
+ *
+ * The function assumes there is at least one packet on the tx_vector.
+ */
+static_always_inline
+ u32 tx_burst_vector_internal (vlib_main_t * vm,
+ dpdk_device_t * xd,
+ struct rte_mbuf **tx_vector)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ u32 n_packets;
+ u32 tx_head;
+ u32 tx_tail;
+ u32 n_retry;
+ int rv;
+ int queue_id;
+ tx_ring_hdr_t *ring;
+
+ /* the ring header (tx_head/tx_tail) is stored in the vector header */
+ ring = vec_header (tx_vector, sizeof (*ring));
+
+ n_packets = ring->tx_head - ring->tx_tail;
+
+ tx_head = ring->tx_head % xd->nb_tx_desc;
+
+ /*
+ * Ensure rte_eth_tx_burst is not called with 0 packets, which can lead to
+ * unpredictable results.
+ */
+ ASSERT (n_packets > 0);
+
+ /*
+ * Check for tx_vector overflow. If this fails it is a system configuration
+ * error. The ring should be sized big enough to handle the largest un-flowed
+ * off burst from a traffic manager. A larger size also helps performance
+ * a bit because it decreases the probability of having to issue two tx_burst
+ * calls due to a ring wrap.
+ */
+ ASSERT (n_packets < xd->nb_tx_desc);
+ ASSERT (ring->tx_tail == 0);
+
+ /*
+ * NOTE(review): n_retry is never decremented in this function, so the
+ * (n_retry > 0) term of the loop condition below is always true. The loop
+ * ends only when a burst sends nothing (rv == 0), when everything has been
+ * sent, or through the error return.
+ */
+ n_retry = 16;
+ queue_id = vm->cpu_index;
+
+ do
+ {
+ /* start the burst at the tail */
+ tx_tail = ring->tx_tail % xd->nb_tx_desc;
+
+ /*
+ * This device only supports one TX queue,
+ * and we're running multi-threaded...
+ */
+ if (PREDICT_FALSE (xd->lockp != 0))
+ {
+ /* spin over the queue locks until one of them is acquired */
+ queue_id = queue_id % xd->tx_q_used;
+ while (__sync_lock_test_and_set (xd->lockp[queue_id], 1))
+ /* zzzz */
+ queue_id = (queue_id + 1) % xd->tx_q_used;
+ }
+
+ if (PREDICT_FALSE (xd->flags & DPDK_DEVICE_FLAG_HQOS)) /* HQoS ON */
+ {
+ /* no wrap, transmit in one burst */
+ dpdk_device_hqos_per_worker_thread_t *hqos =
+ &xd->hqos_wt[vm->cpu_index];
+
+ ASSERT (hqos->swq != NULL);
+
+ dpdk_hqos_metadata_set (hqos,
+ &tx_vector[tx_tail], tx_head - tx_tail);
+ rv = rte_ring_sp_enqueue_burst (hqos->swq,
+ (void **) &tx_vector[tx_tail],
+ (uint16_t) (tx_head - tx_tail));
+ }
+ else if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_PMD))
+ {
+ /* no wrap, transmit in one burst */
+ rv = rte_eth_tx_burst (xd->device_index,
+ (uint16_t) queue_id,
+ &tx_vector[tx_tail],
+ (uint16_t) (tx_head - tx_tail));
+ }
+ else
+ {
+ ASSERT (0);
+ rv = 0;
+ }
+
+ /* release the queue lock taken above */
+ if (PREDICT_FALSE (xd->lockp != 0))
+ *xd->lockp[queue_id] = 0;
+
+ if (PREDICT_FALSE (rv < 0))
+ {
+ // emit non-fatal message, bump counter
+ vnet_main_t *vnm = dm->vnet_main;
+ vnet_interface_main_t *im = &vnm->interface_main;
+ u32 node_index;
+
+ node_index = vec_elt_at_index (im->hw_interfaces,
+ xd->vlib_hw_if_index)->tx_node_index;
+
+ vlib_error_count (vm, node_index, DPDK_TX_FUNC_ERROR_BAD_RETVAL, 1);
+ clib_warning ("rte_eth_tx_burst[%d]: error %d", xd->device_index,
+ rv);
+ return n_packets; // untransmitted packets
+ }
+ /* rv packets were accepted: advance the tail past them */
+ ring->tx_tail += (u16) rv;
+ n_packets -= (uint16_t) rv;
+ }
+ while (rv && n_packets && (n_retry > 0));
+
+ return n_packets;
+}
+
+/*
+ * Prefetch (for load) the first cache line of both the vlib buffer with
+ * index 'bi' and the rte_mbuf that belongs to it.
+ */
+static_always_inline void
+dpdk_prefetch_buffer_by_index (vlib_main_t * vm, u32 bi)
+{
+  vlib_buffer_t *buf = vlib_get_buffer (vm, bi);
+  struct rte_mbuf *mbuf = rte_mbuf_from_vlib_buffer (buf);
+
+  CLIB_PREFETCH (mbuf, CLIB_CACHE_LINE_BYTES, LOAD);
+  CLIB_PREFETCH (buf, CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+/*
+ * Handle a buffer that has VLIB_BUFFER_RECYCLE set: transmit a replica
+ * instead of the original and queue the original for recycling.
+ *
+ * On successful replication *mbp is redirected to the replica. On failure
+ * the REPL_FAIL error is counted and VLIB_BUFFER_REPL_FAIL is set on the
+ * buffer, leaving *mbp untouched. In both cases 'bi' is appended to the
+ * calling thread's recycle vector. Buffers without VLIB_BUFFER_RECYCLE are
+ * left alone.
+ */
+static_always_inline void
+dpdk_buffer_recycle (vlib_main_t * vm, vlib_node_runtime_t * node,
+                     vlib_buffer_t * b, u32 bi, struct rte_mbuf **mbp)
+{
+  dpdk_main_t *dm = &dpdk_main;
+  struct rte_mbuf *replica;
+
+  if (PREDICT_FALSE (b->flags & VLIB_BUFFER_RECYCLE) == 0)
+    return;
+
+  replica = dpdk_replicate_packet_mb (b);
+  if (PREDICT_TRUE (replica != 0))
+    {
+      *mbp = replica;
+    }
+  else
+    {
+      vlib_error_count (vm, node->node_index,
+                        DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
+      b->flags |= VLIB_BUFFER_REPL_FAIL;
+    }
+
+  vec_add1 (dm->recycle[vm->cpu_index], bi);
+}
+
+/*
+ * Transmits the packets on the frame to the interface associated with the
+ * node. It first copies packets on the frame to a tx_vector containing the
+ * rte_mbuf pointers. It then passes this vector to tx_burst_vector_internal
+ * which calls the dpdk tx_burst function.
+ *
+ * Packets that could not be transmitted are counted as TX errors and freed.
+ * Buffers flagged VLIB_BUFFER_RECYCLE are replicated for transmission and
+ * the originals are freed at the end through the per-thread recycle vector.
+ *
+ * Returns the number of packets handed to DPDK, except in the ring-overflow
+ * case (see the note there).
+ */
+static uword
+dpdk_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_interface_output_runtime_t *rd = (void *) node->runtime_data;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, rd->dev_instance);
+ u32 n_packets = f->n_vectors;
+ u32 n_left;
+ u32 *from;
+ struct rte_mbuf **tx_vector;
+ u16 i;
+ u16 nb_tx_desc = xd->nb_tx_desc;
+ int queue_id;
+ u32 my_cpu;
+ u32 tx_pkts = 0;
+ tx_ring_hdr_t *ring;
+ u32 n_on_ring;
+
+ my_cpu = vm->cpu_index;
+
+ queue_id = my_cpu;
+
+ /* each thread has its own tx vector, selected by cpu index */
+ tx_vector = xd->tx_vectors[queue_id];
+ ring = vec_header (tx_vector, sizeof (*ring));
+
+ n_on_ring = ring->tx_head - ring->tx_tail;
+ from = vlib_frame_vector_args (f);
+
+ ASSERT (n_packets <= VLIB_FRAME_SIZE);
+
+ if (PREDICT_FALSE (n_on_ring + n_packets > nb_tx_desc))
+ {
+ /*
+ * Overflowing the ring should never happen.
+ * If it does then drop the whole frame.
+ */
+ vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_RING_FULL,
+ n_packets);
+
+ while (n_packets--)
+ {
+ u32 bi0 = from[n_packets];
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
+ struct rte_mbuf *mb0 = rte_mbuf_from_vlib_buffer (b0);
+ rte_pktmbuf_free (mb0);
+ }
+ /* NOTE(review): this returns the ring occupancy, not a count of
+ transmitted packets as on the normal path */
+ return n_on_ring;
+ }
+
+ /* optional pcap capture of the frame; a pcap_sw_if_index of 0 matches
+ every TX interface */
+ if (PREDICT_FALSE (dm->tx_pcap_enable))
+ {
+ n_left = n_packets;
+ while (n_left > 0)
+ {
+ u32 bi0 = from[0];
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
+ if (dm->pcap_sw_if_index == 0 ||
+ dm->pcap_sw_if_index == vnet_buffer (b0)->sw_if_index[VLIB_TX])
+ pcap_add_buffer (&dm->pcap_main, vm, bi0, 512);
+ from++;
+ n_left--;
+ }
+ }
+
+ from = vlib_frame_vector_args (f);
+ n_left = n_packets;
+ i = ring->tx_head % nb_tx_desc;
+
+ /*
+ * Quad loop: handles 4 buffers per iteration while prefetching the next 4,
+ * which is why it needs at least 8 buffers left.
+ */
+ while (n_left >= 8)
+ {
+ u32 bi0, bi1, bi2, bi3;
+ struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+ vlib_buffer_t *b0, *b1, *b2, *b3;
+ u32 or_flags;
+
+ dpdk_prefetch_buffer_by_index (vm, from[4]);
+ dpdk_prefetch_buffer_by_index (vm, from[5]);
+ dpdk_prefetch_buffer_by_index (vm, from[6]);
+ dpdk_prefetch_buffer_by_index (vm, from[7]);
+
+ bi0 = from[0];
+ bi1 = from[1];
+ bi2 = from[2];
+ bi3 = from[3];
+ from += 4;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+ b2 = vlib_get_buffer (vm, bi2);
+ b3 = vlib_get_buffer (vm, bi3);
+
+ /* combined flags let the common case take the cheaper branches below */
+ or_flags = b0->flags | b1->flags | b2->flags | b3->flags;
+
+ if (or_flags & VLIB_BUFFER_NEXT_PRESENT)
+ {
+ dpdk_validate_rte_mbuf (vm, b0, 1);
+ dpdk_validate_rte_mbuf (vm, b1, 1);
+ dpdk_validate_rte_mbuf (vm, b2, 1);
+ dpdk_validate_rte_mbuf (vm, b3, 1);
+ }
+ else
+ {
+ dpdk_validate_rte_mbuf (vm, b0, 0);
+ dpdk_validate_rte_mbuf (vm, b1, 0);
+ dpdk_validate_rte_mbuf (vm, b2, 0);
+ dpdk_validate_rte_mbuf (vm, b3, 0);
+ }
+
+ mb0 = rte_mbuf_from_vlib_buffer (b0);
+ mb1 = rte_mbuf_from_vlib_buffer (b1);
+ mb2 = rte_mbuf_from_vlib_buffer (b2);
+ mb3 = rte_mbuf_from_vlib_buffer (b3);
+
+ if (PREDICT_FALSE (or_flags & VLIB_BUFFER_RECYCLE))
+ {
+ dpdk_buffer_recycle (vm, node, b0, bi0, &mb0);
+ dpdk_buffer_recycle (vm, node, b1, bi1, &mb1);
+ dpdk_buffer_recycle (vm, node, b2, bi2, &mb2);
+ dpdk_buffer_recycle (vm, node, b3, bi3, &mb3);
+
+ /* don't enqueue packets if replication failed as they must
+ be sent back to recycle */
+ if (PREDICT_TRUE ((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ tx_vector[i++ % nb_tx_desc] = mb0;
+ if (PREDICT_TRUE ((b1->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ tx_vector[i++ % nb_tx_desc] = mb1;
+ if (PREDICT_TRUE ((b2->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ tx_vector[i++ % nb_tx_desc] = mb2;
+ if (PREDICT_TRUE ((b3->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ tx_vector[i++ % nb_tx_desc] = mb3;
+ }
+ else
+ {
+ /* only pay for the modulo when these 4 slots reach the ring end */
+ if (PREDICT_FALSE (i + 3 >= nb_tx_desc))
+ {
+ tx_vector[i++ % nb_tx_desc] = mb0;
+ tx_vector[i++ % nb_tx_desc] = mb1;
+ tx_vector[i++ % nb_tx_desc] = mb2;
+ tx_vector[i++ % nb_tx_desc] = mb3;
+ i %= nb_tx_desc;
+ }
+ else
+ {
+ tx_vector[i++] = mb0;
+ tx_vector[i++] = mb1;
+ tx_vector[i++] = mb2;
+ tx_vector[i++] = mb3;
+ }
+ }
+
+
+ if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi1, b1);
+ if (b2->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi2, b2);
+ if (b3->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi3, b3);
+ }
+
+ n_left -= 4;
+ }
+ /* single loop for the remaining buffers; always uses the multi-segment
+ validation path */
+ while (n_left > 0)
+ {
+ u32 bi0;
+ struct rte_mbuf *mb0;
+ vlib_buffer_t *b0;
+
+ bi0 = from[0];
+ from++;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ dpdk_validate_rte_mbuf (vm, b0, 1);
+
+ mb0 = rte_mbuf_from_vlib_buffer (b0);
+ dpdk_buffer_recycle (vm, node, b0, bi0, &mb0);
+
+ if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
+
+ if (PREDICT_TRUE ((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ {
+ tx_vector[i % nb_tx_desc] = mb0;
+ i++;
+ }
+ n_left--;
+ }
+
+ /* account for additional packets in the ring */
+ /* NOTE(review): tx_head advances by the full frame size even when some
+ buffers were not placed on tx_vector because of VLIB_BUFFER_REPL_FAIL */
+ ring->tx_head += n_packets;
+ n_on_ring = ring->tx_head - ring->tx_tail;
+
+ /* transmit as many packets as possible */
+ n_packets = tx_burst_vector_internal (vm, xd, tx_vector);
+
+ /*
+ * tx_pkts is the number of packets successfully transmitted
+ * This is the number originally on ring minus the number remaining on ring
+ */
+ tx_pkts = n_on_ring - n_packets;
+
+ {
+ /* If there is no callback then drop any non-transmitted packets */
+ if (PREDICT_FALSE (n_packets))
+ {
+ vlib_simple_counter_main_t *cm;
+ vnet_main_t *vnm = vnet_get_main ();
+
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_TX_ERROR);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+ n_packets);
+
+ vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_PKT_DROP,
+ n_packets);
+
+ /* NOTE(review): the index here is not wrapped with nb_tx_desc, unlike
+ the stores into tx_vector above */
+ while (n_packets--)
+ rte_pktmbuf_free (tx_vector[ring->tx_tail + n_packets]);
+ }
+
+ /* Reset head/tail to avoid unnecessary wrap */
+ ring->tx_head = 0;
+ ring->tx_tail = 0;
+ }
+
+ /* Recycle replicated buffers */
+ if (PREDICT_FALSE (vec_len (dm->recycle[my_cpu])))
+ {
+ vlib_buffer_free (vm, dm->recycle[my_cpu],
+ vec_len (dm->recycle[my_cpu]));
+ _vec_len (dm->recycle[my_cpu]) = 0;
+ }
+
+ ASSERT (ring->tx_head >= ring->tx_tail);
+
+ return tx_pkts;
+}
+
+/*
+ * "Clear" the counters of device 'instance'. The current statistics are
+ * refreshed and then saved as the last-cleared baseline, so that things
+ * appear to clear from a display perspective.
+ */
+static void
+dpdk_clear_hw_interface_counters (u32 instance)
+{
+  dpdk_main_t *dm = &dpdk_main;
+  dpdk_device_t *xd = vec_elt_at_index (dm->devices, instance);
+  uword n_xstats_bytes;
+
+  /* bring xd->stats / xd->xstats up to date first */
+  dpdk_update_counters (xd, vlib_time_now (dm->vlib_main));
+
+  clib_memcpy (&xd->last_cleared_stats, &xd->stats, sizeof (xd->stats));
+
+  /* sized after the refresh above, from the destination vector */
+  n_xstats_bytes =
+    vec_len (xd->last_cleared_xstats) * sizeof (xd->last_cleared_xstats[0]);
+  clib_memcpy (xd->last_cleared_xstats, xd->xstats, n_xstats_bytes);
+}
+
+/*
+ * Admin up/down callback of the dpdk device class.
+ *
+ * Up: start the port if it is not already admin-up, apply the configured
+ * promiscuous setting, enable all-multicast, mark the device admin-up and
+ * refresh counters and link state.
+ * Down: clear the admin-up flag, disable all-multicast, clear the hardware
+ * interface flags and stop the port; for a bonded interface the slave ports
+ * are stopped as well.
+ *
+ * A failing rte_eth_dev_start is only logged; the function always returns
+ * 0 (no error).
+ */
+static clib_error_t *
+dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+  vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index);
+  uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+  dpdk_main_t *dm = &dpdk_main;
+  dpdk_device_t *xd = vec_elt_at_index (dm->devices, hif->dev_instance);
+  int rv = 0;
+
+  if (!is_up)
+    {
+      xd->flags &= ~DPDK_DEVICE_FLAG_ADMIN_UP;
+
+      rte_eth_allmulticast_disable (xd->device_index);
+      vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
+      rte_eth_dev_stop (xd->device_index);
+
+      /* For bonded interface, stop slave links */
+      if (xd->pmd == VNET_DPDK_PMD_BOND)
+        {
+          u8 slink[16];
+          int nlink = rte_eth_bond_slaves_get (xd->device_index, slink, 16);
+          int idx;
+
+          /* walk the returned slaves from last to first */
+          for (idx = nlink - 1; idx >= 0; idx--)
+            rte_eth_dev_stop (slink[idx]);
+        }
+    }
+  else
+    {
+      f64 now = vlib_time_now (dm->vlib_main);
+
+      if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0)
+        rv = rte_eth_dev_start (xd->device_index);
+
+      if (xd->flags & DPDK_DEVICE_FLAG_PROMISC)
+        rte_eth_promiscuous_enable (xd->device_index);
+      else
+        rte_eth_promiscuous_disable (xd->device_index);
+
+      rte_eth_allmulticast_enable (xd->device_index);
+      xd->flags |= DPDK_DEVICE_FLAG_ADMIN_UP;
+      dpdk_update_counters (xd, now);
+      dpdk_update_link_state (xd, now);
+    }
+
+  if (rv < 0)
+    clib_warning ("rte_eth_dev_%s error: %d", is_up ? "start" : "stop", rv);
+
+  return /* no error */ 0;
+}
+
+/*
+ * Dynamically redirect all pkts from a specific interface
+ * to the specified node. Passing ~0 as node_index shuts the
+ * redirection off again.
+ */
+static void
+dpdk_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
+                              u32 node_index)
+{
+  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+  dpdk_device_t *xd = vec_elt_at_index (dpdk_main.devices, hw->dev_instance);
+
+  if (node_index != ~0)
+    {
+      /* add the target as a next node of dpdk-input and remember the slot */
+      xd->per_interface_next_index =
+        vlib_node_add_next (dpdk_main.vlib_main, dpdk_input_node.index,
+                            node_index);
+    }
+  else
+    {
+      /* Shut off redirection */
+      xd->per_interface_next_index = node_index;
+    }
+}
+
+
+/*
+ * Sub-interface add/delete callback of the dpdk device class.
+ *
+ * Keeps xd->num_subifs and DPDK_DEVICE_FLAG_HAVE_SUBIF up to date. For
+ * IXGBE VF and I40E VF ports it also programs the hardware VLAN filter for
+ * single-tag, exact-match sub-interfaces.
+ *
+ * Returns 0 on success. On an unsupported VLAN setup or a DPDK failure the
+ * sub-interface count is restored and a clib error is returned.
+ */
+static clib_error_t *
+dpdk_subif_add_del_function (vnet_main_t * vnm,
+                             u32 hw_if_index,
+                             struct vnet_sw_interface_t *st, int is_add)
+{
+  dpdk_main_t *xm = &dpdk_main;
+  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+  dpdk_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance);
+  vnet_sw_interface_t *sw = (vnet_sw_interface_t *) st;
+  u32 saved_subifs = xd->num_subifs;
+  clib_error_t *err = 0;
+  int rc, vlan_offload;
+
+  if (is_add)
+    xd->num_subifs++;
+  else if (xd->num_subifs)
+    xd->num_subifs--;
+
+  /*
+   * Nothing to program unless this is a PMD device of a type we handle
+   * (currently we program VLANS only for IXGBE VF and I40E VF) and the
+   * sub-interface is tagged.
+   */
+  if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0
+      || ((xd->pmd != VNET_DPDK_PMD_IXGBEVF)
+          && (xd->pmd != VNET_DPDK_PMD_I40EVF))
+      || sw->sub.eth.flags.no_tags == 1)
+    goto done;
+
+  if ((sw->sub.eth.flags.one_tag != 1)
+      || (sw->sub.eth.flags.exact_match != 1))
+    {
+      err = clib_error_return (0, "unsupported VLAN setup");
+      goto done;
+    }
+
+  vlan_offload = rte_eth_dev_get_vlan_offload (xd->device_index);
+  vlan_offload |= ETH_VLAN_FILTER_OFFLOAD;
+
+  rc = rte_eth_dev_set_vlan_offload (xd->device_index, vlan_offload);
+  if (rc)
+    {
+      err = clib_error_return (0, "rte_eth_dev_set_vlan_offload[%d]: err %d",
+                               xd->device_index, rc);
+      goto done;
+    }
+
+  rc = rte_eth_dev_vlan_filter (xd->device_index,
+                                sw->sub.eth.outer_vlan_id, is_add);
+  if (rc)
+    err = clib_error_return (0, "rte_eth_dev_vlan_filter[%d]: err %d",
+                             xd->device_index, rc);
+
+done:
+  /* every failure path undoes the count change made above */
+  if (err)
+    xd->num_subifs = saved_subifs;
+
+  if (xd->num_subifs)
+    xd->flags |= DPDK_DEVICE_FLAG_HAVE_SUBIF;
+  else
+    xd->flags &= ~DPDK_DEVICE_FLAG_HAVE_SUBIF;
+
+  return err;
+}
+
+/* *INDENT-OFF* */
+/*
+ * Registration of the "dpdk" device class. It wires up the callbacks defined
+ * in this file (TX, counter clear, admin up/down, sub-interface add/del, RX
+ * redirect, MAC address change) together with the TX error count and
+ * strings. The format_* callbacks are defined outside this file.
+ */
+VNET_DEVICE_CLASS (dpdk_device_class) = {
+ .name = "dpdk",
+ .tx_function = dpdk_interface_tx,
+ .tx_function_n_errors = DPDK_TX_FUNC_N_ERROR,
+ .tx_function_error_strings = dpdk_tx_func_error_strings,
+ .format_device_name = format_dpdk_device_name,
+ .format_device = format_dpdk_device,
+ .format_tx_trace = format_dpdk_tx_dma_trace,
+ .clear_counters = dpdk_clear_hw_interface_counters,
+ .admin_up_down_function = dpdk_interface_admin_up_down,
+ .subif_add_del_function = dpdk_subif_add_del_function,
+ .rx_redirect_to_node = dpdk_set_interface_next_node,
+ .mac_addr_change_function = dpdk_set_mac_address,
+};
+
+/* NOTE(review): presumably sets up CPU-specific variants of the TX function
+ for this device class - confirm against the macro definition */
+VLIB_DEVICE_TX_FUNCTION_MULTIARCH (dpdk_device_class, dpdk_interface_tx)
+/* *INDENT-ON* */
+
+#define UP_DOWN_FLAG_EVENT 1
+
+/*
+ * Process node that applies interface flag changes on request.
+ *
+ * It waits for events; an UP_DOWN_FLAG_EVENT carrying exactly two data
+ * words (sw_if_index, flags) is applied with vnet_sw_interface_set_flags
+ * and any resulting error is reported. While an event is being handled
+ * dpdk_main.admin_up_down_in_progress is set to 1. The loop never exits.
+ */
+uword
+admin_up_down_process (vlib_main_t * vm,
+                       vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+  uword *event_data = 0;
+
+  for (;;)
+    {
+      uword event_type;
+
+      vlib_process_wait_for_event (vm);
+
+      event_type = vlib_process_get_events (vm, &event_data);
+
+      dpdk_main.admin_up_down_in_progress = 1;
+
+      /* other event types and malformed events are silently ignored */
+      if (event_type == UP_DOWN_FLAG_EVENT && vec_len (event_data) == 2)
+        {
+          u32 sw_if_index = event_data[0];
+          u32 flags = event_data[1];
+          clib_error_t *error =
+            vnet_sw_interface_set_flags (vnet_get_main (), sw_if_index,
+                                         flags);
+
+          clib_error_report (error);
+        }
+
+      /* keep the vector allocated for the next round of events */
+      vec_reset_length (event_data);
+
+      dpdk_main.admin_up_down_in_progress = 0;
+    }
+
+  return 0;                     /* or not */
+}
+
+/* *INDENT-OFF* */
+/* Registers admin_up_down_process as the process node "admin-up-down-process" */
+VLIB_REGISTER_NODE (admin_up_down_process_node,static) = {
+ .function = admin_up_down_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "admin-up-down-process",
+ .process_log2_n_stack_bytes = 17, // 2^17 bytes = 128KB
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/dpdk.h b/src/vnet/devices/dpdk/dpdk.h
new file mode 100644
index 00000000000..d8f378d2b54
--- /dev/null
+++ b/src/vnet/devices/dpdk/dpdk.h
@@ -0,0 +1,534 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_dpdk_h__
+#define __included_dpdk_h__
+
+/* $$$$ We should rename always_inline -> clib_always_inline */
+#undef always_inline
+
+#include <rte_config.h>
+
+#include <rte_common.h>
+#include <rte_dev.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_tailq.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_atomic.h>
+#include <rte_cycles.h>
+#include <rte_prefetch.h>
+#include <rte_lcore.h>
+#include <rte_per_lcore.h>
+#include <rte_branch_prediction.h>
+#include <rte_interrupts.h>
+#include <rte_pci.h>
+#include <rte_random.h>
+#include <rte_debug.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_virtio_net.h>
+#include <rte_version.h>
+#include <rte_eth_bond.h>
+#include <rte_sched.h>
+
+#include <vnet/unix/pcap.h>
+#include <vnet/devices/devices.h>
+
+#if CLIB_DEBUG > 0
+#define always_inline static inline
+#else
+#define always_inline static inline __attribute__ ((__always_inline__))
+#endif
+
+#include <vlib/pci/pci.h>
+
+#define NB_MBUF (16<<10)
+
+extern vnet_device_class_t dpdk_device_class;
+extern vlib_node_registration_t dpdk_input_node;
+extern vlib_node_registration_t handoff_dispatch_node;
+
+/*
+ * Table of (PMD driver name string, VNET_DPDK_PMD_* suffix) pairs.
+ * The driver name strings differ before and after DPDK 16.11, hence the
+ * two variants.  Both variants must provide the same set of suffixes,
+ * because code such as format_dpdk_device_type () refers to the resulting
+ * enumerators unconditionally.
+ */
+#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0)
+#define foreach_dpdk_pmd \
+ _ ("net_thunderx", THUNDERX) \
+ _ ("net_e1000_em", E1000EM) \
+ _ ("net_e1000_igb", IGB) \
+ _ ("net_e1000_igb_vf", IGBVF) \
+ _ ("net_ixgbe", IXGBE) \
+ _ ("net_ixgbe_vf", IXGBEVF) \
+ _ ("net_i40e", I40E) \
+ _ ("net_i40e_vf", I40EVF) \
+ _ ("net_virtio", VIRTIO) \
+ _ ("net_enic", ENIC) \
+ _ ("net_vmxnet3", VMXNET3) \
+ _ ("net_af_packet", AF_PACKET) \
+ _ ("rte_bond_pmd", BOND) \
+ _ ("net_fm10k", FM10K) \
+ _ ("net_cxgbe", CXGBE) \
+ _ ("net_mlx5", MLX5) \
+ _ ("net_dpaa2", DPAA2)
+#else
+#define foreach_dpdk_pmd \
+ _ ("rte_nicvf_pmd", THUNDERX) \
+ _ ("rte_em_pmd", E1000EM) \
+ _ ("rte_igb_pmd", IGB) \
+ _ ("rte_igbvf_pmd", IGBVF) \
+ _ ("rte_ixgbe_pmd", IXGBE) \
+ _ ("rte_ixgbevf_pmd", IXGBEVF) \
+ _ ("rte_i40e_pmd", I40E) \
+ _ ("rte_i40evf_pmd", I40EVF) \
+ _ ("rte_virtio_pmd", VIRTIO) \
+ _ ("rte_enic_pmd", ENIC) \
+ _ ("rte_vmxnet3_pmd", VMXNET3) \
+ _ ("AF_PACKET PMD", AF_PACKET) \
+ _ ("rte_bond_pmd", BOND) \
+ _ ("rte_pmd_fm10k", FM10K) \
+ _ ("rte_cxgbe_pmd", CXGBE) \
+ _ ("librte_pmd_mlx5", MLX5) \
+ _ ("rte_dpaa2_dpni", DPAA2)
+#endif
+
+/*
+ * PMD identifier.  One VNET_DPDK_PMD_<suffix> enumerator is generated for
+ * each entry of foreach_dpdk_pmd, placed between VNET_DPDK_PMD_NONE (the
+ * first enumerator, value 0) and VNET_DPDK_PMD_UNKNOWN.
+ */
+typedef enum
+{
+ VNET_DPDK_PMD_NONE,
+#define _(s,f) VNET_DPDK_PMD_##f,
+ foreach_dpdk_pmd
+#undef _
+ VNET_DPDK_PMD_UNKNOWN, /* must be last */
+} dpdk_pmd_t;
+
+/*
+ * Port type of a device (dpdk_device_t.port_type).
+ * format_dpdk_device_name () picks the interface name prefix from it.
+ */
+typedef enum
+{
+ VNET_DPDK_PORT_TYPE_ETH_1G,
+ VNET_DPDK_PORT_TYPE_ETH_10G,
+ VNET_DPDK_PORT_TYPE_ETH_40G,
+ VNET_DPDK_PORT_TYPE_ETH_100G,
+ VNET_DPDK_PORT_TYPE_ETH_BOND,
+ VNET_DPDK_PORT_TYPE_ETH_SWITCH,
+ VNET_DPDK_PORT_TYPE_AF_PACKET,
+ VNET_DPDK_PORT_TYPE_UNKNOWN,
+} dpdk_port_type_t;
+
+/*
+ * The header for the tx_vector in dpdk_device_t.
+ * Head and tail are indexes into the tx_vector and are of type
+ * u64 so they never overflow.
+ */
+typedef struct
+{
+ u64 tx_head;
+ u64 tx_tail;
+} tx_ring_hdr_t;
+
+/*
+ * HQoS state referenced from dpdk_device_t.hqos_wt and passed to
+ * dpdk_hqos_metadata_set ().  Holds one rte_ring pointer plus, for each of
+ * three packet fields (field0..field2), a slab mask / position / shift
+ * triple, and a 64-entry tc table.
+ * NOTE(review): presumably one instance per worker thread, going by the
+ * type name -- the allocation is not visible in this header.
+ */
+typedef struct
+{
+ struct rte_ring *swq;
+
+ u64 hqos_field0_slabmask;
+ u32 hqos_field0_slabpos;
+ u32 hqos_field0_slabshr;
+ u64 hqos_field1_slabmask;
+ u32 hqos_field1_slabpos;
+ u32 hqos_field1_slabshr;
+ u64 hqos_field2_slabmask;
+ u32 hqos_field2_slabpos;
+ u32 hqos_field2_slabshr;
+ u32 hqos_tc_table[64];
+} dpdk_device_hqos_per_worker_thread_t;
+
+/*
+ * HQoS state referenced from dpdk_device_t.hqos_ht: an array of rte_ring
+ * pointers, enqueue/dequeue mbuf pointer arrays with their burst sizes,
+ * the rte_sched_port handle and a few running counters/positions.
+ * NOTE(review): presumably owned by the HQoS thread serving the device,
+ * going by the type name -- usage lives outside this header.
+ */
+typedef struct
+{
+ struct rte_ring **swq;
+ struct rte_mbuf **pkts_enq;
+ struct rte_mbuf **pkts_deq;
+ struct rte_sched_port *hqos;
+ u32 hqos_burst_enq;
+ u32 hqos_burst_deq;
+ u32 pkts_enq_len;
+ u32 swq_pos;
+ u32 flush_count;
+} dpdk_device_hqos_per_hqos_thread_t;
+
+/*
+ * Per-device state; dpdk_main_t.devices is a vector of these.
+ *
+ * The structure is split by two cache-line alignment marks: cacheline0
+ * starts at the top, cacheline1 starts at interface_name_suffix.
+ * NOTE(review): presumably cacheline0 is meant to hold the fields touched
+ * on the packet path -- confirm before moving fields across the marks.
+ */
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ volatile u32 **lockp;
+
+ /* Instance ID */
+ u32 device_index;
+
+ u32 vlib_hw_if_index;
+ u32 vlib_sw_if_index;
+
+ /* next node index if we decide to steal the rx graph arc */
+ u32 per_interface_next_index;
+
+ /* dpdk rte_mbuf rx and tx vectors, VLIB_FRAME_SIZE */
+ struct rte_mbuf ***tx_vectors; /* one per worker thread */
+ struct rte_mbuf ***rx_vectors;
+
+ /* vector of traced contexts, per device */
+ u32 **d_trace_buffers;
+
+ dpdk_pmd_t pmd:8;
+ i8 cpu_socket;
+
+ /* bitwise OR of the DPDK_DEVICE_FLAG_* values defined below */
+ u16 flags;
+#define DPDK_DEVICE_FLAG_ADMIN_UP (1 << 0)
+#define DPDK_DEVICE_FLAG_PROMISC (1 << 1)
+#define DPDK_DEVICE_FLAG_PMD (1 << 2)
+#define DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE (1 << 3)
+#define DPDK_DEVICE_FLAG_MAYBE_MULTISEG (1 << 4)
+#define DPDK_DEVICE_FLAG_HAVE_SUBIF (1 << 5)
+#define DPDK_DEVICE_FLAG_HQOS (1 << 6)
+
+ u16 nb_tx_desc;
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
+
+ /* when non-NULL, appended as "/<suffix>" by format_dpdk_device_name () */
+ u8 *interface_name_suffix;
+
+ /* number of sub-interfaces */
+ u16 num_subifs;
+
+ /* PMD related */
+ u16 tx_q_used;
+ u16 rx_q_used;
+ u16 nb_rx_desc;
+ u16 *cpu_socket_id_by_queue;
+ struct rte_eth_conf port_conf;
+ struct rte_eth_txconf tx_conf;
+
+ /* HQoS related */
+ dpdk_device_hqos_per_worker_thread_t *hqos_wt;
+ dpdk_device_hqos_per_hqos_thread_t *hqos_ht;
+
+ /* af_packet */
+ u8 af_packet_port_id;
+
+ struct rte_eth_link link;
+ f64 time_last_link_update;
+
+ /*
+ * stats / last_stats: current and previous snapshot taken by
+ * dpdk_update_counters (); last_cleared_*: baseline subtracted when
+ * the counters are formatted by format_dpdk_device ().
+ */
+ struct rte_eth_stats stats;
+ struct rte_eth_stats last_stats;
+ struct rte_eth_stats last_cleared_stats;
+ struct rte_eth_xstat *xstats;
+ struct rte_eth_xstat *last_cleared_xstats;
+ f64 time_last_stats_update;
+ dpdk_port_type_t port_type;
+} dpdk_device_t;
+
+#define DPDK_STATS_POLL_INTERVAL (10.0)
+#define DPDK_MIN_STATS_POLL_INTERVAL (0.001) /* 1msec */
+
+#define DPDK_LINK_POLL_INTERVAL (3.0)
+#define DPDK_MIN_LINK_POLL_INTERVAL (0.001) /* 1msec */
+
+/*
+ * Cache-line aligned per-worker record (dpdk_main_t.workers is a vector of
+ * these); vnet_get_aggregate_rx_packets () sums the counter over all
+ * entries.
+ */
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+
+ /* total input packet counter */
+ u64 aggregate_rx_packets;
+} dpdk_worker_t;
+
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+
+ /* total input packet counter */
+ u64 aggregate_rx_packets;
+} dpdk_hqos_thread_t;
+
+typedef struct
+{
+ u32 device;
+ u16 queue_id;
+} dpdk_device_and_queue_t;
+
+#ifndef DPDK_HQOS_DBG_BYPASS
+#define DPDK_HQOS_DBG_BYPASS 0
+#endif
+
+#ifndef HQOS_FLUSH_COUNT_THRESHOLD
+#define HQOS_FLUSH_COUNT_THRESHOLD 100000
+#endif
+
+/*
+ * HQoS configuration of one device (embedded in dpdk_device_config_t).
+ * Filled in by dpdk_device_config_hqos_default () / unformat_hqos () and
+ * consumed by dpdk_port_setup_hqos (), all declared in this header.
+ * The pktfieldN_* members parallel the hqos_fieldN_* members of
+ * dpdk_device_hqos_per_worker_thread_t.
+ */
+typedef struct dpdk_device_config_hqos_t
+{
+ u32 hqos_thread;
+ u32 hqos_thread_valid;
+
+ u32 swq_size;
+ u32 burst_enq;
+ u32 burst_deq;
+
+ u32 pktfield0_slabpos;
+ u32 pktfield1_slabpos;
+ u32 pktfield2_slabpos;
+ u64 pktfield0_slabmask;
+ u64 pktfield1_slabmask;
+ u64 pktfield2_slabmask;
+ u32 tc_table[64];
+
+ /* rte_sched parameter blocks */
+ struct rte_sched_port_params port;
+ struct rte_sched_subport_params *subport;
+ struct rte_sched_pipe_params *pipe;
+ uint32_t *pipe_map;
+} dpdk_device_config_hqos_t;
+
+int dpdk_hqos_validate_mask (u64 mask, u32 n);
+void dpdk_device_config_hqos_pipe_profile_default (dpdk_device_config_hqos_t *
+ hqos, u32 pipe_profile_id);
+void dpdk_device_config_hqos_default (dpdk_device_config_hqos_t * hqos);
+clib_error_t *dpdk_port_setup_hqos (dpdk_device_t * xd,
+ dpdk_device_config_hqos_t * hqos);
+void dpdk_hqos_metadata_set (dpdk_device_hqos_per_worker_thread_t * hqos,
+ struct rte_mbuf **pkts, u32 n_pkts);
+
+#define foreach_dpdk_device_config_item \
+ _ (num_rx_queues) \
+ _ (num_tx_queues) \
+ _ (num_rx_desc) \
+ _ (num_tx_desc) \
+ _ (rss_fn)
+
+/*
+ * Configuration of one device, identified by PCI address.  Used both for
+ * dpdk_config_main_t.default_devconf and for the entries of
+ * dpdk_config_main_t.dev_confs.
+ */
+typedef struct
+{
+ vlib_pci_addr_t pci_addr;
+ u8 is_blacklisted;
+ /* one of the DPDK_DEVICE_VLAN_STRIP_* values below */
+ u8 vlan_strip_offload;
+#define DPDK_DEVICE_VLAN_STRIP_DEFAULT 0
+#define DPDK_DEVICE_VLAN_STRIP_OFF 1
+#define DPDK_DEVICE_VLAN_STRIP_ON 2
+
+ /* one uword member per entry of foreach_dpdk_device_config_item */
+#define _(x) uword x;
+ foreach_dpdk_device_config_item
+#undef _
+ clib_bitmap_t * workers;
+ u32 hqos_enabled;
+ dpdk_device_config_hqos_t hqos;
+} dpdk_device_config_t;
+
+/*
+ * Global DPDK configuration: EAL arguments, a few global switches, and the
+ * per-device configuration table (dev_confs) with its lookup hash keyed by
+ * PCI address.
+ */
+typedef struct
+{
+
+ /* Config stuff */
+ u8 **eal_init_args;
+ u8 *eal_init_args_str;
+ u8 *uio_driver_name;
+ u8 no_multi_seg;
+ u8 enable_tcp_udp_checksum;
+
+ /* Required config parameters */
+ u8 coremask_set_manually;
+ u8 nchannels_set_manually;
+ u32 coremask;
+ u32 nchannels;
+ u32 num_mbufs;
+ u8 num_kni; /* while kni_init allows u32, port_id in callback fn is only u8 */
+
+ /*
+ * format interface names ala xxxEthernet%d/%d/%d instead of
+ * xxxEthernet%x/%x/%x.
+ */
+ u8 interface_name_format_decimal;
+
+ /* per-device config */
+ dpdk_device_config_t default_devconf;
+ dpdk_device_config_t *dev_confs;
+ uword *device_config_index_by_pci_addr;
+
+} dpdk_config_main_t;
+
+/*
+ * NOTE(review): this is a (tentative) definition, not an extern
+ * declaration, so every translation unit including this header defines
+ * the symbol.  That only links when the toolchain merges common symbols
+ * (it fails with -fno-common).
+ */
+dpdk_config_main_t dpdk_config_main;
+
+/*
+ * Global DPDK driver state: the device vector, per-cpu device/queue
+ * assignments, worker and HQoS thread records, pcap tracing state, polling
+ * intervals and convenience pointers to the vlib/vnet mains and the
+ * configuration (conf).
+ */
+typedef struct
+{
+
+ /* Devices */
+ dpdk_device_t *devices;
+ dpdk_device_and_queue_t **devices_by_cpu;
+ dpdk_device_and_queue_t **devices_by_hqos_cpu;
+
+ /* per-thread recycle lists */
+ u32 **recycle;
+
+ /* buffer flags template, configurable to enable/disable tcp / udp cksum */
+ u32 buffer_flags_template;
+
+ /* vlib buffer free list, must be same size as an rte_mbuf */
+ u32 vlib_buffer_free_list_index;
+
+ /* dpdk worker "threads" */
+ dpdk_worker_t *workers;
+
+ /* dpdk HQoS "threads" */
+ dpdk_hqos_thread_t *hqos_threads;
+
+ /* Ethernet input node index */
+ u32 ethernet_input_node_index;
+
+ /* pcap tracing [only works if (CLIB_DEBUG > 0)] */
+ int tx_pcap_enable;
+ pcap_main_t pcap_main;
+ u8 *pcap_filename;
+ u32 pcap_sw_if_index;
+ u32 pcap_pkts_to_capture;
+
+ /* hashes */
+ uword *dpdk_device_by_kni_port_id;
+ uword *vu_sw_if_index_by_listener_fd;
+ uword *vu_sw_if_index_by_sock_fd;
+ u32 *vu_inactive_interfaces_device_index;
+
+ /*
+ * flag indicating that a posted admin up/down
+ * (via post_sw_interface_set_flags) is in progress
+ */
+ u8 admin_up_down_in_progress;
+
+ u8 use_rss;
+
+ /* which cpus are running dpdk-input */
+ int input_cpu_first_index;
+ int input_cpu_count;
+
+ /* which cpus are running I/O TX */
+ int hqos_cpu_first_index;
+ int hqos_cpu_count;
+
+ /* control interval of dpdk link state and stat polling */
+ f64 link_state_poll_interval;
+ f64 stat_poll_interval;
+
+ /* Sleep for this many MS after each device poll */
+ u32 poll_sleep;
+
+ /* convenience */
+ vlib_main_t *vlib_main;
+ vnet_main_t *vnet_main;
+ dpdk_config_main_t *conf;
+} dpdk_main_t;
+
+/*
+ * NOTE(review): this is a (tentative) definition, not an extern
+ * declaration, so every translation unit including this header defines
+ * the symbol.  That only links when the toolchain merges common symbols
+ * (it fails with -fno-common).
+ */
+dpdk_main_t dpdk_main;
+
+/*
+ * TX trace record, decoded by format_dpdk_tx_dma_trace ().
+ * Note that queue_index is u8 here but u16 in dpdk_rx_dma_trace_t.
+ */
+typedef struct
+{
+ u32 buffer_index;
+ u16 device_index;
+ u8 queue_index;
+ struct rte_mbuf mb;
+ /* Copy of VLIB buffer; packet data stored in pre_data. */
+ vlib_buffer_t buffer;
+} dpdk_tx_dma_trace_t;
+
+/*
+ * RX trace record; the matching formatter declared in this header is
+ * format_dpdk_rx_dma_trace.
+ */
+typedef struct
+{
+ u32 buffer_index;
+ u16 device_index;
+ u16 queue_index;
+ struct rte_mbuf mb;
+ vlib_buffer_t buffer; /* Copy of VLIB buffer; pkt data stored in pre_data. */
+ u8 data[256]; /* First 256 data bytes, used for hexdump */
+} dpdk_rx_dma_trace_t;
+
+void vnet_buffer_needs_dpdk_mb (vlib_buffer_t * b);
+
+clib_error_t *dpdk_set_mac_address (vnet_hw_interface_t * hi, char *address);
+
+clib_error_t *dpdk_set_mc_filter (vnet_hw_interface_t * hi,
+ struct ether_addr mc_addr_vec[], int naddr);
+
+void dpdk_thread_input (dpdk_main_t * dm, dpdk_device_t * xd);
+
+clib_error_t *dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd);
+
+u32 dpdk_interface_tx_vector (vlib_main_t * vm, u32 dev_instance);
+
+struct rte_mbuf *dpdk_replicate_packet_mb (vlib_buffer_t * b);
+struct rte_mbuf *dpdk_zerocopy_replicate_packet_mb (vlib_buffer_t * b);
+
+#define foreach_dpdk_error \
+ _(NONE, "no error") \
+ _(RX_PACKET_ERROR, "Rx packet errors") \
+ _(RX_BAD_FCS, "Rx bad fcs") \
+ _(IP_CHECKSUM_ERROR, "Rx ip checksum errors") \
+ _(RX_ALLOC_FAIL, "rx buf alloc from free list failed") \
+ _(RX_ALLOC_NO_PHYSMEM, "rx buf alloc failed no physmem") \
+ _(RX_ALLOC_DROP_PKTS, "rx packets dropped due to alloc error")
+
+/*
+ * Error codes: one DPDK_ERROR_<name> enumerator per entry of
+ * foreach_dpdk_error, in table order, followed by DPDK_N_ERROR, which
+ * therefore equals the number of entries.
+ */
+typedef enum
+{
+#define _(f,s) DPDK_ERROR_##f,
+ foreach_dpdk_error
+#undef _
+ DPDK_N_ERROR,
+} dpdk_error_t;
+
+int dpdk_set_stat_poll_interval (f64 interval);
+int dpdk_set_link_state_poll_interval (f64 interval);
+void dpdk_update_link_state (dpdk_device_t * xd, f64 now);
+void dpdk_device_lock_init (dpdk_device_t * xd);
+void dpdk_device_lock_free (dpdk_device_t * xd);
+
+/*
+ * Return the sum of aggregate_rx_packets over every element of
+ * dpdk_main.workers (0 when the vector is empty).
+ */
+static inline u64
+vnet_get_aggregate_rx_packets (void)
+{
+  dpdk_worker_t *workers = dpdk_main.workers;
+  u64 total = 0;
+  uword i;
+
+  for (i = 0; i < vec_len (workers); i++)
+    total += workers[i].aggregate_rx_packets;
+
+  return total;
+}
+
+void dpdk_rx_trace (dpdk_main_t * dm,
+ vlib_node_runtime_t * node,
+ dpdk_device_t * xd,
+ u16 queue_id, u32 * buffers, uword n_buffers);
+
+#define EFD_OPERATION_LESS_THAN 0
+#define EFD_OPERATION_GREATER_OR_EQUAL 1
+
+format_function_t format_dpdk_device_name;
+format_function_t format_dpdk_device;
+format_function_t format_dpdk_tx_dma_trace;
+format_function_t format_dpdk_rx_dma_trace;
+format_function_t format_dpdk_rte_mbuf;
+format_function_t format_dpdk_rx_rte_mbuf;
+unformat_function_t unformat_socket_mem;
+clib_error_t *unformat_rss_fn (unformat_input_t * input, uword * rss_fn);
+clib_error_t *unformat_hqos (unformat_input_t * input,
+ dpdk_device_config_hqos_t * hqos);
+
+uword
+admin_up_down_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt, vlib_frame_t * f);
+
+#endif /* __included_dpdk_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/dpdk_priv.h b/src/vnet/devices/dpdk/dpdk_priv.h
new file mode 100644
index 00000000000..0c81dbc3beb
--- /dev/null
+++ b/src/vnet/devices/dpdk/dpdk_priv.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define DPDK_NB_RX_DESC_DEFAULT 1024
+#define DPDK_NB_TX_DESC_DEFAULT 1024
+#define DPDK_NB_RX_DESC_VIRTIO 256
+#define DPDK_NB_TX_DESC_VIRTIO 256
+
+#define I40E_DEV_ID_SFP_XL710 0x1572
+#define I40E_DEV_ID_QSFP_A 0x1583
+#define I40E_DEV_ID_QSFP_B 0x1584
+#define I40E_DEV_ID_QSFP_C 0x1585
+#define I40E_DEV_ID_10G_BASE_T 0x1586
+#define I40E_DEV_ID_VF 0x154C
+
+/* These args appear by themselves */
+#define foreach_eal_double_hyphen_predicate_arg \
+_(no-shconf) \
+_(no-hpet) \
+_(no-huge) \
+_(vmware-tsc-map)
+
+#define foreach_eal_single_hyphen_mandatory_arg \
+_(coremask, c) \
+_(nchannels, n) \
+
+#define foreach_eal_single_hyphen_arg \
+_(blacklist, b) \
+_(mem-alloc-request, m) \
+_(force-ranks, r)
+
+/* These args are preceded by "--" and followed by a single string */
+#define foreach_eal_double_hyphen_arg \
+_(huge-dir) \
+_(proc-type) \
+_(file-prefix) \
+_(vdev)
+
+/*
+ * Refresh xd->xstats from the PMD.
+ *
+ * Asks rte_eth_xstats_get () how many extended stats exist, grows
+ * xd->xstats and xd->last_cleared_xstats to at least that many elements,
+ * fetches the values, then sets both vector lengths to the number of
+ * entries actually returned.  Does nothing when the device reports no
+ * extended stats.
+ */
+static inline void
+dpdk_get_xstats (dpdk_device_t * xd)
+{
+  int len;
+
+  if ((len = rte_eth_xstats_get (xd->device_index, NULL, 0)) <= 0)
+    return;
+
+  vec_validate (xd->xstats, len - 1);
+  vec_validate (xd->last_cleared_xstats, len - 1);
+
+  len =
+    rte_eth_xstats_get (xd->device_index, xd->xstats, vec_len (xd->xstats));
+
+  ASSERT (vec_len (xd->xstats) == len);
+  ASSERT (vec_len (xd->last_cleared_xstats) == len);
+
+  /*
+   * The second call may fail (negative value) or report more entries than
+   * the vectors can hold.  The ASSERTs above only help in debug builds, so
+   * never let such a value become a vector length: keep the old lengths.
+   */
+  if (len < 0 || len > vec_len (xd->xstats)
+      || len > vec_len (xd->last_cleared_xstats))
+    return;
+
+  _vec_len (xd->xstats) = len;
+  _vec_len (xd->last_cleared_xstats) = len;
+}
+
+
+/*
+ * Take a fresh rte_eth_stats snapshot for a PMD device and fold the
+ * changes since the previous snapshot into the interface simple counters
+ * (rx-no-buf, rx-miss, rx-error), then refresh the extended stats.
+ *
+ * xd:  device to update; ignored unless DPDK_DEVICE_FLAG_PMD is set.
+ * now: timestamp recorded in xd->time_last_stats_update; a value of 0
+ *      leaves the recorded timestamp unchanged.
+ */
+static inline void
+dpdk_update_counters (dpdk_device_t * xd, f64 now)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  u32 cpu_index = os_get_cpu_number ();
+  vlib_simple_counter_main_t *counter;
+  u64 delta;
+
+  /* only update counters for PMD interfaces */
+  if (!(xd->flags & DPDK_DEVICE_FLAG_PMD))
+    return;
+
+  if (now)
+    xd->time_last_stats_update = now;
+
+  /* current snapshot becomes the previous one, then fetch a new one */
+  clib_memcpy (&xd->last_stats, &xd->stats, sizeof (xd->last_stats));
+  rte_eth_stats_get (xd->device_index, &xd->stats);
+
+  /* maybe bump interface rx no buffer counter */
+  delta = xd->stats.rx_nombuf - xd->last_stats.rx_nombuf;
+  if (PREDICT_FALSE (delta != 0))
+    {
+      counter = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+                                  VNET_INTERFACE_COUNTER_RX_NO_BUF);
+      vlib_increment_simple_counter (counter, cpu_index,
+                                     xd->vlib_sw_if_index, delta);
+    }
+
+  /* missed pkt counter */
+  delta = xd->stats.imissed - xd->last_stats.imissed;
+  if (PREDICT_FALSE (delta != 0))
+    {
+      counter = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+                                  VNET_INTERFACE_COUNTER_RX_MISS);
+      vlib_increment_simple_counter (counter, cpu_index,
+                                     xd->vlib_sw_if_index, delta);
+    }
+
+  /* rx error counter */
+  delta = xd->stats.ierrors - xd->last_stats.ierrors;
+  if (PREDICT_FALSE (delta != 0))
+    {
+      counter = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+                                  VNET_INTERFACE_COUNTER_RX_ERROR);
+      vlib_increment_simple_counter (counter, cpu_index,
+                                     xd->vlib_sw_if_index, delta);
+    }
+
+  dpdk_get_xstats (xd);
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/format.c b/src/vnet/devices/dpdk/format.c
new file mode 100644
index 00000000000..ff7c7a5a41c
--- /dev/null
+++ b/src/vnet/devices/dpdk/format.c
@@ -0,0 +1,763 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/format.h>
+#include <vlib/unix/cj.h>
+#include <assert.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include "dpdk_priv.h"
+#include <vppinfra/error.h>
+
+#define foreach_dpdk_counter \
+ _ (tx_frames_ok, opackets) \
+ _ (tx_bytes_ok, obytes) \
+ _ (tx_errors, oerrors) \
+ _ (rx_frames_ok, ipackets) \
+ _ (rx_bytes_ok, ibytes) \
+ _ (rx_errors, ierrors) \
+ _ (rx_missed, imissed) \
+ _ (rx_no_bufs, rx_nombuf)
+
+#define foreach_dpdk_q_counter \
+ _ (rx_frames_ok, q_ipackets) \
+ _ (tx_frames_ok, q_opackets) \
+ _ (rx_bytes_ok, q_ibytes) \
+ _ (tx_bytes_ok, q_obytes) \
+ _ (rx_errors, q_errors)
+
+#define foreach_dpdk_rss_hf \
+ _(ETH_RSS_FRAG_IPV4, "ipv4-frag") \
+ _(ETH_RSS_NONFRAG_IPV4_TCP, "ipv4-tcp") \
+ _(ETH_RSS_NONFRAG_IPV4_UDP, "ipv4-udp") \
+ _(ETH_RSS_NONFRAG_IPV4_SCTP, "ipv4-sctp") \
+ _(ETH_RSS_NONFRAG_IPV4_OTHER, "ipv4-other") \
+ _(ETH_RSS_IPV4, "ipv4") \
+ _(ETH_RSS_IPV6_TCP_EX, "ipv6-tcp-ex") \
+ _(ETH_RSS_IPV6_UDP_EX, "ipv6-udp-ex") \
+ _(ETH_RSS_FRAG_IPV6, "ipv6-frag") \
+ _(ETH_RSS_NONFRAG_IPV6_TCP, "ipv6-tcp") \
+ _(ETH_RSS_NONFRAG_IPV6_UDP, "ipv6-udp") \
+ _(ETH_RSS_NONFRAG_IPV6_SCTP, "ipv6-sctp") \
+ _(ETH_RSS_NONFRAG_IPV6_OTHER, "ipv6-other") \
+ _(ETH_RSS_L2_PAYLOAD, "l2-payload") \
+ _(ETH_RSS_IPV6_EX, "ipv6-ex") \
+ _(ETH_RSS_IPV6, "ipv6")
+
+
+/* (capability bit, display name) pairs used by format_dpdk_rx_offload_caps () */
+#define foreach_dpdk_rx_offload_caps \
+ _(DEV_RX_OFFLOAD_VLAN_STRIP, "vlan-strip") \
+ _(DEV_RX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \
+ _(DEV_RX_OFFLOAD_UDP_CKSUM , "udp-cksum") \
+ _(DEV_RX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \
+ _(DEV_RX_OFFLOAD_TCP_LRO , "tcp-lro") \
+ _(DEV_RX_OFFLOAD_QINQ_STRIP, "qinq-strip")
+
+#define foreach_dpdk_tx_offload_caps \
+ _(DEV_TX_OFFLOAD_VLAN_INSERT, "vlan-insert") \
+ _(DEV_TX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \
+ _(DEV_TX_OFFLOAD_UDP_CKSUM , "udp-cksum") \
+ _(DEV_TX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \
+ _(DEV_TX_OFFLOAD_SCTP_CKSUM , "sctp-cksum") \
+ _(DEV_TX_OFFLOAD_TCP_TSO , "tcp-tso") \
+ _(DEV_TX_OFFLOAD_UDP_TSO , "udp-tso") \
+ _(DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM, "outer-ipv4-cksum") \
+ _(DEV_TX_OFFLOAD_QINQ_INSERT, "qinq-insert")
+
+#if RTE_VERSION < RTE_VERSION_NUM(16, 11, 0, 0)
+/* New ol_flags bits added in DPDK-16.11 */
+#define PKT_RX_IP_CKSUM_GOOD (1ULL << 7)
+#define PKT_RX_L4_CKSUM_GOOD (1ULL << 8)
+#endif
+
+#define foreach_dpdk_pkt_rx_offload_flag \
+ _ (PKT_RX_VLAN_PKT, "RX packet is a 802.1q VLAN packet") \
+ _ (PKT_RX_RSS_HASH, "RX packet with RSS hash result") \
+ _ (PKT_RX_FDIR, "RX packet with FDIR infos") \
+ _ (PKT_RX_L4_CKSUM_BAD, "L4 cksum of RX pkt. is not OK") \
+ _ (PKT_RX_IP_CKSUM_BAD, "IP cksum of RX pkt. is not OK") \
+ _ (PKT_RX_VLAN_STRIPPED, "RX packet VLAN tag stripped") \
+ _ (PKT_RX_IP_CKSUM_GOOD, "IP cksum of RX pkt. is valid") \
+ _ (PKT_RX_L4_CKSUM_GOOD, "L4 cksum of RX pkt. is valid") \
+ _ (PKT_RX_IEEE1588_PTP, "RX IEEE1588 L2 Ethernet PT Packet") \
+ _ (PKT_RX_IEEE1588_TMST, "RX IEEE1588 L2/L4 timestamped packet") \
+ _ (PKT_RX_QINQ_STRIPPED, "RX packet QinQ tags stripped")
+
+#if RTE_VERSION < RTE_VERSION_NUM(16, 11, 0, 0)
+/* PTYPE added in DPDK-16.11 */
+#define RTE_PTYPE_L2_ETHER_VLAN 0x00000006
+#define RTE_PTYPE_L2_ETHER_QINQ 0x00000007
+#endif
+
+/*
+ * (layer, type, description) triples; layer and type name the
+ * RTE_PTYPE_<layer>_<type> packet type constants.
+ */
+#define foreach_dpdk_pkt_type \
+ _ (L2, ETHER, "Ethernet packet") \
+ _ (L2, ETHER_TIMESYNC, "Ethernet packet for time sync") \
+ _ (L2, ETHER_ARP, "ARP packet") \
+ _ (L2, ETHER_LLDP, "LLDP (Link Layer Discovery Protocol) packet") \
+ _ (L2, ETHER_NSH, "NSH (Network Service Header) packet") \
+ _ (L2, ETHER_VLAN, "VLAN packet") \
+ _ (L2, ETHER_QINQ, "QinQ packet") \
+ _ (L3, IPV4, "IPv4 packet without extension headers") \
+ _ (L3, IPV4_EXT, "IPv4 packet with extension headers") \
+ _ (L3, IPV4_EXT_UNKNOWN, "IPv4 packet with or without extension headers") \
+ _ (L3, IPV6, "IPv6 packet without extension headers") \
+ _ (L3, IPV6_EXT, "IPv6 packet with extension headers") \
+ _ (L3, IPV6_EXT_UNKNOWN, "IPv6 packet with or without extension headers") \
+ _ (L4, TCP, "TCP packet") \
+ _ (L4, UDP, "UDP packet") \
+ _ (L4, FRAG, "Fragmented IP packet") \
+ _ (L4, SCTP, "SCTP (Stream Control Transmission Protocol) packet") \
+ _ (L4, ICMP, "ICMP packet") \
+ _ (L4, NONFRAG, "Non-fragmented IP packet") \
+ _ (TUNNEL, GRE, "GRE tunneling packet") \
+ _ (TUNNEL, VXLAN, "VXLAN tunneling packet") \
+ _ (TUNNEL, NVGRE, "NVGRE Tunneling packet") \
+ _ (TUNNEL, GENEVE, "GENEVE Tunneling packet") \
+ _ (TUNNEL, GRENAT, "Teredo, VXLAN or GRE Tunneling packet") \
+ _ (INNER_L2, ETHER, "Inner Ethernet packet") \
+ _ (INNER_L2, ETHER_VLAN, "Inner Ethernet packet with VLAN") \
+ _ (INNER_L3, IPV4, "Inner IPv4 packet without extension headers") \
+ _ (INNER_L3, IPV4_EXT, "Inner IPv4 packet with extension headers") \
+ _ (INNER_L3, IPV4_EXT_UNKNOWN, "Inner IPv4 packet with or without extension headers") \
+ _ (INNER_L3, IPV6, "Inner IPv6 packet without extension headers") \
+ _ (INNER_L3, IPV6_EXT, "Inner IPv6 packet with extension headers") \
+ _ (INNER_L3, IPV6_EXT_UNKNOWN, "Inner IPv6 packet with or without extension headers") \
+ _ (INNER_L4, TCP, "Inner TCP packet") \
+ _ (INNER_L4, UDP, "Inner UDP packet") \
+ _ (INNER_L4, FRAG, "Inner fragmented IP packet") \
+ _ (INNER_L4, SCTP, "Inner SCTP (Stream Control Transmission Protocol) packet") \
+ _ (INNER_L4, ICMP, "Inner ICMP packet") \
+ _ (INNER_L4, NONFRAG, "Inner non-fragmented IP packet")
+
+#define foreach_dpdk_pkt_tx_offload_flag \
+ _ (PKT_TX_VLAN_PKT, "TX packet is a 802.1q VLAN packet") \
+ _ (PKT_TX_IP_CKSUM, "IP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_TCP_CKSUM, "TCP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_SCTP_CKSUM, "SCTP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_IEEE1588_TMST, "TX IEEE1588 packet to timestamp")
+
+#define foreach_dpdk_pkt_offload_flag \
+ foreach_dpdk_pkt_rx_offload_flag \
+ foreach_dpdk_pkt_tx_offload_flag
+
+/*
+ * format () callback producing the interface name of a device.
+ *
+ * args: u32 index into dpdk_main.devices.
+ *
+ * Bond and af_packet ports get "BondEthernet<n>" / "af_packet<n>".  Every
+ * other port type gets a speed-derived prefix followed by the PCI
+ * bus/device/function (decimal or hex, per
+ * conf->interface_name_format_decimal) when the port has a PCI device, or
+ * by the device index otherwise; "/<suffix>" is appended when
+ * interface_name_suffix is set.
+ */
+u8 *
+format_dpdk_device_name (u8 * s, va_list * args)
+{
+  dpdk_main_t *dm = &dpdk_main;
+  u32 i = va_arg (*args, u32);
+  char *pci_format =
+    dm->conf->interface_name_format_decimal ? "%s%d/%d/%d" : "%s%x/%x/%x";
+  dpdk_device_t *xd = &dm->devices[i];
+  struct rte_eth_dev_info dev_info;
+  char *prefix;
+
+  /* port types whose name does not depend on the PCI address */
+  if (xd->port_type == VNET_DPDK_PORT_TYPE_ETH_BOND)
+    return format (s, "BondEthernet%d", xd->device_index);
+
+  if (xd->port_type == VNET_DPDK_PORT_TYPE_AF_PACKET)
+    {
+      rte_eth_dev_info_get (i, &dev_info);
+      return format (s, "af_packet%d", xd->af_packet_port_id);
+    }
+
+  switch (xd->port_type)
+    {
+    case VNET_DPDK_PORT_TYPE_ETH_1G:
+      prefix = "GigabitEthernet";
+      break;
+    case VNET_DPDK_PORT_TYPE_ETH_10G:
+      prefix = "TenGigabitEthernet";
+      break;
+    case VNET_DPDK_PORT_TYPE_ETH_40G:
+      prefix = "FortyGigabitEthernet";
+      break;
+    case VNET_DPDK_PORT_TYPE_ETH_100G:
+      prefix = "HundredGigabitEthernet";
+      break;
+    case VNET_DPDK_PORT_TYPE_ETH_SWITCH:
+      prefix = "EthernetSwitch";
+      break;
+    case VNET_DPDK_PORT_TYPE_UNKNOWN:
+    default:
+      prefix = "UnknownEthernet";
+      break;
+    }
+
+  rte_eth_dev_info_get (i, &dev_info);
+
+  if (dev_info.pci_dev)
+    s = format (s, pci_format, prefix, dev_info.pci_dev->addr.bus,
+                dev_info.pci_dev->addr.devid,
+                dev_info.pci_dev->addr.function);
+  else
+    s = format (s, "%s%d", prefix, xd->device_index);
+
+  if (xd->interface_name_suffix)
+    s = format (s, "/%s", xd->interface_name_suffix);
+
+  return s;
+}
+
+/*
+ * format () callback producing a human readable device type.
+ *
+ * args: u32 index into dpdk_main.devices; the string is chosen from that
+ * device's pmd field.  Unrecognised values yield "### UNKNOWN ###".
+ */
+static u8 *
+format_dpdk_device_type (u8 * s, va_list * args)
+{
+  dpdk_main_t *dm = &dpdk_main;
+  char *dev_type;
+  u32 i = va_arg (*args, u32);
+
+  switch (dm->devices[i].pmd)
+    {
+    case VNET_DPDK_PMD_E1000EM:
+      dev_type = "Intel 82540EM (e1000)";
+      break;
+
+    case VNET_DPDK_PMD_IGB:
+      dev_type = "Intel e1000";
+      break;
+
+    case VNET_DPDK_PMD_I40E:
+      dev_type = "Intel X710/XL710 Family";
+      break;
+
+    case VNET_DPDK_PMD_I40EVF:
+      dev_type = "Intel X710/XL710 Family VF";
+      break;
+
+    case VNET_DPDK_PMD_FM10K:
+      dev_type = "Intel FM10000 Family Ethernet Switch";
+      break;
+
+    case VNET_DPDK_PMD_IGBVF:
+      dev_type = "Intel e1000 VF";
+      break;
+
+    case VNET_DPDK_PMD_VIRTIO:
+      dev_type = "Red Hat Virtio";
+      break;
+
+    case VNET_DPDK_PMD_IXGBEVF:
+      dev_type = "Intel 82599 VF";
+      break;
+
+    case VNET_DPDK_PMD_IXGBE:
+      dev_type = "Intel 82599";
+      break;
+
+    case VNET_DPDK_PMD_ENIC:
+      dev_type = "Cisco VIC";
+      break;
+
+    case VNET_DPDK_PMD_CXGBE:
+      dev_type = "Chelsio T4/T5";
+      break;
+
+    case VNET_DPDK_PMD_MLX5:
+      dev_type = "Mellanox ConnectX-4 Family";
+      break;
+
+    case VNET_DPDK_PMD_VMXNET3:
+      dev_type = "VMware VMXNET3";
+      break;
+
+    case VNET_DPDK_PMD_AF_PACKET:
+      dev_type = "af_packet";
+      break;
+
+    case VNET_DPDK_PMD_BOND:
+      dev_type = "Ethernet Bonding";
+      break;
+
+    case VNET_DPDK_PMD_DPAA2:
+      dev_type = "NXP DPAA2 Mac";
+      break;
+
+    default:
+    case VNET_DPDK_PMD_UNKNOWN:
+      dev_type = "### UNKNOWN ###";
+      break;
+    }
+
+  /* never hand a data string to format () as the format itself */
+  return format (s, "%s", dev_type);
+}
+
+/*
+ * format () callback producing the carrier line of a device.
+ *
+ * args: dpdk_device_t *.  Emits "down \n" when the cached link status is
+ * down; otherwise "up ", the duplex mode, link speed, the hardware
+ * interface's max_packet_bytes as mtu, and " promisc" when the port
+ * reports promiscuous mode.
+ */
+static u8 *
+format_dpdk_link_status (u8 * s, va_list * args)
+{
+  dpdk_device_t *xd = va_arg (*args, dpdk_device_t *);
+  struct rte_eth_link *link = &xd->link;
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, xd->vlib_hw_if_index);
+  char *duplex;
+  u32 promisc;
+
+  if (!link->link_status)
+    {
+      s = format (s, "%s ", "down");
+      return format (s, "\n");
+    }
+
+  s = format (s, "%s ", "up");
+
+  promisc = rte_eth_promiscuous_get (xd->device_index);
+  duplex = (link->link_duplex == ETH_LINK_FULL_DUPLEX) ? "full" : "half";
+
+  s = format (s, "%s duplex ", duplex);
+  s = format (s, "speed %u mtu %d %s\n", link->link_speed,
+              hi->max_packet_bytes, promisc ? " promisc" : "");
+
+  return s;
+}
+
+#define _line_len 72
+#define _(v, str) \
+if (bitmap & v) { \
+ if (format_get_indent (s) > next_split ) { \
+ next_split += _line_len; \
+ s = format(s,"\n%U", format_white_space, indent); \
+ } \
+ s = format(s, "%s ", str); \
+}
+
+/*
+ * format () callback: append the name of every RSS hash function whose
+ * bit is set in the u64 argument, or "none" when no bit is set.
+ */
+static u8 *
+format_dpdk_rss_hf_name (u8 * s, va_list * args)
+{
+  /* s, bitmap, next_split and indent are referenced by name from _ () */
+  u64 bitmap = va_arg (*args, u64);
+  int indent = format_get_indent (s);
+  int next_split = _line_len;
+
+  if (bitmap == 0)
+    return format (s, "none");
+
+  /* expands to one "if (bitmap & flag)" statement per table entry */
+  foreach_dpdk_rss_hf
+
+  return s;
+}
+
+/*
+ * format () callback: append the name of every RX offload capability
+ * whose bit is set in the u32 argument, or "none" when no bit is set.
+ */
+static u8 *
+format_dpdk_rx_offload_caps (u8 * s, va_list * args)
+{
+  /* s, bitmap, next_split and indent are referenced by name from _ () */
+  u32 bitmap = va_arg (*args, u32);
+  int indent = format_get_indent (s);
+  int next_split = _line_len;
+
+  if (bitmap == 0)
+    return format (s, "none");
+
+  /* expands to one "if (bitmap & flag)" statement per table entry */
+  foreach_dpdk_rx_offload_caps
+
+  return s;
+}
+
+/*
+ * format () callback: append the name of every TX offload capability
+ * whose bit is set in the u32 argument, or "none" when no bit is set.
+ */
+static u8 *
+format_dpdk_tx_offload_caps (u8 * s, va_list * args)
+{
+  /* s, bitmap, next_split and indent are referenced by name from _ () */
+  u32 bitmap = va_arg (*args, u32);
+  int indent = format_get_indent (s);
+  int next_split = _line_len;
+
+  if (bitmap == 0)
+    return format (s, "none");
+
+  /* expands to one "if (bitmap & flag)" statement per table entry */
+  foreach_dpdk_tx_offload_caps
+
+  return s;
+}
+
+#undef _line_len
+#undef _
+
+/*
+ * format () callback producing the multi-line description of one device.
+ *
+ * args: u32 dev_instance (index into dpdk_main.devices), int verbose.
+ *
+ * Refreshes the device counters and link state first, then prints the
+ * device type and carrier line, queue/descriptor counts, the cpu socket,
+ * and every basic counter that changed since the last clear.  With
+ * verbose > 1 on a PMD device it also prints PCI identity, limits,
+ * promiscuous / all-multicast state, vlan offload and RSS information.
+ * Extended stats are listed when verbose == 2, or when verbose is
+ * non-zero and the stat changed since the last clear.
+ */
+u8 *
+format_dpdk_device (u8 * s, va_list * args)
+{
+  u32 dev_instance = va_arg (*args, u32);
+  int verbose = va_arg (*args, int);
+  dpdk_main_t *dm = &dpdk_main;
+  dpdk_device_t *xd = vec_elt_at_index (dm->devices, dev_instance);
+  uword indent = format_get_indent (s);
+  f64 now = vlib_time_now (dm->vlib_main);
+  struct rte_eth_dev_info di;
+
+  dpdk_update_counters (xd, now);
+  dpdk_update_link_state (xd, now);
+
+  s = format (s, "%U\n%Ucarrier %U",
+              format_dpdk_device_type, xd->device_index,
+              format_white_space, indent + 2, format_dpdk_link_status, xd);
+
+  rte_eth_dev_info_get (xd->device_index, &di);
+
+  if (verbose > 1 && (xd->flags & DPDK_DEVICE_FLAG_PMD))
+    {
+      struct rte_pci_device *pci;
+      struct rte_eth_rss_conf rss_conf;
+      int vlan_off;
+      int retval;
+
+      /*
+       * Pre-clear the fields: when the query below fails nothing is
+       * written, and rss_hf is printed further down regardless.
+       */
+      rss_conf.rss_key = 0;
+      rss_conf.rss_key_len = 0;
+      rss_conf.rss_hf = 0;
+      retval = rte_eth_dev_rss_hash_conf_get (xd->device_index, &rss_conf);
+      if (retval < 0)
+        clib_warning ("rte_eth_dev_rss_hash_conf_get returned %d", retval);
+      pci = di.pci_dev;
+
+      if (pci)
+        s =
+          format (s,
+                  "%Upci id: device %04x:%04x subsystem %04x:%04x\n"
+                  "%Upci address: %04x:%02x:%02x.%02x\n",
+                  format_white_space, indent + 2, pci->id.vendor_id,
+                  pci->id.device_id, pci->id.subsystem_vendor_id,
+                  pci->id.subsystem_device_id, format_white_space, indent + 2,
+                  pci->addr.domain, pci->addr.bus, pci->addr.devid,
+                  pci->addr.function);
+      s =
+        format (s, "%Umax rx packet len: %d\n", format_white_space,
+                indent + 2, di.max_rx_pktlen);
+      s =
+        format (s, "%Umax num of queues: rx %d tx %d\n", format_white_space,
+                indent + 2, di.max_rx_queues, di.max_tx_queues);
+      /* unicast promiscuous and all-multicast are two separate port modes */
+      s =
+        format (s, "%Upromiscuous: unicast %s all-multicast %s\n",
+                format_white_space, indent + 2,
+                rte_eth_promiscuous_get (xd->device_index) ? "on" : "off",
+                rte_eth_allmulticast_get (xd->device_index) ? "on" : "off");
+      vlan_off = rte_eth_dev_get_vlan_offload (xd->device_index);
+      s = format (s, "%Uvlan offload: strip %s filter %s qinq %s\n",
+                  format_white_space, indent + 2,
+                  vlan_off & ETH_VLAN_STRIP_OFFLOAD ? "on" : "off",
+                  vlan_off & ETH_VLAN_FILTER_OFFLOAD ? "on" : "off",
+                  vlan_off & ETH_VLAN_EXTEND_OFFLOAD ? "on" : "off");
+      s = format (s, "%Urx offload caps: %U\n",
+                  format_white_space, indent + 2,
+                  format_dpdk_rx_offload_caps, di.rx_offload_capa);
+      s = format (s, "%Utx offload caps: %U\n",
+                  format_white_space, indent + 2,
+                  format_dpdk_tx_offload_caps, di.tx_offload_capa);
+      s = format (s, "%Urss active: %U\n"
+                  "%Urss supported: %U\n",
+                  format_white_space, indent + 2,
+                  format_dpdk_rss_hf_name, rss_conf.rss_hf,
+                  format_white_space, indent + 2,
+                  format_dpdk_rss_hf_name, di.flow_type_rss_offloads);
+    }
+
+  s = format (s, "%Urx queues %d, rx desc %d, tx queues %d, tx desc %d\n",
+              format_white_space, indent + 2,
+              xd->rx_q_used, xd->nb_rx_desc, xd->tx_q_used, xd->nb_tx_desc);
+
+  if (xd->cpu_socket > -1)
+    s = format (s, "%Ucpu socket %d\n",
+                format_white_space, indent + 2, xd->cpu_socket);
+
+  /* $$$ MIB counters */
+  {
+#define _(N, V) \
+    if ((xd->stats.V - xd->last_cleared_stats.V) != 0) { \
+      s = format (s, "\n%U%-40U%16Ld", \
+                  format_white_space, indent + 2, \
+                  format_c_identifier, #N, \
+                  xd->stats.V - xd->last_cleared_stats.V); \
+    } \
+
+    foreach_dpdk_counter
+#undef _
+  }
+
+  u8 *xs = 0;
+  u32 i = 0;
+  struct rte_eth_xstat *xstat, *last_xstat;
+  struct rte_eth_xstat_name *xstat_names = 0;
+  int len = rte_eth_xstats_get_names (xd->device_index, NULL, 0);
+
+  /* only fetch names when the count query succeeded and is non-zero */
+  if (len > 0)
+    {
+      vec_validate (xstat_names, len - 1);
+      rte_eth_xstats_get_names (xd->device_index, xstat_names, len);
+    }
+
+  ASSERT (vec_len (xd->xstats) == vec_len (xd->last_cleared_xstats));
+
+  /* *INDENT-OFF* */
+  vec_foreach_index(i, xd->xstats)
+    {
+      u64 delta = 0;
+      xstat = vec_elt_at_index(xd->xstats, i);
+      last_xstat = vec_elt_at_index(xd->last_cleared_xstats, i);
+
+      delta = xstat->value - last_xstat->value;
+      if (verbose == 2 || (verbose && delta))
+        {
+          u8 * name;
+
+          /* no name available for this slot: skip rather than overrun */
+          if (i >= vec_len (xstat_names))
+            continue;
+
+          /* format_c_identifier doesn't like c strings inside vector */
+          name = format(0,"%s", xstat_names[i].name);
+          xs = format(xs, "\n%U%-38U%16Ld",
+                      format_white_space, indent + 4,
+                      format_c_identifier, name, delta);
+          vec_free(name);
+        }
+    }
+  /* *INDENT-ON* */
+
+  vec_free (xstat_names);
+
+  if (xs)
+    {
+      s = format (s, "\n%Uextended stats:%v",
+                  format_white_space, indent + 2, xs);
+      vec_free (xs);
+    }
+
+  return s;
+}
+
+/*
+ * Format function for a DPDK TX DMA trace record.
+ *
+ * Consumes, in order, a vlib_main_t *, a vlib_node_t * (neither is used
+ * here) and a dpdk_tx_dma_trace_t * from the va_list.  Appends the
+ * interface name and TX queue index, the traced buffer index and buffer
+ * metadata, and the traced pre_data bytes rendered as an ethernet header.
+ * Returns the extended vector s.
+ */
+u8 *
+format_dpdk_tx_dma_trace (u8 * s, va_list * va)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+  dpdk_tx_dma_trace_t *trace = va_arg (*va, dpdk_tx_dma_trace_t *);
+  CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main ();
+  dpdk_device_t *dev =
+    vec_elt_at_index (dpdk_main.devices, trace->device_index);
+  vnet_sw_interface_t *sw_if =
+    vnet_get_sw_interface (vnm, dev->vlib_sw_if_index);
+  /* Sample the indent before anything is appended to s */
+  uword margin = format_get_indent (s);
+
+  /* Interface name and queue */
+  s = format (s, "%U tx queue %d",
+	      format_vnet_sw_interface_name, vnm, sw_if, trace->queue_index);
+
+  /* Buffer index followed by the buffer metadata */
+  s = format (s, "\n%Ubuffer 0x%x: %U",
+	      format_white_space, margin,
+	      trace->buffer_index, format_vlib_buffer, &trace->buffer);
+
+  /* Traced header bytes */
+  s = format (s, "\n%U%U", format_white_space, margin,
+	      format_ethernet_header_with_length, trace->buffer.pre_data,
+	      sizeof (trace->buffer.pre_data));
+
+  return s;
+}
+
+/*
+ * Format function for a DPDK RX DMA trace record.
+ *
+ * Consumes, in order, a vlib_main_t *, a vlib_node_t * and a
+ * dpdk_rx_dma_trace_t * from the va_list.  Appends the interface name and
+ * RX queue index, the traced buffer, the traced rte_mbuf, an optional hex
+ * dump of the captured data (only when vm->trace_main.verbose is set) and
+ * finally the pre_data bytes rendered by the node's format_buffer function,
+ * or by format_hex_bytes when the node has none.
+ * Returns the extended vector s.
+ */
+u8 *
+format_dpdk_rx_dma_trace (u8 * s, va_list * va)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+  dpdk_rx_dma_trace_t *trace = va_arg (*va, dpdk_rx_dma_trace_t *);
+  CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main ();
+  dpdk_device_t *dev =
+    vec_elt_at_index (dpdk_main.devices, trace->device_index);
+  vnet_sw_interface_t *sw_if =
+    vnet_get_sw_interface (vnm, dev->vlib_sw_if_index);
+  /* Sample the indent before anything is appended to s */
+  uword margin = format_get_indent (s);
+  format_function_t *payload_fmt;
+
+  s = format (s, "%U rx queue %d",
+	      format_vnet_sw_interface_name, vnm, sw_if, trace->queue_index);
+
+  s = format (s, "\n%Ubuffer 0x%x: %U",
+	      format_white_space, margin,
+	      trace->buffer_index, format_vlib_buffer, &trace->buffer);
+
+  s = format (s, "\n%U%U",
+	      format_white_space, margin,
+	      format_dpdk_rte_mbuf, &trace->mb, &trace->data);
+
+  if (vm->trace_main.verbose)
+    {
+      /* The capture area may be smaller than the segment that was received */
+      int clipped = trace->mb.data_len > sizeof (trace->data);
+
+      s = format (s, "\n%UPacket Dump%s", format_white_space, margin + 2,
+		  clipped ? " (truncated)" : "");
+      s = format (s, "\n%U%U", format_white_space, margin + 4,
+		  format_hexdump, &trace->data,
+		  clipped ? sizeof (trace->data) : trace->mb.data_len);
+    }
+
+  /* Fall back to a plain hex rendering when the node has no formatter */
+  payload_fmt = node->format_buffer ? node->format_buffer : format_hex_bytes;
+  s = format (s, "\n%U%U", format_white_space, margin,
+	      payload_fmt, trace->buffer.pre_data,
+	      sizeof (trace->buffer.pre_data));
+
+  return s;
+}
+
+
+/*
+ * Format function listing the packet types encoded in an mbuf packet_type
+ * word.  Consumes a u32 * from the va_list.  Appends nothing when the word
+ * is zero; otherwise appends a "Packet Types" heading followed by one line
+ * for every entry of foreach_dpdk_pkt_type whose masked value matches.
+ */
+static inline u8 *
+format_dpdk_pkt_types (u8 * s, va_list * va)
+{
+  u32 *ptype_ptr = va_arg (*va, u32 *);
+  uword margin __attribute__ ((unused)) = format_get_indent (s) + 2;
+  u32 ptype = *ptype_ptr;
+
+  if (ptype == 0)
+    return s;
+
+  s = format (s, "Packet Types");
+
+#define _(L, F, S)                                                       \
+  if ((ptype & RTE_PTYPE_##L##_MASK) == RTE_PTYPE_##L##_##F)             \
+    {                                                                    \
+      s = format (s, "\n%U%s (0x%04x) %s", format_white_space, margin,   \
+		  "RTE_PTYPE_" #L "_" #F, RTE_PTYPE_##L##_##F, S);          \
+    }
+
+  foreach_dpdk_pkt_type
+#undef _
+    return s;
+}
+
+/*
+ * Format function listing the offload flags set in an mbuf ol_flags word.
+ * Consumes a u64 * from the va_list.  Appends nothing when the word is
+ * zero; otherwise appends a "Packet Offload Flags" heading followed by one
+ * line for every entry of foreach_dpdk_pkt_offload_flag that is set.
+ */
+static inline u8 *
+format_dpdk_pkt_offload_flags (u8 * s, va_list * va)
+{
+  u64 *flags_ptr = va_arg (*va, u64 *);
+  uword margin = format_get_indent (s) + 2;
+  u64 flags = *flags_ptr;
+
+  if (flags == 0)
+    return s;
+
+  s = format (s, "Packet Offload Flags");
+
+#define _(F, S)                                          \
+  if (flags & F)                                         \
+    {                                                    \
+      s = format (s, "\n%U%s (0x%04x) %s",               \
+		  format_white_space, margin, #F, F, S);     \
+    }
+
+  foreach_dpdk_pkt_offload_flag
+#undef _
+    return s;
+}
+
+/*
+ * Format function for the VLAN tag(s) found in a packet.
+ * Consumes an ethernet_vlan_header_tv_t * from the va_list.  When the type
+ * field of that header is ETHERNET_TYPE_DOT1AD, its TCI is printed first
+ * and the following header is then used; the TCI of the (possibly
+ * advanced) header is always printed last.
+ */
+u8 *
+format_dpdk_rte_mbuf_vlan (u8 * s, va_list * va)
+{
+  ethernet_vlan_header_tv_t *tag = va_arg (*va, ethernet_vlan_header_tv_t *);
+  int stacked = clib_net_to_host_u16 (tag->type) == ETHERNET_TYPE_DOT1AD;
+
+  if (stacked)
+    {
+      s = format (s, "%U 802.1q vlan ",
+		  format_ethernet_vlan_tci,
+		  clib_net_to_host_u16 (tag->priority_cfi_and_id));
+      /* Step over the outer tag to reach the next one */
+      tag++;
+    }
+
+  return format (s, "%U",
+		 format_ethernet_vlan_tci,
+		 clib_net_to_host_u16 (tag->priority_cfi_and_id));
+}
+
+/*
+ * Format function for a struct rte_mbuf.
+ *
+ * Consumes a struct rte_mbuf * and an ethernet_header_t * (start of the
+ * packet data) from the va_list.  Appends the basic mbuf fields, then the
+ * decoded offload flags (when any are set), the VLAN tag(s) (when
+ * PKT_RX_VLAN_PKT is set and neither VLAN nor QinQ stripping was done) and
+ * the decoded packet types (when packet_type is non-zero).
+ *
+ * ol_flags is 64 bits wide (it is handed to format_dpdk_pkt_offload_flags
+ * as a u64 *), so it is printed with the 'L' length modifier; a plain %x
+ * reads an int and drops the upper 32 bits.  buf_physaddr is printed the
+ * same way.
+ */
+u8 *
+format_dpdk_rte_mbuf (u8 * s, va_list * va)
+{
+  struct rte_mbuf *mb = va_arg (*va, struct rte_mbuf *);
+  ethernet_header_t *eth_hdr = va_arg (*va, ethernet_header_t *);
+  uword indent = format_get_indent (s) + 2;
+
+  s = format (s, "PKT MBUF: port %d, nb_segs %d, pkt_len %d"
+	      "\n%Ubuf_len %d, data_len %d, ol_flags 0x%Lx, data_off %d, phys_addr 0x%Lx"
+	      "\n%Upacket_type 0x%x",
+	      mb->port, mb->nb_segs, mb->pkt_len,
+	      format_white_space, indent,
+	      mb->buf_len, mb->data_len, (u64) mb->ol_flags, mb->data_off,
+	      (u64) mb->buf_physaddr, format_white_space, indent,
+	      mb->packet_type);
+
+  if (mb->ol_flags)
+    s = format (s, "\n%U%U", format_white_space, indent,
+		format_dpdk_pkt_offload_flags, &mb->ol_flags);
+
+  /* The tag is only still in the packet data if hardware did not strip it */
+  if ((mb->ol_flags & PKT_RX_VLAN_PKT) &&
+      ((mb->ol_flags & (PKT_RX_VLAN_STRIPPED | PKT_RX_QINQ_STRIPPED)) == 0))
+    {
+      ethernet_vlan_header_tv_t *vlan_hdr =
+	((ethernet_vlan_header_tv_t *) & (eth_hdr->type));
+      s = format (s, " %U", format_dpdk_rte_mbuf_vlan, vlan_hdr);
+    }
+
+  if (mb->packet_type)
+    s = format (s, "\n%U%U", format_white_space, indent,
+		format_dpdk_pkt_types, &mb->packet_type);
+
+  return s;
+}
+
+/*
+ * Unformat function for a comma separated list of unsigned numbers.
+ *
+ * Consumes a uword ** (hash table) from the va_list.  List position i is
+ * stored under key i; an empty position (a bare ",") is stored as 1024.
+ * Parsing stops at the first token that is not part of the list, which is
+ * pushed back onto the input.  Always returns 1.
+ */
+uword
+unformat_socket_mem (unformat_input_t * input, va_list * va)
+{
+  uword **table = va_arg (*va, uword **);
+  int slot;
+  u32 amount;
+
+  for (slot = 0; unformat_check_input (input) != UNFORMAT_END_OF_INPUT;
+       slot++)
+    {
+      if (unformat (input, ","))
+	{
+	  /* Nothing given for this position */
+	  hash_set (*table, slot, 1024);
+	}
+      else if (unformat (input, "%u,", &amount)
+	       || unformat (input, "%u", &amount))
+	{
+	  hash_set (*table, slot, amount);
+	}
+      else
+	{
+	  unformat_put_input (input);
+	  break;
+	}
+    }
+
+  return 1;
+}
+
+/*
+ * Parse a list of RSS hash function names from input.
+ *
+ * Every name listed in foreach_dpdk_rss_hf that matches the input ORs its
+ * flag value into *rss_fn.  Parsing runs until the end of input.
+ * Returns 0 on success, or an "unknown input" error as soon as a token
+ * matches none of the names.
+ */
+clib_error_t *
+unformat_rss_fn (unformat_input_t * input, uword * rss_fn)
+{
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ /* Dummy head so that every generated branch can start with "else if" */
+ if (0)
+ ;
+#undef _
+#define _(f, s) \
+ else if (unformat (input, s)) \
+ *rss_fn |= f;
+
+ foreach_dpdk_rss_hf
+#undef _
+ else
+ {
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Parse the HQoS options of a device configuration.
+ *
+ * The only recognised option is "hqos-thread <n>", which stores n in
+ * hqos->hqos_thread and sets hqos->hqos_thread_valid.  Returns 0 once the
+ * whole input has been consumed, or an "unknown input" error on the first
+ * token that is not recognised.
+ */
+clib_error_t *
+unformat_hqos (unformat_input_t * input, dpdk_device_config_hqos_t * hqos)
+{
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (!unformat (input, "hqos-thread %u", &hqos->hqos_thread))
+	return clib_error_return (0, "unknown input `%U'",
+				  format_unformat_error, input);
+
+      hqos->hqos_thread_valid = 1;
+    }
+
+  return 0;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/hqos.c b/src/vnet/devices/dpdk/hqos.c
new file mode 100644
index 00000000000..d68bc48f80b
--- /dev/null
+++ b/src/vnet/devices/dpdk/hqos.c
@@ -0,0 +1,775 @@
+/*
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/bitmap.h>
+
+#include <vnet/vnet.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include <vlib/unix/physmem.h>
+#include <vlib/pci/pci.h>
+#include <vlibmemory/api.h>
+#include <vlibmemory/vl_memory_msg_enum.h> /* enumerate all vlib messages */
+
+#define vl_typedefs /* define message structures */
+#include <vlibmemory/vl_memory_api_h.h>
+#undef vl_typedefs
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__)
+#define vl_printfun
+#include <vlibmemory/vl_memory_api_h.h>
+#undef vl_printfun
+
+#include "dpdk_priv.h"
+
+/* The dpdk_main state object that the HQoS thread loops below read their
+ * device tables from. */
+dpdk_main_t dpdk_main;
+
+/***
+ *
+ * HQoS default configuration values
+ *
+ ***/
+
+/*
+ * Template copied into every device HQoS configuration by
+ * dpdk_device_config_hqos_default().  Fields marked "Set at init" or
+ * "Set at config" are placeholders that are filled in later.
+ */
+static dpdk_device_config_hqos_t hqos_params_default = {
+ .hqos_thread_valid = 0,
+
+ .swq_size = 4096,
+ .burst_enq = 256,
+ .burst_deq = 220,
+
+ /*
+ * Packet field to identify the subport.
+ *
+ * Default value: Since only one subport is defined by default (see below:
+ * n_subports_per_port = 1), the subport ID is hardcoded to 0.
+ */
+ .pktfield0_slabpos = 0,
+ .pktfield0_slabmask = 0,
+
+ /*
+ * Packet field to identify the pipe.
+ *
+ * Default value: Assuming Ethernet/IPv4/UDP packets, UDP payload bits 12 .. 23
+ */
+ .pktfield1_slabpos = 40,
+ .pktfield1_slabmask = 0x0000000FFF000000LLU,
+
+ /* Packet field used as index into TC translation table to identify the traffic
+ * class and queue.
+ *
+ * Default value: Assuming Ethernet/IPv4 packets, IPv4 DSCP field
+ */
+ .pktfield2_slabpos = 8,
+ .pktfield2_slabmask = 0x00000000000000FCLLU,
+ /* 64 entries; each entry is read as (traffic class << 2) | queue by
+ * dpdk_hqos_metadata_set() */
+ .tc_table = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ },
+
+ /* port */
+ .port = {
+ .name = NULL, /* Set at init */
+ .socket = 0, /* Set at init */
+ .rate = 1250000000, /* Assuming 10GbE port */
+ .mtu = 14 + 1500, /* Assuming Ethernet/IPv4 pkt (Ethernet FCS not included) */
+ .frame_overhead = RTE_SCHED_FRAME_OVERHEAD_DEFAULT,
+ .n_subports_per_port = 1,
+ .n_pipes_per_subport = 4096,
+ .qsize = {64, 64, 64, 64},
+ .pipe_profiles = NULL, /* Set at config */
+ .n_pipe_profiles = 1,
+
+#ifdef RTE_SCHED_RED
+ .red_params = {
+ /* Traffic Class 0 Colors Green / Yellow / Red */
+ [0][0] = {.min_th = 48,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [0][1] = {.min_th = 40,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [0][2] = {.min_th = 32,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+
+ /* Traffic Class 1 - Colors Green / Yellow / Red */
+ [1][0] = {.min_th = 48,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [1][1] = {.min_th = 40,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [1][2] = {.min_th = 32,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+
+ /* Traffic Class 2 - Colors Green / Yellow / Red */
+ [2][0] = {.min_th = 48,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [2][1] = {.min_th = 40,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [2][2] = {.min_th = 32,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+
+ /* Traffic Class 3 - Colors Green / Yellow / Red */
+ [3][0] = {.min_th = 48,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [3][1] = {.min_th = 40,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [3][2] = {.min_th = 32,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9}
+ },
+#endif /* RTE_SCHED_RED */
+ },
+};
+
+/* Template copied into every subport entry by
+ * dpdk_device_config_hqos_default(); all rates equal the 10GbE port rate. */
+static struct rte_sched_subport_params hqos_subport_params_default = {
+ .tb_rate = 1250000000, /* 10GbE line rate (measured in bytes/second) */
+ .tb_size = 1000000,
+ .tc_rate = {1250000000, 1250000000, 1250000000, 1250000000},
+ .tc_period = 10,
+};
+
+/* Template copied into every pipe profile; the rates are the 10GbE byte
+ * rate (1250000000) divided by the default 4096 pipes per subport. */
+static struct rte_sched_pipe_params hqos_pipe_params_default = {
+ .tb_rate = 305175, /* 10GbE line rate divided by 4K pipes */
+ .tb_size = 1000000,
+ .tc_rate = {305175, 305175, 305175, 305175},
+ .tc_period = 40,
+#ifdef RTE_SCHED_SUBPORT_TC_OV
+ .tc_ov_weight = 1,
+#endif
+ .wrr_weights = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+};
+
+/***
+ *
+ * HQoS configuration
+ *
+ ***/
+
+/*
+ * Check that a packet-field bit mask fits a field with n distinct values.
+ *
+ * Returns:
+ *    0  mask is acceptable (this includes mask == 0 together with n == 1)
+ *   -1  n is 0
+ *   -2  mask is 0 while n is not 1, or mask is non-zero while n is 1
+ *   -3  the set bits of mask are not contiguous
+ *   -4  the number of set bits differs from popcount (n - 1)
+ *
+ * The bit-scan builtins are undefined for a zero argument, so they are only
+ * evaluated once mask is known to be non-zero.
+ */
+int
+dpdk_hqos_validate_mask (u64 mask, u32 n)
+{
+  int count, pos_lead, pos_trail, count_expected;
+
+  /* Handle the exceptions */
+  if (n == 0)
+    return -1;			/* Error */
+
+  if ((mask == 0) && (n == 1))
+    return 0;			/* OK */
+
+  if (((mask == 0) && (n != 1)) || ((mask != 0) && (n == 1)))
+    return -2;			/* Error */
+
+  /* From here on mask != 0 and n > 1 */
+  count = __builtin_popcountll (mask);
+  pos_lead = sizeof (u64) * 8 - __builtin_clzll (mask);
+  pos_trail = __builtin_ctzll (mask);
+  count_expected = __builtin_popcount (n - 1);
+
+  /* Check that mask is contiguous */
+  if ((pos_lead - pos_trail) != count)
+    return -3;			/* Error */
+
+  /* Check that mask contains the expected number of bits set */
+  if (count != count_expected)
+    return -4;			/* Error */
+
+  return 0;			/* OK */
+}
+
+/*
+ * Overwrite pipe profile pipe_profile_id of hqos with the built-in default
+ * pipe parameters.  The index is not range checked here.
+ */
+void
+dpdk_device_config_hqos_pipe_profile_default (dpdk_device_config_hqos_t *
+					      hqos, u32 pipe_profile_id)
+{
+  void *profile = &hqos->pipe[pipe_profile_id];
+
+  memcpy (profile, &hqos_pipe_params_default,
+	  sizeof (hqos_pipe_params_default));
+}
+
+/*
+ * Fill hqos with the default HQoS configuration.
+ *
+ * The structure is first overwritten with hqos_params_default.  The pipe
+ * profile, subport and pipe-map vectors are then grown by the counts given
+ * in hqos->port and filled with the default pipe parameters, the default
+ * subport parameters and profile index 0 respectively.
+ */
+void
+dpdk_device_config_hqos_default (dpdk_device_config_hqos_t * hqos)
+{
+  struct rte_sched_subport_params *new_subports;
+  struct rte_sched_pipe_params *new_pipes;
+  u32 *new_map;
+  u32 k;
+
+  memcpy (hqos, &hqos_params_default, sizeof (hqos_params_default));
+
+  /* pipe */
+  vec_add2 (hqos->pipe, new_pipes, hqos->port.n_pipe_profiles);
+  k = 0;
+  while (k < vec_len (hqos->pipe))
+    {
+      memcpy (&new_pipes[k],
+	      &hqos_pipe_params_default, sizeof (hqos_pipe_params_default));
+      k++;
+    }
+
+  /* The port configuration points at the profile vector */
+  hqos->port.pipe_profiles = hqos->pipe;
+
+  /* subport */
+  vec_add2 (hqos->subport, new_subports, hqos->port.n_subports_per_port);
+  k = 0;
+  while (k < vec_len (hqos->subport))
+    {
+      memcpy (&new_subports[k],
+	      &hqos_subport_params_default,
+	      sizeof (hqos_subport_params_default));
+      k++;
+    }
+
+  /* pipe profile: one entry per (subport, pipe), all using profile 0 */
+  vec_add2 (hqos->pipe_map,
+	    new_map,
+	    hqos->port.n_subports_per_port * hqos->port.n_pipes_per_subport);
+  k = 0;
+  while (k < vec_len (hqos->pipe_map))
+    {
+      new_map[k] = 0;
+      k++;
+    }
+}
+
+/***
+ *
+ * HQoS init
+ *
+ ***/
+
+/*
+ * Create the HQoS run-time state of device xd from configuration hqos.
+ *
+ * Steps:
+ *  - allocate and zero the per-thread array xd->hqos_wt (one entry per
+ *    vlib main) and the single xd->hqos_ht structure;
+ *  - create worker_thread_count + 1 software queues (rte_ring); index 0 is
+ *    assigned to thread 0, index i > 0 to the (i - 1)-th thread of the
+ *    "workers" registration;
+ *  - configure the scheduler port, each subport and each pipe, taking the
+ *    pipe profile from hqos->pipe_map;
+ *  - size the enqueue / dequeue packet arrays and reset the HQoS thread
+ *    bookkeeping;
+ *  - copy the packet-field descriptors and the TC table into the entry of
+ *    every thread that owns a software queue.
+ *
+ * Returns 0 on success or a clib error naming the call that failed.
+ * Nothing already created is released on the error paths.
+ *
+ * The right-shift count of a packet field is the number of trailing zero
+ * bits of its mask.  __builtin_ctzll () is undefined for 0 and the default
+ * pktfield0_slabmask is 0, so an all-zero mask gets a shift count of 0
+ * (the extracted value is 0 in that case whatever the shift is).
+ */
+clib_error_t *
+dpdk_port_setup_hqos (dpdk_device_t * xd, dpdk_device_config_hqos_t * hqos)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  char name[32];
+  u32 subport_id, i;
+  int rv;
+
+  /* Detect the set of worker threads */
+  int worker_thread_first = 0;
+  int worker_thread_count = 0;
+
+  uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+  vlib_thread_registration_t *tr =
+    p ? (vlib_thread_registration_t *) p[0] : 0;
+
+  if (tr && tr->count > 0)
+    {
+      worker_thread_first = tr->first_index;
+      worker_thread_count = tr->count;
+    }
+
+  /* Allocate the per-thread device data array */
+  vec_validate_aligned (xd->hqos_wt, tm->n_vlib_mains - 1,
+			CLIB_CACHE_LINE_BYTES);
+  memset (xd->hqos_wt, 0, tm->n_vlib_mains * sizeof (xd->hqos_wt[0]));
+
+  vec_validate_aligned (xd->hqos_ht, 0, CLIB_CACHE_LINE_BYTES);
+  memset (xd->hqos_ht, 0, sizeof (xd->hqos_ht[0]));
+
+  /* Allocate space for one SWQ per worker thread in the I/O TX thread data structure */
+  vec_validate (xd->hqos_ht->swq, worker_thread_count);
+
+  /* SWQ */
+  for (i = 0; i < worker_thread_count + 1; i++)
+    {
+      u32 swq_flags = RING_F_SP_ENQ | RING_F_SC_DEQ;
+
+      snprintf (name, sizeof (name), "SWQ-worker%u-to-device%u", i,
+		xd->device_index);
+      xd->hqos_ht->swq[i] =
+	rte_ring_create (name, hqos->swq_size, xd->cpu_socket, swq_flags);
+      if (xd->hqos_ht->swq[i] == NULL)
+	return clib_error_return (0,
+				  "SWQ-worker%u-to-device%u: rte_ring_create err",
+				  i, xd->device_index);
+    }
+
+  /*
+   * HQoS
+   */
+
+  /* HQoS port */
+  snprintf (name, sizeof (name), "HQoS%u", xd->device_index);
+  hqos->port.name = strdup (name);
+  if (hqos->port.name == NULL)
+    return clib_error_return (0, "HQoS%u: strdup err", xd->device_index);
+
+  hqos->port.socket = rte_eth_dev_socket_id (xd->device_index);
+  if (hqos->port.socket == SOCKET_ID_ANY)
+    hqos->port.socket = 0;
+
+  xd->hqos_ht->hqos = rte_sched_port_config (&hqos->port);
+  if (xd->hqos_ht->hqos == NULL)
+    return clib_error_return (0, "HQoS%u: rte_sched_port_config err",
+			      xd->device_index);
+
+  /* HQoS subport */
+  for (subport_id = 0; subport_id < hqos->port.n_subports_per_port;
+       subport_id++)
+    {
+      u32 pipe_id;
+
+      rv =
+	rte_sched_subport_config (xd->hqos_ht->hqos, subport_id,
+				  &hqos->subport[subport_id]);
+      if (rv)
+	return clib_error_return (0,
+				  "HQoS%u subport %u: rte_sched_subport_config err (%d)",
+				  xd->device_index, subport_id, rv);
+
+      /* HQoS pipe */
+      for (pipe_id = 0; pipe_id < hqos->port.n_pipes_per_subport; pipe_id++)
+	{
+	  u32 pos = subport_id * hqos->port.n_pipes_per_subport + pipe_id;
+	  u32 profile_id = hqos->pipe_map[pos];
+
+	  rv =
+	    rte_sched_pipe_config (xd->hqos_ht->hqos, subport_id, pipe_id,
+				   profile_id);
+	  if (rv)
+	    return clib_error_return (0,
+				      "HQoS%u subport %u pipe %u: rte_sched_pipe_config err (%d)",
+				      xd->device_index, subport_id, pipe_id,
+				      rv);
+	}
+    }
+
+  /* Set up per-thread device data for the I/O TX thread */
+  xd->hqos_ht->hqos_burst_enq = hqos->burst_enq;
+  xd->hqos_ht->hqos_burst_deq = hqos->burst_deq;
+  /* Twice the burst size: a full burst may be read on top of a partial one */
+  vec_validate (xd->hqos_ht->pkts_enq, 2 * hqos->burst_enq - 1);
+  vec_validate (xd->hqos_ht->pkts_deq, hqos->burst_deq - 1);
+  xd->hqos_ht->pkts_enq_len = 0;
+  xd->hqos_ht->swq_pos = 0;
+  xd->hqos_ht->flush_count = 0;
+
+  /* Set up per-thread device data for each worker thread */
+  for (i = 0; i < worker_thread_count + 1; i++)
+    {
+      u32 tid;
+      if (i)
+	tid = worker_thread_first + (i - 1);
+      else
+	tid = i;
+
+      xd->hqos_wt[tid].swq = xd->hqos_ht->swq[i];
+      xd->hqos_wt[tid].hqos_field0_slabpos = hqos->pktfield0_slabpos;
+      xd->hqos_wt[tid].hqos_field0_slabmask = hqos->pktfield0_slabmask;
+      xd->hqos_wt[tid].hqos_field0_slabshr =
+	hqos->pktfield0_slabmask ?
+	__builtin_ctzll (hqos->pktfield0_slabmask) : 0;
+      xd->hqos_wt[tid].hqos_field1_slabpos = hqos->pktfield1_slabpos;
+      xd->hqos_wt[tid].hqos_field1_slabmask = hqos->pktfield1_slabmask;
+      xd->hqos_wt[tid].hqos_field1_slabshr =
+	hqos->pktfield1_slabmask ?
+	__builtin_ctzll (hqos->pktfield1_slabmask) : 0;
+      xd->hqos_wt[tid].hqos_field2_slabpos = hqos->pktfield2_slabpos;
+      xd->hqos_wt[tid].hqos_field2_slabmask = hqos->pktfield2_slabmask;
+      xd->hqos_wt[tid].hqos_field2_slabshr =
+	hqos->pktfield2_slabmask ?
+	__builtin_ctzll (hqos->pktfield2_slabmask) : 0;
+      memcpy (xd->hqos_wt[tid].hqos_tc_table, hqos->tc_table,
+	      sizeof (hqos->tc_table));
+    }
+
+  return 0;
+}
+
+/***
+ *
+ * HQoS run-time
+ *
+ ***/
+/*
+ * dpdk_hqos_thread - Contains the main loop of an HQoS thread.
+ *
+ * w
+ * Information for the current thread
+ */
+/*
+ * Hand n_pkts packets straight to the device TX queue and free every
+ * packet the driver did not accept.
+ */
+static_always_inline void
+dpdk_hqos_dbg_bypass_tx (u32 device_index, u16 queue_id,
+			 struct rte_mbuf **pkts, u32 n_pkts)
+{
+  u32 n_sent = rte_eth_tx_burst (device_index,
+				 (uint16_t) queue_id,
+				 pkts,
+				 (uint16_t) n_pkts);
+
+  for (; n_sent < n_pkts; n_sent++)
+    rte_pktmbuf_free (pkts[n_sent]);
+}
+
+/*
+ * Main loop of an HQoS thread in debug-bypass mode: packets read from the
+ * software queues are sent directly to the device TX queue and the
+ * scheduler is not used.  Never returns.
+ *
+ * Each iteration serves one device assigned to this thread: the software
+ * queues are polled round-robin until a full burst is collected, which is
+ * then transmitted.  A partial burst is kept for the next visit and is
+ * transmitted once it has waited HQOS_FLUSH_COUNT_THRESHOLD visits.
+ *
+ * The timeout flush transmits as well: this loop never dequeues from the
+ * scheduler, so packets handed to it here would not be sent.
+ */
+static_always_inline void
+dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm)
+{
+  dpdk_main_t *dm = &dpdk_main;
+  u32 cpu_index = vm->cpu_index;
+  u32 dev_pos;
+
+  dev_pos = 0;
+  while (1)
+    {
+      vlib_worker_thread_barrier_check ();
+
+      u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]);
+      /* Nothing to index when no device is assigned to this thread */
+      if (PREDICT_FALSE (n_devs == 0))
+	{
+	  dev_pos = 0;
+	  continue;
+	}
+      if (dev_pos >= n_devs)
+	dev_pos = 0;
+
+      dpdk_device_and_queue_t *dq =
+	vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos);
+      dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device);
+
+      dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht;
+      u32 device_index = xd->device_index;
+      u16 queue_id = dq->queue_id;
+
+      struct rte_mbuf **pkts_enq = hqos->pkts_enq;
+      u32 pkts_enq_len = hqos->pkts_enq_len;
+      u32 swq_pos = hqos->swq_pos;
+      u32 n_swq = vec_len (hqos->swq), i;
+      u32 flush_count = hqos->flush_count;
+
+      for (i = 0; i < n_swq; i++)
+	{
+	  /* Get current SWQ for this device */
+	  struct rte_ring *swq = hqos->swq[swq_pos];
+
+	  /* Read SWQ burst to packet buffer of this device */
+	  pkts_enq_len += rte_ring_sc_dequeue_burst (swq,
+						     (void **)
+						     &pkts_enq[pkts_enq_len],
+						     hqos->hqos_burst_enq);
+
+	  /* Get next SWQ for this device */
+	  swq_pos++;
+	  if (swq_pos >= n_swq)
+	    swq_pos = 0;
+	  hqos->swq_pos = swq_pos;
+
+	  /* HWQ TX enqueue when burst available */
+	  if (pkts_enq_len >= hqos->hqos_burst_enq)
+	    {
+	      dpdk_hqos_dbg_bypass_tx (device_index, queue_id,
+				       pkts_enq, pkts_enq_len);
+
+	      pkts_enq_len = 0;
+	      flush_count = 0;
+	      break;
+	    }
+	}
+      if (pkts_enq_len)
+	{
+	  flush_count++;
+	  if (PREDICT_FALSE (flush_count == HQOS_FLUSH_COUNT_THRESHOLD))
+	    {
+	      /* HWQ TX enqueue of the partial burst that timed out */
+	      dpdk_hqos_dbg_bypass_tx (device_index, queue_id,
+				       pkts_enq, pkts_enq_len);
+
+	      pkts_enq_len = 0;
+	      flush_count = 0;
+	    }
+	}
+      hqos->pkts_enq_len = pkts_enq_len;
+      hqos->flush_count = flush_count;
+
+      /* Advance to next device */
+      dev_pos++;
+    }
+}
+
+/*
+ * Main loop of an HQoS thread.  Never returns.
+ *
+ * Each iteration serves one of the devices assigned to this thread
+ * (dm->devices_by_hqos_cpu[cpu_index]), in round-robin order:
+ *  1. the device's software queues are polled round-robin and the packets
+ *     are collected in hqos->pkts_enq; a full burst (hqos_burst_enq) is
+ *     handed to the scheduler immediately, a partial burst only after it
+ *     has waited HQOS_FLUSH_COUNT_THRESHOLD visits;
+ *  2. up to hqos_burst_deq packets are dequeued from the scheduler and
+ *     written to the device TX queue.
+ */
+static_always_inline void
+dpdk_hqos_thread_internal (vlib_main_t * vm)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ u32 cpu_index = vm->cpu_index;
+ u32 dev_pos;
+
+ dev_pos = 0;
+ while (1)
+ {
+ vlib_worker_thread_barrier_check ();
+
+ u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]);
+ /* No device assigned: keep polling the barrier only */
+ if (PREDICT_FALSE (n_devs == 0))
+ {
+ dev_pos = 0;
+ continue;
+ }
+ if (dev_pos >= n_devs)
+ dev_pos = 0;
+
+ dpdk_device_and_queue_t *dq =
+ vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos);
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device);
+
+ dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht;
+ u32 device_index = xd->device_index;
+ u16 queue_id = dq->queue_id;
+
+ /* Work on local copies; they are written back after the SWQ pass */
+ struct rte_mbuf **pkts_enq = hqos->pkts_enq;
+ struct rte_mbuf **pkts_deq = hqos->pkts_deq;
+ u32 pkts_enq_len = hqos->pkts_enq_len;
+ u32 swq_pos = hqos->swq_pos;
+ u32 n_swq = vec_len (hqos->swq), i;
+ u32 flush_count = hqos->flush_count;
+
+ /*
+ * SWQ dequeue and HQoS enqueue for current device
+ */
+ for (i = 0; i < n_swq; i++)
+ {
+ /* Get current SWQ for this device */
+ struct rte_ring *swq = hqos->swq[swq_pos];
+
+ /* Read SWQ burst to packet buffer of this device */
+ pkts_enq_len += rte_ring_sc_dequeue_burst (swq,
+ (void **)
+ &pkts_enq[pkts_enq_len],
+ hqos->hqos_burst_enq);
+
+ /* Get next SWQ for this device */
+ swq_pos++;
+ if (swq_pos >= n_swq)
+ swq_pos = 0;
+ hqos->swq_pos = swq_pos;
+
+ /* HQoS enqueue when burst available */
+ if (pkts_enq_len >= hqos->hqos_burst_enq)
+ {
+ rte_sched_port_enqueue (hqos->hqos, pkts_enq, pkts_enq_len);
+
+ pkts_enq_len = 0;
+ flush_count = 0;
+ break;
+ }
+ }
+ /* Partial burst left over: flush it once it has waited long enough */
+ if (pkts_enq_len)
+ {
+ flush_count++;
+ if (PREDICT_FALSE (flush_count == HQOS_FLUSH_COUNT_THRESHOLD))
+ {
+ rte_sched_port_enqueue (hqos->hqos, pkts_enq, pkts_enq_len);
+
+ pkts_enq_len = 0;
+ flush_count = 0;
+ }
+ }
+ hqos->pkts_enq_len = pkts_enq_len;
+ hqos->flush_count = flush_count;
+
+ /*
+ * HQoS dequeue and HWQ TX enqueue for current device
+ */
+ {
+ u32 pkts_deq_len, n_pkts;
+
+ pkts_deq_len = rte_sched_port_dequeue (hqos->hqos,
+ pkts_deq,
+ hqos->hqos_burst_deq);
+
+ /* Retry until the driver has accepted every dequeued packet */
+ for (n_pkts = 0; n_pkts < pkts_deq_len;)
+ n_pkts += rte_eth_tx_burst (device_index,
+ (uint16_t) queue_id,
+ &pkts_deq[n_pkts],
+ (uint16_t) (pkts_deq_len - n_pkts));
+ }
+
+ /* Advance to next device */
+ dev_pos++;
+ }
+}
+
+/*
+ * Body of an HQoS thread.
+ *
+ * w
+ *   Information for the current thread
+ *
+ * Initialises the thread's clock and heap, waits for the worker threads to
+ * be released, checks that at least one device is assigned to this thread
+ * and then enters the main loop selected by DPDK_HQOS_DBG_BYPASS.
+ */
+void
+dpdk_hqos_thread (vlib_worker_thread_t * w)
+{
+  vlib_main_t *vm;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  dpdk_main_t *dm = &dpdk_main;
+
+  vm = vlib_get_main ();
+
+  ASSERT (vm->cpu_index == os_get_cpu_number ());
+
+  clib_time_init (&vm->clib_time);
+  clib_mem_set_heap (w->thread_mheap);
+
+  /* Wait until the dpdk init sequence is complete */
+  while (tm->worker_thread_release == 0)
+    vlib_worker_thread_barrier_check ();
+
+  if (vec_len (dm->devices_by_hqos_cpu[vm->cpu_index]) == 0)
+    {
+      /* This function returns void, so report first and return separately */
+      clib_error
+	("current I/O TX thread does not have any devices assigned to it");
+      return;
+    }
+
+  if (DPDK_HQOS_DBG_BYPASS)
+    dpdk_hqos_thread_internal_hqos_dbg_bypass (vm);
+  else
+    dpdk_hqos_thread_internal (vm);
+}
+
+/*
+ * Entry point registered for the "hqos-threads" thread type: performs the
+ * generic worker-thread initialisation and then runs the HQoS thread body.
+ * arg is the vlib_worker_thread_t of the thread being started.
+ */
+void
+dpdk_hqos_thread_fn (void *arg)
+{
+  vlib_worker_thread_init ((vlib_worker_thread_t *) arg);
+  dpdk_hqos_thread ((vlib_worker_thread_t *) arg);
+}
+
+/* Thread registration: threads named "hqos-threads" run
+ * dpdk_hqos_thread_fn. */
+/* *INDENT-OFF* */
+VLIB_REGISTER_THREAD (hqos_thread_reg, static) =
+{
+ .name = "hqos-threads",
+ .short_name = "hqos-threads",
+ .function = dpdk_hqos_thread_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * HQoS run-time code to be called by the worker threads
+ */
+/*
+ * Extract a bit field from packet data.
+ *
+ * Reads the 8 bytes starting at byte_array[slab_pos], converts them from
+ * big endian to CPU byte order, applies slab_mask and shifts the result
+ * right by slab_shr.  Evaluates to a u64.
+ *
+ * The bytes are fetched with memcpy because slab_pos can be any byte
+ * offset: reading through a casted u64 pointer would be a misaligned,
+ * type-punned access.
+ */
+#define BITFIELD(byte_array, slab_pos, slab_mask, slab_shr)       \
+({                                                                \
+  u64 slab;                                                       \
+  memcpy (&slab, &(byte_array)[(slab_pos)], sizeof (slab));       \
+  u64 val = (rte_be_to_cpu_64 (slab) & (slab_mask)) >> (slab_shr); \
+  val;                                                            \
+})
+
+/*
+ * Pack the scheduler hierarchy of a packet into one 64-bit word:
+ *   bits  0 ..  1  queue
+ *   bits  2 ..  3  traffic class
+ *   bits  4 ..  5  color
+ *   bits 16 .. 31  subport
+ *   bits 32 .. 63  pipe
+ * Each argument is masked to its field width before being placed.
+ */
+#define RTE_SCHED_PORT_HIERARCHY(subport, pipe, traffic_class, queue, color) \
+ ((((u64) (queue)) & 0x3) | \
+ ((((u64) (traffic_class)) & 0x3) << 2) | \
+ ((((u64) (color)) & 0x3) << 4) | \
+ ((((u64) (subport)) & 0xFFFF) << 16) | \
+ ((((u64) (pipe)) & 0xFFFFFFFF) << 32))
+
+/*
+ * Compute and store the scheduler metadata of a single packet.
+ *
+ * The subport, pipe and TC-table index are extracted from the packet data
+ * with the three packet-field descriptors of hqos.  The TC-table entry
+ * selected by the low 6 bits of the third field gives the traffic class
+ * (entry >> 2) and the queue (entry & 0x3).  The color is always 0.  The
+ * packed word is stored in pkt->hash.sched (lo = low 32 bits, hi = high
+ * 32 bits).
+ */
+static_always_inline void
+dpdk_hqos_metadata_set_one (dpdk_device_hqos_per_worker_thread_t * hqos,
+			    struct rte_mbuf *pkt)
+{
+  u8 *data = rte_pktmbuf_mtod (pkt, u8 *);
+
+  u64 subport_id = BITFIELD (data, hqos->hqos_field0_slabpos,
+			     hqos->hqos_field0_slabmask,
+			     hqos->hqos_field0_slabshr);
+  u64 pipe_id = BITFIELD (data, hqos->hqos_field1_slabpos,
+			  hqos->hqos_field1_slabmask,
+			  hqos->hqos_field1_slabshr);
+  u64 dscp = BITFIELD (data, hqos->hqos_field2_slabpos,
+		       hqos->hqos_field2_slabmask,
+		       hqos->hqos_field2_slabshr);
+  u32 tc = hqos->hqos_tc_table[dscp & 0x3F] >> 2;
+  u32 tc_q = hqos->hqos_tc_table[dscp & 0x3F] & 0x3;
+
+  u64 sched = RTE_SCHED_PORT_HIERARCHY (subport_id,
+					pipe_id,
+					tc,
+					tc_q,
+					0);
+
+  pkt->hash.sched.lo = sched & 0xFFFFFFFF;
+  pkt->hash.sched.hi = sched >> 32;
+}
+
+/*
+ * Set the scheduler metadata of n_pkts packets before they are handed to
+ * the HQoS thread.  Packets are handled four at a time, followed by a tail
+ * loop for the remaining 0 .. 3 packets.
+ */
+void
+dpdk_hqos_metadata_set (dpdk_device_hqos_per_worker_thread_t * hqos,
+			struct rte_mbuf **pkts, u32 n_pkts)
+{
+  u32 n_quads = n_pkts & (~0x3);
+  u32 i;
+
+  for (i = 0; i < n_quads; i += 4)
+    {
+      dpdk_hqos_metadata_set_one (hqos, pkts[i]);
+      dpdk_hqos_metadata_set_one (hqos, pkts[i + 1]);
+      dpdk_hqos_metadata_set_one (hqos, pkts[i + 2]);
+      dpdk_hqos_metadata_set_one (hqos, pkts[i + 3]);
+    }
+
+  for (; i < n_pkts; i++)
+    dpdk_hqos_metadata_set_one (hqos, pkts[i]);
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/init.c b/src/vnet/devices/dpdk/init.c
new file mode 100755
index 00000000000..693ca985130
--- /dev/null
+++ b/src/vnet/devices/dpdk/init.c
@@ -0,0 +1,1803 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/bitmap.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vlib/unix/physmem.h>
+#include <vlib/pci/pci.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "dpdk_priv.h"
+
+dpdk_main_t dpdk_main; /* DPDK driver state; code in this file reaches it via &dpdk_main */
+
+/*
+ * Force the linker to link functions used by vlib and declared weak:
+ * storing their addresses in this table creates a reference to each one.
+ */
+void *vlib_weakly_linked_functions[] = {
+ &rte_pktmbuf_init,
+ &rte_pktmbuf_pool_init,
+};
+
+#define LINK_STATE_ELOGS 0
+/*
+ * DEFAULT_HUGE_DIR is where dpdk_config() mounts hugetlbfs when neither
+ * "no-hugetlb" nor an explicit "huge-dir" is configured; VPP_RUN_DIR is
+ * created first with mkdir().
+ */
+#define DEFAULT_HUGE_DIR "/run/vpp/hugepages"
+#define VPP_RUN_DIR "/run/vpp"
+
+/*
+ * Port configuration, mildly modified Intel app values.
+ * dpdk_lib_init() sets rxmode.jumbo_frame in this template according to
+ * the no_multi_seg setting and then copies the template into each
+ * device's xd->port_conf.
+ */
+
+static struct rte_eth_conf port_conf_template = {
+ .rxmode = {
+ .split_hdr_size = 0,
+ .header_split = 0, /**< Header Split disabled */
+ .hw_ip_checksum = 0, /**< IP checksum offload disabled */
+ .hw_vlan_filter = 0, /**< VLAN filtering disabled */
+ .hw_strip_crc = 0, /**< CRC stripping by hardware disabled */
+ },
+ .txmode = {
+ .mq_mode = ETH_MQ_TX_NONE,
+ },
+};
+
+/*
+ * Configure a DPDK port and (re)create all of its TX and RX queues.
+ *
+ * A port that is administratively up is taken down first (hardware
+ * interface flags cleared, device stopped) and started again at the end.
+ * Every queue is first set up on the device's own CPU socket and retried
+ * with SOCKET_ID_ANY if that fails.
+ *
+ * Returns 0 on success, otherwise a clib error naming the DPDK call that
+ * failed.  A failing rte_eth_dev_start() is only logged.
+ */
+clib_error_t *
+dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd)
+{
+  vlib_buffer_main_t *bm = vlib_get_main ()->buffer_main;
+  int status;
+  int qi;
+
+  ASSERT (os_get_cpu_number () == 0);
+
+  /* quiesce a running port before touching its configuration */
+  if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+    {
+      vnet_hw_interface_set_flags (dm->vnet_main, xd->vlib_hw_if_index, 0);
+      rte_eth_dev_stop (xd->device_index);
+    }
+
+  status = rte_eth_dev_configure (xd->device_index, xd->rx_q_used,
+                                  xd->tx_q_used, &xd->port_conf);
+  if (status < 0)
+    return clib_error_return (0, "rte_eth_dev_configure[%d]: err %d",
+                              xd->device_index, status);
+
+  /* Set up one TX-queue per worker thread */
+  for (qi = 0; qi < xd->tx_q_used; qi++)
+    {
+      status = rte_eth_tx_queue_setup (xd->device_index, qi, xd->nb_tx_desc,
+                                       xd->cpu_socket, &xd->tx_conf);
+      if (status < 0)
+        {
+          /* retry with any other CPU socket */
+          status = rte_eth_tx_queue_setup (xd->device_index, qi,
+                                           xd->nb_tx_desc, SOCKET_ID_ANY,
+                                           &xd->tx_conf);
+        }
+      if (status < 0)
+        return clib_error_return (0, "rte_eth_tx_queue_setup[%d]: err %d",
+                                  xd->device_index, status);
+    }
+
+  /* RX queues draw buffers from the mempool of the socket recorded per queue */
+  for (qi = 0; qi < xd->rx_q_used; qi++)
+    {
+      status = rte_eth_rx_queue_setup (xd->device_index, qi, xd->nb_rx_desc,
+                                       xd->cpu_socket, 0,
+                                       bm->pktmbuf_pools
+                                       [xd->cpu_socket_id_by_queue[qi]]);
+      if (status < 0)
+        {
+          /* retry with any other CPU socket */
+          status = rte_eth_rx_queue_setup (xd->device_index, qi,
+                                           xd->nb_rx_desc, SOCKET_ID_ANY, 0,
+                                           bm->pktmbuf_pools
+                                           [xd->cpu_socket_id_by_queue[qi]]);
+        }
+      if (status < 0)
+        return clib_error_return (0, "rte_eth_rx_queue_setup[%d]: err %d",
+                                  xd->device_index, status);
+    }
+
+  if (!(xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP))
+    return 0;
+
+  /* bring the port back up; a failure here is reported but not fatal */
+  status = rte_eth_dev_start (xd->device_index);
+  if (status < 0)
+    clib_warning ("rte_eth_dev_start %d returned %d",
+                  xd->device_index, status);
+
+  return 0;
+}
+
+/*
+ * Ethernet flag-change callback (passed to ethernet_register_interface()
+ * by dpdk_lib_init()).
+ *
+ * Promiscuous request: records the new setting in xd->flags, applies it
+ * to the port when the device is administratively up, and returns the
+ * previous promiscuous state (0 or 1).
+ *
+ * MTU request: for the Cisco VIC (ENIC) the MTU cannot be changed, so
+ * hi->max_packet_bytes is restored from the device info and a message is
+ * printed.  For every other PMD the port is stopped (if up), reconfigured
+ * with the new max_rx_pkt_len, the MTU is set, and the port is restarted
+ * (if up).  Returns 0.
+ *
+ * Any other request returns 0 without side effects.
+ */
+static u32
+dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags)
+{
+  dpdk_device_t *xd = vec_elt_at_index (dpdk_main.devices, hi->dev_instance);
+  int rv;
+
+  if (ETHERNET_INTERFACE_FLAG_CONFIG_PROMISC (flags))
+    {
+      u32 was_promisc = (xd->flags & DPDK_DEVICE_FLAG_PROMISC) != 0;
+
+      if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL)
+        xd->flags |= DPDK_DEVICE_FLAG_PROMISC;
+      else
+        xd->flags &= ~DPDK_DEVICE_FLAG_PROMISC;
+
+      /* the hardware is only touched while the port is up */
+      if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+        {
+          if (xd->flags & DPDK_DEVICE_FLAG_PROMISC)
+            rte_eth_promiscuous_enable (xd->device_index);
+          else
+            rte_eth_promiscuous_disable (xd->device_index);
+        }
+
+      return was_promisc;
+    }
+
+  if (!ETHERNET_INTERFACE_FLAG_CONFIG_MTU (flags))
+    return 0;
+
+  /*
+   * DAW-FIXME: The Cisco VIC firmware does not provide an api for a
+   *            driver to dynamically change the mtu.  If/when the
+   *            VIC firmware gets fixed, then this should be removed.
+   */
+  if (xd->pmd == VNET_DPDK_PMD_ENIC)
+    {
+      struct rte_eth_dev_info vic_info;
+
+      /* Restore mtu to what has been set by CIMC in the firmware cfg. */
+      rte_eth_dev_info_get (xd->device_index, &vic_info);
+      hi->max_packet_bytes = vic_info.max_rx_pktlen;
+
+      vlib_cli_output (vlib_get_main (),
+                       "Cisco VIC mtu can only be changed "
+                       "using CIMC then rebooting the server!");
+      return 0;
+    }
+
+  xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes;
+
+  if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+    rte_eth_dev_stop (xd->device_index);
+
+  rv = rte_eth_dev_configure (xd->device_index, xd->rx_q_used,
+                              xd->tx_q_used, &xd->port_conf);
+  if (rv < 0)
+    vlib_cli_output (vlib_get_main (),
+                     "rte_eth_dev_configure[%d]: err %d",
+                     xd->device_index, rv);
+
+  /* the MTU is applied even if the reconfigure above reported an error */
+  rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes);
+
+  if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+    {
+      rv = rte_eth_dev_start (xd->device_index);
+      if (rv < 0)
+        clib_warning ("rte_eth_dev_start %d returned %d",
+                      xd->device_index, rv);
+    }
+
+  return 0;
+}
+
+/*
+ * Allocate one zero-filled, cache-line-sized and cache-line-aligned lock
+ * word per TX queue in use and store the pointers in xd->lockp.
+ */
+void
+dpdk_device_lock_init (dpdk_device_t * xd)
+{
+  int n_queues = xd->tx_q_used;
+  int qi;
+
+  vec_validate (xd->lockp, n_queues - 1);
+
+  for (qi = 0; qi < n_queues; qi++)
+    {
+      void *line = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
+                                           CLIB_CACHE_LINE_BYTES);
+
+      memset (line, 0, CLIB_CACHE_LINE_BYTES);
+      xd->lockp[qi] = line;
+    }
+}
+
+/*
+ * Release every per-queue lock allocated by dpdk_device_lock_init() and
+ * then the xd->lockp vector itself, leaving xd->lockp == 0.
+ */
+void
+dpdk_device_lock_free (dpdk_device_t * xd)
+{
+  uword n_locks = vec_len (xd->lockp);
+  uword qi;
+
+  for (qi = 0; qi < n_locks; qi++)
+    clib_mem_free ((void *) xd->lockp[qi]);
+
+  vec_free (xd->lockp);
+  xd->lockp = 0;
+}
+/*
+ * Discover every DPDK ethernet port and create the matching VPP interface.
+ *
+ * For each port reported by rte_eth_dev_count() this routine:
+ *  - looks up the per-device config by PCI address, falling back to
+ *    dm->conf->default_devconf;
+ *  - appends a dpdk_device_t to dm->devices and fills in queue counts,
+ *    descriptor counts, PMD / port type and the maximum receive length;
+ *  - assigns each RX queue to an input thread (dm->devices_by_cpu) and,
+ *    when HQoS is enabled, the device to an HQoS thread;
+ *  - calls dpdk_port_setup() and registers the ethernet interface.
+ *
+ * Returns 0 on success, or the first clib error encountered; on error the
+ * remaining ports are not processed.
+ */
+static clib_error_t *
+dpdk_lib_init (dpdk_main_t * dm)
+{
+ u32 nports;
+ u32 nb_desc = 0;
+ int i;
+ clib_error_t *error;
+ vlib_main_t *vm = vlib_get_main ();
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ vnet_sw_interface_t *sw;
+ vnet_hw_interface_t *hi;
+ dpdk_device_t *xd;
+ vlib_pci_addr_t last_pci_addr;
+ u32 last_pci_addr_port = 0;
+ vlib_thread_registration_t *tr, *tr_hqos;
+ uword *p, *p_hqos;
+
+ u32 next_cpu = 0, next_hqos_cpu = 0;
+ u8 af_packet_port_id = 0;
+ last_pci_addr.as_u32 = ~0;
+ /* default: one input thread at index 0; overridden below when a
+ * "workers" thread registration with a non-zero count exists */
+ dm->input_cpu_first_index = 0;
+ dm->input_cpu_count = 1;
+
+ /* find out which cpus will be used for input */
+ p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ tr = p ? (vlib_thread_registration_t *) p[0] : 0;
+
+ if (tr && tr->count > 0)
+ {
+ dm->input_cpu_first_index = tr->first_index;
+ dm->input_cpu_count = tr->count;
+ }
+
+ vec_validate_aligned (dm->devices_by_cpu, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ vec_validate_aligned (dm->workers, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+ /* default: no HQoS threads; overridden below when an "hqos-threads"
+ * thread registration with a non-zero count exists */
+ dm->hqos_cpu_first_index = 0;
+ dm->hqos_cpu_count = 0;
+
+ /* find out which cpus will be used for I/O TX */
+ p_hqos = hash_get_mem (tm->thread_registrations_by_name, "hqos-threads");
+ tr_hqos = p_hqos ? (vlib_thread_registration_t *) p_hqos[0] : 0;
+
+ if (tr_hqos && tr_hqos->count > 0)
+ {
+ dm->hqos_cpu_first_index = tr_hqos->first_index;
+ dm->hqos_cpu_count = tr_hqos->count;
+ }
+
+ vec_validate_aligned (dm->devices_by_hqos_cpu, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ vec_validate_aligned (dm->hqos_threads, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ nports = rte_eth_dev_count ();
+ if (nports < 1)
+ {
+ clib_warning ("DPDK drivers found no ports...");
+ }
+
+ if (CLIB_DEBUG > 0)
+ clib_warning ("DPDK drivers found %d ports...", nports);
+
+ /*
+ * All buffers are all allocated from the same rte_mempool.
+ * Thus they all have the same number of data bytes.
+ */
+ dm->vlib_buffer_free_list_index =
+ vlib_buffer_get_or_create_free_list (vm,
+ VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES,
+ "dpdk rx");
+ /* when enable_tcp_udp_checksum is set, clear the L4-checksum
+ * COMPUTED/CORRECT bits from the buffer flags template */
+ if (dm->conf->enable_tcp_udp_checksum)
+ dm->buffer_flags_template &= ~(IP_BUFFER_L4_CHECKSUM_CORRECT
+ | IP_BUFFER_L4_CHECKSUM_COMPUTED);
+
+ for (i = 0; i < nports; i++)
+ {
+ u8 addr[6];
+ u8 vlan_strip = 0;
+ int j;
+ struct rte_eth_dev_info dev_info;
+ clib_error_t *rv;
+ struct rte_eth_link l;
+ dpdk_device_config_t *devconf = 0;
+ vlib_pci_addr_t pci_addr;
+ uword *p = 0;
+
+ rte_eth_dev_info_get (i, &dev_info);
+ if (dev_info.pci_dev) /* bonded interface has no pci info */
+ {
+ pci_addr.domain = dev_info.pci_dev->addr.domain;
+ pci_addr.bus = dev_info.pci_dev->addr.bus;
+ pci_addr.slot = dev_info.pci_dev->addr.devid;
+ pci_addr.function = dev_info.pci_dev->addr.function;
+ p =
+ hash_get (dm->conf->device_config_index_by_pci_addr,
+ pci_addr.as_u32);
+ }
+
+ if (p)
+ devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+ else
+ devconf = &dm->conf->default_devconf;
+
+ /* Create vnet interface */
+ vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES);
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT;
+ xd->cpu_socket = (i8) rte_eth_dev_socket_id (i);
+
+ /* Handle interface naming for devices with multiple ports sharing same PCI ID */
+ if (dev_info.pci_dev)
+ {
+ struct rte_eth_dev_info di = { 0 };
+ rte_eth_dev_info_get (i + 1, &di); /* NOTE(review): on the last
+ * iteration i + 1 == nports; this relies on the call leaving the
+ * zero-initialised di untouched for a non-existent port -- confirm */
+ if (di.pci_dev && pci_addr.as_u32 != last_pci_addr.as_u32 &&
+ memcmp (&dev_info.pci_dev->addr, &di.pci_dev->addr,
+ sizeof (struct rte_pci_addr)) == 0)
+ {
+ xd->interface_name_suffix = format (0, "0");
+ last_pci_addr.as_u32 = pci_addr.as_u32;
+ last_pci_addr_port = i;
+ }
+ else if (pci_addr.as_u32 == last_pci_addr.as_u32)
+ {
+ xd->interface_name_suffix =
+ format (0, "%u", i - last_pci_addr_port);
+ }
+ else
+ {
+ last_pci_addr.as_u32 = ~0;
+ }
+ }
+ else
+ last_pci_addr.as_u32 = ~0;
+ /* start from the driver's default TX queue config, then apply the
+ * multi-segment policy; note that the file-scope port_conf_template
+ * is modified here before being copied into xd->port_conf below */
+ clib_memcpy (&xd->tx_conf, &dev_info.default_txconf,
+ sizeof (struct rte_eth_txconf));
+ if (dm->conf->no_multi_seg)
+ {
+ xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+ port_conf_template.rxmode.jumbo_frame = 0;
+ }
+ else
+ {
+ xd->tx_conf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS;
+ port_conf_template.rxmode.jumbo_frame = 1;
+ xd->flags |= DPDK_DEVICE_FLAG_MAYBE_MULTISEG;
+ }
+
+ clib_memcpy (&xd->port_conf, &port_conf_template,
+ sizeof (struct rte_eth_conf));
+ /* TX queues: at most one per vlib main, capped by the device maximum
+ * and by a non-zero configured num_tx_queues */
+ xd->tx_q_used = clib_min (dev_info.max_tx_queues, tm->n_vlib_mains);
+
+ if (devconf->num_tx_queues > 0
+ && devconf->num_tx_queues < xd->tx_q_used)
+ xd->tx_q_used = clib_min (xd->tx_q_used, devconf->num_tx_queues);
+
+ if (devconf->num_rx_queues > 1 && dm->use_rss == 0)
+ {
+ dm->use_rss = 1;
+ }
+ /* RX queues: RSS with the configured queue count only when more than
+ * one queue is requested and the device supports that many; otherwise
+ * a single RX queue is used */
+ if (devconf->num_rx_queues > 1
+ && dev_info.max_rx_queues >= devconf->num_rx_queues)
+ {
+ xd->rx_q_used = devconf->num_rx_queues;
+ xd->port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+ if (devconf->rss_fn == 0)
+ xd->port_conf.rx_adv_conf.rss_conf.rss_hf =
+ ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP;
+ else
+ xd->port_conf.rx_adv_conf.rss_conf.rss_hf = devconf->rss_fn;
+ }
+ else
+ xd->rx_q_used = 1;
+
+ xd->flags |= DPDK_DEVICE_FLAG_PMD;
+
+ /* workaround for drivers not setting driver_name */
+ if ((!dev_info.driver_name) && (dev_info.pci_dev))
+#if RTE_VERSION < RTE_VERSION_NUM(16, 11, 0, 0)
+ dev_info.driver_name = dev_info.pci_dev->driver->name;
+#else
+ dev_info.driver_name = dev_info.pci_dev->driver->driver.name;
+#endif
+ ASSERT (dev_info.driver_name);
+ /* map the driver name to a PMD id through foreach_dpdk_pmd, then derive
+ * the port type and any PMD-specific descriptor counts / flags */
+ if (!xd->pmd)
+ {
+
+
+#define _(s,f) else if (dev_info.driver_name && \
+ !strcmp(dev_info.driver_name, s)) \
+ xd->pmd = VNET_DPDK_PMD_##f;
+ if (0)
+ ;
+ foreach_dpdk_pmd
+#undef _
+ else
+ xd->pmd = VNET_DPDK_PMD_UNKNOWN;
+
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT;
+
+ switch (xd->pmd)
+ {
+ /* 1G adapters */
+ case VNET_DPDK_PMD_E1000EM:
+ case VNET_DPDK_PMD_IGB:
+ case VNET_DPDK_PMD_IGBVF:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ break;
+
+ /* 10G adapters */
+ case VNET_DPDK_PMD_IXGBE:
+ case VNET_DPDK_PMD_IXGBEVF:
+ case VNET_DPDK_PMD_THUNDERX:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+ case VNET_DPDK_PMD_DPAA2:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+
+ /* Cisco VIC */
+ case VNET_DPDK_PMD_ENIC:
+ rte_eth_link_get_nowait (i, &l);
+ xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE;
+ if (l.link_speed == 40000)
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ else
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+
+ /* Intel Fortville */
+ case VNET_DPDK_PMD_I40E:
+ case VNET_DPDK_PMD_I40EVF:
+ xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE;
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+
+ switch (dev_info.pci_dev->id.device_id)
+ {
+ case I40E_DEV_ID_10G_BASE_T:
+ case I40E_DEV_ID_SFP_XL710:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+ case I40E_DEV_ID_QSFP_A:
+ case I40E_DEV_ID_QSFP_B:
+ case I40E_DEV_ID_QSFP_C:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ break;
+ case I40E_DEV_ID_VF:
+ rte_eth_link_get_nowait (i, &l);
+ xd->port_type = l.link_speed == 10000 ?
+ VNET_DPDK_PORT_TYPE_ETH_10G : VNET_DPDK_PORT_TYPE_ETH_40G;
+ break;
+ default:
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ }
+ break;
+
+ case VNET_DPDK_PMD_CXGBE:
+ switch (dev_info.pci_dev->id.device_id)
+ {
+ case 0x540d: /* T580-CR */
+ case 0x5410: /* T580-LP-cr */
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ break;
+ case 0x5403: /* T540-CR */
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+ default:
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ }
+ break;
+
+ case VNET_DPDK_PMD_MLX5:
+ {
+ char *pn_100g[] = { "MCX415A-CCAT", "MCX416A-CCAT", 0 };
+ char *pn_40g[] = { "MCX413A-BCAT", "MCX414A-BCAT",
+ "MCX415A-BCAT", "MCX416A-BCAT", "MCX4131A-BCAT", 0
+ };
+ char *pn_10g[] = { "MCX4111A-XCAT", "MCX4121A-XCAT", 0 };
+
+ vlib_pci_device_t *pd = vlib_get_pci_device (&pci_addr);
+ u8 *pn = 0;
+ char **c;
+ int found = 0;
+ pn = format (0, "%U%c",
+ format_vlib_pci_vpd, pd->vpd_r, "PN", 0);
+ /* The port speed is chosen by prefix-matching the VPD "PN" string
+ * against the part-number tables above.
+ * NOTE(review): 'found' is never set to 1 in the loops below, so all
+ * three tables are always scanned; pd is also dereferenced above
+ * without a NULL check -- confirm vlib_get_pci_device() cannot
+ * return 0 here. */
+ if (!pn)
+ break;
+
+ c = pn_100g;
+ while (!found && c[0])
+ {
+ if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0)
+ {
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_100G;
+ break;
+ }
+ c++;
+ }
+
+ c = pn_40g;
+ while (!found && c[0])
+ {
+ if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0)
+ {
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ break;
+ }
+ c++;
+ }
+
+ c = pn_10g;
+ while (!found && c[0])
+ {
+ if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0)
+ {
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+ }
+ c++;
+ }
+
+ vec_free (pn);
+ }
+
+ break;
+ /* Intel Red Rock Canyon */
+ case VNET_DPDK_PMD_FM10K:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_SWITCH;
+ break;
+
+ /* virtio */
+ case VNET_DPDK_PMD_VIRTIO:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_VIRTIO;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_VIRTIO;
+ break;
+
+ /* vmxnet3 */
+ case VNET_DPDK_PMD_VMXNET3:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+ break;
+
+ case VNET_DPDK_PMD_AF_PACKET:
+ xd->port_type = VNET_DPDK_PORT_TYPE_AF_PACKET;
+ xd->af_packet_port_id = af_packet_port_id++;
+ break;
+
+ case VNET_DPDK_PMD_BOND:
+ xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE;
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_BOND;
+ break;
+
+ default:
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ }
+
+ if (devconf->num_rx_desc)
+ xd->nb_rx_desc = devconf->num_rx_desc;
+
+ if (devconf->num_tx_desc)
+ xd->nb_tx_desc = devconf->num_tx_desc;
+ }
+
+ /*
+ * Ensure default mtu is not > the mtu read from the hardware.
+ * Otherwise rte_eth_dev_configure() will fail and the port will
+ * not be available.
+ */
+ if (ETHERNET_MAX_PACKET_BYTES > dev_info.max_rx_pktlen)
+ {
+ /*
+ * This device does not support the platforms's max frame
+ * size. Use it's advertised mru instead.
+ */
+ xd->port_conf.rxmode.max_rx_pkt_len = dev_info.max_rx_pktlen;
+ }
+ else
+ {
+ xd->port_conf.rxmode.max_rx_pkt_len = ETHERNET_MAX_PACKET_BYTES;
+
+ /*
+ * Some platforms do not account for Ethernet FCS (4 bytes) in
+ * MTU calculations. To interop with them increase mru but only
+ * if the device's settings can support it.
+ */
+ if ((dev_info.max_rx_pktlen >= (ETHERNET_MAX_PACKET_BYTES + 4)) &&
+ xd->port_conf.rxmode.hw_strip_crc)
+ {
+ /*
+ * Allow additional 4 bytes (for Ethernet FCS). These bytes are
+ * stripped by h/w and so will not consume any buffer memory.
+ */
+ xd->port_conf.rxmode.max_rx_pkt_len += 4;
+ }
+ }
+ /* af_packet ports get a generated MAC address: 02:fe followed by four
+ * pseudo-random bytes seeded from the current time; every other port
+ * uses the address reported by rte_eth_macaddr_get() */
+ if (xd->pmd == VNET_DPDK_PMD_AF_PACKET)
+ {
+ f64 now = vlib_time_now (vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+ clib_memcpy (addr + 2, &rnd, sizeof (rnd));
+ addr[0] = 2;
+ addr[1] = 0xfe;
+ }
+ else
+ rte_eth_macaddr_get (i, (struct ether_addr *) addr);
+ /* per-queue locks are only allocated when there are fewer TX queues
+ * than vlib mains (presumably because queues are then shared) */
+ if (xd->tx_q_used < tm->n_vlib_mains)
+ dpdk_device_lock_init (xd);
+
+ xd->device_index = xd - dm->devices;
+ ASSERT (i == xd->device_index);
+ xd->per_interface_next_index = ~0;
+
+ /* assign interface to input thread */
+ dpdk_device_and_queue_t *dq;
+ int q;
+
+ if (devconf->workers)
+ {
+ int i;
+ q = 0;
+ /* *INDENT-OFF* */
+ clib_bitmap_foreach (i, devconf->workers, ({
+ int cpu = dm->input_cpu_first_index + i;
+ unsigned lcore = vlib_worker_threads[cpu].lcore_id;
+ vec_validate(xd->cpu_socket_id_by_queue, q);
+ xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id(lcore);
+ vec_add2(dm->devices_by_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = q++;
+ }));
+ /* *INDENT-ON* */
+ }
+ else
+ for (q = 0; q < xd->rx_q_used; q++)
+ {
+ int cpu = dm->input_cpu_first_index + next_cpu;
+ unsigned lcore = vlib_worker_threads[cpu].lcore_id;
+
+ /*
+ * numa node for worker thread handling this queue
+ * needed for taking buffers from the right mempool
+ */
+ vec_validate (xd->cpu_socket_id_by_queue, q);
+ xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id (lcore);
+
+ /*
+ * construct vector of (device,queue) pairs for each worker thread
+ */
+ vec_add2 (dm->devices_by_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = q;
+
+ next_cpu++;
+ if (next_cpu == dm->input_cpu_count)
+ next_cpu = 0;
+ }
+
+ /* HQoS: use the explicitly configured thread when one is marked valid,
+ * otherwise hand devices out to the HQoS threads round-robin */
+ if (devconf->hqos_enabled)
+ {
+ xd->flags |= DPDK_DEVICE_FLAG_HQOS;
+
+ if (devconf->hqos.hqos_thread_valid)
+ {
+ int cpu = dm->hqos_cpu_first_index + devconf->hqos.hqos_thread;
+
+ if (devconf->hqos.hqos_thread >= dm->hqos_cpu_count)
+ return clib_error_return (0, "invalid HQoS thread index");
+
+ vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = 0;
+ }
+ else
+ {
+ int cpu = dm->hqos_cpu_first_index + next_hqos_cpu;
+
+ if (dm->hqos_cpu_count == 0)
+ return clib_error_return (0, "no HQoS threads available");
+
+ vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = 0;
+
+ next_hqos_cpu++;
+ if (next_hqos_cpu == dm->hqos_cpu_count)
+ next_hqos_cpu = 0;
+
+ devconf->hqos.hqos_thread_valid = 1;
+ devconf->hqos.hqos_thread = cpu;
+ }
+ }
+ /* one TX vector per vlib main, allocated up to index nb_tx_desc and
+ * then reset to length 0 */
+ vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j < tm->n_vlib_mains; j++)
+ {
+ vec_validate_ha (xd->tx_vectors[j], xd->nb_tx_desc,
+ sizeof (tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->tx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->rx_vectors, xd->rx_q_used,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j < xd->rx_q_used; j++)
+ {
+ vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE - 1,
+ CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->rx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->d_trace_buffers, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+
+ rv = dpdk_port_setup (dm, xd);
+
+ if (rv)
+ return rv;
+
+ if (devconf->hqos_enabled)
+ {
+ rv = dpdk_port_setup_hqos (xd, &devconf->hqos);
+ if (rv)
+ return rv;
+ }
+
+ /* count the number of descriptors used for this device
+ * NOTE(review): nb_rx_desc is not multiplied by rx_q_used, unlike the
+ * TX term -- confirm this is intended */
+ nb_desc += xd->nb_rx_desc + xd->nb_tx_desc * xd->tx_q_used;
+
+ error = ethernet_register_interface
+ (dm->vnet_main, dpdk_device_class.index, xd->device_index,
+ /* ethernet address */ addr,
+ &xd->vlib_hw_if_index, dpdk_flag_change);
+ if (error)
+ return error;
+
+ sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+ xd->vlib_sw_if_index = sw->sw_if_index;
+ hi = vnet_get_hw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+
+ /*
+ * DAW-FIXME: The Cisco VIC firmware does not provide an api for a
+ * driver to dynamically change the mtu. If/when the
+ * VIC firmware gets fixed, then this should be removed.
+ */
+ if (xd->pmd == VNET_DPDK_PMD_ENIC)
+ {
+ /*
+ * Initialize mtu to what has been set by CIMC in the firmware cfg.
+ */
+ hi->max_packet_bytes = dev_info.max_rx_pktlen;
+ if (devconf->vlan_strip_offload != DPDK_DEVICE_VLAN_STRIP_OFF)
+ vlan_strip = 1; /* remove vlan tag from VIC port by default */
+ else
+ clib_warning ("VLAN strip disabled for interface\n");
+ }
+ else if (devconf->vlan_strip_offload == DPDK_DEVICE_VLAN_STRIP_ON)
+ vlan_strip = 1;
+
+ if (vlan_strip)
+ {
+ int vlan_off;
+ vlan_off = rte_eth_dev_get_vlan_offload (xd->device_index);
+ vlan_off |= ETH_VLAN_STRIP_OFFLOAD;
+ xd->port_conf.rxmode.hw_vlan_strip = vlan_off;
+ if (rte_eth_dev_set_vlan_offload (xd->device_index, vlan_off) == 0)
+ clib_warning ("VLAN strip enabled for interface\n");
+ else
+ clib_warning ("VLAN strip cannot be supported by interface\n");
+ }
+ /* the L3 limits are the maximum receive length minus the ethernet
+ * header, for both RX and TX */
+ hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] =
+ xd->port_conf.rxmode.max_rx_pkt_len - sizeof (ethernet_header_t);
+
+ rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes);
+ }
+ /* warn when the configured mbuf count is smaller than the total number
+ * of ring descriptors counted above */
+ if (nb_desc > dm->conf->num_mbufs)
+ clib_warning ("%d mbufs allocated but total rx/tx ring size is %d\n",
+ dm->conf->num_mbufs, nb_desc);
+
+ return 0;
+}
+/*
+ * Bind supported ethernet PCI devices to the UIO driver named by
+ * conf->uio_driver_name.
+ *
+ * Walks every device in pci_main.pci_devs.  When conf->dev_confs is
+ * non-empty only devices that have a config entry are considered.
+ * A device whose bind fails is marked is_blacklisted (a config entry is
+ * created for it if none exists) and the error is reported.
+ */
+static void
+dpdk_bind_devices_to_uio (dpdk_config_main_t * conf)
+{
+ vlib_pci_main_t *pm = &pci_main;
+ clib_error_t *error;
+ vlib_pci_device_t *d;
+ u8 *pci_addr = 0;
+ int num_whitelisted = vec_len (conf->dev_confs);
+
+ /* *INDENT-OFF* */
+ pool_foreach (d, pm->pci_devs, ({
+ dpdk_device_config_t * devconf = 0;
+ vec_reset_length (pci_addr);
+ pci_addr = format (pci_addr, "%U%c", format_vlib_pci_addr, &d->bus_address, 0);
+ /* pci_addr is a NUL-terminated string, only used in the warning below */
+ if (d->device_class != PCI_CLASS_NETWORK_ETHERNET)
+ continue;
+
+ if (num_whitelisted)
+ {
+ uword * p = hash_get (conf->device_config_index_by_pci_addr, d->bus_address.as_u32);
+
+ if (!p)
+ continue;
+
+ devconf = pool_elt_at_index (conf->dev_confs, p[0]);
+ }
+ /* only the vendor/device ids listed below are bound; any other
+ * ethernet device is reported as unsupported and skipped */
+ /* virtio */
+ if (d->vendor_id == 0x1af4 && d->device_id == 0x1000)
+ ;
+ /* vmxnet3 */
+ else if (d->vendor_id == 0x15ad && d->device_id == 0x07b0)
+ ;
+ /* all Intel devices */
+ else if (d->vendor_id == 0x8086)
+ ;
+ /* Cisco VIC */
+ else if (d->vendor_id == 0x1137 && d->device_id == 0x0043)
+ ;
+ /* Chelsio T4/T5 */
+ else if (d->vendor_id == 0x1425 && (d->device_id & 0xe000) == 0x4000)
+ ;
+ else
+ {
+ clib_warning ("Unsupported Ethernet PCI device 0x%04x:0x%04x found "
+ "at PCI address %s\n", (u16) d->vendor_id, (u16) d->device_id,
+ pci_addr);
+ continue;
+ }
+
+ error = vlib_pci_bind_to_uio (d, (char *) conf->uio_driver_name);
+ /* on failure, record the device as blacklisted, creating a config
+ * entry keyed by its PCI address when it has none */
+ if (error)
+ {
+ if (devconf == 0)
+ {
+ pool_get (conf->dev_confs, devconf);
+ hash_set (conf->device_config_index_by_pci_addr, d->bus_address.as_u32,
+ devconf - conf->dev_confs);
+ devconf->pci_addr.as_u32 = d->bus_address.as_u32;
+ }
+ devconf->is_blacklisted = 1;
+ clib_error_report (error);
+ }
+ }));
+ /* *INDENT-ON* */
+ vec_free (pci_addr);
+}
+
+/*
+ * Parse one "dev ..." stanza of the dpdk startup configuration.
+ *
+ * conf        configuration being populated
+ * pci_addr    PCI address the stanza applies to (stored in the entry)
+ * input       per-device options, or 0 when the stanza has no options
+ * is_default  non-zero: fill conf->default_devconf instead of creating a
+ *             new per-device entry
+ *
+ * The entry's HQoS settings are reset to their defaults before parsing.
+ * When "workers" is given without "num-rx-queues", the RX queue count is
+ * taken from the number of workers listed.
+ *
+ * Returns 0 on success.  An error is returned for a duplicate PCI
+ * address, an unknown option, a failing rss/hqos sub-parser, or a
+ * "workers" list whose size differs from an explicit num-rx-queues.
+ */
+static clib_error_t *
+dpdk_device_config (dpdk_config_main_t * conf, vlib_pci_addr_t pci_addr,
+                    unformat_input_t * input, u8 is_default)
+{
+  clib_error_t *error = 0;
+  uword *p;
+  dpdk_device_config_t *devconf;
+  unformat_input_t sub_input;
+
+  if (is_default)
+    {
+      devconf = &conf->default_devconf;
+    }
+  else
+    {
+      p = hash_get (conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+
+      /* each PCI address may be configured only once */
+      if (p)
+        return clib_error_return (0,
+                                  "duplicate configuration for PCI address %U",
+                                  format_vlib_pci_addr, &pci_addr);
+
+      pool_get (conf->dev_confs, devconf);
+      hash_set (conf->device_config_index_by_pci_addr, pci_addr.as_u32,
+                devconf - conf->dev_confs);
+    }
+
+  devconf->pci_addr.as_u32 = pci_addr.as_u32;
+  devconf->hqos_enabled = 0;
+  dpdk_device_config_hqos_default (&devconf->hqos);
+
+  /* a bare "dev <addr>" stanza carries no options */
+  if (!input)
+    return 0;
+
+  unformat_skip_white_space (input);
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "num-rx-queues %u", &devconf->num_rx_queues))
+        ;
+      else if (unformat (input, "num-tx-queues %u", &devconf->num_tx_queues))
+        ;
+      else if (unformat (input, "num-rx-desc %u", &devconf->num_rx_desc))
+        ;
+      else if (unformat (input, "num-tx-desc %u", &devconf->num_tx_desc))
+        ;
+      else if (unformat (input, "workers %U", unformat_bitmap_list,
+                         &devconf->workers))
+        ;
+      else if (unformat (input, "rss %U", unformat_vlib_cli_sub_input,
+                         &sub_input))
+        {
+          error = unformat_rss_fn (&sub_input, &devconf->rss_fn);
+          if (error)
+            break;
+        }
+      else if (unformat (input, "vlan-strip-offload off"))
+        devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_OFF;
+      else if (unformat (input, "vlan-strip-offload on"))
+        devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_ON;
+      /* "hqos { ... }" must be tried before the bare "hqos" keyword */
+      else if (unformat (input, "hqos %U", unformat_vlib_cli_sub_input,
+                         &sub_input))
+        {
+          devconf->hqos_enabled = 1;
+          error = unformat_hqos (&sub_input, &devconf->hqos);
+          if (error)
+            break;
+        }
+      else if (unformat (input, "hqos"))
+        {
+          devconf->hqos_enabled = 1;
+        }
+      else
+        {
+          error = clib_error_return (0, "unknown input `%U'",
+                                     format_unformat_error, input);
+          break;
+        }
+    }
+
+  if (error)
+    return error;
+
+  /* the worker list and the RX queue count must agree */
+  if (devconf->workers && devconf->num_rx_queues == 0)
+    devconf->num_rx_queues = clib_bitmap_count_set_bits (devconf->workers);
+  else if (devconf->workers &&
+           clib_bitmap_count_set_bits (devconf->workers) !=
+           devconf->num_rx_queues)
+    error =
+      clib_error_return (0,
+                         "%U: number of worker threads must be "
+                         "equal to number of rx queues", format_vlib_pci_addr,
+                         &pci_addr);
+
+  return error;
+}
+
+static clib_error_t *
+dpdk_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ clib_error_t *error = 0;
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_config_main_t *conf = &dpdk_config_main;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_device_config_t *devconf;
+ vlib_pci_addr_t pci_addr;
+ unformat_input_t sub_input;
+ u8 *s, *tmp = 0;
+ u8 *rte_cmd = 0, *ethname = 0;
+ u32 log_level;
+ int ret, i;
+ int num_whitelisted = 0;
+ u8 no_pci = 0;
+ u8 no_huge = 0;
+ u8 huge_dir = 0;
+ u8 file_prefix = 0;
+ u8 *socket_mem = 0;
+
+ conf->device_config_index_by_pci_addr = hash_create (0, sizeof (uword));
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ /* Prime the pump */
+ if (unformat (input, "no-hugetlb"))
+ {
+ vec_add1 (conf->eal_init_args, (u8 *) "no-huge");
+ no_huge = 1;
+ }
+
+ else if (unformat (input, "enable-tcp-udp-checksum"))
+ conf->enable_tcp_udp_checksum = 1;
+
+ else if (unformat (input, "decimal-interface-names"))
+ conf->interface_name_format_decimal = 1;
+
+ else if (unformat (input, "no-multi-seg"))
+ conf->no_multi_seg = 1;
+
+ else if (unformat (input, "dev default %U", unformat_vlib_cli_sub_input,
+ &sub_input))
+ {
+ error =
+ dpdk_device_config (conf, (vlib_pci_addr_t) (u32) ~ 1, &sub_input,
+ 1);
+
+ if (error)
+ return error;
+ }
+ else
+ if (unformat
+ (input, "dev %U %U", unformat_vlib_pci_addr, &pci_addr,
+ unformat_vlib_cli_sub_input, &sub_input))
+ {
+ error = dpdk_device_config (conf, pci_addr, &sub_input, 0);
+
+ if (error)
+ return error;
+
+ num_whitelisted++;
+ }
+ else if (unformat (input, "dev %U", unformat_vlib_pci_addr, &pci_addr))
+ {
+ error = dpdk_device_config (conf, pci_addr, 0, 0);
+
+ if (error)
+ return error;
+
+ num_whitelisted++;
+ }
+ else if (unformat (input, "num-mbufs %d", &conf->num_mbufs))
+ ;
+ else if (unformat (input, "kni %d", &conf->num_kni))
+ ;
+ else if (unformat (input, "uio-driver %s", &conf->uio_driver_name))
+ ;
+ else if (unformat (input, "socket-mem %s", &socket_mem))
+ ;
+ else if (unformat (input, "no-pci"))
+ {
+ no_pci = 1;
+ tmp = format (0, "--no-pci%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ }
+ else if (unformat (input, "poll-sleep %d", &dm->poll_sleep))
+ ;
+
+#define _(a) \
+ else if (unformat(input, #a)) \
+ { \
+ tmp = format (0, "--%s%c", #a, 0); \
+ vec_add1 (conf->eal_init_args, tmp); \
+ }
+ foreach_eal_double_hyphen_predicate_arg
+#undef _
+#define _(a) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ if (!strncmp(#a, "huge-dir", 8)) \
+ huge_dir = 1; \
+ else if (!strncmp(#a, "file-prefix", 11)) \
+ file_prefix = 1; \
+ tmp = format (0, "--%s%c", #a, 0); \
+ vec_add1 (conf->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (conf->eal_init_args, s); \
+ }
+ foreach_eal_double_hyphen_arg
+#undef _
+#define _(a,b) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ tmp = format (0, "-%s%c", #b, 0); \
+ vec_add1 (conf->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (conf->eal_init_args, s); \
+ }
+ foreach_eal_single_hyphen_arg
+#undef _
+#define _(a,b) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ tmp = format (0, "-%s%c", #b, 0); \
+ vec_add1 (conf->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (conf->eal_init_args, s); \
+ conf->a##_set_manually = 1; \
+ }
+ foreach_eal_single_hyphen_mandatory_arg
+#undef _
+ else if (unformat (input, "default"))
+ ;
+
+ else if (unformat_skip_white_space (input))
+ ;
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ }
+
+ if (!conf->uio_driver_name)
+ conf->uio_driver_name = format (0, "igb_uio%c", 0);
+
+ /*
+ * Use 1G huge pages if available.
+ */
+ if (!no_huge && !huge_dir)
+ {
+ u32 x, *mem_by_socket = 0;
+ uword c = 0;
+ u8 use_1g = 1;
+ u8 use_2m = 1;
+ u8 less_than_1g = 1;
+ int rv;
+
+ umount (DEFAULT_HUGE_DIR);
+
+ /* Process "socket-mem" parameter value */
+ if (vec_len (socket_mem))
+ {
+ unformat_input_t in;
+ unformat_init_vector (&in, socket_mem);
+ while (unformat_check_input (&in) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (&in, "%u,", &x))
+ ;
+ else if (unformat (&in, "%u", &x))
+ ;
+ else if (unformat (&in, ","))
+ x = 0;
+ else
+ break;
+
+ vec_add1 (mem_by_socket, x);
+
+ if (x > 1023)
+ less_than_1g = 0;
+ }
+ /* Note: unformat_free vec_frees(in.buffer), aka socket_mem... */
+ unformat_free (&in);
+ socket_mem = 0;
+ }
+ else
+ {
+ /* *INDENT-OFF* */
+ clib_bitmap_foreach (c, tm->cpu_socket_bitmap, (
+ {
+ vec_validate(mem_by_socket, c);
+ mem_by_socket[c] = 256; /* default per-socket mem */
+ }
+ ));
+ /* *INDENT-ON* */
+ }
+
+ /* check if available enough 1GB pages for each socket */
+ /* *INDENT-OFF* */
+ clib_bitmap_foreach (c, tm->cpu_socket_bitmap, (
+ {
+ int pages_avail, page_size, mem;
+
+ vec_validate(mem_by_socket, c);
+ mem = mem_by_socket[c];
+
+ page_size = 1024;
+ pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024);
+
+ if (pages_avail < 0 || page_size * pages_avail < mem)
+ use_1g = 0;
+
+ page_size = 2;
+ pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024);
+
+ if (pages_avail < 0 || page_size * pages_avail < mem)
+ use_2m = 0;
+ }));
+ /* *INDENT-ON* */
+
+ if (mem_by_socket == 0)
+ {
+ error = clib_error_return (0, "mem_by_socket NULL");
+ goto done;
+ }
+ _vec_len (mem_by_socket) = c + 1;
+
+ /* regenerate socket_mem string */
+ vec_foreach_index (x, mem_by_socket)
+ socket_mem = format (socket_mem, "%s%u",
+ socket_mem ? "," : "", mem_by_socket[x]);
+ socket_mem = format (socket_mem, "%c", 0);
+
+ vec_free (mem_by_socket);
+
+ rv = mkdir (VPP_RUN_DIR, 0755);
+ if (rv && errno != EEXIST)
+ {
+ error = clib_error_return (0, "mkdir '%s' failed errno %d",
+ VPP_RUN_DIR, errno);
+ goto done;
+ }
+
+ rv = mkdir (DEFAULT_HUGE_DIR, 0755);
+ if (rv && errno != EEXIST)
+ {
+ error = clib_error_return (0, "mkdir '%s' failed errno %d",
+ DEFAULT_HUGE_DIR, errno);
+ goto done;
+ }
+
+ if (use_1g && !(less_than_1g && use_2m))
+ {
+ rv =
+ mount ("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, "pagesize=1G");
+ }
+ else if (use_2m)
+ {
+ rv = mount ("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, NULL);
+ }
+ else
+ {
+ return clib_error_return (0, "not enough free huge pages");
+ }
+
+ if (rv)
+ {
+ error = clib_error_return (0, "mount failed %d", errno);
+ goto done;
+ }
+
+ tmp = format (0, "--huge-dir%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%s%c", DEFAULT_HUGE_DIR, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ if (!file_prefix)
+ {
+ tmp = format (0, "--file-prefix%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "vpp%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ }
+ }
+
+ vec_free (rte_cmd);
+ vec_free (ethname);
+
+ if (error)
+ return error;
+
+ /* I'll bet that -c and -n must be the first and second args... */
+ if (!conf->coremask_set_manually)
+ {
+ vlib_thread_registration_t *tr;
+ uword *coremask = 0;
+ int i;
+
+ /* main thread core */
+ coremask = clib_bitmap_set (coremask, tm->main_lcore, 1);
+
+ for (i = 0; i < vec_len (tm->registrations); i++)
+ {
+ tr = tm->registrations[i];
+ coremask = clib_bitmap_or (coremask, tr->coremask);
+ }
+
+ vec_insert (conf->eal_init_args, 2, 1);
+ conf->eal_init_args[1] = (u8 *) "-c";
+ tmp = format (0, "%U%c", format_bitmap_hex, coremask, 0);
+ conf->eal_init_args[2] = tmp;
+ clib_bitmap_free (coremask);
+ }
+
+ if (!conf->nchannels_set_manually)
+ {
+ vec_insert (conf->eal_init_args, 2, 3);
+ conf->eal_init_args[3] = (u8 *) "-n";
+ tmp = format (0, "%d", conf->nchannels);
+ conf->eal_init_args[4] = tmp;
+ }
+
+ if (no_pci == 0 && geteuid () == 0)
+ dpdk_bind_devices_to_uio (conf);
+
+#define _(x) \
+ if (devconf->x == 0 && conf->default_devconf.x > 0) \
+ devconf->x = conf->default_devconf.x ;
+
+ /* *INDENT-OFF* */
+ pool_foreach (devconf, conf->dev_confs, ({
+
+ /* default per-device config items */
+ foreach_dpdk_device_config_item
+
+ /* add DPDK EAL whitelist/blacklist entry */
+ if (num_whitelisted > 0 && devconf->is_blacklisted == 0)
+ {
+ tmp = format (0, "-w%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ }
+ else if (num_whitelisted == 0 && devconf->is_blacklisted != 0)
+ {
+ tmp = format (0, "-b%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ }
+ }));
+ /* *INDENT-ON* */
+
+#undef _
+
+ /* set master-lcore */
+ tmp = format (0, "--master-lcore%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%u%c", tm->main_lcore, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+
+ /* set socket-mem */
+ tmp = format (0, "--socket-mem%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%s%c", socket_mem, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+
+ /* NULL terminate the "argv" vector, in case of stupidity */
+ vec_add1 (conf->eal_init_args, 0);
+ _vec_len (conf->eal_init_args) -= 1;
+
+ /* Set up DPDK eal and packet mbuf pool early. */
+
+ log_level = (CLIB_DEBUG > 0) ? RTE_LOG_DEBUG : RTE_LOG_NOTICE;
+
+ rte_set_log_level (log_level);
+
+ vm = vlib_get_main ();
+
+ /* make copy of args as rte_eal_init tends to mess up with arg array */
+ for (i = 1; i < vec_len (conf->eal_init_args); i++)
+ conf->eal_init_args_str = format (conf->eal_init_args_str, "%s ",
+ conf->eal_init_args[i]);
+
+ ret =
+ rte_eal_init (vec_len (conf->eal_init_args),
+ (char **) conf->eal_init_args);
+
+ /* lazy umount hugepages */
+ umount2 (DEFAULT_HUGE_DIR, MNT_DETACH);
+
+ if (ret < 0)
+ return clib_error_return (0, "rte_eal_init returned %d", ret);
+
+ /* Dump the physical memory layout prior to creating the mbuf_pool */
+ fprintf (stdout, "DPDK physical memory layout:\n");
+ rte_dump_physmem_layout (stdout);
+
+ /* main thread 1st */
+ error = vlib_buffer_pool_create (vm, conf->num_mbufs, rte_socket_id ());
+ if (error)
+ return error;
+
+ for (i = 0; i < RTE_MAX_LCORE; i++)
+ {
+ error = vlib_buffer_pool_create (vm, conf->num_mbufs,
+ rte_lcore_to_socket_id (i));
+ if (error)
+ return error;
+ }
+
+done:
+ return error;
+}
+
+/* Register dpdk_config with vlib under the name "dpdk" (presumably the
+ * startup-configuration section this function parses). */
+VLIB_CONFIG_FUNCTION (dpdk_config, "dpdk");
+
+/*
+ * Re-read the DPDK link status of a device and push any change in link
+ * state, duplex or speed into the vnet hardware-interface flags.
+ *
+ * xd  - device to refresh; nothing is done unless DPDK_DEVICE_FLAG_PMD
+ *       is set in xd->flags.
+ * now - stored in xd->time_last_link_update; a value of 0 leaves the
+ *       previously recorded time untouched.
+ */
+void
+dpdk_update_link_state (dpdk_device_t * xd, f64 now)
+{
+ vnet_main_t *vnm = vnet_get_main ();
+ /* snapshot of the old link data, used below to detect duplex/speed changes */
+ struct rte_eth_link prev_link = xd->link;
+ u32 hw_flags = 0;
+ u8 hw_flags_chg = 0;
+
+ /* only update link state for PMD interfaces */
+ if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0)
+ return;
+
+ xd->time_last_link_update = now ? now : xd->time_last_link_update;
+ memset (&xd->link, 0, sizeof (xd->link));
+ rte_eth_link_get_nowait (xd->device_index, &xd->link);
+
+ /* optional event-log record: admin state plus old and new link state */
+ if (LINK_STATE_ELOGS)
+ {
+ vlib_main_t *vm = vlib_get_main ();
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format =
+ "update-link-state: sw_if_index %d, admin_up %d,"
+ "old link_state %d new link_state %d",.format_args = "i4i1i1i1",};
+
+ struct
+ {
+ u32 sw_if_index;
+ u8 admin_up;
+ u8 old_link_state;
+ u8 new_link_state;
+ } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->sw_if_index = xd->vlib_sw_if_index;
+ ed->admin_up = (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0;
+ ed->old_link_state = (u8)
+ vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index);
+ ed->new_link_state = (u8) xd->link.link_status;
+ }
+
+ /*
+ * A link up/down change is only considered while the device is admin-up;
+ * the XOR is true when DPDK's status disagrees with vnet's current view.
+ */
+ if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) &&
+ ((xd->link.link_status != 0) ^
+ vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index)))
+ {
+ hw_flags_chg = 1;
+ hw_flags |= (xd->link.link_status ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0);
+ }
+
+ /*
+ * hw_flags_chg is sticky: once anything above changed, the duplex and
+ * speed bits are always re-added to hw_flags in the two sections below.
+ * NOTE(review): if only duplex or speed changed, hw_flags carries no
+ * LINK_UP bit even when the link is up - confirm that
+ * vnet_hw_interface_set_flags tolerates that.
+ */
+ if (hw_flags_chg || (xd->link.link_duplex != prev_link.link_duplex))
+ {
+ hw_flags_chg = 1;
+ switch (xd->link.link_duplex)
+ {
+ case ETH_LINK_HALF_DUPLEX:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_HALF_DUPLEX;
+ break;
+ case ETH_LINK_FULL_DUPLEX:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_FULL_DUPLEX;
+ break;
+ default:
+ break;
+ }
+ }
+ if (hw_flags_chg || (xd->link.link_speed != prev_link.link_speed))
+ {
+ hw_flags_chg = 1;
+ switch (xd->link.link_speed)
+ {
+ case ETH_SPEED_NUM_10M:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10M;
+ break;
+ case ETH_SPEED_NUM_100M:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_100M;
+ break;
+ case ETH_SPEED_NUM_1G:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_1G;
+ break;
+ case ETH_SPEED_NUM_10G:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10G;
+ break;
+ case ETH_SPEED_NUM_40G:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_40G;
+ break;
+ case 0:
+ /* speed 0 adds no speed flag and is not reported as unknown */
+ break;
+ default:
+ clib_warning ("unknown link speed %d", xd->link.link_speed);
+ break;
+ }
+ }
+ /* publish the rebuilt flag word only when something actually changed */
+ if (hw_flags_chg)
+ {
+ if (LINK_STATE_ELOGS)
+ {
+ vlib_main_t *vm = vlib_get_main ();
+
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format =
+ "update-link-state: sw_if_index %d, new flags %d",.format_args
+ = "i4i4",};
+
+ struct
+ {
+ u32 sw_if_index;
+ u32 flags;
+ } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->sw_if_index = xd->vlib_sw_if_index;
+ ed->flags = hw_flags;
+ }
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, hw_flags);
+ }
+}
+
+/*
+ * Body of the "dpdk-process" node.
+ *
+ * One-time work: run dpdk_lib_init, switch dpdk-input to polling where
+ * devices were found, set tm->worker_thread_release, take an initial
+ * link-state reading for every device and finish bonded-interface setup.
+ * Afterwards it loops forever, waking on an event or timeout to refresh
+ * per-device counters and link state. The function never returns in
+ * practice; the trailing "return 0" is unreachable.
+ */
+static uword
+dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+ clib_error_t *error;
+ vnet_main_t *vnm = vnet_get_main ();
+ dpdk_main_t *dm = &dpdk_main;
+ ethernet_main_t *em = &ethernet_main;
+ dpdk_device_t *xd;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ int i;
+
+ error = dpdk_lib_init (dm);
+
+ /*
+ * Turn on the input node if we found some devices to drive
+ * and we're not running worker threads or i/o threads
+ */
+
+ if (error == 0 && vec_len (dm->devices) > 0)
+ {
+ if (tm->n_vlib_mains == 1)
+ vlib_node_set_state (vm, dpdk_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ else
+ /* multi-threaded: enable polling only on threads that own devices */
+ for (i = 0; i < tm->n_vlib_mains; i++)
+ if (vec_len (dm->devices_by_cpu[i]) > 0)
+ vlib_node_set_state (vlib_mains[i], dpdk_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ }
+
+ /* an init failure is reported but the process keeps running */
+ if (error)
+ clib_error_report (error);
+
+ /* presumably lets worker threads waiting on this flag proceed - confirm */
+ tm->worker_thread_release = 1;
+
+ f64 now = vlib_time_now (vm);
+ vec_foreach (xd, dm->devices)
+ {
+ dpdk_update_link_state (xd, now);
+ }
+
+ {
+ /*
+ * Extra set up for bond interfaces:
+ * 1. Setup MACs for bond interfaces and their slave links which was set
+ * in dpdk_port_setup() but needs to be done again here to take effect.
+ * 2. Set up info for bond interface related CLI support.
+ */
+ int nports = rte_eth_dev_count ();
+ if (nports > 0)
+ {
+ for (i = 0; i < nports; i++)
+ {
+ struct rte_eth_dev_info dev_info;
+ rte_eth_dev_info_get (i, &dev_info);
+ /*
+ * Fall back to the PCI driver name when the PMD did not supply one.
+ * NOTE(review): dev_info.pci_dev is dereferenced without a NULL
+ * check - confirm non-PCI devices always provide driver_name.
+ */
+ if (!dev_info.driver_name)
+#if RTE_VERSION < RTE_VERSION_NUM(16, 11, 0, 0)
+ dev_info.driver_name = dev_info.pci_dev->driver->name;
+#else
+ dev_info.driver_name = dev_info.pci_dev->driver->driver.name;
+#endif
+ ASSERT (dev_info.driver_name);
+ if (strncmp (dev_info.driver_name, "rte_bond_pmd", 12) == 0)
+ {
+ u8 addr[6];
+ u8 slink[16];
+ int nlink = rte_eth_bond_slaves_get (i, slink, 16);
+ if (nlink > 0)
+ {
+ vnet_hw_interface_t *bhi;
+ ethernet_interface_t *bei;
+ int rv;
+
+ /* Get MAC of 1st slave link */
+ rte_eth_macaddr_get (slink[0],
+ (struct ether_addr *) addr);
+ /* Set MAC of bounded interface to that of 1st slave link */
+ rv =
+ rte_eth_bond_mac_address_set (i,
+ (struct ether_addr *)
+ addr);
+ if (rv < 0)
+ clib_warning ("Failed to set MAC address");
+
+ /* Populate MAC of bonded interface in VPP hw tables */
+ /* NOTE(review): dm->devices is indexed by DPDK port id here and
+ * for the slaves below - assumes device index == port id. */
+ bhi =
+ vnet_get_hw_interface (vnm,
+ dm->devices[i].vlib_hw_if_index);
+ bei =
+ pool_elt_at_index (em->interfaces, bhi->hw_instance);
+ clib_memcpy (bhi->hw_address, addr, 6);
+ clib_memcpy (bei->address, addr, 6);
+ /* Init l3 packet size allowed on bonded interface */
+ bhi->max_packet_bytes = ETHERNET_MAX_PACKET_BYTES;
+ bhi->max_l3_packet_bytes[VLIB_RX] =
+ bhi->max_l3_packet_bytes[VLIB_TX] =
+ ETHERNET_MAX_PACKET_BYTES - sizeof (ethernet_header_t);
+ /* slaves are visited from last to first; nlink ends at 0 on slink[0] */
+ while (nlink >= 1)
+ { /* for all slave links */
+ int slave = slink[--nlink];
+ dpdk_device_t *sdev = &dm->devices[slave];
+ vnet_hw_interface_t *shi;
+ vnet_sw_interface_t *ssi;
+ /* Add MAC to all slave links except the first one */
+ if (nlink)
+ rte_eth_dev_mac_addr_add (slave,
+ (struct ether_addr *)
+ addr, 0);
+ /* Set slaves bitmap for bonded interface */
+ bhi->bond_info =
+ clib_bitmap_set (bhi->bond_info,
+ sdev->vlib_hw_if_index, 1);
+ /* Set slave link flags on slave interface */
+ shi =
+ vnet_get_hw_interface (vnm, sdev->vlib_hw_if_index);
+ ssi =
+ vnet_get_sw_interface (vnm, sdev->vlib_sw_if_index);
+ shi->bond_info = VNET_HW_INTERFACE_BOND_INFO_SLAVE;
+ ssi->flags |= VNET_SW_INTERFACE_FLAG_BOND_SLAVE;
+
+ /* Set l3 packet size allowed as the lowest of slave */
+ if (bhi->max_l3_packet_bytes[VLIB_RX] >
+ shi->max_l3_packet_bytes[VLIB_RX])
+ bhi->max_l3_packet_bytes[VLIB_RX] =
+ bhi->max_l3_packet_bytes[VLIB_TX] =
+ shi->max_l3_packet_bytes[VLIB_RX];
+
+ /* Set max packet size allowed as the lowest of slave */
+ if (bhi->max_packet_bytes > shi->max_packet_bytes)
+ bhi->max_packet_bytes = shi->max_packet_bytes;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /* steady state: periodic counter and link-state polling */
+ while (1)
+ {
+ /*
+ * check each time through the loop in case intervals are changed
+ */
+ f64 min_wait = dm->link_state_poll_interval < dm->stat_poll_interval ?
+ dm->link_state_poll_interval : dm->stat_poll_interval;
+
+ vlib_process_wait_for_event_or_clock (vm, min_wait);
+
+ if (dm->admin_up_down_in_progress)
+ /* skip the poll if an admin up down is in progress (on any interface) */
+ continue;
+
+ vec_foreach (xd, dm->devices)
+ {
+ /* each device is refreshed only once its own interval has elapsed */
+ f64 now = vlib_time_now (vm);
+ if ((now - xd->time_last_stats_update) >= dm->stat_poll_interval)
+ dpdk_update_counters (xd, now);
+ if ((now - xd->time_last_link_update) >= dm->link_state_poll_interval)
+ dpdk_update_link_state (xd, now);
+
+ }
+ }
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+/*
+ * Registers dpdk_process as a file-local process-type node named
+ * "dpdk-process". process_log2_n_stack_bytes = 17 presumably requests a
+ * 2^17-byte stack - confirm against the vlib node documentation.
+ */
+VLIB_REGISTER_NODE (dpdk_process_node,static) = {
+ .function = dpdk_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "dpdk-process",
+ .process_log2_n_stack_bytes = 17,
+};
+/* *INDENT-ON* */
+
+/*
+ * Set the statistics polling interval used by the dpdk process loop.
+ *
+ * Returns 0 on success, or VNET_API_ERROR_INVALID_VALUE (leaving the
+ * current setting untouched) when interval is below
+ * DPDK_MIN_STATS_POLL_INTERVAL.
+ */
+int
+dpdk_set_stat_poll_interval (f64 interval)
+{
+  int rv = VNET_API_ERROR_INVALID_VALUE;
+
+  /* reject values below the minimum; anything else is stored */
+  if (!(interval < DPDK_MIN_STATS_POLL_INTERVAL))
+    {
+      dpdk_main.stat_poll_interval = interval;
+      rv = 0;
+    }
+
+  return rv;
+}
+
+/*
+ * Set the link-state polling interval used by the dpdk process loop.
+ *
+ * Returns 0 on success, or VNET_API_ERROR_INVALID_VALUE (leaving the
+ * current setting untouched) when interval is below
+ * DPDK_MIN_LINK_POLL_INTERVAL.
+ */
+int
+dpdk_set_link_state_poll_interval (f64 interval)
+{
+  int rv = VNET_API_ERROR_INVALID_VALUE;
+
+  /* reject values below the minimum; anything else is stored */
+  if (!(interval < DPDK_MIN_LINK_POLL_INTERVAL))
+    {
+      dpdk_main.link_state_poll_interval = interval;
+      rv = 0;
+    }
+
+  return rv;
+}
+
+/*
+ * vlib init function for the DPDK device layer.
+ *
+ * Fills in the static parts of dpdk_main (main pointers, configuration
+ * defaults, lookup hashes, buffer flag template, poll intervals) and then
+ * runs dpdk_cli_init. Returns an error when the "ethernet-input" node
+ * cannot be found or when dpdk_cli_init fails, 0 otherwise.
+ */
+clib_error_t *
+dpdk_init (vlib_main_t * vm)
+{
+  dpdk_main_t *dm = &dpdk_main;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  vlib_node_t *eth_input;
+
+  /* verify that structs are cacheline aligned */
+  STATIC_ASSERT (offsetof (dpdk_device_t, cacheline0) == 0,
+                 "Cache line marker must be 1st element in dpdk_device_t");
+  STATIC_ASSERT (offsetof (dpdk_device_t, cacheline1) ==
+                 CLIB_CACHE_LINE_BYTES,
+                 "Data in cache line 0 is bigger than cache line size");
+  STATIC_ASSERT (offsetof (dpdk_worker_t, cacheline0) == 0,
+                 "Cache line marker must be 1st element in dpdk_worker_t");
+  STATIC_ASSERT (offsetof (frame_queue_trace_t, cacheline0) == 0,
+                 "Cache line marker must be 1st element in frame_queue_trace_t");
+
+  dm->vlib_main = vm;
+  dm->vnet_main = vnet_get_main ();
+  dm->conf = &dpdk_config_main;
+
+  eth_input = vlib_get_node_by_name (vm, (u8 *) "ethernet-input");
+  if (!eth_input)
+    return clib_error_return (0, "ethernet-input node AWOL");
+  dm->ethernet_input_node_index = eth_input->index;
+
+  /* configuration defaults; num_mbufs is only defaulted when still zero */
+  dm->conf->nchannels = 4;
+  if (dm->conf->num_mbufs == 0)
+    dm->conf->num_mbufs = NB_MBUF;
+  vec_add1 (dm->conf->eal_init_args, (u8 *) "vnet");
+
+  dm->dpdk_device_by_kni_port_id = hash_create (0, sizeof (uword));
+  dm->vu_sw_if_index_by_listener_fd = hash_create (0, sizeof (uword));
+  dm->vu_sw_if_index_by_sock_fd = hash_create (0, sizeof (uword));
+
+  /* $$$ use n_thread_stacks since it's known-good at this point */
+  vec_validate (dm->recycle, tm->n_thread_stacks - 1);
+
+  /* Default vlib_buffer_t flags, DISABLES tcp/udp checksumming... */
+  dm->buffer_flags_template = VLIB_BUFFER_TOTAL_LENGTH_VALID
+    | VNET_BUFFER_RTE_MBUF_VALID
+    | IP_BUFFER_L4_CHECKSUM_COMPUTED | IP_BUFFER_L4_CHECKSUM_CORRECT;
+
+  dm->stat_poll_interval = DPDK_STATS_POLL_INTERVAL;
+  dm->link_state_poll_interval = DPDK_LINK_POLL_INTERVAL;
+
+  /* init CLI; its result (0 or an error) is this function's result */
+  return vlib_call_init_function (vm, dpdk_cli_init);
+}
+
+VLIB_INIT_FUNCTION (dpdk_init);
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/ipsec/cli.c b/src/vnet/devices/dpdk/ipsec/cli.c
new file mode 100644
index 00000000000..3b634e036da
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/cli.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/devices/dpdk/ipsec/ipsec.h>
+
+/*
+ * Print the worker <-> crypto device mapping on the CLI.
+ *
+ * vm             - vlib main used for CLI output.
+ * detail_display - 0: one line per worker listing the crypto devices it
+ *                  uses, tagged HW or SW; non-zero: one line per
+ *                  (cipher, auth, direction) entry of the worker's
+ *                  algo_qp_map with the device and queue pair serving it.
+ *
+ * When worker threads exist, the first vlib main (i == 0) is skipped.
+ */
+static void
+dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display)
+{
+  dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  u32 i, skip_master;
+
+  if (detail_display)
+    vlib_cli_output (vm, "worker\t%10s\t%15s\tdir\tdev\tqp\n",
+                     "cipher", "auth");
+  else
+    vlib_cli_output (vm, "worker\tcrypto device id(type)\n");
+
+  skip_master = vlib_num_workers () > 0;
+
+  for (i = 0; i < tm->n_vlib_mains; i++)
+    {
+      uword key, data;
+      u32 cpu_index = vlib_mains[i]->cpu_index;
+      crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index];
+      u8 *s = 0;
+
+      if (skip_master)
+        {
+          skip_master = 0;
+          continue;
+        }
+
+      if (!detail_display)
+        {
+          i32 last_cdev = -1;
+          crypto_qp_data_t *qpd;
+
+          s = format (s, "%u\t", cpu_index);
+
+          /* *INDENT-OFF* */
+          vec_foreach (qpd, cwm->qp_data)
+            {
+              u32 dev_id = qpd->dev_id;
+
+              /* print each device once per run of consecutive queue pairs */
+              if ((u16) last_cdev != dev_id)
+                {
+                  struct rte_cryptodev_info cdev_info;
+
+                  rte_cryptodev_info_get (dev_id, &cdev_info);
+
+                  s = format(s, "%u(%s)\t", dev_id, cdev_info.feature_flags &
+                             RTE_CRYPTODEV_FF_HW_ACCELERATED ? "HW" : "SW");
+                }
+              last_cdev = dev_id;
+            }
+          /* *INDENT-ON* */
+
+          /*
+           * s is a vppinfra vector without a trailing NUL, so it must be
+           * printed with %v (length-delimited) rather than %s, and it has
+           * to be released once it has been emitted.
+           */
+          vlib_cli_output (vm, "%v", s);
+          vec_free (s);
+        }
+      else
+        {
+          char cipher_str[15], auth_str[15];
+          struct rte_cryptodev_capabilities cap;
+          /* the hash key is reinterpreted in place as a packed qp key */
+          crypto_worker_qp_key_t *p_key = (crypto_worker_qp_key_t *) & key;
+          /* *INDENT-OFF* */
+          hash_foreach (key, data, cwm->algo_qp_map,
+          ({
+            cap.op = RTE_CRYPTO_OP_TYPE_SYMMETRIC;
+            cap.sym.xform_type = RTE_CRYPTO_SYM_XFORM_CIPHER;
+            cap.sym.cipher.algo = p_key->cipher_algo;
+            check_algo_is_supported (&cap, cipher_str);
+            cap.op = RTE_CRYPTO_OP_TYPE_SYMMETRIC;
+            cap.sym.xform_type = RTE_CRYPTO_SYM_XFORM_AUTH;
+            cap.sym.auth.algo = p_key->auth_algo;
+            check_algo_is_supported (&cap, auth_str);
+            vlib_cli_output (vm, "%u\t%10s\t%15s\t%3s\t%u\t%u\n",
+                             vlib_mains[i]->cpu_index, cipher_str, auth_str,
+                             p_key->is_outbound ? "out" : "in",
+                             cwm->qp_data[data].dev_id,
+                             cwm->qp_data[data].qp_id);
+          }));
+          /* *INDENT-ON* */
+        }
+    }
+}
+
+/*
+ * CLI handler for "show crypto device mapping".
+ *
+ * Accepts the optional keyword "verbose" to select the detailed listing.
+ * Returns 0 on success (also when no line input could be obtained, in
+ * which case nothing is printed) or a parse error for any other token.
+ * The line input is released on every path once it has been acquired.
+ */
+static clib_error_t *
+lcore_cryptodev_map_fn (vlib_main_t * vm, unformat_input_t * input,
+                        vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  clib_error_t *error = 0;
+  u16 detail = 0;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "verbose"))
+        detail = 1;
+      else
+        {
+          /* build the message while line_input is still valid, then clean up */
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          goto done;
+        }
+    }
+
+  dpdk_ipsec_show_mapping (vm, detail);
+
+done:
+  unformat_free (line_input);
+
+  return error;
+}
+
+/* *INDENT-OFF* */
+/*
+ * CLI registration for the crypto device mapping display. The help text
+ * repeats the registered path and marks "verbose" as the optional keyword
+ * that lcore_cryptodev_map_fn parses.
+ */
+VLIB_CLI_COMMAND (lcore_cryptodev_map, static) = {
+    .path = "show crypto device mapping",
+    .short_help =
+    "show crypto device mapping [verbose]",
+    .function = lcore_cryptodev_map_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/ipsec/crypto_node.c b/src/vnet/devices/dpdk/ipsec/crypto_node.c
new file mode 100644
index 00000000000..7b32704ec05
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/crypto_node.c
@@ -0,0 +1,210 @@
+/*
+ *------------------------------------------------------------------
+ * crypto_node.c - DPDK Cryptodev input node
+ *
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ipsec/ipsec.h>
+
+#include <vnet/devices/dpdk/ipsec/ipsec.h>
+
+/*
+ * X-macro list of the next nodes of dpdk-crypto-input:
+ * _(enum suffix, graph node name).
+ */
+#define foreach_dpdk_crypto_input_next \
+ _(DROP, "error-drop") \
+ _(ENCRYPT_POST, "dpdk-esp-encrypt-post") \
+ _(DECRYPT_POST, "dpdk-esp-decrypt-post")
+
+/* Next-node indices generated from the list above; N_NEXT is the count. */
+typedef enum
+{
+#define _(f,s) DPDK_CRYPTO_INPUT_NEXT_##f,
+ foreach_dpdk_crypto_input_next
+#undef _
+ DPDK_CRYPTO_INPUT_N_NEXT,
+} dpdk_crypto_input_next_t;
+
+/*
+ * X-macro list of the node counters: _(enum suffix, display string).
+ */
+#define foreach_dpdk_crypto_input_error \
+ _(DQ_COPS, "Crypto ops dequeued") \
+ _(COP_FAILED, "Crypto op failed")
+
+/* Counter indices generated from the list above; N_ERROR is the count. */
+typedef enum
+{
+#define _(f,s) DPDK_CRYPTO_INPUT_ERROR_##f,
+ foreach_dpdk_crypto_input_error
+#undef _
+ DPDK_CRYPTO_INPUT_N_ERROR,
+} dpdk_crypto_input_error_t;
+
+/* Counter display strings, in the same order as dpdk_crypto_input_error_t. */
+static char *dpdk_crypto_input_error_strings[] = {
+#define _(n, s) s,
+ foreach_dpdk_crypto_input_error
+#undef _
+};
+
+vlib_node_registration_t dpdk_crypto_input_node;
+
+/* Per-packet trace record written by dpdk_crypto_dequeue. */
+typedef struct
+{
+ /* crypto device id of the queue pair the op was dequeued from */
+ u32 cdev;
+ /* queue pair id on that device */
+ u32 qp;
+ /* value of the crypto op's status field when the record was written */
+ u32 status;
+ /* ipsec.sad_index taken from the buffer's vnet metadata */
+ u32 sa_idx;
+ /* next node index chosen for the packet */
+ u32 next_index;
+} dpdk_crypto_input_trace_t;
+
+/*
+ * Trace formatter for dpdk-crypto-input.
+ *
+ * args carries (vlib_main_t *, vlib_node_t *, dpdk_crypto_input_trace_t *);
+ * only the trace record is used. Appends the formatted text to s and
+ * returns the resulting vector.
+ */
+static u8 *
+format_dpdk_crypto_input_trace (u8 * s, va_list * args)
+{
+  dpdk_crypto_input_trace_t *trace;
+
+  /* the first two arguments are unused but must still be consumed */
+  (void) va_arg (*args, vlib_main_t *);
+  (void) va_arg (*args, vlib_node_t *);
+  trace = va_arg (*args, dpdk_crypto_input_trace_t *);
+
+  s = format (s, "dpdk_crypto: cryptodev-id %u queue-pair %u next-index %d",
+              trace->cdev, trace->qp, trace->next_index);
+
+  return format (s, " status %u sa-idx %u\n", trace->status, trace->sa_idx);
+}
+
+/*
+ * Dequeue completed crypto ops from one queue pair and hand the
+ * corresponding buffers to the next node.
+ *
+ * vm, node - vlib main and runtime of dpdk-crypto-input.
+ * qpd      - queue pair to service; returns immediately when it has no
+ *            ops in flight.
+ *
+ * Buffers go to the encrypt-post or decrypt-post node depending on
+ * qpd->is_outbound; ops whose status is not SUCCESS are sent to the drop
+ * node and counted as COP_FAILED. All dequeued ops are released with
+ * crypto_free_cop. Returns the number of ops dequeued.
+ */
+static_always_inline u32
+dpdk_crypto_dequeue (vlib_main_t * vm, vlib_node_runtime_t * node,
+                     crypto_qp_data_t * qpd)
+{
+  u32 n_deq, *to_next = 0, next_index, n_cops, def_next_index;
+  struct rte_crypto_op **cops = qpd->cops;
+
+  if (qpd->inflights == 0)
+    return 0;
+
+  if (qpd->is_outbound)
+    def_next_index = DPDK_CRYPTO_INPUT_NEXT_ENCRYPT_POST;
+  else
+    def_next_index = DPDK_CRYPTO_INPUT_NEXT_DECRYPT_POST;
+
+  n_cops = rte_cryptodev_dequeue_burst (qpd->dev_id, qpd->qp_id,
+                                        cops, VLIB_FRAME_SIZE);
+  n_deq = n_cops;
+  next_index = def_next_index;
+
+  qpd->inflights -= n_cops;
+  ASSERT (qpd->inflights >= 0);
+
+  while (n_cops > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+      while (n_cops > 0 && n_left_to_next > 0)
+        {
+          u32 bi0, next0;
+          vlib_buffer_t *b0 = 0;
+          struct rte_crypto_op *cop;
+          struct rte_crypto_sym_op *sym_cop;
+          enum rte_crypto_op_status status0;
+
+          cop = cops[0];
+          cops += 1;
+          n_cops -= 1;
+          n_left_to_next -= 1;
+
+          next0 = def_next_index;
+
+          /*
+           * Remember the completion status before the op is reset below,
+           * so that the trace record reports what the device returned
+           * instead of always showing NOT_PROCESSED.
+           */
+          status0 = cop->status;
+
+          if (PREDICT_FALSE (status0 != RTE_CRYPTO_OP_STATUS_SUCCESS))
+            {
+              next0 = DPDK_CRYPTO_INPUT_NEXT_DROP;
+              vlib_node_increment_counter (vm, dpdk_crypto_input_node.index,
+                                           DPDK_CRYPTO_INPUT_ERROR_COP_FAILED,
+                                           1);
+            }
+          cop->status = RTE_CRYPTO_OP_STATUS_NOT_PROCESSED;
+
+          /* the symmetric op is read from directly behind the generic op */
+          sym_cop = (struct rte_crypto_sym_op *) (cop + 1);
+          b0 = vlib_buffer_from_rte_mbuf (sym_cop->m_src);
+          bi0 = vlib_get_buffer_index (vm, b0);
+
+          to_next[0] = bi0;
+          to_next += 1;
+
+          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              vlib_trace_next_frame (vm, node, next0);
+              dpdk_crypto_input_trace_t *tr =
+                vlib_add_trace (vm, node, b0, sizeof (*tr));
+              tr->cdev = qpd->dev_id;
+              tr->qp = qpd->qp_id;
+              tr->status = status0;
+              tr->next_index = next0;
+              tr->sa_idx = vnet_buffer (b0)->ipsec.sad_index;
+            }
+
+          vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+                                           n_left_to_next, bi0, next0);
+        }
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+
+  /* qpd->cops still points at the start of the burst; cops was advanced */
+  crypto_free_cop (qpd, qpd->cops, n_deq);
+
+  vlib_node_increment_counter (vm, dpdk_crypto_input_node.index,
+                               DPDK_CRYPTO_INPUT_ERROR_DQ_COPS, n_deq);
+  return n_deq;
+}
+
+/*
+ * Node function of dpdk-crypto-input: services every crypto queue pair
+ * belonging to the calling thread and returns the total number of crypto
+ * ops dequeued. The frame argument is not used.
+ */
+static uword
+dpdk_crypto_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+                      vlib_frame_t * frame)
+{
+  u32 this_cpu = os_get_cpu_number ();
+  crypto_worker_main_t *worker = &dpdk_crypto_main.workers_main[this_cpu];
+  crypto_qp_data_t *qp;
+  u32 total = 0;
+
+  /* *INDENT-OFF* */
+  vec_foreach (qp, worker->qp_data)
+    {
+      total += dpdk_crypto_dequeue (vm, node, qp);
+    }
+  /* *INDENT-ON* */
+
+  return total;
+}
+
+/*
+ * Registration of the "dpdk-crypto-input" input node. It starts in the
+ * disabled state; its next nodes are generated from
+ * foreach_dpdk_crypto_input_next.
+ */
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (dpdk_crypto_input_node) = {
+  .function = dpdk_crypto_input_fn,
+  .name = "dpdk-crypto-input",
+  .format_trace = format_dpdk_crypto_input_trace,
+  .type = VLIB_NODE_TYPE_INPUT,
+  .state = VLIB_NODE_STATE_DISABLED,
+  .n_errors = DPDK_CRYPTO_INPUT_N_ERROR,
+  .error_strings = dpdk_crypto_input_error_strings,
+  .n_next_nodes = DPDK_CRYPTO_INPUT_N_NEXT,
+  .next_nodes = {
+#define _(s,n) [DPDK_CRYPTO_INPUT_NEXT_##s] = n,
+    foreach_dpdk_crypto_input_next
+#undef _
+  },
+};
+/* *INDENT-ON* */
+
+/* Only when DPDK_CRYPTO is 1: apply VLIB_NODE_FUNCTION_MULTIARCH to the
+ * node/function pair (presumably to build per-architecture variants of
+ * the node function - confirm against the vlib node headers). */
+#if DPDK_CRYPTO==1
+VLIB_NODE_FUNCTION_MULTIARCH (dpdk_crypto_input_node, dpdk_crypto_input_fn)
+#endif
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/ipsec/dir.dox b/src/vnet/devices/dpdk/ipsec/dir.dox
new file mode 100644
index 00000000000..ffebfc4d62e
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/dir.dox
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ @dir vnet/devices/dpdk/ipsec
+ @brief IPSec ESP encrypt/decrypt using DPDK Cryptodev API
+*/
diff --git a/src/vnet/devices/dpdk/ipsec/dpdk_crypto_ipsec_doc.md b/src/vnet/devices/dpdk/ipsec/dpdk_crypto_ipsec_doc.md
new file mode 100644
index 00000000000..8089696f4a0
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/dpdk_crypto_ipsec_doc.md
@@ -0,0 +1,73 @@
+# VPP IPSec implementation using DPDK Cryptodev API {#dpdk_crypto_ipsec_doc}
+
+This document is meant to contain all related information about implementation and usability.
+
+
+## VPP IPsec with DPDK Cryptodev
+
+DPDK Cryptodev is an asynchronous crypto API that supports both Hardware and Software implementations (for more details refer to [DPDK Cryptography Device Library documentation](http://dpdk.org/doc/guides/prog_guide/cryptodev_lib.html)).
+
+When DPDK Cryptodev support is enabled, the node graph is modified by adding and replacing some of the nodes.
+
+The following nodes are replaced:
+* esp-encrypt -> dpdk-esp-encrypt
+* esp-decrypt -> dpdk-esp-decrypt
+
+The following nodes are added:
+* dpdk-crypto-input : polling input node, basically dequeuing from crypto devices.
+* dpdk-esp-encrypt-post : internal node.
+* dpdk-esp-decrypt-post : internal node.
+
+
+### How to enable VPP IPSec with DPDK Cryptodev support
+
+To enable DPDK Cryptodev support (disabled by default), we need the following env option:
+
+ vpp_uses_dpdk_cryptodev=yes
+
+A couple of ways to achieve this:
+* uncomment/add it in the platforms config (ie. build-data/platforms/vpp.mk)
+* set the option when building vpp (ie. make vpp_uses_dpdk_cryptodev=yes build-release)
+
+
+### Crypto Resources allocation
+
+VPP allocates crypto resources based on a best effort approach:
+* first allocate Hardware crypto resources, then Software.
+* if there are not enough crypto resources for all workers, all packets will be dropped if they reach ESP encrypt/decrypt nodes, displaying the warning:
+
+ 0: dpdk_ipsec_init: not enough cryptodevs for ipsec
+
+
+### Configuration example
+
+No special IPsec configuration is required.
+
+Once DPDK Cryptodev is enabled, the user just needs to provide cryptodevs in the startup.conf.
+
+Example startup.conf:
+
+```
+dpdk {
+ socket-mem 1024,1024
+ num-mbufs 131072
+ dev 0000:81:00.0
+ dev 0000:81:00.1
+ dev 0000:85:01.0
+ dev 0000:85:01.1
+ vdev cryptodev_aesni_mb_pmd,socket_id=1
+ vdev cryptodev_aesni_mb_pmd,socket_id=1
+}
+```
+
+In the above configuration:
+* 0000:85:01.0 and 0000:85:01.1 are crypto BDFs and they require the same driver binding as DPDK Ethernet devices but they do not support any extra configuration options.
+* Two AESNI-MB Software Cryptodev PMDs are created in NUMA node 1.
+
+For further details refer to [DPDK Crypto Device Driver documentation](http://dpdk.org/doc/guides/cryptodevs/index.html)
+
+### Operational data
+
+The following CLI command displays the Cryptodev/Worker mapping:
+
+ show crypto device mapping [verbose]
diff --git a/src/vnet/devices/dpdk/ipsec/esp.h b/src/vnet/devices/dpdk/ipsec/esp.h
new file mode 100644
index 00000000000..7ef90c49816
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/esp.h
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __DPDK_ESP_H__
+#define __DPDK_ESP_H__
+
+#include <vnet/devices/dpdk/ipsec/ipsec.h>
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/ipsec/esp.h>
+
+/* DPDK parameters of one IPsec cipher algorithm. */
+typedef struct
+{
+ /* DPDK cipher algorithm identifier */
+ enum rte_crypto_cipher_algorithm algo;
+ /* key length; dpdk_esp_init uses byte counts (16 for AES-CBC-128) */
+ u8 key_len;
+ /* IV length; dpdk_esp_init uses 16 for AES-CBC and 8 for AES-GCM */
+ u8 iv_len;
+} dpdk_esp_crypto_alg_t;
+
+/* DPDK parameters of one IPsec integrity algorithm. */
+typedef struct
+{
+ /* DPDK authentication algorithm identifier */
+ enum rte_crypto_auth_algorithm algo;
+ /* truncated digest size; dpdk_esp_init uses byte counts (12 for SHA1-96) */
+ u8 trunc_size;
+} dpdk_esp_integ_alg_t;
+
+/* Lookup tables filled by dpdk_esp_init, indexed by the IPSEC_CRYPTO_ALG_*
+ * and IPSEC_INTEG_ALG_* values respectively. */
+typedef struct
+{
+ dpdk_esp_crypto_alg_t *esp_crypto_algs;
+ dpdk_esp_integ_alg_t *esp_integ_algs;
+} dpdk_esp_main_t;
+
+/* NOTE(review): this defines (not just declares) the object in a header,
+ * so every translation unit including it emits its own tentative
+ * definition - consider "extern" here plus one definition in a .c file. */
+dpdk_esp_main_t dpdk_esp_main;
+
+/*
+ * Populate the cipher and integrity lookup tables in dpdk_esp_main.
+ *
+ * Both vectors are sized to hold every IPSEC_CRYPTO_ALG_* and
+ * IPSEC_INTEG_ALG_* index; only the algorithms listed below get an
+ * entry written.
+ */
+static_always_inline void
+dpdk_esp_init ()
+{
+  dpdk_esp_main_t *em = &dpdk_esp_main;
+  dpdk_esp_crypto_alg_t *ciphers;
+  dpdk_esp_integ_alg_t *integs;
+
+  /* cipher table: DPDK algorithm, key length, IV length */
+  vec_validate (em->esp_crypto_algs, IPSEC_CRYPTO_N_ALG - 1);
+  ciphers = em->esp_crypto_algs;
+
+  ciphers[IPSEC_CRYPTO_ALG_AES_CBC_128] = (dpdk_esp_crypto_alg_t) {
+    .algo = RTE_CRYPTO_CIPHER_AES_CBC, .key_len = 16, .iv_len = 16
+  };
+  ciphers[IPSEC_CRYPTO_ALG_AES_CBC_192] = (dpdk_esp_crypto_alg_t) {
+    .algo = RTE_CRYPTO_CIPHER_AES_CBC, .key_len = 24, .iv_len = 16
+  };
+  ciphers[IPSEC_CRYPTO_ALG_AES_CBC_256] = (dpdk_esp_crypto_alg_t) {
+    .algo = RTE_CRYPTO_CIPHER_AES_CBC, .key_len = 32, .iv_len = 16
+  };
+  ciphers[IPSEC_CRYPTO_ALG_AES_GCM_128] = (dpdk_esp_crypto_alg_t) {
+    .algo = RTE_CRYPTO_CIPHER_AES_GCM, .key_len = 16, .iv_len = 8
+  };
+
+  /* integrity table: DPDK algorithm, truncated digest size */
+  vec_validate (em->esp_integ_algs, IPSEC_INTEG_N_ALG - 1);
+  integs = em->esp_integ_algs;
+
+  integs[IPSEC_INTEG_ALG_SHA1_96] = (dpdk_esp_integ_alg_t) {
+    .algo = RTE_CRYPTO_AUTH_SHA1_HMAC, .trunc_size = 12
+  };
+  integs[IPSEC_INTEG_ALG_SHA_256_96] = (dpdk_esp_integ_alg_t) {
+    .algo = RTE_CRYPTO_AUTH_SHA256_HMAC, .trunc_size = 12
+  };
+  integs[IPSEC_INTEG_ALG_SHA_256_128] = (dpdk_esp_integ_alg_t) {
+    .algo = RTE_CRYPTO_AUTH_SHA256_HMAC, .trunc_size = 16
+  };
+  integs[IPSEC_INTEG_ALG_SHA_384_192] = (dpdk_esp_integ_alg_t) {
+    .algo = RTE_CRYPTO_AUTH_SHA384_HMAC, .trunc_size = 24
+  };
+  integs[IPSEC_INTEG_ALG_SHA_512_256] = (dpdk_esp_integ_alg_t) {
+    .algo = RTE_CRYPTO_AUTH_SHA512_HMAC, .trunc_size = 32
+  };
+  integs[IPSEC_INTEG_ALG_AES_GCM_128] = (dpdk_esp_integ_alg_t) {
+    .algo = RTE_CRYPTO_AUTH_AES_GCM, .trunc_size = 16
+  };
+}
+
+/*
+ * Add or delete the per-worker crypto session slots of an SA.
+ *
+ * sa_index - SA index; used on delete to locate the slot in each pool.
+ * is_add   - non-zero: allocate one zeroed slot per worker and direction;
+ *            zero: free the DPDK session held in the slot (if any) and
+ *            clear the slot.
+ *
+ * When worker threads exist the first workers_main entry is skipped.
+ * Returns 0 on success, -1 as soon as a session cannot be freed.
+ *
+ * NOTE(review): on add, sa_index is not used - this relies on the pool
+ * handing out the index that equals sa_index; slots are never returned to
+ * the pool on delete. Confirm against the callers.
+ */
+static_always_inline int
+add_del_sa_sess (u32 sa_index, u8 is_add)
+{
+  dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+  crypto_worker_main_t *cwm;
+  u8 skip_master = vlib_num_workers () > 0;
+
+  /* *INDENT-OFF* */
+  vec_foreach (cwm, dcm->workers_main)
+    {
+      crypto_sa_session_t *sa_sess;
+      u8 is_outbound;
+
+      if (skip_master)
+        {
+          skip_master = 0;
+          continue;
+        }
+
+      for (is_outbound = 0; is_outbound < 2; is_outbound++)
+        {
+          if (is_add)
+            {
+              pool_get (cwm->sa_sess_d[is_outbound], sa_sess);
+              /* pool_get does not initialise the element; start with an
+               * empty slot so a stale session pointer is never seen */
+              memset (sa_sess, 0, sizeof (sa_sess[0]));
+            }
+          else
+            {
+              u8 dev_id;
+
+              sa_sess = pool_elt_at_index (cwm->sa_sess_d[is_outbound], sa_index);
+
+              /* nothing to release, and qp_index is meaningless here */
+              if (!sa_sess->sess)
+                continue;
+
+              /* look the device up only once a session is known to exist,
+               * so qp_data is never indexed with an unset qp_index */
+              dev_id = cwm->qp_data[sa_sess->qp_index].dev_id;
+
+              if (rte_cryptodev_sym_session_free(dev_id, sa_sess->sess))
+                {
+                  clib_warning("failed to free session");
+                  return -1;
+                }
+              memset(sa_sess, 0, sizeof(sa_sess[0]));
+            }
+        }
+    }
+  /* *INDENT-ON* */
+
+  return 0;
+}
+
+/*
+ * Map an IPsec cipher algorithm onto a DPDK cipher transform.
+ *
+ * On success the transform's cipher.algo and type fields are written and
+ * 0 is returned. For an algorithm without a mapping, -1 is returned and
+ * cipher_xform is left untouched.
+ */
+static_always_inline int
+translate_crypto_algo(ipsec_crypto_alg_t crypto_algo,
+                      struct rte_crypto_sym_xform *cipher_xform)
+{
+  enum rte_crypto_cipher_algorithm dpdk_algo;
+
+  if (crypto_algo == IPSEC_CRYPTO_ALG_NONE)
+    dpdk_algo = RTE_CRYPTO_CIPHER_NULL;
+  else if (crypto_algo == IPSEC_CRYPTO_ALG_AES_CBC_128 ||
+           crypto_algo == IPSEC_CRYPTO_ALG_AES_CBC_192 ||
+           crypto_algo == IPSEC_CRYPTO_ALG_AES_CBC_256)
+    /* all AES-CBC key sizes share one DPDK algorithm id */
+    dpdk_algo = RTE_CRYPTO_CIPHER_AES_CBC;
+  else if (crypto_algo == IPSEC_CRYPTO_ALG_AES_GCM_128)
+    dpdk_algo = RTE_CRYPTO_CIPHER_AES_GCM;
+  else
+    return -1;
+
+  cipher_xform->cipher.algo = dpdk_algo;
+  cipher_xform->type = RTE_CRYPTO_SYM_XFORM_CIPHER;
+
+  return 0;
+}
+
+/**
+ * Fill in the auth algorithm, digest length and transform type of a DPDK
+ * auth xform from a VPP IPsec integrity algorithm identifier.
+ *
+ * @param integ_alg   VPP integrity algorithm.
+ * @param auth_xform  transform to update; left untouched on failure.
+ * @param use_esn     non-zero when the SA uses extended sequence numbers;
+ *                    only affects the AAD length chosen for AES-GCM.
+ * @return 0 on success, -1 when integ_alg has no mapping here.
+ */
+static_always_inline int
+translate_integ_algo(ipsec_integ_alg_t integ_alg,
+                     struct rte_crypto_sym_xform *auth_xform, int use_esn)
+{
+  u32 digest_len;
+
+  switch (integ_alg)
+    {
+    case IPSEC_INTEG_ALG_NONE:
+      auth_xform->auth.algo = RTE_CRYPTO_AUTH_NULL;
+      digest_len = 0;
+      break;
+
+    case IPSEC_INTEG_ALG_SHA1_96:
+      auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA1_HMAC;
+      digest_len = 12;
+      break;
+
+    case IPSEC_INTEG_ALG_SHA_256_96:
+      auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA256_HMAC;
+      digest_len = 12;
+      break;
+
+    case IPSEC_INTEG_ALG_SHA_256_128:
+      auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA256_HMAC;
+      digest_len = 16;
+      break;
+
+    case IPSEC_INTEG_ALG_SHA_384_192:
+      auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA384_HMAC;
+      digest_len = 24;
+      break;
+
+    case IPSEC_INTEG_ALG_SHA_512_256:
+      auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA512_HMAC;
+      digest_len = 32;
+      break;
+
+    case IPSEC_INTEG_ALG_AES_GCM_128:
+      auth_xform->auth.algo = RTE_CRYPTO_AUTH_AES_GCM;
+      digest_len = 16;
+      /* 8 bytes of AAD, or 12 when extended sequence numbers are on. */
+      if (use_esn)
+        auth_xform->auth.add_auth_data_length = 12;
+      else
+        auth_xform->auth.add_auth_data_length = 8;
+      break;
+
+    default:
+      return -1;
+    }
+
+  auth_xform->auth.digest_length = digest_len;
+  auth_xform->type = RTE_CRYPTO_SYM_XFORM_AUTH;
+
+  return 0;
+}
+
+/**
+ * Create the DPDK symmetric crypto session for one SA and direction on the
+ * calling thread.
+ *
+ * Builds a cipher + auth transform chain from the SA (cipher first for
+ * outbound, auth first for inbound), looks up the queue pair registered for
+ * that algorithm/direction combination in the worker's algo_qp_map and
+ * creates the session on that queue pair's device.
+ *
+ * For AES-GCM-128 the last 4 bytes of sa->crypto_key are copied into
+ * sa->salt and excluded from the key handed to DPDK; for every other cipher
+ * sa->salt is set from rand().
+ *
+ * This function is called once per worker and direction, so it must not
+ * change sa->crypto_key_len: shortening it on every call would hand later
+ * sessions a truncated key and a salt read from the wrong offset.
+ *
+ * @param sa           security association (sa->salt is written).
+ * @param sa_sess      session slot to fill (sess and qp_index).
+ * @param is_outbound  non-zero for encrypt/generate, zero for decrypt/verify.
+ * @return 0 on success; -1 if the key is too short to hold the GCM salt, an
+ *         algorithm cannot be translated, no queue pair is mapped, or DPDK
+ *         fails to create the session.
+ */
+static_always_inline int
+create_sym_sess(ipsec_sa_t *sa, crypto_sa_session_t *sa_sess, u8 is_outbound)
+{
+  u32 cpu_index = os_get_cpu_number ();
+  dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+  crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index];
+  struct rte_crypto_sym_xform cipher_xform = {0};
+  struct rte_crypto_sym_xform auth_xform = {0};
+  struct rte_crypto_sym_xform *xfs;
+  uword key = 0, *data;
+  crypto_worker_qp_key_t *p_key = (crypto_worker_qp_key_t *) &key;
+  u32 cipher_key_len = sa->crypto_key_len;
+
+  if (sa->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128)
+    {
+      /* The configured key carries a 4-byte salt after the cipher key. */
+      if (cipher_key_len < 4)
+        return -1;
+
+      cipher_key_len -= 4;
+      clib_memcpy (&sa->salt, &sa->crypto_key[cipher_key_len], 4);
+    }
+  else
+    {
+      sa->salt = (u32) rand ();
+    }
+
+  cipher_xform.type = RTE_CRYPTO_SYM_XFORM_CIPHER;
+  cipher_xform.cipher.key.data = sa->crypto_key;
+  cipher_xform.cipher.key.length = cipher_key_len;
+
+  auth_xform.type = RTE_CRYPTO_SYM_XFORM_AUTH;
+  auth_xform.auth.key.data = sa->integ_key;
+  auth_xform.auth.key.length = sa->integ_key_len;
+
+  if (translate_crypto_algo (sa->crypto_alg, &cipher_xform) < 0)
+    return -1;
+  p_key->cipher_algo = cipher_xform.cipher.algo;
+
+  if (translate_integ_algo (sa->integ_alg, &auth_xform, sa->use_esn) < 0)
+    return -1;
+  p_key->auth_algo = auth_xform.auth.algo;
+
+  if (is_outbound)
+    {
+      /* Encrypt, then generate the digest. */
+      cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_ENCRYPT;
+      auth_xform.auth.op = RTE_CRYPTO_AUTH_OP_GENERATE;
+      cipher_xform.next = &auth_xform;
+      xfs = &cipher_xform;
+    }
+  else
+    {
+      /* Verify the digest, then decrypt. */
+      cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_DECRYPT;
+      auth_xform.auth.op = RTE_CRYPTO_AUTH_OP_VERIFY;
+      auth_xform.next = &cipher_xform;
+      xfs = &auth_xform;
+    }
+
+  p_key->is_outbound = is_outbound;
+
+  /* The packed (cipher, auth, direction) key selects the queue pair. */
+  data = hash_get (cwm->algo_qp_map, key);
+  if (!data)
+    return -1;
+
+  sa_sess->sess =
+    rte_cryptodev_sym_session_create (cwm->qp_data[*data].dev_id, xfs);
+
+  if (!sa_sess->sess)
+    return -1;
+
+  sa_sess->qp_index = (u8) *data;
+
+  return 0;
+}
+
+#endif /* __DPDK_ESP_H__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/ipsec/esp_decrypt.c b/src/vnet/devices/dpdk/ipsec/esp_decrypt.c
new file mode 100644
index 00000000000..89ab9f9bc43
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/esp_decrypt.c
@@ -0,0 +1,583 @@
+/*
+ * esp_decrypt.c : IPSec ESP Decrypt node using DPDK Cryptodev
+ *
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/devices/dpdk/ipsec/ipsec.h>
+#include <vnet/devices/dpdk/ipsec/esp.h>
+
+/*
+ * Next nodes of the ESP decrypt nodes as (enum suffix, node name) pairs.
+ * The order of the entries defines the values of esp_decrypt_next_t.
+ */
+#define foreach_esp_decrypt_next \
+_(DROP, "error-drop") \
+_(IP4_INPUT, "ip4-input") \
+_(IP6_INPUT, "ip6-input")
+
+/* Expand the list above into ESP_DECRYPT_NEXT_<suffix> enumerators. */
+#define _(v, s) ESP_DECRYPT_NEXT_##v,
+typedef enum {
+ foreach_esp_decrypt_next
+#undef _
+ ESP_DECRYPT_N_NEXT,
+} esp_decrypt_next_t;
+
+/*
+ * Error counters of the decrypt node as (enum suffix, description) pairs.
+ * The same list generates both esp_decrypt_error_t and
+ * esp_decrypt_error_strings, so the two always stay index-aligned.
+ */
+#define foreach_esp_decrypt_error \
+ _(RX_PKTS, "ESP pkts received") \
+ _(DECRYPTION_FAILED, "ESP decryption failed") \
+ _(REPLAY, "SA replayed packet") \
+ _(NOT_IP, "Not IP packet (dropped)") \
+ _(ENQ_FAIL, "Enqueue failed (buffer full)") \
+ _(NO_CRYPTODEV, "Cryptodev not configured") \
+ _(BAD_LEN, "Invalid ciphertext length") \
+ _(UNSUPPORTED, "Cipher/Auth not supported")
+
+
+/* Counter indices: ESP_DECRYPT_ERROR_<suffix>. */
+typedef enum {
+#define _(sym,str) ESP_DECRYPT_ERROR_##sym,
+ foreach_esp_decrypt_error
+#undef _
+ ESP_DECRYPT_N_ERROR,
+} esp_decrypt_error_t;
+
+/* Counter descriptions, indexed by esp_decrypt_error_t. */
+static char * esp_decrypt_error_strings[] = {
+#define _(sym,string) string,
+ foreach_esp_decrypt_error
+#undef _
+};
+
+/* Declared here so the node function can reference its index. */
+vlib_node_registration_t dpdk_esp_decrypt_node;
+
+/* Per-packet trace record: the algorithms of the SA the packet matched. */
+typedef struct {
+ ipsec_crypto_alg_t crypto_alg;
+ ipsec_integ_alg_t integ_alg;
+} esp_decrypt_trace_t;
+
+/*
+ * Trace formatter of the decrypt node: appends the crypto and integrity
+ * algorithm of an esp_decrypt_trace_t record to s and returns the result.
+ * The vlib_main_t and vlib_node_t arguments are consumed but not used.
+ */
+static u8 * format_esp_decrypt_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  esp_decrypt_trace_t *rec = va_arg (*args, esp_decrypt_trace_t *);
+
+  return format (s, "esp: crypto %U integrity %U",
+                 format_ipsec_crypto_alg, rec->crypto_alg,
+                 format_ipsec_integ_alg, rec->integ_alg);
+}
+
+/**
+ * ESP decrypt node function.
+ *
+ * For every buffer of the frame (the buffer is expected to start at the ESP
+ * header): runs the anti-replay check, makes sure the SA has a DPDK session
+ * on this thread, takes a free crypto op for the session's queue pair and
+ * fills it in (cipher range, IV, AAD / auth range, digest location).  All
+ * prepared ops are then submitted per queue pair with
+ * rte_cryptodev_enqueue_burst(); ops that could not be enqueued are
+ * recycled and their buffers freed.
+ *
+ * Buffers that fail a check (replay, unsupported algorithm combination,
+ * session creation failure, bad payload length) are placed in the
+ * "error-drop" next frame instead of being submitted.
+ *
+ * If no crypto worker data exists at all, the whole frame is freed and
+ * counted as NO_CRYPTODEV.
+ *
+ * @return number of buffers in the input frame.
+ */
+static uword
+dpdk_esp_decrypt_node_fn (vlib_main_t * vm,
+                          vlib_node_runtime_t * node,
+                          vlib_frame_t * from_frame)
+{
+  u32 n_left_from, *from, *to_next, next_index;
+  ipsec_main_t *im = &ipsec_main;
+  u32 cpu_index = os_get_cpu_number ();
+  dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+  dpdk_esp_main_t *em = &dpdk_esp_main;
+  u32 i;
+
+  from = vlib_frame_vector_args (from_frame);
+  n_left_from = from_frame->n_vectors;
+
+  if (PREDICT_FALSE (!dcm->workers_main))
+    {
+      vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+                                   ESP_DECRYPT_ERROR_NO_CRYPTODEV,
+                                   n_left_from);
+      vlib_buffer_free (vm, from, n_left_from);
+      return n_left_from;
+    }
+
+  crypto_worker_main_t *cwm = vec_elt_at_index (dcm->workers_main, cpu_index);
+  u32 n_qps = vec_len (cwm->qp_data);
+  struct rte_crypto_op **cops_to_enq[n_qps];
+  u32 n_cop_qp[n_qps], *bi_to_enq[n_qps];
+
+  /* Per queue pair write cursors into the op and buffer-index arrays. */
+  for (i = 0; i < n_qps; i++)
+    {
+      bi_to_enq[i] = cwm->qp_data[i].bi;
+      cops_to_enq[i] = cwm->qp_data[i].cops;
+    }
+
+  memset (n_cop_qp, 0, n_qps * sizeof (u32));
+
+  crypto_alloc_cops ();
+
+  /* Only rejected packets are forwarded by this node. */
+  next_index = ESP_DECRYPT_NEXT_DROP;
+
+  while (n_left_from > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+        {
+          u32 bi0, sa_index0 = ~0, seq, icv_size, iv_size;
+          vlib_buffer_t *b0;
+          esp_header_t *esp0;
+          ipsec_sa_t *sa0;
+          struct rte_mbuf *mb0 = 0;
+          const int BLOCK_SIZE = 16;
+          crypto_sa_session_t *sa_sess;
+          void *sess;
+          u16 qp_index;
+          struct rte_crypto_op *cop = 0;
+
+          bi0 = from[0];
+          from += 1;
+          n_left_from -= 1;
+
+          b0 = vlib_get_buffer (vm, bi0);
+          esp0 = vlib_buffer_get_current (b0);
+
+          sa_index0 = vnet_buffer (b0)->ipsec.sad_index;
+          sa0 = pool_elt_at_index (im->sad, sa_index0);
+
+          seq = clib_host_to_net_u32 (esp0->seq);
+
+          /* anti-replay check */
+          if (sa0->use_anti_replay)
+            {
+              int rv = 0;
+
+              if (PREDICT_TRUE (sa0->use_esn))
+                rv = esp_replay_check_esn (sa0, seq);
+              else
+                rv = esp_replay_check (sa0, seq);
+
+              if (PREDICT_FALSE (rv))
+                {
+                  clib_warning ("anti-replay SPI %u seq %u", sa0->spi, seq);
+                  vlib_node_increment_counter (vm,
+                                               dpdk_esp_decrypt_node.index,
+                                               ESP_DECRYPT_ERROR_REPLAY, 1);
+                  to_next[0] = bi0;
+                  to_next += 1;
+                  n_left_to_next -= 1;
+                  goto trace;
+                }
+            }
+
+          if (PREDICT_FALSE (sa0->integ_alg == IPSEC_INTEG_ALG_NONE) ||
+              PREDICT_FALSE (sa0->crypto_alg == IPSEC_CRYPTO_ALG_NONE))
+            {
+              clib_warning ("SPI %u : only cipher + auth supported",
+                            sa0->spi);
+              vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+                                           ESP_DECRYPT_ERROR_UNSUPPORTED, 1);
+              to_next[0] = bi0;
+              to_next += 1;
+              n_left_to_next -= 1;
+              goto trace;
+            }
+
+          /* Index 0 of sa_sess_d holds the inbound sessions. */
+          sa_sess = pool_elt_at_index (cwm->sa_sess_d[0], sa_index0);
+
+          if (PREDICT_FALSE (!sa_sess->sess))
+            {
+              int ret = create_sym_sess (sa0, sa_sess, 0);
+
+              /* An ASSERT is compiled out of release images; drop the
+               * packet instead of attaching a NULL session to the op. */
+              if (PREDICT_FALSE (ret))
+                {
+                  vlib_node_increment_counter (vm,
+                                               dpdk_esp_decrypt_node.index,
+                                               ESP_DECRYPT_ERROR_UNSUPPORTED,
+                                               1);
+                  to_next[0] = bi0;
+                  to_next += 1;
+                  n_left_to_next -= 1;
+                  goto trace;
+                }
+            }
+
+          sess = sa_sess->sess;
+          qp_index = sa_sess->qp_index;
+
+          ASSERT (vec_len (vec_elt (cwm->qp_data, qp_index).free_cops) > 0);
+          cop = vec_pop (vec_elt (cwm->qp_data, qp_index).free_cops);
+          ASSERT (cop->status == RTE_CRYPTO_OP_STATUS_NOT_PROCESSED);
+
+          cops_to_enq[qp_index][0] = cop;
+          cops_to_enq[qp_index] += 1;
+          n_cop_qp[qp_index] += 1;
+          bi_to_enq[qp_index][0] = bi0;
+          bi_to_enq[qp_index] += 1;
+
+          rte_crypto_op_attach_sym_session (cop, sess);
+
+          icv_size = em->esp_integ_algs[sa0->integ_alg].trunc_size;
+          iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len;
+
+          /* Convert vlib buffer to mbuf */
+          mb0 = rte_mbuf_from_vlib_buffer (b0);
+          mb0->data_len = b0->current_length;
+          mb0->pkt_len = b0->current_length;
+          mb0->data_off = RTE_PKTMBUF_HEADROOM + b0->current_data;
+
+          /* Outer IP header has already been stripped.  The length is
+           * computed as a signed value: a packet shorter than
+           * ESP header + IV + ICV must come out negative rather than wrap
+           * to a large length that could pass the block-size test. */
+          i32 payload_len = (i32) rte_pktmbuf_pkt_len (mb0) -
+            (i32) (sizeof (esp_header_t) + iv_size + icv_size);
+
+          if ((payload_len <= 0) || (payload_len & (BLOCK_SIZE - 1)))
+            {
+              clib_warning ("payload %d not multiple of %d\n",
+                            payload_len, BLOCK_SIZE);
+              vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+                                           ESP_DECRYPT_ERROR_BAD_LEN, 1);
+              /* Give the op back and rewind the enqueue cursors. */
+              vec_add (vec_elt (cwm->qp_data, qp_index).free_cops, &cop, 1);
+              bi_to_enq[qp_index] -= 1;
+              cops_to_enq[qp_index] -= 1;
+              n_cop_qp[qp_index] -= 1;
+              to_next[0] = bi0;
+              to_next += 1;
+              n_left_to_next -= 1;
+              goto trace;
+            }
+
+          /* The symmetric op and the private area follow the generic op. */
+          struct rte_crypto_sym_op *sym_cop =
+            (struct rte_crypto_sym_op *) (cop + 1);
+
+          sym_cop->m_src = mb0;
+          sym_cop->cipher.data.offset = sizeof (esp_header_t) + iv_size;
+          sym_cop->cipher.data.length = payload_len;
+
+          u8 *iv = rte_pktmbuf_mtod_offset (mb0, void *,
+                                            sizeof (esp_header_t));
+          dpdk_cop_priv_t *priv = (dpdk_cop_priv_t *) (sym_cop + 1);
+
+          if (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128)
+            {
+              /* Counter block: SA salt, 8-byte IV from the packet, 1. */
+              dpdk_gcm_cnt_blk *icb = &priv->cb;
+              icb->salt = sa0->salt;
+              clib_memcpy (icb->iv, iv, 8);
+              icb->cnt = clib_host_to_net_u32 (1);
+              sym_cop->cipher.iv.data = (u8 *) icb;
+              sym_cop->cipher.iv.phys_addr = cop->phys_addr +
+                (uintptr_t) icb - (uintptr_t) cop;
+              sym_cop->cipher.iv.length = 16;
+
+              /* AAD: the 8-byte ESP header, plus seq_hi with ESN. */
+              u8 *aad = priv->aad;
+              clib_memcpy (aad, iv - sizeof (esp_header_t), 8);
+              sym_cop->auth.aad.data = aad;
+              sym_cop->auth.aad.phys_addr = cop->phys_addr +
+                (uintptr_t) aad - (uintptr_t) cop;
+              if (sa0->use_esn)
+                {
+                  *((u32 *) & aad[8]) = sa0->seq_hi;
+                  sym_cop->auth.aad.length = 12;
+                }
+              else
+                {
+                  sym_cop->auth.aad.length = 8;
+                }
+
+              sym_cop->auth.digest.data =
+                rte_pktmbuf_mtod_offset (mb0, void *,
+                                         rte_pktmbuf_pkt_len (mb0) -
+                                         icv_size);
+              sym_cop->auth.digest.phys_addr =
+                rte_pktmbuf_mtophys_offset (mb0,
+                                            rte_pktmbuf_pkt_len (mb0) -
+                                            icv_size);
+              sym_cop->auth.digest.length = icv_size;
+            }
+          else
+            {
+              sym_cop->cipher.iv.data =
+                rte_pktmbuf_mtod_offset (mb0, void *,
+                                         sizeof (esp_header_t));
+              sym_cop->cipher.iv.phys_addr =
+                rte_pktmbuf_mtophys_offset (mb0, sizeof (esp_header_t));
+              sym_cop->cipher.iv.length = iv_size;
+
+              if (sa0->use_esn)
+                {
+                  u8 *payload_end =
+                    rte_pktmbuf_mtod_offset (mb0, u8 *,
+                                             sizeof (esp_header_t) +
+                                             iv_size + payload_len);
+
+                  /* Save the received ICV in the op's private area, then
+                   * put seq_hi in its place so it is covered by the
+                   * authenticated range. */
+                  clib_memcpy (priv->icv, payload_end, icv_size);
+                  *((u32 *) payload_end) = sa0->seq_hi;
+                  sym_cop->auth.data.offset = 0;
+                  sym_cop->auth.data.length = sizeof (esp_header_t) +
+                    iv_size + payload_len + sizeof (sa0->seq_hi);
+                  sym_cop->auth.digest.data = priv->icv;
+                  sym_cop->auth.digest.phys_addr = cop->phys_addr +
+                    (uintptr_t) priv->icv - (uintptr_t) cop;
+                  sym_cop->auth.digest.length = icv_size;
+                }
+              else
+                {
+                  sym_cop->auth.data.offset = 0;
+                  sym_cop->auth.data.length = sizeof (esp_header_t) +
+                    iv_size + payload_len;
+
+                  sym_cop->auth.digest.data =
+                    rte_pktmbuf_mtod_offset (mb0, void *,
+                                             rte_pktmbuf_pkt_len (mb0) -
+                                             icv_size);
+                  sym_cop->auth.digest.phys_addr =
+                    rte_pktmbuf_mtophys_offset (mb0,
+                                                rte_pktmbuf_pkt_len (mb0) -
+                                                icv_size);
+                  sym_cop->auth.digest.length = icv_size;
+                }
+            }
+
+        trace:
+          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              esp_decrypt_trace_t *tr =
+                vlib_add_trace (vm, node, b0, sizeof (*tr));
+              tr->crypto_alg = sa0->crypto_alg;
+              tr->integ_alg = sa0->integ_alg;
+            }
+        }
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+  vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+                               ESP_DECRYPT_ERROR_RX_PKTS,
+                               from_frame->n_vectors);
+  crypto_qp_data_t *qpd;
+  /* *INDENT-OFF* */
+  vec_foreach_index (i, cwm->qp_data)
+    {
+      u32 enq;
+
+      qpd = vec_elt_at_index (cwm->qp_data, i);
+      enq = rte_cryptodev_enqueue_burst (qpd->dev_id, qpd->qp_id,
+                                         qpd->cops, n_cop_qp[i]);
+      qpd->inflights += enq;
+
+      /* Anything the device did not accept is recycled and dropped. */
+      if (PREDICT_FALSE (enq < n_cop_qp[i]))
+        {
+          crypto_free_cop (qpd, &qpd->cops[enq], n_cop_qp[i] - enq);
+          vlib_buffer_free (vm, &qpd->bi[enq], n_cop_qp[i] - enq);
+
+          vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+                                       ESP_DECRYPT_ERROR_ENQ_FAIL,
+                                       n_cop_qp[i] - enq);
+        }
+    }
+  /* *INDENT-ON* */
+
+  return from_frame->n_vectors;
+}
+
+/*
+ * Registration of the "dpdk-esp-decrypt" node.  The next-node table is
+ * generated from foreach_esp_decrypt_next so that it matches
+ * esp_decrypt_next_t entry for entry.
+ */
+VLIB_REGISTER_NODE (dpdk_esp_decrypt_node) = {
+ .function = dpdk_esp_decrypt_node_fn,
+ .name = "dpdk-esp-decrypt",
+ .vector_size = sizeof (u32),
+ .format_trace = format_esp_decrypt_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(esp_decrypt_error_strings),
+ .error_strings = esp_decrypt_error_strings,
+
+ .n_next_nodes = ESP_DECRYPT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [ESP_DECRYPT_NEXT_##s] = n,
+ foreach_esp_decrypt_next
+#undef _
+ },
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (dpdk_esp_decrypt_node, dpdk_esp_decrypt_node_fn)
+
+/*
+ * Decrypt Post Node
+ */
+
+/*
+ * Error counters owned by the post node as (enum suffix, description)
+ * pairs; the list generates both the enum and the string table below.
+ */
+#define foreach_esp_decrypt_post_error \
+ _(PKTS, "ESP post pkts")
+
+/* Counter indices: ESP_DECRYPT_POST_ERROR_<suffix>. */
+typedef enum {
+#define _(sym,str) ESP_DECRYPT_POST_ERROR_##sym,
+ foreach_esp_decrypt_post_error
+#undef _
+ ESP_DECRYPT_POST_N_ERROR,
+} esp_decrypt_post_error_t;
+
+/* Counter descriptions, indexed by esp_decrypt_post_error_t. */
+static char * esp_decrypt_post_error_strings[] = {
+#define _(sym,string) string,
+ foreach_esp_decrypt_post_error
+#undef _
+};
+
+/* Declared here so the node function can reference its index. */
+vlib_node_registration_t dpdk_esp_decrypt_post_node;
+
+/*
+ * Trace formatter of the post node: prints nothing and returns s unchanged;
+ * args is not consumed.
+ */
+static u8 * format_esp_decrypt_post_trace (u8 * s, va_list * args)
+{
+ return s;
+}
+
+/**
+ * ESP decrypt post-processing node function.
+ *
+ * For every buffer (expected to start at the ESP header): advances the
+ * anti-replay window of the SA, strips the ESP header, IV, ICV, footer and
+ * padding, and chooses the next node:
+ *  - tunnel mode: "ip4-input" or "ip6-input" according to the ESP footer's
+ *    next_header field;
+ *  - transport mode: the IP header found at b0->data + sizeof
+ *    (ethernet_header_t) is moved in front of the payload and fixed up
+ *    (protocol, length, and for IPv4 the checksum).
+ * Buffers that cannot be classified keep next0 == ESP_DECRYPT_NEXT_DROP.
+ *
+ * NOT_IP and DECRYPTION_FAILED are counted against dpdk_esp_decrypt_node,
+ * whose error table defines those counters.
+ *
+ * @return number of buffers in the input frame.
+ */
+static uword
+dpdk_esp_decrypt_post_node_fn (vlib_main_t * vm,
+                               vlib_node_runtime_t * node,
+                               vlib_frame_t * from_frame)
+{
+  u32 n_left_from, *from, *to_next = 0, next_index;
+  ipsec_sa_t *sa0;
+  u32 sa_index0 = ~0;
+  ipsec_main_t *im = &ipsec_main;
+  dpdk_esp_main_t *em = &dpdk_esp_main;
+
+  from = vlib_frame_vector_args (from_frame);
+  n_left_from = from_frame->n_vectors;
+
+  next_index = node->cached_next_index;
+
+  while (n_left_from > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+        {
+          esp_footer_t *f0;
+          u32 bi0, next0, icv_size, iv_size;
+          vlib_buffer_t *b0 = 0;
+          ip4_header_t *ih4 = 0, *oh4 = 0;
+          ip6_header_t *ih6 = 0, *oh6 = 0;
+          u8 tunnel_mode = 1;
+          u8 transport_ip6 = 0;
+
+          next0 = ESP_DECRYPT_NEXT_DROP;
+
+          bi0 = from[0];
+          from += 1;
+          n_left_from -= 1;
+          n_left_to_next -= 1;
+
+          b0 = vlib_get_buffer (vm, bi0);
+
+          sa_index0 = vnet_buffer (b0)->ipsec.sad_index;
+          sa0 = pool_elt_at_index (im->sad, sa_index0);
+
+          /* Speculatively enqueue to next_index; corrected at the end. */
+          to_next[0] = bi0;
+          to_next += 1;
+
+          icv_size = em->esp_integ_algs[sa0->integ_alg].trunc_size;
+          iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len;
+
+          if (sa0->use_anti_replay)
+            {
+              esp_header_t *esp0 = vlib_buffer_get_current (b0);
+              u32 seq;
+              seq = clib_host_to_net_u32 (esp0->seq);
+              if (PREDICT_TRUE (sa0->use_esn))
+                esp_replay_advance_esn (sa0, seq);
+              else
+                esp_replay_advance (sa0, seq);
+            }
+
+          /* Must be taken before the buffer is advanced/overwritten. */
+          ih4 = (ip4_header_t *) (b0->data + sizeof (ethernet_header_t));
+          vlib_buffer_advance (b0, sizeof (esp_header_t) + iv_size);
+
+          /* Drop the ICV and the 2-byte ESP footer from the length. */
+          b0->current_length -= (icv_size + 2);
+          /* Reset the buffer flags but keep the trace bit: clearing it
+           * would make the trace block below unreachable and stop the
+           * packet from being traced by the following nodes. */
+          b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID |
+            (b0->flags & VLIB_BUFFER_IS_TRACED);
+          f0 = (esp_footer_t *) ((u8 *) vlib_buffer_get_current (b0) +
+                                 b0->current_length);
+          b0->current_length -= f0->pad_length;
+
+          /* transport mode */
+          if (PREDICT_FALSE (!sa0->is_tunnel && !sa0->is_tunnel_ip6))
+            {
+              tunnel_mode = 0;
+
+              if (PREDICT_TRUE
+                  ((ih4->ip_version_and_header_length & 0xF0) != 0x40))
+                {
+                  if (PREDICT_TRUE
+                      ((ih4->ip_version_and_header_length & 0xF0) == 0x60))
+                    transport_ip6 = 1;
+                  else
+                    {
+                      clib_warning ("next header: 0x%x", f0->next_header);
+                      vlib_node_increment_counter (vm,
+                                                   dpdk_esp_decrypt_node.index,
+                                                   ESP_DECRYPT_ERROR_NOT_IP,
+                                                   1);
+                      goto trace;
+                    }
+                }
+            }
+
+          if (PREDICT_TRUE (tunnel_mode))
+            {
+              if (PREDICT_TRUE (f0->next_header == IP_PROTOCOL_IP_IN_IP))
+                next0 = ESP_DECRYPT_NEXT_IP4_INPUT;
+              else if (f0->next_header == IP_PROTOCOL_IPV6)
+                next0 = ESP_DECRYPT_NEXT_IP6_INPUT;
+              else
+                {
+                  clib_warning ("next header: 0x%x", f0->next_header);
+                  vlib_node_increment_counter (vm,
+                                               dpdk_esp_decrypt_node.index,
+                                               ESP_DECRYPT_ERROR_DECRYPTION_FAILED,
+                                               1);
+                  goto trace;
+                }
+            }
+          /* transport mode */
+          else
+            {
+              if (PREDICT_FALSE (transport_ip6))
+                {
+                  ih6 =
+                    (ip6_header_t *) (b0->data + sizeof (ethernet_header_t));
+                  vlib_buffer_advance (b0, -sizeof (ip6_header_t));
+                  oh6 = vlib_buffer_get_current (b0);
+                  /* Source and destination may overlap. */
+                  memmove (oh6, ih6, sizeof (ip6_header_t));
+
+                  next0 = ESP_DECRYPT_NEXT_IP6_INPUT;
+                  oh6->protocol = f0->next_header;
+                  oh6->payload_length =
+                    clib_host_to_net_u16 (vlib_buffer_length_in_chain
+                                          (vm, b0) - sizeof (ip6_header_t));
+                }
+              else
+                {
+                  vlib_buffer_advance (b0, -sizeof (ip4_header_t));
+                  oh4 = vlib_buffer_get_current (b0);
+                  /* Source and destination may overlap. */
+                  memmove (oh4, ih4, sizeof (ip4_header_t));
+
+                  next0 = ESP_DECRYPT_NEXT_IP4_INPUT;
+                  oh4->ip_version_and_header_length = 0x45;
+                  oh4->fragment_id = 0;
+                  oh4->flags_and_fragment_offset = 0;
+                  oh4->protocol = f0->next_header;
+                  oh4->length =
+                    clib_host_to_net_u16 (vlib_buffer_length_in_chain
+                                          (vm, b0));
+                  oh4->checksum = ip4_header_checksum (oh4);
+                }
+            }
+
+          vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+
+        trace:
+          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              esp_decrypt_trace_t *tr =
+                vlib_add_trace (vm, node, b0, sizeof (*tr));
+              tr->crypto_alg = sa0->crypto_alg;
+              tr->integ_alg = sa0->integ_alg;
+            }
+
+          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+                                           to_next, n_left_to_next, bi0,
+                                           next0);
+        }
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+  vlib_node_increment_counter (vm, dpdk_esp_decrypt_post_node.index,
+                               ESP_DECRYPT_POST_ERROR_PKTS,
+                               from_frame->n_vectors);
+
+  return from_frame->n_vectors;
+}
+
+/*
+ * Registration of the "dpdk-esp-decrypt-post" node.  It shares the
+ * next-node list (foreach_esp_decrypt_next) with the decrypt node but has
+ * its own error table.
+ */
+VLIB_REGISTER_NODE (dpdk_esp_decrypt_post_node) = {
+ .function = dpdk_esp_decrypt_post_node_fn,
+ .name = "dpdk-esp-decrypt-post",
+ .vector_size = sizeof (u32),
+ .format_trace = format_esp_decrypt_post_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(esp_decrypt_post_error_strings),
+ .error_strings = esp_decrypt_post_error_strings,
+
+ .n_next_nodes = ESP_DECRYPT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [ESP_DECRYPT_NEXT_##s] = n,
+ foreach_esp_decrypt_next
+#undef _
+ },
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (dpdk_esp_decrypt_post_node, dpdk_esp_decrypt_post_node_fn)
diff --git a/src/vnet/devices/dpdk/ipsec/esp_encrypt.c b/src/vnet/devices/dpdk/ipsec/esp_encrypt.c
new file mode 100644
index 00000000000..10bb4616eef
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/esp_encrypt.c
@@ -0,0 +1,598 @@
+/*
+ * esp_encrypt.c : IPSec ESP encrypt node using DPDK Cryptodev
+ *
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/devices/dpdk/ipsec/ipsec.h>
+#include <vnet/devices/dpdk/ipsec/esp.h>
+
+/*
+ * Next nodes of the ESP encrypt path as (enum suffix, node name) pairs.
+ * The order of the entries defines the values of esp_encrypt_next_t.
+ */
+#define foreach_esp_encrypt_next \
+_(DROP, "error-drop") \
+_(IP4_LOOKUP, "ip4-lookup") \
+_(IP6_LOOKUP, "ip6-lookup") \
+_(INTERFACE_OUTPUT, "interface-output")
+
+/* Expand the list above into ESP_ENCRYPT_NEXT_<suffix> enumerators. */
+#define _(v, s) ESP_ENCRYPT_NEXT_##v,
+typedef enum
+{
+ foreach_esp_encrypt_next
+#undef _
+ ESP_ENCRYPT_N_NEXT,
+} esp_encrypt_next_t;
+
+/*
+ * Error counters of the encrypt node as (enum suffix, description) pairs.
+ * The same list generates both esp_encrypt_error_t and
+ * esp_encrypt_error_strings, so the two always stay index-aligned.
+ */
+#define foreach_esp_encrypt_error \
+ _(RX_PKTS, "ESP pkts received") \
+ _(SEQ_CYCLED, "sequence number cycled") \
+ _(ENQ_FAIL, "Enqueue failed (buffer full)") \
+ _(NO_CRYPTODEV, "Cryptodev not configured") \
+ _(UNSUPPORTED, "Cipher/Auth not supported")
+
+
+/* Counter indices: ESP_ENCRYPT_ERROR_<suffix>. */
+typedef enum
+{
+#define _(sym,str) ESP_ENCRYPT_ERROR_##sym,
+ foreach_esp_encrypt_error
+#undef _
+ ESP_ENCRYPT_N_ERROR,
+} esp_encrypt_error_t;
+
+/* Counter descriptions, indexed by esp_encrypt_error_t. */
+static char *esp_encrypt_error_strings[] = {
+#define _(sym,string) string,
+ foreach_esp_encrypt_error
+#undef _
+};
+
+/* Declared here so the node function can reference its index. */
+vlib_node_registration_t dpdk_esp_encrypt_node;
+
+/* Per-packet trace record: SPI, sequence number and algorithms of the SA. */
+typedef struct
+{
+ u32 spi;
+ u32 seq;
+ ipsec_crypto_alg_t crypto_alg;
+ ipsec_integ_alg_t integ_alg;
+} esp_encrypt_trace_t;
+
+/*
+ * Trace formatter of the encrypt node: appends SPI, sequence number and
+ * the crypto/integrity algorithm of an esp_encrypt_trace_t record to s and
+ * returns the result.  The vlib_main_t and vlib_node_t arguments are
+ * consumed but not used.
+ */
+static u8 *
+format_esp_encrypt_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  esp_encrypt_trace_t *rec = va_arg (*args, esp_encrypt_trace_t *);
+
+  return format (s, "esp: spi %u seq %u crypto %U integrity %U",
+                 rec->spi, rec->seq,
+                 format_ipsec_crypto_alg, rec->crypto_alg,
+                 format_ipsec_integ_alg, rec->integ_alg);
+}
+
+/**
+ * ESP encrypt node function.
+ *
+ * For every buffer of the frame (expected to start at the inner IP
+ * header): advances the SA sequence number, makes sure the SA has a DPDK
+ * session on this thread, takes a free crypto op for the session's queue
+ * pair, builds the outer IP + ESP header (tunnel mode) or inserts the ESP
+ * header after the existing IP header (transport mode), appends padding
+ * and the ESP footer, reserves room for the ICV and fills in the crypto
+ * op.  All prepared ops are then submitted per queue pair with
+ * rte_cryptodev_enqueue_burst(); ops that could not be enqueued are
+ * recycled and their buffers freed.
+ *
+ * The next node chosen for a packet is stored in
+ * vnet_buffer (b0)->unused[0]; this node itself only forwards rejected
+ * packets, to "error-drop".
+ *
+ * If no crypto worker data exists at all, the whole frame is freed and
+ * counted as NO_CRYPTODEV.
+ *
+ * @return number of buffers in the input frame.
+ */
+static uword
+dpdk_esp_encrypt_node_fn (vlib_main_t * vm,
+                          vlib_node_runtime_t * node,
+                          vlib_frame_t * from_frame)
+{
+  u32 n_left_from, *from, *to_next, next_index;
+  ipsec_main_t *im = &ipsec_main;
+  u32 cpu_index = os_get_cpu_number ();
+  dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+  dpdk_esp_main_t *em = &dpdk_esp_main;
+  u32 i;
+
+  from = vlib_frame_vector_args (from_frame);
+  n_left_from = from_frame->n_vectors;
+
+  if (PREDICT_FALSE (!dcm->workers_main))
+    {
+      /* Likely there are not enough cryptodevs, so drop frame */
+      vlib_node_increment_counter (vm, dpdk_esp_encrypt_node.index,
+                                   ESP_ENCRYPT_ERROR_NO_CRYPTODEV,
+                                   n_left_from);
+      vlib_buffer_free (vm, from, n_left_from);
+      return n_left_from;
+    }
+
+  crypto_worker_main_t *cwm = vec_elt_at_index (dcm->workers_main, cpu_index);
+  u32 n_qps = vec_len (cwm->qp_data);
+  struct rte_crypto_op **cops_to_enq[n_qps];
+  u32 n_cop_qp[n_qps], *bi_to_enq[n_qps];
+
+  /* Per queue pair write cursors into the op and buffer-index arrays. */
+  for (i = 0; i < n_qps; i++)
+    {
+      bi_to_enq[i] = cwm->qp_data[i].bi;
+      cops_to_enq[i] = cwm->qp_data[i].cops;
+    }
+
+  memset (n_cop_qp, 0, n_qps * sizeof (u32));
+
+  crypto_alloc_cops ();
+
+  /* Only rejected packets are forwarded by this node. */
+  next_index = ESP_ENCRYPT_NEXT_DROP;
+
+  while (n_left_from > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+        {
+          u32 bi0, next0;
+          vlib_buffer_t *b0 = 0;
+          u32 sa_index0;
+          ipsec_sa_t *sa0;
+          ip4_and_esp_header_t *ih0, *oh0 = 0;
+          ip6_and_esp_header_t *ih6_0, *oh6_0 = 0;
+          struct rte_mbuf *mb0 = 0;
+          esp_footer_t *f0;
+          u8 is_ipv6;
+          u8 ip_hdr_size;
+          u8 next_hdr_type;
+          u8 transport_mode = 0;
+          const int BLOCK_SIZE = 16;
+          u32 iv_size;
+          u16 orig_sz;
+          crypto_sa_session_t *sa_sess;
+          void *sess;
+          struct rte_crypto_op *cop = 0;
+          u16 qp_index;
+
+          bi0 = from[0];
+          from += 1;
+          n_left_from -= 1;
+
+          b0 = vlib_get_buffer (vm, bi0);
+          sa_index0 = vnet_buffer (b0)->ipsec.sad_index;
+          sa0 = pool_elt_at_index (im->sad, sa_index0);
+
+          if (PREDICT_FALSE (esp_seq_advance (sa0)))
+            {
+              clib_warning ("sequence number counter has cycled SPI %u",
+                            sa0->spi);
+              vlib_node_increment_counter (vm, dpdk_esp_encrypt_node.index,
+                                           ESP_ENCRYPT_ERROR_SEQ_CYCLED, 1);
+              //TODO: rekey SA
+              to_next[0] = bi0;
+              to_next += 1;
+              n_left_to_next -= 1;
+              goto trace;
+            }
+
+          /* Index 1 of sa_sess_d holds the outbound sessions. */
+          sa_sess = pool_elt_at_index (cwm->sa_sess_d[1], sa_index0);
+          if (PREDICT_FALSE (!sa_sess->sess))
+            {
+              int ret = create_sym_sess (sa0, sa_sess, 1);
+
+              /* An ASSERT is compiled out of release images; drop the
+               * packet instead of attaching a NULL session to the op. */
+              if (PREDICT_FALSE (ret))
+                {
+                  vlib_node_increment_counter (vm,
+                                               dpdk_esp_encrypt_node.index,
+                                               ESP_ENCRYPT_ERROR_UNSUPPORTED,
+                                               1);
+                  to_next[0] = bi0;
+                  to_next += 1;
+                  n_left_to_next -= 1;
+                  goto trace;
+                }
+            }
+
+          qp_index = sa_sess->qp_index;
+          sess = sa_sess->sess;
+
+          ASSERT (vec_len (vec_elt (cwm->qp_data, qp_index).free_cops) > 0);
+          cop = vec_pop (vec_elt (cwm->qp_data, qp_index).free_cops);
+          ASSERT (cop->status == RTE_CRYPTO_OP_STATUS_NOT_PROCESSED);
+
+          cops_to_enq[qp_index][0] = cop;
+          cops_to_enq[qp_index] += 1;
+          n_cop_qp[qp_index] += 1;
+          bi_to_enq[qp_index][0] = bi0;
+          bi_to_enq[qp_index] += 1;
+
+          ssize_t adv;
+          iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len;
+          ih0 = vlib_buffer_get_current (b0);
+          orig_sz = b0->current_length;
+          is_ipv6 = (ih0->ip4.ip_version_and_header_length & 0xF0) == 0x60;
+
+          /* Tunnel mode prepends a whole IP + ESP header; transport mode
+           * only makes room for the ESP header, and the existing IP
+           * header does not count as payload to be padded. */
+          if (PREDICT_TRUE (sa0->is_tunnel))
+            {
+              if (PREDICT_TRUE (!is_ipv6))
+                adv = -sizeof (ip4_and_esp_header_t);
+              else
+                adv = -sizeof (ip6_and_esp_header_t);
+            }
+          else
+            {
+              adv = -sizeof (esp_header_t);
+              if (PREDICT_TRUE (!is_ipv6))
+                orig_sz -= sizeof (ip4_header_t);
+              else
+                orig_sz -= sizeof (ip6_header_t);
+            }
+
+          /*transport mode save the eth header before it is overwritten */
+          if (PREDICT_FALSE (!sa0->is_tunnel))
+            {
+              ethernet_header_t *ieh0 = (ethernet_header_t *)
+                ((u8 *) vlib_buffer_get_current (b0) -
+                 sizeof (ethernet_header_t));
+              ethernet_header_t *oeh0 =
+                (ethernet_header_t *) ((u8 *) ieh0 + (adv - iv_size));
+              clib_memcpy (oeh0, ieh0, sizeof (ethernet_header_t));
+            }
+
+          vlib_buffer_advance (b0, adv - iv_size);
+
+          /* XXX IP6/ip4 and IP4/IP6 not supported, only IP4/IP4 and IP6/IP6 */
+
+          /* is ipv6 */
+          if (PREDICT_FALSE (is_ipv6))
+            {
+              ih6_0 = (ip6_and_esp_header_t *) ih0;
+              ip_hdr_size = sizeof (ip6_header_t);
+              oh6_0 = vlib_buffer_get_current (b0);
+
+              if (PREDICT_TRUE (sa0->is_tunnel))
+                {
+                  next_hdr_type = IP_PROTOCOL_IPV6;
+                  oh6_0->ip6.ip_version_traffic_class_and_flow_label =
+                    ih6_0->ip6.ip_version_traffic_class_and_flow_label;
+                }
+              else
+                {
+                  next_hdr_type = ih6_0->ip6.protocol;
+                  memmove (oh6_0, ih6_0, sizeof (ip6_header_t));
+                }
+
+              oh6_0->ip6.protocol = IP_PROTOCOL_IPSEC_ESP;
+              oh6_0->ip6.hop_limit = 254;
+              oh6_0->esp.spi = clib_net_to_host_u32 (sa0->spi);
+              oh6_0->esp.seq = clib_net_to_host_u32 (sa0->seq);
+            }
+          else
+            {
+              ip_hdr_size = sizeof (ip4_header_t);
+              oh0 = vlib_buffer_get_current (b0);
+
+              if (PREDICT_TRUE (sa0->is_tunnel))
+                {
+                  next_hdr_type = IP_PROTOCOL_IP_IN_IP;
+                  oh0->ip4.tos = ih0->ip4.tos;
+                }
+              else
+                {
+                  next_hdr_type = ih0->ip4.protocol;
+                  memmove (oh0, ih0, sizeof (ip4_header_t));
+                }
+
+              oh0->ip4.ip_version_and_header_length = 0x45;
+              oh0->ip4.fragment_id = 0;
+              oh0->ip4.flags_and_fragment_offset = 0;
+              oh0->ip4.ttl = 254;
+              oh0->ip4.protocol = IP_PROTOCOL_IPSEC_ESP;
+              oh0->esp.spi = clib_net_to_host_u32 (sa0->spi);
+              oh0->esp.seq = clib_net_to_host_u32 (sa0->seq);
+            }
+
+          if (PREDICT_TRUE (sa0->is_tunnel && !sa0->is_tunnel_ip6))
+            {
+              oh0->ip4.src_address.as_u32 = sa0->tunnel_src_addr.ip4.as_u32;
+              oh0->ip4.dst_address.as_u32 = sa0->tunnel_dst_addr.ip4.as_u32;
+
+              /* in tunnel mode send it back to FIB */
+              next0 = ESP_ENCRYPT_NEXT_IP4_LOOKUP;
+              vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+            }
+          else if (sa0->is_tunnel && sa0->is_tunnel_ip6)
+            {
+              oh6_0->ip6.src_address.as_u64[0] =
+                sa0->tunnel_src_addr.ip6.as_u64[0];
+              oh6_0->ip6.src_address.as_u64[1] =
+                sa0->tunnel_src_addr.ip6.as_u64[1];
+              oh6_0->ip6.dst_address.as_u64[0] =
+                sa0->tunnel_dst_addr.ip6.as_u64[0];
+              oh6_0->ip6.dst_address.as_u64[1] =
+                sa0->tunnel_dst_addr.ip6.as_u64[1];
+
+              /* in tunnel mode send it back to FIB */
+              next0 = ESP_ENCRYPT_NEXT_IP6_LOOKUP;
+              vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+            }
+          else
+            {
+              next0 = ESP_ENCRYPT_NEXT_INTERFACE_OUTPUT;
+              transport_mode = 1;
+            }
+
+          ASSERT (sa0->crypto_alg < IPSEC_CRYPTO_N_ALG);
+          ASSERT (sa0->crypto_alg != IPSEC_CRYPTO_ALG_NONE);
+
+          /* Payload + 2 footer bytes, rounded up to whole blocks. */
+          int blocks = 1 + (orig_sz + 1) / BLOCK_SIZE;
+
+          /* pad packet in input buffer */
+          u8 pad_bytes = BLOCK_SIZE * blocks - 2 - orig_sz;
+          u8 pad_idx;
+          u8 *padding = vlib_buffer_get_current (b0) + b0->current_length;
+
+          /* Padding bytes are 1, 2, 3, ... */
+          for (pad_idx = 0; pad_idx < pad_bytes; ++pad_idx)
+            padding[pad_idx] = pad_idx + 1;
+
+          f0 = vlib_buffer_get_current (b0) + b0->current_length + pad_bytes;
+          f0->pad_length = pad_bytes;
+          f0->next_header = next_hdr_type;
+          /* Account for padding, footer and the ICV written by the PMD. */
+          b0->current_length += pad_bytes + 2 +
+            em->esp_integ_algs[sa0->integ_alg].trunc_size;
+
+          b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+          /* The symmetric op and the private area follow the generic op. */
+          struct rte_crypto_sym_op *sym_cop;
+          sym_cop = (struct rte_crypto_sym_op *) (cop + 1);
+
+          dpdk_cop_priv_t *priv = (dpdk_cop_priv_t *) (sym_cop + 1);
+
+          /* Remember the chosen next node in the buffer metadata. */
+          vnet_buffer (b0)->unused[0] = next0;
+
+          mb0 = rte_mbuf_from_vlib_buffer (b0);
+          mb0->data_len = b0->current_length;
+          mb0->pkt_len = b0->current_length;
+          mb0->data_off = RTE_PKTMBUF_HEADROOM + b0->current_data;
+
+          rte_crypto_op_attach_sym_session (cop, sess);
+
+          sym_cop->m_src = mb0;
+
+          dpdk_gcm_cnt_blk *icb = &priv->cb;
+          icb->salt = sa0->salt;
+          icb->iv[0] = sa0->seq;
+          icb->iv[1] = sa0->seq_hi;
+
+          if (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128)
+            {
+              icb->cnt = clib_host_to_net_u32 (1);
+              /* The 8-byte IV travels in the packet after the ESP header. */
+              clib_memcpy (vlib_buffer_get_current (b0) + ip_hdr_size +
+                           sizeof (esp_header_t), icb->iv, 8);
+              sym_cop->cipher.data.offset =
+                ip_hdr_size + sizeof (esp_header_t) + iv_size;
+              sym_cop->cipher.data.length = BLOCK_SIZE * blocks;
+              sym_cop->cipher.iv.length = 16;
+            }
+          else
+            {
+              /* The ciphered range starts at the IV field of the packet. */
+              sym_cop->cipher.data.offset =
+                ip_hdr_size + sizeof (esp_header_t);
+              sym_cop->cipher.data.length = BLOCK_SIZE * blocks + iv_size;
+              sym_cop->cipher.iv.length = iv_size;
+            }
+
+          sym_cop->cipher.iv.data = (u8 *) icb;
+          sym_cop->cipher.iv.phys_addr = cop->phys_addr + (uintptr_t) icb
+            - (uintptr_t) cop;
+
+          ASSERT (sa0->integ_alg < IPSEC_INTEG_N_ALG);
+          ASSERT (sa0->integ_alg != IPSEC_INTEG_ALG_NONE);
+
+          if (PREDICT_FALSE (sa0->integ_alg == IPSEC_INTEG_ALG_AES_GCM_128))
+            {
+              /* AAD: the 8-byte ESP header, plus seq_hi with ESN. */
+              u8 *aad = priv->aad;
+              clib_memcpy (aad, vlib_buffer_get_current (b0) + ip_hdr_size,
+                           8);
+              sym_cop->auth.aad.data = aad;
+              sym_cop->auth.aad.phys_addr = cop->phys_addr +
+                (uintptr_t) aad - (uintptr_t) cop;
+
+              if (PREDICT_FALSE (sa0->use_esn))
+                {
+                  *((u32 *) & aad[8]) = sa0->seq_hi;
+                  sym_cop->auth.aad.length = 12;
+                }
+              else
+                {
+                  sym_cop->auth.aad.length = 8;
+                }
+            }
+          else
+            {
+              sym_cop->auth.data.offset = ip_hdr_size;
+              sym_cop->auth.data.length = b0->current_length - ip_hdr_size
+                - em->esp_integ_algs[sa0->integ_alg].trunc_size;
+
+              if (PREDICT_FALSE (sa0->use_esn))
+                {
+                  /* seq_hi is written past the current end of the buffer
+                   * data and added to the authenticated range. */
+                  u8 *payload_end =
+                    vlib_buffer_get_current (b0) + b0->current_length;
+                  *((u32 *) payload_end) = sa0->seq_hi;
+                  sym_cop->auth.data.length += sizeof (sa0->seq_hi);
+                }
+            }
+          /* The digest goes into the ICV space reserved at the tail. */
+          sym_cop->auth.digest.data = vlib_buffer_get_current (b0) +
+            b0->current_length -
+            em->esp_integ_algs[sa0->integ_alg].trunc_size;
+          sym_cop->auth.digest.phys_addr =
+            rte_pktmbuf_mtophys_offset (mb0,
+                                        b0->current_length -
+                                        em->esp_integ_algs
+                                        [sa0->integ_alg].trunc_size);
+          sym_cop->auth.digest.length =
+            em->esp_integ_algs[sa0->integ_alg].trunc_size;
+
+          if (PREDICT_FALSE (is_ipv6))
+            {
+              oh6_0->ip6.payload_length =
+                clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) -
+                                      sizeof (ip6_header_t));
+            }
+          else
+            {
+              oh0->ip4.length =
+                clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+              oh0->ip4.checksum = ip4_header_checksum (&oh0->ip4);
+            }
+
+          /* Expose the saved ethernet header again in transport mode. */
+          if (transport_mode)
+            vlib_buffer_advance (b0, -sizeof (ethernet_header_t));
+
+        trace:
+          if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              esp_encrypt_trace_t *tr =
+                vlib_add_trace (vm, node, b0, sizeof (*tr));
+              tr->spi = sa0->spi;
+              tr->seq = sa0->seq - 1;
+              tr->crypto_alg = sa0->crypto_alg;
+              tr->integ_alg = sa0->integ_alg;
+            }
+        }
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+  vlib_node_increment_counter (vm, dpdk_esp_encrypt_node.index,
+                               ESP_ENCRYPT_ERROR_RX_PKTS,
+                               from_frame->n_vectors);
+  crypto_qp_data_t *qpd;
+  /* *INDENT-OFF* */
+  vec_foreach_index (i, cwm->qp_data)
+    {
+      u32 enq;
+
+      qpd = vec_elt_at_index (cwm->qp_data, i);
+      enq = rte_cryptodev_enqueue_burst (qpd->dev_id, qpd->qp_id,
+                                         qpd->cops, n_cop_qp[i]);
+      qpd->inflights += enq;
+
+      /* Anything the device did not accept is recycled and dropped. */
+      if (PREDICT_FALSE (enq < n_cop_qp[i]))
+        {
+          crypto_free_cop (qpd, &qpd->cops[enq], n_cop_qp[i] - enq);
+          vlib_buffer_free (vm, &qpd->bi[enq], n_cop_qp[i] - enq);
+
+          vlib_node_increment_counter (vm, dpdk_esp_encrypt_node.index,
+                                       ESP_ENCRYPT_ERROR_ENQ_FAIL,
+                                       n_cop_qp[i] - enq);
+        }
+    }
+  /* *INDENT-ON* */
+
+  return from_frame->n_vectors;
+}
+
+/*
+ * Graph node registration for the DPDK ESP encrypt node.  Its only next
+ * node is the drop path.
+ */
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (dpdk_esp_encrypt_node) = {
+  .function = dpdk_esp_encrypt_node_fn,
+  .name = "dpdk-esp-encrypt",
+  .flags = VLIB_NODE_FLAG_IS_OUTPUT,
+  .vector_size = sizeof (u32),
+  .format_trace = format_esp_encrypt_trace,
+  .n_errors = ARRAY_LEN (esp_encrypt_error_strings),
+  .error_strings = esp_encrypt_error_strings,
+  .n_next_nodes = 1,
+  .next_nodes = {
+    [ESP_ENCRYPT_NEXT_DROP] = "error-drop",
+  },
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (dpdk_esp_encrypt_node, dpdk_esp_encrypt_node_fn)
+/*
+ * ESP Encrypt Post Node
+ */
+/* X-macro list of the post node's counters: _(symbol suffix, description). */
+#define foreach_esp_encrypt_post_error \
+ _(PKTS, "ESP post pkts")
+ /* Counter indices ESP_ENCRYPT_POST_ERROR_<sym>, generated from the list. */
+ typedef enum
+ {
+#define _(sym,str) ESP_ENCRYPT_POST_ERROR_##sym,
+ foreach_esp_encrypt_post_error
+#undef _
+ ESP_ENCRYPT_POST_N_ERROR,
+ } esp_encrypt_post_error_t;
+
+ /* Counter descriptions, in the same order as esp_encrypt_post_error_t. */
+ static char *esp_encrypt_post_error_strings[] = {
+#define _(sym,string) string,
+ foreach_esp_encrypt_post_error
+#undef _
+ };
+
+/* Declared ahead of the node function, which uses its index for counters. */
+vlib_node_registration_t dpdk_esp_encrypt_post_node;
+
+/*
+ * Trace formatter for the post node.  It adds nothing to the output
+ * vector and hands it back unchanged; the argument list is not consumed.
+ */
+static u8 *
+format_esp_encrypt_post_trace (u8 * s, va_list * args)
+{
+  u8 *out = s;
+
+  return out;
+}
+
+/*
+ * Post-encrypt dispatch node: forwards every buffer of the incoming frame
+ * to the next node index stored in its vnet_buffer (b)->unused[0] field.
+ *
+ * vm         - vlib main used to resolve buffer indices and enqueue frames
+ * node       - runtime of this node (supplies the cached next index)
+ * from_frame - frame of u32 buffer indices to dispatch
+ *
+ * Returns the number of buffers in from_frame.
+ */
+static uword
+dpdk_esp_encrypt_post_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, *from, *to_next = 0, next_index;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0, next0;
+ vlib_buffer_t *b0 = 0;
+
+ bi0 = from[0];
+ from += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ /* speculatively place the buffer in the current next frame */
+ to_next[0] = bi0;
+ to_next += 1;
+
+ /* the next node was recorded in the buffer's opaque data */
+ next0 = vnet_buffer (b0)->unused[0];
+
+ /* the macro is given next0 so it can compare it with next_index */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next, bi0,
+ next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ /* account every buffer of the frame under the PKTS counter */
+ vlib_node_increment_counter (vm, dpdk_esp_encrypt_post_node.index,
+ ESP_ENCRYPT_POST_ERROR_PKTS,
+ from_frame->n_vectors);
+
+ return from_frame->n_vectors;
+}
+
+/*
+ * Graph node registration for the post-encrypt dispatch node.  Its next
+ * nodes are generated from the foreach_esp_encrypt_next list.
+ */
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (dpdk_esp_encrypt_post_node) = {
+  .function = dpdk_esp_encrypt_post_node_fn,
+  .name = "dpdk-esp-encrypt-post",
+  .vector_size = sizeof (u32),
+  .format_trace = format_esp_encrypt_post_trace,
+  .type = VLIB_NODE_TYPE_INTERNAL,
+  .n_errors = ARRAY_LEN (esp_encrypt_post_error_strings),
+  .error_strings = esp_encrypt_post_error_strings,
+  .n_next_nodes = ESP_ENCRYPT_N_NEXT,
+  .next_nodes = {
+#define _(s,n) [ESP_ENCRYPT_NEXT_##s] = n,
+    foreach_esp_encrypt_next
+#undef _
+  },
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (dpdk_esp_encrypt_post_node,
+                              dpdk_esp_encrypt_post_node_fn)
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/ipsec/ipsec.c b/src/vnet/devices/dpdk/ipsec/ipsec.c
new file mode 100644
index 00000000000..de253f02636
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/ipsec.c
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/api_errno.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/devices/dpdk/ipsec/ipsec.h>
+#include <vnet/devices/dpdk/ipsec/esp.h>
+#include <vnet/ipsec/ipsec.h>
+
+/* Session mempool object count set in each cryptodev's configuration. */
+#define DPDK_CRYPTO_NB_OBJS 2048
+/* Cache size used for the session mempool and for the crypto op pool. */
+#define DPDK_CRYPTO_CACHE_SIZE 512
+/* Private data size passed to rte_crypto_op_pool_create (). */
+#define DPDK_CRYPTO_PRIV_SIZE 128
+/* Number of descriptors requested for each cryptodev queue pair. */
+#define DPDK_CRYPTO_N_QUEUE_DESC 512
+/* Crypto ops per thread; the pool holds this times (1 + worker count). */
+#define DPDK_CRYPTO_NB_COPS (1024 * 4)
+
+/*
+ * Find or create the worker's queue-pair record for
+ * (cdev_id, qp_id, is_outbound) and report its position through *idx.
+ *
+ * return:
+ * 0: a matching record already exists, *idx is its index
+ * 1: a new record was appended, *idx is its index
+ * (no path in this function returns a negative value)
+ */
+static int
+update_qp_data (crypto_worker_main_t * cwm,
+                u8 cdev_id, u16 qp_id, u8 is_outbound, u16 * idx)
+{
+  crypto_qp_data_t *entry;
+  u16 pos;
+
+  /* *INDENT-OFF* */
+  vec_foreach_index (pos, cwm->qp_data)
+    {
+      entry = vec_elt_at_index (cwm->qp_data, pos);
+
+      if (entry->dev_id != cdev_id)
+        continue;
+      if (entry->qp_id != qp_id)
+        continue;
+      if (entry->is_outbound != is_outbound)
+        continue;
+
+      *idx = pos;
+      return 0;
+    }
+  /* *INDENT-ON* */
+
+  /* not found: pos is now the current length, i.e. the new slot's index */
+  *idx = pos;
+
+  vec_add2 (cwm->qp_data, entry, 1);
+
+  entry->dev_id = cdev_id;
+  entry->qp_id = qp_id;
+  entry->is_outbound = is_outbound;
+
+  return 1;
+}
+
+/*
+ * Map one (cipher algo, auth algo, direction) combination of a worker to
+ * a queue pair, unless the combination is already mapped.
+ *
+ * return:
+ * -1: error reported by update_qp_data
+ * 0: combination already mapped, or the queue-pair record already existed
+ * 1: a new queue-pair record was created for this combination
+ */
+static int
+add_mapping (crypto_worker_main_t * cwm,
+             u8 cdev_id, u16 qp, u8 is_outbound,
+             const struct rte_cryptodev_capabilities *cipher_cap,
+             const struct rte_cryptodev_capabilities *auth_cap)
+{
+  /* the hash key is a uword whose first bytes are overlaid by the key struct */
+  uword key = 0;
+  crypto_worker_qp_key_t *k = (crypto_worker_qp_key_t *) & key;
+  uword *ret;
+  uword value;
+  u16 qp_index;
+  int status;
+
+  k->cipher_algo = (u8) cipher_cap->sym.cipher.algo;
+  k->auth_algo = (u8) auth_cap->sym.auth.algo;
+  k->is_outbound = is_outbound;
+
+  ret = hash_get (cwm->algo_qp_map, key);
+  if (ret != 0)
+    return 0;
+
+  status = update_qp_data (cwm, cdev_id, qp, is_outbound, &qp_index);
+  if (status < 0)
+    return -1;
+
+  value = (uword) qp_index;
+
+  ret = hash_set (cwm->algo_qp_map, key, value);
+  if (ret == 0)
+    rte_panic ("Failed to insert hash table\n");
+
+  return status;
+}
+
+/*
+ * Walk every supported (cipher, auth) capability pair of a cryptodev and
+ * try to map it to queue pair qp for the given worker and direction.
+ *
+ * return:
+ * < 0: first negative status reported by add_mapping
+ * otherwise: number of pairs for which add_mapping returned 1
+ *            (0 means nothing new was mapped)
+ */
+static int
+add_cdev_mapping (crypto_worker_main_t * cwm,
+                  struct rte_cryptodev_info *dev_info, u8 cdev_id,
+                  u16 qp, u8 is_outbound)
+{
+  const struct rte_cryptodev_capabilities *cipher, *auth;
+  u32 n_mapped = 0;
+
+  for (cipher = dev_info->capabilities;
+       cipher->op != RTE_CRYPTO_OP_TYPE_UNDEFINED; cipher++)
+    {
+      /* outer loop: supported cipher capabilities only */
+      if (cipher->sym.xform_type != RTE_CRYPTO_SYM_XFORM_CIPHER ||
+          check_algo_is_supported (cipher, NULL) != 0)
+        continue;
+
+      for (auth = dev_info->capabilities;
+           auth->op != RTE_CRYPTO_OP_TYPE_UNDEFINED; auth++)
+        {
+          int status;
+
+          /* inner loop: supported auth capabilities only */
+          if (auth->sym.xform_type != RTE_CRYPTO_SYM_XFORM_AUTH ||
+              check_algo_is_supported (auth, NULL) != 0)
+            continue;
+
+          status = add_mapping (cwm, cdev_id, qp, is_outbound, cipher, auth);
+          if (status < 0)
+            return status;
+          if (status == 1)
+            n_mapped += 1;
+        }
+    }
+
+  return n_mapped;
+}
+
+/*
+ * Check that the cryptodevs advertising symmetric operation chaining
+ * offer enough queue pairs in total: two per worker, or two when there
+ * are no workers.
+ *
+ * return:
+ * 0: enough queue pairs
+ * -1: not enough queue pairs
+ */
+static int
+check_cryptodev_queues ()
+{
+  u32 n_required;
+  u32 n_available = 0;
+  u8 dev;
+
+  n_required = (vlib_num_workers () > 0) ? vlib_num_workers () * 2 : 2;
+
+  for (dev = 0; dev < rte_cryptodev_count (); dev++)
+    {
+      struct rte_cryptodev_info info;
+
+      rte_cryptodev_info_get (dev, &info);
+
+      /* devices without the chaining feature flag do not count */
+      if (info.feature_flags & RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING)
+        n_available += info.max_nb_queue_pairs;
+    }
+
+  return (n_available >= n_required) ? 0 : -1;
+}
+
+/*
+ * Set up the DPDK cryptodevs for IPsec.  Registered below as a
+ * main-loop-enter function.
+ *
+ * Steps, in order:
+ *  - verify that enough queue pairs exist (check_cryptodev_queues);
+ *  - size the per-thread crypto_worker_main_t vector;
+ *  - for each chaining-capable cryptodev, highest dev_id first, assign
+ *    queue pairs to threads per direction, then configure the device and
+ *    set up all of its queue pairs;
+ *  - create the crypto op mempool for the calling thread's socket;
+ *  - call dpdk_esp_init () and switch dpdk_crypto_input_node to polling.
+ *
+ * Returns 0 on success or a clib error describing the failing step.
+ */
+static clib_error_t *
+dpdk_ipsec_init (vlib_main_t * vm)
+{
+ dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ struct rte_cryptodev_config dev_conf;
+ struct rte_cryptodev_qp_conf qp_conf;
+ struct rte_cryptodev_info cdev_info;
+ struct rte_mempool *rmp;
+ i32 dev_id, ret;
+ u32 i, skip_master;
+
+ if (check_cryptodev_queues () < 0)
+ return clib_error_return (0, "not enough cryptodevs for ipsec");
+
+ /* one crypto_worker_main_t per vlib main */
+ vec_alloc (dcm->workers_main, tm->n_vlib_mains);
+ _vec_len (dcm->workers_main) = tm->n_vlib_mains;
+
+ fprintf (stdout, "DPDK Cryptodevs info:\n");
+ fprintf (stdout, "dev_id\tn_qp\tnb_obj\tcache_size\n");
+ /* HW cryptodevs have higher dev_id, use HW first */
+ for (dev_id = rte_cryptodev_count () - 1; dev_id >= 0; dev_id--)
+ {
+ u16 max_nb_qp, qp = 0;
+ /* re-armed per device: with workers present, index 0 gets no queues */
+ skip_master = vlib_num_workers () > 0;
+
+ rte_cryptodev_info_get (dev_id, &cdev_info);
+
+ if (!
+ (cdev_info.feature_flags & RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING))
+ continue;
+
+ max_nb_qp = cdev_info.max_nb_queue_pairs;
+
+ for (i = 0; i < tm->n_vlib_mains; i++)
+ {
+ u8 is_outbound;
+ crypto_worker_main_t *cwm;
+ uword *map;
+
+ if (skip_master)
+ {
+ skip_master = 0;
+ continue;
+ }
+
+ cwm = vec_elt_at_index (dcm->workers_main, i);
+ map = cwm->algo_qp_map;
+
+ /* lazily create the algorithm -> queue pair map of this thread */
+ if (!map)
+ {
+ map = hash_create (0, sizeof (crypto_worker_qp_key_t));
+ if (!map)
+ return clib_error_return (0, "unable to create hash table "
+ "for worker %u",
+ vlib_mains[i]->cpu_index);
+ cwm->algo_qp_map = map;
+ }
+
+ /* a queue pair is consumed only when it received a new mapping */
+ for (is_outbound = 0; is_outbound < 2 && qp < max_nb_qp;
+ is_outbound++)
+ {
+ int mapped = add_cdev_mapping (cwm, &cdev_info,
+ dev_id, qp, is_outbound);
+ if (mapped > 0)
+ qp++;
+
+ if (mapped < 0)
+ return clib_error_return (0,
+ "too many queues for one worker");
+ }
+ }
+
+ /* no thread uses this device: leave it unconfigured */
+ if (qp == 0)
+ continue;
+
+ /* all queue pairs of the device are configured, not only the mapped ones */
+ dev_conf.socket_id = rte_cryptodev_socket_id (dev_id);
+ dev_conf.nb_queue_pairs = cdev_info.max_nb_queue_pairs;
+ dev_conf.session_mp.nb_objs = DPDK_CRYPTO_NB_OBJS;
+ dev_conf.session_mp.cache_size = DPDK_CRYPTO_CACHE_SIZE;
+
+ ret = rte_cryptodev_configure (dev_id, &dev_conf);
+ if (ret < 0)
+ return clib_error_return (0, "cryptodev %u config error", dev_id);
+
+ qp_conf.nb_descriptors = DPDK_CRYPTO_N_QUEUE_DESC;
+ for (qp = 0; qp < dev_conf.nb_queue_pairs; qp++)
+ {
+ ret = rte_cryptodev_queue_pair_setup (dev_id, qp, &qp_conf,
+ dev_conf.socket_id);
+ if (ret < 0)
+ return clib_error_return (0, "cryptodev %u qp %u setup error",
+ dev_id, qp);
+ }
+ fprintf (stdout, "%u\t%u\t%u\t%u\n", dev_id, dev_conf.nb_queue_pairs,
+ DPDK_CRYPTO_NB_OBJS, DPDK_CRYPTO_CACHE_SIZE);
+ }
+
+ /* the op pool is created for the socket of the calling thread only */
+ u32 socket_id = rte_socket_id ();
+
+ vec_validate_aligned (dcm->cop_pools, socket_id, CLIB_CACHE_LINE_BYTES);
+
+ /* pool already exists, nothing to do */
+ /* NOTE(review): this return also skips dpdk_esp_init () and the node
+ state changes below - confirm that is intended. */
+ if (dcm->cop_pools[socket_id])
+ return 0;
+
+ u8 *pool_name = format (0, "crypto_op_pool_socket%u%c", socket_id, 0);
+
+ rmp = rte_crypto_op_pool_create ((char *) pool_name,
+ RTE_CRYPTO_OP_TYPE_SYMMETRIC,
+ DPDK_CRYPTO_NB_COPS *
+ (1 + vlib_num_workers ()),
+ DPDK_CRYPTO_CACHE_SIZE,
+ DPDK_CRYPTO_PRIV_SIZE, socket_id);
+ vec_free (pool_name);
+
+ if (!rmp)
+ return clib_error_return (0, "failed to allocate mempool on socket %u",
+ socket_id);
+ dcm->cop_pools[socket_id] = rmp;
+
+ dpdk_esp_init ();
+
+ /* NOTE(review): an empty vlib_mains vector is presumably the
+ no-worker case - confirm; otherwise polling is enabled on every
+ main from index 1 upwards. */
+ if (vec_len (vlib_mains) == 0)
+ vlib_node_set_state (&vlib_global_main, dpdk_crypto_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ else
+ for (i = 1; i < tm->n_vlib_mains; i++)
+ vlib_node_set_state (vlib_mains[i], dpdk_crypto_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+
+ return 0;
+}
+
+VLIB_MAIN_LOOP_ENTER_FUNCTION (dpdk_ipsec_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/ipsec/ipsec.h b/src/vnet/devices/dpdk/ipsec/ipsec.h
new file mode 100644
index 00000000000..e6c7498c0d3
--- /dev/null
+++ b/src/vnet/devices/dpdk/ipsec/ipsec.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __DPDK_IPSEC_H__
+#define __DPDK_IPSEC_H__
+
+#include <vnet/vnet.h>
+
+#undef always_inline
+#include <rte_crypto.h>
+#include <rte_cryptodev.h>
+
+#if CLIB_DEBUG > 0
+#define always_inline static inline
+#else
+#define always_inline static inline __attribute__ ((__always_inline__))
+#endif
+
+
+/* NOTE(review): presumably the intended cap on queue pairs per thread;
+ not referenced in the code visible here - confirm where it is enforced. */
+#define MAX_QP_PER_LCORE 16
+
+/*
+ * 16-byte counter block handed to the cryptodev as the cipher IV.
+ * The encrypt path fills salt from the SA salt, iv[0]/iv[1] from the SA
+ * sequence number (low/high), and sets cnt to network-order 1 for AES-GCM.
+ */
+typedef struct
+{
+ u32 salt;
+ u32 iv[2];
+ u32 cnt;
+} dpdk_gcm_cnt_blk;
+
+/*
+ * Per crypto op private area: the counter block followed by scratch space.
+ * aad and icv share storage.  The encrypt path uses aad for the AES-GCM
+ * additional authenticated data (8 bytes, or 12 with ESN).
+ */
+typedef struct
+{
+ dpdk_gcm_cnt_blk cb;
+ union
+ {
+ u8 aad[12];
+ u8 icv[64];
+ };
+} dpdk_cop_priv_t;
+
+/*
+ * Key of the per-thread algorithm -> queue pair hash.  add_mapping ()
+ * overlays this struct on a zeroed uword, so it must fit in one.
+ */
+typedef struct
+{
+ u8 cipher_algo;
+ u8 auth_algo;
+ u8 is_outbound;
+} crypto_worker_qp_key_t;
+
+/* State of one cryptodev queue pair as used by one thread. */
+typedef struct
+{
+ u16 dev_id; /* cryptodev id */
+ u16 qp_id; /* queue pair id on that device */
+ u16 is_outbound; /* direction this queue pair serves */
+ i16 inflights; /* grows by the number of ops enqueued */
+ u32 bi[VLIB_FRAME_SIZE]; /* buffer indices matching cops[] */
+ struct rte_crypto_op *cops[VLIB_FRAME_SIZE]; /* ops staged for enqueue */
+ struct rte_crypto_op **free_cops; /* vector caching unused crypto ops */
+} crypto_qp_data_t;
+
+/* NOTE(review): presumably the crypto session bound to an SA and the
+ index of the queue pair it uses - confirm against the users. */
+typedef struct
+{
+ u8 qp_index;
+ void *sess;
+} crypto_sa_session_t;
+
+/* Per-thread crypto state. */
+typedef struct
+{
+ /* NOTE(review): presumably indexed by direction - confirm */
+ crypto_sa_session_t *sa_sess_d[2];
+ crypto_qp_data_t *qp_data; /* vector of queue pairs owned by the thread */
+ uword *algo_qp_map; /* hash: crypto_worker_qp_key_t -> index in qp_data */
+} crypto_worker_main_t;
+
+/* Global crypto state shared by all threads. */
+typedef struct
+{
+ struct rte_mempool **cop_pools; /* crypto op pools, indexed by socket id */
+ crypto_worker_main_t *workers_main; /* one entry per vlib main */
+} dpdk_crypto_main_t;
+
+/* NOTE(review): this is an object definition in a header, so every
+ including translation unit emits it; it links only where common symbols
+ are merged.  Consider an extern declaration here plus one definition
+ in a .c file. */
+dpdk_crypto_main_t dpdk_crypto_main;
+
+extern vlib_node_registration_t dpdk_crypto_input_node;
+
+/* Capacity of each queue pair's free_cops cache (three frames' worth). */
+#define CRYPTO_N_FREE_COPS (VLIB_FRAME_SIZE * 3)
+
+/*
+ * Top up the crypto op cache of every queue pair owned by the calling
+ * thread.  A cache holding fewer than VLIB_FRAME_SIZE ops is refilled
+ * from the mempool of the caller's socket, up to one short of
+ * CRYPTO_N_FREE_COPS entries.
+ */
+static_always_inline void
+crypto_alloc_cops ()
+{
+  dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+  crypto_worker_main_t *cwm = &dcm->workers_main[os_get_cpu_number ()];
+  unsigned socket_id = rte_socket_id ();
+  crypto_qp_data_t *qpd;
+
+  /* *INDENT-OFF* */
+  vec_foreach (qpd, cwm->qp_data)
+    {
+      u32 n_cached = vec_len (qpd->free_cops);
+      u32 n_got;
+
+      /* enough ops cached for a full frame: nothing to do */
+      if (PREDICT_TRUE (n_cached >= VLIB_FRAME_SIZE))
+        continue;
+
+      /* first use: reserve the whole cache up front */
+      if (PREDICT_FALSE (!qpd->free_cops))
+        vec_alloc (qpd->free_cops, CRYPTO_N_FREE_COPS);
+
+      n_got = rte_crypto_op_bulk_alloc (dcm->cop_pools[socket_id],
+                                        RTE_CRYPTO_OP_TYPE_SYMMETRIC,
+                                        &qpd->free_cops[n_cached],
+                                        CRYPTO_N_FREE_COPS - n_cached - 1);
+
+      _vec_len (qpd->free_cops) = n_cached + n_got;
+    }
+  /* *INDENT-ON* */
+}
+
+/*
+ * Return n crypto ops to the queue pair's free cache.
+ *
+ * If the cache would reach CRYPTO_N_FREE_COPS, the top VLIB_FRAME_SIZE
+ * cached ops are first given back to the mempool that cops[0] belongs to.
+ * The n ops are then appended to the cache.
+ */
+static_always_inline void
+crypto_free_cop (crypto_qp_data_t * qpd, struct rte_crypto_op **cops, u32 n)
+{
+  u32 n_cached = vec_len (qpd->free_cops);
+
+  if (n_cached + n >= CRYPTO_N_FREE_COPS)
+    {
+      n_cached -= VLIB_FRAME_SIZE;
+      rte_mempool_put_bulk (cops[0]->mempool,
+                            (void **) (qpd->free_cops + n_cached),
+                            VLIB_FRAME_SIZE);
+    }
+
+  clib_memcpy (qpd->free_cops + n_cached, cops, n * sizeof (cops[0]));
+
+  _vec_len (qpd->free_cops) = n_cached + n;
+}
+
+/*
+ * Tell whether a cryptodev capability describes an algorithm this IPsec
+ * backend can use.
+ *
+ * cap  - capability entry to test; only symmetric cipher and auth
+ *        capabilities can match
+ * name - optional output; when non-NULL and the algorithm is supported,
+ *        its display name is copied here with strcpy ().  The caller must
+ *        supply a buffer large enough for the longest name in the table
+ *        ("AES-XCBC-MAC", 13 bytes including the terminator).
+ *
+ * return:
+ * 0: supported
+ * -1: not a symmetric capability, or algorithm not in the table
+ */
+static_always_inline int
+check_algo_is_supported (const struct rte_cryptodev_capabilities *cap,
+                         char *name)
+{
+  /* Read-only lookup table, terminated by a NOT_SPECIFIED entry. */
+  /* *INDENT-OFF* */
+  static const struct
+  {
+    enum rte_crypto_sym_xform_type type;
+    union
+    {
+      enum rte_crypto_auth_algorithm auth;
+      enum rte_crypto_cipher_algorithm cipher;
+    };
+    const char *name;
+  } supported_algo[] = {
+    { .type = RTE_CRYPTO_SYM_XFORM_CIPHER,
+      .cipher = RTE_CRYPTO_CIPHER_NULL, .name = "NULL" },
+    { .type = RTE_CRYPTO_SYM_XFORM_CIPHER,
+      .cipher = RTE_CRYPTO_CIPHER_AES_CBC, .name = "AES_CBC" },
+    { .type = RTE_CRYPTO_SYM_XFORM_CIPHER,
+      .cipher = RTE_CRYPTO_CIPHER_AES_CTR, .name = "AES_CTR" },
+    { .type = RTE_CRYPTO_SYM_XFORM_CIPHER,
+      .cipher = RTE_CRYPTO_CIPHER_3DES_CBC, .name = "3DES-CBC" },
+    /* cipher algorithm: must initialize the cipher member, not auth */
+    { .type = RTE_CRYPTO_SYM_XFORM_CIPHER,
+      .cipher = RTE_CRYPTO_CIPHER_AES_GCM, .name = "AES-GCM" },
+    { .type = RTE_CRYPTO_SYM_XFORM_AUTH,
+      .auth = RTE_CRYPTO_AUTH_SHA1_HMAC, .name = "HMAC-SHA1" },
+    { .type = RTE_CRYPTO_SYM_XFORM_AUTH,
+      .auth = RTE_CRYPTO_AUTH_SHA256_HMAC, .name = "HMAC-SHA256" },
+    { .type = RTE_CRYPTO_SYM_XFORM_AUTH,
+      .auth = RTE_CRYPTO_AUTH_SHA384_HMAC, .name = "HMAC-SHA384" },
+    { .type = RTE_CRYPTO_SYM_XFORM_AUTH,
+      .auth = RTE_CRYPTO_AUTH_SHA512_HMAC, .name = "HMAC-SHA512" },
+    { .type = RTE_CRYPTO_SYM_XFORM_AUTH,
+      .auth = RTE_CRYPTO_AUTH_AES_XCBC_MAC, .name = "AES-XCBC-MAC" },
+    { .type = RTE_CRYPTO_SYM_XFORM_AUTH,
+      .auth = RTE_CRYPTO_AUTH_AES_GCM, .name = "AES-GCM" },
+    /* tail */
+    { .type = RTE_CRYPTO_SYM_XFORM_NOT_SPECIFIED },
+  };
+  /* *INDENT-ON* */
+  uint32_t i;
+
+  if (cap->op != RTE_CRYPTO_OP_TYPE_SYMMETRIC)
+    return -1;
+
+  for (i = 0; supported_algo[i].type != RTE_CRYPTO_SYM_XFORM_NOT_SPECIFIED;
+       i++)
+    {
+      int match;
+
+      if (cap->sym.xform_type != supported_algo[i].type)
+        continue;
+
+      /* compare through the union member that matches the transform type */
+      if (cap->sym.xform_type == RTE_CRYPTO_SYM_XFORM_CIPHER)
+        match = (cap->sym.cipher.algo == supported_algo[i].cipher);
+      else if (cap->sym.xform_type == RTE_CRYPTO_SYM_XFORM_AUTH)
+        match = (cap->sym.auth.algo == supported_algo[i].auth);
+      else
+        match = 0;
+
+      if (match)
+        {
+          if (name)
+            strcpy (name, supported_algo[i].name);
+          return 0;
+        }
+    }
+
+  return -1;
+}
+
+#endif /* __DPDK_IPSEC_H__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/node.c b/src/vnet/devices/dpdk/node.c
new file mode 100644
index 00000000000..e541cdbcbd2
--- /dev/null
+++ b/src/vnet/devices/dpdk/node.c
@@ -0,0 +1,687 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/xxhash.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/mpls/packet.h>
+#include <vnet/handoff.h>
+#include <vnet/devices/devices.h>
+#include <vnet/feature/feature.h>
+
+#include "dpdk_priv.h"
+
+/* Error descriptions, one per entry of the foreach_dpdk_error list
+ (second macro argument), in list order. */
+static char *dpdk_error_strings[] = {
+#define _(n,s) s,
+ foreach_dpdk_error
+#undef _
+};
+
+/* Non-zero when the ethernet header at the start of the buffer's data
+   area carries the IPv4 ethertype. */
+always_inline int
+vlib_buffer_is_ip4 (vlib_buffer_t * b)
+{
+  ethernet_header_t *eth = (ethernet_header_t *) b->data;
+
+  return eth->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP4);
+}
+
+/* Non-zero when the ethernet header at the start of the buffer's data
+   area carries the IPv6 ethertype. */
+always_inline int
+vlib_buffer_is_ip6 (vlib_buffer_t * b)
+{
+  ethernet_header_t *eth = (ethernet_header_t *) b->data;
+
+  return eth->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP6);
+}
+
+/* Non-zero when the ethernet header at the start of the buffer's data
+   area carries the MPLS unicast ethertype. */
+always_inline int
+vlib_buffer_is_mpls (vlib_buffer_t * b)
+{
+  ethernet_header_t *eth = (ethernet_header_t *) b->data;
+
+  return eth->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS_UNICAST);
+}
+
+#if RTE_VERSION < RTE_VERSION_NUM(16, 11, 0, 0)
+/* New ol_flags bits added in DPDK-16.11 */
+#define PKT_RX_IP_CKSUM_GOOD (1ULL << 7)
+#endif
+
+/*
+ * Choose the next node from the ethertype found in the buffer.
+ * IPv4 packets whose mbuf carries PKT_RX_IP_CKSUM_GOOD go to the
+ * "no checksum" IPv4 input; anything that is not IPv4, IPv6 or MPLS
+ * goes to ethernet-input.
+ */
+always_inline u32
+dpdk_rx_next_from_etype (struct rte_mbuf * mb, vlib_buffer_t * b0)
+{
+  if (PREDICT_TRUE (vlib_buffer_is_ip4 (b0)))
+    {
+      if (PREDICT_TRUE ((mb->ol_flags & PKT_RX_IP_CKSUM_GOOD) != 0))
+        return VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT;
+
+      return VNET_DEVICE_INPUT_NEXT_IP4_INPUT;
+    }
+
+  if (PREDICT_TRUE (vlib_buffer_is_ip6 (b0)))
+    return VNET_DEVICE_INPUT_NEXT_IP6_INPUT;
+
+  if (PREDICT_TRUE (vlib_buffer_is_mpls (b0)))
+    return VNET_DEVICE_INPUT_NEXT_MPLS_INPUT;
+
+  return VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+}
+
+/*
+ * Non-zero when the mbuf is marked as a VLAN frame.  From DPDK 16.11 on
+ * the decision uses the packet type; older versions use ol_flags and
+ * require PKT_RX_VLAN_PKT to be set with neither stripped flag set.
+ */
+always_inline int
+dpdk_mbuf_is_vlan (struct rte_mbuf *mb)
+{
+#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0)
+  return RTE_PTYPE_L2_ETHER_VLAN ==
+    (mb->packet_type & RTE_PTYPE_L2_ETHER_VLAN);
+#else
+  return PKT_RX_VLAN_PKT ==
+    (mb->ol_flags &
+     (PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED | PKT_RX_QINQ_STRIPPED));
+#endif
+}
+
+/* 1 when the mbuf's packet type reports an IPv4 header, else 0. */
+always_inline int
+dpdk_mbuf_is_ip4 (struct rte_mbuf *mb)
+{
+  if (RTE_ETH_IS_IPV4_HDR (mb->packet_type))
+    return 1;
+
+  return 0;
+}
+
+/* 1 when the mbuf's packet type reports an IPv6 header, else 0. */
+always_inline int
+dpdk_mbuf_is_ip6 (struct rte_mbuf *mb)
+{
+  if (RTE_ETH_IS_IPV6_HDR (mb->packet_type))
+    return 1;
+
+  return 0;
+}
+
+/*
+ * Choose the next node from the packet type reported in the mbuf.
+ * VLAN frames always go to ethernet-input.  IPv4 goes to the
+ * "no checksum" IPv4 input and IPv6 to the IPv6 input.  MPLS is detected
+ * from the ethertype in the buffer.  Everything else falls back to
+ * dpdk_rx_next_from_etype ().
+ */
+always_inline u32
+dpdk_rx_next_from_mb (struct rte_mbuf * mb, vlib_buffer_t * b0)
+{
+  if (PREDICT_FALSE (dpdk_mbuf_is_vlan (mb)))
+    return VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+
+  if (PREDICT_TRUE (dpdk_mbuf_is_ip4 (mb)))
+    return VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT;
+
+  if (PREDICT_TRUE (dpdk_mbuf_is_ip6 (mb)))
+    return VNET_DEVICE_INPUT_NEXT_IP6_INPUT;
+
+  if (PREDICT_TRUE (vlib_buffer_is_mpls (b0)))
+    return VNET_DEVICE_INPUT_NEXT_MPLS_INPUT;
+
+  return dpdk_rx_next_from_etype (mb, b0);
+}
+
+/*
+ * Derive the rx error code from the mbuf offload flags.
+ * A bad IP checksum sets *error to DPDK_ERROR_IP_CHECKSUM_ERROR and
+ * redirects *next to the drop node.  Otherwise *error becomes
+ * DPDK_ERROR_NONE and *next is left untouched.
+ */
+always_inline void
+dpdk_rx_error_from_mb (struct rte_mbuf *mb, u32 * next, u8 * error)
+{
+  if (!(mb->ol_flags & PKT_RX_IP_CKSUM_BAD))
+    {
+      *error = DPDK_ERROR_NONE;
+      return;
+    }
+
+  *error = DPDK_ERROR_IP_CHECKSUM_ERROR;
+  *next = VNET_DEVICE_INPUT_NEXT_DROP;
+}
+
+/*
+ * Add an rx trace record for each of the given buffers.
+ *
+ * dm        - dpdk main (not used in this function)
+ * node      - runtime of the input node the traces are attached to
+ * xd        - device the buffers were received on
+ * queue_id  - rx queue recorded in each trace
+ * buffers   - buffer indices to trace
+ * n_buffers - number of entries in buffers
+ *
+ * The next node is computed with the same selection chain the input
+ * path uses, so that the trace is attached to the right next index.
+ */
+void
+dpdk_rx_trace (dpdk_main_t * dm,
+ vlib_node_runtime_t * node,
+ dpdk_device_t * xd,
+ u16 queue_id, u32 * buffers, uword n_buffers)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ u32 *b, n_left;
+ u32 next0;
+
+ n_left = n_buffers;
+ b = buffers;
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t *b0;
+ dpdk_rx_dma_trace_t *t0;
+ struct rte_mbuf *mb;
+ u8 error0;
+
+ bi0 = b[0];
+ n_left -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ mb = rte_mbuf_from_vlib_buffer (b0);
+
+ /* per-interface override, else packet type, else ethertype */
+ if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
+ next0 = xd->per_interface_next_index;
+ else if (PREDICT_TRUE
+ ((xd->flags & DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE) != 0))
+ next0 = dpdk_rx_next_from_mb (mb, b0);
+ else
+ next0 = dpdk_rx_next_from_etype (mb, b0);
+
+ /* may redirect next0 to drop; error0 itself is not recorded */
+ dpdk_rx_error_from_mb (mb, &next0, &error0);
+
+ vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->queue_index = queue_id;
+ t0->device_index = xd->device_index;
+ t0->buffer_index = bi0;
+
+ /* snapshot the mbuf, the buffer header, and the start of the data */
+ clib_memcpy (&t0->mb, mb, sizeof (t0->mb));
+ clib_memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
+ clib_memcpy (t0->buffer.pre_data, b0->data,
+ sizeof (t0->buffer.pre_data));
+ clib_memcpy (&t0->data, mb->buf_addr + mb->data_off, sizeof (t0->data));
+
+ b += 1;
+ }
+}
+
+/*
+ * Receive up to VLIB_FRAME_SIZE mbufs from one rx queue into
+ * xd->rx_vectors[queue_id] and return how many were received.
+ * Polling stops early as soon as a burst returns fewer than 32 packets.
+ * Devices without DPDK_DEVICE_FLAG_PMD are not supported (asserts).
+ */
+static inline u32
+dpdk_rx_burst (dpdk_main_t * dm, dpdk_device_t * xd, u16 queue_id)
+{
+  u32 n_rx = 0;
+  u32 n_room = VLIB_FRAME_SIZE;
+  u32 n_chunk;
+
+  if (PREDICT_FALSE (!(xd->flags & DPDK_DEVICE_FLAG_PMD)))
+    {
+      ASSERT (0);
+      return n_rx;
+    }
+
+  while (n_room)
+    {
+      n_chunk = rte_eth_rx_burst (xd->device_index, queue_id,
+                                  xd->rx_vectors[queue_id] + n_rx, n_room);
+      n_rx += n_chunk;
+      n_room -= n_chunk;
+
+      /* Empirically, DPDK r1.8 produces vectors w/ 32 or fewer elts */
+      if (n_chunk < 32)
+        break;
+    }
+
+  return n_rx;
+}
+
+
+/*
+ * Link the second and later segments of a multi-segment mbuf into the
+ * vlib buffer chain that starts at b.
+ *
+ * vm - vlib main, used to turn buffer pointers into buffer indices
+ * b  - vlib buffer of the first segment (chain head)
+ * mb - mbuf of the first segment
+ * fl - free list used to initialize each following segment's buffer
+ *
+ * Does nothing when mb has a single segment.
+ */
+static_always_inline void
+dpdk_process_subseq_segs (vlib_main_t * vm, vlib_buffer_t * b,
+ struct rte_mbuf *mb, vlib_buffer_free_list_t * fl)
+{
+ u8 nb_seg = 1;
+ struct rte_mbuf *mb_seg = 0;
+ vlib_buffer_t *b_seg, *b_chain = 0;
+ mb_seg = mb->next;
+ b_chain = b;
+
+ while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
+ {
+ ASSERT (mb_seg != 0);
+
+ b_seg = vlib_buffer_from_rte_mbuf (mb_seg);
+ vlib_buffer_init_for_free_list (b_seg, fl);
+
+ ASSERT ((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
+ ASSERT (b_seg->current_data == 0);
+
+ /*
+ * The driver (e.g. virtio) may not put the packet data at the start
+ * of the segment, so don't assume b_seg->current_data == 0 is correct.
+ */
+ b_seg->current_data =
+ (mb_seg->buf_addr + mb_seg->data_off) - (void *) b_seg->data;
+
+ b_seg->current_length = mb_seg->data_len;
+ /* the chain total is kept on the head buffer only */
+ b->total_length_not_including_first_buffer += mb_seg->data_len;
+
+ /* hook this segment behind the previous one */
+ b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
+
+ b_chain = b_seg;
+ mb_seg = mb_seg->next;
+ nb_seg++;
+ }
+}
+
+/*
+ * Prefetch one cache line of the mbuf (for load) and one cache line of
+ * the vlib buffer that belongs to it (for store).
+ */
+static_always_inline void
+dpdk_prefetch_buffer (struct rte_mbuf *mb)
+{
+  vlib_buffer_t *vb;
+
+  vb = vlib_buffer_from_rte_mbuf (mb);
+
+  CLIB_PREFETCH (mb, CLIB_CACHE_LINE_BYTES, LOAD);
+  CLIB_PREFETCH (vb, CLIB_CACHE_LINE_BYTES, STORE);
+}
+
+/*
+ * This function is used when there are no worker threads.
+ * The main thread performs IO and forwards the packets.
+ */
+static_always_inline u32
+dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd,
+ vlib_node_runtime_t * node, u32 cpu_index, u16 queue_id)
+{
+ u32 n_buffers;
+ u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+ u32 n_left_to_next, *to_next;
+ u32 mb_index;
+ vlib_main_t *vm = vlib_get_main ();
+ uword n_rx_bytes = 0;
+ u32 n_trace, trace_cnt __attribute__ ((unused));
+ vlib_buffer_free_list_t *fl;
+ u32 buffer_flags_template;
+
+ if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0)
+ return 0;
+
+ n_buffers = dpdk_rx_burst (dm, xd, queue_id);
+
+ if (n_buffers == 0)
+ {
+ return 0;
+ }
+
+ buffer_flags_template = dm->buffer_flags_template;
+
+ vec_reset_length (xd->d_trace_buffers[cpu_index]);
+ trace_cnt = n_trace = vlib_get_trace_count (vm, node);
+
+ if (n_trace > 0)
+ {
+ u32 n = clib_min (n_trace, n_buffers);
+ mb_index = 0;
+
+ while (n--)
+ {
+ struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index++];
+ vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb);
+ vec_add1 (xd->d_trace_buffers[cpu_index],
+ vlib_get_buffer_index (vm, b));
+ }
+ }
+
+ fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ mb_index = 0;
+
+ while (n_buffers > 0)
+ {
+ vlib_buffer_t *b0, *b1, *b2, *b3;
+ u32 bi0, next0, l3_offset0;
+ u32 bi1, next1, l3_offset1;
+ u32 bi2, next2, l3_offset2;
+ u32 bi3, next3, l3_offset3;
+ u8 error0, error1, error2, error3;
+ u64 or_ol_flags;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_buffers > 8 && n_left_to_next > 4)
+ {
+ struct rte_mbuf *mb0 = xd->rx_vectors[queue_id][mb_index];
+ struct rte_mbuf *mb1 = xd->rx_vectors[queue_id][mb_index + 1];
+ struct rte_mbuf *mb2 = xd->rx_vectors[queue_id][mb_index + 2];
+ struct rte_mbuf *mb3 = xd->rx_vectors[queue_id][mb_index + 3];
+
+ dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 4]);
+ dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 5]);
+ dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 6]);
+ dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 7]);
+
+ if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG)
+ {
+ if (PREDICT_FALSE (mb0->nb_segs > 1))
+ dpdk_prefetch_buffer (mb0->next);
+ if (PREDICT_FALSE (mb1->nb_segs > 1))
+ dpdk_prefetch_buffer (mb1->next);
+ if (PREDICT_FALSE (mb2->nb_segs > 1))
+ dpdk_prefetch_buffer (mb2->next);
+ if (PREDICT_FALSE (mb3->nb_segs > 1))
+ dpdk_prefetch_buffer (mb3->next);
+ }
+
+ ASSERT (mb0);
+ ASSERT (mb1);
+ ASSERT (mb2);
+ ASSERT (mb3);
+
+ or_ol_flags = (mb0->ol_flags | mb1->ol_flags |
+ mb2->ol_flags | mb3->ol_flags);
+ b0 = vlib_buffer_from_rte_mbuf (mb0);
+ b1 = vlib_buffer_from_rte_mbuf (mb1);
+ b2 = vlib_buffer_from_rte_mbuf (mb2);
+ b3 = vlib_buffer_from_rte_mbuf (mb3);
+
+ vlib_buffer_init_for_free_list (b0, fl);
+ vlib_buffer_init_for_free_list (b1, fl);
+ vlib_buffer_init_for_free_list (b2, fl);
+ vlib_buffer_init_for_free_list (b3, fl);
+
+ bi0 = vlib_get_buffer_index (vm, b0);
+ bi1 = vlib_get_buffer_index (vm, b1);
+ bi2 = vlib_get_buffer_index (vm, b2);
+ bi3 = vlib_get_buffer_index (vm, b3);
+
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ to_next[2] = bi2;
+ to_next[3] = bi3;
+ to_next += 4;
+ n_left_to_next -= 4;
+
+ if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
+ {
+ next0 = next1 = next2 = next3 = xd->per_interface_next_index;
+ }
+ else if (PREDICT_TRUE
+ ((xd->flags & DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE) != 0))
+ {
+ next0 = dpdk_rx_next_from_mb (mb0, b0);
+ next1 = dpdk_rx_next_from_mb (mb1, b1);
+ next2 = dpdk_rx_next_from_mb (mb2, b2);
+ next3 = dpdk_rx_next_from_mb (mb3, b3);
+ }
+ else
+ {
+ next0 = dpdk_rx_next_from_etype (mb0, b0);
+ next1 = dpdk_rx_next_from_etype (mb1, b1);
+ next2 = dpdk_rx_next_from_etype (mb2, b2);
+ next3 = dpdk_rx_next_from_etype (mb3, b3);
+ }
+
+ if (PREDICT_FALSE (or_ol_flags & PKT_RX_IP_CKSUM_BAD))
+ {
+ dpdk_rx_error_from_mb (mb0, &next0, &error0);
+ dpdk_rx_error_from_mb (mb1, &next1, &error1);
+ dpdk_rx_error_from_mb (mb2, &next2, &error2);
+ dpdk_rx_error_from_mb (mb3, &next3, &error3);
+ b0->error = node->errors[error0];
+ b1->error = node->errors[error1];
+ b2->error = node->errors[error2];
+ b3->error = node->errors[error3];
+ }
+ else
+ {
+ b0->error = b1->error = node->errors[DPDK_ERROR_NONE];
+ b2->error = b3->error = node->errors[DPDK_ERROR_NONE];
+ }
+
+ l3_offset0 = device_input_next_node_advance[next0];
+ l3_offset1 = device_input_next_node_advance[next1];
+ l3_offset2 = device_input_next_node_advance[next2];
+ l3_offset3 = device_input_next_node_advance[next3];
+
+ b0->current_data = l3_offset0 + mb0->data_off;
+ b1->current_data = l3_offset1 + mb1->data_off;
+ b2->current_data = l3_offset2 + mb2->data_off;
+ b3->current_data = l3_offset3 + mb3->data_off;
+
+ b0->current_data -= RTE_PKTMBUF_HEADROOM;
+ b1->current_data -= RTE_PKTMBUF_HEADROOM;
+ b2->current_data -= RTE_PKTMBUF_HEADROOM;
+ b3->current_data -= RTE_PKTMBUF_HEADROOM;
+
+ b0->current_length = mb0->data_len - l3_offset0;
+ b1->current_length = mb1->data_len - l3_offset1;
+ b2->current_length = mb2->data_len - l3_offset2;
+ b3->current_length = mb3->data_len - l3_offset3;
+
+ b0->flags = buffer_flags_template;
+ b1->flags = buffer_flags_template;
+ b2->flags = buffer_flags_template;
+ b3->flags = buffer_flags_template;
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer (b1)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer (b2)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer (b3)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ vnet_buffer (b2)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ vnet_buffer (b3)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+
+ n_rx_bytes += mb0->pkt_len;
+ n_rx_bytes += mb1->pkt_len;
+ n_rx_bytes += mb2->pkt_len;
+ n_rx_bytes += mb3->pkt_len;
+
+ /* Process subsequent segments of multi-segment packets */
+ if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG)
+ {
+ dpdk_process_subseq_segs (vm, b0, mb0, fl);
+ dpdk_process_subseq_segs (vm, b1, mb1, fl);
+ dpdk_process_subseq_segs (vm, b2, mb2, fl);
+ dpdk_process_subseq_segs (vm, b3, mb3, fl);
+ }
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b2);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b3);
+
+ /* Do we have any driver RX features configured on the interface? */
+ vnet_feature_start_device_input_x4 (xd->vlib_sw_if_index,
+ &next0, &next1, &next2, &next3,
+ b0, b1, b2, b3,
+ l3_offset0, l3_offset1,
+ l3_offset2, l3_offset3);
+
+ vlib_validate_buffer_enqueue_x4 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, bi2, bi3,
+ next0, next1, next2, next3);
+ n_buffers -= 4;
+ mb_index += 4;
+ }
+ while (n_buffers > 0 && n_left_to_next > 0)
+ {
+ struct rte_mbuf *mb0 = xd->rx_vectors[queue_id][mb_index];
+
+ ASSERT (mb0);
+
+ b0 = vlib_buffer_from_rte_mbuf (mb0);
+
+ /* Prefetch one next segment if it exists. */
+ if (PREDICT_FALSE (mb0->nb_segs > 1))
+ dpdk_prefetch_buffer (mb0->next);
+
+ vlib_buffer_init_for_free_list (b0, fl);
+
+ bi0 = vlib_get_buffer_index (vm, b0);
+
+ to_next[0] = bi0;
+ to_next++;
+ n_left_to_next--;
+
+ if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
+ next0 = xd->per_interface_next_index;
+ else if (PREDICT_TRUE
+ ((xd->flags & DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE) != 0))
+ next0 = dpdk_rx_next_from_mb (mb0, b0);
+ else
+ next0 = dpdk_rx_next_from_etype (mb0, b0);
+
+ dpdk_rx_error_from_mb (mb0, &next0, &error0);
+ b0->error = node->errors[error0];
+
+ l3_offset0 = device_input_next_node_advance[next0];
+
+ b0->current_data = l3_offset0;
+ b0->current_data += mb0->data_off - RTE_PKTMBUF_HEADROOM;
+ b0->current_length = mb0->data_len - l3_offset0;
+
+ b0->flags = buffer_flags_template;
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ n_rx_bytes += mb0->pkt_len;
+
+ /* Process subsequent segments of multi-segment packets */
+ dpdk_process_subseq_segs (vm, b0, mb0, fl);
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
+
+ /* Do we have any driver RX features configured on the interface? */
+ vnet_feature_start_device_input_x1 (xd->vlib_sw_if_index, &next0,
+ b0, l3_offset0);
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ n_buffers--;
+ mb_index++;
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[cpu_index]) > 0))
+ {
+ dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers[cpu_index],
+ vec_len (xd->d_trace_buffers[cpu_index]));
+ vlib_set_trace_count (vm, node, n_trace -
+ vec_len (xd->d_trace_buffers[cpu_index]));
+ }
+
+ vlib_increment_combined_counter
+ (vnet_get_main ()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes);
+
+ dpdk_worker_t *dw = vec_elt_at_index (dm->workers, cpu_index);
+ dw->aggregate_rx_packets += mb_index;
+
+ return mb_index;
+}
+
+/**
+ * Throttle the polling loop by sleeping between polls.
+ *
+ * A non-zero dm->poll_sleep is treated as a number of milliseconds to
+ * sleep; zero disables the rate limit and the function returns at once.
+ *
+ * @param dm DPDK main structure; only dm->poll_sleep is read.
+ */
+static inline void
+poll_rate_limit (dpdk_main_t * dm)
+{
+  /* Limit the poll rate by sleeping for N msec between polls */
+  if (PREDICT_FALSE (dm->poll_sleep != 0))
+    {
+      struct timespec ts, tsrem;
+
+      /*
+       * Split the interval into whole seconds plus a sub-second remainder.
+       * nanosleep() rejects tv_nsec values outside [0, 999999999] with
+       * EINVAL and leaves tsrem untouched, so putting the whole interval
+       * into tv_nsec made the retry loop below fail on every iteration
+       * for poll_sleep >= 1000 instead of sleeping.
+       */
+      ts.tv_sec = dm->poll_sleep / 1000;
+      ts.tv_nsec = (dm->poll_sleep % 1000) * 1000 * 1000;	/* msec -> nsec */
+
+      /* If the sleep is cut short, resume with the time still remaining. */
+      while (nanosleep (&ts, &tsrem) < 0)
+	{
+	  ts = tsrem;
+	}
+    }
+}
+
+/** \brief Main DPDK input node
+ @node dpdk-input
+
+ This is the main DPDK input node: across each assigned interface,
+ call rte_eth_rx_burst(...) or similar to obtain a vector of
+ packets to process. Handle early packet discard. Derive @c
+ vlib_buffer_t metadata from <code>struct rte_mbuf</code> metadata.
+ Depending on the resulting metadata: adjust <code>b->current_data,
+ b->current_length </code> and dispatch directly to
+ ip4-input-no-checksum, or ip6-input. Trace the packet if required.
+
+ @param vm vlib_main_t corresponding to the current thread
+ @param node vlib_node_runtime_t
+ @param f vlib_frame_t input-node, not used.
+
+ @par Graph mechanics: buffer metadata, next index usage
+
+ @em Uses:
+ - <code>struct rte_mbuf mb->ol_flags</code>
+ - PKT_RX_IP_CKSUM_BAD
+ - <code> RTE_ETH_IS_xxx_HDR(mb->packet_type) </code>
+ - packet classification result
+
+ @em Sets:
+ - <code>b->error</code> if the packet is to be dropped immediately
+ - <code>b->current_data, b->current_length</code>
+ - adjusted as needed to skip the L2 header in direct-dispatch cases
+ - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code>
+ - rx interface sw_if_index
+ - <code>vnet_buffer(b)->sw_if_index[VLIB_TX] = ~0</code>
+ - required by ipX-lookup
+ - <code>b->flags</code>
+ - to indicate multi-segment pkts (VLIB_BUFFER_NEXT_PRESENT), etc.
+
+ <em>Next Nodes:</em>
+ - Static arcs to: error-drop, ethernet-input,
+ ip4-input-no-checksum, ip6-input, mpls-input
+ - per-interface redirection, controlled by
+ <code>xd->per_interface_next_index</code>
+*/
+
+static uword
+dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+  dpdk_main_t *dm = &dpdk_main;
+  u32 cpu_index = os_get_cpu_number ();
+  dpdk_device_and_queue_t *dq;
+  uword n_rx_total = 0;
+
+  /*
+   * Walk every (device, queue) pair assigned to this cpu and accumulate
+   * the number of packets each one reports.
+   */
+  /* *INDENT-OFF* */
+  vec_foreach (dq, dm->devices_by_cpu[cpu_index])
+    {
+      dpdk_device_t *dev = vec_elt_at_index (dm->devices, dq->device);
+
+      n_rx_total += dpdk_device_input (dm, dev, node, cpu_index, dq->queue_id);
+    }
+  /* *INDENT-ON* */
+
+  /* Sleep between polls when a poll interval has been configured. */
+  poll_rate_limit (dm);
+
+  return n_rx_total;
+}
+
+/* *INDENT-OFF* */
+/*
+ * Registration of the "dpdk-input" graph node: an input-type node whose
+ * dispatch function is dpdk_input. It is declared as a sibling of
+ * "device-input" and is registered in the disabled state.
+ */
+VLIB_REGISTER_NODE (dpdk_input_node) = {
+ .function = dpdk_input,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .name = "dpdk-input",
+ .sibling_of = "device-input",
+
+ /* Will be enabled if/when hardware is detected. */
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ /* Formatters for buffer contents and for this node's trace records. */
+ .format_buffer = format_ethernet_header_with_length,
+ .format_trace = format_dpdk_rx_dma_trace,
+
+ /* Error counters: DPDK_N_ERROR entries described by dpdk_error_strings. */
+ .n_errors = DPDK_N_ERROR,
+ .error_strings = dpdk_error_strings,
+};
+
+/* NOTE(review): presumably emits CPU-specific variants of dpdk_input - confirm against the macro definition. */
+VLIB_NODE_FUNCTION_MULTIARCH (dpdk_input_node, dpdk_input);
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/qos_doc.md b/src/vnet/devices/dpdk/qos_doc.md
new file mode 100644
index 00000000000..9bd0659d616
--- /dev/null
+++ b/src/vnet/devices/dpdk/qos_doc.md
@@ -0,0 +1,404 @@
+# QoS Hierarchical Scheduler {#qos_doc}
+
+The Quality-of-Service (QoS) scheduler performs egress-traffic management by
+prioritizing the transmission of packets of different service types and
+subscribers based on the Service Level Agreements (SLAs). The QoS scheduler can
+be enabled on one or more NIC output interfaces depending upon the
+requirement.
+
+
+## Overview
+
+The QoS scheduler supports a number of scheduling and shaping levels which
+form a hierarchical tree. The first level in the hierarchy is port (i.e.
+the physical interface) that constitutes the root node of the tree. The
+subsequent level is subport which represents the group of the
+users/subscribers. The individual user/subscriber is represented by the pipe
+at the next level. Each user can have different traffic type based on the
+criteria of specific loss rate, jitter, and latency. These traffic types are
+represented at the traffic-class level in the form of different traffic-
+classes. The last level contains number of queues which are grouped together
+to host the packets of the specific class type traffic.
+
+The QoS scheduler implementation requires flow classification, enqueue and
+dequeue operations. The flow classification is mandatory stage for HQoS where
+incoming packets are classified by mapping the packet fields information to
+5-tuple (HQoS subport, pipe, traffic class, queue within traffic class, and
+color) and storing that information in mbuf sched field. The enqueue operation
+uses this information to determine the queue for storing the packet, and at
+this stage, if the specific queue is full, QoS drops the packet. The dequeue
+operation consists of scheduling the packet based on its length and available
+credits, and handing over the scheduled packet to the output interface.
+
+For more information on the QoS Scheduler, please refer to the DPDK
+Programmer's Guide: http://dpdk.org/doc/guides/prog_guide/qos_framework.html
+
+
+### QoS Scheduler Parameters
+
+Following illustrates the default HQoS configuration for each 10GbE output
+port:
+
+Single subport (subport 0):
+ - Subport rate set to 100% of port rate
+ - Each of the 4 traffic classes has rate set to 100% of port rate
+
+4K pipes per subport 0 (pipes 0 .. 4095) with identical configuration:
+ - Pipe rate set to 1/4K of port rate
+ - Each of the 4 traffic classes has rate set to 100% of pipe rate
+ - Within each traffic class, the byte-level WRR weights for the 4 queues are set to 1:1:1:1
+
+
+#### Port configuration
+
+```
+port {
+ rate 1250000000 /* Assuming 10GbE port */
+ frame_overhead 24 /* Overhead fields per Ethernet frame:
+ * 7B (Preamble) +
+ * 1B (Start of Frame Delimiter (SFD)) +
+ * 4B (Frame Check Sequence (FCS)) +
+ * 12B (Inter Frame Gap (IFG))
+ */
+ mtu 1522 /* Assuming Ethernet/IPv4 pkt (FCS not included) */
+ n_subports_per_port 1 /* Number of subports per output interface */
+ n_pipes_per_subport 4096 /* Number of pipes (users/subscribers) */
+ queue_sizes 64 64 64 64 /* Packet queue size for each traffic class.
+ * All queues within the same pipe traffic class
+ * have the same size. Queues from different
+ * pipes serving the same traffic class have
+ * the same size. */
+}
+```
+
+
+#### Subport configuration
+
+```
+subport 0 {
+ tb_rate 1250000000 /* Subport level token bucket rate (bytes per second) */
+ tb_size 1000000 /* Subport level token bucket size (bytes) */
+ tc0_rate 1250000000 /* Subport level token bucket rate for traffic class 0 (bytes per second) */
+ tc1_rate 1250000000 /* Subport level token bucket rate for traffic class 1 (bytes per second) */
+ tc2_rate 1250000000 /* Subport level token bucket rate for traffic class 2 (bytes per second) */
+ tc3_rate 1250000000 /* Subport level token bucket rate for traffic class 3 (bytes per second) */
+ tc_period 10 /* Time interval for refilling the token bucket associated with traffic class (Milliseconds) */
+ pipe 0 4095 profile 0 /* pipes (users/subscribers) configured with pipe profile 0 */
+}
+```
+
+
+#### Pipe configuration
+
+```
+pipe_profile 0 {
+ tb_rate 305175 /* Pipe level token bucket rate (bytes per second) */
+ tb_size 1000000 /* Pipe level token bucket size (bytes) */
+ tc0_rate 305175 /* Pipe level token bucket rate for traffic class 0 (bytes per second) */
+ tc1_rate 305175 /* Pipe level token bucket rate for traffic class 1 (bytes per second) */
+ tc2_rate 305175 /* Pipe level token bucket rate for traffic class 2 (bytes per second) */
+ tc3_rate 305175 /* Pipe level token bucket rate for traffic class 3 (bytes per second) */
+ tc_period 40 /* Time interval for refilling the token bucket associated with traffic class at pipe level (Milliseconds) */
+ tc3_oversubscription_weight 1 /* Weight traffic class 3 oversubscription */
+ tc0_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 0 */
+ tc1_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 1 */
+ tc2_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 2 */
+ tc3_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 3 */
+}
+```
+
+
+#### Random Early Detection (RED) parameters per traffic class and color (Green / Yellow / Red)
+
+```
+red {
+ tc0_wred_min 48 40 32 /* Minimum threshold for traffic class 0 queue (min_th) in number of packets */
+ tc0_wred_max 64 64 64 /* Maximum threshold for traffic class 0 queue (max_th) in number of packets */
+ tc0_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 0 queue (maxp = 1 / maxp_inv) */
+ tc0_wred_weight 9 9 9 /* Traffic Class 0 queue weight */
+ tc1_wred_min 48 40 32 /* Minimum threshold for traffic class 1 queue (min_th) in number of packets */
+ tc1_wred_max 64 64 64 /* Maximum threshold for traffic class 1 queue (max_th) in number of packets */
+ tc1_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 1 queue (maxp = 1 / maxp_inv) */
+ tc1_wred_weight 9 9 9 /* Traffic Class 1 queue weight */
+ tc2_wred_min 48 40 32 /* Minimum threshold for traffic class 2 queue (min_th) in number of packets */
+ tc2_wred_max 64 64 64 /* Maximum threshold for traffic class 2 queue (max_th) in number of packets */
+ tc2_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 2 queue (maxp = 1 / maxp_inv) */
+ tc2_wred_weight 9 9 9 /* Traffic Class 2 queue weight */
+ tc3_wred_min 48 40 32 /* Minimum threshold for traffic class 3 queue (min_th) in number of packets */
+ tc3_wred_max 64 64 64 /* Maximum threshold for traffic class 3 queue (max_th) in number of packets */
+ tc3_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 3 queue (maxp = 1 / maxp_inv) */
+ tc3_wred_weight 9 9 9 /* Traffic Class 3 queue weight */
+}
+```
+
+
+### DPDK QoS Scheduler Integration in VPP
+
+The Hierarchical Quality-of-Service (HQoS) scheduler object could be seen as
+part of the logical NIC output interface. To enable HQoS on specific output
+interface, vpp startup.conf file has to be configured accordingly. The output
+interface that requires HQoS, should have "hqos" parameter specified in dpdk
+section. Another optional parameter "hqos-thread" has been defined which can
+be used to associate the output interface with specific hqos thread. In cpu
+section of the config file, "corelist-hqos-threads" is introduced to assign
+logical cpu cores to run the HQoS threads. A HQoS thread can run multiple HQoS
+objects each associated with different output interfaces. All worker threads
+instead of writing packets to the NIC TX queue directly, write the packets to
+software queues. The hqos_threads read the software queues, and enqueue the
+packets to HQoS objects, as well as dequeue packets from HQOS objects and
+write them to NIC output interfaces. The worker threads need to be able to
+send the packets to any output interface, therefore, each HQoS object
+associated with NIC output interface should have software queues equal to
+worker threads count.
+
+Following illustrates the sample startup configuration file with 4x worker
+threads feeding 2x hqos threads that handle each QoS scheduler for 1x output
+interface.
+
+```
+dpdk {
+ socket-mem 16384,16384
+
+ dev 0000:02:00.0 {
+ num-rx-queues 2
+ hqos
+ }
+ dev 0000:06:00.0 {
+ num-rx-queues 2
+ hqos
+ }
+
+ num-mbufs 1000000
+}
+
+cpu {
+ main-core 0
+ corelist-workers 1, 2, 3, 4
+ corelist-hqos-threads 5, 6
+}
+```
+
+
+### QoS scheduler CLI Commands
+
+Each QoS scheduler instance is initialised with default parameters required to
+configure hqos port, subport, pipe and queues. Some of the parameters can be
+re-configured in run-time through CLI commands.
+
+
+#### Configuration
+
+Following commands can be used to configure QoS scheduler parameters.
+
+The command below can be used to set the subport level parameters such as
+token bucket rate (bytes per second), token bucket size (bytes), traffic
+class rates (bytes per second) and token update period (milliseconds).
+
+```
+set dpdk interface hqos subport <if-name> subport <n> [rate <n>]
+ [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] [period <n>]
+```
+
+For setting the pipe profile, following command can be used.
+
+```
+set dpdk interface hqos pipe <if-name> subport <n> pipe <n> profile <n>
+```
+
+To assign QoS scheduler instance to the specific thread, following command can
+be used.
+
+```
+set dpdk interface hqos placement <if-name> thread <n>
+```
+
+The command below is used to set the packet fields required for classifying
+the incoming packet. As a result of the classification process, packet field
+information will be mapped to a 5-tuple (subport, pipe, traffic class, queue,
+color) and stored in the packet mbuf.
+
+```
+set dpdk interface hqos pktfield <if-name> id <n> offset <n> mask <n>
+```
+
+The DSCP table entries used for identifying the traffic class and queue can be set using the command below.
+
+```
+set dpdk interface hqos tctbl <if-name> entry <n> tc <n> queue <n>
+```
+
+
+#### Show Command
+
+The QoS Scheduler configuration can be displayed using the command below.
+
+```
+ vpp# show dpdk interface hqos TenGigabitEthernet2/0/0
+ Thread:
+ Input SWQ size = 4096 packets
+ Enqueue burst size = 256 packets
+ Dequeue burst size = 220 packets
+ Packet field 0: slab position = 0, slab bitmask = 0x0000000000000000
+ Packet field 1: slab position = 40, slab bitmask = 0x0000000fff000000
+ Packet field 2: slab position = 8, slab bitmask = 0x00000000000000fc
+ Packet field 2 translation table:
+ [ 0 .. 15]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ [16 .. 31]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ [32 .. 47]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ [48 .. 63]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ Port:
+ Rate = 1250000000 bytes/second
+ MTU = 1514 bytes
+ Frame overhead = 24 bytes
+ Number of subports = 1
+ Number of pipes per subport = 4096
+ Packet queue size: TC0 = 64, TC1 = 64, TC2 = 64, TC3 = 64 packets
+ Number of pipe profiles = 1
+ Pipe profile 0:
+ Rate = 305175 bytes/second
+ Token bucket size = 1000000 bytes
+ Traffic class rate: TC0 = 305175, TC1 = 305175, TC2 = 305175, TC3 = 305175 bytes/second
+ TC period = 40 milliseconds
+ TC0 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+ TC1 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+ TC2 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+ TC3 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+```
+
+The QoS Scheduler placement over the logical cpu cores can be displayed using
+below command.
+
+```
+ vpp# show dpdk interface hqos placement
+ Thread 5 (vpp_hqos-threads_0 at lcore 5):
+ TenGigabitEthernet2/0/0 queue 0
+ Thread 6 (vpp_hqos-threads_1 at lcore 6):
+ TenGigabitEthernet4/0/1 queue 0
+```
+
+
+### QoS Scheduler Binary APIs
+
+This section explains the available binary APIs for configuring QoS scheduler
+parameters at run-time.
+
+The following API can be used to set the pipe profile of a pipe that belongs
+to a given subport:
+
+```
+sw_interface_set_dpdk_hqos_pipe rx <intfc> | sw_if_index <id>
+ subport <subport-id> pipe <pipe-id> profile <profile-id>
+```
+
+The data structures used for setting the pipe profile parameter are as follows:
+
+```
+ /** \\brief DPDK interface HQoS pipe profile set request
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param sw_if_index - the interface
+ @param subport - subport ID
+ @param pipe - pipe ID within its subport
+ @param profile - pipe profile ID
+ */
+ define sw_interface_set_dpdk_hqos_pipe {
+ u32 client_index;
+ u32 context;
+ u32 sw_if_index;
+ u32 subport;
+ u32 pipe;
+ u32 profile;
+ };
+
+ /** \\brief DPDK interface HQoS pipe profile set reply
+ @param context - sender context, to match reply w/ request
+ @param retval - request return code
+ */
+ define sw_interface_set_dpdk_hqos_pipe_reply {
+ u32 context;
+ i32 retval;
+ };
+```
+
+The following API can be used to set the subport level parameters, for
+example token bucket rate (bytes per second), token bucket size (bytes),
+traffic class rate (bytes per second) and token update period.
+
+```
+sw_interface_set_dpdk_hqos_subport rx <intfc> | sw_if_index <id>
+ subport <subport-id> [rate <n>] [bktsize <n>]
+ [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] [period <n>]
+```
+
+The data structures used for setting the subport level parameters are as follows:
+
+```
+ /** \\brief DPDK interface HQoS subport parameters set request
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param sw_if_index - the interface
+ @param subport - subport ID
+ @param tb_rate - subport token bucket rate (measured in bytes/second)
+ @param tb_size - subport token bucket size (measured in credits)
+ @param tc_rate - subport traffic class 0 .. 3 rates (measured in bytes/second)
+ @param tc_period - enforcement period for rates (measured in milliseconds)
+ */
+ define sw_interface_set_dpdk_hqos_subport {
+ u32 client_index;
+ u32 context;
+ u32 sw_if_index;
+ u32 subport;
+ u32 tb_rate;
+ u32 tb_size;
+ u32 tc_rate[4];
+ u32 tc_period;
+ };
+
+ /** \\brief DPDK interface HQoS subport parameters set reply
+ @param context - sender context, to match reply w/ request
+ @param retval - request return code
+ */
+ define sw_interface_set_dpdk_hqos_subport_reply {
+ u32 context;
+ i32 retval;
+ };
+```
+
+The following API can be used to set a DSCP table entry. The DSCP table has
+64 entries to map the packet DSCP field onto traffic class and hqos input
+queue.
+
+```
+sw_interface_set_dpdk_hqos_tctbl rx <intfc> | sw_if_index <id>
+ entry <n> tc <n> queue <n>
+```
+
+The data structures used for setting DSCP table entries are given below.
+
+```
+ /** \\brief DPDK interface HQoS tctbl entry set request
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param sw_if_index - the interface
+ @param entry - entry index ID
+ @param tc - traffic class (0 .. 3)
+ @param queue - traffic class queue (0 .. 3)
+ */
+ define sw_interface_set_dpdk_hqos_tctbl {
+ u32 client_index;
+ u32 context;
+ u32 sw_if_index;
+ u32 entry;
+ u32 tc;
+ u32 queue;
+ };
+
+ /** \\brief DPDK interface HQoS tctbl entry set reply
+ @param context - sender context, to match reply w/ request
+ @param retval - request return code
+ */
+ define sw_interface_set_dpdk_hqos_tctbl_reply {
+ u32 context;
+ i32 retval;
+ };
+```