aboutsummaryrefslogtreecommitdiffstats
path: root/src/plugins/dpdk/device
diff options
context:
space:
mode:
authorSachin Saxena <sachin.saxena@freescale.com>2018-02-28 20:28:52 +0530
committerSachin Saxena <sachin.saxena@nxp.com>2018-02-28 20:34:56 +0530
commit0689fce93ba269c48f83a2f70f971b3976d04c90 (patch)
tree4cc2908df3598507cc1828ac19d8c43b22450ffa /src/plugins/dpdk/device
parent746b57564deede624261ab8a96c94f562f24d22c (diff)
parentd594711a5d79859a7d0bde83a516f7ab52051d9b (diff)
Merge branch 'stable/1710' of https://gerrit.fd.io/r/vpp into 17101710
Diffstat (limited to 'src/plugins/dpdk/device')
-rw-r--r--src/plugins/dpdk/device/cli.c1955
-rw-r--r--src/plugins/dpdk/device/common.c315
-rw-r--r--src/plugins/dpdk/device/device.c856
-rw-r--r--src/plugins/dpdk/device/dir.dox27
-rw-r--r--src/plugins/dpdk/device/dpdk.h483
-rw-r--r--src/plugins/dpdk/device/dpdk_priv.h135
-rw-r--r--src/plugins/dpdk/device/format.c804
-rwxr-xr-xsrc/plugins/dpdk/device/init.c1589
-rw-r--r--src/plugins/dpdk/device/node.c704
9 files changed, 6868 insertions, 0 deletions
diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c
new file mode 100644
index 00000000..c9fcea5c
--- /dev/null
+++ b/src/plugins/dpdk/device/cli.c
@@ -0,0 +1,1955 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/xxhash.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <dpdk/device/dpdk.h>
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/mpls/packet.h>
+
+#include <dpdk/device/dpdk_priv.h>
+
+/**
+ * @file
+ * @brief CLI for DPDK Abstraction Layer and pcap Tx Trace.
+ *
+ * This file contains the source code for CLI for DPDK
+ * Abstraction Layer and pcap Tx Trace.
+ */
+
+
+static clib_error_t *
+get_hqos (u32 hw_if_index, u32 subport_id, dpdk_device_t ** xd,
+ dpdk_device_config_t ** devconf)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_hw_interface_t *hw;
+ struct rte_eth_dev_info dev_info;
+ uword *p = 0;
+ clib_error_t *error = NULL;
+
+
+ if (hw_if_index == (u32) ~ 0)
+ {
+ error = clib_error_return (0, "please specify valid interface name");
+ goto done;
+ }
+
+ if (subport_id != 0)
+ {
+ error = clib_error_return (0, "Invalid subport");
+ goto done;
+ }
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ *xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ rte_eth_dev_info_get ((*xd)->device_index, &dev_info);
+ if (dev_info.pci_dev)
+ { /* bonded interface has no pci info */
+ vlib_pci_addr_t pci_addr;
+
+ pci_addr.domain = dev_info.pci_dev->addr.domain;
+ pci_addr.bus = dev_info.pci_dev->addr.bus;
+ pci_addr.slot = dev_info.pci_dev->addr.devid;
+ pci_addr.function = dev_info.pci_dev->addr.function;
+
+ p =
+ hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+ }
+
+ if (p)
+ (*devconf) = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+ else
+ (*devconf) = &dm->conf->default_devconf;
+
+done:
+ return error;
+}
+
+static clib_error_t *
+pcap_trace_command_fn (vlib_main_t * vm,
+ unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+#define PCAP_DEF_PKT_TO_CAPTURE (100)
+
+ unformat_input_t _line_input, *line_input = &_line_input;
+ dpdk_main_t *dm = &dpdk_main;
+ u8 *filename;
+ u8 *chroot_filename = 0;
+ u32 max = 0;
+ int enabled = 0;
+ int errorFlag = 0;
+ clib_error_t *error = 0;
+
+ /* Get a line of input. */
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "on"))
+ {
+ if (dm->tx_pcap_enable == 0)
+ {
+ enabled = 1;
+ }
+ else
+ {
+ vlib_cli_output (vm, "pcap tx capture already on...");
+ errorFlag = 1;
+ break;
+ }
+ }
+ else if (unformat (line_input, "off"))
+ {
+ if (dm->tx_pcap_enable)
+ {
+ vlib_cli_output (vm, "captured %d pkts...",
+ dm->pcap_main.n_packets_captured + 1);
+ if (dm->pcap_main.n_packets_captured)
+ {
+ dm->pcap_main.n_packets_to_capture =
+ dm->pcap_main.n_packets_captured;
+ error = pcap_write (&dm->pcap_main);
+ if (error)
+ clib_error_report (error);
+ else
+ vlib_cli_output (vm, "saved to %s...", dm->pcap_filename);
+ }
+
+ dm->tx_pcap_enable = 0;
+ }
+ else
+ {
+ vlib_cli_output (vm, "pcap tx capture already off...");
+ errorFlag = 1;
+ break;
+ }
+ }
+ else if (unformat (line_input, "max %d", &max))
+ {
+ if (dm->tx_pcap_enable)
+ {
+ vlib_cli_output (vm,
+ "can't change max value while pcap tx capture active...");
+ errorFlag = 1;
+ break;
+ }
+ }
+ else if (unformat (line_input, "intfc %U",
+ unformat_vnet_sw_interface, dm->vnet_main,
+ &dm->pcap_sw_if_index))
+ ;
+
+ else if (unformat (line_input, "intfc any"))
+ {
+ dm->pcap_sw_if_index = 0;
+ }
+ else if (unformat (line_input, "file %s", &filename))
+ {
+ if (dm->tx_pcap_enable)
+ {
+ vlib_cli_output (vm,
+ "can't change file while pcap tx capture active...");
+ errorFlag = 1;
+ break;
+ }
+
+ /* Brain-police user path input */
+ if (strstr ((char *) filename, "..")
+ || index ((char *) filename, '/'))
+ {
+ vlib_cli_output (vm, "illegal characters in filename '%s'",
+ filename);
+ vlib_cli_output (vm,
+ "Hint: Only filename, do not enter directory structure.");
+ vec_free (filename);
+ errorFlag = 1;
+ break;
+ }
+
+ chroot_filename = format (0, "/tmp/%s%c", filename, 0);
+ vec_free (filename);
+ }
+ else if (unformat (line_input, "status"))
+ {
+ if (dm->pcap_sw_if_index == 0)
+ {
+ vlib_cli_output (vm, "max is %d for any interface to file %s",
+ dm->
+ pcap_pkts_to_capture ? dm->pcap_pkts_to_capture
+ : PCAP_DEF_PKT_TO_CAPTURE,
+ dm->
+ pcap_filename ? dm->pcap_filename : (u8 *)
+ "/tmp/vpe.pcap");
+ }
+ else
+ {
+ vlib_cli_output (vm, "max is %d for interface %U to file %s",
+ dm->
+ pcap_pkts_to_capture ? dm->pcap_pkts_to_capture
+ : PCAP_DEF_PKT_TO_CAPTURE,
+ format_vnet_sw_if_index_name, dm->vnet_main,
+ dm->pcap_sw_if_index,
+ dm->
+ pcap_filename ? dm->pcap_filename : (u8 *)
+ "/tmp/vpe.pcap");
+ }
+
+ if (dm->tx_pcap_enable == 0)
+ {
+ vlib_cli_output (vm, "pcap tx capture is off...");
+ }
+ else
+ {
+ vlib_cli_output (vm, "pcap tx capture is on: %d of %d pkts...",
+ dm->pcap_main.n_packets_captured,
+ dm->pcap_main.n_packets_to_capture);
+ }
+ break;
+ }
+
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, line_input);
+ errorFlag = 1;
+ break;
+ }
+ }
+ unformat_free (line_input);
+
+
+ if (errorFlag == 0)
+ {
+ /* Since no error, save configured values. */
+ if (chroot_filename)
+ {
+ if (dm->pcap_filename)
+ vec_free (dm->pcap_filename);
+ vec_add1 (chroot_filename, 0);
+ dm->pcap_filename = chroot_filename;
+ }
+
+ if (max)
+ dm->pcap_pkts_to_capture = max;
+
+
+ if (enabled)
+ {
+ if (dm->pcap_filename == 0)
+ dm->pcap_filename = format (0, "/tmp/vpe.pcap%c", 0);
+
+ memset (&dm->pcap_main, 0, sizeof (dm->pcap_main));
+ dm->pcap_main.file_name = (char *) dm->pcap_filename;
+ dm->pcap_main.n_packets_to_capture = PCAP_DEF_PKT_TO_CAPTURE;
+ if (dm->pcap_pkts_to_capture)
+ dm->pcap_main.n_packets_to_capture = dm->pcap_pkts_to_capture;
+
+ dm->pcap_main.packet_type = PCAP_PACKET_TYPE_ethernet;
+ dm->tx_pcap_enable = 1;
+ vlib_cli_output (vm, "pcap tx capture on...");
+ }
+ }
+ else if (chroot_filename)
+ vec_free (chroot_filename);
+
+
+ return error;
+}
+
+/*?
+ * This command is used to start or stop a packet capture, or show
+ * the status of packet capture.
+ *
+ * This command has the following optional parameters:
+ *
+ * - <b>on|off</b> - Used to start or stop a packet capture.
+ *
+ * - <b>max <nn></b> - Depth of local buffer. Once '<em>nn</em>' number
+ * of packets have been received, buffer is flushed to file. Once another
+ * '<em>nn</em>' number of packets have been received, buffer is flushed
+ * to file, overwriting previous write. If not entered, value defaults
+ * to 100. Can only be updated if packet capture is off.
+ *
+ * - <b>intfc <interface>|any</b> - Used to specify a given interface,
+ * or use '<em>any</em>' to run packet capture on all interfaces.
+ * '<em>any</em>' is the default if not provided. Settings from a previous
+ * packet capture are preserved, so '<em>any</em>' can be used to reset
+ * the interface setting.
+ *
+ * - <b>file <name></b> - Used to specify the output filename. The file will
+ * be placed in the '<em>/tmp</em>' directory, so only the filename is
+ * supported. Directory should not be entered. If file already exists, file
+ * will be overwritten. If no filename is provided, '<em>/tmp/vpe.pcap</em>'
+ * will be used. Can only be updated if packet capture is off.
+ *
+ * - <b>status</b> - Displays the current status and configured attributes
+ * associated with a packet capture. If packet capture is in progress,
+ * '<em>status</em>' also will return the number of packets currently in
+ * the local buffer. All additional attributes entered on command line
+ * with '<em>status</em>' will be ingnored and not applied.
+ *
+ * @cliexpar
+ * Example of how to display the status of a tx packet capture when off:
+ * @cliexstart{pcap tx trace status}
+ * max is 100, for any interface to file /tmp/vpe.pcap
+ * pcap tx capture is off...
+ * @cliexend
+ * Example of how to start a tx packet capture:
+ * @cliexstart{pcap tx trace on max 35 intfc GigabitEthernet0/8/0 file vppTest.pcap}
+ * pcap tx capture on...
+ * @cliexend
+ * Example of how to display the status of a tx packet capture in progress:
+ * @cliexstart{pcap tx trace status}
+ * max is 35, for interface GigabitEthernet0/8/0 to file /tmp/vppTest.pcap
+ * pcap tx capture is on: 20 of 35 pkts...
+ * @cliexend
+ * Example of how to stop a tx packet capture:
+ * @cliexstart{vppctl pcap tx trace off}
+ * captured 21 pkts...
+ * saved to /tmp/vppTest.pcap...
+ * @cliexend
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (pcap_trace_command, static) = {
+ .path = "pcap tx trace",
+ .short_help =
+ "pcap tx trace [on|off] [max <nn>] [intfc <interface>|any] [file <name>] [status]",
+ .function = pcap_trace_command_fn,
+};
+/* *INDENT-ON* */
+
+
+static clib_error_t *
+show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ struct rte_mempool *rmp;
+ int i;
+
+ for (i = 0; i < vec_len (dpdk_main.pktmbuf_pools); i++)
+ {
+ rmp = dpdk_main.pktmbuf_pools[i];
+ if (rmp)
+ {
+ unsigned count = rte_mempool_avail_count (rmp);
+ unsigned free_count = rte_mempool_in_use_count (rmp);
+
+ vlib_cli_output (vm,
+ "name=\"%s\" available = %7d allocated = %7d total = %7d\n",
+ rmp->name, (u32) count, (u32) free_count,
+ (u32) (count + free_count));
+ }
+ else
+ {
+ vlib_cli_output (vm, "rte_mempool is NULL (!)\n");
+ }
+ }
+ return 0;
+}
+
+/*?
+ * This command displays statistics of each DPDK mempool.
+ *
+ * @cliexpar
+ * Example of how to display DPDK buffer data:
+ * @cliexstart{show dpdk buffer}
+ * name="mbuf_pool_socket0" available = 15104 allocated = 1280 total = 16384
+ * @cliexend
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_dpdk_bufferr,static) = {
+ .path = "show dpdk buffer",
+ .short_help = "show dpdk buffer",
+ .function = show_dpdk_buffer,
+ .is_mp_safe = 1,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+test_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ static u32 *allocated_buffers;
+ u32 n_alloc = 0;
+ u32 n_free = 0;
+ u32 first, actual_alloc;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "allocate %d", &n_alloc))
+ ;
+ else if (unformat (input, "free %d", &n_free))
+ ;
+ else
+ break;
+ }
+
+ if (n_free)
+ {
+ if (vec_len (allocated_buffers) < n_free)
+ return clib_error_return (0, "Can't free %d, only %d allocated",
+ n_free, vec_len (allocated_buffers));
+
+ first = vec_len (allocated_buffers) - n_free;
+ vlib_buffer_free (vm, allocated_buffers + first, n_free);
+ _vec_len (allocated_buffers) = first;
+ }
+ if (n_alloc)
+ {
+ first = vec_len (allocated_buffers);
+ vec_validate (allocated_buffers,
+ vec_len (allocated_buffers) + n_alloc - 1);
+
+ actual_alloc = vlib_buffer_alloc (vm, allocated_buffers + first,
+ n_alloc);
+ _vec_len (allocated_buffers) = first + actual_alloc;
+
+ if (actual_alloc < n_alloc)
+ vlib_cli_output (vm, "WARNING: only allocated %d buffers",
+ actual_alloc);
+ }
+
+ vlib_cli_output (vm, "Currently %d buffers allocated",
+ vec_len (allocated_buffers));
+
+ if (allocated_buffers && vec_len (allocated_buffers) == 0)
+ vec_free (allocated_buffers);
+
+ return 0;
+}
+
+/*?
+ * This command tests the allocation and freeing of DPDK buffers.
+ * If both '<em>allocate</em>' and '<em>free</em>' are entered on the
+ * same command, the '<em>free</em>' is executed first. If no
+ * parameters are provided, this command display how many DPDK buffers
+ * the test command has allocated.
+ *
+ * @cliexpar
+ * @parblock
+ *
+ * Example of how to display how many DPDK buffer test command has allcoated:
+ * @cliexstart{test dpdk buffer}
+ * Currently 0 buffers allocated
+ * @cliexend
+ *
+ * Example of how to allocate DPDK buffers using the test command:
+ * @cliexstart{test dpdk buffer allocate 10}
+ * Currently 10 buffers allocated
+ * @cliexend
+ *
+ * Example of how to free DPDK buffers allocated by the test command:
+ * @cliexstart{test dpdk buffer free 10}
+ * Currently 0 buffers allocated
+ * @cliexend
+ * @endparblock
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_test_dpdk_buffer,static) = {
+ .path = "test dpdk buffer",
+ .short_help = "test dpdk buffer [allocate <nn>] [free <nn>]",
+ .function = test_dpdk_buffer,
+ .is_mp_safe = 1,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+set_dpdk_if_desc (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 nb_rx_desc = (u32) ~ 0;
+ u32 nb_tx_desc = (u32) ~ 0;
+ clib_error_t *error = NULL;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "tx %d", &nb_tx_desc))
+ ;
+ else if (unformat (line_input, "rx %d", &nb_rx_desc))
+ ;
+ else
+ {
+ error = clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ goto done;
+ }
+ }
+
+ if (hw_if_index == (u32) ~ 0)
+ {
+ error = clib_error_return (0, "please specify valid interface name");
+ goto done;
+ }
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0)
+ {
+ error =
+ clib_error_return (0,
+ "number of descriptors can be set only for "
+ "physical devices");
+ goto done;
+ }
+
+ if ((nb_rx_desc == (u32) ~ 0 || nb_rx_desc == xd->nb_rx_desc) &&
+ (nb_tx_desc == (u32) ~ 0 || nb_tx_desc == xd->nb_tx_desc))
+ {
+ error = clib_error_return (0, "nothing changed");
+ goto done;
+ }
+
+ if (nb_rx_desc != (u32) ~ 0)
+ xd->nb_rx_desc = nb_rx_desc;
+
+ if (nb_tx_desc != (u32) ~ 0)
+ xd->nb_tx_desc = nb_tx_desc;
+
+ dpdk_device_setup (xd);
+
+ if (vec_len (xd->errors))
+ return clib_error_return (0, "%U", format_dpdk_device_errors, xd);
+
+done:
+ unformat_free (line_input);
+
+ return error;
+}
+
+/*?
+ * This command sets the number of DPDK '<em>rx</em>' and
+ * '<em>tx</em>' descriptors for the given physical interface. Use
+ * the command '<em>show hardware-interface</em>' to display the
+ * current descriptor allocation.
+ *
+ * @cliexpar
+ * Example of how to set the DPDK interface descriptors:
+ * @cliexcmd{set dpdk interface descriptors GigabitEthernet0/8/0 rx 512 tx 512}
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_desc,static) = {
+ .path = "set dpdk interface descriptors",
+ .short_help = "set dpdk interface descriptors <interface> [rx <nn>] [tx <nn>]",
+ .function = set_dpdk_if_desc,
+};
+/* *INDENT-ON* */
+
+static int
+dpdk_device_queue_sort (void *a1, void *a2)
+{
+ dpdk_device_and_queue_t *dq1 = a1;
+ dpdk_device_and_queue_t *dq2 = a2;
+
+ if (dq1->device > dq2->device)
+ return 1;
+ else if (dq1->device < dq2->device)
+ return -1;
+ else if (dq1->queue_id > dq2->queue_id)
+ return 1;
+ else if (dq1->queue_id < dq2->queue_id)
+ return -1;
+ else
+ return 0;
+}
+
+
+static clib_error_t *
+show_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_and_queue_t *dq;
+ int cpu;
+
+ if (tm->n_vlib_mains == 1)
+ vlib_cli_output (vm, "All interfaces are handled by main thread");
+
+ for (cpu = 0; cpu < vec_len (dm->devices_by_hqos_cpu); cpu++)
+ {
+ if (cpu >= dm->hqos_cpu_first_index &&
+ cpu < (dm->hqos_cpu_first_index + dm->hqos_cpu_count))
+ vlib_cli_output (vm, "Thread %u (%s at lcore %u):", cpu,
+ vlib_worker_threads[cpu].name,
+ vlib_worker_threads[cpu].lcore_id);
+
+ vec_foreach (dq, dm->devices_by_hqos_cpu[cpu])
+ {
+ u32 hw_if_index = dm->devices[dq->device].hw_if_index;
+ vnet_hw_interface_t *hi =
+ vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ vlib_cli_output (vm, " %v queue %u", hi->name, dq->queue_id);
+ }
+ }
+ return 0;
+}
+
+/*?
+ * This command is used to display the thread and core each
+ * DPDK output interface and HQoS queue is assigned too.
+ *
+ * @cliexpar
+ * Example of how to display the DPDK output interface and HQoS queue placement:
+ * @cliexstart{show dpdk interface hqos placement}
+ * Thread 1 (vpp_hqos-threads_0 at lcore 3):
+ * GigabitEthernet0/8/0 queue 0
+ * Thread 2 (vpp_hqos-threads_1 at lcore 4):
+ * GigabitEthernet0/9/0 queue 0
+ * @cliexend
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_dpdk_if_hqos_placement, static) = {
+ .path = "show dpdk interface hqos placement",
+ .short_help = "show dpdk interface hqos placement",
+ .function = show_dpdk_if_hqos_placement,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+set_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_and_queue_t *dq;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 cpu = (u32) ~ 0;
+ int i;
+ clib_error_t *error = NULL;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "thread %d", &cpu))
+ ;
+ else
+ {
+ error = clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ goto done;
+ }
+ }
+
+ if (hw_if_index == (u32) ~ 0)
+ return clib_error_return (0, "please specify valid interface name");
+
+ if (cpu < dm->hqos_cpu_first_index ||
+ cpu >= (dm->hqos_cpu_first_index + dm->hqos_cpu_count))
+ {
+ error = clib_error_return (0, "please specify valid thread id");
+ goto done;
+ }
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ for (i = 0; i < vec_len (dm->devices_by_hqos_cpu); i++)
+ {
+ vec_foreach (dq, dm->devices_by_hqos_cpu[i])
+ {
+ if (hw_if_index == dm->devices[dq->device].hw_if_index)
+ {
+ if (cpu == i) /* nothing to do */
+ goto done;
+
+ vec_del1 (dm->devices_by_hqos_cpu[i],
+ dq - dm->devices_by_hqos_cpu[i]);
+ vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+ dq->queue_id = 0;
+ dq->device = xd->device_index;
+
+ vec_sort_with_function (dm->devices_by_hqos_cpu[i],
+ dpdk_device_queue_sort);
+
+ vec_sort_with_function (dm->devices_by_hqos_cpu[cpu],
+ dpdk_device_queue_sort);
+
+ goto done;
+ }
+ }
+ }
+
+ error = clib_error_return (0, "not found");
+
+done:
+ unformat_free (line_input);
+
+ return error;
+}
+
+/*?
+ * This command is used to assign a given DPDK output interface and
+ * HQoS queue to a different thread. This will not create a thread,
+ * so the thread must already exist. Use '<em>/etc/vpp/startup.conf</em>'
+ * for the initial thread creation. See @ref qos_doc for more details.
+ *
+ * @cliexpar
+ * Example of how to display the DPDK output interface and HQoS queue placement:
+ * @cliexstart{show dpdk interface hqos placement}
+ * Thread 1 (vpp_hqos-threads_0 at lcore 3):
+ * GigabitEthernet0/8/0 queue 0
+ * Thread 2 (vpp_hqos-threads_1 at lcore 4):
+ * GigabitEthernet0/9/0 queue 0
+ * @cliexend
+ * Example of how to assign a DPDK output interface and HQoS queue to a thread:
+ * @cliexcmd{set dpdk interface hqos placement GigabitEthernet0/8/0 thread 2}
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_placement, static) = {
+ .path = "set dpdk interface hqos placement",
+ .short_help = "set dpdk interface hqos placement <interface> thread <n>",
+ .function = set_dpdk_if_hqos_placement,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+set_dpdk_if_hqos_pipe (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 subport_id = (u32) ~ 0;
+ u32 pipe_id = (u32) ~ 0;
+ u32 profile_id = (u32) ~ 0;
+ int rv;
+ clib_error_t *error = NULL;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "subport %d", &subport_id))
+ ;
+ else if (unformat (line_input, "pipe %d", &pipe_id))
+ ;
+ else if (unformat (line_input, "profile %d", &profile_id))
+ ;
+ else
+ {
+ error = clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ goto done;
+ }
+ }
+
+ if (hw_if_index == (u32) ~ 0)
+ {
+ error = clib_error_return (0, "please specify valid interface name");
+ goto done;
+ }
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ rv =
+ rte_sched_pipe_config (xd->hqos_ht->hqos, subport_id, pipe_id,
+ profile_id);
+ if (rv)
+ {
+ error = clib_error_return (0, "pipe configuration failed");
+ goto done;
+ }
+
+done:
+ unformat_free (line_input);
+
+ return error;
+}
+
+/*?
+ * This command is used to change the profile associate with a HQoS pipe. The
+ * '<em><profile_id></em>' is zero based. Use the command
+ * '<em>show dpdk interface hqos</em>' to display the content of each profile.
+ * See @ref qos_doc for more details.
+ *
+ * @note
+ * Currently there is not an API to create a new HQoS pipe profile. One is
+ * created by default in the code (search for '<em>hqos_pipe_params_default</em>'').
+ * Additional profiles can be created in code and code recompiled. Then use this
+ * command to assign it.
+ *
+ * @cliexpar
+ * Example of how to assign a new profile to a HQoS pipe:
+ * @cliexcmd{set dpdk interface hqos pipe GigabitEthernet0/8/0 subport 0 pipe 2 profile 1}
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_pipe, static) =
+{
+ .path = "set dpdk interface hqos pipe",
+ .short_help = "set dpdk interface hqos pipe <interface> subport <subport_id> pipe <pipe_id> "
+ "profile <profile_id>",
+ .function = set_dpdk_if_hqos_pipe,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+set_dpdk_if_hqos_subport (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = NULL;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 subport_id = (u32) ~ 0;
+ struct rte_sched_subport_params p;
+ int rv;
+ clib_error_t *error = NULL;
+ u32 tb_rate = (u32) ~ 0;
+ u32 tb_size = (u32) ~ 0;
+ u32 tc_rate[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE] =
+ { (u32) ~ 0, (u32) ~ 0, (u32) ~ 0, (u32) ~ 0 };
+ u32 tc_period = (u32) ~ 0;
+ dpdk_device_config_t *devconf = NULL;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "subport %d", &subport_id))
+ ;
+ else if (unformat (line_input, "rate %d", &tb_rate))
+ ;
+ else if (unformat (line_input, "bktsize %d", &tb_size))
+ ;
+ else if (unformat (line_input, "tc0 %d", &tc_rate[0]))
+ ;
+ else if (unformat (line_input, "tc1 %d", &tc_rate[1]))
+ ;
+ else if (unformat (line_input, "tc2 %d", &tc_rate[2]))
+ ;
+ else if (unformat (line_input, "tc3 %d", &tc_rate[3]))
+ ;
+ else if (unformat (line_input, "period %d", &tc_period))
+ ;
+ else
+ {
+ error = clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ goto done;
+ }
+ }
+
+ error = get_hqos (hw_if_index, subport_id, &xd, &devconf);
+
+ if (error == NULL)
+ {
+ /* Copy the current values over to local structure. */
+ memcpy (&p, &devconf->hqos.subport[subport_id], sizeof (p));
+
+ /* Update local structure with input values. */
+ if (tb_rate != (u32) ~ 0)
+ {
+ p.tb_rate = tb_rate;
+ p.tc_rate[0] = tb_rate;
+ p.tc_rate[1] = tb_rate;
+ p.tc_rate[2] = tb_rate;
+ p.tc_rate[3] = tb_rate;
+ }
+ if (tb_size != (u32) ~ 0)
+ {
+ p.tb_size = tb_size;
+ }
+ if (tc_rate[0] != (u32) ~ 0)
+ {
+ p.tc_rate[0] = tc_rate[0];
+ }
+ if (tc_rate[1] != (u32) ~ 0)
+ {
+ p.tc_rate[1] = tc_rate[1];
+ }
+ if (tc_rate[2] != (u32) ~ 0)
+ {
+ p.tc_rate[2] = tc_rate[2];
+ }
+ if (tc_rate[3] != (u32) ~ 0)
+ {
+ p.tc_rate[3] = tc_rate[3];
+ }
+ if (tc_period != (u32) ~ 0)
+ {
+ p.tc_period = tc_period;
+ }
+
+ /* Apply changes. */
+ rv = rte_sched_subport_config (xd->hqos_ht->hqos, subport_id, &p);
+ if (rv)
+ {
+ error = clib_error_return (0, "subport configuration failed");
+ goto done;
+ }
+ else
+ {
+ /* Successfully applied, so save of the input values. */
+ memcpy (&devconf->hqos.subport[subport_id], &p, sizeof (p));
+ }
+ }
+
+done:
+ unformat_free (line_input);
+
+ return error;
+}
+
+/*?
+ * This command is used to set the subport level parameters such as token
+ * bucket rate (bytes per seconds), token bucket size (bytes), traffic class
+ * rates (bytes per seconds) and token update period (Milliseconds).
+ *
+ * By default, the '<em>rate</em>' is set to 1250000000 bytes/second (10GbE
+ * rate) and each of the four traffic classes is set to 100% of the port rate.
+ * If the '<em>rate</em>' is updated by this command, all four traffic classes
+ * are assigned the same value. Each of the four traffic classes can be updated
+ * individually.
+ *
+ * @cliexpar
+ * Example of how modify the subport attributes for a 1GbE link:
+ * @cliexcmd{set dpdk interface hqos subport GigabitEthernet0/8/0 subport 0 rate 125000000}
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_subport, static) = {
+ .path = "set dpdk interface hqos subport",
+ .short_help = "set dpdk interface hqos subport <interface> subport <subport_id> "
+ "[rate <n>] [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] "
+ "[period <n>]",
+ .function = set_dpdk_if_hqos_subport,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+set_dpdk_if_hqos_tctbl (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 tc = (u32) ~ 0;
+ u32 queue = (u32) ~ 0;
+ u32 entry = (u32) ~ 0;
+ u32 val, i;
+ clib_error_t *error = NULL;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "entry %d", &entry))
+ ;
+ else if (unformat (line_input, "tc %d", &tc))
+ ;
+ else if (unformat (line_input, "queue %d", &queue))
+ ;
+ else
+ {
+ error = clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ goto done;
+ }
+ }
+
+ if (hw_if_index == (u32) ~ 0)
+ {
+ error = clib_error_return (0, "please specify valid interface name");
+ goto done;
+ }
+ if (entry >= 64)
+ {
+ error = clib_error_return (0, "invalid entry");
+ goto done;
+ }
+ if (tc >= RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE)
+ {
+ error = clib_error_return (0, "invalid traffic class");
+ goto done;
+ }
+ if (queue >= RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS)
+ {
+ error = clib_error_return (0, "invalid traffic class queue");
+ goto done;
+ }
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ /* Detect the set of worker threads */
+ uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ /* Should never happen, shut up Coverity warning */
+ if (p == 0)
+ {
+ error = clib_error_return (0, "no worker registrations?");
+ goto done;
+ }
+
+ vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0];
+ int worker_thread_first = tr->first_index;
+ int worker_thread_count = tr->count;
+
+ val = tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue;
+ for (i = 0; i < worker_thread_count; i++)
+ xd->hqos_wt[worker_thread_first + i].hqos_tc_table[entry] = val;
+
+done:
+ unformat_free (line_input);
+
+ return error;
+}
+
+/*?
+ * This command is used to set the traffic class translation table. The
+ * traffic class translation table is used to map 64 values (0-63) to one of
+ * four traffic class and one of four HQoS input queue. Use the '<em>show
+ * dpdk interface hqos</em>' command to display the traffic class translation
+ * table. See @ref qos_doc for more details.
+ *
+ * This command has the following parameters:
+ *
+ * - <b><interface></b> - Used to specify the output interface.
+ *
+ * - <b>entry <map_val></b> - Mapped value (0-63) to assign traffic class and queue to.
+ *
+ * - <b>tc <tc_id></b> - Traffic class (0-3) to be used by the provided mapped value.
+ *
+ * - <b>queue <queue_id></b> - HQoS input queue (0-3) to be used by the provided mapped value.
+ *
+ * @cliexpar
+ * Example of how modify the traffic class translation table:
+ * @cliexcmd{set dpdk interface hqos tctbl GigabitEthernet0/8/0 entry 16 tc 2 queue 2}
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_tctbl, static) = {
+ .path = "set dpdk interface hqos tctbl",
+ .short_help = "set dpdk interface hqos tctbl <interface> entry <map_val> tc <tc_id> queue <queue_id>",
+ .function = set_dpdk_if_hqos_tctbl,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+set_dpdk_if_hqos_pktfield (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_main_t *dm = &dpdk_main;
+ clib_error_t *error = NULL;
+
+ /* Device specific data */
+ struct rte_eth_dev_info dev_info;
+ dpdk_device_config_t *devconf = 0;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ u32 hw_if_index = (u32) ~ 0;
+
+ /* Detect the set of worker threads */
+ uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ /* Should never happen, shut up Coverity warning */
+ if (p == 0)
+ return clib_error_return (0, "no worker registrations?");
+
+ vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0];
+ int worker_thread_first = tr->first_index;
+ int worker_thread_count = tr->count;
+
+ /* Packet field configuration */
+ u64 mask = (u64) ~ 0;
+ u32 id = (u32) ~ 0;
+ u32 offset = (u32) ~ 0;
+
+ /* HQoS params */
+ u32 n_subports_per_port, n_pipes_per_subport, tctbl_size;
+
+ u32 i;
+
+ /* Parse input arguments */
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "id subport"))
+ id = 0;
+ else if (unformat (line_input, "id pipe"))
+ id = 1;
+ else if (unformat (line_input, "id tc"))
+ id = 2;
+ else if (unformat (line_input, "id %d", &id))
+ ;
+ else if (unformat (line_input, "offset %d", &offset))
+ ;
+ else if (unformat (line_input, "mask %llx", &mask))
+ ;
+ else
+ {
+ error = clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ goto done;
+ }
+ }
+
+ /* Get interface */
+ if (hw_if_index == (u32) ~ 0)
+ {
+ error = clib_error_return (0, "please specify valid interface name");
+ goto done;
+ }
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ rte_eth_dev_info_get (xd->device_index, &dev_info);
+ if (dev_info.pci_dev)
+ { /* bonded interface has no pci info */
+ vlib_pci_addr_t pci_addr;
+
+ pci_addr.domain = dev_info.pci_dev->addr.domain;
+ pci_addr.bus = dev_info.pci_dev->addr.bus;
+ pci_addr.slot = dev_info.pci_dev->addr.devid;
+ pci_addr.function = dev_info.pci_dev->addr.function;
+
+ p =
+ hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+ }
+
+ if (p)
+ devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+ else
+ devconf = &dm->conf->default_devconf;
+
+ if (devconf->hqos_enabled == 0)
+ {
+ vlib_cli_output (vm, "HQoS disabled for this interface");
+ goto done;
+ }
+
+ n_subports_per_port = devconf->hqos.port.n_subports_per_port;
+ n_pipes_per_subport = devconf->hqos.port.n_pipes_per_subport;
+ tctbl_size = RTE_DIM (devconf->hqos.tc_table);
+
+ /* Validate packet field configuration: id, offset and mask */
+ if (id >= 3)
+ {
+ error = clib_error_return (0, "invalid packet field id");
+ goto done;
+ }
+
+ switch (id)
+ {
+ case 0:
+ if (dpdk_hqos_validate_mask (mask, n_subports_per_port) != 0)
+ {
+ error = clib_error_return (0, "invalid subport ID mask "
+ "(n_subports_per_port = %u)",
+ n_subports_per_port);
+ goto done;
+ }
+ break;
+ case 1:
+ if (dpdk_hqos_validate_mask (mask, n_pipes_per_subport) != 0)
+ {
+ error = clib_error_return (0, "invalid pipe ID mask "
+ "(n_pipes_per_subport = %u)",
+ n_pipes_per_subport);
+ goto done;
+ }
+ break;
+ case 2:
+ default:
+ if (dpdk_hqos_validate_mask (mask, tctbl_size) != 0)
+ {
+ error = clib_error_return (0, "invalid TC table index mask "
+ "(TC table size = %u)", tctbl_size);
+ goto done;
+ }
+ }
+
+ /* Propagate packet field configuration to all workers */
+ for (i = 0; i < worker_thread_count; i++)
+ switch (id)
+ {
+ case 0:
+ xd->hqos_wt[worker_thread_first + i].hqos_field0_slabpos = offset;
+ xd->hqos_wt[worker_thread_first + i].hqos_field0_slabmask = mask;
+ xd->hqos_wt[worker_thread_first + i].hqos_field0_slabshr =
+ __builtin_ctzll (mask);
+ break;
+ case 1:
+ xd->hqos_wt[worker_thread_first + i].hqos_field1_slabpos = offset;
+ xd->hqos_wt[worker_thread_first + i].hqos_field1_slabmask = mask;
+ xd->hqos_wt[worker_thread_first + i].hqos_field1_slabshr =
+ __builtin_ctzll (mask);
+ break;
+ case 2:
+ default:
+ xd->hqos_wt[worker_thread_first + i].hqos_field2_slabpos = offset;
+ xd->hqos_wt[worker_thread_first + i].hqos_field2_slabmask = mask;
+ xd->hqos_wt[worker_thread_first + i].hqos_field2_slabshr =
+ __builtin_ctzll (mask);
+ }
+
+done:
+ unformat_free (line_input);
+
+ return error;
+}
+
+/*?
+ * This command is used to set the packet fields required for classifiying the
+ * incoming packet. As a result of classification process, packet field
+ * information will be mapped to 5 tuples (subport, pipe, traffic class, pipe,
+ * color) and stored in packet mbuf.
+ *
+ * This command has the following parameters:
+ *
+ * - <b><interface></b> - Used to specify the output interface.
+ *
+ * - <b>id subport|pipe|tc</b> - Classification occurs across three fields.
+ * This parameter indicates which of the three masks are being configured. Legacy
+ * code used 0-2 to represent these three fields, so 0-2 is still accepted.
+ * - <b>subport|0</b> - Currently only one subport is supported, so only
+ * an empty mask is supported for the subport classification.
+ * - <b>pipe|1</b> - Currently, 4096 pipes per subport are supported, so a
+ * 12-bit mask should be configure to map to the 0-4095 pipes.
+ * - <b>tc|2</b> - The translation table (see '<em>set dpdk interface hqos
+ * tctbl</em>' command) maps each value (0-63) into one of the 4 traffic classes
+ * per pipe. A 6-bit mask should be configure to map this field to a traffic class.
+ *
+ * - <b>offset <n></b> - Offset in the packet to apply the 64-bit mask for classification.
+ * The offset should be on an 8-byte boundary (0,8,16,24..).
+ *
+ * - <b>mask <hex-mask></b> - 64-bit mask to apply to packet at the given '<em>offset</em>'.
+ * Bits must be contiguous and should not include '<em>0x</em>'.
+ *
+ * The default values for the '<em>pktfield</em>' assumes Ethernet/IPv4/UDP packets with
+ * no VLAN. Adjust based on expected packet format and desired classification field.
+ * - '<em>subport</em>' is always empty (offset 0 mask 0000000000000000)
+ * - By default, '<em>pipe</em>' maps to the UDP payload bits 12 .. 23 (offset 40
+ * mask 0000000fff000000)
+ * - By default, '<em>tc</em>' maps to the DSCP field in IP header (offset 48 mask
+ * 00000000000000fc)
+ *
+ * @cliexpar
+ * Example of how modify the '<em>pipe</em>' classification filter to match VLAN:
+ * @cliexcmd{set dpdk interface hqos pktfield GigabitEthernet0/8/0 id pipe offset 8 mask 0000000000000FFF}
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_pktfield, static) = {
+ .path = "set dpdk interface hqos pktfield",
+ .short_help = "set dpdk interface hqos pktfield <interface> id subport|pipe|tc offset <n> "
+ "mask <hex-mask>",
+ .function = set_dpdk_if_hqos_pktfield,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+show_dpdk_if_hqos (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ dpdk_device_config_hqos_t *cfg;
+ dpdk_device_hqos_per_hqos_thread_t *ht;
+ dpdk_device_hqos_per_worker_thread_t *wk;
+ u32 *tctbl;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 profile_id, subport_id, i;
+ struct rte_eth_dev_info dev_info;
+ dpdk_device_config_t *devconf = 0;
+ vlib_thread_registration_t *tr;
+ uword *p = 0;
+ clib_error_t *error = NULL;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else
+ {
+ error = clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ goto done;
+ }
+ }
+
+ if (hw_if_index == (u32) ~ 0)
+ {
+ error = clib_error_return (0, "please specify interface name!!");
+ goto done;
+ }
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ rte_eth_dev_info_get (xd->device_index, &dev_info);
+ if (dev_info.pci_dev)
+ { /* bonded interface has no pci info */
+ vlib_pci_addr_t pci_addr;
+
+ pci_addr.domain = dev_info.pci_dev->addr.domain;
+ pci_addr.bus = dev_info.pci_dev->addr.bus;
+ pci_addr.slot = dev_info.pci_dev->addr.devid;
+ pci_addr.function = dev_info.pci_dev->addr.function;
+
+ p =
+ hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+ }
+
+ if (p)
+ devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+ else
+ devconf = &dm->conf->default_devconf;
+
+ if (devconf->hqos_enabled == 0)
+ {
+ vlib_cli_output (vm, "HQoS disabled for this interface");
+ goto done;
+ }
+
+ /* Detect the set of worker threads */
+ p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+
+ /* Should never happen, shut up Coverity warning */
+ if (p == 0)
+ {
+ error = clib_error_return (0, "no worker registrations?");
+ goto done;
+ }
+
+ tr = (vlib_thread_registration_t *) p[0];
+
+ cfg = &devconf->hqos;
+ ht = xd->hqos_ht;
+ wk = &xd->hqos_wt[tr->first_index];
+ tctbl = wk->hqos_tc_table;
+
+ vlib_cli_output (vm, " Thread:");
+ vlib_cli_output (vm, " Input SWQ size = %u packets", cfg->swq_size);
+ vlib_cli_output (vm, " Enqueue burst size = %u packets",
+ ht->hqos_burst_enq);
+ vlib_cli_output (vm, " Dequeue burst size = %u packets",
+ ht->hqos_burst_deq);
+
+ vlib_cli_output (vm,
+ " Packet field 0: slab position = %4u, slab bitmask = 0x%016llx (subport)",
+ wk->hqos_field0_slabpos, wk->hqos_field0_slabmask);
+ vlib_cli_output (vm,
+ " Packet field 1: slab position = %4u, slab bitmask = 0x%016llx (pipe)",
+ wk->hqos_field1_slabpos, wk->hqos_field1_slabmask);
+ vlib_cli_output (vm,
+ " Packet field 2: slab position = %4u, slab bitmask = 0x%016llx (tc)",
+ wk->hqos_field2_slabpos, wk->hqos_field2_slabmask);
+ vlib_cli_output (vm,
+ " Packet field 2 tc translation table: ([Mapped Value Range]: tc/queue tc/queue ...)");
+ vlib_cli_output (vm,
+ " [ 0 .. 15]: "
+ "%u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u",
+ tctbl[0] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[0] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[1] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[1] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[2] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[2] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[3] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[3] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[4] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[4] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[5] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[5] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[6] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[6] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[7] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[7] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[8] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[8] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[9] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[9] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[10] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[10] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[11] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[11] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[12] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[12] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[13] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[13] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[14] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[14] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[15] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[15] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS);
+ vlib_cli_output (vm,
+ " [16 .. 31]: "
+ "%u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u",
+ tctbl[16] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[16] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[17] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[17] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[18] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[18] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[19] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[19] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[20] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[20] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[21] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[21] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[22] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[22] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[23] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[23] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[24] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[24] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[25] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[25] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[26] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[26] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[27] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[27] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[28] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[28] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[29] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[29] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[30] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[30] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[31] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[31] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS);
+ vlib_cli_output (vm,
+ " [32 .. 47]: "
+ "%u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u",
+ tctbl[32] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[32] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[33] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[33] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[34] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[34] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[35] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[35] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[36] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[36] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[37] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[37] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[38] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[38] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[39] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[39] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[40] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[40] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[41] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[41] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[42] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[42] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[43] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[43] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[44] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[44] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[45] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[45] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[46] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[46] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[47] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[47] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS);
+ vlib_cli_output (vm,
+ " [48 .. 63]: "
+ "%u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u",
+ tctbl[48] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[48] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[49] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[49] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[50] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[50] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[51] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[51] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[52] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[52] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[53] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[53] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[54] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[54] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[55] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[55] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[56] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[56] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[57] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[57] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[58] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[58] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[59] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[59] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[60] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[60] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[61] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[61] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[62] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[62] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[63] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS,
+ tctbl[63] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS);
+ vlib_cli_output (vm, " Port:");
+ vlib_cli_output (vm, " Rate = %u bytes/second", cfg->port.rate);
+ vlib_cli_output (vm, " MTU = %u bytes", cfg->port.mtu);
+ vlib_cli_output (vm, " Frame overhead = %u bytes",
+ cfg->port.frame_overhead);
+ vlib_cli_output (vm, " Number of subports = %u",
+ cfg->port.n_subports_per_port);
+ vlib_cli_output (vm, " Number of pipes per subport = %u",
+ cfg->port.n_pipes_per_subport);
+ vlib_cli_output (vm,
+ " Packet queue size: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u packets",
+ cfg->port.qsize[0], cfg->port.qsize[1], cfg->port.qsize[2],
+ cfg->port.qsize[3]);
+ vlib_cli_output (vm, " Number of pipe profiles = %u",
+ cfg->port.n_pipe_profiles);
+
+ for (subport_id = 0; subport_id < vec_len (cfg->subport); subport_id++)
+ {
+ vlib_cli_output (vm, " Subport %u:", subport_id);
+ vlib_cli_output (vm, " Rate = %u bytes/second",
+ cfg->subport[subport_id].tb_rate);
+ vlib_cli_output (vm, " Token bucket size = %u bytes",
+ cfg->subport[subport_id].tb_size);
+ vlib_cli_output (vm,
+ " Traffic class rate: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u bytes/second",
+ cfg->subport[subport_id].tc_rate[0],
+ cfg->subport[subport_id].tc_rate[1],
+ cfg->subport[subport_id].tc_rate[2],
+ cfg->subport[subport_id].tc_rate[3]);
+ vlib_cli_output (vm, " TC period = %u milliseconds",
+ cfg->subport[subport_id].tc_period);
+ }
+
+ for (profile_id = 0; profile_id < vec_len (cfg->pipe); profile_id++)
+ {
+ vlib_cli_output (vm, " Pipe profile %u:", profile_id);
+ vlib_cli_output (vm, " Rate = %u bytes/second",
+ cfg->pipe[profile_id].tb_rate);
+ vlib_cli_output (vm, " Token bucket size = %u bytes",
+ cfg->pipe[profile_id].tb_size);
+ vlib_cli_output (vm,
+ " Traffic class rate: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u bytes/second",
+ cfg->pipe[profile_id].tc_rate[0],
+ cfg->pipe[profile_id].tc_rate[1],
+ cfg->pipe[profile_id].tc_rate[2],
+ cfg->pipe[profile_id].tc_rate[3]);
+ vlib_cli_output (vm, " TC period = %u milliseconds",
+ cfg->pipe[profile_id].tc_period);
+#ifdef RTE_SCHED_SUBPORT_TC_OV
+ vlib_cli_output (vm, " TC3 oversubscription_weight = %u",
+ cfg->pipe[profile_id].tc_ov_weight);
+#endif
+
+ for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
+ {
+ vlib_cli_output (vm,
+ " TC%u WRR weights: Q0 = %u, Q1 = %u, Q2 = %u, Q3 = %u",
+ i, cfg->pipe[profile_id].wrr_weights[i * 4],
+ cfg->pipe[profile_id].wrr_weights[i * 4 + 1],
+ cfg->pipe[profile_id].wrr_weights[i * 4 + 2],
+ cfg->pipe[profile_id].wrr_weights[i * 4 + 3]);
+ }
+ }
+
+#ifdef RTE_SCHED_RED
+ vlib_cli_output (vm, " Weighted Random Early Detection (WRED):");
+ for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
+ {
+ vlib_cli_output (vm, " TC%u min: G = %u, Y = %u, R = %u", i,
+ cfg->port.red_params[i][e_RTE_METER_GREEN].min_th,
+ cfg->port.red_params[i][e_RTE_METER_YELLOW].min_th,
+ cfg->port.red_params[i][e_RTE_METER_RED].min_th);
+
+ vlib_cli_output (vm, " TC%u max: G = %u, Y = %u, R = %u", i,
+ cfg->port.red_params[i][e_RTE_METER_GREEN].max_th,
+ cfg->port.red_params[i][e_RTE_METER_YELLOW].max_th,
+ cfg->port.red_params[i][e_RTE_METER_RED].max_th);
+
+ vlib_cli_output (vm,
+ " TC%u inverted probability: G = %u, Y = %u, R = %u",
+ i, cfg->port.red_params[i][e_RTE_METER_GREEN].maxp_inv,
+ cfg->port.red_params[i][e_RTE_METER_YELLOW].maxp_inv,
+ cfg->port.red_params[i][e_RTE_METER_RED].maxp_inv);
+
+ vlib_cli_output (vm, " TC%u weight: R = %u, Y = %u, R = %u", i,
+ cfg->port.red_params[i][e_RTE_METER_GREEN].wq_log2,
+ cfg->port.red_params[i][e_RTE_METER_YELLOW].wq_log2,
+ cfg->port.red_params[i][e_RTE_METER_RED].wq_log2);
+ }
+#endif
+
+done:
+ unformat_free (line_input);
+
+ return error;
+}
+
+/*?
+ * This command is used to display details of an output interface's HQoS
+ * settings.
+ *
+ * @cliexpar
+ * Example of how to display HQoS settings for an interfaces:
+ * @cliexstart{show dpdk interface hqos GigabitEthernet0/8/0}
+ * Thread:
+ * Input SWQ size = 4096 packets
+ * Enqueue burst size = 256 packets
+ * Dequeue burst size = 220 packets
+ * Packet field 0: slab position = 0, slab bitmask = 0x0000000000000000 (subport)
+ * Packet field 1: slab position = 40, slab bitmask = 0x0000000fff000000 (pipe)
+ * Packet field 2: slab position = 8, slab bitmask = 0x00000000000000fc (tc)
+ * Packet field 2 tc translation table: ([Mapped Value Range]: tc/queue tc/queue ...)
+ * [ 0 .. 15]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3
+ * [16 .. 31]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3
+ * [32 .. 47]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3
+ * [48 .. 63]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3
+ * Port:
+ * Rate = 1250000000 bytes/second
+ * MTU = 1514 bytes
+ * Frame overhead = 24 bytes
+ * Number of subports = 1
+ * Number of pipes per subport = 4096
+ * Packet queue size: TC0 = 64, TC1 = 64, TC2 = 64, TC3 = 64 packets
+ * Number of pipe profiles = 2
+ * Subport 0:
+ * Rate = 1250000000 bytes/second
+ * Token bucket size = 1000000 bytes
+ * Traffic class rate: TC0 = 1250000000, TC1 = 1250000000, TC2 = 1250000000, TC3 = 1250000000 bytes/second
+ * TC period = 10 milliseconds
+ * Pipe profile 0:
+ * Rate = 305175 bytes/second
+ * Token bucket size = 1000000 bytes
+ * Traffic class rate: TC0 = 305175, TC1 = 305175, TC2 = 305175, TC3 = 305175 bytes/second
+ * TC period = 40 milliseconds
+ * TC0 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+ * TC1 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+ * TC2 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+ * TC3 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+ * @cliexend
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_dpdk_if_hqos, static) = {
+ .path = "show dpdk interface hqos",
+ .short_help = "show dpdk interface hqos <interface>",
+ .function = show_dpdk_if_hqos,
+};
+
+/* *INDENT-ON* */
+
+static clib_error_t *
+show_dpdk_hqos_queue_stats (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ clib_error_t *error = NULL;
+#ifdef RTE_SCHED_COLLECT_STATS
+ dpdk_main_t *dm = &dpdk_main;
+ u32 hw_if_index = (u32) ~ 0;
+ u32 subport = (u32) ~ 0;
+ u32 pipe = (u32) ~ 0;
+ u32 tc = (u32) ~ 0;
+ u32 tc_q = (u32) ~ 0;
+ vnet_hw_interface_t *hw;
+ dpdk_device_t *xd;
+ uword *p = 0;
+ struct rte_eth_dev_info dev_info;
+ dpdk_device_config_t *devconf = 0;
+ u32 qindex;
+ struct rte_sched_queue_stats stats;
+ u16 qlen;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+
+ else if (unformat (line_input, "subport %d", &subport))
+ ;
+
+ else if (unformat (line_input, "pipe %d", &pipe))
+ ;
+
+ else if (unformat (line_input, "tc %d", &tc))
+ ;
+
+ else if (unformat (line_input, "tc_q %d", &tc_q))
+ ;
+
+ else
+ {
+ error = clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ goto done;
+ }
+ }
+
+ if (hw_if_index == (u32) ~ 0)
+ {
+ error = clib_error_return (0, "please specify interface name!!");
+ goto done;
+ }
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ rte_eth_dev_info_get (xd->device_index, &dev_info);
+ if (dev_info.pci_dev)
+ { /* bonded interface has no pci info */
+ vlib_pci_addr_t pci_addr;
+
+ pci_addr.domain = dev_info.pci_dev->addr.domain;
+ pci_addr.bus = dev_info.pci_dev->addr.bus;
+ pci_addr.slot = dev_info.pci_dev->addr.devid;
+ pci_addr.function = dev_info.pci_dev->addr.function;
+
+ p =
+ hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+ }
+
+ if (p)
+ devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+ else
+ devconf = &dm->conf->default_devconf;
+
+ if (devconf->hqos_enabled == 0)
+ {
+ vlib_cli_output (vm, "HQoS disabled for this interface");
+ goto done;
+ }
+
+ /*
+ * Figure out which queue to query. cf rte_sched_port_qindex. (Not sure why
+ * that method isn't made public by DPDK - how _should_ we get the queue ID?)
+ */
+ qindex = subport * devconf->hqos.port.n_pipes_per_subport + pipe;
+ qindex = qindex * RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE + tc;
+ qindex = qindex * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + tc_q;
+
+ if (rte_sched_queue_read_stats (xd->hqos_ht->hqos, qindex, &stats, &qlen) !=
+ 0)
+ {
+ error = clib_error_return (0, "failed to read stats");
+ goto done;
+ }
+
+ vlib_cli_output (vm, "%=24s%=16s", "Stats Parameter", "Value");
+ vlib_cli_output (vm, "%=24s%=16d", "Packets", stats.n_pkts);
+ vlib_cli_output (vm, "%=24s%=16d", "Packets dropped", stats.n_pkts_dropped);
+#ifdef RTE_SCHED_RED
+ vlib_cli_output (vm, "%=24s%=16d", "Packets dropped (RED)",
+ stats.n_pkts_red_dropped);
+#endif
+ vlib_cli_output (vm, "%=24s%=16d", "Bytes", stats.n_bytes);
+ vlib_cli_output (vm, "%=24s%=16d", "Bytes dropped", stats.n_bytes_dropped);
+
+#else
+
+ /* Get a line of input */
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ vlib_cli_output (vm, "RTE_SCHED_COLLECT_STATS disabled in DPDK");
+ goto done;
+
+#endif
+
+done:
+ unformat_free (line_input);
+
+ return error;
+}
+
+/*?
+ * This command is used to display statistics associated with a HQoS traffic class
+ * queue.
+ *
+ * @note
+ * Statistic collection by the scheduler is disabled by default in DPDK. In order to
+ * turn it on, add the following line to '<em>../vpp/dpdk/Makefile</em>':
+ * - <b>$(call set,RTE_SCHED_COLLECT_STATS,y)</b>
+ *
+ * @cliexpar
+ * Example of how to display statistics of HQoS a HQoS traffic class queue:
+ * @cliexstart{show dpdk hqos queue GigabitEthernet0/9/0 subport 0 pipe 3181 tc 0 tc_q 0}
+ * Stats Parameter Value
+ * Packets 140
+ * Packets dropped 0
+ * Bytes 8400
+ * Bytes dropped 0
+ * @cliexend
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_dpdk_hqos_queue_stats, static) = {
+ .path = "show dpdk hqos queue",
+ .short_help = "show dpdk hqos queue <interface> subport <subport_id> pipe <pipe_id> tc <tc_id> tc_q <queue_id>",
+ .function = show_dpdk_hqos_queue_stats,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+show_dpdk_version_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+#define _(a,b,c) vlib_cli_output (vm, "%-25s " b, a ":", c);
+ _("DPDK Version", "%s", rte_version ());
+ _("DPDK EAL init args", "%s", dpdk_config_main.eal_init_args_str);
+#undef _
+ return 0;
+}
+
+/*?
+ * This command is used to display the current DPDK version and
+ * the list of arguments passed to DPDK when started.
+ *
+ * @cliexpar
+ * Example of how to display how many DPDK buffer test command has allcoated:
+ * @cliexstart{show dpdk version}
+ * DPDK Version: DPDK 16.11.0
+ * DPDK EAL init args: -c 1 -n 4 --huge-dir /run/vpp/hugepages --file-prefix vpp -w 0000:00:08.0 -w 0000:00:09.0 --master-lcore 0 --socket-mem 256
+ * @cliexend
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_vpe_version_command, static) = {
+ .path = "show dpdk version",
+ .short_help = "show dpdk version",
+ .function = show_dpdk_version_command_fn,
+};
+/* *INDENT-ON* */
+
+#if CLI_DEBUG
+
+static clib_error_t *
+dpdk_validate_buffers_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd_arg)
+{
+ u32 n_invalid_bufs = 0, uninitialized = 0;
+ u32 is_poison = 0, is_test = 0;
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "poison"))
+ is_poison = 1;
+ else if (unformat (input, "trajectory"))
+ is_test = 1;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ if (VLIB_BUFFER_TRACE_TRAJECTORY == 0)
+ {
+ vlib_cli_output (vm, "Trajectory not enabled. Recompile with "
+ "VLIB_BUFFER_TRACE_TRAJECTORY 1");
+ return 0;
+ }
+ if (is_poison)
+ {
+ dpdk_buffer_poison_trajectory_all ();
+ }
+ if (is_test)
+ {
+ n_invalid_bufs = dpdk_buffer_validate_trajectory_all (&uninitialized);
+ if (!n_invalid_bufs)
+ vlib_cli_output (vm, "All buffers are valid %d uninitialized",
+ uninitialized);
+ else
+ vlib_cli_output (vm, "Found %d invalid buffers and %d uninitialized",
+ n_invalid_bufs, uninitialized);
+ }
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (test_dpdk_buffers_command, static) =
+{
+ .path = "test dpdk buffers",
+ .short_help = "test dpdk buffers [poison] [trajectory]",
+ .function = dpdk_validate_buffers_fn,
+};
+/* *INDENT-ON* */
+
+#endif
+
+clib_error_t *
+dpdk_cli_init (vlib_main_t * vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (dpdk_cli_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/dpdk/device/common.c b/src/plugins/dpdk/device/common.c
new file mode 100644
index 00000000..aedc3f52
--- /dev/null
+++ b/src/plugins/dpdk/device/common.c
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/format.h>
+#include <vlib/unix/cj.h>
+#include <assert.h>
+
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ethernet/arp_packet.h>
+#include <dpdk/device/dpdk.h>
+
+#include <dpdk/device/dpdk_priv.h>
+#include <vppinfra/error.h>
+
+void
+dpdk_device_error (dpdk_device_t * xd, char *str, int rv)
+{
+ xd->errors = clib_error_return (xd->errors, "%s[port:%d, errno:%d]: %s",
+ str, xd->device_index, rv,
+ rte_strerror (rv));
+}
+
+void
+dpdk_device_setup (dpdk_device_t * xd)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_main_t *vnm = vnet_get_main ();
+ vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);
+ int rv;
+ int j;
+
+ ASSERT (vlib_get_thread_index () == 0);
+
+ clib_error_free (xd->errors);
+ sw->flags &= ~VNET_SW_INTERFACE_FLAG_ERROR;
+
+ if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+ {
+ vnet_hw_interface_set_flags (dm->vnet_main, xd->hw_if_index, 0);
+ dpdk_device_stop (xd);
+ }
+
+ rv = rte_eth_dev_configure (xd->device_index, xd->rx_q_used,
+ xd->tx_q_used, &xd->port_conf);
+
+ if (rv < 0)
+ {
+ dpdk_device_error (xd, "rte_eth_dev_configure", rv);
+ goto error;
+ }
+
+ /* Set up one TX-queue per worker thread */
+ for (j = 0; j < xd->tx_q_used; j++)
+ {
+ rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc,
+ xd->cpu_socket, &xd->tx_conf);
+
+ /* retry with any other CPU socket */
+ if (rv < 0)
+ rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc,
+ SOCKET_ID_ANY, &xd->tx_conf);
+ if (rv < 0)
+ dpdk_device_error (xd, "rte_eth_tx_queue_setup", rv);
+ }
+
+ for (j = 0; j < xd->rx_q_used; j++)
+ {
+ uword tidx = vnet_get_device_input_thread_index (dm->vnet_main,
+ xd->hw_if_index, j);
+ unsigned lcore = vlib_worker_threads[tidx].lcore_id;
+ u16 socket_id = rte_lcore_to_socket_id (lcore);
+
+ rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc,
+ xd->cpu_socket, 0,
+ dm->pktmbuf_pools[socket_id]);
+
+ /* retry with any other CPU socket */
+ if (rv < 0)
+ rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc,
+ SOCKET_ID_ANY, 0,
+ dm->pktmbuf_pools[socket_id]);
+
+ if (rv < 0)
+ dpdk_device_error (xd, "rte_eth_rx_queue_setup", rv);
+ }
+
+ if (vec_len (xd->errors))
+ goto error;
+
+ if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+ dpdk_device_start (xd);
+
+ if (vec_len (xd->errors))
+ goto error;
+
+ return;
+
+error:
+ xd->flags |= DPDK_DEVICE_FLAG_PMD_INIT_FAIL;
+ sw->flags |= VNET_SW_INTERFACE_FLAG_ERROR;
+}
+
+void
+dpdk_device_start (dpdk_device_t * xd)
+{
+ int rv;
+
+ if (xd->flags & DPDK_DEVICE_FLAG_PMD_INIT_FAIL)
+ return;
+
+ rv = rte_eth_dev_start (xd->device_index);
+
+ if (rv)
+ {
+ dpdk_device_error (xd, "rte_eth_dev_start", rv);
+ return;
+ }
+
+ if (xd->default_mac_address)
+ rv =
+ rte_eth_dev_default_mac_addr_set (xd->device_index,
+ (struct ether_addr *)
+ xd->default_mac_address);
+
+ if (rv)
+ dpdk_device_error (xd, "rte_eth_dev_default_mac_addr_set", rv);
+
+ if (xd->flags & DPDK_DEVICE_FLAG_PROMISC)
+ rte_eth_promiscuous_enable (xd->device_index);
+ else
+ rte_eth_promiscuous_disable (xd->device_index);
+
+ rte_eth_allmulticast_enable (xd->device_index);
+
+ if (xd->pmd == VNET_DPDK_PMD_BOND)
+ {
+ u8 slink[16];
+ int nlink = rte_eth_bond_slaves_get (xd->device_index, slink, 16);
+ while (nlink >= 1)
+ {
+ u8 dpdk_port = slink[--nlink];
+ rte_eth_allmulticast_enable (dpdk_port);
+ }
+ }
+}
+
+void
+dpdk_device_stop (dpdk_device_t * xd)
+{
+ if (xd->flags & DPDK_DEVICE_FLAG_PMD_INIT_FAIL)
+ return;
+
+ rte_eth_allmulticast_disable (xd->device_index);
+ rte_eth_dev_stop (xd->device_index);
+
+ /* For bonded interface, stop slave links */
+ if (xd->pmd == VNET_DPDK_PMD_BOND)
+ {
+ u8 slink[16];
+ int nlink = rte_eth_bond_slaves_get (xd->device_index, slink, 16);
+ while (nlink >= 1)
+ {
+ u8 dpdk_port = slink[--nlink];
+ rte_eth_dev_stop (dpdk_port);
+ }
+ }
+}
+
+/* Even type for send_garp_na_process */
+enum
+{
+ SEND_GARP_NA = 1,
+} dpdk_send_garp_na_process_event_t;
+
+static vlib_node_registration_t send_garp_na_proc_node;
+
+static uword
+send_garp_na_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+ vnet_main_t *vnm = vnet_get_main ();
+ uword event_type, *event_data = 0;
+
+ while (1)
+ {
+ u32 i;
+ uword dpdk_port;
+ vlib_process_wait_for_event (vm);
+ event_type = vlib_process_get_events (vm, &event_data);
+ ASSERT (event_type == SEND_GARP_NA);
+ for (i = 0; i < vec_len (event_data); i++)
+ {
+ dpdk_port = event_data[i];
+ if (i < 5) /* wait 0.2 sec for link to settle, max total 1 sec */
+ vlib_process_suspend (vm, 0.2);
+ dpdk_device_t *xd = &dpdk_main.devices[dpdk_port];
+ u32 hw_if_index = xd->hw_if_index;
+ vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
+ dpdk_update_link_state (xd, vlib_time_now (vm));
+ send_ip4_garp (vm, hi);
+ send_ip6_na (vm, hi);
+ }
+ vec_reset_length (event_data);
+ }
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (send_garp_na_proc_node, static) = {
+ .function = send_garp_na_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "send-garp-na-process",
+};
+/* *INDENT-ON* */
+
+void vl_api_force_rpc_call_main_thread (void *fp, u8 * data, u32 data_length);
+
+static void
+garp_na_proc_callback (uword * dpdk_port)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ ASSERT (vlib_get_thread_index () == 0);
+ vlib_process_signal_event
+ (vm, send_garp_na_proc_node.index, SEND_GARP_NA, *dpdk_port);
+}
+
+always_inline int
+dpdk_port_state_callback_inline (uint8_t port_id,
+ enum rte_eth_event_type type, void *param)
+{
+ struct rte_eth_link link;
+ dpdk_device_t *xd = &dpdk_main.devices[port_id];
+
+ RTE_SET_USED (param);
+ if (type != RTE_ETH_EVENT_INTR_LSC)
+ {
+ clib_warning ("Unknown event %d received for port %d", type, port_id);
+ return -1;
+ }
+
+ rte_eth_link_get_nowait (port_id, &link);
+ u8 link_up = link.link_status;
+
+ if (xd->flags & DPDK_DEVICE_FLAG_BOND_SLAVE)
+ {
+ uword bd_port = xd->bond_port;
+ int bd_mode = rte_eth_bond_mode_get (bd_port);
+#if 0
+ clib_warning ("Port %d state to %s, "
+ "slave of port %d BondEthernet%d in mode %d",
+ port_id, (link_up) ? "UP" : "DOWN",
+ bd_port, xd->port_id, bd_mode);
+#endif
+ if (bd_mode == BONDING_MODE_ACTIVE_BACKUP)
+ {
+ vl_api_force_rpc_call_main_thread
+ (garp_na_proc_callback, (u8 *) & bd_port, sizeof (uword));
+ }
+ xd->flags |= link_up ?
+ DPDK_DEVICE_FLAG_BOND_SLAVE_UP : ~DPDK_DEVICE_FLAG_BOND_SLAVE_UP;
+ }
+ else /* Should not happen as callback not setup for "normal" links */
+ {
+ if (link_up)
+ clib_warning ("Port %d Link Up - speed %u Mbps - %s",
+ port_id, (unsigned) link.link_speed,
+ (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
+ "full-duplex" : "half-duplex");
+ else
+ clib_warning ("Port %d Link Down\n\n", port_id);
+ }
+
+ return 0;
+}
+
+#if DPDK_VOID_CALLBACK
+void
+dpdk_port_state_callback (uint8_t port_id,
+ enum rte_eth_event_type type, void *param)
+{
+ dpdk_port_state_callback_inline (port_id, type, param);
+}
+
+#else
+int
+dpdk_port_state_callback (uint8_t port_id,
+ enum rte_eth_event_type type,
+ void *param,
+ void *ret_param __attribute__ ((unused)))
+{
+ return dpdk_port_state_callback_inline (port_id, type, param);
+}
+#endif
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c
new file mode 100644
index 00000000..aa134327
--- /dev/null
+++ b/src/plugins/dpdk/device/device.c
@@ -0,0 +1,856 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/format.h>
+#include <vlib/unix/cj.h>
+#include <assert.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <dpdk/device/dpdk.h>
+
+#include <dpdk/device/dpdk_priv.h>
+#include <vppinfra/error.h>
+
+#define foreach_dpdk_tx_func_error \
+ _(BAD_RETVAL, "DPDK tx function returned an error") \
+ _(RING_FULL, "Tx packet drops (ring full)") \
+ _(PKT_DROP, "Tx packet drops (dpdk tx failure)") \
+ _(REPL_FAIL, "Tx packet drops (replication failure)")
+
+typedef enum
+{
+#define _(f,s) DPDK_TX_FUNC_ERROR_##f,
+ foreach_dpdk_tx_func_error
+#undef _
+ DPDK_TX_FUNC_N_ERROR,
+} dpdk_tx_func_error_t;
+
+static char *dpdk_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_dpdk_tx_func_error
+#undef _
+};
+
+static clib_error_t *
+dpdk_set_mac_address (vnet_hw_interface_t * hi, char *address)
+{
+ int error;
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+
+ error = rte_eth_dev_default_mac_addr_set (xd->device_index,
+ (struct ether_addr *) address);
+
+ if (error)
+ {
+ return clib_error_return (0, "mac address set failed: %d", error);
+ }
+ else
+ {
+ vec_reset_length (xd->default_mac_address);
+ vec_add (xd->default_mac_address, address, sizeof (address));
+ return NULL;
+ }
+}
+
+struct rte_mbuf *
+dpdk_replicate_packet_mb (vlib_buffer_t * b)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ struct rte_mbuf **mbufs = 0, *s, *d;
+ u8 nb_segs;
+ unsigned socket_id = rte_socket_id ();
+ int i;
+
+ ASSERT (dm->pktmbuf_pools[socket_id]);
+ s = rte_mbuf_from_vlib_buffer (b);
+ nb_segs = s->nb_segs;
+ vec_validate (mbufs, nb_segs - 1);
+
+ if (rte_pktmbuf_alloc_bulk (dm->pktmbuf_pools[socket_id], mbufs, nb_segs))
+ {
+ vec_free (mbufs);
+ return 0;
+ }
+
+ d = mbufs[0];
+ d->nb_segs = s->nb_segs;
+ d->data_len = s->data_len;
+ d->pkt_len = s->pkt_len;
+ d->data_off = s->data_off;
+ clib_memcpy (d->buf_addr, s->buf_addr, RTE_PKTMBUF_HEADROOM + s->data_len);
+
+ for (i = 1; i < nb_segs; i++)
+ {
+ d->next = mbufs[i];
+ d = mbufs[i];
+ s = s->next;
+ d->data_len = s->data_len;
+ clib_memcpy (d->buf_addr, s->buf_addr,
+ RTE_PKTMBUF_HEADROOM + s->data_len);
+ }
+
+ d = mbufs[0];
+ vec_free (mbufs);
+ return d;
+}
+
+static void
+dpdk_tx_trace_buffer (dpdk_main_t * dm,
+ vlib_node_runtime_t * node,
+ dpdk_device_t * xd,
+ u16 queue_id, u32 buffer_index, vlib_buffer_t * buffer)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ dpdk_tx_dma_trace_t *t0;
+ struct rte_mbuf *mb;
+
+ mb = rte_mbuf_from_vlib_buffer (buffer);
+
+ t0 = vlib_add_trace (vm, node, buffer, sizeof (t0[0]));
+ t0->queue_index = queue_id;
+ t0->device_index = xd->device_index;
+ t0->buffer_index = buffer_index;
+ clib_memcpy (&t0->mb, mb, sizeof (t0->mb));
+ clib_memcpy (&t0->buffer, buffer,
+ sizeof (buffer[0]) - sizeof (buffer->pre_data));
+ clib_memcpy (t0->buffer.pre_data, buffer->data + buffer->current_data,
+ sizeof (t0->buffer.pre_data));
+}
+
+static_always_inline void
+dpdk_validate_rte_mbuf (vlib_main_t * vm, vlib_buffer_t * b,
+ int maybe_multiseg)
+{
+ struct rte_mbuf *mb, *first_mb, *last_mb;
+
+ /* buffer is coming from non-dpdk source so we need to init
+ rte_mbuf header */
+ if (PREDICT_FALSE ((b->flags & VLIB_BUFFER_EXT_HDR_VALID) == 0))
+ {
+ vlib_buffer_t *b2 = b;
+ last_mb = mb = rte_mbuf_from_vlib_buffer (b2);
+ rte_pktmbuf_reset (mb);
+ while (maybe_multiseg && (b2->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ b2 = vlib_get_buffer (vm, b2->next_buffer);
+ mb = rte_mbuf_from_vlib_buffer (b2);
+ rte_pktmbuf_reset (mb);
+ }
+ }
+
+ last_mb = first_mb = mb = rte_mbuf_from_vlib_buffer (b);
+ first_mb->nb_segs = 1;
+ mb->data_len = b->current_length;
+ mb->pkt_len = maybe_multiseg ? vlib_buffer_length_in_chain (vm, b) :
+ b->current_length;
+ mb->data_off = VLIB_BUFFER_PRE_DATA_SIZE + b->current_data;
+
+ while (maybe_multiseg && (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ b = vlib_get_buffer (vm, b->next_buffer);
+ mb = rte_mbuf_from_vlib_buffer (b);
+ last_mb->next = mb;
+ last_mb = mb;
+ mb->data_len = b->current_length;
+ mb->pkt_len = b->current_length;
+ mb->data_off = VLIB_BUFFER_PRE_DATA_SIZE + b->current_data;
+ first_mb->nb_segs++;
+ if (PREDICT_FALSE (b->n_add_refs))
+ {
+ rte_mbuf_refcnt_update (mb, b->n_add_refs);
+ b->n_add_refs = 0;
+ }
+ }
+}
+
+/*
+ * This function calls the dpdk's tx_burst function to transmit the packets
+ * on the tx_vector. It manages a lock per-device if the device does not
+ * support multiple queues. It returns the number of packets untransmitted
+ * on the tx_vector. If all packets are transmitted (the normal case), the
+ * function returns 0.
+ *
+ * The function assumes there is at least one packet on the tx_vector.
+ */
+static_always_inline
+ u32 tx_burst_vector_internal (vlib_main_t * vm,
+ dpdk_device_t * xd,
+ struct rte_mbuf **tx_vector)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ u32 n_packets;
+ u32 tx_head;
+ u32 tx_tail;
+ u32 n_retry;
+ int rv;
+ int queue_id;
+ tx_ring_hdr_t *ring;
+
+ ring = vec_header (tx_vector, sizeof (*ring));
+
+ n_packets = ring->tx_head - ring->tx_tail;
+
+ tx_head = ring->tx_head % xd->nb_tx_desc;
+
+ /*
+ * Ensure rte_eth_tx_burst is not called with 0 packets, which can lead to
+ * unpredictable results.
+ */
+ ASSERT (n_packets > 0);
+
+ /*
+ * Check for tx_vector overflow. If this fails it is a system configuration
+ * error. The ring should be sized big enough to handle the largest un-flowed
+ * off burst from a traffic manager. A larger size also helps performance
+ * a bit because it decreases the probability of having to issue two tx_burst
+ * calls due to a ring wrap.
+ */
+ ASSERT (n_packets < xd->nb_tx_desc);
+ ASSERT (ring->tx_tail == 0);
+
+ n_retry = 16;
+ queue_id = vm->thread_index;
+
+ do
+ {
+ /* start the burst at the tail */
+ tx_tail = ring->tx_tail % xd->nb_tx_desc;
+
+ /*
+ * This device only supports one TX queue,
+ * and we're running multi-threaded...
+ */
+ if (PREDICT_FALSE (xd->lockp != 0))
+ {
+ queue_id = queue_id % xd->tx_q_used;
+ while (__sync_lock_test_and_set (xd->lockp[queue_id], 1))
+ /* zzzz */
+ queue_id = (queue_id + 1) % xd->tx_q_used;
+ }
+
+ if (PREDICT_FALSE (xd->flags & DPDK_DEVICE_FLAG_HQOS)) /* HQoS ON */
+ {
+ /* no wrap, transmit in one burst */
+ dpdk_device_hqos_per_worker_thread_t *hqos =
+ &xd->hqos_wt[vm->thread_index];
+
+ ASSERT (hqos->swq != NULL);
+
+ dpdk_hqos_metadata_set (hqos,
+ &tx_vector[tx_tail], tx_head - tx_tail);
+ rv = rte_ring_sp_enqueue_burst (hqos->swq,
+ (void **) &tx_vector[tx_tail],
+ (uint16_t) (tx_head - tx_tail), 0);
+ }
+ else if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_PMD))
+ {
+ /* no wrap, transmit in one burst */
+ rv = rte_eth_tx_burst (xd->device_index,
+ (uint16_t) queue_id,
+ &tx_vector[tx_tail],
+ (uint16_t) (tx_head - tx_tail));
+ }
+ else
+ {
+ ASSERT (0);
+ rv = 0;
+ }
+
+ if (PREDICT_FALSE (xd->lockp != 0))
+ *xd->lockp[queue_id] = 0;
+
+ if (PREDICT_FALSE (rv < 0))
+ {
+ // emit non-fatal message, bump counter
+ vnet_main_t *vnm = dm->vnet_main;
+ vnet_interface_main_t *im = &vnm->interface_main;
+ u32 node_index;
+
+ node_index = vec_elt_at_index (im->hw_interfaces,
+ xd->hw_if_index)->tx_node_index;
+
+ vlib_error_count (vm, node_index, DPDK_TX_FUNC_ERROR_BAD_RETVAL, 1);
+ clib_warning ("rte_eth_tx_burst[%d]: error %d", xd->device_index,
+ rv);
+ return n_packets; // untransmitted packets
+ }
+ ring->tx_tail += (u16) rv;
+ n_packets -= (uint16_t) rv;
+ }
+ while (rv && n_packets && (n_retry > 0));
+
+ return n_packets;
+}
+
+static_always_inline void
+dpdk_prefetch_buffer_by_index (vlib_main_t * vm, u32 bi)
+{
+ vlib_buffer_t *b;
+ struct rte_mbuf *mb;
+ b = vlib_get_buffer (vm, bi);
+ mb = rte_mbuf_from_vlib_buffer (b);
+ CLIB_PREFETCH (mb, 2 * CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (b, CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+static_always_inline void
+dpdk_buffer_recycle (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_buffer_t * b, u32 bi, struct rte_mbuf **mbp)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ u32 my_cpu = vm->thread_index;
+ struct rte_mbuf *mb_new;
+
+ if (PREDICT_FALSE (b->flags & VLIB_BUFFER_RECYCLE) == 0)
+ return;
+
+ mb_new = dpdk_replicate_packet_mb (b);
+ if (PREDICT_FALSE (mb_new == 0))
+ {
+ vlib_error_count (vm, node->node_index,
+ DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
+ b->flags |= VLIB_BUFFER_REPL_FAIL;
+ }
+ else
+ *mbp = mb_new;
+
+ vec_add1 (dm->recycle[my_cpu], bi);
+}
+
+static_always_inline void
+dpdk_buffer_tx_offload (dpdk_device_t * xd, vlib_buffer_t * b,
+ struct rte_mbuf *mb)
+{
+ u32 ip_cksum = b->flags & VNET_BUFFER_F_OFFLOAD_IP_CKSUM;
+ u32 tcp_cksum = b->flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
+ u32 udp_cksum = b->flags & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM;
+ int is_ip4 = b->flags & VNET_BUFFER_F_IS_IP4;
+ u64 ol_flags;
+
+ /* Is there any work for us? */
+ if (PREDICT_TRUE ((ip_cksum | tcp_cksum | udp_cksum) == 0))
+ return;
+
+ mb->l2_len = vnet_buffer (b)->l3_hdr_offset - b->current_data;
+ mb->l3_len = vnet_buffer (b)->l4_hdr_offset -
+ vnet_buffer (b)->l3_hdr_offset;
+ mb->outer_l3_len = 0;
+ mb->outer_l2_len = 0;
+ ol_flags = is_ip4 ? PKT_TX_IPV4 : PKT_TX_IPV6;
+ ol_flags |= ip_cksum ? PKT_TX_IP_CKSUM : 0;
+ ol_flags |= tcp_cksum ? PKT_TX_TCP_CKSUM : 0;
+ ol_flags |= udp_cksum ? PKT_TX_UDP_CKSUM : 0;
+ mb->ol_flags |= ol_flags;
+
+ /* we are trying to help compiler here by using local ol_flags with known
+ state of all flags */
+ if (xd->flags & DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM)
+ rte_net_intel_cksum_flags_prepare (mb, ol_flags);
+}
+
+/*
+ * Transmits the packets on the frame to the interface associated with the
+ * node. It first copies packets on the frame to a tx_vector containing the
+ * rte_mbuf pointers. It then passes this vector to tx_burst_vector_internal
+ * which calls the dpdk tx_burst function.
+ */
+static uword
+dpdk_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ vnet_interface_output_runtime_t *rd = (void *) node->runtime_data;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, rd->dev_instance);
+ u32 n_packets = f->n_vectors;
+ u32 n_left;
+ u32 *from;
+ struct rte_mbuf **tx_vector;
+ u16 i;
+ u16 nb_tx_desc = xd->nb_tx_desc;
+ int queue_id;
+ u32 my_cpu;
+ u32 tx_pkts = 0;
+ tx_ring_hdr_t *ring;
+ u32 n_on_ring;
+
+ my_cpu = vm->thread_index;
+
+ queue_id = my_cpu;
+
+ tx_vector = xd->tx_vectors[queue_id];
+ ring = vec_header (tx_vector, sizeof (*ring));
+
+ n_on_ring = ring->tx_head - ring->tx_tail;
+ from = vlib_frame_vector_args (f);
+
+ ASSERT (n_packets <= VLIB_FRAME_SIZE);
+
+ if (PREDICT_FALSE (n_on_ring + n_packets > nb_tx_desc))
+ {
+ /*
+ * Overflowing the ring should never happen.
+ * If it does then drop the whole frame.
+ */
+ vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_RING_FULL,
+ n_packets);
+
+ while (n_packets--)
+ {
+ u32 bi0 = from[n_packets];
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
+ struct rte_mbuf *mb0 = rte_mbuf_from_vlib_buffer (b0);
+ rte_pktmbuf_free (mb0);
+ }
+ return n_on_ring;
+ }
+
+ if (PREDICT_FALSE (dm->tx_pcap_enable))
+ {
+ n_left = n_packets;
+ while (n_left > 0)
+ {
+ u32 bi0 = from[0];
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
+ if (dm->pcap_sw_if_index == 0 ||
+ dm->pcap_sw_if_index == vnet_buffer (b0)->sw_if_index[VLIB_TX])
+ pcap_add_buffer (&dm->pcap_main, vm, bi0, 512);
+ from++;
+ n_left--;
+ }
+ }
+
+ from = vlib_frame_vector_args (f);
+ n_left = n_packets;
+ i = ring->tx_head % nb_tx_desc;
+
+ while (n_left >= 8)
+ {
+ u32 bi0, bi1, bi2, bi3;
+ struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+ vlib_buffer_t *b0, *b1, *b2, *b3;
+ u32 or_flags;
+
+ dpdk_prefetch_buffer_by_index (vm, from[4]);
+ dpdk_prefetch_buffer_by_index (vm, from[5]);
+ dpdk_prefetch_buffer_by_index (vm, from[6]);
+ dpdk_prefetch_buffer_by_index (vm, from[7]);
+
+ bi0 = from[0];
+ bi1 = from[1];
+ bi2 = from[2];
+ bi3 = from[3];
+ from += 4;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+ b2 = vlib_get_buffer (vm, bi2);
+ b3 = vlib_get_buffer (vm, bi3);
+
+ or_flags = b0->flags | b1->flags | b2->flags | b3->flags;
+
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b2);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b3);
+
+ if (or_flags & VLIB_BUFFER_NEXT_PRESENT)
+ {
+ dpdk_validate_rte_mbuf (vm, b0, 1);
+ dpdk_validate_rte_mbuf (vm, b1, 1);
+ dpdk_validate_rte_mbuf (vm, b2, 1);
+ dpdk_validate_rte_mbuf (vm, b3, 1);
+ }
+ else
+ {
+ dpdk_validate_rte_mbuf (vm, b0, 0);
+ dpdk_validate_rte_mbuf (vm, b1, 0);
+ dpdk_validate_rte_mbuf (vm, b2, 0);
+ dpdk_validate_rte_mbuf (vm, b3, 0);
+ }
+
+ mb0 = rte_mbuf_from_vlib_buffer (b0);
+ mb1 = rte_mbuf_from_vlib_buffer (b1);
+ mb2 = rte_mbuf_from_vlib_buffer (b2);
+ mb3 = rte_mbuf_from_vlib_buffer (b3);
+
+ if (PREDICT_FALSE ((xd->flags & DPDK_DEVICE_FLAG_TX_OFFLOAD) &&
+ (or_flags &
+ (VNET_BUFFER_F_OFFLOAD_TCP_CKSUM
+ | VNET_BUFFER_F_OFFLOAD_IP_CKSUM
+ | VNET_BUFFER_F_OFFLOAD_UDP_CKSUM))))
+ {
+ dpdk_buffer_tx_offload (xd, b0, mb0);
+ dpdk_buffer_tx_offload (xd, b1, mb1);
+ dpdk_buffer_tx_offload (xd, b2, mb2);
+ dpdk_buffer_tx_offload (xd, b3, mb3);
+ }
+
+ if (PREDICT_FALSE (or_flags & VLIB_BUFFER_RECYCLE))
+ {
+ dpdk_buffer_recycle (vm, node, b0, bi0, &mb0);
+ dpdk_buffer_recycle (vm, node, b1, bi1, &mb1);
+ dpdk_buffer_recycle (vm, node, b2, bi2, &mb2);
+ dpdk_buffer_recycle (vm, node, b3, bi3, &mb3);
+
+ /* dont enqueue packets if replication failed as they must
+ be sent back to recycle */
+ if (PREDICT_TRUE ((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ tx_vector[i++ % nb_tx_desc] = mb0;
+ if (PREDICT_TRUE ((b1->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ tx_vector[i++ % nb_tx_desc] = mb1;
+ if (PREDICT_TRUE ((b2->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ tx_vector[i++ % nb_tx_desc] = mb2;
+ if (PREDICT_TRUE ((b3->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ tx_vector[i++ % nb_tx_desc] = mb3;
+ }
+ else
+ {
+ if (PREDICT_FALSE (i + 3 >= nb_tx_desc))
+ {
+ tx_vector[i++ % nb_tx_desc] = mb0;
+ tx_vector[i++ % nb_tx_desc] = mb1;
+ tx_vector[i++ % nb_tx_desc] = mb2;
+ tx_vector[i++ % nb_tx_desc] = mb3;
+ i %= nb_tx_desc;
+ }
+ else
+ {
+ tx_vector[i++] = mb0;
+ tx_vector[i++] = mb1;
+ tx_vector[i++] = mb2;
+ tx_vector[i++] = mb3;
+ }
+ }
+
+
+ if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi1, b1);
+ if (b2->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi2, b2);
+ if (b3->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi3, b3);
+ }
+
+ n_left -= 4;
+ }
+ while (n_left > 0)
+ {
+ u32 bi0;
+ struct rte_mbuf *mb0;
+ vlib_buffer_t *b0;
+
+ bi0 = from[0];
+ from++;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
+
+ dpdk_validate_rte_mbuf (vm, b0, 1);
+
+ mb0 = rte_mbuf_from_vlib_buffer (b0);
+ dpdk_buffer_tx_offload (xd, b0, mb0);
+ dpdk_buffer_recycle (vm, node, b0, bi0, &mb0);
+
+ if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
+
+ if (PREDICT_TRUE ((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ {
+ tx_vector[i % nb_tx_desc] = mb0;
+ i++;
+ }
+ n_left--;
+ }
+
+ /* account for additional packets in the ring */
+ ring->tx_head += n_packets;
+ n_on_ring = ring->tx_head - ring->tx_tail;
+
+ /* transmit as many packets as possible */
+ n_packets = tx_burst_vector_internal (vm, xd, tx_vector);
+
+ /*
+ * tx_pkts is the number of packets successfully transmitted
+ * This is the number originally on ring minus the number remaining on ring
+ */
+ tx_pkts = n_on_ring - n_packets;
+
+ {
+ /* If there is no callback then drop any non-transmitted packets */
+ if (PREDICT_FALSE (n_packets))
+ {
+ vlib_simple_counter_main_t *cm;
+ vnet_main_t *vnm = vnet_get_main ();
+
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_TX_ERROR);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+ n_packets);
+
+ vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_PKT_DROP,
+ n_packets);
+
+ while (n_packets--)
+ rte_pktmbuf_free (tx_vector[ring->tx_tail + n_packets]);
+ }
+
+ /* Reset head/tail to avoid unnecessary wrap */
+ ring->tx_head = 0;
+ ring->tx_tail = 0;
+ }
+
+ /* Recycle replicated buffers */
+ if (PREDICT_FALSE (vec_len (dm->recycle[my_cpu])))
+ {
+ vlib_buffer_free (vm, dm->recycle[my_cpu],
+ vec_len (dm->recycle[my_cpu]));
+ _vec_len (dm->recycle[my_cpu]) = 0;
+ }
+
+ ASSERT (ring->tx_head >= ring->tx_tail);
+
+ return tx_pkts;
+}
+
+static void
+dpdk_clear_hw_interface_counters (u32 instance)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, instance);
+
+ /*
+ * Set the "last_cleared_stats" to the current stats, so that
+ * things appear to clear from a display perspective.
+ */
+ dpdk_update_counters (xd, vlib_time_now (dm->vlib_main));
+
+ clib_memcpy (&xd->last_cleared_stats, &xd->stats, sizeof (xd->stats));
+ clib_memcpy (xd->last_cleared_xstats, xd->xstats,
+ vec_len (xd->last_cleared_xstats) *
+ sizeof (xd->last_cleared_xstats[0]));
+
+}
+
+static clib_error_t *
+dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index);
+ uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, hif->dev_instance);
+
+ if (xd->flags & DPDK_DEVICE_FLAG_PMD_INIT_FAIL)
+ return clib_error_return (0, "Interface not initialized");
+
+ if (is_up)
+ {
+ vnet_hw_interface_set_flags (vnm, xd->hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP);
+ if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0)
+ dpdk_device_start (xd);
+ xd->flags |= DPDK_DEVICE_FLAG_ADMIN_UP;
+ f64 now = vlib_time_now (dm->vlib_main);
+ dpdk_update_counters (xd, now);
+ dpdk_update_link_state (xd, now);
+ }
+ else
+ {
+ vnet_hw_interface_set_flags (vnm, xd->hw_if_index, 0);
+ if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0)
+ dpdk_device_stop (xd);
+ xd->flags &= ~DPDK_DEVICE_FLAG_ADMIN_UP;
+ }
+
+ return /* no error */ 0;
+}
+
+/*
+ * Dynamically redirect all pkts from a specific interface
+ * to the specified node
+ */
+static void
+dpdk_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
+ u32 node_index)
+{
+ dpdk_main_t *xm = &dpdk_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ dpdk_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance);
+
+ /* Shut off redirection */
+ if (node_index == ~0)
+ {
+ xd->per_interface_next_index = node_index;
+ return;
+ }
+
+ xd->per_interface_next_index =
+ vlib_node_add_next (xm->vlib_main, dpdk_input_node.index, node_index);
+}
+
+
+static clib_error_t *
+dpdk_subif_add_del_function (vnet_main_t * vnm,
+ u32 hw_if_index,
+ struct vnet_sw_interface_t *st, int is_add)
+{
+ dpdk_main_t *xm = &dpdk_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ dpdk_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance);
+ vnet_sw_interface_t *t = (vnet_sw_interface_t *) st;
+ int r, vlan_offload;
+ u32 prev_subifs = xd->num_subifs;
+ clib_error_t *err = 0;
+
+ if (is_add)
+ xd->num_subifs++;
+ else if (xd->num_subifs)
+ xd->num_subifs--;
+
+ if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0)
+ goto done;
+
+ /* currently we program VLANS only for IXGBE VF and I40E VF */
+ if ((xd->pmd != VNET_DPDK_PMD_IXGBEVF) && (xd->pmd != VNET_DPDK_PMD_I40EVF))
+ goto done;
+
+ if (t->sub.eth.flags.no_tags == 1)
+ goto done;
+
+ if ((t->sub.eth.flags.one_tag != 1) || (t->sub.eth.flags.exact_match != 1))
+ {
+ xd->num_subifs = prev_subifs;
+ err = clib_error_return (0, "unsupported VLAN setup");
+ goto done;
+ }
+
+ vlan_offload = rte_eth_dev_get_vlan_offload (xd->device_index);
+ vlan_offload |= ETH_VLAN_FILTER_OFFLOAD;
+
+ if ((r = rte_eth_dev_set_vlan_offload (xd->device_index, vlan_offload)))
+ {
+ xd->num_subifs = prev_subifs;
+ err = clib_error_return (0, "rte_eth_dev_set_vlan_offload[%d]: err %d",
+ xd->device_index, r);
+ goto done;
+ }
+
+
+ if ((r =
+ rte_eth_dev_vlan_filter (xd->device_index, t->sub.eth.outer_vlan_id,
+ is_add)))
+ {
+ xd->num_subifs = prev_subifs;
+ err = clib_error_return (0, "rte_eth_dev_vlan_filter[%d]: err %d",
+ xd->device_index, r);
+ goto done;
+ }
+
+done:
+ if (xd->num_subifs)
+ xd->flags |= DPDK_DEVICE_FLAG_HAVE_SUBIF;
+ else
+ xd->flags &= ~DPDK_DEVICE_FLAG_HAVE_SUBIF;
+
+ return err;
+}
+
+/* *INDENT-OFF* */
+VNET_DEVICE_CLASS (dpdk_device_class) = {
+ .name = "dpdk",
+ .tx_function = dpdk_interface_tx,
+ .tx_function_n_errors = DPDK_TX_FUNC_N_ERROR,
+ .tx_function_error_strings = dpdk_tx_func_error_strings,
+ .format_device_name = format_dpdk_device_name,
+ .format_device = format_dpdk_device,
+ .format_tx_trace = format_dpdk_tx_dma_trace,
+ .clear_counters = dpdk_clear_hw_interface_counters,
+ .admin_up_down_function = dpdk_interface_admin_up_down,
+ .subif_add_del_function = dpdk_subif_add_del_function,
+ .rx_redirect_to_node = dpdk_set_interface_next_node,
+ .mac_addr_change_function = dpdk_set_mac_address,
+};
+
+VLIB_DEVICE_TX_FUNCTION_MULTIARCH (dpdk_device_class, dpdk_interface_tx)
+/* *INDENT-ON* */
+
+#define UP_DOWN_FLAG_EVENT 1
+
+uword
+admin_up_down_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+ clib_error_t *error = 0;
+ uword event_type;
+ uword *event_data = 0;
+ u32 sw_if_index;
+ u32 flags;
+
+ while (1)
+ {
+ vlib_process_wait_for_event (vm);
+
+ event_type = vlib_process_get_events (vm, &event_data);
+
+ dpdk_main.admin_up_down_in_progress = 1;
+
+ switch (event_type)
+ {
+ case UP_DOWN_FLAG_EVENT:
+ {
+ if (vec_len (event_data) == 2)
+ {
+ sw_if_index = event_data[0];
+ flags = event_data[1];
+ error =
+ vnet_sw_interface_set_flags (vnet_get_main (), sw_if_index,
+ flags);
+ clib_error_report (error);
+ }
+ }
+ break;
+ }
+
+ vec_reset_length (event_data);
+
+ dpdk_main.admin_up_down_in_progress = 0;
+
+ }
+ return 0; /* or not */
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (admin_up_down_process_node,static) = {
+ .function = admin_up_down_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "admin-up-down-process",
+ .process_log2_n_stack_bytes = 17, // 256KB
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/dpdk/device/dir.dox b/src/plugins/dpdk/device/dir.dox
new file mode 100644
index 00000000..43e36753
--- /dev/null
+++ b/src/plugins/dpdk/device/dir.dox
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Doxygen directory documentation */
+
+/**
+@dir
+@brief DPDK Abstraction Layer.
+
+This directory contains the source code for the DPDK abstraction layer.
+
+*/
+/*? %%clicmd:group_label DPDK and pcap tx %% ?*/
+/*? %%syscfg:group_label DPDK and pcap tx %% ?*/
diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h
new file mode 100644
index 00000000..9762c713
--- /dev/null
+++ b/src/plugins/dpdk/device/dpdk.h
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_dpdk_h__
+#define __included_dpdk_h__
+
+/* $$$$ We should rename always_inline -> clib_always_inline */
+#undef always_inline
+
+#include <rte_config.h>
+
+#include <rte_common.h>
+#include <rte_dev.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_cycles.h>
+#include <rte_lcore.h>
+#include <rte_per_lcore.h>
+#include <rte_interrupts.h>
+#include <rte_pci.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_version.h>
+#include <rte_eth_bond.h>
+#include <rte_sched.h>
+#include <rte_net.h>
+
+#include <vnet/unix/pcap.h>
+#include <vnet/devices/devices.h>
+
+#if CLIB_DEBUG > 0
+#define always_inline static inline
+#else
+#define always_inline static inline __attribute__ ((__always_inline__))
+#endif
+
+#include <vlib/pci/pci.h>
+
+#define NB_MBUF (16<<10)
+
+extern vnet_device_class_t dpdk_device_class;
+extern vlib_node_registration_t dpdk_input_node;
+
+#define foreach_dpdk_pmd \
+ _ ("net_thunderx", THUNDERX) \
+ _ ("net_e1000_em", E1000EM) \
+ _ ("net_e1000_igb", IGB) \
+ _ ("net_e1000_igb_vf", IGBVF) \
+ _ ("net_ixgbe", IXGBE) \
+ _ ("net_ixgbe_vf", IXGBEVF) \
+ _ ("net_i40e", I40E) \
+ _ ("net_i40e_vf", I40EVF) \
+ _ ("net_virtio", VIRTIO) \
+ _ ("net_enic", ENIC) \
+ _ ("net_vmxnet3", VMXNET3) \
+ _ ("AF_PACKET PMD", AF_PACKET) \
+ _ ("net_bonding", BOND) \
+ _ ("net_fm10k", FM10K) \
+ _ ("net_cxgbe", CXGBE) \
+ _ ("net_mlx4", MLX4) \
+ _ ("net_mlx5", MLX5) \
+ _ ("net_dpaa2", DPAA2) \
+ _ ("net_virtio_user", VIRTIO_USER) \
+ _ ("net_vhost", VHOST_ETHER)
+
+typedef enum
+{
+ VNET_DPDK_PMD_NONE,
+#define _(s,f) VNET_DPDK_PMD_##f,
+ foreach_dpdk_pmd
+#undef _
+ VNET_DPDK_PMD_UNKNOWN, /* must be last */
+} dpdk_pmd_t;
+
+typedef enum
+{
+ VNET_DPDK_PORT_TYPE_ETH_1G,
+ VNET_DPDK_PORT_TYPE_ETH_10G,
+ VNET_DPDK_PORT_TYPE_ETH_25G,
+ VNET_DPDK_PORT_TYPE_ETH_40G,
+ VNET_DPDK_PORT_TYPE_ETH_50G,
+ VNET_DPDK_PORT_TYPE_ETH_100G,
+ VNET_DPDK_PORT_TYPE_ETH_BOND,
+ VNET_DPDK_PORT_TYPE_ETH_SWITCH,
+ VNET_DPDK_PORT_TYPE_AF_PACKET,
+ VNET_DPDK_PORT_TYPE_ETH_VF,
+ VNET_DPDK_PORT_TYPE_VIRTIO_USER,
+ VNET_DPDK_PORT_TYPE_VHOST_ETHER,
+ VNET_DPDK_PORT_TYPE_UNKNOWN,
+} dpdk_port_type_t;
+
+/*
+ * The header for the tx_vector in dpdk_device_t.
+ * Head and tail are indexes into the tx_vector and are of type
+ * u64 so they never overflow.
+ */
+typedef struct
+{
+ u64 tx_head;
+ u64 tx_tail;
+} tx_ring_hdr_t;
+
+typedef struct
+{
+ struct rte_ring *swq;
+
+ u64 hqos_field0_slabmask;
+ u32 hqos_field0_slabpos;
+ u32 hqos_field0_slabshr;
+ u64 hqos_field1_slabmask;
+ u32 hqos_field1_slabpos;
+ u32 hqos_field1_slabshr;
+ u64 hqos_field2_slabmask;
+ u32 hqos_field2_slabpos;
+ u32 hqos_field2_slabshr;
+ u32 hqos_tc_table[64];
+} dpdk_device_hqos_per_worker_thread_t;
+
+typedef struct
+{
+ struct rte_ring **swq;
+ struct rte_mbuf **pkts_enq;
+ struct rte_mbuf **pkts_deq;
+ struct rte_sched_port *hqos;
+ u32 hqos_burst_enq;
+ u32 hqos_burst_deq;
+ u32 pkts_enq_len;
+ u32 swq_pos;
+ u32 flush_count;
+} dpdk_device_hqos_per_hqos_thread_t;
+
+typedef struct
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ volatile u32 **lockp;
+
+ /* Instance ID */
+ u32 device_index;
+
+ u32 hw_if_index;
+ u32 vlib_sw_if_index;
+
+ /* next node index if we decide to steal the rx graph arc */
+ u32 per_interface_next_index;
+
+ /* dpdk rte_mbuf rx and tx vectors, VLIB_FRAME_SIZE */
+ struct rte_mbuf ***tx_vectors; /* one per worker thread */
+ struct rte_mbuf ***rx_vectors;
+
+ /* vector of traced contexts, per device */
+ u32 **d_trace_buffers;
+
+ dpdk_pmd_t pmd:8;
+ i8 cpu_socket;
+
+ u16 flags;
+#define DPDK_DEVICE_FLAG_ADMIN_UP (1 << 0)
+#define DPDK_DEVICE_FLAG_PROMISC (1 << 1)
+#define DPDK_DEVICE_FLAG_PMD (1 << 2)
+#define DPDK_DEVICE_FLAG_PMD_INIT_FAIL (1 << 3)
+#define DPDK_DEVICE_FLAG_MAYBE_MULTISEG (1 << 4)
+#define DPDK_DEVICE_FLAG_HAVE_SUBIF (1 << 5)
+#define DPDK_DEVICE_FLAG_HQOS (1 << 6)
+#define DPDK_DEVICE_FLAG_BOND_SLAVE (1 << 7)
+#define DPDK_DEVICE_FLAG_BOND_SLAVE_UP (1 << 8)
+#define DPDK_DEVICE_FLAG_TX_OFFLOAD (1 << 9)
+#define DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM (1 << 10)
+
+ u16 nb_tx_desc;
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
+
+ u8 *interface_name_suffix;
+
+ /* number of sub-interfaces */
+ u16 num_subifs;
+
+ /* PMD related */
+ u16 tx_q_used;
+ u16 rx_q_used;
+ u16 nb_rx_desc;
+ u16 *cpu_socket_id_by_queue;
+ struct rte_eth_conf port_conf;
+ struct rte_eth_txconf tx_conf;
+
+ /* HQoS related */
+ dpdk_device_hqos_per_worker_thread_t *hqos_wt;
+ dpdk_device_hqos_per_hqos_thread_t *hqos_ht;
+
+ /* af_packet or BondEthernet instance number */
+ u8 port_id;
+
+ /* Bonded interface port# of a slave -
+ only valid if DPDK_DEVICE_FLAG_BOND_SLAVE bit is set */
+ u8 bond_port;
+
+ struct rte_eth_link link;
+ f64 time_last_link_update;
+
+ struct rte_eth_stats stats;
+ struct rte_eth_stats last_stats;
+ struct rte_eth_stats last_cleared_stats;
+ struct rte_eth_xstat *xstats;
+ struct rte_eth_xstat *last_cleared_xstats;
+ f64 time_last_stats_update;
+ dpdk_port_type_t port_type;
+
+ /* mac address */
+ u8 *default_mac_address;
+
+ /* error string */
+ clib_error_t *errors;
+} dpdk_device_t;
+
+#define DPDK_STATS_POLL_INTERVAL (10.0)
+#define DPDK_MIN_STATS_POLL_INTERVAL (0.001) /* 1msec */
+
+#define DPDK_LINK_POLL_INTERVAL (3.0)
+#define DPDK_MIN_LINK_POLL_INTERVAL (0.001) /* 1msec */
+
+typedef struct
+{
+ u32 device;
+ u16 queue_id;
+} dpdk_device_and_queue_t;
+
+#ifndef DPDK_HQOS_DBG_BYPASS
+#define DPDK_HQOS_DBG_BYPASS 0
+#endif
+
+#ifndef HQOS_FLUSH_COUNT_THRESHOLD
+#define HQOS_FLUSH_COUNT_THRESHOLD 100000
+#endif
+
+typedef struct dpdk_device_config_hqos_t
+{
+ u32 hqos_thread;
+ u32 hqos_thread_valid;
+
+ u32 swq_size;
+ u32 burst_enq;
+ u32 burst_deq;
+
+ u32 pktfield0_slabpos;
+ u32 pktfield1_slabpos;
+ u32 pktfield2_slabpos;
+ u64 pktfield0_slabmask;
+ u64 pktfield1_slabmask;
+ u64 pktfield2_slabmask;
+ u32 tc_table[64];
+
+ struct rte_sched_port_params port;
+ struct rte_sched_subport_params *subport;
+ struct rte_sched_pipe_params *pipe;
+ uint32_t *pipe_map;
+} dpdk_device_config_hqos_t;
+
+int dpdk_hqos_validate_mask (u64 mask, u32 n);
+void dpdk_device_config_hqos_pipe_profile_default (dpdk_device_config_hqos_t *
+ hqos, u32 pipe_profile_id);
+void dpdk_device_config_hqos_default (dpdk_device_config_hqos_t * hqos);
+clib_error_t *dpdk_port_setup_hqos (dpdk_device_t * xd,
+ dpdk_device_config_hqos_t * hqos);
+void dpdk_hqos_metadata_set (dpdk_device_hqos_per_worker_thread_t * hqos,
+ struct rte_mbuf **pkts, u32 n_pkts);
+
+#define foreach_dpdk_device_config_item \
+ _ (num_rx_queues) \
+ _ (num_tx_queues) \
+ _ (num_rx_desc) \
+ _ (num_tx_desc) \
+ _ (rss_fn)
+
+typedef struct
+{
+ vlib_pci_addr_t pci_addr;
+ u8 is_blacklisted;
+ u8 vlan_strip_offload;
+#define DPDK_DEVICE_VLAN_STRIP_DEFAULT 0
+#define DPDK_DEVICE_VLAN_STRIP_OFF 1
+#define DPDK_DEVICE_VLAN_STRIP_ON 2
+
+#define _(x) uword x;
+ foreach_dpdk_device_config_item
+#undef _
+ clib_bitmap_t * workers;
+ u32 hqos_enabled;
+ dpdk_device_config_hqos_t hqos;
+} dpdk_device_config_t;
+
+typedef struct
+{
+
+ /* Config stuff */
+ u8 **eal_init_args;
+ u8 *eal_init_args_str;
+ u8 *uio_driver_name;
+ u8 no_multi_seg;
+ u8 enable_tcp_udp_checksum;
+
+ /* Required config parameters */
+ u8 coremask_set_manually;
+ u8 nchannels_set_manually;
+ u32 coremask;
+ u32 nchannels;
+ u32 num_mbufs;
+
+ /*
+ * format interface names ala xxxEthernet%d/%d/%d instead of
+ * xxxEthernet%x/%x/%x.
+ */
+ u8 interface_name_format_decimal;
+
+ /* per-device config */
+ dpdk_device_config_t default_devconf;
+ dpdk_device_config_t *dev_confs;
+ uword *device_config_index_by_pci_addr;
+
+} dpdk_config_main_t;
+
+dpdk_config_main_t dpdk_config_main;
+
+typedef struct
+{
+
+ /* Devices */
+ dpdk_device_t *devices;
+ dpdk_device_and_queue_t **devices_by_hqos_cpu;
+
+ /* per-thread recycle lists */
+ u32 **recycle;
+
+ /* per-thread buffer templates */
+ vlib_buffer_t *buffer_templates;
+
+ /* buffer flags template, configurable to enable/disable tcp / udp cksum */
+ u32 buffer_flags_template;
+
+ /* vlib buffer free list, must be same size as an rte_mbuf */
+ u32 vlib_buffer_free_list_index;
+
+ /* Ethernet input node index */
+ u32 ethernet_input_node_index;
+
+ /* pcap tracing [only works if (CLIB_DEBUG > 0)] */
+ int tx_pcap_enable;
+ pcap_main_t pcap_main;
+ u8 *pcap_filename;
+ u32 pcap_sw_if_index;
+ u32 pcap_pkts_to_capture;
+
+ /*
+ * flag indicating that a posted admin up/down
+ * (via post_sw_interface_set_flags) is in progress
+ */
+ u8 admin_up_down_in_progress;
+
+ u8 use_rss;
+
+ /* which cpus are running I/O TX */
+ int hqos_cpu_first_index;
+ int hqos_cpu_count;
+
+ /* control interval of dpdk link state and stat polling */
+ f64 link_state_poll_interval;
+ f64 stat_poll_interval;
+
+ /* Sleep for this many usec after each device poll */
+ u32 poll_sleep_usec;
+
+ /* convenience */
+ vlib_main_t *vlib_main;
+ vnet_main_t *vnet_main;
+ dpdk_config_main_t *conf;
+
+ /* mempool */
+ struct rte_mempool **pktmbuf_pools;
+
+ /* API message ID base */
+ u16 msg_id_base;
+} dpdk_main_t;
+
+extern dpdk_main_t dpdk_main;
+
+typedef struct
+{
+ u32 buffer_index;
+ u16 device_index;
+ u8 queue_index;
+ struct rte_mbuf mb;
+ /* Copy of VLIB buffer; packet data stored in pre_data. */
+ vlib_buffer_t buffer;
+} dpdk_tx_dma_trace_t;
+
+typedef struct
+{
+ u32 buffer_index;
+ u16 device_index;
+ u16 queue_index;
+ struct rte_mbuf mb;
+ vlib_buffer_t buffer; /* Copy of VLIB buffer; pkt data stored in pre_data. */
+ u8 data[256]; /* First 256 data bytes, used for hexdump */
+} dpdk_rx_dma_trace_t;
+
+void dpdk_device_setup (dpdk_device_t * xd);
+void dpdk_device_start (dpdk_device_t * xd);
+void dpdk_device_stop (dpdk_device_t * xd);
+
+#if DPDK_VOID_CALLBACK
+void dpdk_port_state_callback (uint8_t port_id,
+ enum rte_eth_event_type type, void *param);
+#else
+int dpdk_port_state_callback (uint8_t port_id,
+ enum rte_eth_event_type type,
+ void *param, void *ret_param);
+#endif
+
+#define foreach_dpdk_error \
+ _(NONE, "no error") \
+ _(RX_PACKET_ERROR, "Rx packet errors") \
+ _(RX_BAD_FCS, "Rx bad fcs") \
+ _(IP_CHECKSUM_ERROR, "Rx ip checksum errors") \
+ _(RX_ALLOC_FAIL, "rx buf alloc from free list failed") \
+ _(RX_ALLOC_NO_PHYSMEM, "rx buf alloc failed no physmem") \
+ _(RX_ALLOC_DROP_PKTS, "rx packets dropped due to alloc error")
+
+typedef enum
+{
+#define _(f,s) DPDK_ERROR_##f,
+ foreach_dpdk_error
+#undef _
+ DPDK_N_ERROR,
+} dpdk_error_t;
+
+void dpdk_update_link_state (dpdk_device_t * xd, f64 now);
+
+format_function_t format_dpdk_device_name;
+format_function_t format_dpdk_device;
+format_function_t format_dpdk_device_errors;
+format_function_t format_dpdk_tx_dma_trace;
+format_function_t format_dpdk_rx_dma_trace;
+format_function_t format_dpdk_rte_mbuf;
+format_function_t format_dpdk_rx_rte_mbuf;
+unformat_function_t unformat_dpdk_log_level;
+clib_error_t *unformat_rss_fn (unformat_input_t * input, uword * rss_fn);
+clib_error_t *unformat_hqos (unformat_input_t * input,
+ dpdk_device_config_hqos_t * hqos);
+
+uword
+admin_up_down_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt, vlib_frame_t * f);
+
+clib_error_t *dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs,
+ unsigned socket_id);
+
+#if CLI_DEBUG
+int dpdk_buffer_validate_trajectory_all (u32 * uninitialized);
+void dpdk_buffer_poison_trajectory_all (void);
+#endif
+
+#endif /* __included_dpdk_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/dpdk/device/dpdk_priv.h b/src/plugins/dpdk/device/dpdk_priv.h
new file mode 100644
index 00000000..52b4ca4b
--- /dev/null
+++ b/src/plugins/dpdk/device/dpdk_priv.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define rte_mbuf_from_vlib_buffer(x) (((struct rte_mbuf *)x) - 1)
+#define vlib_buffer_from_rte_mbuf(x) ((vlib_buffer_t *)(x+1))
+
+#define DPDK_NB_RX_DESC_DEFAULT 1024
+#define DPDK_NB_TX_DESC_DEFAULT 1024
+#define DPDK_NB_RX_DESC_VIRTIO 256
+#define DPDK_NB_TX_DESC_VIRTIO 256
+
+#define I40E_DEV_ID_SFP_XL710 0x1572
+#define I40E_DEV_ID_QSFP_A 0x1583
+#define I40E_DEV_ID_QSFP_B 0x1584
+#define I40E_DEV_ID_QSFP_C 0x1585
+#define I40E_DEV_ID_10G_BASE_T 0x1586
+#define I40E_DEV_ID_VF 0x154C
+
+/* These args appear by themselves */
+#define foreach_eal_double_hyphen_predicate_arg \
+_(no-shconf) \
+_(no-hpet) \
+_(no-huge) \
+_(vmware-tsc-map)
+
+#define foreach_eal_single_hyphen_mandatory_arg \
+_(coremask, c) \
+_(nchannels, n) \
+
+#define foreach_eal_single_hyphen_arg \
+_(blacklist, b) \
+_(mem-alloc-request, m) \
+_(force-ranks, r)
+
+/* These args are preceeded by "--" and followed by a single string */
+#define foreach_eal_double_hyphen_arg \
+_(huge-dir) \
+_(proc-type) \
+_(file-prefix) \
+_(vdev)
+
+static inline void
+dpdk_get_xstats (dpdk_device_t * xd)
+{
+ int len;
+ if ((len = rte_eth_xstats_get (xd->device_index, NULL, 0)) > 0)
+ {
+ vec_validate (xd->xstats, len - 1);
+ vec_validate (xd->last_cleared_xstats, len - 1);
+
+ len =
+ rte_eth_xstats_get (xd->device_index, xd->xstats,
+ vec_len (xd->xstats));
+
+ ASSERT (vec_len (xd->xstats) == len);
+ ASSERT (vec_len (xd->last_cleared_xstats) == len);
+
+ _vec_len (xd->xstats) = len;
+ _vec_len (xd->last_cleared_xstats) = len;
+
+ }
+}
+
+
+static inline void
+dpdk_update_counters (dpdk_device_t * xd, f64 now)
+{
+ vlib_simple_counter_main_t *cm;
+ vnet_main_t *vnm = vnet_get_main ();
+ u32 thread_index = vlib_get_thread_index ();
+ u64 rxerrors, last_rxerrors;
+
+ /* only update counters for PMD interfaces */
+ if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0)
+ return;
+
+ xd->time_last_stats_update = now ? now : xd->time_last_stats_update;
+ clib_memcpy (&xd->last_stats, &xd->stats, sizeof (xd->last_stats));
+ rte_eth_stats_get (xd->device_index, &xd->stats);
+
+ /* maybe bump interface rx no buffer counter */
+ if (PREDICT_FALSE (xd->stats.rx_nombuf != xd->last_stats.rx_nombuf))
+ {
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_RX_NO_BUF);
+
+ vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index,
+ xd->stats.rx_nombuf -
+ xd->last_stats.rx_nombuf);
+ }
+
+ /* missed pkt counter */
+ if (PREDICT_FALSE (xd->stats.imissed != xd->last_stats.imissed))
+ {
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_RX_MISS);
+
+ vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index,
+ xd->stats.imissed -
+ xd->last_stats.imissed);
+ }
+ rxerrors = xd->stats.ierrors;
+ last_rxerrors = xd->last_stats.ierrors;
+
+ if (PREDICT_FALSE (rxerrors != last_rxerrors))
+ {
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_RX_ERROR);
+
+ vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index,
+ rxerrors - last_rxerrors);
+ }
+
+ dpdk_get_xstats (xd);
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c
new file mode 100644
index 00000000..697bdbe5
--- /dev/null
+++ b/src/plugins/dpdk/device/format.c
@@ -0,0 +1,804 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/format.h>
+#include <vlib/unix/cj.h>
+#include <assert.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <dpdk/device/dpdk.h>
+
+#include <dpdk/device/dpdk_priv.h>
+#include <vppinfra/error.h>
+
+#define foreach_dpdk_counter \
+ _ (tx_frames_ok, opackets) \
+ _ (tx_bytes_ok, obytes) \
+ _ (tx_errors, oerrors) \
+ _ (rx_frames_ok, ipackets) \
+ _ (rx_bytes_ok, ibytes) \
+ _ (rx_errors, ierrors) \
+ _ (rx_missed, imissed) \
+ _ (rx_no_bufs, rx_nombuf)
+
+#define foreach_dpdk_q_counter \
+ _ (rx_frames_ok, q_ipackets) \
+ _ (tx_frames_ok, q_opackets) \
+ _ (rx_bytes_ok, q_ibytes) \
+ _ (tx_bytes_ok, q_obytes) \
+ _ (rx_errors, q_errors)
+
+#define foreach_dpdk_rss_hf \
+ _(ETH_RSS_FRAG_IPV4, "ipv4-frag") \
+ _(ETH_RSS_NONFRAG_IPV4_TCP, "ipv4-tcp") \
+ _(ETH_RSS_NONFRAG_IPV4_UDP, "ipv4-udp") \
+ _(ETH_RSS_NONFRAG_IPV4_SCTP, "ipv4-sctp") \
+ _(ETH_RSS_NONFRAG_IPV4_OTHER, "ipv4-other") \
+ _(ETH_RSS_IPV4, "ipv4") \
+ _(ETH_RSS_IPV6_TCP_EX, "ipv6-tcp-ex") \
+ _(ETH_RSS_IPV6_UDP_EX, "ipv6-udp-ex") \
+ _(ETH_RSS_FRAG_IPV6, "ipv6-frag") \
+ _(ETH_RSS_NONFRAG_IPV6_TCP, "ipv6-tcp") \
+ _(ETH_RSS_NONFRAG_IPV6_UDP, "ipv6-udp") \
+ _(ETH_RSS_NONFRAG_IPV6_SCTP, "ipv6-sctp") \
+ _(ETH_RSS_NONFRAG_IPV6_OTHER, "ipv6-other") \
+ _(ETH_RSS_L2_PAYLOAD, "l2-payload") \
+ _(ETH_RSS_IPV6_EX, "ipv6-ex") \
+ _(ETH_RSS_IPV6, "ipv6")
+
+
+#define foreach_dpdk_rx_offload_caps \
+ _(DEV_RX_OFFLOAD_VLAN_STRIP, "vlan-strip") \
+ _(DEV_RX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \
+ _(DEV_RX_OFFLOAD_UDP_CKSUM , "udp-cksum") \
+ _(DEV_RX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \
+ _(DEV_RX_OFFLOAD_TCP_LRO , "rcp-lro") \
+ _(DEV_RX_OFFLOAD_QINQ_STRIP, "qinq-strip")
+
+#define foreach_dpdk_tx_offload_caps \
+ _(DEV_TX_OFFLOAD_VLAN_INSERT, "vlan-insert") \
+ _(DEV_TX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \
+ _(DEV_TX_OFFLOAD_UDP_CKSUM , "udp-cksum") \
+ _(DEV_TX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \
+ _(DEV_TX_OFFLOAD_SCTP_CKSUM , "sctp-cksum") \
+ _(DEV_TX_OFFLOAD_TCP_TSO , "tcp-tso") \
+ _(DEV_TX_OFFLOAD_UDP_TSO , "udp-tso") \
+ _(DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM, "outer-ipv4-cksum") \
+ _(DEV_TX_OFFLOAD_QINQ_INSERT, "qinq-insert")
+
+#define foreach_dpdk_pkt_rx_offload_flag \
+ _ (PKT_RX_VLAN_PKT, "RX packet is a 802.1q VLAN packet") \
+ _ (PKT_RX_RSS_HASH, "RX packet with RSS hash result") \
+ _ (PKT_RX_FDIR, "RX packet with FDIR infos") \
+ _ (PKT_RX_L4_CKSUM_BAD, "L4 cksum of RX pkt. is not OK") \
+ _ (PKT_RX_IP_CKSUM_BAD, "IP cksum of RX pkt. is not OK") \
+ _ (PKT_RX_VLAN_STRIPPED, "RX packet VLAN tag stripped") \
+ _ (PKT_RX_IP_CKSUM_GOOD, "IP cksum of RX pkt. is valid") \
+ _ (PKT_RX_L4_CKSUM_GOOD, "L4 cksum of RX pkt. is valid") \
+ _ (PKT_RX_IEEE1588_PTP, "RX IEEE1588 L2 Ethernet PT Packet") \
+ _ (PKT_RX_IEEE1588_TMST, "RX IEEE1588 L2/L4 timestamped packet") \
+ _ (PKT_RX_QINQ_STRIPPED, "RX packet QinQ tags stripped")
+
+#define foreach_dpdk_pkt_type \
+ _ (L2, ETHER, "Ethernet packet") \
+ _ (L2, ETHER_TIMESYNC, "Ethernet packet for time sync") \
+ _ (L2, ETHER_ARP, "ARP packet") \
+ _ (L2, ETHER_LLDP, "LLDP (Link Layer Discovery Protocol) packet") \
+ _ (L2, ETHER_NSH, "NSH (Network Service Header) packet") \
+ _ (L2, ETHER_VLAN, "VLAN packet") \
+ _ (L2, ETHER_QINQ, "QinQ packet") \
+ _ (L3, IPV4, "IPv4 packet without extension headers") \
+ _ (L3, IPV4_EXT, "IPv4 packet with extension headers") \
+ _ (L3, IPV4_EXT_UNKNOWN, "IPv4 packet with or without extension headers") \
+ _ (L3, IPV6, "IPv6 packet without extension headers") \
+ _ (L3, IPV6_EXT, "IPv6 packet with extension headers") \
+ _ (L3, IPV6_EXT_UNKNOWN, "IPv6 packet with or without extension headers") \
+ _ (L4, TCP, "TCP packet") \
+ _ (L4, UDP, "UDP packet") \
+ _ (L4, FRAG, "Fragmented IP packet") \
+ _ (L4, SCTP, "SCTP (Stream Control Transmission Protocol) packet") \
+ _ (L4, ICMP, "ICMP packet") \
+ _ (L4, NONFRAG, "Non-fragmented IP packet") \
+ _ (TUNNEL, GRE, "GRE tunneling packet") \
+ _ (TUNNEL, VXLAN, "VXLAN tunneling packet") \
+ _ (TUNNEL, NVGRE, "NVGRE Tunneling packet") \
+ _ (TUNNEL, GENEVE, "GENEVE Tunneling packet") \
+ _ (TUNNEL, GRENAT, "Teredo, VXLAN or GRE Tunneling packet") \
+ _ (INNER_L2, ETHER, "Inner Ethernet packet") \
+ _ (INNER_L2, ETHER_VLAN, "Inner Ethernet packet with VLAN") \
+ _ (INNER_L3, IPV4, "Inner IPv4 packet without extension headers") \
+ _ (INNER_L3, IPV4_EXT, "Inner IPv4 packet with extension headers") \
+ _ (INNER_L3, IPV4_EXT_UNKNOWN, "Inner IPv4 packet with or without extension headers") \
+ _ (INNER_L3, IPV6, "Inner IPv6 packet without extension headers") \
+ _ (INNER_L3, IPV6_EXT, "Inner IPv6 packet with extension headers") \
+ _ (INNER_L3, IPV6_EXT_UNKNOWN, "Inner IPv6 packet with or without extension headers") \
+ _ (INNER_L4, TCP, "Inner TCP packet") \
+ _ (INNER_L4, UDP, "Inner UDP packet") \
+ _ (INNER_L4, FRAG, "Inner fagmented IP packet") \
+ _ (INNER_L4, SCTP, "Inner SCTP (Stream Control Transmission Protocol) packet") \
+ _ (INNER_L4, ICMP, "Inner ICMP packet") \
+ _ (INNER_L4, NONFRAG, "Inner non-fragmented IP packet")
+
+#define foreach_dpdk_pkt_tx_offload_flag \
+ _ (PKT_TX_VLAN_PKT, "TX packet is a 802.1q VLAN packet") \
+ _ (PKT_TX_IP_CKSUM, "IP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_TCP_CKSUM, "TCP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_SCTP_CKSUM, "SCTP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_IEEE1588_TMST, "TX IEEE1588 packet to timestamp")
+
+#define foreach_dpdk_pkt_offload_flag \
+ foreach_dpdk_pkt_rx_offload_flag \
+ foreach_dpdk_pkt_tx_offload_flag
+
+#define foreach_dpdk_log_level \
+ _ (EMERG, "emergency") \
+ _ (ALERT, "alert") \
+ _ (CRIT, "critical") \
+ _ (ERR, "error") \
+ _ (WARNING, "warning") \
+ _ (NOTICE, "notice") \
+ _ (INFO, "info") \
+ _ (DEBUG, "debug")
+
+u8 *
+format_dpdk_device_name (u8 * s, va_list * args)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ char *devname_format;
+ char *device_name;
+ u32 i = va_arg (*args, u32);
+ struct rte_eth_dev_info dev_info;
+ u8 *ret;
+
+ if (dm->conf->interface_name_format_decimal)
+ devname_format = "%s%d/%d/%d";
+ else
+ devname_format = "%s%x/%x/%x";
+
+ switch (dm->devices[i].port_type)
+ {
+ case VNET_DPDK_PORT_TYPE_ETH_1G:
+ device_name = "GigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_10G:
+ device_name = "TenGigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_25G:
+ device_name = "TwentyFiveGigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_40G:
+ device_name = "FortyGigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_50G:
+ device_name = "FiftyGigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_100G:
+ device_name = "HundredGigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_BOND:
+ return format (s, "BondEthernet%d", dm->devices[i].port_id);
+
+ case VNET_DPDK_PORT_TYPE_ETH_SWITCH:
+ device_name = "EthernetSwitch";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_VF:
+ device_name = "VirtualFunctionEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_AF_PACKET:
+ rte_eth_dev_info_get (i, &dev_info);
+ return format (s, "af_packet%d", dm->devices[i].port_id);
+
+ case VNET_DPDK_PORT_TYPE_VIRTIO_USER:
+ device_name = "VirtioUser";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_VHOST_ETHER:
+ device_name = "VhostEthernet";
+ break;
+
+ default:
+ case VNET_DPDK_PORT_TYPE_UNKNOWN:
+ device_name = "UnknownEthernet";
+ break;
+ }
+
+ rte_eth_dev_info_get (i, &dev_info);
+
+ if (dev_info.pci_dev)
+ ret = format (s, devname_format, device_name, dev_info.pci_dev->addr.bus,
+ dev_info.pci_dev->addr.devid,
+ dev_info.pci_dev->addr.function);
+ else
+ ret = format (s, "%s%d", device_name, dm->devices[i].device_index);
+
+ if (dm->devices[i].interface_name_suffix)
+ return format (ret, "/%s", dm->devices[i].interface_name_suffix);
+ return ret;
+}
+
+static u8 *
+format_dpdk_device_type (u8 * s, va_list * args)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ char *dev_type;
+ u32 i = va_arg (*args, u32);
+
+ switch (dm->devices[i].pmd)
+ {
+ case VNET_DPDK_PMD_E1000EM:
+ dev_type = "Intel 82540EM (e1000)";
+ break;
+
+ case VNET_DPDK_PMD_IGB:
+ dev_type = "Intel e1000";
+ break;
+
+ case VNET_DPDK_PMD_I40E:
+ dev_type = "Intel X710/XL710 Family";
+ break;
+
+ case VNET_DPDK_PMD_I40EVF:
+ dev_type = "Intel X710/XL710 Family VF";
+ break;
+
+ case VNET_DPDK_PMD_FM10K:
+ dev_type = "Intel FM10000 Family Ethernet Switch";
+ break;
+
+ case VNET_DPDK_PMD_IGBVF:
+ dev_type = "Intel e1000 VF";
+ break;
+
+ case VNET_DPDK_PMD_VIRTIO:
+ dev_type = "Red Hat Virtio";
+ break;
+
+ case VNET_DPDK_PMD_IXGBEVF:
+ dev_type = "Intel 82599 VF";
+ break;
+
+ case VNET_DPDK_PMD_IXGBE:
+ dev_type = "Intel 82599";
+ break;
+
+ case VNET_DPDK_PMD_ENIC:
+ dev_type = "Cisco VIC";
+ break;
+
+ case VNET_DPDK_PMD_CXGBE:
+ dev_type = "Chelsio T4/T5";
+ break;
+
+ case VNET_DPDK_PMD_MLX4:
+ dev_type = "Mellanox ConnectX-3 Family";
+ break;
+
+ case VNET_DPDK_PMD_MLX5:
+ dev_type = "Mellanox ConnectX-4 Family";
+ break;
+
+ case VNET_DPDK_PMD_VMXNET3:
+ dev_type = "VMware VMXNET3";
+ break;
+
+ case VNET_DPDK_PMD_AF_PACKET:
+ dev_type = "af_packet";
+ break;
+
+ case VNET_DPDK_PMD_BOND:
+ dev_type = "Ethernet Bonding";
+ break;
+
+ case VNET_DPDK_PMD_DPAA2:
+ dev_type = "NXP DPAA2 Mac";
+ break;
+
+ case VNET_DPDK_PMD_VIRTIO_USER:
+ dev_type = "Virtio User";
+ break;
+
+ case VNET_DPDK_PMD_THUNDERX:
+ dev_type = "Cavium ThunderX";
+ break;
+
+ case VNET_DPDK_PMD_VHOST_ETHER:
+ dev_type = "VhostEthernet";
+ break;
+
+ default:
+ case VNET_DPDK_PMD_UNKNOWN:
+ dev_type = "### UNKNOWN ###";
+ break;
+ }
+
+ return format (s, dev_type);
+}
+
+static u8 *
+format_dpdk_link_status (u8 * s, va_list * args)
+{
+ dpdk_device_t *xd = va_arg (*args, dpdk_device_t *);
+ struct rte_eth_link *l = &xd->link;
+ vnet_main_t *vnm = vnet_get_main ();
+ vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, xd->hw_if_index);
+
+ s = format (s, "%s ", l->link_status ? "up" : "down");
+ if (l->link_status)
+ {
+ u32 promisc = rte_eth_promiscuous_get (xd->device_index);
+
+ s = format (s, "%s duplex ", (l->link_duplex == ETH_LINK_FULL_DUPLEX) ?
+ "full" : "half");
+ s = format (s, "speed %u mtu %d %s\n", l->link_speed,
+ hi->max_packet_bytes, promisc ? " promisc" : "");
+ }
+ else
+ s = format (s, "\n");
+
+ return s;
+}
+
+#define _line_len 72
+#define _(v, str) \
+if (bitmap & v) { \
+ if (format_get_indent (s) > next_split ) { \
+ next_split += _line_len; \
+ s = format(s,"\n%U", format_white_space, indent); \
+ } \
+ s = format(s, "%s ", str); \
+}
+
+static u8 *
+format_dpdk_rss_hf_name (u8 * s, va_list * args)
+{
+ u64 bitmap = va_arg (*args, u64);
+ int next_split = _line_len;
+ int indent = format_get_indent (s);
+
+ if (!bitmap)
+ return format (s, "none");
+
+ foreach_dpdk_rss_hf return s;
+}
+
+static u8 *
+format_dpdk_rx_offload_caps (u8 * s, va_list * args)
+{
+ u32 bitmap = va_arg (*args, u32);
+ int next_split = _line_len;
+ int indent = format_get_indent (s);
+
+ if (!bitmap)
+ return format (s, "none");
+
+ foreach_dpdk_rx_offload_caps return s;
+}
+
+static u8 *
+format_dpdk_tx_offload_caps (u8 * s, va_list * args)
+{
+ u32 bitmap = va_arg (*args, u32);
+ int next_split = _line_len;
+ int indent = format_get_indent (s);
+ if (!bitmap)
+ return format (s, "none");
+
+ foreach_dpdk_tx_offload_caps return s;
+}
+
+#undef _line_len
+#undef _
+
+u8 *
+format_dpdk_device_errors (u8 * s, va_list * args)
+{
+ dpdk_device_t *xd = va_arg (*args, dpdk_device_t *);
+ clib_error_t *e;
+ uword indent = format_get_indent (s);
+
+ vec_foreach (e, xd->errors)
+ {
+ s = format (s, "%U%v\n", format_white_space, indent, e->what);
+ }
+ return s;
+}
+
+u8 *
+format_dpdk_device (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ int verbose = va_arg (*args, int);
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, dev_instance);
+ uword indent = format_get_indent (s);
+ f64 now = vlib_time_now (dm->vlib_main);
+ struct rte_eth_dev_info di;
+
+ dpdk_update_counters (xd, now);
+ dpdk_update_link_state (xd, now);
+
+ s = format (s, "%U\n%Ucarrier %U",
+ format_dpdk_device_type, xd->device_index,
+ format_white_space, indent + 2, format_dpdk_link_status, xd);
+
+ rte_eth_dev_info_get (xd->device_index, &di);
+
+ if (verbose > 1 && xd->flags & DPDK_DEVICE_FLAG_PMD)
+ {
+ struct rte_pci_device *pci;
+ struct rte_eth_rss_conf rss_conf;
+ int vlan_off;
+ int retval;
+
+ rss_conf.rss_key = 0;
+ retval = rte_eth_dev_rss_hash_conf_get (xd->device_index, &rss_conf);
+ if (retval < 0)
+ clib_warning ("rte_eth_dev_rss_hash_conf_get returned %d", retval);
+ pci = di.pci_dev;
+
+ if (pci)
+ s =
+ format (s,
+ "%Upci id: device %04x:%04x subsystem %04x:%04x\n"
+ "%Upci address: %04x:%02x:%02x.%02x\n",
+ format_white_space, indent + 2, pci->id.vendor_id,
+ pci->id.device_id, pci->id.subsystem_vendor_id,
+ pci->id.subsystem_device_id, format_white_space, indent + 2,
+ pci->addr.domain, pci->addr.bus, pci->addr.devid,
+ pci->addr.function);
+ s =
+ format (s, "%Umax rx packet len: %d\n", format_white_space,
+ indent + 2, di.max_rx_pktlen);
+ s =
+ format (s, "%Umax num of queues: rx %d tx %d\n", format_white_space,
+ indent + 2, di.max_rx_queues, di.max_tx_queues);
+ s =
+ format (s, "%Upromiscuous: unicast %s all-multicast %s\n",
+ format_white_space, indent + 2,
+ rte_eth_promiscuous_get (xd->device_index) ? "on" : "off",
+ rte_eth_allmulticast_get (xd->device_index) ? "on" : "off");
+ vlan_off = rte_eth_dev_get_vlan_offload (xd->device_index);
+ s = format (s, "%Uvlan offload: strip %s filter %s qinq %s\n",
+ format_white_space, indent + 2,
+ vlan_off & ETH_VLAN_STRIP_OFFLOAD ? "on" : "off",
+ vlan_off & ETH_VLAN_FILTER_OFFLOAD ? "on" : "off",
+ vlan_off & ETH_VLAN_EXTEND_OFFLOAD ? "on" : "off");
+ s = format (s, "%Urx offload caps: %U\n",
+ format_white_space, indent + 2,
+ format_dpdk_rx_offload_caps, di.rx_offload_capa);
+ s = format (s, "%Utx offload caps: %U\n",
+ format_white_space, indent + 2,
+ format_dpdk_tx_offload_caps, di.tx_offload_capa);
+ s = format (s, "%Urss active: %U\n"
+ "%Urss supported: %U\n",
+ format_white_space, indent + 2,
+ format_dpdk_rss_hf_name, rss_conf.rss_hf,
+ format_white_space, indent + 2,
+ format_dpdk_rss_hf_name, di.flow_type_rss_offloads);
+ }
+
+ s = format (s, "%Urx queues %d, rx desc %d, tx queues %d, tx desc %d\n",
+ format_white_space, indent + 2,
+ xd->rx_q_used, xd->nb_rx_desc, xd->tx_q_used, xd->nb_tx_desc);
+
+ if (xd->cpu_socket > -1)
+ s = format (s, "%Ucpu socket %d\n",
+ format_white_space, indent + 2, xd->cpu_socket);
+
+ /* $$$ MIB counters */
+ {
+#define _(N, V) \
+ if ((xd->stats.V - xd->last_cleared_stats.V) != 0) { \
+ s = format (s, "\n%U%-40U%16Ld", \
+ format_white_space, indent + 2, \
+ format_c_identifier, #N, \
+ xd->stats.V - xd->last_cleared_stats.V); \
+ } \
+
+ foreach_dpdk_counter
+#undef _
+ }
+
+ u8 *xs = 0;
+ u32 i = 0;
+ struct rte_eth_xstat *xstat, *last_xstat;
+ struct rte_eth_xstat_name *xstat_names = 0;
+ int len = rte_eth_xstats_get_names (xd->device_index, NULL, 0);
+ vec_validate (xstat_names, len - 1);
+ rte_eth_xstats_get_names (xd->device_index, xstat_names, len);
+
+ ASSERT (vec_len (xd->xstats) == vec_len (xd->last_cleared_xstats));
+
+ /* *INDENT-OFF* */
+ vec_foreach_index(i, xd->xstats)
+ {
+ u64 delta = 0;
+ xstat = vec_elt_at_index(xd->xstats, i);
+ last_xstat = vec_elt_at_index(xd->last_cleared_xstats, i);
+
+ delta = xstat->value - last_xstat->value;
+ if (verbose == 2 || (verbose && delta))
+ {
+ /* format_c_identifier doesn't like c strings inside vector */
+ u8 * name = format(0,"%s", xstat_names[i].name);
+ xs = format(xs, "\n%U%-38U%16Ld",
+ format_white_space, indent + 4,
+ format_c_identifier, name, delta);
+ vec_free(name);
+ }
+ }
+ /* *INDENT-ON* */
+
+ vec_free (xstat_names);
+
+ if (xs)
+ {
+ s = format (s, "\n%Uextended stats:%v",
+ format_white_space, indent + 2, xs);
+ vec_free (xs);
+ }
+
+ if (vec_len (xd->errors))
+ {
+ s = format (s, "%UErrors:\n %U", format_white_space, indent,
+ format_dpdk_device_errors, xd);
+ }
+
+ return s;
+}
+
+u8 *
+format_dpdk_tx_dma_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main ();
+ dpdk_tx_dma_trace_t *t = va_arg (*va, dpdk_tx_dma_trace_t *);
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, t->device_index);
+ uword indent = format_get_indent (s);
+ vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);
+
+ s = format (s, "%U tx queue %d",
+ format_vnet_sw_interface_name, vnm, sw, t->queue_index);
+
+ s = format (s, "\n%Ubuffer 0x%x: %U",
+ format_white_space, indent,
+ t->buffer_index, format_vlib_buffer, &t->buffer);
+
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_ethernet_header_with_length, t->buffer.pre_data,
+ sizeof (t->buffer.pre_data));
+
+ return s;
+}
+
+u8 *
+format_dpdk_rx_dma_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main ();
+ dpdk_rx_dma_trace_t *t = va_arg (*va, dpdk_rx_dma_trace_t *);
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, t->device_index);
+ format_function_t *f;
+ uword indent = format_get_indent (s);
+ vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);
+
+ s = format (s, "%U rx queue %d",
+ format_vnet_sw_interface_name, vnm, sw, t->queue_index);
+
+ s = format (s, "\n%Ubuffer 0x%x: %U",
+ format_white_space, indent,
+ t->buffer_index, format_vlib_buffer, &t->buffer);
+
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ format_dpdk_rte_mbuf, &t->mb, &t->data);
+
+ if (vm->trace_main.verbose)
+ {
+ s = format (s, "\n%UPacket Dump%s", format_white_space, indent + 2,
+ t->mb.data_len > sizeof (t->data) ? " (truncated)" : "");
+ s = format (s, "\n%U%U", format_white_space, indent + 4,
+ format_hexdump, &t->data,
+ t->mb.data_len >
+ sizeof (t->data) ? sizeof (t->data) : t->mb.data_len);
+ }
+ f = node->format_buffer;
+ if (!f)
+ f = format_hex_bytes;
+ s = format (s, "\n%U%U", format_white_space, indent,
+ f, t->buffer.pre_data, sizeof (t->buffer.pre_data));
+
+ return s;
+}
+
+
+static inline u8 *
+format_dpdk_pkt_types (u8 * s, va_list * va)
+{
+ u32 *pkt_types = va_arg (*va, u32 *);
+ uword indent __attribute__ ((unused)) = format_get_indent (s) + 2;
+
+ if (!*pkt_types)
+ return s;
+
+ s = format (s, "Packet Types");
+
+#define _(L, F, S) \
+ if ((*pkt_types & RTE_PTYPE_##L##_MASK) == RTE_PTYPE_##L##_##F) \
+ { \
+ s = format (s, "\n%U%s (0x%04x) %s", format_white_space, indent, \
+ "RTE_PTYPE_" #L "_" #F, RTE_PTYPE_##L##_##F, S); \
+ }
+
+ foreach_dpdk_pkt_type
+#undef _
+ return s;
+}
+
+static inline u8 *
+format_dpdk_pkt_offload_flags (u8 * s, va_list * va)
+{
+ u64 *ol_flags = va_arg (*va, u64 *);
+ uword indent = format_get_indent (s) + 2;
+
+ if (!*ol_flags)
+ return s;
+
+ s = format (s, "Packet Offload Flags");
+
+#define _(F, S) \
+ if (*ol_flags & F) \
+ { \
+ s = format (s, "\n%U%s (0x%04x) %s", \
+ format_white_space, indent, #F, F, S); \
+ }
+
+ foreach_dpdk_pkt_offload_flag
+#undef _
+ return s;
+}
+
+u8 *
+format_dpdk_rte_mbuf_vlan (u8 * s, va_list * va)
+{
+ ethernet_vlan_header_tv_t *vlan_hdr =
+ va_arg (*va, ethernet_vlan_header_tv_t *);
+
+ if (clib_net_to_host_u16 (vlan_hdr->type) == ETHERNET_TYPE_DOT1AD)
+ {
+ s = format (s, "%U 802.1q vlan ",
+ format_ethernet_vlan_tci,
+ clib_net_to_host_u16 (vlan_hdr->priority_cfi_and_id));
+ vlan_hdr++;
+ }
+
+ s = format (s, "%U",
+ format_ethernet_vlan_tci,
+ clib_net_to_host_u16 (vlan_hdr->priority_cfi_and_id));
+
+ return s;
+}
+
+u8 *
+format_dpdk_rte_mbuf (u8 * s, va_list * va)
+{
+ struct rte_mbuf *mb = va_arg (*va, struct rte_mbuf *);
+ ethernet_header_t *eth_hdr = va_arg (*va, ethernet_header_t *);
+ uword indent = format_get_indent (s) + 2;
+
+ s = format (s, "PKT MBUF: port %d, nb_segs %d, pkt_len %d"
+ "\n%Ubuf_len %d, data_len %d, ol_flags 0x%x, data_off %d, phys_addr 0x%x"
+ "\n%Upacket_type 0x%x",
+ mb->port, mb->nb_segs, mb->pkt_len,
+ format_white_space, indent,
+ mb->buf_len, mb->data_len, mb->ol_flags, mb->data_off,
+ mb->buf_physaddr, format_white_space, indent, mb->packet_type);
+
+ if (mb->ol_flags)
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_dpdk_pkt_offload_flags, &mb->ol_flags);
+
+ if ((mb->ol_flags & PKT_RX_VLAN_PKT) &&
+ ((mb->ol_flags & (PKT_RX_VLAN_STRIPPED | PKT_RX_QINQ_STRIPPED)) == 0))
+ {
+ ethernet_vlan_header_tv_t *vlan_hdr =
+ ((ethernet_vlan_header_tv_t *) & (eth_hdr->type));
+ s = format (s, " %U", format_dpdk_rte_mbuf_vlan, vlan_hdr);
+ }
+
+ if (mb->packet_type)
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_dpdk_pkt_types, &mb->packet_type);
+
+ return s;
+}
+
+clib_error_t *
+unformat_rss_fn (unformat_input_t * input, uword * rss_fn)
+{
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (0)
+ ;
+#undef _
+#define _(f, s) \
+ else if (unformat (input, s)) \
+ *rss_fn |= f;
+
+ foreach_dpdk_rss_hf
+#undef _
+ else
+ {
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ }
+ return 0;
+}
+
+uword
+unformat_dpdk_log_level (unformat_input_t * input, va_list * args)
+{
+ u32 *r = va_arg (*args, u32 *);
+
+ if (0);
+#define _(v,s) else if (unformat (input, s)) *r = RTE_LOG_##v;
+ foreach_dpdk_log_level
+#undef _
+ else
+ return 0;
+ return 1;
+}
+
+clib_error_t *
+unformat_hqos (unformat_input_t * input, dpdk_device_config_hqos_t * hqos)
+{
+ clib_error_t *error = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "hqos-thread %u", &hqos->hqos_thread))
+ hqos->hqos_thread_valid = 1;
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ break;
+ }
+ }
+
+ return error;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c
new file mode 100755
index 00000000..acf712ff
--- /dev/null
+++ b/src/plugins/dpdk/device/init.c
@@ -0,0 +1,1589 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/bitmap.h>
+#include <vppinfra/linux/sysfs.h>
+#include <vlib/unix/unix.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <dpdk/device/dpdk.h>
+#include <vlib/pci/pci.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include <dpdk/device/dpdk_priv.h>
+
+dpdk_main_t dpdk_main;
+
+#define LINK_STATE_ELOGS 0
+
+/* Port configuration, mildly modified Intel app values */
+
+static struct rte_eth_conf port_conf_template = {
+ .rxmode = {
+ .split_hdr_size = 0,
+ .header_split = 0, /**< Header Split disabled */
+ .hw_ip_checksum = 0, /**< IP checksum offload disabled */
+ .hw_vlan_filter = 0, /**< VLAN filtering disabled */
+ .hw_strip_crc = 0, /**< CRC stripped by hardware */
+ },
+ .txmode = {
+ .mq_mode = ETH_MQ_TX_NONE,
+ },
+};
+
+static dpdk_port_type_t
+port_type_from_speed_capa (struct rte_eth_dev_info *dev_info)
+{
+
+ if (dev_info->speed_capa & ETH_LINK_SPEED_100G)
+ return VNET_DPDK_PORT_TYPE_ETH_100G;
+ else if (dev_info->speed_capa & ETH_LINK_SPEED_50G)
+ return VNET_DPDK_PORT_TYPE_ETH_50G;
+ else if (dev_info->speed_capa & ETH_LINK_SPEED_40G)
+ return VNET_DPDK_PORT_TYPE_ETH_40G;
+ else if (dev_info->speed_capa & ETH_LINK_SPEED_25G)
+ return VNET_DPDK_PORT_TYPE_ETH_25G;
+ else if (dev_info->speed_capa & ETH_LINK_SPEED_10G)
+ return VNET_DPDK_PORT_TYPE_ETH_10G;
+ else if (dev_info->speed_capa & ETH_LINK_SPEED_1G)
+ return VNET_DPDK_PORT_TYPE_ETH_1G;
+
+ return VNET_DPDK_PORT_TYPE_UNKNOWN;
+}
+
+
+static u32
+dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+ u32 old = 0;
+
+ if (ETHERNET_INTERFACE_FLAG_CONFIG_PROMISC (flags))
+ {
+ old = (xd->flags & DPDK_DEVICE_FLAG_PROMISC) != 0;
+
+ if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL)
+ xd->flags |= DPDK_DEVICE_FLAG_PROMISC;
+ else
+ xd->flags &= ~DPDK_DEVICE_FLAG_PROMISC;
+
+ if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+ {
+ if (xd->flags & DPDK_DEVICE_FLAG_PROMISC)
+ rte_eth_promiscuous_enable (xd->device_index);
+ else
+ rte_eth_promiscuous_disable (xd->device_index);
+ }
+ }
+ else if (ETHERNET_INTERFACE_FLAG_CONFIG_MTU (flags))
+ {
+ int rv;
+
+ xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes;
+
+ if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+ dpdk_device_stop (xd);
+
+ rv = rte_eth_dev_configure
+ (xd->device_index, xd->rx_q_used, xd->tx_q_used, &xd->port_conf);
+
+ if (rv < 0)
+ vlib_cli_output (vlib_get_main (),
+ "rte_eth_dev_configure[%d]: err %d",
+ xd->device_index, rv);
+
+ rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes);
+
+ if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
+ dpdk_device_start (xd);
+
+ }
+ return old;
+}
+
+static void
+dpdk_device_lock_init (dpdk_device_t * xd)
+{
+ int q;
+ vec_validate (xd->lockp, xd->tx_q_used - 1);
+ for (q = 0; q < xd->tx_q_used; q++)
+ {
+ xd->lockp[q] = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
+ CLIB_CACHE_LINE_BYTES);
+ memset ((void *) xd->lockp[q], 0, CLIB_CACHE_LINE_BYTES);
+ }
+}
+
+static clib_error_t *
+dpdk_lib_init (dpdk_main_t * dm)
+{
+ u32 nports;
+ u32 nb_desc = 0;
+ int i;
+ clib_error_t *error;
+ vlib_main_t *vm = vlib_get_main ();
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ vnet_device_main_t *vdm = &vnet_device_main;
+ vnet_sw_interface_t *sw;
+ vnet_hw_interface_t *hi;
+ dpdk_device_t *xd;
+ vlib_pci_addr_t last_pci_addr;
+ u32 last_pci_addr_port = 0;
+ vlib_thread_registration_t *tr_hqos;
+ uword *p_hqos;
+
+ u32 next_hqos_cpu = 0;
+ u8 af_packet_port_id = 0;
+ u8 bond_ether_port_id = 0;
+ last_pci_addr.as_u32 = ~0;
+
+ dm->hqos_cpu_first_index = 0;
+ dm->hqos_cpu_count = 0;
+
+ /* find out which cpus will be used for I/O TX */
+ p_hqos = hash_get_mem (tm->thread_registrations_by_name, "hqos-threads");
+ tr_hqos = p_hqos ? (vlib_thread_registration_t *) p_hqos[0] : 0;
+
+ if (tr_hqos && tr_hqos->count > 0)
+ {
+ dm->hqos_cpu_first_index = tr_hqos->first_index;
+ dm->hqos_cpu_count = tr_hqos->count;
+ }
+
+ vec_validate_aligned (dm->devices_by_hqos_cpu, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ nports = rte_eth_dev_count ();
+ if (nports < 1)
+ {
+ clib_warning ("DPDK drivers found no ports...");
+ }
+
+ if (CLIB_DEBUG > 0)
+ clib_warning ("DPDK drivers found %d ports...", nports);
+
+ /*
+ * All buffers are all allocated from the same rte_mempool.
+ * Thus they all have the same number of data bytes.
+ */
+ dm->vlib_buffer_free_list_index =
+ vlib_buffer_get_or_create_free_list (vm,
+ VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES,
+ "dpdk rx");
+
+ if (dm->conf->enable_tcp_udp_checksum)
+ dm->buffer_flags_template &= ~(VNET_BUFFER_F_L4_CHECKSUM_CORRECT
+ | VNET_BUFFER_F_L4_CHECKSUM_COMPUTED);
+
+ /* vlib_buffer_t template */
+ vec_validate_aligned (dm->buffer_templates, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+ for (i = 0; i < tm->n_vlib_mains; i++)
+ {
+ vlib_buffer_free_list_t *fl;
+ vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, i);
+ fl = vlib_buffer_get_free_list (vm,
+ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+ vlib_buffer_init_for_free_list (bt, fl);
+ bt->flags = dm->buffer_flags_template;
+ bt->current_data = -RTE_PKTMBUF_HEADROOM;
+ vnet_buffer (bt)->sw_if_index[VLIB_TX] = (u32) ~ 0;
+ }
+
+ for (i = 0; i < nports; i++)
+ {
+ u8 addr[6];
+ u8 vlan_strip = 0;
+ int j;
+ struct rte_eth_dev_info dev_info;
+ struct rte_eth_link l;
+ dpdk_device_config_t *devconf = 0;
+ vlib_pci_addr_t pci_addr;
+ uword *p = 0;
+
+ rte_eth_dev_info_get (i, &dev_info);
+ if (dev_info.pci_dev) /* bonded interface has no pci info */
+ {
+ pci_addr.domain = dev_info.pci_dev->addr.domain;
+ pci_addr.bus = dev_info.pci_dev->addr.bus;
+ pci_addr.slot = dev_info.pci_dev->addr.devid;
+ pci_addr.function = dev_info.pci_dev->addr.function;
+ p =
+ hash_get (dm->conf->device_config_index_by_pci_addr,
+ pci_addr.as_u32);
+ }
+
+ if (p)
+ devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+ else
+ devconf = &dm->conf->default_devconf;
+
+ /* Create vnet interface */
+ vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES);
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT;
+ xd->cpu_socket = (i8) rte_eth_dev_socket_id (i);
+
+ /* Handle interface naming for devices with multiple ports sharing same PCI ID */
+ if (dev_info.pci_dev)
+ {
+ struct rte_eth_dev_info di = { 0 };
+ rte_eth_dev_info_get (i + 1, &di);
+ if (di.pci_dev && pci_addr.as_u32 != last_pci_addr.as_u32 &&
+ memcmp (&dev_info.pci_dev->addr, &di.pci_dev->addr,
+ sizeof (struct rte_pci_addr)) == 0)
+ {
+ xd->interface_name_suffix = format (0, "0");
+ last_pci_addr.as_u32 = pci_addr.as_u32;
+ last_pci_addr_port = i;
+ }
+ else if (pci_addr.as_u32 == last_pci_addr.as_u32)
+ {
+ xd->interface_name_suffix =
+ format (0, "%u", i - last_pci_addr_port);
+ }
+ else
+ {
+ last_pci_addr.as_u32 = ~0;
+ }
+ }
+ else
+ last_pci_addr.as_u32 = ~0;
+
+ clib_memcpy (&xd->tx_conf, &dev_info.default_txconf,
+ sizeof (struct rte_eth_txconf));
+ if (dm->conf->no_multi_seg)
+ {
+ xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+ port_conf_template.rxmode.jumbo_frame = 0;
+ port_conf_template.rxmode.enable_scatter = 0;
+ }
+ else
+ {
+ xd->tx_conf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS;
+ port_conf_template.rxmode.jumbo_frame = 1;
+ port_conf_template.rxmode.enable_scatter = 1;
+ xd->flags |= DPDK_DEVICE_FLAG_MAYBE_MULTISEG;
+ }
+
+ clib_memcpy (&xd->port_conf, &port_conf_template,
+ sizeof (struct rte_eth_conf));
+
+ xd->tx_q_used = clib_min (dev_info.max_tx_queues, tm->n_vlib_mains);
+
+ if (devconf->num_tx_queues > 0
+ && devconf->num_tx_queues < xd->tx_q_used)
+ xd->tx_q_used = clib_min (xd->tx_q_used, devconf->num_tx_queues);
+
+ if (devconf->num_rx_queues > 1 && dm->use_rss == 0)
+ {
+ dm->use_rss = 1;
+ }
+
+ if (devconf->num_rx_queues > 1
+ && dev_info.max_rx_queues >= devconf->num_rx_queues)
+ {
+ xd->rx_q_used = devconf->num_rx_queues;
+ xd->port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+ if (devconf->rss_fn == 0)
+ xd->port_conf.rx_adv_conf.rss_conf.rss_hf =
+ ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP;
+ else
+ xd->port_conf.rx_adv_conf.rss_conf.rss_hf = devconf->rss_fn;
+ }
+ else
+ xd->rx_q_used = 1;
+
+ xd->flags |= DPDK_DEVICE_FLAG_PMD;
+
+ /* workaround for drivers not setting driver_name */
+ if ((!dev_info.driver_name) && (dev_info.pci_dev))
+ dev_info.driver_name = dev_info.pci_dev->driver->driver.name;
+
+ ASSERT (dev_info.driver_name);
+
+ if (!xd->pmd)
+ {
+
+
+#define _(s,f) else if (dev_info.driver_name && \
+ !strcmp(dev_info.driver_name, s)) \
+ xd->pmd = VNET_DPDK_PMD_##f;
+ if (0)
+ ;
+ foreach_dpdk_pmd
+#undef _
+ else
+ xd->pmd = VNET_DPDK_PMD_UNKNOWN;
+
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT;
+
+ switch (xd->pmd)
+ {
+ /* Drivers with valid speed_capa set */
+ case VNET_DPDK_PMD_E1000EM:
+ case VNET_DPDK_PMD_IGB:
+ case VNET_DPDK_PMD_IXGBE:
+ case VNET_DPDK_PMD_I40E:
+ xd->port_type = port_type_from_speed_capa (&dev_info);
+ xd->flags |= DPDK_DEVICE_FLAG_TX_OFFLOAD |
+ DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM;
+
+ break;
+ case VNET_DPDK_PMD_CXGBE:
+ case VNET_DPDK_PMD_MLX4:
+ case VNET_DPDK_PMD_MLX5:
+ xd->port_type = port_type_from_speed_capa (&dev_info);
+ break;
+
+ /* SR-IOV VFs */
+ case VNET_DPDK_PMD_IGBVF:
+ case VNET_DPDK_PMD_IXGBEVF:
+ case VNET_DPDK_PMD_I40EVF:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
+ xd->port_conf.rxmode.hw_strip_crc = 1;
+ break;
+
+ case VNET_DPDK_PMD_THUNDERX:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
+ xd->port_conf.rxmode.hw_strip_crc = 1;
+ break;
+
+ case VNET_DPDK_PMD_DPAA2:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+
+ /* Cisco VIC */
+ case VNET_DPDK_PMD_ENIC:
+ rte_eth_link_get_nowait (i, &l);
+ if (l.link_speed == 40000)
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ else
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+
+ /* Intel Red Rock Canyon */
+ case VNET_DPDK_PMD_FM10K:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_SWITCH;
+ xd->port_conf.rxmode.hw_strip_crc = 1;
+ break;
+
+ /* virtio */
+ case VNET_DPDK_PMD_VIRTIO:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_VIRTIO;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_VIRTIO;
+ break;
+
+ /* vmxnet3 */
+ case VNET_DPDK_PMD_VMXNET3:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+ break;
+
+ case VNET_DPDK_PMD_AF_PACKET:
+ xd->port_type = VNET_DPDK_PORT_TYPE_AF_PACKET;
+ xd->port_id = af_packet_port_id++;
+ break;
+
+ case VNET_DPDK_PMD_BOND:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_BOND;
+ xd->port_id = bond_ether_port_id++;
+ break;
+
+ case VNET_DPDK_PMD_VIRTIO_USER:
+ xd->port_type = VNET_DPDK_PORT_TYPE_VIRTIO_USER;
+ break;
+
+ case VNET_DPDK_PMD_VHOST_ETHER:
+ xd->port_type = VNET_DPDK_PORT_TYPE_VHOST_ETHER;
+ break;
+
+ default:
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ }
+
+ if (devconf->num_rx_desc)
+ xd->nb_rx_desc = devconf->num_rx_desc;
+
+ if (devconf->num_tx_desc)
+ xd->nb_tx_desc = devconf->num_tx_desc;
+ }
+
+ /*
+ * Ensure default mtu is not > the mtu read from the hardware.
+ * Otherwise rte_eth_dev_configure() will fail and the port will
+ * not be available.
+ */
+ if (ETHERNET_MAX_PACKET_BYTES > dev_info.max_rx_pktlen)
+ {
+ /*
+ * This device does not support the platforms's max frame
+ * size. Use it's advertised mru instead.
+ */
+ xd->port_conf.rxmode.max_rx_pkt_len = dev_info.max_rx_pktlen;
+ }
+ else
+ {
+ xd->port_conf.rxmode.max_rx_pkt_len = ETHERNET_MAX_PACKET_BYTES;
+
+ /*
+ * Some platforms do not account for Ethernet FCS (4 bytes) in
+ * MTU calculations. To interop with them increase mru but only
+ * if the device's settings can support it.
+ */
+ if ((dev_info.max_rx_pktlen >= (ETHERNET_MAX_PACKET_BYTES + 4)) &&
+ xd->port_conf.rxmode.hw_strip_crc)
+ {
+ /*
+ * Allow additional 4 bytes (for Ethernet FCS). These bytes are
+ * stripped by h/w and so will not consume any buffer memory.
+ */
+ xd->port_conf.rxmode.max_rx_pkt_len += 4;
+ }
+ }
+
+ if (xd->pmd == VNET_DPDK_PMD_AF_PACKET)
+ {
+ f64 now = vlib_time_now (vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+ clib_memcpy (addr + 2, &rnd, sizeof (rnd));
+ addr[0] = 2;
+ addr[1] = 0xfe;
+ }
+ else
+ rte_eth_macaddr_get (i, (struct ether_addr *) addr);
+
+ if (xd->tx_q_used < tm->n_vlib_mains)
+ dpdk_device_lock_init (xd);
+
+ xd->device_index = xd - dm->devices;
+ ASSERT (i == xd->device_index);
+ xd->per_interface_next_index = ~0;
+
+ /* assign interface to input thread */
+ dpdk_device_and_queue_t *dq;
+ int q;
+
+ if (devconf->hqos_enabled)
+ {
+ xd->flags |= DPDK_DEVICE_FLAG_HQOS;
+
+ if (devconf->hqos.hqos_thread_valid)
+ {
+ int cpu = dm->hqos_cpu_first_index + devconf->hqos.hqos_thread;
+
+ if (devconf->hqos.hqos_thread >= dm->hqos_cpu_count)
+ return clib_error_return (0, "invalid HQoS thread index");
+
+ vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = 0;
+ }
+ else
+ {
+ int cpu = dm->hqos_cpu_first_index + next_hqos_cpu;
+
+ if (dm->hqos_cpu_count == 0)
+ return clib_error_return (0, "no HQoS threads available");
+
+ vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = 0;
+
+ next_hqos_cpu++;
+ if (next_hqos_cpu == dm->hqos_cpu_count)
+ next_hqos_cpu = 0;
+
+ devconf->hqos.hqos_thread_valid = 1;
+ devconf->hqos.hqos_thread = cpu;
+ }
+ }
+
+ vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j < tm->n_vlib_mains; j++)
+ {
+ vec_validate_ha (xd->tx_vectors[j], xd->nb_tx_desc,
+ sizeof (tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->tx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->rx_vectors, xd->rx_q_used,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j < xd->rx_q_used; j++)
+ {
+ vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE - 1,
+ CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->rx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->d_trace_buffers, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+
+
+ /* count the number of descriptors used for this device */
+ nb_desc += xd->nb_rx_desc + xd->nb_tx_desc * xd->tx_q_used;
+
+ error = ethernet_register_interface
+ (dm->vnet_main, dpdk_device_class.index, xd->device_index,
+ /* ethernet address */ addr,
+ &xd->hw_if_index, dpdk_flag_change);
+ if (error)
+ return error;
+
+ sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->hw_if_index);
+ xd->vlib_sw_if_index = sw->sw_if_index;
+ vnet_hw_interface_set_input_node (dm->vnet_main, xd->hw_if_index,
+ dpdk_input_node.index);
+
+ if (devconf->workers)
+ {
+ int i;
+ q = 0;
+ /* *INDENT-OFF* */
+ clib_bitmap_foreach (i, devconf->workers, ({
+ vnet_hw_interface_assign_rx_thread (dm->vnet_main, xd->hw_if_index, q++,
+ vdm->first_worker_thread_index + i);
+ }));
+ /* *INDENT-ON* */
+ }
+ else
+ for (q = 0; q < xd->rx_q_used; q++)
+ {
+ vnet_hw_interface_assign_rx_thread (dm->vnet_main, xd->hw_if_index, q, /* any */
+ ~1);
+ }
+
+ hi = vnet_get_hw_interface (dm->vnet_main, xd->hw_if_index);
+
+ if (xd->flags & DPDK_DEVICE_FLAG_TX_OFFLOAD)
+ hi->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD;
+
+ dpdk_device_setup (xd);
+
+ if (vec_len (xd->errors))
+ clib_warning ("setup failed for device %U. Errors:\n %U",
+ format_dpdk_device_name, i,
+ format_dpdk_device_errors, xd);
+
+ if (devconf->hqos_enabled)
+ {
+ clib_error_t *rv;
+ rv = dpdk_port_setup_hqos (xd, &devconf->hqos);
+ if (rv)
+ return rv;
+ }
+
+ /*
+ * For cisco VIC vNIC, set default to VLAN strip enabled, unless
+ * specified otherwise in the startup config.
+ * For other NICs default to VLAN strip disabled, unless specified
+ * otherwis in the startup config.
+ */
+ if (xd->pmd == VNET_DPDK_PMD_ENIC)
+ {
+ if (devconf->vlan_strip_offload != DPDK_DEVICE_VLAN_STRIP_OFF)
+ vlan_strip = 1; /* remove vlan tag from VIC port by default */
+ else
+ clib_warning ("VLAN strip disabled for interface\n");
+ }
+ else if (devconf->vlan_strip_offload == DPDK_DEVICE_VLAN_STRIP_ON)
+ vlan_strip = 1;
+
+ if (vlan_strip)
+ {
+ int vlan_off;
+ vlan_off = rte_eth_dev_get_vlan_offload (xd->device_index);
+ vlan_off |= ETH_VLAN_STRIP_OFFLOAD;
+ xd->port_conf.rxmode.hw_vlan_strip = vlan_off;
+ if (rte_eth_dev_set_vlan_offload (xd->device_index, vlan_off) == 0)
+ clib_warning ("VLAN strip enabled for interface\n");
+ else
+ clib_warning ("VLAN strip cannot be supported by interface\n");
+ }
+
+ hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] =
+ xd->port_conf.rxmode.max_rx_pkt_len - sizeof (ethernet_header_t);
+
+ rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes);
+ }
+
+ if (nb_desc > dm->conf->num_mbufs)
+ clib_warning ("%d mbufs allocated but total rx/tx ring size is %d\n",
+ dm->conf->num_mbufs, nb_desc);
+
+ return 0;
+}
+
+static void
+dpdk_bind_devices_to_uio (dpdk_config_main_t * conf)
+{
+ vlib_pci_main_t *pm = &pci_main;
+ clib_error_t *error;
+ vlib_pci_device_t *d;
+ u8 *pci_addr = 0;
+ int num_whitelisted = vec_len (conf->dev_confs);
+
+ /* *INDENT-OFF* */
+ pool_foreach (d, pm->pci_devs, ({
+ dpdk_device_config_t * devconf = 0;
+ vec_reset_length (pci_addr);
+ pci_addr = format (pci_addr, "%U%c", format_vlib_pci_addr, &d->bus_address, 0);
+
+ if (d->device_class != PCI_CLASS_NETWORK_ETHERNET && d->device_class != PCI_CLASS_PROCESSOR_CO)
+ continue;
+
+ if (num_whitelisted)
+ {
+ uword * p = hash_get (conf->device_config_index_by_pci_addr, d->bus_address.as_u32);
+
+ if (!p)
+ continue;
+
+ devconf = pool_elt_at_index (conf->dev_confs, p[0]);
+ }
+
+ /* virtio */
+ if (d->vendor_id == 0x1af4 && d->device_id == 0x1000)
+ ;
+ /* vmxnet3 */
+ else if (d->vendor_id == 0x15ad && d->device_id == 0x07b0)
+ ;
+ /* all Intel network devices */
+ else if (d->vendor_id == 0x8086 && d->device_class == PCI_CLASS_NETWORK_ETHERNET)
+ ;
+ /* all Intel QAT devices VFs */
+ else if (d->vendor_id == 0x8086 && d->device_class == PCI_CLASS_PROCESSOR_CO &&
+ (d->device_id == 0x0443 || d->device_id == 0x37c9 || d->device_id == 0x19e3))
+ ;
+ /* Cisco VIC */
+ else if (d->vendor_id == 0x1137 && d->device_id == 0x0043)
+ ;
+ /* Chelsio T4/T5 */
+ else if (d->vendor_id == 0x1425 && (d->device_id & 0xe000) == 0x4000)
+ ;
+ /* Mellanox */
+ else if (d->vendor_id == 0x15b3 && d->device_id >= 0x1013 && d->device_id <= 0x101a)
+ {
+ continue;
+ }
+ else
+ {
+ clib_warning ("Unsupported PCI device 0x%04x:0x%04x found "
+ "at PCI address %s\n", (u16) d->vendor_id, (u16) d->device_id,
+ pci_addr);
+ continue;
+ }
+
+ error = vlib_pci_bind_to_uio (d, (char *) conf->uio_driver_name);
+
+ if (error)
+ {
+ if (devconf == 0)
+ {
+ pool_get (conf->dev_confs, devconf);
+ hash_set (conf->device_config_index_by_pci_addr, d->bus_address.as_u32,
+ devconf - conf->dev_confs);
+ devconf->pci_addr.as_u32 = d->bus_address.as_u32;
+ }
+ devconf->is_blacklisted = 1;
+ clib_error_report (error);
+ }
+ }));
+ /* *INDENT-ON* */
+ vec_free (pci_addr);
+}
+
+static clib_error_t *
+dpdk_device_config (dpdk_config_main_t * conf, vlib_pci_addr_t pci_addr,
+ unformat_input_t * input, u8 is_default)
+{
+ clib_error_t *error = 0;
+ uword *p;
+ dpdk_device_config_t *devconf;
+ unformat_input_t sub_input;
+
+ if (is_default)
+ {
+ devconf = &conf->default_devconf;
+ }
+ else
+ {
+ p = hash_get (conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+
+ if (!p)
+ {
+ pool_get (conf->dev_confs, devconf);
+ hash_set (conf->device_config_index_by_pci_addr, pci_addr.as_u32,
+ devconf - conf->dev_confs);
+ }
+ else
+ return clib_error_return (0,
+ "duplicate configuration for PCI address %U",
+ format_vlib_pci_addr, &pci_addr);
+ }
+
+ devconf->pci_addr.as_u32 = pci_addr.as_u32;
+ devconf->hqos_enabled = 0;
+ dpdk_device_config_hqos_default (&devconf->hqos);
+
+ if (!input)
+ return 0;
+
+ unformat_skip_white_space (input);
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "num-rx-queues %u", &devconf->num_rx_queues))
+ ;
+ else if (unformat (input, "num-tx-queues %u", &devconf->num_tx_queues))
+ ;
+ else if (unformat (input, "num-rx-desc %u", &devconf->num_rx_desc))
+ ;
+ else if (unformat (input, "num-tx-desc %u", &devconf->num_tx_desc))
+ ;
+ else if (unformat (input, "workers %U", unformat_bitmap_list,
+ &devconf->workers))
+ ;
+ else
+ if (unformat
+ (input, "rss %U", unformat_vlib_cli_sub_input, &sub_input))
+ {
+ error = unformat_rss_fn (&sub_input, &devconf->rss_fn);
+ if (error)
+ break;
+ }
+ else if (unformat (input, "vlan-strip-offload off"))
+ devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_OFF;
+ else if (unformat (input, "vlan-strip-offload on"))
+ devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_ON;
+ else
+ if (unformat
+ (input, "hqos %U", unformat_vlib_cli_sub_input, &sub_input))
+ {
+ devconf->hqos_enabled = 1;
+ error = unformat_hqos (&sub_input, &devconf->hqos);
+ if (error)
+ break;
+ }
+ else if (unformat (input, "hqos"))
+ {
+ devconf->hqos_enabled = 1;
+ }
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ break;
+ }
+ }
+
+ if (error)
+ return error;
+
+ if (devconf->workers && devconf->num_rx_queues == 0)
+ devconf->num_rx_queues = clib_bitmap_count_set_bits (devconf->workers);
+ else if (devconf->workers &&
+ clib_bitmap_count_set_bits (devconf->workers) !=
+ devconf->num_rx_queues)
+ error =
+ clib_error_return (0,
+ "%U: number of worker threadds must be "
+ "equal to number of rx queues", format_vlib_pci_addr,
+ &pci_addr);
+
+ return error;
+}
+
+static clib_error_t *
+dpdk_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ clib_error_t *error = 0;
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_config_main_t *conf = &dpdk_config_main;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ dpdk_device_config_t *devconf;
+ vlib_pci_addr_t pci_addr;
+ unformat_input_t sub_input;
+ uword x;
+ u8 *s, *tmp = 0;
+ u8 *rte_cmd = 0, *ethname = 0;
+ u32 log_level;
+ int ret, i;
+ int num_whitelisted = 0;
+ u8 no_pci = 0;
+ u8 no_huge = 0;
+ u8 huge_dir = 0;
+ u8 file_prefix = 0;
+ u8 *socket_mem = 0;
+ u8 *huge_dir_path = 0;
+
+ huge_dir_path =
+ format (0, "%s/hugepages%c", vlib_unix_get_runtime_dir (), 0);
+
+ conf->device_config_index_by_pci_addr = hash_create (0, sizeof (uword));
+ log_level = RTE_LOG_NOTICE;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ /* Prime the pump */
+ if (unformat (input, "no-hugetlb"))
+ {
+ vec_add1 (conf->eal_init_args, (u8 *) "no-huge");
+ no_huge = 1;
+ }
+
+ else if (unformat (input, "enable-tcp-udp-checksum"))
+ conf->enable_tcp_udp_checksum = 1;
+
+ else if (unformat (input, "decimal-interface-names"))
+ conf->interface_name_format_decimal = 1;
+
+ else if (unformat (input, "log-level %U", unformat_dpdk_log_level, &x))
+ log_level = x;
+
+ else if (unformat (input, "no-multi-seg"))
+ conf->no_multi_seg = 1;
+
+ else if (unformat (input, "dev default %U", unformat_vlib_cli_sub_input,
+ &sub_input))
+ {
+ error =
+ dpdk_device_config (conf, (vlib_pci_addr_t) (u32) ~ 1, &sub_input,
+ 1);
+
+ if (error)
+ return error;
+ }
+ else
+ if (unformat
+ (input, "dev %U %U", unformat_vlib_pci_addr, &pci_addr,
+ unformat_vlib_cli_sub_input, &sub_input))
+ {
+ error = dpdk_device_config (conf, pci_addr, &sub_input, 0);
+
+ if (error)
+ return error;
+
+ num_whitelisted++;
+ }
+ else if (unformat (input, "dev %U", unformat_vlib_pci_addr, &pci_addr))
+ {
+ error = dpdk_device_config (conf, pci_addr, 0, 0);
+
+ if (error)
+ return error;
+
+ num_whitelisted++;
+ }
+ else if (unformat (input, "num-mbufs %d", &conf->num_mbufs))
+ ;
+ else if (unformat (input, "uio-driver %s", &conf->uio_driver_name))
+ ;
+ else if (unformat (input, "socket-mem %s", &socket_mem))
+ ;
+ else if (unformat (input, "no-pci"))
+ {
+ no_pci = 1;
+ tmp = format (0, "--no-pci%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ }
+ else if (unformat (input, "poll-sleep %d", &dm->poll_sleep_usec))
+ ;
+
+#define _(a) \
+ else if (unformat(input, #a)) \
+ { \
+ tmp = format (0, "--%s%c", #a, 0); \
+ vec_add1 (conf->eal_init_args, tmp); \
+ }
+ foreach_eal_double_hyphen_predicate_arg
+#undef _
+#define _(a) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ if (!strncmp(#a, "huge-dir", 8)) \
+ huge_dir = 1; \
+ else if (!strncmp(#a, "file-prefix", 11)) \
+ file_prefix = 1; \
+ tmp = format (0, "--%s%c", #a, 0); \
+ vec_add1 (conf->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ if (!strncmp(#a, "vdev", 4)) \
+ if (strstr((char*)s, "af_packet")) \
+ clib_warning ("af_packet obsoleted. Use CLI 'create host-interface'."); \
+ vec_add1 (conf->eal_init_args, s); \
+ }
+ foreach_eal_double_hyphen_arg
+#undef _
+#define _(a,b) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ tmp = format (0, "-%s%c", #b, 0); \
+ vec_add1 (conf->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (conf->eal_init_args, s); \
+ }
+ foreach_eal_single_hyphen_arg
+#undef _
+#define _(a,b) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ tmp = format (0, "-%s%c", #b, 0); \
+ vec_add1 (conf->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (conf->eal_init_args, s); \
+ conf->a##_set_manually = 1; \
+ }
+ foreach_eal_single_hyphen_mandatory_arg
+#undef _
+ else if (unformat (input, "default"))
+ ;
+
+ else if (unformat_skip_white_space (input))
+ ;
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ }
+
+ if (!conf->uio_driver_name)
+ conf->uio_driver_name = format (0, "uio_pci_generic%c", 0);
+
+ /*
+ * Use 1G huge pages if available.
+ */
+ if (!no_huge && !huge_dir)
+ {
+ u32 x, *mem_by_socket = 0;
+ uword c = 0;
+ u8 use_1g = 1;
+ u8 use_2m = 1;
+ u8 less_than_1g = 1;
+ int rv;
+
+ umount ((char *) huge_dir_path);
+
+ /* Process "socket-mem" parameter value */
+ if (vec_len (socket_mem))
+ {
+ unformat_input_t in;
+ unformat_init_vector (&in, socket_mem);
+ while (unformat_check_input (&in) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (&in, "%u,", &x))
+ ;
+ else if (unformat (&in, "%u", &x))
+ ;
+ else if (unformat (&in, ","))
+ x = 0;
+ else
+ break;
+
+ vec_add1 (mem_by_socket, x);
+
+ if (x > 1023)
+ less_than_1g = 0;
+ }
+ /* Note: unformat_free vec_frees(in.buffer), aka socket_mem... */
+ unformat_free (&in);
+ socket_mem = 0;
+ }
+ else
+ {
+ /* *INDENT-OFF* */
+ clib_bitmap_foreach (c, tm->cpu_socket_bitmap, (
+ {
+ vec_validate(mem_by_socket, c);
+ mem_by_socket[c] = 256; /* default per-socket mem */
+ }
+ ));
+ /* *INDENT-ON* */
+ }
+
+ /* check if available enough 1GB pages for each socket */
+ /* *INDENT-OFF* */
+ clib_bitmap_foreach (c, tm->cpu_socket_bitmap, (
+ {
+ int pages_avail, page_size, mem;
+ clib_error_t *e = 0;
+
+ vec_validate(mem_by_socket, c);
+ mem = mem_by_socket[c];
+
+ page_size = 1024;
+ e = clib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail);
+
+ if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem)
+ use_1g = 0;
+
+ if (e)
+ clib_error_free (e);
+
+ page_size = 2;
+ e = clib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail);
+
+ if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem)
+ use_2m = 0;
+
+ if (e)
+ clib_error_free (e);
+ }));
+ /* *INDENT-ON* */
+
+ if (mem_by_socket == 0)
+ {
+ error = clib_error_return (0, "mem_by_socket NULL");
+ goto done;
+ }
+ _vec_len (mem_by_socket) = c + 1;
+
+ /* regenerate socket_mem string */
+ vec_foreach_index (x, mem_by_socket)
+ socket_mem = format (socket_mem, "%s%u",
+ socket_mem ? "," : "", mem_by_socket[x]);
+ socket_mem = format (socket_mem, "%c", 0);
+
+ vec_free (mem_by_socket);
+
+ error = vlib_unix_recursive_mkdir ((char *) huge_dir_path);
+ if (error)
+ {
+ goto done;
+ }
+
+ if (use_1g && !(less_than_1g && use_2m))
+ {
+ rv = mount ("none", (char *) huge_dir_path, "hugetlbfs", 0,
+ "pagesize=1G");
+ }
+ else if (use_2m)
+ {
+ rv = mount ("none", (char *) huge_dir_path, "hugetlbfs", 0, NULL);
+ }
+ else
+ {
+ return clib_error_return (0, "not enough free huge pages");
+ }
+
+ if (rv)
+ {
+ error = clib_error_return (0, "mount failed %d", errno);
+ goto done;
+ }
+
+ tmp = format (0, "--huge-dir%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%s%c", huge_dir_path, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ if (!file_prefix)
+ {
+ tmp = format (0, "--file-prefix%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "vpp%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ }
+ }
+
+ vec_free (rte_cmd);
+ vec_free (ethname);
+
+ if (error)
+ return error;
+
+ /* I'll bet that -c and -n must be the first and second args... */
+ if (!conf->coremask_set_manually)
+ {
+ vlib_thread_registration_t *tr;
+ uword *coremask = 0;
+ int i;
+
+ /* main thread core */
+ coremask = clib_bitmap_set (coremask, tm->main_lcore, 1);
+
+ for (i = 0; i < vec_len (tm->registrations); i++)
+ {
+ tr = tm->registrations[i];
+ coremask = clib_bitmap_or (coremask, tr->coremask);
+ }
+
+ vec_insert (conf->eal_init_args, 2, 1);
+ conf->eal_init_args[1] = (u8 *) "-c";
+ tmp = format (0, "%U%c", format_bitmap_hex, coremask, 0);
+ conf->eal_init_args[2] = tmp;
+ clib_bitmap_free (coremask);
+ }
+
+ if (!conf->nchannels_set_manually)
+ {
+ vec_insert (conf->eal_init_args, 2, 3);
+ conf->eal_init_args[3] = (u8 *) "-n";
+ tmp = format (0, "%d", conf->nchannels);
+ conf->eal_init_args[4] = tmp;
+ }
+
+ if (no_pci == 0 && geteuid () == 0)
+ dpdk_bind_devices_to_uio (conf);
+
+#define _(x) \
+ if (devconf->x == 0 && conf->default_devconf.x > 0) \
+ devconf->x = conf->default_devconf.x ;
+
+ /* *INDENT-OFF* */
+ pool_foreach (devconf, conf->dev_confs, ({
+
+ /* default per-device config items */
+ foreach_dpdk_device_config_item
+
+ /* add DPDK EAL whitelist/blacklist entry */
+ if (num_whitelisted > 0 && devconf->is_blacklisted == 0)
+ {
+ tmp = format (0, "-w%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ }
+ else if (num_whitelisted == 0 && devconf->is_blacklisted != 0)
+ {
+ tmp = format (0, "-b%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ }
+ }));
+ /* *INDENT-ON* */
+
+#undef _
+
+ /* set master-lcore */
+ tmp = format (0, "--master-lcore%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%u%c", tm->main_lcore, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+
+ /* set socket-mem */
+ tmp = format (0, "--socket-mem%c", 0);
+ vec_add1 (conf->eal_init_args, tmp);
+ tmp = format (0, "%s%c", socket_mem, 0);
+ vec_add1 (conf->eal_init_args, tmp);
+
+ /* NULL terminate the "argv" vector, in case of stupidity */
+ vec_add1 (conf->eal_init_args, 0);
+ _vec_len (conf->eal_init_args) -= 1;
+
+ /* Set up DPDK eal and packet mbuf pool early. */
+
+ rte_log_set_global_level (log_level);
+
+ vm = vlib_get_main ();
+
+ /* make copy of args as rte_eal_init tends to mess up with arg array */
+ for (i = 1; i < vec_len (conf->eal_init_args); i++)
+ conf->eal_init_args_str = format (conf->eal_init_args_str, "%s ",
+ conf->eal_init_args[i]);
+
+ clib_warning ("EAL init args: %s", conf->eal_init_args_str);
+ ret =
+ rte_eal_init (vec_len (conf->eal_init_args),
+ (char **) conf->eal_init_args);
+
+ /* lazy umount hugepages */
+ umount2 ((char *) huge_dir_path, MNT_DETACH);
+ rmdir ((char *) huge_dir_path);
+ vec_free (huge_dir_path);
+
+ if (ret < 0)
+ return clib_error_return (0, "rte_eal_init returned %d", ret);
+
+ /* Dump the physical memory layout prior to creating the mbuf_pool */
+ fprintf (stdout, "DPDK physical memory layout:\n");
+ rte_dump_physmem_layout (stdout);
+
+ /* main thread 1st */
+ error = dpdk_buffer_pool_create (vm, conf->num_mbufs, rte_socket_id ());
+ if (error)
+ return error;
+
+ for (i = 0; i < RTE_MAX_LCORE; i++)
+ {
+ error = dpdk_buffer_pool_create (vm, conf->num_mbufs,
+ rte_lcore_to_socket_id (i));
+ if (error)
+ return error;
+ }
+
+done:
+ return error;
+}
+
+VLIB_CONFIG_FUNCTION (dpdk_config, "dpdk");
+
+void
+dpdk_update_link_state (dpdk_device_t * xd, f64 now)
+{
+ vnet_main_t *vnm = vnet_get_main ();
+ struct rte_eth_link prev_link = xd->link;
+ u32 hw_flags = 0;
+ u8 hw_flags_chg = 0;
+
+ /* only update link state for PMD interfaces */
+ if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0)
+ return;
+
+ xd->time_last_link_update = now ? now : xd->time_last_link_update;
+ memset (&xd->link, 0, sizeof (xd->link));
+ rte_eth_link_get_nowait (xd->device_index, &xd->link);
+
+ if (LINK_STATE_ELOGS)
+ {
+ vlib_main_t *vm = vlib_get_main ();
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format =
+ "update-link-state: sw_if_index %d, admin_up %d,"
+ "old link_state %d new link_state %d",.format_args = "i4i1i1i1",};
+
+ struct
+ {
+ u32 sw_if_index;
+ u8 admin_up;
+ u8 old_link_state;
+ u8 new_link_state;
+ } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->sw_if_index = xd->vlib_sw_if_index;
+ ed->admin_up = (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0;
+ ed->old_link_state = (u8)
+ vnet_hw_interface_is_link_up (vnm, xd->hw_if_index);
+ ed->new_link_state = (u8) xd->link.link_status;
+ }
+
+ if ((xd->flags & (DPDK_DEVICE_FLAG_ADMIN_UP | DPDK_DEVICE_FLAG_BOND_SLAVE))
+ && ((xd->link.link_status != 0) ^
+ vnet_hw_interface_is_link_up (vnm, xd->hw_if_index)))
+ {
+ hw_flags_chg = 1;
+ hw_flags |= (xd->link.link_status ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0);
+ }
+
+ if (hw_flags_chg || (xd->link.link_duplex != prev_link.link_duplex))
+ {
+ hw_flags_chg = 1;
+ switch (xd->link.link_duplex)
+ {
+ case ETH_LINK_HALF_DUPLEX:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_HALF_DUPLEX;
+ break;
+ case ETH_LINK_FULL_DUPLEX:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_FULL_DUPLEX;
+ break;
+ default:
+ break;
+ }
+ }
+ if (hw_flags_chg || (xd->link.link_speed != prev_link.link_speed))
+ {
+ hw_flags_chg = 1;
+ switch (xd->link.link_speed)
+ {
+ case ETH_SPEED_NUM_10M:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10M;
+ break;
+ case ETH_SPEED_NUM_100M:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_100M;
+ break;
+ case ETH_SPEED_NUM_1G:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_1G;
+ break;
+ case ETH_SPEED_NUM_10G:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10G;
+ break;
+ case ETH_SPEED_NUM_40G:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_40G;
+ break;
+ case 0:
+ break;
+ default:
+ clib_warning ("unknown link speed %d", xd->link.link_speed);
+ break;
+ }
+ }
+ if (hw_flags_chg)
+ {
+ if (LINK_STATE_ELOGS)
+ {
+ vlib_main_t *vm = vlib_get_main ();
+
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format =
+ "update-link-state: sw_if_index %d, new flags %d",.format_args
+ = "i4i4",};
+
+ struct
+ {
+ u32 sw_if_index;
+ u32 flags;
+ } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->sw_if_index = xd->vlib_sw_if_index;
+ ed->flags = hw_flags;
+ }
+ vnet_hw_interface_set_flags (vnm, xd->hw_if_index, hw_flags);
+ }
+}
+
+static uword
+dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+ clib_error_t *error;
+ vnet_main_t *vnm = vnet_get_main ();
+ dpdk_main_t *dm = &dpdk_main;
+ ethernet_main_t *em = &ethernet_main;
+ dpdk_device_t *xd;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+ int i;
+
+ error = dpdk_lib_init (dm);
+
+ if (error)
+ clib_error_report (error);
+
+ tm->worker_thread_release = 1;
+
+ f64 now = vlib_time_now (vm);
+ vec_foreach (xd, dm->devices)
+ {
+ dpdk_update_link_state (xd, now);
+ }
+
+ {
+ /*
+ * Extra set up for bond interfaces:
+ * 1. Setup MACs for bond interfaces and their slave links which was set
+ * in dpdk_device_setup() but needs to be done again here to take
+ * effect.
+ * 2. Set up info and register slave link state change callback handling.
+ * 3. Set up info for bond interface related CLI support.
+ */
+ int nports = rte_eth_dev_count ();
+ if (nports > 0)
+ {
+ for (i = 0; i < nports; i++)
+ {
+ xd = &dm->devices[i];
+ ASSERT (i == xd->device_index);
+ if (xd->pmd == VNET_DPDK_PMD_BOND)
+ {
+ u8 addr[6];
+ u8 slink[16];
+ int nlink = rte_eth_bond_slaves_get (i, slink, 16);
+ if (nlink > 0)
+ {
+ vnet_hw_interface_t *bhi;
+ ethernet_interface_t *bei;
+ int rv;
+
+ /* Get MAC of 1st slave link */
+ rte_eth_macaddr_get
+ (slink[0], (struct ether_addr *) addr);
+
+ /* Set MAC of bounded interface to that of 1st slave link */
+ clib_warning ("Set MAC for bond port %d BondEthernet%d",
+ i, xd->port_id);
+ rv = rte_eth_bond_mac_address_set
+ (i, (struct ether_addr *) addr);
+ if (rv)
+ clib_warning ("Set MAC addr failure rv=%d", rv);
+
+ /* Populate MAC of bonded interface in VPP hw tables */
+ bhi = vnet_get_hw_interface
+ (vnm, dm->devices[i].hw_if_index);
+ bei = pool_elt_at_index
+ (em->interfaces, bhi->hw_instance);
+ clib_memcpy (bhi->hw_address, addr, 6);
+ clib_memcpy (bei->address, addr, 6);
+
+ /* Init l3 packet size allowed on bonded interface */
+ bhi->max_packet_bytes = ETHERNET_MAX_PACKET_BYTES;
+ bhi->max_l3_packet_bytes[VLIB_RX] =
+ bhi->max_l3_packet_bytes[VLIB_TX] =
+ ETHERNET_MAX_PACKET_BYTES - sizeof (ethernet_header_t);
+ while (nlink >= 1)
+ { /* for all slave links */
+ int slave = slink[--nlink];
+ dpdk_device_t *sdev = &dm->devices[slave];
+ vnet_hw_interface_t *shi;
+ vnet_sw_interface_t *ssi;
+ ethernet_interface_t *sei;
+ /* Add MAC to all slave links except the first one */
+ if (nlink)
+ {
+ clib_warning ("Add MAC for slave port %d", slave);
+ rv = rte_eth_dev_mac_addr_add
+ (slave, (struct ether_addr *) addr, 0);
+ if (rv)
+ clib_warning ("Add MAC addr failure rv=%d", rv);
+ }
+ /* Setup slave link state change callback handling */
+ rte_eth_dev_callback_register
+ (slave, RTE_ETH_EVENT_INTR_LSC,
+ dpdk_port_state_callback, NULL);
+ dpdk_device_t *sxd = &dm->devices[slave];
+ sxd->flags |= DPDK_DEVICE_FLAG_BOND_SLAVE;
+ sxd->bond_port = i;
+ /* Set slaves bitmap for bonded interface */
+ bhi->bond_info = clib_bitmap_set
+ (bhi->bond_info, sdev->hw_if_index, 1);
+ /* Set MACs and slave link flags on slave interface */
+ shi = vnet_get_hw_interface (vnm, sdev->hw_if_index);
+ ssi = vnet_get_sw_interface
+ (vnm, sdev->vlib_sw_if_index);
+ sei = pool_elt_at_index
+ (em->interfaces, shi->hw_instance);
+ shi->bond_info = VNET_HW_INTERFACE_BOND_INFO_SLAVE;
+ ssi->flags |= VNET_SW_INTERFACE_FLAG_BOND_SLAVE;
+ clib_memcpy (shi->hw_address, addr, 6);
+ clib_memcpy (sei->address, addr, 6);
+ /* Set l3 packet size allowed as the lowest of slave */
+ if (bhi->max_l3_packet_bytes[VLIB_RX] >
+ shi->max_l3_packet_bytes[VLIB_RX])
+ bhi->max_l3_packet_bytes[VLIB_RX] =
+ bhi->max_l3_packet_bytes[VLIB_TX] =
+ shi->max_l3_packet_bytes[VLIB_RX];
+ /* Set max packet size allowed as the lowest of slave */
+ if (bhi->max_packet_bytes > shi->max_packet_bytes)
+ bhi->max_packet_bytes = shi->max_packet_bytes;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ while (1)
+ {
+ /*
+ * check each time through the loop in case intervals are changed
+ */
+ f64 min_wait = dm->link_state_poll_interval < dm->stat_poll_interval ?
+ dm->link_state_poll_interval : dm->stat_poll_interval;
+
+ vlib_process_wait_for_event_or_clock (vm, min_wait);
+
+ if (dm->admin_up_down_in_progress)
+ /* skip the poll if an admin up down is in progress (on any interface) */
+ continue;
+
+ vec_foreach (xd, dm->devices)
+ {
+ f64 now = vlib_time_now (vm);
+ if ((now - xd->time_last_stats_update) >= dm->stat_poll_interval)
+ dpdk_update_counters (xd, now);
+ if ((now - xd->time_last_link_update) >= dm->link_state_poll_interval)
+ dpdk_update_link_state (xd, now);
+
+ }
+ }
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (dpdk_process_node,static) = {
+ .function = dpdk_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "dpdk-process",
+ .process_log2_n_stack_bytes = 17,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+dpdk_init (vlib_main_t * vm)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ vlib_node_t *ei;
+ clib_error_t *error = 0;
+ vlib_thread_main_t *tm = vlib_get_thread_main ();
+
+ /* verify that structs are cacheline aligned */
+ STATIC_ASSERT (offsetof (dpdk_device_t, cacheline0) == 0,
+ "Cache line marker must be 1st element in dpdk_device_t");
+ STATIC_ASSERT (offsetof (dpdk_device_t, cacheline1) ==
+ CLIB_CACHE_LINE_BYTES,
+ "Data in cache line 0 is bigger than cache line size");
+ STATIC_ASSERT (offsetof (frame_queue_trace_t, cacheline0) == 0,
+ "Cache line marker must be 1st element in frame_queue_trace_t");
+
+ dm->vlib_main = vm;
+ dm->vnet_main = vnet_get_main ();
+ dm->conf = &dpdk_config_main;
+
+ ei = vlib_get_node_by_name (vm, (u8 *) "ethernet-input");
+ if (ei == 0)
+ return clib_error_return (0, "ethernet-input node AWOL");
+
+ dm->ethernet_input_node_index = ei->index;
+
+ dm->conf->nchannels = 4;
+ dm->conf->num_mbufs = dm->conf->num_mbufs ? dm->conf->num_mbufs : NB_MBUF;
+ vec_add1 (dm->conf->eal_init_args, (u8 *) "vnet");
+
+ vec_validate (dm->recycle, tm->n_thread_stacks - 1);
+
+ /* Default vlib_buffer_t flags, DISABLES tcp/udp checksumming... */
+ dm->buffer_flags_template =
+ (VLIB_BUFFER_TOTAL_LENGTH_VALID | VLIB_BUFFER_EXT_HDR_VALID
+ | VNET_BUFFER_F_L4_CHECKSUM_COMPUTED |
+ VNET_BUFFER_F_L4_CHECKSUM_CORRECT);
+
+ dm->stat_poll_interval = DPDK_STATS_POLL_INTERVAL;
+ dm->link_state_poll_interval = DPDK_LINK_POLL_INTERVAL;
+
+ /* init CLI */
+ if ((error = vlib_call_init_function (vm, dpdk_cli_init)))
+ return error;
+
+ return error;
+}
+
+VLIB_INIT_FUNCTION (dpdk_init);
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c
new file mode 100644
index 00000000..cf8b9699
--- /dev/null
+++ b/src/plugins/dpdk/device/node.c
@@ -0,0 +1,704 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/xxhash.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <dpdk/device/dpdk.h>
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/mpls/packet.h>
+#include <vnet/handoff.h>
+#include <vnet/devices/devices.h>
+#include <vnet/feature/feature.h>
+
+#include <dpdk/device/dpdk_priv.h>
+
+static char *dpdk_error_strings[] = {
+#define _(n,s) s,
+ foreach_dpdk_error
+#undef _
+};
+
+always_inline int
+vlib_buffer_is_ip4 (vlib_buffer_t * b)
+{
+ ethernet_header_t *h = (ethernet_header_t *) vlib_buffer_get_current (b);
+ return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP4));
+}
+
+always_inline int
+vlib_buffer_is_ip6 (vlib_buffer_t * b)
+{
+ ethernet_header_t *h = (ethernet_header_t *) vlib_buffer_get_current (b);
+ return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP6));
+}
+
+always_inline int
+vlib_buffer_is_mpls (vlib_buffer_t * b)
+{
+ ethernet_header_t *h = (ethernet_header_t *) vlib_buffer_get_current (b);
+ return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS));
+}
+
+always_inline u32
+dpdk_rx_next_from_etype (struct rte_mbuf * mb, vlib_buffer_t * b0)
+{
+ if (PREDICT_TRUE (vlib_buffer_is_ip4 (b0)))
+ {
+ if (PREDICT_TRUE ((mb->ol_flags & PKT_RX_IP_CKSUM_GOOD) != 0))
+ return VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT;
+ else
+ return VNET_DEVICE_INPUT_NEXT_IP4_INPUT;
+ }
+ else if (PREDICT_TRUE (vlib_buffer_is_ip6 (b0)))
+ return VNET_DEVICE_INPUT_NEXT_IP6_INPUT;
+ else if (PREDICT_TRUE (vlib_buffer_is_mpls (b0)))
+ return VNET_DEVICE_INPUT_NEXT_MPLS_INPUT;
+ else
+ return VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+}
+
+always_inline u32
+dpdk_rx_next_from_packet_start (struct rte_mbuf * mb, vlib_buffer_t * b0)
+{
+ word start_delta;
+ int rv;
+
+ start_delta = b0->current_data -
+ ((mb->buf_addr + mb->data_off) - (void *) b0->data);
+
+ vlib_buffer_advance (b0, -start_delta);
+
+ if (PREDICT_TRUE (vlib_buffer_is_ip4 (b0)))
+ {
+ if (PREDICT_TRUE ((mb->ol_flags & PKT_RX_IP_CKSUM_GOOD) != 0))
+ rv = VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT;
+ else
+ rv = VNET_DEVICE_INPUT_NEXT_IP4_INPUT;
+ }
+ else if (PREDICT_TRUE (vlib_buffer_is_ip6 (b0)))
+ rv = VNET_DEVICE_INPUT_NEXT_IP6_INPUT;
+ else if (PREDICT_TRUE (vlib_buffer_is_mpls (b0)))
+ rv = VNET_DEVICE_INPUT_NEXT_MPLS_INPUT;
+ else
+ rv = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+
+ vlib_buffer_advance (b0, start_delta);
+ return rv;
+}
+
+always_inline void
+dpdk_rx_error_from_mb (struct rte_mbuf *mb, u32 * next, u8 * error)
+{
+ if (mb->ol_flags & PKT_RX_IP_CKSUM_BAD)
+ {
+ *error = DPDK_ERROR_IP_CHECKSUM_ERROR;
+ *next = VNET_DEVICE_INPUT_NEXT_DROP;
+ }
+ else
+ *error = DPDK_ERROR_NONE;
+}
+
+static void
+dpdk_rx_trace (dpdk_main_t * dm,
+ vlib_node_runtime_t * node,
+ dpdk_device_t * xd,
+ u16 queue_id, u32 * buffers, uword n_buffers)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ u32 *b, n_left;
+ u32 next0;
+
+ n_left = n_buffers;
+ b = buffers;
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t *b0;
+ dpdk_rx_dma_trace_t *t0;
+ struct rte_mbuf *mb;
+ u8 error0;
+
+ bi0 = b[0];
+ n_left -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ mb = rte_mbuf_from_vlib_buffer (b0);
+
+ if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
+ next0 = xd->per_interface_next_index;
+ else
+ next0 = dpdk_rx_next_from_packet_start (mb, b0);
+
+ dpdk_rx_error_from_mb (mb, &next0, &error0);
+
+ vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->queue_index = queue_id;
+ t0->device_index = xd->device_index;
+ t0->buffer_index = bi0;
+
+ clib_memcpy (&t0->mb, mb, sizeof (t0->mb));
+ clib_memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
+ clib_memcpy (t0->buffer.pre_data, b0->data,
+ sizeof (t0->buffer.pre_data));
+ clib_memcpy (&t0->data, mb->buf_addr + mb->data_off, sizeof (t0->data));
+
+ b += 1;
+ }
+}
+
+static inline u32
+dpdk_rx_burst (dpdk_main_t * dm, dpdk_device_t * xd, u16 queue_id)
+{
+ u32 n_buffers;
+ u32 n_left;
+ u32 n_this_chunk;
+
+ n_left = VLIB_FRAME_SIZE;
+ n_buffers = 0;
+
+ if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_PMD))
+ {
+ while (n_left)
+ {
+ n_this_chunk = rte_eth_rx_burst (xd->device_index, queue_id,
+ xd->rx_vectors[queue_id] +
+ n_buffers, n_left);
+ n_buffers += n_this_chunk;
+ n_left -= n_this_chunk;
+
+ /* Empirically, DPDK r1.8 produces vectors w/ 32 or fewer elts */
+ if (n_this_chunk < 32)
+ break;
+ }
+ }
+ else
+ {
+ ASSERT (0);
+ }
+
+ return n_buffers;
+}
+
+
+static_always_inline void
+dpdk_process_subseq_segs (vlib_main_t * vm, vlib_buffer_t * b,
+ struct rte_mbuf *mb, vlib_buffer_free_list_t * fl)
+{
+ u8 nb_seg = 1;
+ struct rte_mbuf *mb_seg = 0;
+ vlib_buffer_t *b_seg, *b_chain = 0;
+ mb_seg = mb->next;
+ b_chain = b;
+
+ if (mb->nb_segs < 2)
+ return;
+
+ b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ b->total_length_not_including_first_buffer = 0;
+
+ while (nb_seg < mb->nb_segs)
+ {
+ ASSERT (mb_seg != 0);
+
+ b_seg = vlib_buffer_from_rte_mbuf (mb_seg);
+ vlib_buffer_init_for_free_list (b_seg, fl);
+
+ ASSERT ((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
+ ASSERT (b_seg->current_data == 0);
+
+ /*
+ * The driver (e.g. virtio) may not put the packet data at the start
+ * of the segment, so don't assume b_seg->current_data == 0 is correct.
+ */
+ b_seg->current_data =
+ (mb_seg->buf_addr + mb_seg->data_off) - (void *) b_seg->data;
+
+ b_seg->current_length = mb_seg->data_len;
+ b->total_length_not_including_first_buffer += mb_seg->data_len;
+
+ b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
+
+ b_chain = b_seg;
+ mb_seg = mb_seg->next;
+ nb_seg++;
+ }
+}
+
+static_always_inline void
+dpdk_prefetch_buffer (struct rte_mbuf *mb)
+{
+ vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb);
+ CLIB_PREFETCH (mb, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (b, CLIB_CACHE_LINE_BYTES, STORE);
+}
+
+static_always_inline void
+dpdk_prefetch_ethertype (struct rte_mbuf *mb)
+{
+ CLIB_PREFETCH (mb->buf_addr + mb->data_off +
+ STRUCT_OFFSET_OF (ethernet_header_t, type),
+ CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+
+/*
+ This function should fill 1st cacheline of vlib_buffer_t metadata with data
+ from buffer template. Instead of filling field by field, we construct
+ template and then use 128/256 bit vector instruction to copy data.
+ This code first loads whole cacheline into 4 128-bit registers (xmm)
+ or two 256 bit registers (ymm) and then stores data into all 4 buffers
+ efectively saving on register load operations.
+*/
+
+static_always_inline void
+dpdk_buffer_init_from_template (void *d0, void *d1, void *d2, void *d3,
+ void *s)
+{
+#if defined(CLIB_HAVE_VEC128)
+ int i;
+ for (i = 0; i < 2; i++)
+ {
+ *(u8x32 *) (((u8 *) d0) + i * 32) =
+ *(u8x32 *) (((u8 *) d1) + i * 32) =
+ *(u8x32 *) (((u8 *) d2) + i * 32) =
+ *(u8x32 *) (((u8 *) d3) + i * 32) = *(u8x32 *) (((u8 *) s) + i * 32);
+ }
+#elif defined(CLIB_HAVE_VEC64)
+ int i;
+ for (i = 0; i < 4; i++)
+ {
+ *(u8x16 *) (((u8 *) d0) + i * 16) =
+ *(u8x16 *) (((u8 *) d1) + i * 16) =
+ *(u8x16 *) (((u8 *) d2) + i * 16) =
+ *(u8x16 *) (((u8 *) d3) + i * 16) = *(u8x16 *) (((u8 *) s) + i * 16);
+ }
+#else
+#error "Either CLIB_HAVE_VEC128 or CLIB_HAVE_VEC64 has to be defined"
+#endif
+}
+
+/*
+ * This function is used when there are no worker threads.
+ * The main thread performs IO and forwards the packets.
+ */
+static_always_inline u32
+dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd,
+ vlib_node_runtime_t * node, u32 thread_index, u16 queue_id,
+ int maybe_multiseg)
+{
+ u32 n_buffers;
+ u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+ u32 n_left_to_next, *to_next;
+ u32 mb_index;
+ vlib_main_t *vm = vlib_get_main ();
+ uword n_rx_bytes = 0;
+ u32 n_trace, trace_cnt __attribute__ ((unused));
+ vlib_buffer_free_list_t *fl;
+ vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, thread_index);
+
+ if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0)
+ return 0;
+
+ n_buffers = dpdk_rx_burst (dm, xd, queue_id);
+
+ if (n_buffers == 0)
+ {
+ return 0;
+ }
+
+ vec_reset_length (xd->d_trace_buffers[thread_index]);
+ trace_cnt = n_trace = vlib_get_trace_count (vm, node);
+
+ if (n_trace > 0)
+ {
+ u32 n = clib_min (n_trace, n_buffers);
+ mb_index = 0;
+
+ while (n--)
+ {
+ struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index++];
+ vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb);
+ vec_add1 (xd->d_trace_buffers[thread_index],
+ vlib_get_buffer_index (vm, b));
+ }
+ }
+
+ fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ /* Update buffer template */
+ vnet_buffer (bt)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ bt->error = node->errors[DPDK_ERROR_NONE];
+
+ mb_index = 0;
+
+ while (n_buffers > 0)
+ {
+ vlib_buffer_t *b0, *b1, *b2, *b3;
+ u32 bi0, next0;
+ u32 bi1, next1;
+ u32 bi2, next2;
+ u32 bi3, next3;
+ u8 error0, error1, error2, error3;
+ u64 or_ol_flags;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_buffers >= 12 && n_left_to_next >= 4)
+ {
+ struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+
+ /* prefetches are interleaved with the rest of the code to reduce
+ pressure on L1 cache */
+ dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 8]);
+ dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 4]);
+
+ mb0 = xd->rx_vectors[queue_id][mb_index];
+ mb1 = xd->rx_vectors[queue_id][mb_index + 1];
+ mb2 = xd->rx_vectors[queue_id][mb_index + 2];
+ mb3 = xd->rx_vectors[queue_id][mb_index + 3];
+
+ ASSERT (mb0);
+ ASSERT (mb1);
+ ASSERT (mb2);
+ ASSERT (mb3);
+
+ if (maybe_multiseg)
+ {
+ if (PREDICT_FALSE (mb0->nb_segs > 1))
+ dpdk_prefetch_buffer (mb0->next);
+ if (PREDICT_FALSE (mb1->nb_segs > 1))
+ dpdk_prefetch_buffer (mb1->next);
+ if (PREDICT_FALSE (mb2->nb_segs > 1))
+ dpdk_prefetch_buffer (mb2->next);
+ if (PREDICT_FALSE (mb3->nb_segs > 1))
+ dpdk_prefetch_buffer (mb3->next);
+ }
+
+ b0 = vlib_buffer_from_rte_mbuf (mb0);
+ b1 = vlib_buffer_from_rte_mbuf (mb1);
+ b2 = vlib_buffer_from_rte_mbuf (mb2);
+ b3 = vlib_buffer_from_rte_mbuf (mb3);
+
+ dpdk_buffer_init_from_template (b0, b1, b2, b3, bt);
+
+ dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 9]);
+ dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 5]);
+
+ /* current_data must be set to -RTE_PKTMBUF_HEADROOM in template */
+ b0->current_data += mb0->data_off;
+ b1->current_data += mb1->data_off;
+ b2->current_data += mb2->data_off;
+ b3->current_data += mb3->data_off;
+
+ b0->current_length = mb0->data_len;
+ b1->current_length = mb1->data_len;
+ b2->current_length = mb2->data_len;
+ b3->current_length = mb3->data_len;
+
+ dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 10]);
+ dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 7]);
+
+ bi0 = vlib_get_buffer_index (vm, b0);
+ bi1 = vlib_get_buffer_index (vm, b1);
+ bi2 = vlib_get_buffer_index (vm, b2);
+ bi3 = vlib_get_buffer_index (vm, b3);
+
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ to_next[2] = bi2;
+ to_next[3] = bi3;
+ to_next += 4;
+ n_left_to_next -= 4;
+
+ if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
+ {
+ next0 = next1 = next2 = next3 = xd->per_interface_next_index;
+ }
+ else
+ {
+ next0 = dpdk_rx_next_from_etype (mb0, b0);
+ next1 = dpdk_rx_next_from_etype (mb1, b1);
+ next2 = dpdk_rx_next_from_etype (mb2, b2);
+ next3 = dpdk_rx_next_from_etype (mb3, b3);
+ }
+
+ dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 11]);
+ dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 6]);
+
+ or_ol_flags = (mb0->ol_flags | mb1->ol_flags |
+ mb2->ol_flags | mb3->ol_flags);
+ if (PREDICT_FALSE (or_ol_flags & PKT_RX_IP_CKSUM_BAD))
+ {
+ dpdk_rx_error_from_mb (mb0, &next0, &error0);
+ dpdk_rx_error_from_mb (mb1, &next1, &error1);
+ dpdk_rx_error_from_mb (mb2, &next2, &error2);
+ dpdk_rx_error_from_mb (mb3, &next3, &error3);
+ b0->error = node->errors[error0];
+ b1->error = node->errors[error1];
+ b2->error = node->errors[error2];
+ b3->error = node->errors[error3];
+ }
+
+ vlib_buffer_advance (b0, device_input_next_node_advance[next0]);
+ vlib_buffer_advance (b1, device_input_next_node_advance[next1]);
+ vlib_buffer_advance (b2, device_input_next_node_advance[next2]);
+ vlib_buffer_advance (b3, device_input_next_node_advance[next3]);
+
+ n_rx_bytes += mb0->pkt_len;
+ n_rx_bytes += mb1->pkt_len;
+ n_rx_bytes += mb2->pkt_len;
+ n_rx_bytes += mb3->pkt_len;
+
+ /* Process subsequent segments of multi-segment packets */
+ if (maybe_multiseg)
+ {
+ dpdk_process_subseq_segs (vm, b0, mb0, fl);
+ dpdk_process_subseq_segs (vm, b1, mb1, fl);
+ dpdk_process_subseq_segs (vm, b2, mb2, fl);
+ dpdk_process_subseq_segs (vm, b3, mb3, fl);
+ }
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b2);
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b3);
+
+ /* Do we have any driver RX features configured on the interface? */
+ vnet_feature_start_device_input_x4 (xd->vlib_sw_if_index,
+ &next0, &next1, &next2, &next3,
+ b0, b1, b2, b3);
+
+ vlib_validate_buffer_enqueue_x4 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, bi2, bi3,
+ next0, next1, next2, next3);
+ n_buffers -= 4;
+ mb_index += 4;
+ }
+ while (n_buffers > 0 && n_left_to_next > 0)
+ {
+ struct rte_mbuf *mb0 = xd->rx_vectors[queue_id][mb_index];
+
+ if (PREDICT_TRUE (n_buffers > 3))
+ {
+ dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 2]);
+ dpdk_prefetch_ethertype (xd->rx_vectors[queue_id]
+ [mb_index + 1]);
+ }
+
+ ASSERT (mb0);
+
+ b0 = vlib_buffer_from_rte_mbuf (mb0);
+
+ /* Prefetch one next segment if it exists. */
+ if (PREDICT_FALSE (mb0->nb_segs > 1))
+ dpdk_prefetch_buffer (mb0->next);
+
+ clib_memcpy (b0, bt, CLIB_CACHE_LINE_BYTES);
+
+ ASSERT (b0->current_data == -RTE_PKTMBUF_HEADROOM);
+ b0->current_data += mb0->data_off;
+ b0->current_length = mb0->data_len;
+
+ bi0 = vlib_get_buffer_index (vm, b0);
+
+ to_next[0] = bi0;
+ to_next++;
+ n_left_to_next--;
+
+ if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
+ next0 = xd->per_interface_next_index;
+ else
+ next0 = dpdk_rx_next_from_etype (mb0, b0);
+
+ dpdk_rx_error_from_mb (mb0, &next0, &error0);
+ b0->error = node->errors[error0];
+
+ vlib_buffer_advance (b0, device_input_next_node_advance[next0]);
+
+ n_rx_bytes += mb0->pkt_len;
+
+ /* Process subsequent segments of multi-segment packets */
+ dpdk_process_subseq_segs (vm, b0, mb0, fl);
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
+
+ /* Do we have any driver RX features configured on the interface? */
+ vnet_feature_start_device_input_x1 (xd->vlib_sw_if_index, &next0,
+ b0);
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ n_buffers--;
+ mb_index++;
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[thread_index]) > 0))
+ {
+ dpdk_rx_trace (dm, node, xd, queue_id,
+ xd->d_trace_buffers[thread_index],
+ vec_len (xd->d_trace_buffers[thread_index]));
+ vlib_set_trace_count (vm, node,
+ n_trace -
+ vec_len (xd->d_trace_buffers[thread_index]));
+ }
+
+ vlib_increment_combined_counter
+ (vnet_get_main ()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ thread_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes);
+
+ vnet_device_increment_rx_packets (thread_index, mb_index);
+
+ return mb_index;
+}
+
+static inline void
+poll_rate_limit (dpdk_main_t * dm)
+{
+ /* Limit the poll rate by sleeping for N msec between polls */
+ if (PREDICT_FALSE (dm->poll_sleep_usec != 0))
+ {
+ struct timespec ts, tsrem;
+
+ ts.tv_sec = 0;
+ ts.tv_nsec = 1000 * dm->poll_sleep_usec;
+
+ while (nanosleep (&ts, &tsrem) < 0)
+ {
+ ts = tsrem;
+ }
+ }
+}
+
+/** \brief Main DPDK input node
+ @node dpdk-input
+
+ This is the main DPDK input node: across each assigned interface,
+ call rte_eth_rx_burst(...) or similar to obtain a vector of
+ packets to process. Handle early packet discard. Derive @c
+ vlib_buffer_t metadata from <code>struct rte_mbuf</code> metadata,
+ Depending on the resulting metadata: adjust <code>b->current_data,
+ b->current_length </code> and dispatch directly to
+ ip4-input-no-checksum, or ip6-input. Trace the packet if required.
+
+ @param vm vlib_main_t corresponding to the current thread
+ @param node vlib_node_runtime_t
+ @param f vlib_frame_t input-node, not used.
+
+ @par Graph mechanics: buffer metadata, next index usage
+
+ @em Uses:
+ - <code>struct rte_mbuf mb->ol_flags</code>
+ - PKT_RX_IP_CKSUM_BAD
+ - <code> RTE_ETH_IS_xxx_HDR(mb->packet_type) </code>
+ - packet classification result
+
+ @em Sets:
+ - <code>b->error</code> if the packet is to be dropped immediately
+ - <code>b->current_data, b->current_length</code>
+ - adjusted as needed to skip the L2 header in direct-dispatch cases
+ - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code>
+ - rx interface sw_if_index
+ - <code>vnet_buffer(b)->sw_if_index[VLIB_TX] = ~0</code>
+ - required by ipX-lookup
+ - <code>b->flags</code>
+ - to indicate multi-segment pkts (VLIB_BUFFER_NEXT_PRESENT), etc.
+
+ <em>Next Nodes:</em>
+ - Static arcs to: error-drop, ethernet-input,
+ ip4-input-no-checksum, ip6-input, mpls-input
+ - per-interface redirection, controlled by
+ <code>xd->per_interface_next_index</code>
+*/
+
+static uword
+dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ dpdk_device_t *xd;
+ uword n_rx_packets = 0;
+ vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
+ vnet_device_and_queue_t *dq;
+ u32 thread_index = node->thread_index;
+
+ /*
+ * Poll all devices on this cpu for input/interrupts.
+ */
+ /* *INDENT-OFF* */
+ foreach_device_and_queue (dq, rt->devices_and_queues)
+ {
+ xd = vec_elt_at_index(dm->devices, dq->dev_instance);
+ if (PREDICT_FALSE (xd->flags & DPDK_DEVICE_FLAG_BOND_SLAVE))
+ continue; /* Do not poll slave to a bonded interface */
+ if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG)
+ n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 1);
+ else
+ n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 0);
+ }
+ /* *INDENT-ON* */
+
+ poll_rate_limit (dm);
+
+ return n_rx_packets;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (dpdk_input_node) = {
+ .function = dpdk_input,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .name = "dpdk-input",
+ .sibling_of = "device-input",
+
+ /* Will be enabled if/when hardware is detected. */
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ .format_buffer = format_ethernet_header_with_length,
+ .format_trace = format_dpdk_rx_dma_trace,
+
+ .n_errors = DPDK_N_ERROR,
+ .error_strings = dpdk_error_strings,
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (dpdk_input_node, dpdk_input);
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */