summaryrefslogtreecommitdiffstats
path: root/vnet
diff options
context:
space:
mode:
authorJasvinder Singh <jasvinder.singh@intel.com>2016-07-21 17:02:19 +0100
committerDamjan Marion <dmarion.lists@gmail.com>2016-09-28 16:37:28 +0000
commit85ecc810ca98550a250c74f32244760e459e3f87 (patch)
tree5c26a0c4f7f382a89e747696c3bd0130f2ebb81f /vnet
parentac8146caf1f474f2c440f2316bbcc2d41245ff35 (diff)
DPDK HQoS: Enable Hierarchical Scheduler in VPP
This commit extends the vpp framework with a new thread type "hqos-threads" that runs the Hierarchical Quality of Service (HQoS) scheduler associated with an output interface. The HQoS scheduler prioritizes the packets from different users and ensures sufficient bandwidth to pass the more important traffic. At a high level, the HQoS scheduler is a buffer that can temporarily store a large number of packets. In other words, it is a collection of a large number of queues organized into a hierarchy of 5 levels; the port (i.e. the physical interface) is at the root of the hierarchy, followed by the subport (a set of users), the pipes (individual users), the traffic classes (each with a strict priority) and, at the leaves, the queues. In each HQoS scheduler, three operations are performed: classification (setting HQoS port, subport, pipe, traffic class and queue within traffic class from packet fields), enqueue (selecting the HQoS queue for the packet, and dropping the packet if the queue is full) and dequeue (scheduling the packet based on its length and available credits, and handing over the scheduled packet to the output interface). In vpp, the number of hqos threads will be equal to the number of cpu cores specified in the corelist-hqos-threads parameter in the cpu section of the vpp configuration file. One hqos thread can run HQoS for multiple output interfaces. A particular HQoS instance is initialised with the default parameters required to configure the hqos port, subport, pipe and queues. Some of them can be re-configured at run-time through CLI commands as well as binary APIs. The following illustrates a sample startup configuration file with 4x worker threads feeding 2x hqos threads that each handle HQoS for 1x output interface. For more details on HQoS configuration please refer to the DPDK Programmer's Guide. 
dpdk { socket-mem 16384,16384 dev 0000:02:00.0 { num-rx-queues 2 hqos } dev 0000:06:00.0 { num-rx-queues 2 hqos } num-mbufs 1000000 } cpu { main-core 0 corelist-workers 1, 2, 3, 4 corelist-hqos-threads 5, 6 } Change-Id: I635c3395a7c4ddf0a239ef77b0b0a31a6dfc4767 Signed-off-by: Cristian Dumitrescu <cristian.dumitrescu@intel.com> Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
Diffstat (limited to 'vnet')
-rw-r--r--vnet/Makefile.am1
-rw-r--r--vnet/vnet/devices/dpdk/cli.c669
-rw-r--r--vnet/vnet/devices/dpdk/device.c43
-rw-r--r--vnet/vnet/devices/dpdk/dpdk.h89
-rw-r--r--vnet/vnet/devices/dpdk/format.c20
-rw-r--r--vnet/vnet/devices/dpdk/hqos.c742
-rw-r--r--vnet/vnet/devices/dpdk/init.c88
-rw-r--r--vnet/vnet/devices/dpdk/qos_doc.md287
8 files changed, 1935 insertions, 4 deletions
diff --git a/vnet/Makefile.am b/vnet/Makefile.am
index 41568e06045..bcc7faf028f 100644
--- a/vnet/Makefile.am
+++ b/vnet/Makefile.am
@@ -650,6 +650,7 @@ libvnet_la_SOURCES += \
vnet/devices/dpdk/format.c \
vnet/devices/dpdk/init.c \
vnet/devices/dpdk/node.c \
+ vnet/devices/dpdk/hqos.c \
vnet/devices/dpdk/vhost_user.c \
vnet/devices/dpdk/cli.c
diff --git a/vnet/vnet/devices/dpdk/cli.c b/vnet/vnet/devices/dpdk/cli.c
index 9e8fed44efb..7941f9e0e16 100644
--- a/vnet/vnet/devices/dpdk/cli.c
+++ b/vnet/vnet/devices/dpdk/cli.c
@@ -890,6 +890,675 @@ VLIB_CLI_COMMAND (cmd_set_dpdk_if_placement,static) = {
};
/* *INDENT-ON* */
+/*
+ * CLI handler for "show dpdk interface hqos placement".
+ *
+ * Walks dm->devices_by_hqos_cpu and, for every thread index that has at
+ * least one device assigned, prints the thread name and lcore followed by
+ * the name and queue id of each assigned interface. Always returns 0.
+ */
+static clib_error_t *
+show_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input,
+                             vlib_cli_command_t * cmd)
+{
+  dpdk_main_t *dm = &dpdk_main;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  int thread_index = 0;
+
+  if (tm->n_vlib_mains == 1)
+    vlib_cli_output (vm, "All interfaces are handled by main thread");
+
+  while (thread_index < vec_len (dm->devices_by_hqos_cpu))
+    {
+      dpdk_device_and_queue_t *placed = dm->devices_by_hqos_cpu[thread_index];
+      dpdk_device_and_queue_t *entry;
+
+      /* Only threads that own at least one device get a header line. */
+      if (vec_len (placed))
+        vlib_cli_output (vm, "Thread %u (%s at lcore %u):", thread_index,
+                         vlib_worker_threads[thread_index].name,
+                         vlib_worker_threads[thread_index].dpdk_lcore_id);
+
+      vec_foreach (entry, placed)
+      {
+        vnet_hw_interface_t *hi;
+        u32 hw_if_index;
+
+        hw_if_index = dm->devices[entry->device].vlib_hw_if_index;
+        hi = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+        vlib_cli_output (vm, " %v queue %u", hi->name, entry->queue_id);
+      }
+
+      thread_index++;
+    }
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+/* Registers the CLI path below with show_dpdk_if_hqos_placement as handler. */
+VLIB_CLI_COMMAND (cmd_show_dpdk_if_hqos_placement, static) = {
+ .path = "show dpdk interface hqos placement",
+ .short_help = "show dpdk interface hqos placement",
+ .function = show_dpdk_if_hqos_placement,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for "set dpdk interface hqos placement <if-name> thread <n>".
+ *
+ * Looks the interface up in dm->devices_by_hqos_cpu, removes its entry from
+ * the thread that currently holds it and appends a fresh entry (queue 0) to
+ * thread <n>; both per-thread vectors are then re-sorted.
+ *
+ * Returns 0 on success or when the interface already sits on thread <n>,
+ * and a clib error when the arguments do not parse, the interface name is
+ * missing, the thread id is outside the HQoS thread range, or the interface
+ * is not placed on any HQoS thread.
+ */
+static clib_error_t *
+set_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input,
+                            vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  dpdk_main_t *dm = &dpdk_main;
+  dpdk_device_and_queue_t *dq;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  clib_error_t *error = 0;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 cpu = (u32) ~ 0;
+  int i;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "thread %d", &cpu))
+        ;
+      else
+        {
+          /* Build the error first: it formats the unparsed input. */
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          goto done;
+        }
+    }
+
+  if (hw_if_index == (u32) ~ 0)
+    {
+      error = clib_error_return (0, "please specify valid interface name");
+      goto done;
+    }
+
+  if (cpu < dm->hqos_cpu_first_index ||
+      cpu >= (dm->hqos_cpu_first_index + dm->hqos_cpu_count))
+    {
+      error = clib_error_return (0, "please specify valid thread id");
+      goto done;
+    }
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  for (i = 0; i < vec_len (dm->devices_by_hqos_cpu); i++)
+    {
+      vec_foreach (dq, dm->devices_by_hqos_cpu[i])
+      {
+        if (hw_if_index != dm->devices[dq->device].vlib_hw_if_index)
+          continue;
+
+        if (cpu == (u32) i)     /* nothing to do */
+          goto done;
+
+        /* Move the entry: drop it from thread i, re-create it on cpu. */
+        vec_del1 (dm->devices_by_hqos_cpu[i],
+                  dq - dm->devices_by_hqos_cpu[i]);
+        vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+        dq->queue_id = 0;
+        dq->device = xd->device_index;
+
+        vec_sort_with_function (dm->devices_by_hqos_cpu[i],
+                                dpdk_device_queue_sort);
+
+        vec_sort_with_function (dm->devices_by_hqos_cpu[cpu],
+                                dpdk_device_queue_sort);
+
+        goto done;
+      }
+    }
+
+  error = clib_error_return (0, "not found");
+
+done:
+  /* Single exit so the line input is released on every path. */
+  unformat_free (line_input);
+  return error;
+}
+
+/* *INDENT-OFF* */
+/* Registers the CLI path below with set_dpdk_if_hqos_placement as handler. */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_placement, static) = {
+ .path = "set dpdk interface hqos placement",
+ .short_help = "set dpdk interface hqos placement <if-name> thread <n>",
+ .function = set_dpdk_if_hqos_placement,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for
+ * "set dpdk interface hqos pipe <if-name> subport <n> pipe <n> profile <n>".
+ *
+ * Parses the interface plus the subport, pipe and profile ids and hands
+ * them to rte_sched_pipe_config() on the interface's HQoS scheduler. Ids
+ * that are not given on the command line are passed as (u32) ~0.
+ *
+ * Returns 0 on success, or when HQoS is not active on the interface (a
+ * message is printed instead); returns a clib error on a parse failure, a
+ * missing interface name, or when the scheduler rejects the configuration.
+ */
+static clib_error_t *
+set_dpdk_if_hqos_pipe (vlib_main_t * vm, unformat_input_t * input,
+                       vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  dpdk_main_t *dm = &dpdk_main;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  clib_error_t *error = 0;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 subport_id = (u32) ~ 0;
+  u32 pipe_id = (u32) ~ 0;
+  u32 profile_id = (u32) ~ 0;
+  int rv;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "subport %d", &subport_id))
+        ;
+      else if (unformat (line_input, "pipe %d", &pipe_id))
+        ;
+      else if (unformat (line_input, "profile %d", &profile_id))
+        ;
+      else
+        {
+          /* Build the error first: it formats the unparsed input. */
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          goto done;
+        }
+    }
+
+  if (hw_if_index == (u32) ~ 0)
+    {
+      error = clib_error_return (0, "please specify valid interface name");
+      goto done;
+    }
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  /* Without HQoS there is no scheduler to configure; do not touch hqos_ht. */
+  if ((xd->flags & DPDK_DEVICE_FLAG_HQOS) == 0 || xd->hqos_ht == 0)
+    {
+      vlib_cli_output (vm, "HQoS disabled for this interface");
+      goto done;
+    }
+
+  rv =
+    rte_sched_pipe_config (xd->hqos_ht->hqos, subport_id, pipe_id,
+                           profile_id);
+  if (rv)
+    error = clib_error_return (0, "pipe configuration failed");
+
+done:
+  /* Single exit so the line input is released on every path. */
+  unformat_free (line_input);
+  return error;
+}
+
+/* *INDENT-OFF* */
+/* Registers the CLI path below with set_dpdk_if_hqos_pipe as handler. */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_pipe, static) =
+{
+ .path = "set dpdk interface hqos pipe",
+ .short_help = "set dpdk interface hqos pipe <if-name> subport <n> pipe <n> "
+ "profile <n>",
+ .function = set_dpdk_if_hqos_pipe,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for "set dpdk interface hqos subport <if-name> subport <n>
+ * [rate <n>] [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>]
+ * [period <n>]".
+ *
+ * Starts from the built-in subport parameters below, overrides whatever the
+ * command line supplies and applies the result with
+ * rte_sched_subport_config(). "rate" also resets all four traffic class
+ * rates to the same value, so per-class overrides must follow it.
+ *
+ * Returns 0 on success, or when HQoS is not active on the interface (a
+ * message is printed instead); returns a clib error on a parse failure, a
+ * missing interface name, or when the scheduler rejects the configuration.
+ */
+static clib_error_t *
+set_dpdk_if_hqos_subport (vlib_main_t * vm, unformat_input_t * input,
+                          vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  dpdk_main_t *dm = &dpdk_main;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  clib_error_t *error = 0;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 subport_id = (u32) ~ 0;
+  struct rte_sched_subport_params p = {
+    .tb_rate = 1250000000,      /* 10GbE */
+    .tb_size = 1000000,
+    .tc_rate = {1250000000, 1250000000, 1250000000, 1250000000},
+    .tc_period = 10,
+  };
+  int rv;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "subport %d", &subport_id))
+        ;
+      else if (unformat (line_input, "rate %d", &p.tb_rate))
+        {
+          p.tc_rate[0] = p.tb_rate;
+          p.tc_rate[1] = p.tb_rate;
+          p.tc_rate[2] = p.tb_rate;
+          p.tc_rate[3] = p.tb_rate;
+        }
+      else if (unformat (line_input, "bktsize %d", &p.tb_size))
+        ;
+      else if (unformat (line_input, "tc0 %d", &p.tc_rate[0]))
+        ;
+      else if (unformat (line_input, "tc1 %d", &p.tc_rate[1]))
+        ;
+      else if (unformat (line_input, "tc2 %d", &p.tc_rate[2]))
+        ;
+      else if (unformat (line_input, "tc3 %d", &p.tc_rate[3]))
+        ;
+      else if (unformat (line_input, "period %d", &p.tc_period))
+        ;
+      else
+        {
+          /* Build the error first: it formats the unparsed input. */
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          goto done;
+        }
+    }
+
+  if (hw_if_index == (u32) ~ 0)
+    {
+      error = clib_error_return (0, "please specify valid interface name");
+      goto done;
+    }
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  /* Without HQoS there is no scheduler to configure; do not touch hqos_ht. */
+  if ((xd->flags & DPDK_DEVICE_FLAG_HQOS) == 0 || xd->hqos_ht == 0)
+    {
+      vlib_cli_output (vm, "HQoS disabled for this interface");
+      goto done;
+    }
+
+  rv = rte_sched_subport_config (xd->hqos_ht->hqos, subport_id, &p);
+  if (rv)
+    error = clib_error_return (0, "subport configuration failed");
+
+done:
+  /* Single exit so the line input is released on every path. */
+  unformat_free (line_input);
+  return error;
+}
+
+/* *INDENT-OFF* */
+/* Registers the CLI path below with set_dpdk_if_hqos_subport as handler. */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_subport, static) = {
+ .path = "set dpdk interface hqos subport",
+ .short_help = "set dpdk interface hqos subport <if-name> subport <n> "
+ "[rate <n>] [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] "
+ "[period <n>]",
+ .function = set_dpdk_if_hqos_subport,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for
+ * "set dpdk interface hqos tctbl <if-name> entry <n> tc <n> queue <n>".
+ *
+ * Writes tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue into slot <entry>
+ * of the interface's hqos_tc_table, for every thread in the "workers"
+ * registration.
+ *
+ * Returns 0 on success, or when HQoS is not active on the interface (a
+ * message is printed instead); returns a clib error on a parse failure, a
+ * missing interface name, or an entry/tc/queue that is missing or out of
+ * range.
+ */
+static clib_error_t *
+set_dpdk_if_hqos_tctbl (vlib_main_t * vm, unformat_input_t * input,
+                        vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  dpdk_main_t *dm = &dpdk_main;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  vlib_thread_registration_t *tr;
+  clib_error_t *error = 0;
+  uword *p;
+  u32 hw_if_index = (u32) ~ 0;
+  /* Start out of range so that an omitted argument is rejected below. */
+  u32 entry = (u32) ~ 0;
+  u32 tc = (u32) ~ 0;
+  u32 queue = (u32) ~ 0;
+  u32 val, i;
+  int worker_thread_first, worker_thread_count;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "entry %d", &entry))
+        ;
+      else if (unformat (line_input, "tc %d", &tc))
+        ;
+      else if (unformat (line_input, "queue %d", &queue))
+        ;
+      else
+        {
+          /* Build the error first: it formats the unparsed input. */
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          goto done;
+        }
+    }
+
+  if (hw_if_index == (u32) ~ 0)
+    {
+      error = clib_error_return (0, "please specify valid interface name");
+      goto done;
+    }
+  if (entry >= 64)
+    {
+      error = clib_error_return (0, "invalid entry");
+      goto done;
+    }
+  if (tc >= RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE)
+    {
+      error = clib_error_return (0, "invalid traffic class");
+      goto done;
+    }
+  if (queue >= RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS)
+    {
+      error = clib_error_return (0, "invalid queue");
+      goto done;
+    }
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  /* Without HQoS there is no per-worker state to update. */
+  if ((xd->flags & DPDK_DEVICE_FLAG_HQOS) == 0 || xd->hqos_wt == 0)
+    {
+      vlib_cli_output (vm, "HQoS disabled for this interface");
+      goto done;
+    }
+
+  /* Detect the set of worker threads */
+  p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+  if (p == 0)
+    {
+      error = clib_error_return (0, "no worker registrations?");
+      goto done;
+    }
+  tr = (vlib_thread_registration_t *) p[0];
+  worker_thread_first = tr->first_index;
+  worker_thread_count = tr->count;
+
+  val = tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue;
+  for (i = 0; i < worker_thread_count; i++)
+    xd->hqos_wt[worker_thread_first + i].hqos_tc_table[entry] = val;
+
+done:
+  /* Single exit so the line input is released on every path. */
+  unformat_free (line_input);
+  return error;
+}
+
+/* *INDENT-OFF* */
+/* Registers the CLI path below with set_dpdk_if_hqos_tctbl as handler. */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_tctbl, static) = {
+ .path = "set dpdk interface hqos tctbl",
+ .short_help = "set dpdk interface hqos tctbl <if-name> entry <n> tc <n> queue <n>",
+ .function = set_dpdk_if_hqos_tctbl,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for
+ * "set dpdk interface hqos pktfield <if-name> id <n> offset <n> mask <n>".
+ *
+ * Stores <offset> and <mask> (mask is parsed as hex) as the slab position
+ * and slab mask of packet field <id> for every thread in the "workers"
+ * registration. Field 0 is validated as the subport id mask, field 1 as
+ * the pipe id mask and field 2 as the TC table index mask.
+ *
+ * Returns 0 on success, or when the device configuration has HQoS disabled
+ * (a message is printed instead); returns a clib error on a parse failure,
+ * a missing interface name, a missing or invalid id, a missing offset or
+ * mask, or a mask rejected by dpdk_hqos_validate_mask().
+ */
+static clib_error_t *
+set_dpdk_if_hqos_pktfield (vlib_main_t * vm, unformat_input_t * input,
+                           vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  dpdk_main_t *dm = &dpdk_main;
+  clib_error_t *error = 0;
+
+  /* Device specific data */
+  struct rte_eth_dev_info dev_info;
+  dpdk_device_config_t *devconf = 0;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  u32 hw_if_index = (u32) ~ 0;
+  /* Kept separate from the workers lookup so the two cannot be mixed up. */
+  uword *devconf_index = 0;
+
+  /* Set of worker threads */
+  uword *workers_reg;
+  vlib_thread_registration_t *tr;
+  int worker_thread_first, worker_thread_count;
+
+  /* Packet field configuration; sentinels mark "not given". */
+  u64 mask = 0;
+  int mask_set = 0;
+  u32 id = (u32) ~ 0;
+  u32 offset = (u32) ~ 0;
+
+  /* HQoS params */
+  u32 n_subports_per_port, n_pipes_per_subport, tctbl_size;
+
+  u32 i;
+
+  /* Parse input arguments */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "id %d", &id))
+        ;
+      else if (unformat (line_input, "offset %d", &offset))
+        ;
+      else if (unformat (line_input, "mask %llx", &mask))
+        mask_set = 1;
+      else
+        {
+          /* Build the error first: it formats the unparsed input. */
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          goto done;
+        }
+    }
+
+  /* Get interface */
+  if (hw_if_index == (u32) ~ 0)
+    {
+      error = clib_error_return (0, "please specify valid interface name");
+      goto done;
+    }
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  rte_eth_dev_info_get (xd->device_index, &dev_info);
+  if (dev_info.pci_dev)
+    {                           /* bonded interface has no pci info */
+      vlib_pci_addr_t pci_addr;
+
+      pci_addr.domain = dev_info.pci_dev->addr.domain;
+      pci_addr.bus = dev_info.pci_dev->addr.bus;
+      pci_addr.slot = dev_info.pci_dev->addr.devid;
+      pci_addr.function = dev_info.pci_dev->addr.function;
+
+      devconf_index =
+        hash_get (dm->conf->device_config_index_by_pci_addr,
+                  pci_addr.as_u32);
+    }
+
+  /* No per-device entry (or no PCI info): fall back to the default. */
+  if (devconf_index)
+    devconf = pool_elt_at_index (dm->conf->dev_confs, devconf_index[0]);
+  else
+    devconf = &dm->conf->default_devconf;
+
+  if (devconf->hqos_enabled == 0)
+    {
+      vlib_cli_output (vm, "HQoS disabled for this interface");
+      goto done;
+    }
+
+  n_subports_per_port = devconf->hqos.port.n_subports_per_port;
+  n_pipes_per_subport = devconf->hqos.port.n_pipes_per_subport;
+  tctbl_size = RTE_DIM (devconf->hqos.tc_table);
+
+  /* Validate packet field configuration: id, offset and mask */
+  if (id >= 3)
+    {
+      error = clib_error_return (0, "invalid packet field id");
+      goto done;
+    }
+  if (offset == (u32) ~ 0)
+    {
+      error = clib_error_return (0, "please specify packet field offset");
+      goto done;
+    }
+  if (!mask_set)
+    {
+      error = clib_error_return (0, "please specify packet field mask");
+      goto done;
+    }
+
+  switch (id)
+    {
+    case 0:
+      if (dpdk_hqos_validate_mask (mask, n_subports_per_port) != 0)
+        {
+          error = clib_error_return (0, "invalid subport ID mask "
+                                     "(n_subports_per_port = %u)",
+                                     n_subports_per_port);
+          goto done;
+        }
+      break;
+    case 1:
+      if (dpdk_hqos_validate_mask (mask, n_pipes_per_subport) != 0)
+        {
+          error = clib_error_return (0, "invalid pipe ID mask "
+                                     "(n_pipes_per_subport = %u)",
+                                     n_pipes_per_subport);
+          goto done;
+        }
+      break;
+    case 2:
+    default:
+      if (dpdk_hqos_validate_mask (mask, tctbl_size) != 0)
+        {
+          error = clib_error_return (0, "invalid TC table index mask "
+                                     "(TC table size = %u)", tctbl_size);
+          goto done;
+        }
+    }
+
+  /* Detect the set of worker threads */
+  workers_reg = hash_get_mem (tm->thread_registrations_by_name, "workers");
+  if (workers_reg == 0)
+    {
+      error = clib_error_return (0, "no worker registrations?");
+      goto done;
+    }
+  tr = (vlib_thread_registration_t *) workers_reg[0];
+  worker_thread_first = tr->first_index;
+  worker_thread_count = tr->count;
+
+  /*
+   * Propagate packet field configuration to all workers.
+   * NOTE(review): hqos_fieldN_slabshr is left untouched here; if it is
+   * derived from the mask it may need refreshing too - confirm against
+   * dpdk_hqos_metadata_set().
+   */
+  for (i = 0; i < worker_thread_count; i++)
+    switch (id)
+      {
+      case 0:
+        xd->hqos_wt[worker_thread_first + i].hqos_field0_slabpos = offset;
+        xd->hqos_wt[worker_thread_first + i].hqos_field0_slabmask = mask;
+        break;
+      case 1:
+        xd->hqos_wt[worker_thread_first + i].hqos_field1_slabpos = offset;
+        xd->hqos_wt[worker_thread_first + i].hqos_field1_slabmask = mask;
+        break;
+      case 2:
+      default:
+        xd->hqos_wt[worker_thread_first + i].hqos_field2_slabpos = offset;
+        xd->hqos_wt[worker_thread_first + i].hqos_field2_slabmask = mask;
+      }
+
+done:
+  /* Single exit so the line input is released on every path. */
+  unformat_free (line_input);
+  return error;
+}
+
+/* *INDENT-OFF* */
+/* Registers the CLI path below with set_dpdk_if_hqos_pktfield as handler. */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_pktfield, static) = {
+ .path = "set dpdk interface hqos pktfield",
+ .short_help = "set dpdk interface hqos pktfield <if-name> id <n> offset <n> "
+ "mask <n>",
+ .function = set_dpdk_if_hqos_pktfield,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler for "show dpdk interface hqos <if-name>".
+ *
+ * Prints the HQoS settings of one interface: the thread section (software
+ * queue size, burst sizes, packet field slabs and the field 2 translation
+ * table, read from the first worker thread's state), the port section, each
+ * pipe profile and, when RTE_SCHED_RED is defined, the WRED parameters.
+ *
+ * Returns 0 on success, or when the device configuration has HQoS disabled
+ * (a message is printed instead); returns a clib error on a parse failure
+ * or a missing interface name.
+ */
+static clib_error_t *
+show_dpdk_if_hqos (vlib_main_t * vm, unformat_input_t * input,
+                   vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  dpdk_main_t *dm = &dpdk_main;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  dpdk_device_config_hqos_t *cfg;
+  dpdk_device_hqos_per_hqos_thread_t *ht;
+  dpdk_device_hqos_per_worker_thread_t *wk;
+  u32 *tctbl;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 profile_id, i;
+  struct rte_eth_dev_info dev_info;
+  dpdk_device_config_t *devconf = 0;
+  vlib_thread_registration_t *tr;
+  clib_error_t *error = 0;
+  uword *p = 0;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else
+        {
+          /* Build the error first: it formats the unparsed input. */
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          goto done;
+        }
+    }
+
+  if (hw_if_index == (u32) ~ 0)
+    {
+      error = clib_error_return (0, "please specify interface name!!");
+      goto done;
+    }
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  rte_eth_dev_info_get (xd->device_index, &dev_info);
+  if (dev_info.pci_dev)
+    {                           /* bonded interface has no pci info */
+      vlib_pci_addr_t pci_addr;
+
+      pci_addr.domain = dev_info.pci_dev->addr.domain;
+      pci_addr.bus = dev_info.pci_dev->addr.bus;
+      pci_addr.slot = dev_info.pci_dev->addr.devid;
+      pci_addr.function = dev_info.pci_dev->addr.function;
+
+      p =
+        hash_get (dm->conf->device_config_index_by_pci_addr,
+                  pci_addr.as_u32);
+    }
+
+  /* No per-device entry (or no PCI info): fall back to the default. */
+  if (p)
+    devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
+  else
+    devconf = &dm->conf->default_devconf;
+
+  if (devconf->hqos_enabled == 0)
+    {
+      vlib_cli_output (vm, "HQoS disabled for this interface");
+      goto done;
+    }
+
+  /* Detect the set of worker threads */
+  p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+  if (p == 0)
+    {
+      error = clib_error_return (0, "no worker registrations?");
+      goto done;
+    }
+  tr = (vlib_thread_registration_t *) p[0];
+
+  cfg = &devconf->hqos;
+  ht = xd->hqos_ht;
+  /* The first worker's copy is the one displayed. */
+  wk = &xd->hqos_wt[tr->first_index];
+  tctbl = wk->hqos_tc_table;
+
+  vlib_cli_output (vm, " Thread:");
+  vlib_cli_output (vm, " Input SWQ size = %u packets", cfg->swq_size);
+  vlib_cli_output (vm, " Enqueue burst size = %u packets",
+                   ht->hqos_burst_enq);
+  vlib_cli_output (vm, " Dequeue burst size = %u packets",
+                   ht->hqos_burst_deq);
+
+  vlib_cli_output (vm,
+                   " Packet field 0: slab position = %4u, slab bitmask = 0x%016llx",
+                   wk->hqos_field0_slabpos, wk->hqos_field0_slabmask);
+  vlib_cli_output (vm,
+                   " Packet field 1: slab position = %4u, slab bitmask = 0x%016llx",
+                   wk->hqos_field1_slabpos, wk->hqos_field1_slabmask);
+  vlib_cli_output (vm,
+                   " Packet field 2: slab position = %4u, slab bitmask = 0x%016llx",
+                   wk->hqos_field2_slabpos, wk->hqos_field2_slabmask);
+  vlib_cli_output (vm, " Packet field 2 translation table:");
+  vlib_cli_output (vm, " [ 0 .. 15]: "
+                   "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u",
+                   tctbl[0], tctbl[1], tctbl[2], tctbl[3],
+                   tctbl[4], tctbl[5], tctbl[6], tctbl[7],
+                   tctbl[8], tctbl[9], tctbl[10], tctbl[11],
+                   tctbl[12], tctbl[13], tctbl[14], tctbl[15]);
+  vlib_cli_output (vm, " [16 .. 31]: "
+                   "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u",
+                   tctbl[16], tctbl[17], tctbl[18], tctbl[19],
+                   tctbl[20], tctbl[21], tctbl[22], tctbl[23],
+                   tctbl[24], tctbl[25], tctbl[26], tctbl[27],
+                   tctbl[28], tctbl[29], tctbl[30], tctbl[31]);
+  vlib_cli_output (vm, " [32 .. 47]: "
+                   "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u",
+                   tctbl[32], tctbl[33], tctbl[34], tctbl[35],
+                   tctbl[36], tctbl[37], tctbl[38], tctbl[39],
+                   tctbl[40], tctbl[41], tctbl[42], tctbl[43],
+                   tctbl[44], tctbl[45], tctbl[46], tctbl[47]);
+  vlib_cli_output (vm, " [48 .. 63]: "
+                   "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u",
+                   tctbl[48], tctbl[49], tctbl[50], tctbl[51],
+                   tctbl[52], tctbl[53], tctbl[54], tctbl[55],
+                   tctbl[56], tctbl[57], tctbl[58], tctbl[59],
+                   tctbl[60], tctbl[61], tctbl[62], tctbl[63]);
+
+  vlib_cli_output (vm, " Port:");
+  vlib_cli_output (vm, " Rate = %u bytes/second", cfg->port.rate);
+  vlib_cli_output (vm, " MTU = %u bytes", cfg->port.mtu);
+  vlib_cli_output (vm, " Frame overhead = %u bytes",
+                   cfg->port.frame_overhead);
+  vlib_cli_output (vm, " Number of subports = %u",
+                   cfg->port.n_subports_per_port);
+  vlib_cli_output (vm, " Number of pipes per subport = %u",
+                   cfg->port.n_pipes_per_subport);
+  vlib_cli_output (vm,
+                   " Packet queue size: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u packets",
+                   cfg->port.qsize[0], cfg->port.qsize[1], cfg->port.qsize[2],
+                   cfg->port.qsize[3]);
+  vlib_cli_output (vm, " Number of pipe profiles = %u",
+                   cfg->port.n_pipe_profiles);
+
+  for (profile_id = 0; profile_id < vec_len (cfg->pipe); profile_id++)
+    {
+      vlib_cli_output (vm, " Pipe profile %u:", profile_id);
+      vlib_cli_output (vm, " Rate = %u bytes/second",
+                       cfg->pipe[profile_id].tb_rate);
+      vlib_cli_output (vm, " Token bucket size = %u bytes",
+                       cfg->pipe[profile_id].tb_size);
+      vlib_cli_output (vm,
+                       " Traffic class rate: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u bytes/second",
+                       cfg->pipe[profile_id].tc_rate[0],
+                       cfg->pipe[profile_id].tc_rate[1],
+                       cfg->pipe[profile_id].tc_rate[2],
+                       cfg->pipe[profile_id].tc_rate[3]);
+      vlib_cli_output (vm, " TC period = %u milliseconds",
+                       cfg->pipe[profile_id].tc_period);
+#ifdef RTE_SCHED_SUBPORT_TC_OV
+      vlib_cli_output (vm, " TC3 oversubscription_weight = %u",
+                       cfg->pipe[profile_id].tc_ov_weight);
+#endif
+
+      for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
+        {
+          vlib_cli_output (vm,
+                           " TC%u WRR weights: Q0 = %u, Q1 = %u, Q2 = %u, Q3 = %u",
+                           i, cfg->pipe[profile_id].wrr_weights[i * 4],
+                           cfg->pipe[profile_id].wrr_weights[i * 4 + 1],
+                           cfg->pipe[profile_id].wrr_weights[i * 4 + 2],
+                           cfg->pipe[profile_id].wrr_weights[i * 4 + 3]);
+        }
+    }
+
+#ifdef RTE_SCHED_RED
+  vlib_cli_output (vm, " Weighted Random Early Detection (WRED):");
+  for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
+    {
+      vlib_cli_output (vm, " TC%u min: G = %u, Y = %u, R = %u", i,
+                       cfg->port.red_params[i][e_RTE_METER_GREEN].min_th,
+                       cfg->port.red_params[i][e_RTE_METER_YELLOW].min_th,
+                       cfg->port.red_params[i][e_RTE_METER_RED].min_th);
+
+      vlib_cli_output (vm, " TC%u max: G = %u, Y = %u, R = %u", i,
+                       cfg->port.red_params[i][e_RTE_METER_GREEN].max_th,
+                       cfg->port.red_params[i][e_RTE_METER_YELLOW].max_th,
+                       cfg->port.red_params[i][e_RTE_METER_RED].max_th);
+
+      vlib_cli_output (vm,
+                       " TC%u inverted probability: G = %u, Y = %u, R = %u",
+                       i, cfg->port.red_params[i][e_RTE_METER_GREEN].maxp_inv,
+                       cfg->port.red_params[i][e_RTE_METER_YELLOW].maxp_inv,
+                       cfg->port.red_params[i][e_RTE_METER_RED].maxp_inv);
+
+      /* First value is the green meter colour, so label it G. */
+      vlib_cli_output (vm, " TC%u weight: G = %u, Y = %u, R = %u", i,
+                       cfg->port.red_params[i][e_RTE_METER_GREEN].wq_log2,
+                       cfg->port.red_params[i][e_RTE_METER_YELLOW].wq_log2,
+                       cfg->port.red_params[i][e_RTE_METER_RED].wq_log2);
+    }
+#endif
+
+done:
+  /* Single exit so the line input is released on every path. */
+  unformat_free (line_input);
+  return error;
+}
+
+/* *INDENT-OFF* */
+/* Registers the CLI path below with show_dpdk_if_hqos as handler. */
+VLIB_CLI_COMMAND (cmd_show_dpdk_if_hqos, static) = {
+ .path = "show dpdk interface hqos",
+ .short_help = "show dpdk interface hqos <if-name>",
+ .function = show_dpdk_if_hqos,
+};
+/* *INDENT-ON* */
+
clib_error_t *
dpdk_cli_init (vlib_main_t * vm)
{
diff --git a/vnet/vnet/devices/dpdk/device.c b/vnet/vnet/devices/dpdk/device.c
index 3649178f388..5d1fcf60d2d 100644
--- a/vnet/vnet/devices/dpdk/device.c
+++ b/vnet/vnet/devices/dpdk/device.c
@@ -332,7 +332,48 @@ static_always_inline
queue_id = (queue_id + 1) % xd->tx_q_used;
}
- if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_PMD))
+ if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_HQOS)) /* HQoS ON */
+ {
+ if (PREDICT_TRUE (tx_head > tx_tail))
+ {
+ /* no wrap, transmit in one burst */
+ dpdk_device_hqos_per_worker_thread_t *hqos =
+ &xd->hqos_wt[vm->cpu_index];
+
+ dpdk_hqos_metadata_set (hqos,
+ &tx_vector[tx_tail], tx_head - tx_tail);
+ rv = rte_ring_sp_enqueue_burst (hqos->swq,
+ (void **) &tx_vector[tx_tail],
+ (uint16_t) (tx_head - tx_tail));
+ }
+ else
+ {
+ /*
+ * This can only happen if there is a flowcontrol callback.
+ * We need to split the transmit into two calls: one for
+ * the packets up to the wrap point, and one to continue
+ * at the start of the ring.
+ * Transmit pkts up to the wrap point.
+ */
+ dpdk_device_hqos_per_worker_thread_t *hqos =
+ &xd->hqos_wt[vm->cpu_index];
+
+ dpdk_hqos_metadata_set (hqos,
+ &tx_vector[tx_tail],
+ xd->nb_tx_desc - tx_tail);
+ rv = rte_ring_sp_enqueue_burst (hqos->swq,
+ (void **) &tx_vector[tx_tail],
+ (uint16_t) (xd->nb_tx_desc -
+ tx_tail));
+ /*
+ * If we transmitted everything we wanted, then allow 1 retry
+ * so we can try to transmit the rest. If we didn't transmit
+ * everything, stop now.
+ */
+ n_retry = (rv == xd->nb_tx_desc - tx_tail) ? 1 : 0;
+ }
+ }
+ else if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_PMD))
{
if (PREDICT_TRUE (tx_head > tx_tail))
{
diff --git a/vnet/vnet/devices/dpdk/dpdk.h b/vnet/vnet/devices/dpdk/dpdk.h
index 48072560eed..2e72f5faad4 100644
--- a/vnet/vnet/devices/dpdk/dpdk.h
+++ b/vnet/vnet/devices/dpdk/dpdk.h
@@ -51,6 +51,7 @@
#include <rte_pci_dev_ids.h>
#include <rte_version.h>
#include <rte_eth_bond.h>
+#include <rte_sched.h>
#include <vnet/unix/pcap.h>
#include <vnet/devices/virtio/vhost-user.h>
@@ -184,6 +185,34 @@ typedef struct
+/*
+ * HQoS state a device keeps per thread: dpdk_device_t.hqos_wt is indexed by
+ * thread index (vm->cpu_index on the TX path, worker first_index + i in the
+ * CLI handlers).
+ */
typedef struct
{
+ /* Ring the TX path enqueues outgoing mbufs on (single-producer burst). */
+ struct rte_ring *swq;
+
+ /*
+ * Packet field N: slab position and slab bitmask, as set by the
+ * "set dpdk interface hqos pktfield" CLI (offset -> slabpos, mask ->
+ * slabmask). The CLI validates field 0 as the subport id mask, field 1 as
+ * the pipe id mask and field 2 as the TC table index mask.
+ * NOTE(review): slabshr is not written by the CLI code in this file;
+ * presumably a right-shift paired with the mask - confirm in hqos.c.
+ */
+ u64 hqos_field0_slabmask;
+ u32 hqos_field0_slabpos;
+ u32 hqos_field0_slabshr;
+ u64 hqos_field1_slabmask;
+ u32 hqos_field1_slabpos;
+ u32 hqos_field1_slabshr;
+ u64 hqos_field2_slabmask;
+ u32 hqos_field2_slabpos;
+ u32 hqos_field2_slabshr;
+ /*
+ * Packet field 2 translation table; the tctbl CLI stores
+ * tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue in each entry.
+ */
+ u32 hqos_tc_table[64];
+} dpdk_device_hqos_per_worker_thread_t;
+
+/*
+ * HQoS state a device keeps for its HQoS thread (dpdk_device_t.hqos_ht).
+ * NOTE(review): swq, pkts_enq, pkts_deq, pkts_enq_len and swq_pos are not
+ * used by the code visible here; presumably the worker rings and the
+ * enqueue/dequeue staging buffers of the HQoS thread loop - confirm in
+ * hqos.c.
+ */
+typedef struct
+{
+ struct rte_ring **swq;
+ struct rte_mbuf **pkts_enq;
+ struct rte_mbuf **pkts_deq;
+ /* Scheduler port handed to rte_sched_pipe_config/subport_config. */
+ struct rte_sched_port *hqos;
+ /* Shown by the CLI as enqueue / dequeue burst size, in packets. */
+ u32 hqos_burst_enq;
+ u32 hqos_burst_deq;
+ u32 pkts_enq_len;
+ u32 swq_pos;
+} dpdk_device_hqos_per_hqos_thread_t;
+
+typedef struct
+{
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
volatile u32 **lockp;
@@ -213,6 +242,7 @@ typedef struct
#define DPDK_DEVICE_FLAG_KNI (1 << 3)
#define DPDK_DEVICE_FLAG_VHOST_USER (1 << 4)
#define DPDK_DEVICE_FLAG_HAVE_SUBIF (1 << 5)
+#define DPDK_DEVICE_FLAG_HQOS (1 << 6)
u16 nb_tx_desc;
CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
@@ -230,6 +260,10 @@ typedef struct
struct rte_eth_conf port_conf;
struct rte_eth_txconf tx_conf;
+ /* HQoS related */
+ dpdk_device_hqos_per_worker_thread_t *hqos_wt;
+ dpdk_device_hqos_per_hqos_thread_t *hqos_ht;
+
/* KNI related */
struct rte_kni *kni;
u8 kni_port_id;
@@ -281,6 +315,14 @@ typedef struct
+/* Per HQoS thread data; dpdk_main_t.hqos_threads points at these. */
typedef struct
{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+
+ /* total input packet counter */
+ u64 aggregate_rx_packets;
+} dpdk_hqos_thread_t;
+
+typedef struct
+{
u32 device;
u16 queue_id;
} dpdk_device_and_queue_t;
@@ -302,6 +344,42 @@ typedef struct dpdk_efd_t
u16 pad;
} dpdk_efd_t;
+#ifndef DPDK_HQOS_DBG_BYPASS
+#define DPDK_HQOS_DBG_BYPASS 0
+#endif
+
+/* Per-device HQoS configuration (dpdk_device_config_t.hqos). */
+typedef struct dpdk_device_config_hqos_t
+{
+ /* Set by unformat_hqos from "hqos-thread <n>"; valid flags that it was. */
+ u32 hqos_thread;
+ u32 hqos_thread_valid;
+
+ /* Shown by the CLI as "Input SWQ size", in packets. */
+ u32 swq_size;
+ u32 burst_enq;
+ u32 burst_deq;
+
+ /*
+ * Configured packet field slab positions and masks.
+ * NOTE(review): presumably the initial values of the per-worker
+ * hqos_fieldN_slabpos / slabmask - confirm in dpdk_port_setup_hqos.
+ */
+ u32 pktfield0_slabpos;
+ u32 pktfield1_slabpos;
+ u32 pktfield2_slabpos;
+ u64 pktfield0_slabmask;
+ u64 pktfield1_slabmask;
+ u64 pktfield2_slabmask;
+ /* Its element count is what the CLI reports as the TC table size. */
+ u32 tc_table[64];
+
+ /* DPDK scheduler port parameters (rate, mtu, qsize, ...). */
+ struct rte_sched_port_params port;
+ struct rte_sched_subport_params *subport;
+ /* vppinfra vector of pipe profiles; the CLI walks it with vec_len. */
+ struct rte_sched_pipe_params *pipe;
+ uint32_t *pipe_map;
+} dpdk_device_config_hqos_t;
+
+/* Callers treat a non-zero return as "mask is invalid for n entries". */
+int dpdk_hqos_validate_mask (u64 mask, u32 n);
+void dpdk_device_config_hqos_pipe_profile_default (dpdk_device_config_hqos_t *
+ hqos, u32 pipe_profile_id);
+void dpdk_device_config_hqos_default (dpdk_device_config_hqos_t * hqos);
+clib_error_t *dpdk_port_setup_hqos (dpdk_device_t * xd,
+ dpdk_device_config_hqos_t * hqos);
+/* Called by the TX path on a burst of mbufs before it is put on the swq. */
+void dpdk_hqos_metadata_set (dpdk_device_hqos_per_worker_thread_t * hqos,
+ struct rte_mbuf **pkts, u32 n_pkts);
+
#define foreach_dpdk_device_config_item \
_ (num_rx_queues) \
_ (num_tx_queues) \
@@ -322,6 +400,8 @@ typedef struct
foreach_dpdk_device_config_item
#undef _
clib_bitmap_t * workers;
+ u32 hqos_enabled;
+ dpdk_device_config_hqos_t hqos;
} dpdk_device_config_t;
typedef struct
@@ -370,6 +450,7 @@ typedef struct
/* Devices */
dpdk_device_t *devices;
dpdk_device_and_queue_t **devices_by_cpu;
+ dpdk_device_and_queue_t **devices_by_hqos_cpu;
/* per-thread recycle lists */
u32 **recycle;
@@ -386,6 +467,8 @@ typedef struct
/* dpdk worker "threads" */
dpdk_worker_t *workers;
+ /* dpdk HQoS "threads" */
+ dpdk_hqos_thread_t *hqos_threads;
/* Ethernet input node index */
u32 ethernet_input_node_index;
@@ -420,6 +503,10 @@ typedef struct
int input_cpu_first_index;
int input_cpu_count;
+ /* which cpus are running I/O TX */
+ int hqos_cpu_first_index;
+ int hqos_cpu_count;
+
/* control interval of dpdk link state and stat polling */
f64 link_state_poll_interval;
f64 stat_poll_interval;
@@ -618,6 +705,8 @@ format_function_t format_dpdk_rte_mbuf;
format_function_t format_dpdk_rx_rte_mbuf;
unformat_function_t unformat_socket_mem;
clib_error_t *unformat_rss_fn (unformat_input_t * input, uword * rss_fn);
+clib_error_t *unformat_hqos (unformat_input_t * input,
+ dpdk_device_config_hqos_t * hqos);
static inline void
diff --git a/vnet/vnet/devices/dpdk/format.c b/vnet/vnet/devices/dpdk/format.c
index ef7ee0e7853..1b3fb5ef37d 100644
--- a/vnet/vnet/devices/dpdk/format.c
+++ b/vnet/vnet/devices/dpdk/format.c
@@ -864,6 +864,26 @@ unformat_rss_fn (unformat_input_t * input, uword * rss_fn)
return 0;
}
+/*
+ * Parse the HQoS sub-options of a device configuration.
+ *
+ * The only recognised option is "hqos-thread <n>", which stores <n> in
+ * hqos->hqos_thread and sets hqos->hqos_thread_valid. Parsing stops at the
+ * first unrecognised token.
+ *
+ * Returns 0 once the input is exhausted, or a clib error describing the
+ * unknown input.
+ */
+clib_error_t *
+unformat_hqos (unformat_input_t * input, dpdk_device_config_hqos_t * hqos)
+{
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (!unformat (input, "hqos-thread %u", &hqos->hqos_thread))
+        return clib_error_return (0, "unknown input `%U'",
+                                  format_unformat_error, input);
+
+      hqos->hqos_thread_valid = 1;
+    }
+
+  return 0;
+}
+
/*
* fd.io coding-style-patch-verification: ON
*
diff --git a/vnet/vnet/devices/dpdk/hqos.c b/vnet/vnet/devices/dpdk/hqos.c
new file mode 100644
index 00000000000..d05ae09ac2b
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/hqos.c
@@ -0,0 +1,742 @@
+/*
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/bitmap.h>
+
+#include <vnet/vnet.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include <vlib/unix/physmem.h>
+#include <vlib/pci/pci.h>
+#include <vlibmemory/api.h>
+#include <vlibmemory/vl_memory_msg_enum.h> /* enumerate all vlib messages */
+
+#define vl_typedefs /* define message structures */
+#include <vlibmemory/vl_memory_api_h.h>
+#undef vl_typedefs
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__)
+#define vl_printfun
+#include <vlibmemory/vl_memory_api_h.h>
+#undef vl_printfun
+
+#include "dpdk_priv.h"
+
+/*
+ * File-scope dpdk_main_t instance. The HQoS thread functions below reach
+ * the device tables (devices, devices_by_hqos_cpu) through it.
+ */
+dpdk_main_t dpdk_main;
+
+/***
+ *
+ * HQoS default configuration values
+ *
+ ***/
+
+/*
+ * Template HQoS device configuration. dpdk_device_config_hqos_default()
+ * copies it into a device's configuration before allocating the pipe,
+ * subport and pipe-map vectors.
+ */
+static dpdk_device_config_hqos_t hqos_params_default = {
+ .hqos_thread_valid = 0,
+
+ /* swq_size is the ring size passed to rte_ring_create() for each SWQ;
+ * burst_enq / burst_deq are the burst sizes used by the HQoS thread loop. */
+ .swq_size = 4096,
+ .burst_enq = 256,
+ .burst_deq = 220,
+
+ /*
+ * Packet field to identify the subport.
+ *
+ * Default value: Since only one subport is defined by default (see below:
+ * n_subports_per_port = 1), the subport ID is hardcoded to 0.
+ */
+ .pktfield0_slabpos = 0,
+ .pktfield0_slabmask = 0,
+
+ /*
+ * Packet field to identify the pipe.
+ *
+ * Default value: Assuming Ethernet/IPv4/UDP packets, UDP payload bits 12 .. 23
+ */
+ .pktfield1_slabpos = 40,
+ .pktfield1_slabmask = 0x0000000FFF000000LLU,
+
+ /* Packet field used as index into TC translation table to identify the traffic
+ * class and queue.
+ *
+ * Default value: Assuming Ethernet/IPv4 packets, IPv4 DSCP field
+ */
+ .pktfield2_slabpos = 8,
+ .pktfield2_slabmask = 0x00000000000000FCLLU,
+ /* 64-entry translation table; dpdk_hqos_metadata_set() reads each entry
+ * as (traffic class << 2) | (queue within traffic class). */
+ .tc_table = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ },
+
+ /* port */
+ .port = {
+ .name = NULL, /* Set at init */
+ .socket = 0, /* Set at init */
+ .rate = 1250000000, /* Assuming 10GbE port */
+ .mtu = 14 + 1500, /* Assuming Ethernet/IPv4 pkt (Ethernet FCS not included) */
+ .frame_overhead = RTE_SCHED_FRAME_OVERHEAD_DEFAULT,
+ .n_subports_per_port = 1,
+ .n_pipes_per_subport = 4096,
+ .qsize = {64, 64, 64, 64},
+ .pipe_profiles = NULL, /* Set at config */
+ .n_pipe_profiles = 1,
+
+#ifdef RTE_SCHED_RED
+ /* RED parameters, indexed [traffic class][color]; identical for all four
+ * traffic classes, only min_th differs per color. */
+ .red_params = {
+ /* Traffic Class 0 Colors Green / Yellow / Red */
+ [0][0] = {.min_th = 48,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [0][1] = {.min_th = 40,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [0][2] = {.min_th = 32,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+
+ /* Traffic Class 1 - Colors Green / Yellow / Red */
+ [1][0] = {.min_th = 48,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [1][1] = {.min_th = 40,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [1][2] = {.min_th = 32,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+
+ /* Traffic Class 2 - Colors Green / Yellow / Red */
+ [2][0] = {.min_th = 48,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [2][1] = {.min_th = 40,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [2][2] = {.min_th = 32,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+
+ /* Traffic Class 3 - Colors Green / Yellow / Red */
+ [3][0] = {.min_th = 48,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [3][1] = {.min_th = 40,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9},
+ [3][2] = {.min_th = 32,.max_th = 64,.maxp_inv =
+ 10,.wq_log2 = 9}
+ },
+#endif /* RTE_SCHED_RED */
+ },
+};
+
+/*
+ * Template subport parameters; dpdk_device_config_hqos_default() copies
+ * them into every element of a device's subport vector.
+ */
+static struct rte_sched_subport_params hqos_subport_params_default = {
+ .tb_rate = 1250000000, /* 10GbE line rate (measured in bytes/second) */
+ .tb_size = 1000000,
+ .tc_rate = {1250000000, 1250000000, 1250000000, 1250000000},
+ .tc_period = 10,
+};
+
+/*
+ * Template pipe profile; copied into a device's pipe profile vector by
+ * dpdk_device_config_hqos_default() and
+ * dpdk_device_config_hqos_pipe_profile_default().
+ */
+static struct rte_sched_pipe_params hqos_pipe_params_default = {
+ .tb_rate = 305175, /* 10GbE line rate divided by 4K pipes */
+ .tb_size = 1000000,
+ .tc_rate = {305175, 305175, 305175, 305175},
+ .tc_period = 40,
+#ifdef RTE_SCHED_SUBPORT_TC_OV
+ .tc_ov_weight = 1,
+#endif
+ .wrr_weights = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+};
+
+/***
+ *
+ * HQoS configuration
+ *
+ ***/
+
+/*
+ * dpdk_hqos_validate_mask - check a packet-field slab mask against the
+ * number of entities it has to select between.
+ *
+ * mask  64-bit slab mask.
+ * n     Number of entities the masked field indexes.
+ *
+ * Returns:
+ *    0  mask is acceptable (including mask == 0 with n == 1)
+ *   -1  n is 0
+ *   -2  exactly one of "mask == 0" and "n == 1" holds
+ *   -3  the set bits of mask are not contiguous
+ *   -4  popcount (mask) differs from popcount (n - 1)
+ */
+int
+dpdk_hqos_validate_mask (u64 mask, u32 n)
+{
+  int count, pos_lead, pos_trail, count_expected;
+
+  /* Handle the exceptions */
+  if (n == 0)
+    return -1;			/* Error */
+
+  if (mask == 0)
+    return (n == 1) ? 0 : -2;	/* OK only when a single entity exists */
+
+  if (n == 1)
+    return -2;			/* Error: non-empty mask for a single entity */
+
+  /*
+   * mask is known to be non-zero from here on. __builtin_clzll and
+   * __builtin_ctzll are undefined for a zero argument, so they must not
+   * be evaluated before the checks above.
+   */
+  count = __builtin_popcountll (mask);
+  pos_lead = sizeof (u64) * 8 - __builtin_clzll (mask);
+  pos_trail = __builtin_ctzll (mask);
+  count_expected = __builtin_popcount (n - 1);
+
+  /* Check that mask is contiguous */
+  if ((pos_lead - pos_trail) != count)
+    return -3;			/* Error */
+
+  /* Check that mask contains the expected number of bits set */
+  if (count != count_expected)
+    return -4;			/* Error */
+
+  return 0;			/* OK */
+}
+
+/*
+ * Overwrite pipe profile pipe_profile_id of the given HQoS configuration
+ * with the built-in default pipe parameters.
+ */
+void
+dpdk_device_config_hqos_pipe_profile_default (dpdk_device_config_hqos_t *
+                                              hqos, u32 pipe_profile_id)
+{
+  struct rte_sched_pipe_params *profile = &hqos->pipe[pipe_profile_id];
+
+  memcpy (profile, &hqos_pipe_params_default,
+          sizeof (hqos_pipe_params_default));
+}
+
+/*
+ * Fill an HQoS device configuration with the built-in defaults.
+ *
+ * The scalar/port settings are copied from hqos_params_default, then the
+ * pipe profile, subport and pipe-map vectors are grown and populated:
+ * every pipe profile and subport gets the default parameters and every
+ * pipe is mapped to pipe profile 0.
+ */
+void
+dpdk_device_config_hqos_default (dpdk_device_config_hqos_t * hqos)
+{
+  struct rte_sched_pipe_params *pipe_slot;
+  struct rte_sched_subport_params *subport_slot;
+  u32 *map_slot;
+  u32 k;
+
+  memcpy (hqos, &hqos_params_default, sizeof (hqos_params_default));
+
+  /* pipe profiles */
+  vec_add2 (hqos->pipe, pipe_slot, hqos->port.n_pipe_profiles);
+  k = 0;
+  while (k < vec_len (hqos->pipe))
+    {
+      memcpy (pipe_slot + k, &hqos_pipe_params_default,
+              sizeof (hqos_pipe_params_default));
+      k++;
+    }
+  hqos->port.pipe_profiles = hqos->pipe;
+
+  /* subports */
+  vec_add2 (hqos->subport, subport_slot, hqos->port.n_subports_per_port);
+  k = 0;
+  while (k < vec_len (hqos->subport))
+    {
+      memcpy (subport_slot + k, &hqos_subport_params_default,
+              sizeof (hqos_subport_params_default));
+      k++;
+    }
+
+  /* pipe -> pipe profile map, one entry per pipe of every subport */
+  vec_add2 (hqos->pipe_map, map_slot,
+            hqos->port.n_subports_per_port * hqos->port.n_pipes_per_subport);
+  k = 0;
+  while (k < vec_len (hqos->pipe_map))
+    {
+      map_slot[k] = 0;
+      k++;
+    }
+}
+
+/***
+ *
+ * HQoS init
+ *
+ ***/
+
+/*
+ * Right-shift that moves the field selected by a slab mask down to bit 0.
+ *
+ * __builtin_ctzll is undefined for a zero argument, and the default
+ * configuration uses an all-zero mask for the subport field. An empty mask
+ * always extracts 0, so a shift of 0 is used for it.
+ */
+static inline u32
+dpdk_hqos_slab_shift (u64 slabmask)
+{
+  return slabmask ? (u32) __builtin_ctzll (slabmask) : 0;
+}
+
+/*
+ * dpdk_port_setup_hqos - create the HQoS run-time state of one device.
+ *
+ * xd    Device to set up; xd->hqos_wt and xd->hqos_ht are (re)initialised.
+ * hqos  HQoS configuration of the device; hqos->port.name and
+ *       hqos->port.socket are filled in here.
+ *
+ * Creates one single-producer/single-consumer software queue (SWQ) per
+ * worker thread, configures the rte_sched port with its subports and
+ * pipes, and fills the per-HQoS-thread and per-worker-thread data.
+ *
+ * Returns 0 on success or a clib error describing the step that failed.
+ */
+clib_error_t *
+dpdk_port_setup_hqos (dpdk_device_t * xd, dpdk_device_config_hqos_t * hqos)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  char name[32];
+  u32 subport_id, i;
+  int rv;
+
+  /* Detect the set of worker threads */
+  int worker_thread_first = 0;
+  int worker_thread_count = 0;
+
+  uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+  vlib_thread_registration_t *tr =
+    p ? (vlib_thread_registration_t *) p[0] : 0;
+
+  if (tr && tr->count > 0)
+    {
+      worker_thread_first = tr->first_index;
+      worker_thread_count = tr->count;
+    }
+
+  /* Allocate the per-thread device data array */
+  vec_validate_aligned (xd->hqos_wt, tm->n_vlib_mains - 1,
+                        CLIB_CACHE_LINE_BYTES);
+  memset (xd->hqos_wt, 0, tm->n_vlib_mains * sizeof (xd->hqos_wt[0]));
+
+  vec_validate_aligned (xd->hqos_ht, 0, CLIB_CACHE_LINE_BYTES);
+  memset (xd->hqos_ht, 0, sizeof (xd->hqos_ht[0]));
+
+  /* Allocate space for one SWQ per worker thread in the I/O TX thread data structure */
+  vec_validate (xd->hqos_ht->swq, worker_thread_count - 1);
+
+  /* SWQ */
+  for (i = 0; i < worker_thread_count; i++)
+    {
+      u32 swq_flags = RING_F_SP_ENQ | RING_F_SC_DEQ;
+
+      snprintf (name, sizeof (name), "SWQ-worker%u-to-device%u", i,
+                xd->device_index);
+      xd->hqos_ht->swq[i] =
+        rte_ring_create (name, hqos->swq_size, xd->cpu_socket, swq_flags);
+      if (xd->hqos_ht->swq[i] == NULL)
+        return clib_error_return (0,
+                                  "SWQ-worker%u-to-device%u: rte_ring_create err",
+                                  i, xd->device_index);
+    }
+
+  /*
+   * HQoS
+   */
+
+  /* HQoS port */
+  snprintf (name, sizeof (name), "HQoS%u", xd->device_index);
+  hqos->port.name = strdup (name);
+  if (hqos->port.name == NULL)
+    return clib_error_return (0, "HQoS%u: strdup err", xd->device_index);
+
+  hqos->port.socket = rte_eth_dev_socket_id (xd->device_index);
+  if (hqos->port.socket == SOCKET_ID_ANY)
+    hqos->port.socket = 0;
+
+  xd->hqos_ht->hqos = rte_sched_port_config (&hqos->port);
+  if (xd->hqos_ht->hqos == NULL)
+    return clib_error_return (0, "HQoS%u: rte_sched_port_config err",
+                              xd->device_index);
+
+  /* HQoS subport */
+  for (subport_id = 0; subport_id < hqos->port.n_subports_per_port;
+       subport_id++)
+    {
+      u32 pipe_id;
+
+      rv =
+        rte_sched_subport_config (xd->hqos_ht->hqos, subport_id,
+                                  &hqos->subport[subport_id]);
+      if (rv)
+        return clib_error_return (0,
+                                  "HQoS%u subport %u: rte_sched_subport_config err (%d)",
+                                  xd->device_index, subport_id, rv);
+
+      /* HQoS pipe */
+      for (pipe_id = 0; pipe_id < hqos->port.n_pipes_per_subport; pipe_id++)
+        {
+          u32 pos = subport_id * hqos->port.n_pipes_per_subport + pipe_id;
+          u32 profile_id = hqos->pipe_map[pos];
+
+          rv =
+            rte_sched_pipe_config (xd->hqos_ht->hqos, subport_id, pipe_id,
+                                   profile_id);
+          if (rv)
+            return clib_error_return (0,
+                                      "HQoS%u subport %u pipe %u: rte_sched_pipe_config err (%d)",
+                                      xd->device_index, subport_id, pipe_id,
+                                      rv);
+        }
+    }
+
+  /* Set up per-thread device data for the I/O TX thread */
+  xd->hqos_ht->hqos_burst_enq = hqos->burst_enq;
+  xd->hqos_ht->hqos_burst_deq = hqos->burst_deq;
+  /*
+   * The enqueue buffer holds up to two bursts: the thread loop may append
+   * a full burst to a partially filled (< burst_enq) buffer.
+   */
+  vec_validate (xd->hqos_ht->pkts_enq, 2 * hqos->burst_enq - 1);
+  vec_validate (xd->hqos_ht->pkts_deq, hqos->burst_deq - 1);
+  xd->hqos_ht->pkts_enq_len = 0;
+  xd->hqos_ht->swq_pos = 0;
+
+  /* Set up per-thread device data for each worker thread */
+  for (i = 0; i < worker_thread_count; i++)
+    {
+      u32 tid = worker_thread_first + i;
+
+      xd->hqos_wt[tid].swq = xd->hqos_ht->swq[i];
+      xd->hqos_wt[tid].hqos_field0_slabpos = hqos->pktfield0_slabpos;
+      xd->hqos_wt[tid].hqos_field0_slabmask = hqos->pktfield0_slabmask;
+      xd->hqos_wt[tid].hqos_field0_slabshr =
+        dpdk_hqos_slab_shift (hqos->pktfield0_slabmask);
+      xd->hqos_wt[tid].hqos_field1_slabpos = hqos->pktfield1_slabpos;
+      xd->hqos_wt[tid].hqos_field1_slabmask = hqos->pktfield1_slabmask;
+      xd->hqos_wt[tid].hqos_field1_slabshr =
+        dpdk_hqos_slab_shift (hqos->pktfield1_slabmask);
+      xd->hqos_wt[tid].hqos_field2_slabpos = hqos->pktfield2_slabpos;
+      xd->hqos_wt[tid].hqos_field2_slabmask = hqos->pktfield2_slabmask;
+      xd->hqos_wt[tid].hqos_field2_slabshr =
+        dpdk_hqos_slab_shift (hqos->pktfield2_slabmask);
+      memcpy (xd->hqos_wt[tid].hqos_tc_table, hqos->tc_table,
+              sizeof (hqos->tc_table));
+    }
+
+  return 0;
+}
+
+/***
+ *
+ * HQoS run-time
+ *
+ ***/
+/*
+ * dpdk_hqos_thread_internal_hqos_dbg_bypass - HQoS thread main loop with
+ * the scheduler bypassed.
+ *
+ * vm
+ *   vlib main of the current thread
+ *
+ * Round-robins over the devices assigned to this thread. For each device,
+ * packets are read from the per-worker software queues (SWQ) and, once at
+ * least one enqueue burst has accumulated, sent directly with
+ * rte_eth_tx_burst; packets the NIC does not accept are freed.
+ * Never returns.
+ */
+static_always_inline void
+dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm)
+{
+  dpdk_main_t *dm = &dpdk_main;
+  u32 cpu_index = vm->cpu_index;
+  u32 dev_pos;
+
+  dev_pos = 0;
+  while (1)
+    {
+      vlib_worker_thread_barrier_check ();
+
+      u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]);
+      /* Same guard as dpdk_hqos_thread_internal: never index an empty
+       * device list, just keep servicing the barrier. */
+      if (PREDICT_FALSE (n_devs == 0))
+        {
+          dev_pos = 0;
+          continue;
+        }
+      if (dev_pos >= n_devs)
+        dev_pos = 0;
+
+      dpdk_device_and_queue_t *dq =
+        vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos);
+      dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device);
+
+      dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht;
+      u32 device_index = xd->device_index;
+      u16 queue_id = dq->queue_id;
+
+      struct rte_mbuf **pkts_enq = hqos->pkts_enq;
+      u32 pkts_enq_len = hqos->pkts_enq_len;
+      u32 swq_pos = hqos->swq_pos;
+      u32 n_swq = vec_len (hqos->swq), i;
+
+      for (i = 0; i < n_swq; i++)
+        {
+          /* Get current SWQ for this device */
+          struct rte_ring *swq = hqos->swq[swq_pos];
+
+          /* Read SWQ burst to packet buffer of this device */
+          pkts_enq_len += rte_ring_sc_dequeue_burst (swq,
+                                                     (void **)
+                                                     &pkts_enq[pkts_enq_len],
+                                                     hqos->hqos_burst_enq);
+
+          /* Get next SWQ for this device */
+          swq_pos++;
+          if (swq_pos >= n_swq)
+            swq_pos = 0;
+          hqos->swq_pos = swq_pos;
+
+          /* HWQ TX enqueue when burst available */
+          if (pkts_enq_len >= hqos->hqos_burst_enq)
+            {
+              u32 n_pkts = rte_eth_tx_burst (device_index,
+                                             (uint16_t) queue_id,
+                                             pkts_enq,
+                                             (uint16_t) pkts_enq_len);
+
+              /* Drop whatever the TX queue did not accept */
+              for (; n_pkts < pkts_enq_len; n_pkts++)
+                rte_pktmbuf_free (pkts_enq[n_pkts]);
+
+              pkts_enq_len = 0;
+              break;
+            }
+        }
+      /* Keep a partial burst for the next visit of this device */
+      hqos->pkts_enq_len = pkts_enq_len;
+
+      /* Advance to next device */
+      dev_pos++;
+    }
+}
+
+/*
+ * dpdk_hqos_thread_internal - HQoS thread main loop.
+ *
+ * Round-robins over the devices assigned to this thread. Each visit of a
+ * device (1) moves packets from the per-worker software queues (SWQ) into
+ * the HQoS scheduler once an enqueue burst has accumulated, and (2)
+ * dequeues one burst from the scheduler and transmits it on the device.
+ * Never returns.
+ */
+static_always_inline void
+dpdk_hqos_thread_internal (vlib_main_t * vm)
+{
+ dpdk_main_t *dm = &dpdk_main;
+ u32 cpu_index = vm->cpu_index;
+ u32 dev_pos;
+
+ dev_pos = 0;
+ while (1)
+ {
+ vlib_worker_thread_barrier_check ();
+
+ u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]);
+ /* No device assigned: keep servicing the barrier only */
+ if (PREDICT_FALSE (n_devs == 0))
+ {
+ dev_pos = 0;
+ continue;
+ }
+ if (dev_pos >= n_devs)
+ dev_pos = 0;
+
+ dpdk_device_and_queue_t *dq =
+ vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos);
+ dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device);
+
+ dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht;
+ u32 device_index = xd->device_index;
+ u16 queue_id = dq->queue_id;
+
+ struct rte_mbuf **pkts_enq = hqos->pkts_enq;
+ struct rte_mbuf **pkts_deq = hqos->pkts_deq;
+ u32 pkts_enq_len = hqos->pkts_enq_len;
+ u32 swq_pos = hqos->swq_pos;
+ u32 n_swq = vec_len (hqos->swq), i;
+
+ /*
+ * SWQ dequeue and HQoS enqueue for current device
+ */
+ for (i = 0; i < n_swq; i++)
+ {
+ /* Get current SWQ for this device */
+ struct rte_ring *swq = hqos->swq[swq_pos];
+
+ /* Read SWQ burst to packet buffer of this device */
+ pkts_enq_len += rte_ring_sc_dequeue_burst (swq,
+ (void **)
+ &pkts_enq[pkts_enq_len],
+ hqos->hqos_burst_enq);
+
+ /* Get next SWQ for this device */
+ swq_pos++;
+ if (swq_pos >= n_swq)
+ swq_pos = 0;
+ hqos->swq_pos = swq_pos;
+
+ /* HQoS enqueue when burst available */
+ if (pkts_enq_len >= hqos->hqos_burst_enq)
+ {
+ rte_sched_port_enqueue (hqos->hqos, pkts_enq, pkts_enq_len);
+
+ pkts_enq_len = 0;
+ break;
+ }
+ }
+ /* A partial burst is kept and completed on the next visit of this device */
+ hqos->pkts_enq_len = pkts_enq_len;
+
+ /*
+ * HQoS dequeue and HWQ TX enqueue for current device
+ */
+ {
+ u32 pkts_deq_len, n_pkts;
+
+ pkts_deq_len = rte_sched_port_dequeue (hqos->hqos,
+ pkts_deq,
+ hqos->hqos_burst_deq);
+
+ /* Retry rte_eth_tx_burst until every dequeued packet has been accepted */
+ for (n_pkts = 0; n_pkts < pkts_deq_len;)
+ n_pkts += rte_eth_tx_burst (device_index,
+ (uint16_t) queue_id,
+ &pkts_deq[n_pkts],
+ (uint16_t) (pkts_deq_len - n_pkts));
+ }
+
+ /* Advance to next device */
+ dev_pos++;
+ }
+}
+
+/*
+ * dpdk_hqos_thread - body of an HQoS thread.
+ *
+ * w
+ *   Information for the current thread
+ *
+ * Initialises the thread's clock and heap, waits until the worker threads
+ * are released, reports an error if no device is assigned to this thread,
+ * then enters the HQoS main loop (or the bypass loop when
+ * DPDK_HQOS_DBG_BYPASS is set).
+ */
+void
+dpdk_hqos_thread (vlib_worker_thread_t * w)
+{
+  vlib_main_t *vm;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  dpdk_main_t *dm = &dpdk_main;
+
+  vm = vlib_get_main ();
+
+  ASSERT (vm->cpu_index == os_get_cpu_number ());
+
+  clib_time_init (&vm->clib_time);
+  clib_mem_set_heap (w->thread_mheap);
+
+  /* Wait until the dpdk init sequence is complete */
+  while (tm->worker_thread_release == 0)
+    vlib_worker_thread_barrier_check ();
+
+  if (vec_len (dm->devices_by_hqos_cpu[vm->cpu_index]) == 0)
+    {
+      /* A void function must not "return <expression>;" in ISO C, so the
+       * error report and the return are separate statements. */
+      clib_error
+        ("current I/O TX thread does not have any devices assigned to it");
+      return;
+    }
+
+  if (DPDK_HQOS_DBG_BYPASS)
+    dpdk_hqos_thread_internal_hqos_dbg_bypass (vm);
+  else
+    dpdk_hqos_thread_internal (vm);
+}
+
+/*
+ * Entry point registered for the "hqos-threads" thread type: performs the
+ * generic worker thread initialisation, then runs the HQoS thread body.
+ * arg is the vlib_worker_thread_t of the thread being started.
+ */
+void
+dpdk_hqos_thread_fn (void *arg)
+{
+  vlib_worker_thread_t *self = arg;
+
+  vlib_worker_thread_init (self);
+  dpdk_hqos_thread (self);
+}
+
+/*
+ * Register the "hqos-threads" thread type with dpdk_hqos_thread_fn as its
+ * thread function.
+ */
+/* *INDENT-OFF* */
+VLIB_REGISTER_THREAD (hqos_thread_reg, static) =
+{
+ .name = "hqos-threads",
+ .short_name = "hqos-threads",
+ .function = dpdk_hqos_thread_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * HQoS run-time code to be called by the worker threads
+ */
+/*
+ * BITFIELD - extract a field from packet data.
+ *
+ * Reads the 8 bytes starting at byte_array[slab_pos], converts them from
+ * big-endian to host order, applies slab_mask and shifts the result right
+ * by slab_shr. Evaluates to a u64.
+ *
+ * The slab is loaded with memcpy because slab_pos is an arbitrary byte
+ * offset: dereferencing a casted u64 pointer there would be an unaligned,
+ * type-punned access. At least slab_pos + 8 bytes must be readable.
+ */
+#define BITFIELD(byte_array, slab_pos, slab_mask, slab_shr)            \
+({                                                                     \
+  u64 slab_;                                                           \
+  memcpy (&slab_, &(byte_array)[(slab_pos)], sizeof (slab_));          \
+  (u64) ((rte_be_to_cpu_64 (slab_) & (slab_mask)) >> (slab_shr));      \
+})
+
+/*
+ * Pack the scheduler hierarchy of a packet into a 64-bit value:
+ *   bits  0..1   queue within traffic class
+ *   bits  2..3   traffic class
+ *   bits  4..5   color
+ *   bits 16..31  subport
+ *   bits 32..63  pipe
+ */
+#define RTE_SCHED_PORT_HIERARCHY(subport, pipe, traffic_class, queue, color) \
+  ((((u64) (queue)) & 0x3) |                                           \
+  ((((u64) (traffic_class)) & 0x3) << 2) |                             \
+  ((((u64) (color)) & 0x3) << 4) |                                     \
+  ((((u64) (subport)) & 0xFFFF) << 16) |                               \
+  ((((u64) (pipe)) & 0xFFFFFFFF) << 32))
+
+/*
+ * Classify one packet: extract subport, pipe and the TC-table index from
+ * the packet data using the per-worker field descriptors, translate the
+ * index into traffic class and queue, and store the packed hierarchy
+ * (color 0) in the mbuf sched field.
+ */
+static_always_inline void
+dpdk_hqos_metadata_set_one (dpdk_device_hqos_per_worker_thread_t * hqos,
+                            struct rte_mbuf *pkt)
+{
+  u8 *data = rte_pktmbuf_mtod (pkt, u8 *);
+
+  u64 subport_id = BITFIELD (data, hqos->hqos_field0_slabpos,
+                             hqos->hqos_field0_slabmask,
+                             hqos->hqos_field0_slabshr);
+  u64 pipe_id = BITFIELD (data, hqos->hqos_field1_slabpos,
+                          hqos->hqos_field1_slabmask,
+                          hqos->hqos_field1_slabshr);
+  u64 dscp = BITFIELD (data, hqos->hqos_field2_slabpos,
+                       hqos->hqos_field2_slabmask,
+                       hqos->hqos_field2_slabshr);
+
+  /* Table entry layout: (traffic class << 2) | queue */
+  u32 tc = hqos->hqos_tc_table[dscp & 0x3F] >> 2;
+  u32 tc_q = hqos->hqos_tc_table[dscp & 0x3F] & 0x3;
+
+  u64 sched = RTE_SCHED_PORT_HIERARCHY (subport_id, pipe_id, tc, tc_q, 0);
+
+  pkt->hash.sched.lo = sched & 0xFFFFFFFF;
+  pkt->hash.sched.hi = sched >> 32;
+}
+
+/*
+ * dpdk_hqos_metadata_set - write the HQoS scheduler metadata of a burst.
+ *
+ * hqos    Per-worker-thread HQoS data of the output device (field
+ *         positions, masks, shifts and TC translation table).
+ * pkts    Packets to classify.
+ * n_pkts  Number of entries in pkts.
+ *
+ * Packets are handled four at a time, then one by one for the remainder.
+ */
+void
+dpdk_hqos_metadata_set (dpdk_device_hqos_per_worker_thread_t * hqos,
+                        struct rte_mbuf **pkts, u32 n_pkts)
+{
+  u32 n_quad = n_pkts & (~0x3);
+  u32 idx;
+
+  for (idx = 0; idx < n_quad; idx += 4)
+    {
+      dpdk_hqos_metadata_set_one (hqos, pkts[idx]);
+      dpdk_hqos_metadata_set_one (hqos, pkts[idx + 1]);
+      dpdk_hqos_metadata_set_one (hqos, pkts[idx + 2]);
+      dpdk_hqos_metadata_set_one (hqos, pkts[idx + 3]);
+    }
+
+  while (idx < n_pkts)
+    {
+      dpdk_hqos_metadata_set_one (hqos, pkts[idx]);
+      idx++;
+    }
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/vnet/vnet/devices/dpdk/init.c b/vnet/vnet/devices/dpdk/init.c
index 433d1d584fb..0516edc8faf 100644
--- a/vnet/vnet/devices/dpdk/init.c
+++ b/vnet/vnet/devices/dpdk/init.c
@@ -251,10 +251,10 @@ dpdk_lib_init (dpdk_main_t * dm)
dpdk_device_t *xd;
vlib_pci_addr_t last_pci_addr;
u32 last_pci_addr_port = 0;
- vlib_thread_registration_t *tr;
- uword *p;
+ vlib_thread_registration_t *tr, *tr_hqos;
+ uword *p, *p_hqos;
- u32 next_cpu = 0;
+ u32 next_cpu = 0, next_hqos_cpu = 0;
u8 af_packet_port_id = 0;
last_pci_addr.as_u32 = ~0;
@@ -280,6 +280,30 @@ dpdk_lib_init (dpdk_main_t * dm)
vec_validate_aligned (dm->workers, tm->n_vlib_mains - 1,
CLIB_CACHE_LINE_BYTES);
+ dm->hqos_cpu_first_index = 0;
+ dm->hqos_cpu_count = 0;
+
+ /* find out which cpus will be used for I/O TX */
+ p_hqos = hash_get_mem (tm->thread_registrations_by_name, "hqos-threads");
+ tr_hqos = p_hqos ? (vlib_thread_registration_t *) p_hqos[0] : 0;
+
+ if (tr_hqos && tr_hqos->count > 0)
+ {
+ dm->hqos_cpu_first_index = tr_hqos->first_index;
+ dm->hqos_cpu_count = tr_hqos->count;
+ }
+
+ vec_validate_aligned (dm->devices_by_hqos_cpu, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ vec_validate_aligned (dm->hqos_threads, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+#ifdef NETMAP
+ if (rte_netmap_probe () < 0)
+ return clib_error_return (0, "rte netmap probe failed");
+#endif
+
nports = rte_eth_dev_count ();
if (nports < 1)
{
@@ -662,6 +686,42 @@ dpdk_lib_init (dpdk_main_t * dm)
next_cpu = 0;
}
+
+ if (devconf->hqos_enabled)
+ {
+ xd->flags |= DPDK_DEVICE_FLAG_HQOS;
+
+ if (devconf->hqos.hqos_thread_valid)
+ {
+ int cpu = dm->hqos_cpu_first_index + devconf->hqos.hqos_thread;
+
+ if (devconf->hqos.hqos_thread >= dm->hqos_cpu_count)
+ return clib_error_return (0, "invalid HQoS thread index");
+
+ vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = 0;
+ }
+ else
+ {
+ int cpu = dm->hqos_cpu_first_index + next_hqos_cpu;
+
+ if (dm->hqos_cpu_count == 0)
+ return clib_error_return (0, "no HQoS threads available");
+
+ vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = 0;
+
+ next_hqos_cpu++;
+ if (next_hqos_cpu == dm->hqos_cpu_count)
+ next_hqos_cpu = 0;
+
+ devconf->hqos.hqos_thread_valid = 1;
+ devconf->hqos.hqos_thread = cpu;
+ }
+ }
+
vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains,
CLIB_CACHE_LINE_BYTES);
for (j = 0; j < tm->n_vlib_mains; j++)
@@ -685,6 +745,13 @@ dpdk_lib_init (dpdk_main_t * dm)
if (rv)
return rv;
+      if (devconf->hqos_enabled)
+        {
+          /* dpdk_port_setup_hqos returns a clib_error_t *: 0 on success,
+           * non-NULL on failure. A "< 0" test never propagates the error. */
+          rv = dpdk_port_setup_hqos (xd, &devconf->hqos);
+          if (rv)
+            return rv;
+        }
+
/* count the number of descriptors used for this device */
nb_desc += xd->nb_rx_desc + xd->nb_tx_desc * xd->tx_q_used;
@@ -927,6 +994,8 @@ dpdk_device_config (dpdk_config_main_t * conf, vlib_pci_addr_t pci_addr,
}
devconf->pci_addr.as_u32 = pci_addr.as_u32;
+ devconf->hqos_enabled = 0;
+ dpdk_device_config_hqos_default (&devconf->hqos);
if (!input)
return 0;
@@ -957,6 +1026,19 @@ dpdk_device_config (dpdk_config_main_t * conf, vlib_pci_addr_t pci_addr,
else if (unformat (input, "vlan-strip-offload on"))
devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_ON;
else
+ if (unformat
+ (input, "hqos %U", unformat_vlib_cli_sub_input, &sub_input))
+ {
+ devconf->hqos_enabled = 1;
+ error = unformat_hqos (&sub_input, &devconf->hqos);
+ if (error)
+ break;
+ }
+ else if (unformat (input, "hqos"))
+ {
+ devconf->hqos_enabled = 1;
+ }
+ else
{
error = clib_error_return (0, "unknown input `%U'",
format_unformat_error, input);
diff --git a/vnet/vnet/devices/dpdk/qos_doc.md b/vnet/vnet/devices/dpdk/qos_doc.md
new file mode 100644
index 00000000000..1b58a77f197
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/qos_doc.md
@@ -0,0 +1,287 @@
+# Quality-of-Service (QoS) - Hierarchical Scheduler
+
+The Quality-of-Service (QoS) scheduler performs egress-traffic management by prioritizing the transmission of packets of different service types and subscribers based on the Service Level Agreements (SLAs). The QoS scheduler can be enabled on one or more NIC output interfaces depending upon the requirement.
+
+## Overview
+
+The QoS scheduler supports a number of scheduling and shaping levels which form a hierarchical tree. The first level in the hierarchy is the port (i.e. the physical interface), which constitutes the root node of the tree. The subsequent level is the subport, which represents a group of users/subscribers. The individual user/subscriber is represented by the pipe at the next level. Each user can have different traffic types based on the criteria of specific loss rate, jitter, and latency. These traffic types are represented at the traffic-class level in the form of different traffic classes. The last level contains a number of queues which are grouped together to host the packets of a specific traffic class.
+
+The QoS scheduler implementation requires flow classification, enqueue and dequeue operations. Flow classification is a mandatory stage for HQoS, where incoming packets are classified by mapping the packet field information to a 5-tuple (HQoS subport, pipe, traffic class, queue within traffic class, and color) and storing that information in the mbuf sched field. The enqueue operation uses this information to determine the queue for storing the packet, and at this stage, if the specific queue is full, QoS drops the packet. The dequeue operation consists of scheduling the packet based on its length and available credits, and handing over the scheduled packet to the output interface.
+
+For more information on the QoS Scheduler, please refer to the DPDK Programmer's Guide: http://dpdk.org/doc/guides/prog_guide/qos_framework.html
+
+### QoS Scheduler Parameters
+
+Following illustrates the default HQoS configuration for each 10GbE output port:
+
+Single subport (subport 0):
+ - Subport rate set to 100% of port rate
+ - Each of the 4 traffic classes has rate set to 100% of port rate
+4K pipes per subport 0 (pipes 0 .. 4095) with identical configuration:
+ - Pipe rate set to 1/4K of port rate
+ - Each of the 4 traffic classes has rate set to 100% of pipe rate
+ - Within each traffic class, the byte-level WRR weights for the 4 queues are set to 1:1:1:1
+
+#### Port configuration
+port {
+ rate 1250000000 /* Assuming 10GbE port */
+ frame_overhead 24 /* Overhead fields per Ethernet frame */
+ /* 7B (Preamble) + 1B (Start of Frame Delimiter (SFD)) + 4B (Frame Check Sequence (FCS)) + 12B (Inter Frame Gap (IFG)) */
+ mtu 1522 /* Assuming Ethernet/IPv4 pkt (Ethernet FCS not included) */
+ n_subports_per_port 1 /* Number of subports per output interface */
+ n_pipes_per_subport 4096 /* Number of pipes (users/subscribers) */
+ queue_sizes 64 64 64 64 /* Packet queue size for each traffic class. All queues within the same pipe traffic class have the same size.
+ Queues from different pipes serving the same traffic class have the same size.*/
+}
+
+#### Subport configuration
+subport 0 {
+ tb_rate 1250000000 /* Subport level token bucket rate (bytes per second) */
+ tb_size 1000000 /* Subport level token bucket size (bytes) */
+ tc0_rate 1250000000 /* Subport level token bucket rate for traffic class 0 (bytes per second) */
+ tc1_rate 1250000000 /* Subport level token bucket rate for traffic class 1 (bytes per second) */
+ tc2_rate 1250000000 /* Subport level token bucket rate for traffic class 2 (bytes per second) */
+ tc3_rate 1250000000 /* Subport level token bucket rate for traffic class 3 (bytes per second) */
+ tc_period 10 /* Time interval for refilling the token bucket associated with traffic class (Milliseconds) */
+ pipe 0 4095 profile 0 /* pipes (users/subscribers) configured with pipe profile 0 */
+}
+
+#### Pipe configuration
+pipe_profile 0 {
+ tb_rate 305175 /* Pipe level token bucket rate (bytes per second) */
+ tb_size 1000000 /* Pipe level token bucket size (bytes) */
+ tc0_rate 305175 /* Pipe level token bucket rate for traffic class 0 (bytes per second) */
+ tc1_rate 305175 /* Pipe level token bucket rate for traffic class 1 (bytes per second) */
+ tc2_rate 305175 /* Pipe level token bucket rate for traffic class 2 (bytes per second) */
+ tc3_rate 305175 /* Pipe level token bucket rate for traffic class 3 (bytes per second) */
+ tc_period 40 /* Time interval for refilling the token bucket associated with traffic class at pipe level (Milliseconds) */
+ tc3_oversubscription_weight 1 /* Weight traffic class 3 oversubscription */
+ tc0_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 0 */
+ tc1_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 1 */
+ tc2_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 2 */
+ tc3_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 3 */
+}
+
+#### Random Early Detection (RED) parameters per traffic class and color (Green / Yellow / Red)
+red {
+ tc0_wred_min 48 40 32 /* Minimum threshold for traffic class 0 queue (min_th) in number of packets */
+ tc0_wred_max 64 64 64 /* Maximum threshold for traffic class 0 queue (max_th) in number of packets */
+ tc0_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 0 queue (maxp = 1 / maxp_inv) */
+ tc0_wred_weight 9 9 9 /* Traffic Class 0 queue weight */
+ tc1_wred_min 48 40 32 /* Minimum threshold for traffic class 1 queue (min_th) in number of packets */
+ tc1_wred_max 64 64 64 /* Maximum threshold for traffic class 1 queue (max_th) in number of packets */
+ tc1_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 1 queue (maxp = 1 / maxp_inv) */
+ tc1_wred_weight 9 9 9 /* Traffic Class 1 queue weight */
+ tc2_wred_min 48 40 32 /* Minimum threshold for traffic class 2 queue (min_th) in number of packets */
+ tc2_wred_max 64 64 64 /* Maximum threshold for traffic class 2 queue (max_th) in number of packets */
+ tc2_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 2 queue (maxp = 1 / maxp_inv) */
+ tc2_wred_weight 9 9 9 /* Traffic Class 2 queue weight */
+ tc3_wred_min 48 40 32 /* Minimum threshold for traffic class 3 queue (min_th) in number of packets */
+ tc3_wred_max 64 64 64 /* Maximum threshold for traffic class 3 queue (max_th) in number of packets */
+ tc3_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 3 queue (maxp = 1 / maxp_inv) */
+ tc3_wred_weight 9 9 9 /* Traffic Class 3 queue weight */
+}
+
+### DPDK QoS Scheduler Integration in VPP
+
+The Hierarchical Quality-of-Service (HQoS) scheduler object could be seen as part of the logical NIC output interface. To enable HQoS on a specific output interface, the vpp startup.conf file has to be configured accordingly. The output interface that requires HQoS should have the "hqos" parameter specified in the dpdk section. Another optional parameter, "hqos-thread", has been defined which can be used to associate the output interface with a specific hqos thread. In the cpu section of the config file, "corelist-hqos-threads" is introduced to assign logical cpu cores to run the HQoS threads. An HQoS thread can run multiple HQoS objects, each associated with a different output interface. All worker threads, instead of writing packets to the NIC TX queue directly, write the packets to software queues. The hqos_threads read the software queues and enqueue the packets to HQoS objects, as well as dequeue packets from HQoS objects and write them to NIC output interfaces. The worker threads need to be able to send packets to any output interface; therefore, each HQoS object associated with a NIC output interface should have a number of software queues equal to the worker thread count.
+
+The following illustrates a sample startup configuration file with 4x worker threads feeding 2x hqos threads, each of which handles the QoS scheduler for 1x output interface.
+
+dpdk {
+ socket-mem 16384,16384
+
+ dev 0000:02:00.0 {
+ num-rx-queues 2
+ hqos
+ }
+ dev 0000:06:00.0 {
+ num-rx-queues 2
+ hqos
+ }
+
+ num-mbufs 1000000
+}
+
+cpu {
+ main-core 0
+ corelist-workers 1, 2, 3, 4
+ corelist-hqos-threads 5, 6
+}
+
+### QoS scheduler CLI Commands
+ Each QoS scheduler instance is initialised with default parameters required to configure hqos port, subport, pipe and queues. Some of the parameters can be re-configured in run-time through CLI commands.
+
+#### Configuration
+ Following commands can be used to configure QoS scheduler parameters-
+
+ * The command below can be used to set the subport level parameters such as token bucket rate (bytes per second), token bucket size (bytes),
+ traffic class rates (bytes per second) and token update period (milliseconds).
+ set dpdk interface hqos subport <if-name> subport <n> [rate <n>] [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] [period <n>]
+
+ * For setting the pipe profile, following command can be used.
+ set dpdk interface hqos pipe <if-name> subport <n> pipe <n> profile <n>
+
+ * To assign QoS scheduler instance to the specific thread, following command can be used.
+ set dpdk interface hqos placement <if-name> thread <n>
+
+ * The command below is used to set the packet fields required for classifying the incoming packet. As a result of the classification process,
+ packet field information will be mapped to a 5-tuple (subport, pipe, traffic class, queue within traffic class, color) and stored in the packet mbuf.
+ set dpdk interface hqos pktfield <if-name> id <n> offset <n> mask <n>
+
+ * The DSCP table entries used for identifying the traffic class and queue can be set using the command below:
+ set dpdk interface hqos tctbl <if-name> entry <n> tc <n> queue <n>
+
+#### Show Command
+
+ * The QoS Scheduler configuration can be displayed using the command below.
+
+ vpp# show dpdk interface hqos TenGigabitEthernet2/0/0
+ Thread:
+ Input SWQ size = 4096 packets
+ Enqueue burst size = 256 packets
+ Dequeue burst size = 220 packets
+ Packet field 0: slab position = 0, slab bitmask = 0x0000000000000000
+ Packet field 1: slab position = 40, slab bitmask = 0x0000000fff000000
+ Packet field 2: slab position = 8, slab bitmask = 0x00000000000000fc
+ Packet field 2 translation table:
+ [ 0 .. 15]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ [16 .. 31]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ [32 .. 47]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ [48 .. 63]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ Port:
+ Rate = 1250000000 bytes/second
+ MTU = 1514 bytes
+ Frame overhead = 24 bytes
+ Number of subports = 1
+ Number of pipes per subport = 4096
+ Packet queue size: TC0 = 64, TC1 = 64, TC2 = 64, TC3 = 64 packets
+ Number of pipe profiles = 1
+ Pipe profile 0:
+ Rate = 305175 bytes/second
+ Token bucket size = 1000000 bytes
+ Traffic class rate: TC0 = 305175, TC1 = 305175, TC2 = 305175, TC3 = 305175 bytes/second
+ TC period = 40 milliseconds
+ TC0 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+ TC1 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+ TC2 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+ TC3 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+
+
+ * The QoS Scheduler placement over the logical cpu cores can be displayed using the command below.
+
+ vpp# show dpdk interface hqos placement
+ Thread 5 (vpp_hqos-threads_0 at lcore 5):
+ TenGigabitEthernet2/0/0 queue 0
+ Thread 6 (vpp_hqos-threads_1 at lcore 6):
+ TenGigabitEthernet4/0/1 queue 0
+
+
+### QoS Scheduler Binary APIs
+
+ This section explains the available binary APIs for configuring QoS scheduler parameters at run-time.
+
+** The following API can be used to set the pipe profile of a pipe that belongs to a given subport:
+
+ sw_interface_set_dpdk_hqos_pipe rx <intfc> | sw_if_index <id> subport <subport-id> pipe <pipe-id>
+ profile <profile-id>
+
+ The data structures used to set the pipe profile parameter are as follows:
+
+ /** \brief DPDK interface HQoS pipe profile set request
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param sw_if_index - the interface
+ @param subport - subport ID
+ @param pipe - pipe ID within its subport
+ @param profile - pipe profile ID
+ */
+ define sw_interface_set_dpdk_hqos_pipe {
+ u32 client_index;
+ u32 context;
+ u32 sw_if_index;
+ u32 subport;
+ u32 pipe;
+ u32 profile;
+ };
+
+ /** \brief DPDK interface HQoS pipe profile set reply
+ @param context - sender context, to match reply w/ request
+ @param retval - request return code
+ */
+ define sw_interface_set_dpdk_hqos_pipe_reply {
+ u32 context;
+ i32 retval;
+ };
+
+
+** The following API can be used to set the subport level parameters, for example token bucket rate (bytes per second), token bucket size (bytes),
+ traffic class rate (bytes per second) and token update period.
+
+ sw_interface_set_dpdk_hqos_subport rx <intfc> | sw_if_index <id> subport <subport-id> [rate <n>]
+ [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] [period <n>]
+
+ The data structures used to set the subport level parameters are as follows:
+
+ /** \brief DPDK interface HQoS subport parameters set request
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param sw_if_index - the interface
+ @param subport - subport ID
+ @param tb_rate - subport token bucket rate (measured in bytes/second)
+ @param tb_size - subport token bucket size (measured in credits)
+ @param tc_rate - subport traffic class 0 .. 3 rates (measured in bytes/second)
+ @param tc_period - enforcement period for rates (measured in milliseconds)
+ */
+ define sw_interface_set_dpdk_hqos_subport {
+ u32 client_index;
+ u32 context;
+ u32 sw_if_index;
+ u32 subport;
+ u32 tb_rate;
+ u32 tb_size;
+ u32 tc_rate[4];
+ u32 tc_period;
+ };
+
+ /** \brief DPDK interface HQoS subport parameters set reply
+ @param context - sender context, to match reply w/ request
+ @param retval - request return code
+ */
+ define sw_interface_set_dpdk_hqos_subport_reply {
+ u32 context;
+ i32 retval;
+ };
+
+
+** The following API can be used to set a DSCP table entry. The DSCP table has 64 entries to map the packet DSCP field onto traffic class and hqos input queue.
+
+ sw_interface_set_dpdk_hqos_tctbl rx <intfc> | sw_if_index <id> entry <n> tc <n> queue <n>
+
+ The data structures used for setting DSCP table entries are given below.
+
+ /** \brief DPDK interface HQoS tctbl entry set request
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param sw_if_index - the interface
+ @param entry - entry index ID
+ @param tc - traffic class (0 .. 3)
+ @param queue - traffic class queue (0 .. 3)
+ */
+ define sw_interface_set_dpdk_hqos_tctbl {
+ u32 client_index;
+ u32 context;
+ u32 sw_if_index;
+ u32 entry;
+ u32 tc;
+ u32 queue;
+ };
+
+ /** \brief DPDK interface HQoS tctbl entry set reply
+ @param context - sender context, to match reply w/ request
+ @param retval - request return code
+ */
+ define sw_interface_set_dpdk_hqos_tctbl_reply {
+ u32 context;
+ i32 retval;
+ };