From c3a814be9dc769be942ff8029c7b6eccd4b3af05 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 28 Feb 2017 19:22:22 +0100 Subject: dpdk: be a plugin Change-Id: I238258cdeb77035adc5e88903d824593d0a1da90 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/cli.c | 2079 +++++++++++++++++++++++++++++++++++ src/plugins/dpdk/device/device.c | 852 ++++++++++++++ src/plugins/dpdk/device/dpdk.h | 490 +++++++++ src/plugins/dpdk/device/dpdk_priv.h | 135 +++ src/plugins/dpdk/device/format.c | 754 +++++++++++++ src/plugins/dpdk/device/node.c | 674 ++++++++++++ 6 files changed, 4984 insertions(+) create mode 100644 src/plugins/dpdk/device/cli.c create mode 100644 src/plugins/dpdk/device/device.c create mode 100644 src/plugins/dpdk/device/dpdk.h create mode 100644 src/plugins/dpdk/device/dpdk_priv.h create mode 100644 src/plugins/dpdk/device/format.c create mode 100644 src/plugins/dpdk/device/node.c (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c new file mode 100644 index 00000000..d2def2fc --- /dev/null +++ b/src/plugins/dpdk/device/cli.c @@ -0,0 +1,2079 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +/** + * @file + * @brief CLI for DPDK Abstraction Layer and pcap Tx Trace. 
+ * + * This file contains the source code for CLI for DPDK + * Abstraction Layer and pcap Tx Trace. + */ + + +static clib_error_t * +get_hqos (u32 hw_if_index, u32 subport_id, dpdk_device_t ** xd, + dpdk_device_config_t ** devconf) +{ + dpdk_main_t *dm = &dpdk_main; + vnet_hw_interface_t *hw; + struct rte_eth_dev_info dev_info; + uword *p = 0; + clib_error_t *error = NULL; + + + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } + + if (subport_id != 0) + { + error = clib_error_return (0, "Invalid subport"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + *xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rte_eth_dev_info_get ((*xd)->device_index, &dev_info); + if (dev_info.pci_dev) + { /* bonded interface has no pci info */ + vlib_pci_addr_t pci_addr; + + pci_addr.domain = dev_info.pci_dev->addr.domain; + pci_addr.bus = dev_info.pci_dev->addr.bus; + pci_addr.slot = dev_info.pci_dev->addr.devid; + pci_addr.function = dev_info.pci_dev->addr.function; + + p = + hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32); + } + + if (p) + (*devconf) = pool_elt_at_index (dm->conf->dev_confs, p[0]); + else + (*devconf) = &dm->conf->default_devconf; + +done: + return error; +} + +static clib_error_t * +pcap_trace_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ +#define PCAP_DEF_PKT_TO_CAPTURE (100) + + unformat_input_t _line_input, *line_input = &_line_input; + dpdk_main_t *dm = &dpdk_main; + u8 *filename; + u8 *chroot_filename = 0; + u32 max = 0; + int enabled = 0; + int errorFlag = 0; + clib_error_t *error = 0; + + /* Get a line of input. 
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "on")) + { + if (dm->tx_pcap_enable == 0) + { + enabled = 1; + } + else + { + vlib_cli_output (vm, "pcap tx capture already on..."); + errorFlag = 1; + break; + } + } + else if (unformat (line_input, "off")) + { + if (dm->tx_pcap_enable) + { + vlib_cli_output (vm, "captured %d pkts...", + dm->pcap_main.n_packets_captured + 1); + if (dm->pcap_main.n_packets_captured) + { + dm->pcap_main.n_packets_to_capture = + dm->pcap_main.n_packets_captured; + error = pcap_write (&dm->pcap_main); + if (error) + clib_error_report (error); + else + vlib_cli_output (vm, "saved to %s...", dm->pcap_filename); + } + + dm->tx_pcap_enable = 0; + } + else + { + vlib_cli_output (vm, "pcap tx capture already off..."); + errorFlag = 1; + break; + } + } + else if (unformat (line_input, "max %d", &max)) + { + if (dm->tx_pcap_enable) + { + vlib_cli_output (vm, + "can't change max value while pcap tx capture active..."); + errorFlag = 1; + break; + } + } + else if (unformat (line_input, "intfc %U", + unformat_vnet_sw_interface, dm->vnet_main, + &dm->pcap_sw_if_index)) + ; + + else if (unformat (line_input, "intfc any")) + { + dm->pcap_sw_if_index = 0; + } + else if (unformat (line_input, "file %s", &filename)) + { + if (dm->tx_pcap_enable) + { + vlib_cli_output (vm, + "can't change file while pcap tx capture active..."); + errorFlag = 1; + break; + } + + /* Brain-police user path input */ + if (strstr ((char *) filename, "..") + || index ((char *) filename, '/')) + { + vlib_cli_output (vm, "illegal characters in filename '%s'", + filename); + vlib_cli_output (vm, + "Hint: Only filename, do not enter directory structure."); + vec_free (filename); + errorFlag = 1; + break; + } + + chroot_filename = format (0, "/tmp/%s%c", filename, 0); + vec_free (filename); + } + else if (unformat (line_input, "status")) + { + if 
(dm->pcap_sw_if_index == 0) + { + vlib_cli_output (vm, "max is %d for any interface to file %s", + dm-> + pcap_pkts_to_capture ? dm->pcap_pkts_to_capture + : PCAP_DEF_PKT_TO_CAPTURE, + dm-> + pcap_filename ? dm->pcap_filename : (u8 *) + "/tmp/vpe.pcap"); + } + else + { + vlib_cli_output (vm, "max is %d for interface %U to file %s", + dm-> + pcap_pkts_to_capture ? dm->pcap_pkts_to_capture + : PCAP_DEF_PKT_TO_CAPTURE, + format_vnet_sw_if_index_name, dm->vnet_main, + dm->pcap_sw_if_index, + dm-> + pcap_filename ? dm->pcap_filename : (u8 *) + "/tmp/vpe.pcap"); + } + + if (dm->tx_pcap_enable == 0) + { + vlib_cli_output (vm, "pcap tx capture is off..."); + } + else + { + vlib_cli_output (vm, "pcap tx capture is on: %d of %d pkts...", + dm->pcap_main.n_packets_captured, + dm->pcap_main.n_packets_to_capture); + } + break; + } + + else + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + errorFlag = 1; + break; + } + } + unformat_free (line_input); + + + if (errorFlag == 0) + { + /* Since no error, save configured values. */ + if (chroot_filename) + { + if (dm->pcap_filename) + vec_free (dm->pcap_filename); + vec_add1 (chroot_filename, 0); + dm->pcap_filename = chroot_filename; + } + + if (max) + dm->pcap_pkts_to_capture = max; + + + if (enabled) + { + if (dm->pcap_filename == 0) + dm->pcap_filename = format (0, "/tmp/vpe.pcap%c", 0); + + memset (&dm->pcap_main, 0, sizeof (dm->pcap_main)); + dm->pcap_main.file_name = (char *) dm->pcap_filename; + dm->pcap_main.n_packets_to_capture = PCAP_DEF_PKT_TO_CAPTURE; + if (dm->pcap_pkts_to_capture) + dm->pcap_main.n_packets_to_capture = dm->pcap_pkts_to_capture; + + dm->pcap_main.packet_type = PCAP_PACKET_TYPE_ethernet; + dm->tx_pcap_enable = 1; + vlib_cli_output (vm, "pcap tx capture on..."); + } + } + else if (chroot_filename) + vec_free (chroot_filename); + + + return error; +} + +/*? + * This command is used to start or stop a packet capture, or show + * the status of packet capture. 
+ *
+ * This command has the following optional parameters:
+ *
+ * - on|off - Used to start or stop a packet capture.
+ *
+ * - max <nn> - Depth of local buffer. Once 'nn' number
+ *   of packets have been received, buffer is flushed to file. Once another
+ *   'nn' number of packets have been received, buffer is flushed
+ *   to file, overwriting previous write. If not entered, value defaults
+ *   to 100. Can only be updated if packet capture is off.
+ *
+ * - intfc <interface>|any - Used to specify a given interface,
+ *   or use 'any' to run packet capture on all interfaces.
+ *   'any' is the default if not provided. Settings from a previous
+ *   packet capture are preserved, so 'any' can be used to reset
+ *   the interface setting.
+ *
+ * - file <name> - Used to specify the output filename. The file will
+ *   be placed in the '/tmp' directory, so only the filename is
+ *   supported. Directory should not be entered. If file already exists, file
+ *   will be overwritten. If no filename is provided, '/tmp/vpe.pcap'
+ *   will be used. Can only be updated if packet capture is off.
+ *
+ * - status - Displays the current status and configured attributes
+ *   associated with a packet capture. If packet capture is in progress,
+ *   'status' also will return the number of packets currently in
+ *   the local buffer. All additional attributes entered on command line
+ *   with 'status' will be ignored and not applied.
+ *
+ * @cliexpar
+ * Example of how to display the status of a tx packet capture when off:
+ * @cliexstart{pcap tx trace status}
+ * max is 100, for any interface to file /tmp/vpe.pcap
+ * pcap tx capture is off...
+ * @cliexend
+ * Example of how to start a tx packet capture:
+ * @cliexstart{pcap tx trace on max 35 intfc GigabitEthernet0/8/0 file vppTest.pcap}
+ * pcap tx capture on...
+ * @cliexend + * Example of how to display the status of a tx packet capture in progress: + * @cliexstart{pcap tx trace status} + * max is 35, for interface GigabitEthernet0/8/0 to file /tmp/vppTest.pcap + * pcap tx capture is on: 20 of 35 pkts... + * @cliexend + * Example of how to stop a tx packet capture: + * @cliexstart{vppctl pcap tx trace off} + * captured 21 pkts... + * saved to /tmp/vppTest.pcap... + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (pcap_trace_command, static) = { + .path = "pcap tx trace", + .short_help = + "pcap tx trace [on|off] [max ] [intfc |any] [file ] [status]", + .function = pcap_trace_command_fn, +}; +/* *INDENT-ON* */ + + +static clib_error_t * +show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + struct rte_mempool *rmp; + int i; + + for (i = 0; i < vec_len (dpdk_main.pktmbuf_pools); i++) + { + rmp = dpdk_main.pktmbuf_pools[i]; + if (rmp) + { + unsigned count = rte_mempool_avail_count (rmp); + unsigned free_count = rte_mempool_in_use_count (rmp); + + vlib_cli_output (vm, + "name=\"%s\" available = %7d allocated = %7d total = %7d\n", + rmp->name, (u32) count, (u32) free_count, + (u32) (count + free_count)); + } + else + { + vlib_cli_output (vm, "rte_mempool is NULL (!)\n"); + } + } + return 0; +} + +/*? + * This command displays statistics of each DPDK mempool. 
+ * + * @cliexpar + * Example of how to display DPDK buffer data: + * @cliexstart{show dpdk buffer} + * name="mbuf_pool_socket0" available = 15104 allocated = 1280 total = 16384 + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_show_dpdk_bufferr,static) = { + .path = "show dpdk buffer", + .short_help = "show dpdk buffer", + .function = show_dpdk_buffer, + .is_mp_safe = 1, +}; +/* *INDENT-ON* */ + +static clib_error_t * +test_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + static u32 *allocated_buffers; + u32 n_alloc = 0; + u32 n_free = 0; + u32 first, actual_alloc; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "allocate %d", &n_alloc)) + ; + else if (unformat (input, "free %d", &n_free)) + ; + else + break; + } + + if (n_free) + { + if (vec_len (allocated_buffers) < n_free) + return clib_error_return (0, "Can't free %d, only %d allocated", + n_free, vec_len (allocated_buffers)); + + first = vec_len (allocated_buffers) - n_free; + vlib_buffer_free (vm, allocated_buffers + first, n_free); + _vec_len (allocated_buffers) = first; + } + if (n_alloc) + { + first = vec_len (allocated_buffers); + vec_validate (allocated_buffers, + vec_len (allocated_buffers) + n_alloc - 1); + + actual_alloc = vlib_buffer_alloc (vm, allocated_buffers + first, + n_alloc); + _vec_len (allocated_buffers) = first + actual_alloc; + + if (actual_alloc < n_alloc) + vlib_cli_output (vm, "WARNING: only allocated %d buffers", + actual_alloc); + } + + vlib_cli_output (vm, "Currently %d buffers allocated", + vec_len (allocated_buffers)); + + if (allocated_buffers && vec_len (allocated_buffers) == 0) + vec_free (allocated_buffers); + + return 0; +} + +/*? + * This command tests the allocation and freeing of DPDK buffers. + * If both 'allocate' and 'free' are entered on the + * same command, the 'free' is executed first. 
If no + * parameters are provided, this command display how many DPDK buffers + * the test command has allocated. + * + * @cliexpar + * @parblock + * + * Example of how to display how many DPDK buffer test command has allcoated: + * @cliexstart{test dpdk buffer} + * Currently 0 buffers allocated + * @cliexend + * + * Example of how to allocate DPDK buffers using the test command: + * @cliexstart{test dpdk buffer allocate 10} + * Currently 10 buffers allocated + * @cliexend + * + * Example of how to free DPDK buffers allocated by the test command: + * @cliexstart{test dpdk buffer free 10} + * Currently 0 buffers allocated + * @cliexend + * @endparblock +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_test_dpdk_buffer,static) = { + .path = "test dpdk buffer", + .short_help = "test dpdk buffer [allocate ] [free ]", + .function = test_dpdk_buffer, + .is_mp_safe = 1, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_dpdk_if_desc (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + dpdk_main_t *dm = &dpdk_main; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + u32 hw_if_index = (u32) ~ 0; + u32 nb_rx_desc = (u32) ~ 0; + u32 nb_tx_desc = (u32) ~ 0; + clib_error_t *error = NULL; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "tx %d", &nb_tx_desc)) + ; + else if (unformat (line_input, "rx %d", &nb_rx_desc)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, 
hw->dev_instance); + + if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0) + { + error = + clib_error_return (0, + "number of descriptors can be set only for " + "physical devices"); + goto done; + } + + if ((nb_rx_desc == (u32) ~ 0 || nb_rx_desc == xd->nb_rx_desc) && + (nb_tx_desc == (u32) ~ 0 || nb_tx_desc == xd->nb_tx_desc)) + { + error = clib_error_return (0, "nothing changed"); + goto done; + } + + if (nb_rx_desc != (u32) ~ 0) + xd->nb_rx_desc = nb_rx_desc; + + if (nb_tx_desc != (u32) ~ 0) + xd->nb_tx_desc = nb_tx_desc; + + error = dpdk_port_setup (dm, xd); + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command sets the number of DPDK 'rx' and + * 'tx' descriptors for the given physical interface. Use + * the command 'show hardware-interface' to display the + * current descriptor allocation. + * + * @cliexpar + * Example of how to set the DPDK interface descriptors: + * @cliexcmd{set dpdk interface descriptors GigabitEthernet0/8/0 rx 512 tx 512} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_dpdk_if_desc,static) = { + .path = "set dpdk interface descriptors", + .short_help = "set dpdk interface descriptors [rx ] [tx ]", + .function = set_dpdk_if_desc, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + dpdk_main_t *dm = &dpdk_main; + dpdk_device_and_queue_t *dq; + int cpu; + + if (tm->n_vlib_mains == 1) + vlib_cli_output (vm, "All interfaces are handled by main thread"); + + for (cpu = 0; cpu < vec_len (dm->devices_by_cpu); cpu++) + { + if (cpu >= dm->input_cpu_first_index && + cpu < (dm->input_cpu_first_index + dm->input_cpu_count)) + vlib_cli_output (vm, "Thread %u (%s at lcore %u):", cpu, + vlib_worker_threads[cpu].name, + vlib_worker_threads[cpu].lcore_id); + + /* *INDENT-OFF* */ + vec_foreach(dq, dm->devices_by_cpu[cpu]) + { + u32 hw_if_index = 
dm->devices[dq->device].vlib_hw_if_index; + vnet_hw_interface_t * hi = vnet_get_hw_interface(dm->vnet_main, hw_if_index); + vlib_cli_output(vm, " %v queue %u", hi->name, dq->queue_id); + } + /* *INDENT-ON* */ + } + return 0; +} + +/*? + * This command is used to display the thread and core each + * DPDK interface and queue is assigned too. + * + * @cliexpar + * Example of how to display the DPDK interface placement: + * @cliexstart{show dpdk interface placement} + * Thread 1 (vpp_wk_0 at lcore 1): + * GigabitEthernet0/8/0 queue 0 + * GigabitEthernet0/9/0 queue 0 + * Thread 2 (vpp_wk_1 at lcore 2): + * GigabitEthernet0/8/0 queue 1 + * GigabitEthernet0/9/0 queue 1 + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_show_dpdk_if_placement,static) = { + .path = "show dpdk interface placement", + .short_help = "show dpdk interface placement", + .function = show_dpdk_if_placement, +}; +/* *INDENT-ON* */ + +static int +dpdk_device_queue_sort (void *a1, void *a2) +{ + dpdk_device_and_queue_t *dq1 = a1; + dpdk_device_and_queue_t *dq2 = a2; + + if (dq1->device > dq2->device) + return 1; + else if (dq1->device < dq2->device) + return -1; + else if (dq1->queue_id > dq2->queue_id) + return 1; + else if (dq1->queue_id < dq2->queue_id) + return -1; + else + return 0; +} + +static clib_error_t * +set_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + dpdk_main_t *dm = &dpdk_main; + dpdk_device_and_queue_t *dq; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + u32 hw_if_index = (u32) ~ 0; + u32 queue = (u32) 0; + u32 cpu = (u32) ~ 0; + int i; + clib_error_t *error = NULL; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "queue %d", 
&queue)) + ; + else if (unformat (line_input, "thread %d", &cpu)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } + + if (cpu < dm->input_cpu_first_index || + cpu >= (dm->input_cpu_first_index + dm->input_cpu_count)) + { + error = clib_error_return (0, "please specify valid thread id"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + for (i = 0; i < vec_len (dm->devices_by_cpu); i++) + { + /* *INDENT-OFF* */ + vec_foreach(dq, dm->devices_by_cpu[i]) + { + if (hw_if_index == dm->devices[dq->device].vlib_hw_if_index && + queue == dq->queue_id) + { + if (cpu == i) /* nothing to do */ + goto done; + + vec_del1(dm->devices_by_cpu[i], dq - dm->devices_by_cpu[i]); + vec_add2(dm->devices_by_cpu[cpu], dq, 1); + dq->queue_id = queue; + dq->device = xd->device_index; + xd->cpu_socket_id_by_queue[queue] = + rte_lcore_to_socket_id(vlib_worker_threads[cpu].lcore_id); + + vec_sort_with_function(dm->devices_by_cpu[i], + dpdk_device_queue_sort); + + vec_sort_with_function(dm->devices_by_cpu[cpu], + dpdk_device_queue_sort); + + if (vec_len(dm->devices_by_cpu[i]) == 0) + vlib_node_set_state (vlib_mains[i], dpdk_input_node.index, + VLIB_NODE_STATE_DISABLED); + + if (vec_len(dm->devices_by_cpu[cpu]) == 1) + vlib_node_set_state (vlib_mains[cpu], dpdk_input_node.index, + VLIB_NODE_STATE_POLLING); + + goto done; + } + } + /* *INDENT-ON* */ + } + + error = clib_error_return (0, "not found"); + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command is used to assign a given interface, and optionally a + * given queue, to a different thread. This will not create a thread, + * so the thread must already exist. Use '/etc/vpp/startup.conf' + * for the initial thread creation. 
If the 'queue' is not provided, + * it defaults to 0. + * + * @cliexpar + * Example of how to display the DPDK interface placement: + * @cliexstart{show dpdk interface placement} + * Thread 1 (vpp_wk_0 at lcore 1): + * GigabitEthernet0/8/0 queue 0 + * GigabitEthernet0/9/0 queue 0 + * Thread 2 (vpp_wk_1 at lcore 2): + * GigabitEthernet0/8/0 queue 1 + * GigabitEthernet0/9/0 queue 1 + * @cliexend + * Example of how to assign a DPDK interface and queue to a thread: + * @cliexcmd{set dpdk interface placement GigabitEthernet0/8/0 queue 1 thread 1} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_dpdk_if_placement,static) = { + .path = "set dpdk interface placement", + .short_help = "set dpdk interface placement [queue ] thread ", + .function = set_dpdk_if_placement, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + dpdk_main_t *dm = &dpdk_main; + dpdk_device_and_queue_t *dq; + int cpu; + + if (tm->n_vlib_mains == 1) + vlib_cli_output (vm, "All interfaces are handled by main thread"); + + for (cpu = 0; cpu < vec_len (dm->devices_by_hqos_cpu); cpu++) + { + if (cpu >= dm->hqos_cpu_first_index && + cpu < (dm->hqos_cpu_first_index + dm->hqos_cpu_count)) + vlib_cli_output (vm, "Thread %u (%s at lcore %u):", cpu, + vlib_worker_threads[cpu].name, + vlib_worker_threads[cpu].lcore_id); + + vec_foreach (dq, dm->devices_by_hqos_cpu[cpu]) + { + u32 hw_if_index = dm->devices[dq->device].vlib_hw_if_index; + vnet_hw_interface_t *hi = + vnet_get_hw_interface (dm->vnet_main, hw_if_index); + vlib_cli_output (vm, " %v queue %u", hi->name, dq->queue_id); + } + } + return 0; +} + +/*? + * This command is used to display the thread and core each + * DPDK output interface and HQoS queue is assigned too. 
+ * + * @cliexpar + * Example of how to display the DPDK output interface and HQoS queue placement: + * @cliexstart{show dpdk interface hqos placement} + * Thread 1 (vpp_hqos-threads_0 at lcore 3): + * GigabitEthernet0/8/0 queue 0 + * Thread 2 (vpp_hqos-threads_1 at lcore 4): + * GigabitEthernet0/9/0 queue 0 + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_show_dpdk_if_hqos_placement, static) = { + .path = "show dpdk interface hqos placement", + .short_help = "show dpdk interface hqos placement", + .function = show_dpdk_if_hqos_placement, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + dpdk_main_t *dm = &dpdk_main; + dpdk_device_and_queue_t *dq; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + u32 hw_if_index = (u32) ~ 0; + u32 cpu = (u32) ~ 0; + int i; + clib_error_t *error = NULL; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "thread %d", &cpu)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (hw_if_index == (u32) ~ 0) + return clib_error_return (0, "please specify valid interface name"); + + if (cpu < dm->hqos_cpu_first_index || + cpu >= (dm->hqos_cpu_first_index + dm->hqos_cpu_count)) + { + error = clib_error_return (0, "please specify valid thread id"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + for (i = 0; i < vec_len (dm->devices_by_hqos_cpu); i++) + { + vec_foreach (dq, dm->devices_by_hqos_cpu[i]) + { + if (hw_if_index == dm->devices[dq->device].vlib_hw_if_index) + { 
+ if (cpu == i) /* nothing to do */ + goto done; + + vec_del1 (dm->devices_by_hqos_cpu[i], + dq - dm->devices_by_hqos_cpu[i]); + vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1); + dq->queue_id = 0; + dq->device = xd->device_index; + + vec_sort_with_function (dm->devices_by_hqos_cpu[i], + dpdk_device_queue_sort); + + vec_sort_with_function (dm->devices_by_hqos_cpu[cpu], + dpdk_device_queue_sort); + + goto done; + } + } + } + + error = clib_error_return (0, "not found"); + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command is used to assign a given DPDK output interface and + * HQoS queue to a different thread. This will not create a thread, + * so the thread must already exist. Use '/etc/vpp/startup.conf' + * for the initial thread creation. See @ref qos_doc for more details. + * + * @cliexpar + * Example of how to display the DPDK output interface and HQoS queue placement: + * @cliexstart{show dpdk interface hqos placement} + * Thread 1 (vpp_hqos-threads_0 at lcore 3): + * GigabitEthernet0/8/0 queue 0 + * Thread 2 (vpp_hqos-threads_1 at lcore 4): + * GigabitEthernet0/9/0 queue 0 + * @cliexend + * Example of how to assign a DPDK output interface and HQoS queue to a thread: + * @cliexcmd{set dpdk interface hqos placement GigabitEthernet0/8/0 thread 2} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_placement, static) = { + .path = "set dpdk interface hqos placement", + .short_help = "set dpdk interface hqos placement thread ", + .function = set_dpdk_if_hqos_placement, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_dpdk_if_hqos_pipe (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + dpdk_main_t *dm = &dpdk_main; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + u32 hw_if_index = (u32) ~ 0; + u32 subport_id = (u32) ~ 0; + u32 pipe_id = (u32) ~ 0; + u32 profile_id = (u32) ~ 0; + int rv; + clib_error_t *error = NULL; + + if 
(!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "subport %d", &subport_id)) + ; + else if (unformat (line_input, "pipe %d", &pipe_id)) + ; + else if (unformat (line_input, "profile %d", &profile_id)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rv = + rte_sched_pipe_config (xd->hqos_ht->hqos, subport_id, pipe_id, + profile_id); + if (rv) + { + error = clib_error_return (0, "pipe configuration failed"); + goto done; + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command is used to change the profile associate with a HQoS pipe. The + * '' is zero based. Use the command + * 'show dpdk interface hqos' to display the content of each profile. + * See @ref qos_doc for more details. + * + * @note + * Currently there is not an API to create a new HQoS pipe profile. One is + * created by default in the code (search for 'hqos_pipe_params_default''). + * Additional profiles can be created in code and code recompiled. Then use this + * command to assign it. 
+ * + * @cliexpar + * Example of how to assign a new profile to a HQoS pipe: + * @cliexcmd{set dpdk interface hqos pipe GigabitEthernet0/8/0 subport 0 pipe 2 profile 1} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_pipe, static) = +{ + .path = "set dpdk interface hqos pipe", + .short_help = "set dpdk interface hqos pipe subport pipe " + "profile ", + .function = set_dpdk_if_hqos_pipe, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_dpdk_if_hqos_subport (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = NULL; + u32 hw_if_index = (u32) ~ 0; + u32 subport_id = (u32) ~ 0; + struct rte_sched_subport_params p; + int rv; + clib_error_t *error = NULL; + u32 tb_rate = (u32) ~ 0; + u32 tb_size = (u32) ~ 0; + u32 tc_rate[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE] = + { (u32) ~ 0, (u32) ~ 0, (u32) ~ 0, (u32) ~ 0 }; + u32 tc_period = (u32) ~ 0; + dpdk_device_config_t *devconf = NULL; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "subport %d", &subport_id)) + ; + else if (unformat (line_input, "rate %d", &tb_rate)) + ; + else if (unformat (line_input, "bktsize %d", &tb_size)) + ; + else if (unformat (line_input, "tc0 %d", &tc_rate[0])) + ; + else if (unformat (line_input, "tc1 %d", &tc_rate[1])) + ; + else if (unformat (line_input, "tc2 %d", &tc_rate[2])) + ; + else if (unformat (line_input, "tc3 %d", &tc_rate[3])) + ; + else if (unformat (line_input, "period %d", &tc_period)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + error = get_hqos (hw_if_index, subport_id, &xd, &devconf); + + if (error == NULL) + { + 
/* Copy the current values over to local structure. */ + memcpy (&p, &devconf->hqos.subport[subport_id], sizeof (p)); + + /* Update local structure with input values. */ + if (tb_rate != (u32) ~ 0) + { + p.tb_rate = tb_rate; + p.tc_rate[0] = tb_rate; + p.tc_rate[1] = tb_rate; + p.tc_rate[2] = tb_rate; + p.tc_rate[3] = tb_rate; + } + if (tb_size != (u32) ~ 0) + { + p.tb_size = tb_size; + } + if (tc_rate[0] != (u32) ~ 0) + { + p.tc_rate[0] = tc_rate[0]; + } + if (tc_rate[1] != (u32) ~ 0) + { + p.tc_rate[1] = tc_rate[1]; + } + if (tc_rate[2] != (u32) ~ 0) + { + p.tc_rate[2] = tc_rate[2]; + } + if (tc_rate[3] != (u32) ~ 0) + { + p.tc_rate[3] = tc_rate[3]; + } + if (tc_period != (u32) ~ 0) + { + p.tc_period = tc_period; + } + + /* Apply changes. */ + rv = rte_sched_subport_config (xd->hqos_ht->hqos, subport_id, &p); + if (rv) + { + error = clib_error_return (0, "subport configuration failed"); + goto done; + } + else + { + /* Successfully applied, so save of the input values. */ + memcpy (&devconf->hqos.subport[subport_id], &p, sizeof (p)); + } + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command is used to set the subport level parameters such as token + * bucket rate (bytes per seconds), token bucket size (bytes), traffic class + * rates (bytes per seconds) and token update period (Milliseconds). + * + * By default, the 'rate' is set to 1250000000 bytes/second (10GbE + * rate) and each of the four traffic classes is set to 100% of the port rate. + * If the 'rate' is updated by this command, all four traffic classes + * are assigned the same value. Each of the four traffic classes can be updated + * individually. 
+ * + * @cliexpar + * Example of how modify the subport attributes for a 1GbE link: + * @cliexcmd{set dpdk interface hqos subport GigabitEthernet0/8/0 subport 0 rate 125000000} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_subport, static) = { + .path = "set dpdk interface hqos subport", + .short_help = "set dpdk interface hqos subport subport " + "[rate ] [bktsize ] [tc0 ] [tc1 ] [tc2 ] [tc3 ] " + "[period ]", + .function = set_dpdk_if_hqos_subport, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_dpdk_if_hqos_tctbl (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vlib_thread_main_t *tm = vlib_get_thread_main (); + dpdk_main_t *dm = &dpdk_main; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + u32 hw_if_index = (u32) ~ 0; + u32 tc = (u32) ~ 0; + u32 queue = (u32) ~ 0; + u32 entry = (u32) ~ 0; + u32 val, i; + clib_error_t *error = NULL; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "entry %d", &entry)) + ; + else if (unformat (line_input, "tc %d", &tc)) + ; + else if (unformat (line_input, "queue %d", &queue)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } + if (entry >= 64) + { + error = clib_error_return (0, "invalid entry"); + goto done; + } + if (tc >= RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE) + { + error = clib_error_return (0, "invalid traffic class"); + goto done; + } + if (queue >= RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS) + { + error = clib_error_return (0, "invalid traffic class queue"); + goto done; + } + + hw = 
vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + /* Detect the set of worker threads */ + uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + /* Should never happen, shut up Coverity warning */ + if (p == 0) + { + error = clib_error_return (0, "no worker registrations?"); + goto done; + } + + vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0]; + int worker_thread_first = tr->first_index; + int worker_thread_count = tr->count; + + val = tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue; + for (i = 0; i < worker_thread_count; i++) + xd->hqos_wt[worker_thread_first + i].hqos_tc_table[entry] = val; + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command is used to set the traffic class translation table. The + * traffic class translation table is used to map 64 values (0-63) to one of + * four traffic class and one of four HQoS input queue. Use the 'show + * dpdk interface hqos' command to display the traffic class translation + * table. See @ref qos_doc for more details. + * + * This command has the following parameters: + * + * - - Used to specify the output interface. + * + * - entry - Mapped value (0-63) to assign traffic class and queue to. + * + * - tc - Traffic class (0-3) to be used by the provided mapped value. + * + * - queue - HQoS input queue (0-3) to be used by the provided mapped value. 
+ * + * @cliexpar + * Example of how modify the traffic class translation table: + * @cliexcmd{set dpdk interface hqos tctbl GigabitEthernet0/8/0 entry 16 tc 2 queue 2} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_tctbl, static) = { + .path = "set dpdk interface hqos tctbl", + .short_help = "set dpdk interface hqos tctbl entry tc queue ", + .function = set_dpdk_if_hqos_tctbl, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_dpdk_if_hqos_pktfield (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vlib_thread_main_t *tm = vlib_get_thread_main (); + dpdk_main_t *dm = &dpdk_main; + clib_error_t *error = NULL; + + /* Device specific data */ + struct rte_eth_dev_info dev_info; + dpdk_device_config_t *devconf = 0; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + u32 hw_if_index = (u32) ~ 0; + + /* Detect the set of worker threads */ + uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + /* Should never happen, shut up Coverity warning */ + if (p == 0) + return clib_error_return (0, "no worker registrations?"); + + vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0]; + int worker_thread_first = tr->first_index; + int worker_thread_count = tr->count; + + /* Packet field configuration */ + u64 mask = (u64) ~ 0; + u32 id = (u32) ~ 0; + u32 offset = (u32) ~ 0; + + /* HQoS params */ + u32 n_subports_per_port, n_pipes_per_subport, tctbl_size; + + u32 i; + + /* Parse input arguments */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "id subport")) + id = 0; + else if (unformat (line_input, "id pipe")) + id = 1; + else if (unformat (line_input, "id tc")) + id = 2; + else if (unformat (line_input, "id 
%d", &id)) + ; + else if (unformat (line_input, "offset %d", &offset)) + ; + else if (unformat (line_input, "mask %llx", &mask)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + /* Get interface */ + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rte_eth_dev_info_get (xd->device_index, &dev_info); + if (dev_info.pci_dev) + { /* bonded interface has no pci info */ + vlib_pci_addr_t pci_addr; + + pci_addr.domain = dev_info.pci_dev->addr.domain; + pci_addr.bus = dev_info.pci_dev->addr.bus; + pci_addr.slot = dev_info.pci_dev->addr.devid; + pci_addr.function = dev_info.pci_dev->addr.function; + + p = + hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32); + } + + if (p) + devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]); + else + devconf = &dm->conf->default_devconf; + + if (devconf->hqos_enabled == 0) + { + vlib_cli_output (vm, "HQoS disabled for this interface"); + goto done; + } + + n_subports_per_port = devconf->hqos.port.n_subports_per_port; + n_pipes_per_subport = devconf->hqos.port.n_pipes_per_subport; + tctbl_size = RTE_DIM (devconf->hqos.tc_table); + + /* Validate packet field configuration: id, offset and mask */ + if (id >= 3) + { + error = clib_error_return (0, "invalid packet field id"); + goto done; + } + + switch (id) + { + case 0: + if (dpdk_hqos_validate_mask (mask, n_subports_per_port) != 0) + { + error = clib_error_return (0, "invalid subport ID mask " + "(n_subports_per_port = %u)", + n_subports_per_port); + goto done; + } + break; + case 1: + if (dpdk_hqos_validate_mask (mask, n_pipes_per_subport) != 0) + { + error = clib_error_return (0, "invalid pipe ID mask " + "(n_pipes_per_subport = %u)", + n_pipes_per_subport); + goto done; + } + break; + case 
2: + default: + if (dpdk_hqos_validate_mask (mask, tctbl_size) != 0) + { + error = clib_error_return (0, "invalid TC table index mask " + "(TC table size = %u)", tctbl_size); + goto done; + } + } + + /* Propagate packet field configuration to all workers */ + for (i = 0; i < worker_thread_count; i++) + switch (id) + { + case 0: + xd->hqos_wt[worker_thread_first + i].hqos_field0_slabpos = offset; + xd->hqos_wt[worker_thread_first + i].hqos_field0_slabmask = mask; + xd->hqos_wt[worker_thread_first + i].hqos_field0_slabshr = + __builtin_ctzll (mask); + break; + case 1: + xd->hqos_wt[worker_thread_first + i].hqos_field1_slabpos = offset; + xd->hqos_wt[worker_thread_first + i].hqos_field1_slabmask = mask; + xd->hqos_wt[worker_thread_first + i].hqos_field1_slabshr = + __builtin_ctzll (mask); + break; + case 2: + default: + xd->hqos_wt[worker_thread_first + i].hqos_field2_slabpos = offset; + xd->hqos_wt[worker_thread_first + i].hqos_field2_slabmask = mask; + xd->hqos_wt[worker_thread_first + i].hqos_field2_slabshr = + __builtin_ctzll (mask); + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command is used to set the packet fields required for classifiying the + * incoming packet. As a result of classification process, packet field + * information will be mapped to 5 tuples (subport, pipe, traffic class, pipe, + * color) and stored in packet mbuf. + * + * This command has the following parameters: + * + * - - Used to specify the output interface. + * + * - id subport|pipe|tc - Classification occurs across three fields. + * This parameter indicates which of the three masks are being configured. Legacy + * code used 0-2 to represent these three fields, so 0-2 is still accepted. + * - subport|0 - Currently only one subport is supported, so only + * an empty mask is supported for the subport classification. + * - pipe|1 - Currently, 4096 pipes per subport are supported, so a + * 12-bit mask should be configure to map to the 0-4095 pipes. 
+ * - tc|2 - The translation table (see 'set dpdk interface hqos + * tctbl' command) maps each value (0-63) into one of the 4 traffic classes + * per pipe. A 6-bit mask should be configure to map this field to a traffic class. + * + * - offset - Offset in the packet to apply the 64-bit mask for classification. + * The offset should be on an 8-byte boundary (0,8,16,24..). + * + * - mask - 64-bit mask to apply to packet at the given 'offset'. + * Bits must be contiguous and should not include '0x'. + * + * The default values for the 'pktfield' assumes Ethernet/IPv4/UDP packets with + * no VLAN. Adjust based on expected packet format and desired classification field. + * - 'subport' is always empty (offset 0 mask 0000000000000000) + * - By default, 'pipe' maps to the UDP payload bits 12 .. 23 (offset 40 + * mask 0000000fff000000) + * - By default, 'tc' maps to the DSCP field in IP header (offset 48 mask + * 00000000000000fc) + * + * @cliexpar + * Example of how modify the 'pipe' classification filter to match VLAN: + * @cliexcmd{set dpdk interface hqos pktfield GigabitEthernet0/8/0 id pipe offset 8 mask 0000000000000FFF} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_pktfield, static) = { + .path = "set dpdk interface hqos pktfield", + .short_help = "set dpdk interface hqos pktfield id subport|pipe|tc offset " + "mask ", + .function = set_dpdk_if_hqos_pktfield, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_dpdk_if_hqos (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vlib_thread_main_t *tm = vlib_get_thread_main (); + dpdk_main_t *dm = &dpdk_main; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + dpdk_device_config_hqos_t *cfg; + dpdk_device_hqos_per_hqos_thread_t *ht; + dpdk_device_hqos_per_worker_thread_t *wk; + u32 *tctbl; + u32 hw_if_index = (u32) ~ 0; + u32 profile_id, subport_id, i; + struct rte_eth_dev_info dev_info; + dpdk_device_config_t *devconf 
= 0; + vlib_thread_registration_t *tr; + uword *p = 0; + clib_error_t *error = NULL; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify interface name!!"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rte_eth_dev_info_get (xd->device_index, &dev_info); + if (dev_info.pci_dev) + { /* bonded interface has no pci info */ + vlib_pci_addr_t pci_addr; + + pci_addr.domain = dev_info.pci_dev->addr.domain; + pci_addr.bus = dev_info.pci_dev->addr.bus; + pci_addr.slot = dev_info.pci_dev->addr.devid; + pci_addr.function = dev_info.pci_dev->addr.function; + + p = + hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32); + } + + if (p) + devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]); + else + devconf = &dm->conf->default_devconf; + + if (devconf->hqos_enabled == 0) + { + vlib_cli_output (vm, "HQoS disabled for this interface"); + goto done; + } + + /* Detect the set of worker threads */ + p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + + /* Should never happen, shut up Coverity warning */ + if (p == 0) + { + error = clib_error_return (0, "no worker registrations?"); + goto done; + } + + tr = (vlib_thread_registration_t *) p[0]; + + cfg = &devconf->hqos; + ht = xd->hqos_ht; + wk = &xd->hqos_wt[tr->first_index]; + tctbl = wk->hqos_tc_table; + + vlib_cli_output (vm, " Thread:"); + vlib_cli_output (vm, " Input SWQ size = %u packets", cfg->swq_size); + vlib_cli_output (vm, " Enqueue burst size = %u packets", + ht->hqos_burst_enq); + 
vlib_cli_output (vm, " Dequeue burst size = %u packets", + ht->hqos_burst_deq); + + vlib_cli_output (vm, + " Packet field 0: slab position = %4u, slab bitmask = 0x%016llx (subport)", + wk->hqos_field0_slabpos, wk->hqos_field0_slabmask); + vlib_cli_output (vm, + " Packet field 1: slab position = %4u, slab bitmask = 0x%016llx (pipe)", + wk->hqos_field1_slabpos, wk->hqos_field1_slabmask); + vlib_cli_output (vm, + " Packet field 2: slab position = %4u, slab bitmask = 0x%016llx (tc)", + wk->hqos_field2_slabpos, wk->hqos_field2_slabmask); + vlib_cli_output (vm, + " Packet field 2 tc translation table: ([Mapped Value Range]: tc/queue tc/queue ...)"); + vlib_cli_output (vm, + " [ 0 .. 15]: " + "%u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u", + tctbl[0] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[0] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[1] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[1] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[2] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[2] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[3] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[3] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[4] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[4] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[5] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[5] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[6] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[6] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[7] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[7] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[8] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[8] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[9] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[9] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[10] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[10] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[11] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[11] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[12] / 
RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[12] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[13] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[13] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[14] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[14] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[15] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[15] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS); + vlib_cli_output (vm, + " [16 .. 31]: " + "%u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u", + tctbl[16] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[16] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[17] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[17] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[18] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[18] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[19] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[19] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[20] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[20] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[21] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[21] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[22] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[22] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[23] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[23] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[24] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[24] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[25] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[25] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[26] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[26] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[27] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[27] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[28] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[28] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[29] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[29] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[30] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[30] % 
RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[31] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[31] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS); + vlib_cli_output (vm, + " [32 .. 47]: " + "%u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u", + tctbl[32] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[32] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[33] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[33] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[34] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[34] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[35] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[35] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[36] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[36] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[37] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[37] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[38] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[38] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[39] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[39] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[40] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[40] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[41] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[41] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[42] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[42] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[43] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[43] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[44] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[44] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[45] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[45] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[46] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[46] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[47] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[47] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS); + vlib_cli_output (vm, + " [48 .. 
63]: " + "%u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u", + tctbl[48] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[48] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[49] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[49] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[50] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[50] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[51] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[51] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[52] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[52] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[53] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[53] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[54] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[54] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[55] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[55] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[56] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[56] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[57] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[57] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[58] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[58] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[59] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[59] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[60] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[60] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[61] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[61] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[62] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[62] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[63] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[63] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS); + vlib_cli_output (vm, " Port:"); + vlib_cli_output (vm, " Rate = %u bytes/second", cfg->port.rate); + vlib_cli_output (vm, " MTU = %u bytes", cfg->port.mtu); + vlib_cli_output (vm, " Frame overhead = %u bytes", + cfg->port.frame_overhead); + vlib_cli_output (vm, " Number of subports = 
%u", + cfg->port.n_subports_per_port); + vlib_cli_output (vm, " Number of pipes per subport = %u", + cfg->port.n_pipes_per_subport); + vlib_cli_output (vm, + " Packet queue size: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u packets", + cfg->port.qsize[0], cfg->port.qsize[1], cfg->port.qsize[2], + cfg->port.qsize[3]); + vlib_cli_output (vm, " Number of pipe profiles = %u", + cfg->port.n_pipe_profiles); + + for (subport_id = 0; subport_id < vec_len (cfg->subport); subport_id++) + { + vlib_cli_output (vm, " Subport %u:", subport_id); + vlib_cli_output (vm, " Rate = %u bytes/second", + cfg->subport[subport_id].tb_rate); + vlib_cli_output (vm, " Token bucket size = %u bytes", + cfg->subport[subport_id].tb_size); + vlib_cli_output (vm, + " Traffic class rate: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u bytes/second", + cfg->subport[subport_id].tc_rate[0], + cfg->subport[subport_id].tc_rate[1], + cfg->subport[subport_id].tc_rate[2], + cfg->subport[subport_id].tc_rate[3]); + vlib_cli_output (vm, " TC period = %u milliseconds", + cfg->subport[subport_id].tc_period); + } + + for (profile_id = 0; profile_id < vec_len (cfg->pipe); profile_id++) + { + vlib_cli_output (vm, " Pipe profile %u:", profile_id); + vlib_cli_output (vm, " Rate = %u bytes/second", + cfg->pipe[profile_id].tb_rate); + vlib_cli_output (vm, " Token bucket size = %u bytes", + cfg->pipe[profile_id].tb_size); + vlib_cli_output (vm, + " Traffic class rate: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u bytes/second", + cfg->pipe[profile_id].tc_rate[0], + cfg->pipe[profile_id].tc_rate[1], + cfg->pipe[profile_id].tc_rate[2], + cfg->pipe[profile_id].tc_rate[3]); + vlib_cli_output (vm, " TC period = %u milliseconds", + cfg->pipe[profile_id].tc_period); +#ifdef RTE_SCHED_SUBPORT_TC_OV + vlib_cli_output (vm, " TC3 oversubscription_weight = %u", + cfg->pipe[profile_id].tc_ov_weight); +#endif + + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) + { + vlib_cli_output (vm, + " TC%u WRR weights: Q0 = %u, Q1 = %u, Q2 = %u, Q3 = %u", + 
i, cfg->pipe[profile_id].wrr_weights[i * 4], + cfg->pipe[profile_id].wrr_weights[i * 4 + 1], + cfg->pipe[profile_id].wrr_weights[i * 4 + 2], + cfg->pipe[profile_id].wrr_weights[i * 4 + 3]); + } + } + +#ifdef RTE_SCHED_RED + vlib_cli_output (vm, " Weighted Random Early Detection (WRED):"); + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) + { + vlib_cli_output (vm, " TC%u min: G = %u, Y = %u, R = %u", i, + cfg->port.red_params[i][e_RTE_METER_GREEN].min_th, + cfg->port.red_params[i][e_RTE_METER_YELLOW].min_th, + cfg->port.red_params[i][e_RTE_METER_RED].min_th); + + vlib_cli_output (vm, " TC%u max: G = %u, Y = %u, R = %u", i, + cfg->port.red_params[i][e_RTE_METER_GREEN].max_th, + cfg->port.red_params[i][e_RTE_METER_YELLOW].max_th, + cfg->port.red_params[i][e_RTE_METER_RED].max_th); + + vlib_cli_output (vm, + " TC%u inverted probability: G = %u, Y = %u, R = %u", + i, cfg->port.red_params[i][e_RTE_METER_GREEN].maxp_inv, + cfg->port.red_params[i][e_RTE_METER_YELLOW].maxp_inv, + cfg->port.red_params[i][e_RTE_METER_RED].maxp_inv); + + vlib_cli_output (vm, " TC%u weight: R = %u, Y = %u, R = %u", i, + cfg->port.red_params[i][e_RTE_METER_GREEN].wq_log2, + cfg->port.red_params[i][e_RTE_METER_YELLOW].wq_log2, + cfg->port.red_params[i][e_RTE_METER_RED].wq_log2); + } +#endif + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command is used to display details of an output interface's HQoS + * settings. 
+ * + * @cliexpar + * Example of how to display HQoS settings for an interfaces: + * @cliexstart{show dpdk interface hqos GigabitEthernet0/8/0} + * Thread: + * Input SWQ size = 4096 packets + * Enqueue burst size = 256 packets + * Dequeue burst size = 220 packets + * Packet field 0: slab position = 0, slab bitmask = 0x0000000000000000 (subport) + * Packet field 1: slab position = 40, slab bitmask = 0x0000000fff000000 (pipe) + * Packet field 2: slab position = 8, slab bitmask = 0x00000000000000fc (tc) + * Packet field 2 tc translation table: ([Mapped Value Range]: tc/queue tc/queue ...) + * [ 0 .. 15]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3 + * [16 .. 31]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3 + * [32 .. 47]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3 + * [48 .. 63]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3 + * Port: + * Rate = 1250000000 bytes/second + * MTU = 1514 bytes + * Frame overhead = 24 bytes + * Number of subports = 1 + * Number of pipes per subport = 4096 + * Packet queue size: TC0 = 64, TC1 = 64, TC2 = 64, TC3 = 64 packets + * Number of pipe profiles = 2 + * Subport 0: + * Rate = 1250000000 bytes/second + * Token bucket size = 1000000 bytes + * Traffic class rate: TC0 = 1250000000, TC1 = 1250000000, TC2 = 1250000000, TC3 = 1250000000 bytes/second + * TC period = 10 milliseconds + * Pipe profile 0: + * Rate = 305175 bytes/second + * Token bucket size = 1000000 bytes + * Traffic class rate: TC0 = 305175, TC1 = 305175, TC2 = 305175, TC3 = 305175 bytes/second + * TC period = 40 milliseconds + * TC0 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1 + * TC1 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1 + * TC2 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1 + * TC3 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1 + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_show_dpdk_if_hqos, static) = { + .path = "show dpdk interface hqos", + .short_help = "show dpdk interface hqos 
", + .function = show_dpdk_if_hqos, +}; + +/* *INDENT-ON* */ + +static clib_error_t * +show_dpdk_hqos_queue_stats (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = NULL; +#ifdef RTE_SCHED_COLLECT_STATS + dpdk_main_t *dm = &dpdk_main; + u32 hw_if_index = (u32) ~ 0; + u32 subport = (u32) ~ 0; + u32 pipe = (u32) ~ 0; + u32 tc = (u32) ~ 0; + u32 tc_q = (u32) ~ 0; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + uword *p = 0; + struct rte_eth_dev_info dev_info; + dpdk_device_config_t *devconf = 0; + u32 qindex; + struct rte_sched_queue_stats stats; + u16 qlen; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + + else if (unformat (line_input, "subport %d", &subport)) + ; + + else if (unformat (line_input, "pipe %d", &pipe)) + ; + + else if (unformat (line_input, "tc %d", &tc)) + ; + + else if (unformat (line_input, "tc_q %d", &tc_q)) + ; + + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify interface name!!"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rte_eth_dev_info_get (xd->device_index, &dev_info); + if (dev_info.pci_dev) + { /* bonded interface has no pci info */ + vlib_pci_addr_t pci_addr; + + pci_addr.domain = dev_info.pci_dev->addr.domain; + pci_addr.bus = dev_info.pci_dev->addr.bus; + pci_addr.slot = dev_info.pci_dev->addr.devid; + pci_addr.function = dev_info.pci_dev->addr.function; + + p = + hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32); + } + + if (p) + devconf = 
pool_elt_at_index (dm->conf->dev_confs, p[0]); + else + devconf = &dm->conf->default_devconf; + + if (devconf->hqos_enabled == 0) + { + vlib_cli_output (vm, "HQoS disabled for this interface"); + goto done; + } + + /* + * Figure out which queue to query. cf rte_sched_port_qindex. (Not sure why + * that method isn't made public by DPDK - how _should_ we get the queue ID?) + */ + qindex = subport * devconf->hqos.port.n_pipes_per_subport + pipe; + qindex = qindex * RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE + tc; + qindex = qindex * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + tc_q; + + if (rte_sched_queue_read_stats (xd->hqos_ht->hqos, qindex, &stats, &qlen) != + 0) + { + error = clib_error_return (0, "failed to read stats"); + goto done; + } + + vlib_cli_output (vm, "%=24s%=16s", "Stats Parameter", "Value"); + vlib_cli_output (vm, "%=24s%=16d", "Packets", stats.n_pkts); + vlib_cli_output (vm, "%=24s%=16d", "Packets dropped", stats.n_pkts_dropped); +#ifdef RTE_SCHED_RED + vlib_cli_output (vm, "%=24s%=16d", "Packets dropped (RED)", + stats.n_pkts_red_dropped); +#endif + vlib_cli_output (vm, "%=24s%=16d", "Bytes", stats.n_bytes); + vlib_cli_output (vm, "%=24s%=16d", "Bytes dropped", stats.n_bytes_dropped); + +#else + + /* Get a line of input */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + vlib_cli_output (vm, "RTE_SCHED_COLLECT_STATS disabled in DPDK"); + goto done; + +#endif + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command is used to display statistics associated with a HQoS traffic class + * queue. + * + * @note + * Statistic collection by the scheduler is disabled by default in DPDK. 
In order to + * turn it on, add the following line to '../vpp/dpdk/Makefile': + * - $(call set,RTE_SCHED_COLLECT_STATS,y) + * + * @cliexpar + * Example of how to display statistics of HQoS a HQoS traffic class queue: + * @cliexstart{show dpdk hqos queue GigabitEthernet0/9/0 subport 0 pipe 3181 tc 0 tc_q 0} + * Stats Parameter Value + * Packets 140 + * Packets dropped 0 + * Bytes 8400 + * Bytes dropped 0 + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_show_dpdk_hqos_queue_stats, static) = { + .path = "show dpdk hqos queue", + .short_help = "show dpdk hqos queue subport pipe tc tc_q ", + .function = show_dpdk_hqos_queue_stats, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_dpdk_version_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ +#define _(a,b,c) vlib_cli_output (vm, "%-25s " b, a ":", c); + _("DPDK Version", "%s", rte_version ()); + _("DPDK EAL init args", "%s", dpdk_config_main.eal_init_args_str); +#undef _ + return 0; +} + +/*? + * This command is used to display the current DPDK version and + * the list of arguments passed to DPDK when started. 
+ * + * @cliexpar + * Example of how to display the DPDK version and the EAL init arguments: + * @cliexstart{show dpdk version} + * DPDK Version: DPDK 16.11.0 + * DPDK EAL init args: -c 1 -n 4 --huge-dir /run/vpp/hugepages --file-prefix vpp -w 0000:00:08.0 -w 0000:00:09.0 --master-lcore 0 --socket-mem 256 + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_vpe_version_command, static) = { + .path = "show dpdk version", + .short_help = "show dpdk version", + .function = show_dpdk_version_command_fn, +}; +/* *INDENT-ON* */ + +/* Init hook registered below with VLIB_INIT_FUNCTION: performs no work and always returns 0. */ +clib_error_t * +dpdk_cli_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (dpdk_cli_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c new file mode 100644 index 00000000..50b26689 --- /dev/null +++ b/src/plugins/dpdk/device/device.c @@ -0,0 +1,852 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#define foreach_dpdk_tx_func_error \ + _(BAD_RETVAL, "DPDK tx function returned an error") \ + _(RING_FULL, "Tx packet drops (ring full)") \ + _(PKT_DROP, "Tx packet drops (dpdk tx failure)") \ + _(REPL_FAIL, "Tx packet drops (replication failure)") + +typedef enum +{ +#define _(f,s) DPDK_TX_FUNC_ERROR_##f, + foreach_dpdk_tx_func_error +#undef _ + DPDK_TX_FUNC_N_ERROR, +} dpdk_tx_func_error_t; + +static char *dpdk_tx_func_error_strings[] = { +#define _(n,s) s, + foreach_dpdk_tx_func_error +#undef _ +}; + +clib_error_t * +dpdk_set_mac_address (vnet_hw_interface_t * hi, char *address) +{ + int error; + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance); + + error = rte_eth_dev_default_mac_addr_set (xd->device_index, + (struct ether_addr *) address); + + if (error) + { + return clib_error_return (0, "mac address set failed: %d", error); + } + else + { + vec_reset_length (xd->default_mac_address); + vec_add (xd->default_mac_address, address, sizeof (address)); + return NULL; + } +} + +clib_error_t * +dpdk_set_mc_filter (vnet_hw_interface_t * hi, + struct ether_addr mc_addr_vec[], int naddr) +{ + int error; + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance); + + error = rte_eth_dev_set_mc_addr_list (xd->device_index, mc_addr_vec, naddr); + + if (error) + { + return clib_error_return (0, "mc addr list failed: %d", error); + } + else + { + return NULL; + } +} + +struct rte_mbuf * +dpdk_replicate_packet_mb (vlib_buffer_t * b) +{ + dpdk_main_t *dm = &dpdk_main; + struct rte_mbuf **mbufs = 0, *s, *d; + u8 nb_segs; + unsigned socket_id = rte_socket_id (); + int i; + + ASSERT (dm->pktmbuf_pools[socket_id]); + s = rte_mbuf_from_vlib_buffer (b); + nb_segs = s->nb_segs; + vec_validate (mbufs, nb_segs - 1); + + if (rte_pktmbuf_alloc_bulk (dm->pktmbuf_pools[socket_id], mbufs, 
nb_segs)) + { + vec_free (mbufs); + return 0; + } + + d = mbufs[0]; + d->nb_segs = s->nb_segs; + d->data_len = s->data_len; + d->pkt_len = s->pkt_len; + d->data_off = s->data_off; + clib_memcpy (d->buf_addr, s->buf_addr, RTE_PKTMBUF_HEADROOM + s->data_len); + + for (i = 1; i < nb_segs; i++) + { + d->next = mbufs[i]; + d = mbufs[i]; + s = s->next; + d->data_len = s->data_len; + clib_memcpy (d->buf_addr, s->buf_addr, + RTE_PKTMBUF_HEADROOM + s->data_len); + } + + d = mbufs[0]; + vec_free (mbufs); + return d; +} + +static void +dpdk_tx_trace_buffer (dpdk_main_t * dm, + vlib_node_runtime_t * node, + dpdk_device_t * xd, + u16 queue_id, u32 buffer_index, vlib_buffer_t * buffer) +{ + vlib_main_t *vm = vlib_get_main (); + dpdk_tx_dma_trace_t *t0; + struct rte_mbuf *mb; + + mb = rte_mbuf_from_vlib_buffer (buffer); + + t0 = vlib_add_trace (vm, node, buffer, sizeof (t0[0])); + t0->queue_index = queue_id; + t0->device_index = xd->device_index; + t0->buffer_index = buffer_index; + clib_memcpy (&t0->mb, mb, sizeof (t0->mb)); + clib_memcpy (&t0->buffer, buffer, + sizeof (buffer[0]) - sizeof (buffer->pre_data)); + clib_memcpy (t0->buffer.pre_data, buffer->data + buffer->current_data, + sizeof (t0->buffer.pre_data)); +} + +static_always_inline void +dpdk_validate_rte_mbuf (vlib_main_t * vm, vlib_buffer_t * b, + int maybe_multiseg) +{ + struct rte_mbuf *mb, *first_mb, *last_mb; + + /* buffer is coming from non-dpdk source so we need to init + rte_mbuf header */ + if (PREDICT_FALSE ((b->flags & VLIB_BUFFER_EXT_HDR_VALID) == 0)) + { + vlib_buffer_t *b2 = b; + last_mb = mb = rte_mbuf_from_vlib_buffer (b2); + rte_pktmbuf_reset (mb); + while (maybe_multiseg && (b2->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + b2 = vlib_get_buffer (vm, b2->next_buffer); + mb = rte_mbuf_from_vlib_buffer (b2); + rte_pktmbuf_reset (mb); + } + } + + last_mb = first_mb = mb = rte_mbuf_from_vlib_buffer (b); + first_mb->nb_segs = 1; + mb->data_len = b->current_length; + mb->pkt_len = maybe_multiseg ? 
vlib_buffer_length_in_chain (vm, b) : + b->current_length; + mb->data_off = VLIB_BUFFER_PRE_DATA_SIZE + b->current_data; + + while (maybe_multiseg && (b->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + b = vlib_get_buffer (vm, b->next_buffer); + mb = rte_mbuf_from_vlib_buffer (b); + last_mb->next = mb; + last_mb = mb; + mb->data_len = b->current_length; + mb->pkt_len = b->current_length; + mb->data_off = VLIB_BUFFER_PRE_DATA_SIZE + b->current_data; + first_mb->nb_segs++; + if (PREDICT_FALSE (b->n_add_refs)) + { + rte_mbuf_refcnt_update (mb, b->n_add_refs); + b->n_add_refs = 0; + } + } +} + +/* + * This function calls the dpdk's tx_burst function to transmit the packets + * on the tx_vector. It manages a lock per-device if the device does not + * support multiple queues. It returns the number of packets untransmitted + * on the tx_vector. If all packets are transmitted (the normal case), the + * function returns 0. + * + * The function assumes there is at least one packet on the tx_vector. + */ +static_always_inline + u32 tx_burst_vector_internal (vlib_main_t * vm, + dpdk_device_t * xd, + struct rte_mbuf **tx_vector) +{ + dpdk_main_t *dm = &dpdk_main; + u32 n_packets; + u32 tx_head; + u32 tx_tail; + u32 n_retry; + int rv; + int queue_id; + tx_ring_hdr_t *ring; + + ring = vec_header (tx_vector, sizeof (*ring)); + + n_packets = ring->tx_head - ring->tx_tail; + + tx_head = ring->tx_head % xd->nb_tx_desc; + + /* + * Ensure rte_eth_tx_burst is not called with 0 packets, which can lead to + * unpredictable results. + */ + ASSERT (n_packets > 0); + + /* + * Check for tx_vector overflow. If this fails it is a system configuration + * error. The ring should be sized big enough to handle the largest un-flowed + * off burst from a traffic manager. A larger size also helps performance + * a bit because it decreases the probability of having to issue two tx_burst + * calls due to a ring wrap. 
+ */ + ASSERT (n_packets < xd->nb_tx_desc); + ASSERT (ring->tx_tail == 0); + + n_retry = 16; + queue_id = vm->cpu_index; + + do + { + /* start the burst at the tail */ + tx_tail = ring->tx_tail % xd->nb_tx_desc; + + /* + * This device only supports one TX queue, + * and we're running multi-threaded... + */ + if (PREDICT_FALSE (xd->lockp != 0)) + { + queue_id = queue_id % xd->tx_q_used; + while (__sync_lock_test_and_set (xd->lockp[queue_id], 1)) + /* zzzz */ + queue_id = (queue_id + 1) % xd->tx_q_used; + } + + if (PREDICT_FALSE (xd->flags & DPDK_DEVICE_FLAG_HQOS)) /* HQoS ON */ + { + /* no wrap, transmit in one burst */ + dpdk_device_hqos_per_worker_thread_t *hqos = + &xd->hqos_wt[vm->cpu_index]; + + ASSERT (hqos->swq != NULL); + + dpdk_hqos_metadata_set (hqos, + &tx_vector[tx_tail], tx_head - tx_tail); + rv = rte_ring_sp_enqueue_burst (hqos->swq, + (void **) &tx_vector[tx_tail], + (uint16_t) (tx_head - tx_tail)); + } + else if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_PMD)) + { + /* no wrap, transmit in one burst */ + rv = rte_eth_tx_burst (xd->device_index, + (uint16_t) queue_id, + &tx_vector[tx_tail], + (uint16_t) (tx_head - tx_tail)); + } + else + { + ASSERT (0); + rv = 0; + } + + if (PREDICT_FALSE (xd->lockp != 0)) + *xd->lockp[queue_id] = 0; + + if (PREDICT_FALSE (rv < 0)) + { + // emit non-fatal message, bump counter + vnet_main_t *vnm = dm->vnet_main; + vnet_interface_main_t *im = &vnm->interface_main; + u32 node_index; + + node_index = vec_elt_at_index (im->hw_interfaces, + xd->vlib_hw_if_index)->tx_node_index; + + vlib_error_count (vm, node_index, DPDK_TX_FUNC_ERROR_BAD_RETVAL, 1); + clib_warning ("rte_eth_tx_burst[%d]: error %d", xd->device_index, + rv); + return n_packets; // untransmitted packets + } + ring->tx_tail += (u16) rv; + n_packets -= (uint16_t) rv; + } + while (rv && n_packets && (n_retry > 0)); + + return n_packets; +} + +static_always_inline void +dpdk_prefetch_buffer_by_index (vlib_main_t * vm, u32 bi) +{ + vlib_buffer_t *b; + struct 
rte_mbuf *mb; + b = vlib_get_buffer (vm, bi); + mb = rte_mbuf_from_vlib_buffer (b); + CLIB_PREFETCH (mb, CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (b, CLIB_CACHE_LINE_BYTES, LOAD); +} + +static_always_inline void +dpdk_buffer_recycle (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_buffer_t * b, u32 bi, struct rte_mbuf **mbp) +{ + dpdk_main_t *dm = &dpdk_main; + u32 my_cpu = vm->cpu_index; + struct rte_mbuf *mb_new; + + if (PREDICT_FALSE (b->flags & VLIB_BUFFER_RECYCLE) == 0) + return; + + mb_new = dpdk_replicate_packet_mb (b); + if (PREDICT_FALSE (mb_new == 0)) + { + vlib_error_count (vm, node->node_index, + DPDK_TX_FUNC_ERROR_REPL_FAIL, 1); + b->flags |= VLIB_BUFFER_REPL_FAIL; + } + else + *mbp = mb_new; + + vec_add1 (dm->recycle[my_cpu], bi); +} + +/* + * Transmits the packets on the frame to the interface associated with the + * node. It first copies packets on the frame to a tx_vector containing the + * rte_mbuf pointers. It then passes this vector to tx_burst_vector_internal + * which calls the dpdk tx_burst function. + */ +static uword +dpdk_interface_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + dpdk_main_t *dm = &dpdk_main; + vnet_interface_output_runtime_t *rd = (void *) node->runtime_data; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, rd->dev_instance); + u32 n_packets = f->n_vectors; + u32 n_left; + u32 *from; + struct rte_mbuf **tx_vector; + u16 i; + u16 nb_tx_desc = xd->nb_tx_desc; + int queue_id; + u32 my_cpu; + u32 tx_pkts = 0; + tx_ring_hdr_t *ring; + u32 n_on_ring; + + my_cpu = vm->cpu_index; + + queue_id = my_cpu; + + tx_vector = xd->tx_vectors[queue_id]; + ring = vec_header (tx_vector, sizeof (*ring)); + + n_on_ring = ring->tx_head - ring->tx_tail; + from = vlib_frame_vector_args (f); + + ASSERT (n_packets <= VLIB_FRAME_SIZE); + + if (PREDICT_FALSE (n_on_ring + n_packets > nb_tx_desc)) + { + /* + * Overflowing the ring should never happen. + * If it does then drop the whole frame. 
+ */ + vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_RING_FULL, + n_packets); + + while (n_packets--) + { + u32 bi0 = from[n_packets]; + vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0); + struct rte_mbuf *mb0 = rte_mbuf_from_vlib_buffer (b0); + rte_pktmbuf_free (mb0); + } + return n_on_ring; + } + + if (PREDICT_FALSE (dm->tx_pcap_enable)) + { + n_left = n_packets; + while (n_left > 0) + { + u32 bi0 = from[0]; + vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0); + if (dm->pcap_sw_if_index == 0 || + dm->pcap_sw_if_index == vnet_buffer (b0)->sw_if_index[VLIB_TX]) + pcap_add_buffer (&dm->pcap_main, vm, bi0, 512); + from++; + n_left--; + } + } + + from = vlib_frame_vector_args (f); + n_left = n_packets; + i = ring->tx_head % nb_tx_desc; + + while (n_left >= 8) + { + u32 bi0, bi1, bi2, bi3; + struct rte_mbuf *mb0, *mb1, *mb2, *mb3; + vlib_buffer_t *b0, *b1, *b2, *b3; + u32 or_flags; + + dpdk_prefetch_buffer_by_index (vm, from[4]); + dpdk_prefetch_buffer_by_index (vm, from[5]); + dpdk_prefetch_buffer_by_index (vm, from[6]); + dpdk_prefetch_buffer_by_index (vm, from[7]); + + bi0 = from[0]; + bi1 = from[1]; + bi2 = from[2]; + bi3 = from[3]; + from += 4; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + b2 = vlib_get_buffer (vm, bi2); + b3 = vlib_get_buffer (vm, bi3); + + or_flags = b0->flags | b1->flags | b2->flags | b3->flags; + + if (or_flags & VLIB_BUFFER_NEXT_PRESENT) + { + dpdk_validate_rte_mbuf (vm, b0, 1); + dpdk_validate_rte_mbuf (vm, b1, 1); + dpdk_validate_rte_mbuf (vm, b2, 1); + dpdk_validate_rte_mbuf (vm, b3, 1); + } + else + { + dpdk_validate_rte_mbuf (vm, b0, 0); + dpdk_validate_rte_mbuf (vm, b1, 0); + dpdk_validate_rte_mbuf (vm, b2, 0); + dpdk_validate_rte_mbuf (vm, b3, 0); + } + + mb0 = rte_mbuf_from_vlib_buffer (b0); + mb1 = rte_mbuf_from_vlib_buffer (b1); + mb2 = rte_mbuf_from_vlib_buffer (b2); + mb3 = rte_mbuf_from_vlib_buffer (b3); + + if (PREDICT_FALSE (or_flags & VLIB_BUFFER_RECYCLE)) + { + dpdk_buffer_recycle (vm, node, 
b0, bi0, &mb0); + dpdk_buffer_recycle (vm, node, b1, bi1, &mb1); + dpdk_buffer_recycle (vm, node, b2, bi2, &mb2); + dpdk_buffer_recycle (vm, node, b3, bi3, &mb3); + + /* dont enqueue packets if replication failed as they must + be sent back to recycle */ + if (PREDICT_TRUE ((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0)) + tx_vector[i++ % nb_tx_desc] = mb0; + if (PREDICT_TRUE ((b1->flags & VLIB_BUFFER_REPL_FAIL) == 0)) + tx_vector[i++ % nb_tx_desc] = mb1; + if (PREDICT_TRUE ((b2->flags & VLIB_BUFFER_REPL_FAIL) == 0)) + tx_vector[i++ % nb_tx_desc] = mb2; + if (PREDICT_TRUE ((b3->flags & VLIB_BUFFER_REPL_FAIL) == 0)) + tx_vector[i++ % nb_tx_desc] = mb3; + } + else + { + if (PREDICT_FALSE (i + 3 >= nb_tx_desc)) + { + tx_vector[i++ % nb_tx_desc] = mb0; + tx_vector[i++ % nb_tx_desc] = mb1; + tx_vector[i++ % nb_tx_desc] = mb2; + tx_vector[i++ % nb_tx_desc] = mb3; + i %= nb_tx_desc; + } + else + { + tx_vector[i++] = mb0; + tx_vector[i++] = mb1; + tx_vector[i++] = mb2; + tx_vector[i++] = mb3; + } + } + + + if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE)) + { + if (b0->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0); + if (b1->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi1, b1); + if (b2->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi2, b2); + if (b3->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi3, b3); + } + + n_left -= 4; + } + while (n_left > 0) + { + u32 bi0; + struct rte_mbuf *mb0; + vlib_buffer_t *b0; + + bi0 = from[0]; + from++; + + b0 = vlib_get_buffer (vm, bi0); + + dpdk_validate_rte_mbuf (vm, b0, 1); + + mb0 = rte_mbuf_from_vlib_buffer (b0); + dpdk_buffer_recycle (vm, node, b0, bi0, &mb0); + + if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE)) + if (b0->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0); + + if (PREDICT_TRUE ((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0)) + { + 
tx_vector[i % nb_tx_desc] = mb0; + i++; + } + n_left--; + } + + /* account for additional packets in the ring */ + ring->tx_head += n_packets; + n_on_ring = ring->tx_head - ring->tx_tail; + + /* transmit as many packets as possible */ + n_packets = tx_burst_vector_internal (vm, xd, tx_vector); + + /* + * tx_pkts is the number of packets successfully transmitted + * This is the number originally on ring minus the number remaining on ring + */ + tx_pkts = n_on_ring - n_packets; + + { + /* If there is no callback then drop any non-transmitted packets */ + if (PREDICT_FALSE (n_packets)) + { + vlib_simple_counter_main_t *cm; + vnet_main_t *vnm = vnet_get_main (); + + cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, + VNET_INTERFACE_COUNTER_TX_ERROR); + + vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + n_packets); + + vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_PKT_DROP, + n_packets); + + while (n_packets--) + rte_pktmbuf_free (tx_vector[ring->tx_tail + n_packets]); + } + + /* Reset head/tail to avoid unnecessary wrap */ + ring->tx_head = 0; + ring->tx_tail = 0; + } + + /* Recycle replicated buffers */ + if (PREDICT_FALSE (vec_len (dm->recycle[my_cpu]))) + { + vlib_buffer_free (vm, dm->recycle[my_cpu], + vec_len (dm->recycle[my_cpu])); + _vec_len (dm->recycle[my_cpu]) = 0; + } + + ASSERT (ring->tx_head >= ring->tx_tail); + + return tx_pkts; +} + +static void +dpdk_clear_hw_interface_counters (u32 instance) +{ + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, instance); + + /* + * Set the "last_cleared_stats" to the current stats, so that + * things appear to clear from a display perspective. 
+ */ + dpdk_update_counters (xd, vlib_time_now (dm->vlib_main)); + + clib_memcpy (&xd->last_cleared_stats, &xd->stats, sizeof (xd->stats)); + clib_memcpy (xd->last_cleared_xstats, xd->xstats, + vec_len (xd->last_cleared_xstats) * + sizeof (xd->last_cleared_xstats[0])); + +} + +static clib_error_t * +dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index); + uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, hif->dev_instance); + int rv = 0; + + if (is_up) + { + f64 now = vlib_time_now (dm->vlib_main); + + if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0) + { + rv = rte_eth_dev_start (xd->device_index); + if (!rv && xd->default_mac_address) + rv = rte_eth_dev_default_mac_addr_set (xd->device_index, + (struct ether_addr *) + xd->default_mac_address); + } + + if (xd->flags & DPDK_DEVICE_FLAG_PROMISC) + rte_eth_promiscuous_enable (xd->device_index); + else + rte_eth_promiscuous_disable (xd->device_index); + + rte_eth_allmulticast_enable (xd->device_index); + xd->flags |= DPDK_DEVICE_FLAG_ADMIN_UP; + dpdk_update_counters (xd, now); + dpdk_update_link_state (xd, now); + } + else + { + xd->flags &= ~DPDK_DEVICE_FLAG_ADMIN_UP; + + rte_eth_allmulticast_disable (xd->device_index); + vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0); + rte_eth_dev_stop (xd->device_index); + + /* For bonded interface, stop slave links */ + if (xd->pmd == VNET_DPDK_PMD_BOND) + { + u8 slink[16]; + int nlink = rte_eth_bond_slaves_get (xd->device_index, slink, 16); + while (nlink >= 1) + { + u8 dpdk_port = slink[--nlink]; + rte_eth_dev_stop (dpdk_port); + } + } + } + + if (rv < 0) + clib_warning ("rte_eth_dev_%s error: %d", is_up ? 
"start" : "stop", rv); + + return /* no error */ 0; +} + +/* + * Dynamically redirect all pkts from a specific interface + * to the specified node + */ +static void +dpdk_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, + u32 node_index) +{ + dpdk_main_t *xm = &dpdk_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + dpdk_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance); + + /* Shut off redirection */ + if (node_index == ~0) + { + xd->per_interface_next_index = node_index; + return; + } + + xd->per_interface_next_index = + vlib_node_add_next (xm->vlib_main, dpdk_input_node.index, node_index); +} + + +static clib_error_t * +dpdk_subif_add_del_function (vnet_main_t * vnm, + u32 hw_if_index, + struct vnet_sw_interface_t *st, int is_add) +{ + dpdk_main_t *xm = &dpdk_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + dpdk_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance); + vnet_sw_interface_t *t = (vnet_sw_interface_t *) st; + int r, vlan_offload; + u32 prev_subifs = xd->num_subifs; + clib_error_t *err = 0; + + if (is_add) + xd->num_subifs++; + else if (xd->num_subifs) + xd->num_subifs--; + + if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0) + goto done; + + /* currently we program VLANS only for IXGBE VF and I40E VF */ + if ((xd->pmd != VNET_DPDK_PMD_IXGBEVF) && (xd->pmd != VNET_DPDK_PMD_I40EVF)) + goto done; + + if (t->sub.eth.flags.no_tags == 1) + goto done; + + if ((t->sub.eth.flags.one_tag != 1) || (t->sub.eth.flags.exact_match != 1)) + { + xd->num_subifs = prev_subifs; + err = clib_error_return (0, "unsupported VLAN setup"); + goto done; + } + + vlan_offload = rte_eth_dev_get_vlan_offload (xd->device_index); + vlan_offload |= ETH_VLAN_FILTER_OFFLOAD; + + if ((r = rte_eth_dev_set_vlan_offload (xd->device_index, vlan_offload))) + { + xd->num_subifs = prev_subifs; + err = clib_error_return (0, "rte_eth_dev_set_vlan_offload[%d]: err %d", + xd->device_index, r); + goto done; + } + 
+ + if ((r = + rte_eth_dev_vlan_filter (xd->device_index, t->sub.eth.outer_vlan_id, + is_add))) + { + xd->num_subifs = prev_subifs; + err = clib_error_return (0, "rte_eth_dev_vlan_filter[%d]: err %d", + xd->device_index, r); + goto done; + } + +done: + if (xd->num_subifs) + xd->flags |= DPDK_DEVICE_FLAG_HAVE_SUBIF; + else + xd->flags &= ~DPDK_DEVICE_FLAG_HAVE_SUBIF; + + return err; +} + +/* *INDENT-OFF* */ +VNET_DEVICE_CLASS (dpdk_device_class) = { + .name = "dpdk", + .tx_function = dpdk_interface_tx, + .tx_function_n_errors = DPDK_TX_FUNC_N_ERROR, + .tx_function_error_strings = dpdk_tx_func_error_strings, + .format_device_name = format_dpdk_device_name, + .format_device = format_dpdk_device, + .format_tx_trace = format_dpdk_tx_dma_trace, + .clear_counters = dpdk_clear_hw_interface_counters, + .admin_up_down_function = dpdk_interface_admin_up_down, + .subif_add_del_function = dpdk_subif_add_del_function, + .rx_redirect_to_node = dpdk_set_interface_next_node, + .mac_addr_change_function = dpdk_set_mac_address, +}; + +VLIB_DEVICE_TX_FUNCTION_MULTIARCH (dpdk_device_class, dpdk_interface_tx) +/* *INDENT-ON* */ + +#define UP_DOWN_FLAG_EVENT 1 + +uword +admin_up_down_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + clib_error_t *error = 0; + uword event_type; + uword *event_data = 0; + u32 sw_if_index; + u32 flags; + + while (1) + { + vlib_process_wait_for_event (vm); + + event_type = vlib_process_get_events (vm, &event_data); + + dpdk_main.admin_up_down_in_progress = 1; + + switch (event_type) + { + case UP_DOWN_FLAG_EVENT: + { + if (vec_len (event_data) == 2) + { + sw_if_index = event_data[0]; + flags = event_data[1]; + error = + vnet_sw_interface_set_flags (vnet_get_main (), sw_if_index, + flags); + clib_error_report (error); + } + } + break; + } + + vec_reset_length (event_data); + + dpdk_main.admin_up_down_in_progress = 0; + + } + return 0; /* or not */ +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (admin_up_down_process_node,static) = { 
+ .function = admin_up_down_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "admin-up-down-process", + .process_log2_n_stack_bytes = 17, // 256KB +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h new file mode 100644 index 00000000..2a1a6205 --- /dev/null +++ b/src/plugins/dpdk/device/dpdk.h @@ -0,0 +1,490 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __included_dpdk_h__ +#define __included_dpdk_h__ + +/* $$$$ We should rename always_inline -> clib_always_inline */ +#undef always_inline + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#if CLIB_DEBUG > 0 +#define always_inline static inline +#else +#define always_inline static inline __attribute__ ((__always_inline__)) +#endif + +#include + +#define NB_MBUF (16<<10) + +extern vnet_device_class_t dpdk_device_class; +extern vlib_node_registration_t dpdk_input_node; +extern vlib_node_registration_t handoff_dispatch_node; + +#define foreach_dpdk_pmd \ + _ ("net_thunderx", THUNDERX) \ + _ ("net_e1000_em", E1000EM) \ + _ ("net_e1000_igb", IGB) \ + _ ("net_e1000_igb_vf", IGBVF) \ + _ ("net_ixgbe", IXGBE) \ + _ ("net_ixgbe_vf", IXGBEVF) \ + _ ("net_i40e", I40E) \ + _ ("net_i40e_vf", I40EVF) \ + _ ("net_virtio", VIRTIO) \ + _ ("net_enic", ENIC) \ + _ ("net_vmxnet3", VMXNET3) \ + _ ("AF_PACKET PMD", AF_PACKET) \ + _ ("rte_bond_pmd", BOND) \ + _ ("net_fm10k", FM10K) \ + _ ("net_cxgbe", CXGBE) \ + _ ("net_mlx5", MLX5) \ + _ ("net_dpaa2", DPAA2) + +typedef enum +{ + VNET_DPDK_PMD_NONE, +#define _(s,f) VNET_DPDK_PMD_##f, + foreach_dpdk_pmd +#undef _ + VNET_DPDK_PMD_UNKNOWN, /* must be last */ +} dpdk_pmd_t; + +typedef enum +{ + VNET_DPDK_PORT_TYPE_ETH_1G, + VNET_DPDK_PORT_TYPE_ETH_10G, + VNET_DPDK_PORT_TYPE_ETH_40G, + VNET_DPDK_PORT_TYPE_ETH_100G, + VNET_DPDK_PORT_TYPE_ETH_BOND, + VNET_DPDK_PORT_TYPE_ETH_SWITCH, + VNET_DPDK_PORT_TYPE_AF_PACKET, + VNET_DPDK_PORT_TYPE_UNKNOWN, +} dpdk_port_type_t; + +/* + * The header for the tx_vector in dpdk_device_t. + * Head and tail are indexes into the tx_vector and are of type + * u64 so they never overflow. 
+ */ +typedef struct +{ + u64 tx_head; + u64 tx_tail; +} tx_ring_hdr_t; + +typedef struct +{ + struct rte_ring *swq; + + u64 hqos_field0_slabmask; + u32 hqos_field0_slabpos; + u32 hqos_field0_slabshr; + u64 hqos_field1_slabmask; + u32 hqos_field1_slabpos; + u32 hqos_field1_slabshr; + u64 hqos_field2_slabmask; + u32 hqos_field2_slabpos; + u32 hqos_field2_slabshr; + u32 hqos_tc_table[64]; +} dpdk_device_hqos_per_worker_thread_t; + +typedef struct +{ + struct rte_ring **swq; + struct rte_mbuf **pkts_enq; + struct rte_mbuf **pkts_deq; + struct rte_sched_port *hqos; + u32 hqos_burst_enq; + u32 hqos_burst_deq; + u32 pkts_enq_len; + u32 swq_pos; + u32 flush_count; +} dpdk_device_hqos_per_hqos_thread_t; + +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + volatile u32 **lockp; + + /* Instance ID */ + u32 device_index; + + u32 vlib_hw_if_index; + u32 vlib_sw_if_index; + + /* next node index if we decide to steal the rx graph arc */ + u32 per_interface_next_index; + + /* dpdk rte_mbuf rx and tx vectors, VLIB_FRAME_SIZE */ + struct rte_mbuf ***tx_vectors; /* one per worker thread */ + struct rte_mbuf ***rx_vectors; + + /* vector of traced contexts, per device */ + u32 **d_trace_buffers; + + dpdk_pmd_t pmd:8; + i8 cpu_socket; + + u16 flags; +#define DPDK_DEVICE_FLAG_ADMIN_UP (1 << 0) +#define DPDK_DEVICE_FLAG_PROMISC (1 << 1) +#define DPDK_DEVICE_FLAG_PMD (1 << 2) +#define DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE (1 << 3) +#define DPDK_DEVICE_FLAG_MAYBE_MULTISEG (1 << 4) +#define DPDK_DEVICE_FLAG_HAVE_SUBIF (1 << 5) +#define DPDK_DEVICE_FLAG_HQOS (1 << 6) + + u16 nb_tx_desc; + CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); + + u8 *interface_name_suffix; + + /* number of sub-interfaces */ + u16 num_subifs; + + /* PMD related */ + u16 tx_q_used; + u16 rx_q_used; + u16 nb_rx_desc; + u16 *cpu_socket_id_by_queue; + struct rte_eth_conf port_conf; + struct rte_eth_txconf tx_conf; + + /* HQoS related */ + dpdk_device_hqos_per_worker_thread_t *hqos_wt; + 
dpdk_device_hqos_per_hqos_thread_t *hqos_ht; + + /* af_packet */ + u8 af_packet_port_id; + + struct rte_eth_link link; + f64 time_last_link_update; + + struct rte_eth_stats stats; + struct rte_eth_stats last_stats; + struct rte_eth_stats last_cleared_stats; + struct rte_eth_xstat *xstats; + struct rte_eth_xstat *last_cleared_xstats; + f64 time_last_stats_update; + dpdk_port_type_t port_type; + + /* mac address */ + u8 *default_mac_address; +} dpdk_device_t; + +#define DPDK_STATS_POLL_INTERVAL (10.0) +#define DPDK_MIN_STATS_POLL_INTERVAL (0.001) /* 1msec */ + +#define DPDK_LINK_POLL_INTERVAL (3.0) +#define DPDK_MIN_LINK_POLL_INTERVAL (0.001) /* 1msec */ + +typedef struct +{ + u32 device; + u16 queue_id; +} dpdk_device_and_queue_t; + +#ifndef DPDK_HQOS_DBG_BYPASS +#define DPDK_HQOS_DBG_BYPASS 0 +#endif + +#ifndef HQOS_FLUSH_COUNT_THRESHOLD +#define HQOS_FLUSH_COUNT_THRESHOLD 100000 +#endif + +typedef struct dpdk_device_config_hqos_t +{ + u32 hqos_thread; + u32 hqos_thread_valid; + + u32 swq_size; + u32 burst_enq; + u32 burst_deq; + + u32 pktfield0_slabpos; + u32 pktfield1_slabpos; + u32 pktfield2_slabpos; + u64 pktfield0_slabmask; + u64 pktfield1_slabmask; + u64 pktfield2_slabmask; + u32 tc_table[64]; + + struct rte_sched_port_params port; + struct rte_sched_subport_params *subport; + struct rte_sched_pipe_params *pipe; + uint32_t *pipe_map; +} dpdk_device_config_hqos_t; + +int dpdk_hqos_validate_mask (u64 mask, u32 n); +void dpdk_device_config_hqos_pipe_profile_default (dpdk_device_config_hqos_t * + hqos, u32 pipe_profile_id); +void dpdk_device_config_hqos_default (dpdk_device_config_hqos_t * hqos); +clib_error_t *dpdk_port_setup_hqos (dpdk_device_t * xd, + dpdk_device_config_hqos_t * hqos); +void dpdk_hqos_metadata_set (dpdk_device_hqos_per_worker_thread_t * hqos, + struct rte_mbuf **pkts, u32 n_pkts); + +#define foreach_dpdk_device_config_item \ + _ (num_rx_queues) \ + _ (num_tx_queues) \ + _ (num_rx_desc) \ + _ (num_tx_desc) \ + _ (rss_fn) + +typedef struct +{ + 
vlib_pci_addr_t pci_addr; + u8 is_blacklisted; + u8 vlan_strip_offload; +#define DPDK_DEVICE_VLAN_STRIP_DEFAULT 0 +#define DPDK_DEVICE_VLAN_STRIP_OFF 1 +#define DPDK_DEVICE_VLAN_STRIP_ON 2 + +#define _(x) uword x; + foreach_dpdk_device_config_item +#undef _ + clib_bitmap_t * workers; + u32 hqos_enabled; + dpdk_device_config_hqos_t hqos; +} dpdk_device_config_t; + +typedef struct +{ + + /* Config stuff */ + u8 **eal_init_args; + u8 *eal_init_args_str; + u8 *uio_driver_name; + u8 no_multi_seg; + u8 enable_tcp_udp_checksum; + u8 cryptodev; + + /* Required config parameters */ + u8 coremask_set_manually; + u8 nchannels_set_manually; + u32 coremask; + u32 nchannels; + u32 num_mbufs; + u8 num_kni; /* while kni_init allows u32, port_id in callback fn is only u8 */ + + /* + * format interface names ala xxxEthernet%d/%d/%d instead of + * xxxEthernet%x/%x/%x. + */ + u8 interface_name_format_decimal; + + /* per-device config */ + dpdk_device_config_t default_devconf; + dpdk_device_config_t *dev_confs; + uword *device_config_index_by_pci_addr; + +} dpdk_config_main_t; + +dpdk_config_main_t dpdk_config_main; + +typedef struct +{ + + /* Devices */ + dpdk_device_t *devices; + dpdk_device_and_queue_t **devices_by_cpu; + dpdk_device_and_queue_t **devices_by_hqos_cpu; + + /* per-thread recycle lists */ + u32 **recycle; + + /* buffer flags template, configurable to enable/disable tcp / udp cksum */ + u32 buffer_flags_template; + + /* vlib buffer free list, must be same size as an rte_mbuf */ + u32 vlib_buffer_free_list_index; + + /* Ethernet input node index */ + u32 ethernet_input_node_index; + + /* pcap tracing [only works if (CLIB_DEBUG > 0)] */ + int tx_pcap_enable; + pcap_main_t pcap_main; + u8 *pcap_filename; + u32 pcap_sw_if_index; + u32 pcap_pkts_to_capture; + + /* hashes */ + uword *dpdk_device_by_kni_port_id; + uword *vu_sw_if_index_by_listener_fd; + uword *vu_sw_if_index_by_sock_fd; + u32 *vu_inactive_interfaces_device_index; + + /* + * flag indicating that a posted admin 
up/down + * (via post_sw_interface_set_flags) is in progress + */ + u8 admin_up_down_in_progress; + + u8 use_rss; + + /* which cpus are running dpdk-input */ + int input_cpu_first_index; + int input_cpu_count; + + /* which cpus are running I/O TX */ + int hqos_cpu_first_index; + int hqos_cpu_count; + + /* control interval of dpdk link state and stat polling */ + f64 link_state_poll_interval; + f64 stat_poll_interval; + + /* Sleep for this many MS after each device poll */ + u32 poll_sleep; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + dpdk_config_main_t *conf; + + /* mempool */ + struct rte_mempool **pktmbuf_pools; + + /* API message ID base */ + u16 msg_id_base; +} dpdk_main_t; + +dpdk_main_t dpdk_main; + +typedef struct +{ + u32 buffer_index; + u16 device_index; + u8 queue_index; + struct rte_mbuf mb; + /* Copy of VLIB buffer; packet data stored in pre_data. */ + vlib_buffer_t buffer; +} dpdk_tx_dma_trace_t; + +typedef struct +{ + u32 buffer_index; + u16 device_index; + u16 queue_index; + struct rte_mbuf mb; + vlib_buffer_t buffer; /* Copy of VLIB buffer; pkt data stored in pre_data. 
*/ + u8 data[256]; /* First 256 data bytes, used for hexdump */ +} dpdk_rx_dma_trace_t; + +void vnet_buffer_needs_dpdk_mb (vlib_buffer_t * b); + +clib_error_t *dpdk_set_mac_address (vnet_hw_interface_t * hi, char *address); + +clib_error_t *dpdk_set_mc_filter (vnet_hw_interface_t * hi, + struct ether_addr mc_addr_vec[], int naddr); + +void dpdk_thread_input (dpdk_main_t * dm, dpdk_device_t * xd); + +clib_error_t *dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd); + +u32 dpdk_interface_tx_vector (vlib_main_t * vm, u32 dev_instance); + +struct rte_mbuf *dpdk_replicate_packet_mb (vlib_buffer_t * b); +struct rte_mbuf *dpdk_zerocopy_replicate_packet_mb (vlib_buffer_t * b); + +#define foreach_dpdk_error \ + _(NONE, "no error") \ + _(RX_PACKET_ERROR, "Rx packet errors") \ + _(RX_BAD_FCS, "Rx bad fcs") \ + _(IP_CHECKSUM_ERROR, "Rx ip checksum errors") \ + _(RX_ALLOC_FAIL, "rx buf alloc from free list failed") \ + _(RX_ALLOC_NO_PHYSMEM, "rx buf alloc failed no physmem") \ + _(RX_ALLOC_DROP_PKTS, "rx packets dropped due to alloc error") + +typedef enum +{ +#define _(f,s) DPDK_ERROR_##f, + foreach_dpdk_error +#undef _ + DPDK_N_ERROR, +} dpdk_error_t; + +int dpdk_set_stat_poll_interval (f64 interval); +int dpdk_set_link_state_poll_interval (f64 interval); +void dpdk_update_link_state (dpdk_device_t * xd, f64 now); +void dpdk_device_lock_init (dpdk_device_t * xd); +void dpdk_device_lock_free (dpdk_device_t * xd); + +void dpdk_rx_trace (dpdk_main_t * dm, + vlib_node_runtime_t * node, + dpdk_device_t * xd, + u16 queue_id, u32 * buffers, uword n_buffers); + +#define EFD_OPERATION_LESS_THAN 0 +#define EFD_OPERATION_GREATER_OR_EQUAL 1 + +format_function_t format_dpdk_device_name; +format_function_t format_dpdk_device; +format_function_t format_dpdk_tx_dma_trace; +format_function_t format_dpdk_rx_dma_trace; +format_function_t format_dpdk_rte_mbuf; +format_function_t format_dpdk_rx_rte_mbuf; +unformat_function_t unformat_socket_mem; +clib_error_t *unformat_rss_fn 
(unformat_input_t * input, uword * rss_fn); +clib_error_t *unformat_hqos (unformat_input_t * input, + dpdk_device_config_hqos_t * hqos); + +uword +admin_up_down_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f); + +#endif /* __included_dpdk_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/dpdk_priv.h b/src/plugins/dpdk/device/dpdk_priv.h new file mode 100644 index 00000000..dd40ff48 --- /dev/null +++ b/src/plugins/dpdk/device/dpdk_priv.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define rte_mbuf_from_vlib_buffer(x) (((struct rte_mbuf *)x) - 1) +#define vlib_buffer_from_rte_mbuf(x) ((vlib_buffer_t *)(x+1)) + +#define DPDK_NB_RX_DESC_DEFAULT 1024 +#define DPDK_NB_TX_DESC_DEFAULT 1024 +#define DPDK_NB_RX_DESC_VIRTIO 256 +#define DPDK_NB_TX_DESC_VIRTIO 256 + +#define I40E_DEV_ID_SFP_XL710 0x1572 +#define I40E_DEV_ID_QSFP_A 0x1583 +#define I40E_DEV_ID_QSFP_B 0x1584 +#define I40E_DEV_ID_QSFP_C 0x1585 +#define I40E_DEV_ID_10G_BASE_T 0x1586 +#define I40E_DEV_ID_VF 0x154C + +/* These args appear by themselves */ +#define foreach_eal_double_hyphen_predicate_arg \ +_(no-shconf) \ +_(no-hpet) \ +_(no-huge) \ +_(vmware-tsc-map) + +#define foreach_eal_single_hyphen_mandatory_arg \ +_(coremask, c) \ +_(nchannels, n) \ + +#define foreach_eal_single_hyphen_arg \ +_(blacklist, b) \ +_(mem-alloc-request, m) \ +_(force-ranks, r) + +/* These args are preceeded by "--" and followed by a single string */ +#define foreach_eal_double_hyphen_arg \ +_(huge-dir) \ +_(proc-type) \ +_(file-prefix) \ +_(vdev) + +static inline void +dpdk_get_xstats (dpdk_device_t * xd) +{ + int len; + if ((len = rte_eth_xstats_get (xd->device_index, NULL, 0)) > 0) + { + vec_validate (xd->xstats, len - 1); + vec_validate (xd->last_cleared_xstats, len - 1); + + len = + rte_eth_xstats_get (xd->device_index, xd->xstats, + vec_len (xd->xstats)); + + ASSERT (vec_len (xd->xstats) == len); + ASSERT (vec_len (xd->last_cleared_xstats) == len); + + _vec_len (xd->xstats) = len; + _vec_len (xd->last_cleared_xstats) = len; + + } +} + + +static inline void +dpdk_update_counters (dpdk_device_t * xd, f64 now) +{ + vlib_simple_counter_main_t *cm; + vnet_main_t *vnm = vnet_get_main (); + u32 my_cpu = os_get_cpu_number (); + u64 rxerrors, last_rxerrors; + + /* only update counters for PMD interfaces */ + if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0) + return; + + xd->time_last_stats_update = now ? 
now : xd->time_last_stats_update; + clib_memcpy (&xd->last_stats, &xd->stats, sizeof (xd->last_stats)); + rte_eth_stats_get (xd->device_index, &xd->stats); + + /* maybe bump interface rx no buffer counter */ + if (PREDICT_FALSE (xd->stats.rx_nombuf != xd->last_stats.rx_nombuf)) + { + cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, + VNET_INTERFACE_COUNTER_RX_NO_BUF); + + vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + xd->stats.rx_nombuf - + xd->last_stats.rx_nombuf); + } + + /* missed pkt counter */ + if (PREDICT_FALSE (xd->stats.imissed != xd->last_stats.imissed)) + { + cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, + VNET_INTERFACE_COUNTER_RX_MISS); + + vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + xd->stats.imissed - + xd->last_stats.imissed); + } + rxerrors = xd->stats.ierrors; + last_rxerrors = xd->last_stats.ierrors; + + if (PREDICT_FALSE (rxerrors != last_rxerrors)) + { + cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, + VNET_INTERFACE_COUNTER_RX_ERROR); + + vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + rxerrors - last_rxerrors); + } + + dpdk_get_xstats (xd); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c new file mode 100644 index 00000000..25a8c5cb --- /dev/null +++ b/src/plugins/dpdk/device/format.c @@ -0,0 +1,754 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/*
 * X-macro tables used by the formatters in this file.  Each table is
 * expanded with a locally defined `_` macro.
 */

/* _(display name, field of struct rte_eth_stats) */
#define foreach_dpdk_counter                    \
  _ (tx_frames_ok, opackets)                    \
  _ (tx_bytes_ok, obytes)                       \
  _ (tx_errors, oerrors)                        \
  _ (rx_frames_ok, ipackets)                    \
  _ (rx_bytes_ok, ibytes)                       \
  _ (rx_errors, ierrors)                        \
  _ (rx_missed, imissed)                        \
  _ (rx_no_bufs, rx_nombuf)

/* _(display name, per-queue stats field) */
#define foreach_dpdk_q_counter                  \
  _ (rx_frames_ok, q_ipackets)                  \
  _ (tx_frames_ok, q_opackets)                  \
  _ (rx_bytes_ok, q_ibytes)                     \
  _ (tx_bytes_ok, q_obytes)                     \
  _ (rx_errors, q_errors)

/*
 * _(RSS hash-function flag, CLI/display name).
 * Longer names precede names they start with ("ipv4-frag" before "ipv4",
 * "ipv6-ex" before "ipv6"); keep that order when adding entries.
 */
#define foreach_dpdk_rss_hf                    \
  _(ETH_RSS_FRAG_IPV4,          "ipv4-frag")   \
  _(ETH_RSS_NONFRAG_IPV4_TCP,   "ipv4-tcp")    \
  _(ETH_RSS_NONFRAG_IPV4_UDP,   "ipv4-udp")    \
  _(ETH_RSS_NONFRAG_IPV4_SCTP,  "ipv4-sctp")   \
  _(ETH_RSS_NONFRAG_IPV4_OTHER, "ipv4-other")  \
  _(ETH_RSS_IPV4,               "ipv4")        \
  _(ETH_RSS_IPV6_TCP_EX,        "ipv6-tcp-ex") \
  _(ETH_RSS_IPV6_UDP_EX,        "ipv6-udp-ex") \
  _(ETH_RSS_FRAG_IPV6,          "ipv6-frag")   \
  _(ETH_RSS_NONFRAG_IPV6_TCP,   "ipv6-tcp")    \
  _(ETH_RSS_NONFRAG_IPV6_UDP,   "ipv6-udp")    \
  _(ETH_RSS_NONFRAG_IPV6_SCTP,  "ipv6-sctp")   \
  _(ETH_RSS_NONFRAG_IPV6_OTHER, "ipv6-other")  \
  _(ETH_RSS_L2_PAYLOAD,         "l2-payload")  \
  _(ETH_RSS_IPV6_EX,            "ipv6-ex")     \
  _(ETH_RSS_IPV6,               "ipv6")

/* _(rx offload capability flag, display name) */
#define foreach_dpdk_rx_offload_caps            \
  _(DEV_RX_OFFLOAD_VLAN_STRIP, "vlan-strip")    \
  _(DEV_RX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum")    \
  _(DEV_RX_OFFLOAD_UDP_CKSUM , "udp-cksum")     \
  _(DEV_RX_OFFLOAD_TCP_CKSUM , "tcp-cksum")     \
  _(DEV_RX_OFFLOAD_TCP_LRO   , "tcp-lro")       \
  _(DEV_RX_OFFLOAD_QINQ_STRIP, "qinq-strip")

/* _(tx offload capability flag, display name) */
#define foreach_dpdk_tx_offload_caps \
  _(DEV_TX_OFFLOAD_VLAN_INSERT, "vlan-insert")      \
  _(DEV_TX_OFFLOAD_IPV4_CKSUM,  "ipv4-cksum")       \
  _(DEV_TX_OFFLOAD_UDP_CKSUM  , "udp-cksum")        \
  _(DEV_TX_OFFLOAD_TCP_CKSUM  , "tcp-cksum")        \
  _(DEV_TX_OFFLOAD_SCTP_CKSUM , "sctp-cksum")       \
  _(DEV_TX_OFFLOAD_TCP_TSO    , "tcp-tso")          \
  _(DEV_TX_OFFLOAD_UDP_TSO    , "udp-tso")          \
  _(DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM, "outer-ipv4-cksum") \
  _(DEV_TX_OFFLOAD_QINQ_INSERT, "qinq-insert")

/* _(mbuf rx ol_flags bit, description) */
#define foreach_dpdk_pkt_rx_offload_flag                                \
  _ (PKT_RX_VLAN_PKT, "RX packet is a 802.1q VLAN packet")              \
  _ (PKT_RX_RSS_HASH, "RX packet with RSS hash result")                 \
  _ (PKT_RX_FDIR, "RX packet with FDIR infos")                          \
  _ (PKT_RX_L4_CKSUM_BAD, "L4 cksum of RX pkt. is not OK")              \
  _ (PKT_RX_IP_CKSUM_BAD, "IP cksum of RX pkt. is not OK")              \
  _ (PKT_RX_VLAN_STRIPPED, "RX packet VLAN tag stripped")               \
  _ (PKT_RX_IP_CKSUM_GOOD, "IP cksum of RX pkt. is valid")              \
  _ (PKT_RX_L4_CKSUM_GOOD, "L4 cksum of RX pkt. is valid")              \
  _ (PKT_RX_IEEE1588_PTP, "RX IEEE1588 L2 Ethernet PT Packet")          \
  _ (PKT_RX_IEEE1588_TMST, "RX IEEE1588 L2/L4 timestamped packet")      \
  _ (PKT_RX_QINQ_STRIPPED, "RX packet QinQ tags stripped")

/* _(layer, type, description): expands against RTE_PTYPE_<layer>_<type> */
#define foreach_dpdk_pkt_type                                           \
  _ (L2, ETHER, "Ethernet packet")                                      \
  _ (L2, ETHER_TIMESYNC, "Ethernet packet for time sync")               \
  _ (L2, ETHER_ARP, "ARP packet")                                       \
  _ (L2, ETHER_LLDP, "LLDP (Link Layer Discovery Protocol) packet")     \
  _ (L2, ETHER_NSH, "NSH (Network Service Header) packet")              \
  _ (L2, ETHER_VLAN, "VLAN packet")                                     \
  _ (L2, ETHER_QINQ, "QinQ packet")                                     \
  _ (L3, IPV4, "IPv4 packet without extension headers")                 \
  _ (L3, IPV4_EXT, "IPv4 packet with extension headers")                \
  _ (L3, IPV4_EXT_UNKNOWN, "IPv4 packet with or without extension headers") \
  _ (L3, IPV6, "IPv6 packet without extension headers")                 \
  _ (L3, IPV6_EXT, "IPv6 packet with extension headers")                \
  _ (L3, IPV6_EXT_UNKNOWN, "IPv6 packet with or without extension headers") \
  _ (L4, TCP, "TCP packet")                                             \
  _ (L4, UDP, "UDP packet")                                             \
  _ (L4, FRAG, "Fragmented IP packet")                                  \
  _ (L4, SCTP, "SCTP (Stream Control Transmission Protocol) packet")    \
  _ (L4, ICMP, "ICMP packet")                                           \
  _ (L4, NONFRAG, "Non-fragmented IP packet")                           \
  _ (TUNNEL, GRE, "GRE tunneling packet")                               \
  _ (TUNNEL, VXLAN, "VXLAN tunneling packet")                           \
  _ (TUNNEL, NVGRE, "NVGRE Tunneling packet")                           \
  _ (TUNNEL, GENEVE, "GENEVE Tunneling packet")                         \
  _ (TUNNEL, GRENAT, "Teredo, VXLAN or GRE Tunneling packet")           \
  _ (INNER_L2, ETHER, "Inner Ethernet packet")                          \
  _ (INNER_L2, ETHER_VLAN, "Inner Ethernet packet with VLAN")           \
  _ (INNER_L3, IPV4, "Inner IPv4 packet without extension headers")     \
  _ (INNER_L3, IPV4_EXT, "Inner IPv4 packet with extension headers")    \
  _ (INNER_L3, IPV4_EXT_UNKNOWN, "Inner IPv4 packet with or without extension headers") \
  _ (INNER_L3, IPV6, "Inner IPv6 packet without extension headers")     \
  _ (INNER_L3, IPV6_EXT, "Inner IPv6 packet with extension headers")    \
  _ (INNER_L3, IPV6_EXT_UNKNOWN, "Inner IPv6 packet with or without extension headers") \
  _ (INNER_L4, TCP, "Inner TCP packet")                                 \
  _ (INNER_L4, UDP, "Inner UDP packet")                                 \
  _ (INNER_L4, FRAG, "Inner fragmented IP packet")                      \
  _ (INNER_L4, SCTP, "Inner SCTP (Stream Control Transmission Protocol) packet") \
  _ (INNER_L4, ICMP, "Inner ICMP packet")                               \
  _ (INNER_L4, NONFRAG, "Inner non-fragmented IP packet")
computed by NIC") \ + _ (PKT_TX_IEEE1588_TMST, "TX IEEE1588 packet to timestamp") + +#define foreach_dpdk_pkt_offload_flag \ + foreach_dpdk_pkt_rx_offload_flag \ + foreach_dpdk_pkt_tx_offload_flag + +u8 * +format_dpdk_device_name (u8 * s, va_list * args) +{ + dpdk_main_t *dm = &dpdk_main; + char *devname_format; + char *device_name; + u32 i = va_arg (*args, u32); + struct rte_eth_dev_info dev_info; + u8 *ret; + + if (dm->conf->interface_name_format_decimal) + devname_format = "%s%d/%d/%d"; + else + devname_format = "%s%x/%x/%x"; + + switch (dm->devices[i].port_type) + { + case VNET_DPDK_PORT_TYPE_ETH_1G: + device_name = "GigabitEthernet"; + break; + + case VNET_DPDK_PORT_TYPE_ETH_10G: + device_name = "TenGigabitEthernet"; + break; + + case VNET_DPDK_PORT_TYPE_ETH_40G: + device_name = "FortyGigabitEthernet"; + break; + + case VNET_DPDK_PORT_TYPE_ETH_100G: + device_name = "HundredGigabitEthernet"; + break; + + case VNET_DPDK_PORT_TYPE_ETH_BOND: + return format (s, "BondEthernet%d", dm->devices[i].device_index); + + case VNET_DPDK_PORT_TYPE_ETH_SWITCH: + device_name = "EthernetSwitch"; + break; + + case VNET_DPDK_PORT_TYPE_AF_PACKET: + rte_eth_dev_info_get (i, &dev_info); + return format (s, "af_packet%d", dm->devices[i].af_packet_port_id); + + default: + case VNET_DPDK_PORT_TYPE_UNKNOWN: + device_name = "UnknownEthernet"; + break; + } + + rte_eth_dev_info_get (i, &dev_info); + + if (dev_info.pci_dev) + ret = format (s, devname_format, device_name, dev_info.pci_dev->addr.bus, + dev_info.pci_dev->addr.devid, + dev_info.pci_dev->addr.function); + else + ret = format (s, "%s%d", device_name, dm->devices[i].device_index); + + if (dm->devices[i].interface_name_suffix) + return format (ret, "/%s", dm->devices[i].interface_name_suffix); + return ret; +} + +static u8 * +format_dpdk_device_type (u8 * s, va_list * args) +{ + dpdk_main_t *dm = &dpdk_main; + char *dev_type; + u32 i = va_arg (*args, u32); + + switch (dm->devices[i].pmd) + { + case VNET_DPDK_PMD_E1000EM: + dev_type 
= "Intel 82540EM (e1000)"; + break; + + case VNET_DPDK_PMD_IGB: + dev_type = "Intel e1000"; + break; + + case VNET_DPDK_PMD_I40E: + dev_type = "Intel X710/XL710 Family"; + break; + + case VNET_DPDK_PMD_I40EVF: + dev_type = "Intel X710/XL710 Family VF"; + break; + + case VNET_DPDK_PMD_FM10K: + dev_type = "Intel FM10000 Family Ethernet Switch"; + break; + + case VNET_DPDK_PMD_IGBVF: + dev_type = "Intel e1000 VF"; + break; + + case VNET_DPDK_PMD_VIRTIO: + dev_type = "Red Hat Virtio"; + break; + + case VNET_DPDK_PMD_IXGBEVF: + dev_type = "Intel 82599 VF"; + break; + + case VNET_DPDK_PMD_IXGBE: + dev_type = "Intel 82599"; + break; + + case VNET_DPDK_PMD_ENIC: + dev_type = "Cisco VIC"; + break; + + case VNET_DPDK_PMD_CXGBE: + dev_type = "Chelsio T4/T5"; + break; + + case VNET_DPDK_PMD_MLX5: + dev_type = "Mellanox ConnectX-4 Family"; + break; + + case VNET_DPDK_PMD_VMXNET3: + dev_type = "VMware VMXNET3"; + break; + + case VNET_DPDK_PMD_AF_PACKET: + dev_type = "af_packet"; + break; + + case VNET_DPDK_PMD_BOND: + dev_type = "Ethernet Bonding"; + break; + + case VNET_DPDK_PMD_DPAA2: + dev_type = "NXP DPAA2 Mac"; + break; + + default: + case VNET_DPDK_PMD_UNKNOWN: + dev_type = "### UNKNOWN ###"; + break; + } + + return format (s, dev_type); +} + +static u8 * +format_dpdk_link_status (u8 * s, va_list * args) +{ + dpdk_device_t *xd = va_arg (*args, dpdk_device_t *); + struct rte_eth_link *l = &xd->link; + vnet_main_t *vnm = vnet_get_main (); + vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, xd->vlib_hw_if_index); + + s = format (s, "%s ", l->link_status ? "up" : "down"); + if (l->link_status) + { + u32 promisc = rte_eth_promiscuous_get (xd->device_index); + + s = format (s, "%s duplex ", (l->link_duplex == ETH_LINK_FULL_DUPLEX) ? + "full" : "half"); + s = format (s, "speed %u mtu %d %s\n", l->link_speed, + hi->max_packet_bytes, promisc ? 
" promisc" : ""); + } + else + s = format (s, "\n"); + + return s; +} + +#define _line_len 72 +#define _(v, str) \ +if (bitmap & v) { \ + if (format_get_indent (s) > next_split ) { \ + next_split += _line_len; \ + s = format(s,"\n%U", format_white_space, indent); \ + } \ + s = format(s, "%s ", str); \ +} + +static u8 * +format_dpdk_rss_hf_name (u8 * s, va_list * args) +{ + u64 bitmap = va_arg (*args, u64); + int next_split = _line_len; + int indent = format_get_indent (s); + + if (!bitmap) + return format (s, "none"); + + foreach_dpdk_rss_hf return s; +} + +static u8 * +format_dpdk_rx_offload_caps (u8 * s, va_list * args) +{ + u32 bitmap = va_arg (*args, u32); + int next_split = _line_len; + int indent = format_get_indent (s); + + if (!bitmap) + return format (s, "none"); + + foreach_dpdk_rx_offload_caps return s; +} + +static u8 * +format_dpdk_tx_offload_caps (u8 * s, va_list * args) +{ + u32 bitmap = va_arg (*args, u32); + int next_split = _line_len; + int indent = format_get_indent (s); + if (!bitmap) + return format (s, "none"); + + foreach_dpdk_tx_offload_caps return s; +} + +#undef _line_len +#undef _ + +u8 * +format_dpdk_device (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + int verbose = va_arg (*args, int); + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, dev_instance); + uword indent = format_get_indent (s); + f64 now = vlib_time_now (dm->vlib_main); + struct rte_eth_dev_info di; + + dpdk_update_counters (xd, now); + dpdk_update_link_state (xd, now); + + s = format (s, "%U\n%Ucarrier %U", + format_dpdk_device_type, xd->device_index, + format_white_space, indent + 2, format_dpdk_link_status, xd); + + rte_eth_dev_info_get (xd->device_index, &di); + + if (verbose > 1 && xd->flags & DPDK_DEVICE_FLAG_PMD) + { + struct rte_pci_device *pci; + struct rte_eth_rss_conf rss_conf; + int vlan_off; + int retval; + + rss_conf.rss_key = 0; + retval = rte_eth_dev_rss_hash_conf_get (xd->device_index, &rss_conf); + 
if (retval < 0) + clib_warning ("rte_eth_dev_rss_hash_conf_get returned %d", retval); + pci = di.pci_dev; + + if (pci) + s = + format (s, + "%Upci id: device %04x:%04x subsystem %04x:%04x\n" + "%Upci address: %04x:%02x:%02x.%02x\n", + format_white_space, indent + 2, pci->id.vendor_id, + pci->id.device_id, pci->id.subsystem_vendor_id, + pci->id.subsystem_device_id, format_white_space, indent + 2, + pci->addr.domain, pci->addr.bus, pci->addr.devid, + pci->addr.function); + s = + format (s, "%Umax rx packet len: %d\n", format_white_space, + indent + 2, di.max_rx_pktlen); + s = + format (s, "%Umax num of queues: rx %d tx %d\n", format_white_space, + indent + 2, di.max_rx_queues, di.max_tx_queues); + s = + format (s, "%Upromiscuous: unicast %s all-multicast %s\n", + format_white_space, indent + 2, + rte_eth_promiscuous_get (xd->device_index) ? "on" : "off", + rte_eth_promiscuous_get (xd->device_index) ? "on" : "off"); + vlan_off = rte_eth_dev_get_vlan_offload (xd->device_index); + s = format (s, "%Uvlan offload: strip %s filter %s qinq %s\n", + format_white_space, indent + 2, + vlan_off & ETH_VLAN_STRIP_OFFLOAD ? "on" : "off", + vlan_off & ETH_VLAN_FILTER_OFFLOAD ? "on" : "off", + vlan_off & ETH_VLAN_EXTEND_OFFLOAD ? 
"on" : "off"); + s = format (s, "%Urx offload caps: %U\n", + format_white_space, indent + 2, + format_dpdk_rx_offload_caps, di.rx_offload_capa); + s = format (s, "%Utx offload caps: %U\n", + format_white_space, indent + 2, + format_dpdk_tx_offload_caps, di.tx_offload_capa); + s = format (s, "%Urss active: %U\n" + "%Urss supported: %U\n", + format_white_space, indent + 2, + format_dpdk_rss_hf_name, rss_conf.rss_hf, + format_white_space, indent + 2, + format_dpdk_rss_hf_name, di.flow_type_rss_offloads); + } + + s = format (s, "%Urx queues %d, rx desc %d, tx queues %d, tx desc %d\n", + format_white_space, indent + 2, + xd->rx_q_used, xd->nb_rx_desc, xd->tx_q_used, xd->nb_tx_desc); + + if (xd->cpu_socket > -1) + s = format (s, "%Ucpu socket %d\n", + format_white_space, indent + 2, xd->cpu_socket); + + /* $$$ MIB counters */ + { +#define _(N, V) \ + if ((xd->stats.V - xd->last_cleared_stats.V) != 0) { \ + s = format (s, "\n%U%-40U%16Ld", \ + format_white_space, indent + 2, \ + format_c_identifier, #N, \ + xd->stats.V - xd->last_cleared_stats.V); \ + } \ + + foreach_dpdk_counter +#undef _ + } + + u8 *xs = 0; + u32 i = 0; + struct rte_eth_xstat *xstat, *last_xstat; + struct rte_eth_xstat_name *xstat_names = 0; + int len = rte_eth_xstats_get_names (xd->device_index, NULL, 0); + vec_validate (xstat_names, len - 1); + rte_eth_xstats_get_names (xd->device_index, xstat_names, len); + + ASSERT (vec_len (xd->xstats) == vec_len (xd->last_cleared_xstats)); + + /* *INDENT-OFF* */ + vec_foreach_index(i, xd->xstats) + { + u64 delta = 0; + xstat = vec_elt_at_index(xd->xstats, i); + last_xstat = vec_elt_at_index(xd->last_cleared_xstats, i); + + delta = xstat->value - last_xstat->value; + if (verbose == 2 || (verbose && delta)) + { + /* format_c_identifier doesn't like c strings inside vector */ + u8 * name = format(0,"%s", xstat_names[i].name); + xs = format(xs, "\n%U%-38U%16Ld", + format_white_space, indent + 4, + format_c_identifier, name, delta); + vec_free(name); + } + } + /* 
*INDENT-ON* */ + + vec_free (xstat_names); + + if (xs) + { + s = format (s, "\n%Uextended stats:%v", + format_white_space, indent + 2, xs); + vec_free (xs); + } + + return s; +} + +u8 * +format_dpdk_tx_dma_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main (); + dpdk_tx_dma_trace_t *t = va_arg (*va, dpdk_tx_dma_trace_t *); + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, t->device_index); + uword indent = format_get_indent (s); + vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); + + s = format (s, "%U tx queue %d", + format_vnet_sw_interface_name, vnm, sw, t->queue_index); + + s = format (s, "\n%Ubuffer 0x%x: %U", + format_white_space, indent, + t->buffer_index, format_vlib_buffer, &t->buffer); + + s = format (s, "\n%U%U", format_white_space, indent, + format_ethernet_header_with_length, t->buffer.pre_data, + sizeof (t->buffer.pre_data)); + + return s; +} + +u8 * +format_dpdk_rx_dma_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main (); + dpdk_rx_dma_trace_t *t = va_arg (*va, dpdk_rx_dma_trace_t *); + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, t->device_index); + format_function_t *f; + uword indent = format_get_indent (s); + vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); + + s = format (s, "%U rx queue %d", + format_vnet_sw_interface_name, vnm, sw, t->queue_index); + + s = format (s, "\n%Ubuffer 0x%x: %U", + format_white_space, indent, + t->buffer_index, format_vlib_buffer, &t->buffer); + + s = format (s, "\n%U%U", + format_white_space, indent, + format_dpdk_rte_mbuf, &t->mb, &t->data); + + if (vm->trace_main.verbose) 
+ { + s = format (s, "\n%UPacket Dump%s", format_white_space, indent + 2, + t->mb.data_len > sizeof (t->data) ? " (truncated)" : ""); + s = format (s, "\n%U%U", format_white_space, indent + 4, + format_hexdump, &t->data, + t->mb.data_len > + sizeof (t->data) ? sizeof (t->data) : t->mb.data_len); + } + f = node->format_buffer; + if (!f) + f = format_hex_bytes; + s = format (s, "\n%U%U", format_white_space, indent, + f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); + + return s; +} + + +static inline u8 * +format_dpdk_pkt_types (u8 * s, va_list * va) +{ + u32 *pkt_types = va_arg (*va, u32 *); + uword indent __attribute__ ((unused)) = format_get_indent (s) + 2; + + if (!*pkt_types) + return s; + + s = format (s, "Packet Types"); + +#define _(L, F, S) \ + if ((*pkt_types & RTE_PTYPE_##L##_MASK) == RTE_PTYPE_##L##_##F) \ + { \ + s = format (s, "\n%U%s (0x%04x) %s", format_white_space, indent, \ + "RTE_PTYPE_" #L "_" #F, RTE_PTYPE_##L##_##F, S); \ + } + + foreach_dpdk_pkt_type +#undef _ + return s; +} + +static inline u8 * +format_dpdk_pkt_offload_flags (u8 * s, va_list * va) +{ + u64 *ol_flags = va_arg (*va, u64 *); + uword indent = format_get_indent (s) + 2; + + if (!*ol_flags) + return s; + + s = format (s, "Packet Offload Flags"); + +#define _(F, S) \ + if (*ol_flags & F) \ + { \ + s = format (s, "\n%U%s (0x%04x) %s", \ + format_white_space, indent, #F, F, S); \ + } + + foreach_dpdk_pkt_offload_flag +#undef _ + return s; +} + +u8 * +format_dpdk_rte_mbuf_vlan (u8 * s, va_list * va) +{ + ethernet_vlan_header_tv_t *vlan_hdr = + va_arg (*va, ethernet_vlan_header_tv_t *); + + if (clib_net_to_host_u16 (vlan_hdr->type) == ETHERNET_TYPE_DOT1AD) + { + s = format (s, "%U 802.1q vlan ", + format_ethernet_vlan_tci, + clib_net_to_host_u16 (vlan_hdr->priority_cfi_and_id)); + vlan_hdr++; + } + + s = format (s, "%U", + format_ethernet_vlan_tci, + clib_net_to_host_u16 (vlan_hdr->priority_cfi_and_id)); + + return s; +} + +u8 * +format_dpdk_rte_mbuf (u8 * s, va_list * va) +{ + 
struct rte_mbuf *mb = va_arg (*va, struct rte_mbuf *); + ethernet_header_t *eth_hdr = va_arg (*va, ethernet_header_t *); + uword indent = format_get_indent (s) + 2; + + s = format (s, "PKT MBUF: port %d, nb_segs %d, pkt_len %d" + "\n%Ubuf_len %d, data_len %d, ol_flags 0x%x, data_off %d, phys_addr 0x%x" + "\n%Upacket_type 0x%x", + mb->port, mb->nb_segs, mb->pkt_len, + format_white_space, indent, + mb->buf_len, mb->data_len, mb->ol_flags, mb->data_off, + mb->buf_physaddr, format_white_space, indent, mb->packet_type); + + if (mb->ol_flags) + s = format (s, "\n%U%U", format_white_space, indent, + format_dpdk_pkt_offload_flags, &mb->ol_flags); + + if ((mb->ol_flags & PKT_RX_VLAN_PKT) && + ((mb->ol_flags & (PKT_RX_VLAN_STRIPPED | PKT_RX_QINQ_STRIPPED)) == 0)) + { + ethernet_vlan_header_tv_t *vlan_hdr = + ((ethernet_vlan_header_tv_t *) & (eth_hdr->type)); + s = format (s, " %U", format_dpdk_rte_mbuf_vlan, vlan_hdr); + } + + if (mb->packet_type) + s = format (s, "\n%U%U", format_white_space, indent, + format_dpdk_pkt_types, &mb->packet_type); + + return s; +} + +/* FIXME is this function used? 
*/ +#if 0 +uword +unformat_socket_mem (unformat_input_t * input, va_list * va) +{ + uword **r = va_arg (*va, uword **); + int i = 0; + u32 mem; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, ",")) + hash_set (*r, i, 1024); + else if (unformat (input, "%u,", &mem)) + hash_set (*r, i, mem); + else if (unformat (input, "%u", &mem)) + hash_set (*r, i, mem); + else + { + unformat_put_input (input); + goto done; + } + i++; + } + +done: + return 1; +} +#endif + +clib_error_t * +unformat_rss_fn (unformat_input_t * input, uword * rss_fn) +{ + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (0) + ; +#undef _ +#define _(f, s) \ + else if (unformat (input, s)) \ + *rss_fn |= f; + + foreach_dpdk_rss_hf +#undef _ + else + { + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + } + return 0; +} + +clib_error_t * +unformat_hqos (unformat_input_t * input, dpdk_device_config_hqos_t * hqos) +{ + clib_error_t *error = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "hqos-thread %u", &hqos->hqos_thread)) + hqos->hqos_thread_valid = 1; + else + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + break; + } + } + + return error; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c new file mode 100644 index 00000000..8824d789 --- /dev/null +++ b/src/plugins/dpdk/device/node.c @@ -0,0 +1,674 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +static char *dpdk_error_strings[] = { +#define _(n,s) s, + foreach_dpdk_error +#undef _ +}; + +always_inline int +vlib_buffer_is_ip4 (vlib_buffer_t * b) +{ + ethernet_header_t *h = (ethernet_header_t *) b->data; + return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP4)); +} + +always_inline int +vlib_buffer_is_ip6 (vlib_buffer_t * b) +{ + ethernet_header_t *h = (ethernet_header_t *) b->data; + return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP6)); +} + +always_inline int +vlib_buffer_is_mpls (vlib_buffer_t * b) +{ + ethernet_header_t *h = (ethernet_header_t *) b->data; + return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS_UNICAST)); +} + +always_inline u32 +dpdk_rx_next_from_etype (struct rte_mbuf * mb, vlib_buffer_t * b0) +{ + if (PREDICT_TRUE (vlib_buffer_is_ip4 (b0))) + if (PREDICT_TRUE ((mb->ol_flags & PKT_RX_IP_CKSUM_GOOD) != 0)) + return VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT; + else + return VNET_DEVICE_INPUT_NEXT_IP4_INPUT; + else if (PREDICT_TRUE (vlib_buffer_is_ip6 (b0))) + return VNET_DEVICE_INPUT_NEXT_IP6_INPUT; + else if (PREDICT_TRUE (vlib_buffer_is_mpls (b0))) + return VNET_DEVICE_INPUT_NEXT_MPLS_INPUT; + else + return VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; +} + +always_inline int +dpdk_mbuf_is_vlan (struct rte_mbuf *mb) +{ + return (mb->packet_type & RTE_PTYPE_L2_ETHER_VLAN) == + RTE_PTYPE_L2_ETHER_VLAN; +} + +always_inline int +dpdk_mbuf_is_ip4 (struct rte_mbuf *mb) +{ + 
return RTE_ETH_IS_IPV4_HDR (mb->packet_type) != 0; +} + +always_inline int +dpdk_mbuf_is_ip6 (struct rte_mbuf *mb) +{ + return RTE_ETH_IS_IPV6_HDR (mb->packet_type) != 0; +} + +always_inline u32 +dpdk_rx_next_from_mb (struct rte_mbuf * mb, vlib_buffer_t * b0) +{ + if (PREDICT_FALSE (dpdk_mbuf_is_vlan (mb))) + return VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; + else if (PREDICT_TRUE (dpdk_mbuf_is_ip4 (mb))) + return VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT; + else if (PREDICT_TRUE (dpdk_mbuf_is_ip6 (mb))) + return VNET_DEVICE_INPUT_NEXT_IP6_INPUT; + else if (PREDICT_TRUE (vlib_buffer_is_mpls (b0))) + return VNET_DEVICE_INPUT_NEXT_MPLS_INPUT; + else + return dpdk_rx_next_from_etype (mb, b0); +} + +always_inline void +dpdk_rx_error_from_mb (struct rte_mbuf *mb, u32 * next, u8 * error) +{ + if (mb->ol_flags & PKT_RX_IP_CKSUM_BAD) + { + *error = DPDK_ERROR_IP_CHECKSUM_ERROR; + *next = VNET_DEVICE_INPUT_NEXT_DROP; + } + else + *error = DPDK_ERROR_NONE; +} + +void +dpdk_rx_trace (dpdk_main_t * dm, + vlib_node_runtime_t * node, + dpdk_device_t * xd, + u16 queue_id, u32 * buffers, uword n_buffers) +{ + vlib_main_t *vm = vlib_get_main (); + u32 *b, n_left; + u32 next0; + + n_left = n_buffers; + b = buffers; + + while (n_left >= 1) + { + u32 bi0; + vlib_buffer_t *b0; + dpdk_rx_dma_trace_t *t0; + struct rte_mbuf *mb; + u8 error0; + + bi0 = b[0]; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + mb = rte_mbuf_from_vlib_buffer (b0); + + if (PREDICT_FALSE (xd->per_interface_next_index != ~0)) + next0 = xd->per_interface_next_index; + else if (PREDICT_TRUE + ((xd->flags & DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE) != 0)) + next0 = dpdk_rx_next_from_mb (mb, b0); + else + next0 = dpdk_rx_next_from_etype (mb, b0); + + dpdk_rx_error_from_mb (mb, &next0, &error0); + + vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0); + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->queue_index = queue_id; + t0->device_index = xd->device_index; + t0->buffer_index = bi0; + + clib_memcpy 
(&t0->mb, mb, sizeof (t0->mb)); + clib_memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + clib_memcpy (t0->buffer.pre_data, b0->data, + sizeof (t0->buffer.pre_data)); + clib_memcpy (&t0->data, mb->buf_addr + mb->data_off, sizeof (t0->data)); + + b += 1; + } +} + +static inline u32 +dpdk_rx_burst (dpdk_main_t * dm, dpdk_device_t * xd, u16 queue_id) +{ + u32 n_buffers; + u32 n_left; + u32 n_this_chunk; + + n_left = VLIB_FRAME_SIZE; + n_buffers = 0; + + if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_PMD)) + { + while (n_left) + { + n_this_chunk = rte_eth_rx_burst (xd->device_index, queue_id, + xd->rx_vectors[queue_id] + + n_buffers, n_left); + n_buffers += n_this_chunk; + n_left -= n_this_chunk; + + /* Empirically, DPDK r1.8 produces vectors w/ 32 or fewer elts */ + if (n_this_chunk < 32) + break; + } + } + else + { + ASSERT (0); + } + + return n_buffers; +} + + +static_always_inline void +dpdk_process_subseq_segs (vlib_main_t * vm, vlib_buffer_t * b, + struct rte_mbuf *mb, vlib_buffer_free_list_t * fl) +{ + u8 nb_seg = 1; + struct rte_mbuf *mb_seg = 0; + vlib_buffer_t *b_seg, *b_chain = 0; + mb_seg = mb->next; + b_chain = b; + + while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs)) + { + ASSERT (mb_seg != 0); + + b_seg = vlib_buffer_from_rte_mbuf (mb_seg); + vlib_buffer_init_for_free_list (b_seg, fl); + + ASSERT ((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0); + ASSERT (b_seg->current_data == 0); + + /* + * The driver (e.g. virtio) may not put the packet data at the start + * of the segment, so don't assume b_seg->current_data == 0 is correct. 
+ */ + b_seg->current_data = + (mb_seg->buf_addr + mb_seg->data_off) - (void *) b_seg->data; + + b_seg->current_length = mb_seg->data_len; + b->total_length_not_including_first_buffer += mb_seg->data_len; + + b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT; + b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg); + + b_chain = b_seg; + mb_seg = mb_seg->next; + nb_seg++; + } +} + +static_always_inline void +dpdk_prefetch_buffer (struct rte_mbuf *mb) +{ + vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb); + CLIB_PREFETCH (mb, CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (b, CLIB_CACHE_LINE_BYTES, STORE); +} + +/* + * This function is used when there are no worker threads. + * The main thread performs IO and forwards the packets. + */ +static_always_inline u32 +dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, + vlib_node_runtime_t * node, u32 cpu_index, u16 queue_id) +{ + u32 n_buffers; + u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; + u32 n_left_to_next, *to_next; + u32 mb_index; + vlib_main_t *vm = vlib_get_main (); + uword n_rx_bytes = 0; + u32 n_trace, trace_cnt __attribute__ ((unused)); + vlib_buffer_free_list_t *fl; + u32 buffer_flags_template; + + if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0) + return 0; + + n_buffers = dpdk_rx_burst (dm, xd, queue_id); + + if (n_buffers == 0) + { + return 0; + } + + buffer_flags_template = dm->buffer_flags_template; + + vec_reset_length (xd->d_trace_buffers[cpu_index]); + trace_cnt = n_trace = vlib_get_trace_count (vm, node); + + if (n_trace > 0) + { + u32 n = clib_min (n_trace, n_buffers); + mb_index = 0; + + while (n--) + { + struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index++]; + vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb); + vec_add1 (xd->d_trace_buffers[cpu_index], + vlib_get_buffer_index (vm, b)); + } + } + + fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + mb_index = 0; + + while (n_buffers > 0) + { + vlib_buffer_t *b0, *b1, *b2, *b3; + u32 bi0, next0, 
l3_offset0; + u32 bi1, next1, l3_offset1; + u32 bi2, next2, l3_offset2; + u32 bi3, next3, l3_offset3; + u8 error0, error1, error2, error3; + u64 or_ol_flags; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_buffers > 8 && n_left_to_next > 4) + { + struct rte_mbuf *mb0 = xd->rx_vectors[queue_id][mb_index]; + struct rte_mbuf *mb1 = xd->rx_vectors[queue_id][mb_index + 1]; + struct rte_mbuf *mb2 = xd->rx_vectors[queue_id][mb_index + 2]; + struct rte_mbuf *mb3 = xd->rx_vectors[queue_id][mb_index + 3]; + + dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 4]); + dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 5]); + dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 6]); + dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 7]); + + if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG) + { + if (PREDICT_FALSE (mb0->nb_segs > 1)) + dpdk_prefetch_buffer (mb0->next); + if (PREDICT_FALSE (mb1->nb_segs > 1)) + dpdk_prefetch_buffer (mb1->next); + if (PREDICT_FALSE (mb2->nb_segs > 1)) + dpdk_prefetch_buffer (mb2->next); + if (PREDICT_FALSE (mb3->nb_segs > 1)) + dpdk_prefetch_buffer (mb3->next); + } + + ASSERT (mb0); + ASSERT (mb1); + ASSERT (mb2); + ASSERT (mb3); + + or_ol_flags = (mb0->ol_flags | mb1->ol_flags | + mb2->ol_flags | mb3->ol_flags); + b0 = vlib_buffer_from_rte_mbuf (mb0); + b1 = vlib_buffer_from_rte_mbuf (mb1); + b2 = vlib_buffer_from_rte_mbuf (mb2); + b3 = vlib_buffer_from_rte_mbuf (mb3); + + vlib_buffer_init_for_free_list (b0, fl); + vlib_buffer_init_for_free_list (b1, fl); + vlib_buffer_init_for_free_list (b2, fl); + vlib_buffer_init_for_free_list (b3, fl); + + bi0 = vlib_get_buffer_index (vm, b0); + bi1 = vlib_get_buffer_index (vm, b1); + bi2 = vlib_get_buffer_index (vm, b2); + bi3 = vlib_get_buffer_index (vm, b3); + + to_next[0] = bi0; + to_next[1] = bi1; + to_next[2] = bi2; + to_next[3] = bi3; + to_next += 4; + n_left_to_next -= 4; + + if (PREDICT_FALSE (xd->per_interface_next_index != ~0)) + { + 
next0 = next1 = next2 = next3 = xd->per_interface_next_index; + } + else if (PREDICT_TRUE + ((xd->flags & DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE) != 0)) + { + next0 = dpdk_rx_next_from_mb (mb0, b0); + next1 = dpdk_rx_next_from_mb (mb1, b1); + next2 = dpdk_rx_next_from_mb (mb2, b2); + next3 = dpdk_rx_next_from_mb (mb3, b3); + } + else + { + next0 = dpdk_rx_next_from_etype (mb0, b0); + next1 = dpdk_rx_next_from_etype (mb1, b1); + next2 = dpdk_rx_next_from_etype (mb2, b2); + next3 = dpdk_rx_next_from_etype (mb3, b3); + } + + if (PREDICT_FALSE (or_ol_flags & PKT_RX_IP_CKSUM_BAD)) + { + dpdk_rx_error_from_mb (mb0, &next0, &error0); + dpdk_rx_error_from_mb (mb1, &next1, &error1); + dpdk_rx_error_from_mb (mb2, &next2, &error2); + dpdk_rx_error_from_mb (mb3, &next3, &error3); + b0->error = node->errors[error0]; + b1->error = node->errors[error1]; + b2->error = node->errors[error2]; + b3->error = node->errors[error3]; + } + else + { + b0->error = b1->error = node->errors[DPDK_ERROR_NONE]; + b2->error = b3->error = node->errors[DPDK_ERROR_NONE]; + } + + l3_offset0 = device_input_next_node_advance[next0]; + l3_offset1 = device_input_next_node_advance[next1]; + l3_offset2 = device_input_next_node_advance[next2]; + l3_offset3 = device_input_next_node_advance[next3]; + + b0->current_data = l3_offset0 + mb0->data_off; + b1->current_data = l3_offset1 + mb1->data_off; + b2->current_data = l3_offset2 + mb2->data_off; + b3->current_data = l3_offset3 + mb3->data_off; + + b0->current_data -= RTE_PKTMBUF_HEADROOM; + b1->current_data -= RTE_PKTMBUF_HEADROOM; + b2->current_data -= RTE_PKTMBUF_HEADROOM; + b3->current_data -= RTE_PKTMBUF_HEADROOM; + + b0->current_length = mb0->data_len - l3_offset0; + b1->current_length = mb1->data_len - l3_offset1; + b2->current_length = mb2->data_len - l3_offset2; + b3->current_length = mb3->data_len - l3_offset3; + + b0->flags = buffer_flags_template; + b1->flags = buffer_flags_template; + b2->flags = buffer_flags_template; + b3->flags = 
buffer_flags_template; + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b1)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b2)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b3)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; + vnet_buffer (b2)->sw_if_index[VLIB_TX] = (u32) ~ 0; + vnet_buffer (b3)->sw_if_index[VLIB_TX] = (u32) ~ 0; + + n_rx_bytes += mb0->pkt_len; + n_rx_bytes += mb1->pkt_len; + n_rx_bytes += mb2->pkt_len; + n_rx_bytes += mb3->pkt_len; + + /* Process subsequent segments of multi-segment packets */ + if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG) + { + dpdk_process_subseq_segs (vm, b0, mb0, fl); + dpdk_process_subseq_segs (vm, b1, mb1, fl); + dpdk_process_subseq_segs (vm, b2, mb2, fl); + dpdk_process_subseq_segs (vm, b3, mb3, fl); + } + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... See main.c... + */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b2); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b3); + + /* Do we have any driver RX features configured on the interface? */ + vnet_feature_start_device_input_x4 (xd->vlib_sw_if_index, + &next0, &next1, &next2, &next3, + b0, b1, b2, b3, + l3_offset0, l3_offset1, + l3_offset2, l3_offset3); + + vlib_validate_buffer_enqueue_x4 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, bi2, bi3, + next0, next1, next2, next3); + n_buffers -= 4; + mb_index += 4; + } + while (n_buffers > 0 && n_left_to_next > 0) + { + struct rte_mbuf *mb0 = xd->rx_vectors[queue_id][mb_index]; + + ASSERT (mb0); + + b0 = vlib_buffer_from_rte_mbuf (mb0); + + /* Prefetch one next segment if it exists. 
*/ + if (PREDICT_FALSE (mb0->nb_segs > 1)) + dpdk_prefetch_buffer (mb0->next); + + vlib_buffer_init_for_free_list (b0, fl); + + bi0 = vlib_get_buffer_index (vm, b0); + + to_next[0] = bi0; + to_next++; + n_left_to_next--; + + if (PREDICT_FALSE (xd->per_interface_next_index != ~0)) + next0 = xd->per_interface_next_index; + else if (PREDICT_TRUE + ((xd->flags & DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE) != 0)) + next0 = dpdk_rx_next_from_mb (mb0, b0); + else + next0 = dpdk_rx_next_from_etype (mb0, b0); + + dpdk_rx_error_from_mb (mb0, &next0, &error0); + b0->error = node->errors[error0]; + + l3_offset0 = device_input_next_node_advance[next0]; + + b0->current_data = l3_offset0; + b0->current_data += mb0->data_off - RTE_PKTMBUF_HEADROOM; + b0->current_length = mb0->data_len - l3_offset0; + + b0->flags = buffer_flags_template; + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + n_rx_bytes += mb0->pkt_len; + + /* Process subsequent segments of multi-segment packets */ + dpdk_process_subseq_segs (vm, b0, mb0, fl); + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... See main.c... + */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + + /* Do we have any driver RX features configured on the interface? 
*/ + vnet_feature_start_device_input_x1 (xd->vlib_sw_if_index, &next0, + b0, l3_offset0); + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + n_buffers--; + mb_index++; + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[cpu_index]) > 0)) + { + dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers[cpu_index], + vec_len (xd->d_trace_buffers[cpu_index])); + vlib_set_trace_count (vm, node, n_trace - + vec_len (xd->d_trace_buffers[cpu_index])); + } + + vlib_increment_combined_counter + (vnet_get_main ()->interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + cpu_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes); + + vnet_device_increment_rx_packets (cpu_index, mb_index); + + return mb_index; +} + +static inline void +poll_rate_limit (dpdk_main_t * dm) +{ + /* Limit the poll rate by sleeping for N msec between polls */ + if (PREDICT_FALSE (dm->poll_sleep != 0)) + { + struct timespec ts, tsrem; + + ts.tv_sec = 0; + ts.tv_nsec = 1000 * 1000 * dm->poll_sleep; /* 1ms */ + + while (nanosleep (&ts, &tsrem) < 0) + { + ts = tsrem; + } + } +} + +/** \brief Main DPDK input node + @node dpdk-input + + This is the main DPDK input node: across each assigned interface, + call rte_eth_rx_burst(...) or similar to obtain a vector of + packets to process. Handle early packet discard. Derive @c + vlib_buffer_t metadata from struct rte_mbuf metadata, + Depending on the resulting metadata: adjust b->current_data, + b->current_length and dispatch directly to + ip4-input-no-checksum, or ip6-input. Trace the packet if required. + + @param vm vlib_main_t corresponding to the current thread + @param node vlib_node_runtime_t + @param f vlib_frame_t input-node, not used. 
+ + @par Graph mechanics: buffer metadata, next index usage + + @em Uses: + - struct rte_mbuf mb->ol_flags + - PKT_RX_IP_CKSUM_BAD + - RTE_ETH_IS_xxx_HDR(mb->packet_type) + - packet classification result + + @em Sets: + - b->error if the packet is to be dropped immediately + - b->current_data, b->current_length + - adjusted as needed to skip the L2 header in direct-dispatch cases + - vnet_buffer(b)->sw_if_index[VLIB_RX] + - rx interface sw_if_index + - vnet_buffer(b)->sw_if_index[VLIB_TX] = ~0 + - required by ipX-lookup + - b->flags + - to indicate multi-segment pkts (VLIB_BUFFER_NEXT_PRESENT), etc. + + Next Nodes: + - Static arcs to: error-drop, ethernet-input, + ip4-input-no-checksum, ip6-input, mpls-input + - per-interface redirection, controlled by + xd->per_interface_next_index +*/ + +static uword +dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) +{ + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd; + uword n_rx_packets = 0; + dpdk_device_and_queue_t *dq; + u32 cpu_index = os_get_cpu_number (); + + /* + * Poll all devices on this cpu for input/interrupts. + */ + /* *INDENT-OFF* */ + vec_foreach (dq, dm->devices_by_cpu[cpu_index]) + { + xd = vec_elt_at_index(dm->devices, dq->device); + n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id); + } + /* *INDENT-ON* */ + + poll_rate_limit (dm); + + return n_rx_packets; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (dpdk_input_node) = { + .function = dpdk_input, + .type = VLIB_NODE_TYPE_INPUT, + .name = "dpdk-input", + .sibling_of = "device-input", + + /* Will be enabled if/when hardware is detected. 
*/ + .state = VLIB_NODE_STATE_DISABLED, + + .format_buffer = format_ethernet_header_with_length, + .format_trace = format_dpdk_rx_dma_trace, + + .n_errors = DPDK_N_ERROR, + .error_strings = dpdk_error_strings, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (dpdk_input_node, dpdk_input); +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ -- cgit 1.2.3-korg From 584282a1d2a283a2aefce6b543da022d95356c4e Mon Sep 17 00:00:00 2001 From: John Lo Date: Tue, 28 Feb 2017 16:34:35 -0500 Subject: Determine pkt type in dpdk-input node using ethertype only (VPP-647) Remove reliance on DPDK driver provided mbuf packet type in dpdk-input node as some NIC driver provide misleading information. Now using ethertype from the packet itself to determine packet type for next node. Change-Id: Ie7b514a984f9382c29f1a1e3eb423d68f817c064 Signed-off-by: John Lo --- src/plugins/dpdk/device/node.c | 74 +++++++++++++----------------------------- 1 file changed, 22 insertions(+), 52 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index 8824d789..04c41655 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -59,10 +59,12 @@ always_inline u32 dpdk_rx_next_from_etype (struct rte_mbuf * mb, vlib_buffer_t * b0) { if (PREDICT_TRUE (vlib_buffer_is_ip4 (b0))) - if (PREDICT_TRUE ((mb->ol_flags & PKT_RX_IP_CKSUM_GOOD) != 0)) - return VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT; - else - return VNET_DEVICE_INPUT_NEXT_IP4_INPUT; + { + if (PREDICT_TRUE ((mb->ol_flags & PKT_RX_IP_CKSUM_GOOD) != 0)) + return VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT; + else + return VNET_DEVICE_INPUT_NEXT_IP4_INPUT; + } else if (PREDICT_TRUE (vlib_buffer_is_ip6 (b0))) return VNET_DEVICE_INPUT_NEXT_IP6_INPUT; else if (PREDICT_TRUE (vlib_buffer_is_mpls (b0))) @@ -71,40 +73,6 @@ dpdk_rx_next_from_etype (struct rte_mbuf * mb, vlib_buffer_t * b0) return 
VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; } -always_inline int -dpdk_mbuf_is_vlan (struct rte_mbuf *mb) -{ - return (mb->packet_type & RTE_PTYPE_L2_ETHER_VLAN) == - RTE_PTYPE_L2_ETHER_VLAN; -} - -always_inline int -dpdk_mbuf_is_ip4 (struct rte_mbuf *mb) -{ - return RTE_ETH_IS_IPV4_HDR (mb->packet_type) != 0; -} - -always_inline int -dpdk_mbuf_is_ip6 (struct rte_mbuf *mb) -{ - return RTE_ETH_IS_IPV6_HDR (mb->packet_type) != 0; -} - -always_inline u32 -dpdk_rx_next_from_mb (struct rte_mbuf * mb, vlib_buffer_t * b0) -{ - if (PREDICT_FALSE (dpdk_mbuf_is_vlan (mb))) - return VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; - else if (PREDICT_TRUE (dpdk_mbuf_is_ip4 (mb))) - return VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT; - else if (PREDICT_TRUE (dpdk_mbuf_is_ip6 (mb))) - return VNET_DEVICE_INPUT_NEXT_IP6_INPUT; - else if (PREDICT_TRUE (vlib_buffer_is_mpls (b0))) - return VNET_DEVICE_INPUT_NEXT_MPLS_INPUT; - else - return dpdk_rx_next_from_etype (mb, b0); -} - always_inline void dpdk_rx_error_from_mb (struct rte_mbuf *mb, u32 * next, u8 * error) { @@ -146,9 +114,6 @@ dpdk_rx_trace (dpdk_main_t * dm, if (PREDICT_FALSE (xd->per_interface_next_index != ~0)) next0 = xd->per_interface_next_index; - else if (PREDICT_TRUE - ((xd->flags & DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE) != 0)) - next0 = dpdk_rx_next_from_mb (mb, b0); else next0 = dpdk_rx_next_from_etype (mb, b0); @@ -251,6 +216,13 @@ dpdk_prefetch_buffer (struct rte_mbuf *mb) CLIB_PREFETCH (b, CLIB_CACHE_LINE_BYTES, STORE); } +static_always_inline void +dpdk_prefetch_buffer_data (struct rte_mbuf *mb) +{ + vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb); + CLIB_PREFETCH (b->data, CLIB_CACHE_LINE_BYTES, STORE); +} + /* * This function is used when there are no worker threads. * The main thread performs IO and forwards the packets. 
@@ -371,16 +343,17 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, { next0 = next1 = next2 = next3 = xd->per_interface_next_index; } - else if (PREDICT_TRUE - ((xd->flags & DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE) != 0)) - { - next0 = dpdk_rx_next_from_mb (mb0, b0); - next1 = dpdk_rx_next_from_mb (mb1, b1); - next2 = dpdk_rx_next_from_mb (mb2, b2); - next3 = dpdk_rx_next_from_mb (mb3, b3); - } else { + /* prefetch packet data for faster access to the ethertype */ + dpdk_prefetch_buffer_data (xd->rx_vectors[queue_id] + [mb_index + 4]); + dpdk_prefetch_buffer_data (xd->rx_vectors[queue_id] + [mb_index + 5]); + dpdk_prefetch_buffer_data (xd->rx_vectors[queue_id] + [mb_index + 6]); + dpdk_prefetch_buffer_data (xd->rx_vectors[queue_id] + [mb_index + 7]); next0 = dpdk_rx_next_from_etype (mb0, b0); next1 = dpdk_rx_next_from_etype (mb1, b1); next2 = dpdk_rx_next_from_etype (mb2, b2); @@ -499,9 +472,6 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, if (PREDICT_FALSE (xd->per_interface_next_index != ~0)) next0 = xd->per_interface_next_index; - else if (PREDICT_TRUE - ((xd->flags & DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE) != 0)) - next0 = dpdk_rx_next_from_mb (mb0, b0); else next0 = dpdk_rx_next_from_etype (mb0, b0); -- cgit 1.2.3-korg From 53865c0e55c2fa0347df4d8338dca5f709c31f16 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 6 Mar 2017 12:06:29 +0100 Subject: dpdk: init.c should be under device/ Change-Id: I80831cee062a38a0f5ab1f1e56c2dc6dcd512b9d Signed-off-by: Damjan Marion --- src/plugins/dpdk.am | 2 +- src/plugins/dpdk/device/init.c | 2074 ++++++++++++++++++++++++++++++++++++++++ src/plugins/dpdk/init.c | 2074 ---------------------------------------- 3 files changed, 2075 insertions(+), 2075 deletions(-) create mode 100755 src/plugins/dpdk/device/init.c delete mode 100755 src/plugins/dpdk/init.c (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk.am b/src/plugins/dpdk.am index 212bbb73..01383de6 100644 --- a/src/plugins/dpdk.am 
+++ b/src/plugins/dpdk.am @@ -17,7 +17,6 @@ vppplugins_LTLIBRARIES += dpdk_plugin.la dpdk_plugin_la_LDFLAGS = $(AM_LDFLAGS) -Wl,--whole-archive,-l:libdpdk.a,--no-whole-archive,-lm,-ldl dpdk_plugin_la_SOURCES = \ - dpdk/init.c \ dpdk/main.c \ dpdk/buffer.c \ dpdk/thread.c \ @@ -25,6 +24,7 @@ dpdk_plugin_la_SOURCES = \ dpdk/device/dpdk_priv.h \ dpdk/device/device.c \ dpdk/device/format.c \ + dpdk/device/init.c \ dpdk/device/node.c \ dpdk/hqos/hqos.c \ dpdk/ipsec/esp_encrypt.c \ diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c new file mode 100755 index 00000000..e009ef3e --- /dev/null +++ b/src/plugins/dpdk/device/init.c @@ -0,0 +1,2074 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +dpdk_main_t dpdk_main; + +#include +#include + +/* define message IDs */ +#include + +#define vl_typedefs /* define message structures */ +#include +#undef vl_typedefs + +#define vl_endianfun /* define message structures */ +#include +#undef vl_endianfun + +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) + +/* Get the API version number. 
*/ +#define vl_api_version(n,v) static u32 api_version=(v); +#include +#undef vl_api_version + +/* Macro to finish up custom dump fns */ +#define FINISH \ + vec_add1 (s, 0); \ + vl_print (handle, (char *)s); \ + vec_free (s); \ + return handle; + +#include + +static void + vl_api_sw_interface_set_dpdk_hqos_pipe_t_handler + (vl_api_sw_interface_set_dpdk_hqos_pipe_t * mp) +{ + vl_api_sw_interface_set_dpdk_hqos_pipe_reply_t *rmp; + int rv = 0; + + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd; + + u32 sw_if_index = ntohl (mp->sw_if_index); + u32 subport = ntohl (mp->subport); + u32 pipe = ntohl (mp->pipe); + u32 profile = ntohl (mp->profile); + vnet_hw_interface_t *hw; + + VALIDATE_SW_IF_INDEX (mp); + + /* hw_if & dpdk device */ + hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); + + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rv = rte_sched_pipe_config (xd->hqos_ht->hqos, subport, pipe, profile); + + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_PIPE_REPLY); +} + +static void *vl_api_sw_interface_set_dpdk_hqos_pipe_t_print + (vl_api_sw_interface_set_dpdk_hqos_pipe_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_pipe "); + + s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); + + s = format (s, "subport %u pipe %u profile %u ", + ntohl (mp->subport), ntohl (mp->pipe), ntohl (mp->profile)); + + FINISH; +} + +static void + vl_api_sw_interface_set_dpdk_hqos_subport_t_handler + (vl_api_sw_interface_set_dpdk_hqos_subport_t * mp) +{ + vl_api_sw_interface_set_dpdk_hqos_subport_reply_t *rmp; + int rv = 0; + + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd; + struct rte_sched_subport_params p; + + u32 sw_if_index = ntohl (mp->sw_if_index); + u32 subport = ntohl (mp->subport); + p.tb_rate = ntohl (mp->tb_rate); + p.tb_size = ntohl (mp->tb_size); + p.tc_rate[0] = ntohl (mp->tc_rate[0]); + p.tc_rate[1] = ntohl (mp->tc_rate[1]); + p.tc_rate[2] = ntohl (mp->tc_rate[2]); + 
p.tc_rate[3] = ntohl (mp->tc_rate[3]); + p.tc_period = ntohl (mp->tc_period); + + vnet_hw_interface_t *hw; + + VALIDATE_SW_IF_INDEX (mp); + + /* hw_if & dpdk device */ + hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); + + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rv = rte_sched_subport_config (xd->hqos_ht->hqos, subport, &p); + + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_SUBPORT_REPLY); +} + +static void *vl_api_sw_interface_set_dpdk_hqos_subport_t_print + (vl_api_sw_interface_set_dpdk_hqos_subport_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_subport "); + + s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); + + s = + format (s, + "subport %u rate %u bkt_size %u tc0 %u tc1 %u tc2 %u tc3 %u period %u", + ntohl (mp->subport), ntohl (mp->tb_rate), ntohl (mp->tb_size), + ntohl (mp->tc_rate[0]), ntohl (mp->tc_rate[1]), + ntohl (mp->tc_rate[2]), ntohl (mp->tc_rate[3]), + ntohl (mp->tc_period)); + + FINISH; +} + +static void + vl_api_sw_interface_set_dpdk_hqos_tctbl_t_handler + (vl_api_sw_interface_set_dpdk_hqos_tctbl_t * mp) +{ + vl_api_sw_interface_set_dpdk_hqos_tctbl_reply_t *rmp; + int rv = 0; + + dpdk_main_t *dm = &dpdk_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + dpdk_device_t *xd; + + u32 sw_if_index = ntohl (mp->sw_if_index); + u32 entry = ntohl (mp->entry); + u32 tc = ntohl (mp->tc); + u32 queue = ntohl (mp->queue); + u32 val, i; + + vnet_hw_interface_t *hw; + + VALIDATE_SW_IF_INDEX (mp); + + /* hw_if & dpdk device */ + hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); + + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + if (tc >= RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE) + { + clib_warning ("invalid traffic class !!"); + rv = VNET_API_ERROR_INVALID_VALUE; + goto done; + } + if (queue >= RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS) + { + clib_warning ("invalid queue !!"); + rv = VNET_API_ERROR_INVALID_VALUE; + goto done; + } + + 
/* Detect the set of worker threads */ + uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + + if (p == 0) + { + clib_warning ("worker thread registration AWOL !!"); + rv = VNET_API_ERROR_INVALID_VALUE_2; + goto done; + } + + vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0]; + int worker_thread_first = tr->first_index; + int worker_thread_count = tr->count; + + val = tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue; + for (i = 0; i < worker_thread_count; i++) + xd->hqos_wt[worker_thread_first + i].hqos_tc_table[entry] = val; + + BAD_SW_IF_INDEX_LABEL; +done: + + REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_TCTBL_REPLY); +} + +static void *vl_api_sw_interface_set_dpdk_hqos_tctbl_t_print + (vl_api_sw_interface_set_dpdk_hqos_tctbl_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_tctbl "); + + s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); + + s = format (s, "entry %u tc %u queue %u", + ntohl (mp->entry), ntohl (mp->tc), ntohl (mp->queue)); + + FINISH; +} + +#define foreach_dpdk_plugin_api_msg \ +_(SW_INTERFACE_SET_DPDK_HQOS_PIPE, sw_interface_set_dpdk_hqos_pipe) \ +_(SW_INTERFACE_SET_DPDK_HQOS_SUBPORT, sw_interface_set_dpdk_hqos_subport) \ +_(SW_INTERFACE_SET_DPDK_HQOS_TCTBL, sw_interface_set_dpdk_hqos_tctbl) + +/* Set up the API message handling tables */ +static clib_error_t * +dpdk_plugin_api_hookup (vlib_main_t * vm) +{ + dpdk_main_t *dm __attribute__ ((unused)) = &dpdk_main; +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + dm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_dpdk_plugin_api_msg; +#undef _ + return 0; +} + +#define vl_msg_name_crc_list +#include +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (dpdk_main_t * dm, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + 
dm->msg_id_base); + foreach_vl_msg_name_crc_dpdk; +#undef _ +} + +// TODO +/* +static void plugin_custom_dump_configure (dpdk_main_t * dm) +{ +#define _(n,f) dm->api_main->msg_print_handlers \ + [VL_API_##n + dm->msg_id_base] \ + = (void *) vl_api_##f##_t_print; + foreach_dpdk_plugin_api_msg; +#undef _ +} +*/ +/* force linker to link functions used by vlib and declared weak */ +void *vlib_weakly_linked_functions[] = { + &rte_pktmbuf_init, + &rte_pktmbuf_pool_init, +}; + +#define LINK_STATE_ELOGS 0 + +#define DEFAULT_HUGE_DIR "/run/vpp/hugepages" +#define VPP_RUN_DIR "/run/vpp" + +/* Port configuration, mildly modified Intel app values */ + +static struct rte_eth_conf port_conf_template = { + .rxmode = { + .split_hdr_size = 0, + .header_split = 0, /**< Header Split disabled */ + .hw_ip_checksum = 0, /**< IP checksum offload disabled */ + .hw_vlan_filter = 0, /**< VLAN filtering disabled */ + .hw_strip_crc = 0, /**< CRC stripped by hardware */ + }, + .txmode = { + .mq_mode = ETH_MQ_TX_NONE, + }, +}; + +clib_error_t * +dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) +{ + int rv; + int j; + + ASSERT (os_get_cpu_number () == 0); + + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + { + vnet_hw_interface_set_flags (dm->vnet_main, xd->vlib_hw_if_index, 0); + rte_eth_dev_stop (xd->device_index); + } + + rv = rte_eth_dev_configure (xd->device_index, xd->rx_q_used, + xd->tx_q_used, &xd->port_conf); + + if (rv < 0) + return clib_error_return (0, "rte_eth_dev_configure[%d]: err %d", + xd->device_index, rv); + + /* Set up one TX-queue per worker thread */ + for (j = 0; j < xd->tx_q_used; j++) + { + rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc, + xd->cpu_socket, &xd->tx_conf); + + /* retry with any other CPU socket */ + if (rv < 0) + rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc, + SOCKET_ID_ANY, &xd->tx_conf); + if (rv < 0) + break; + } + + if (rv < 0) + return clib_error_return (0, "rte_eth_tx_queue_setup[%d]: err %d", + 
xd->device_index, rv); + + for (j = 0; j < xd->rx_q_used; j++) + { + + rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, + xd->cpu_socket, 0, + dm-> + pktmbuf_pools[xd->cpu_socket_id_by_queue + [j]]); + + /* retry with any other CPU socket */ + if (rv < 0) + rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, + SOCKET_ID_ANY, 0, + dm-> + pktmbuf_pools[xd->cpu_socket_id_by_queue + [j]]); + if (rv < 0) + return clib_error_return (0, "rte_eth_rx_queue_setup[%d]: err %d", + xd->device_index, rv); + } + + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + { + int rv; + rv = rte_eth_dev_start (xd->device_index); + if (!rv && xd->default_mac_address) + rv = rte_eth_dev_default_mac_addr_set (xd->device_index, + (struct ether_addr *) + xd->default_mac_address); + if (rv < 0) + clib_warning ("rte_eth_dev_start %d returned %d", + xd->device_index, rv); + } + return 0; +} + +static u32 +dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags) +{ + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance); + u32 old = 0; + + if (ETHERNET_INTERFACE_FLAG_CONFIG_PROMISC (flags)) + { + old = (xd->flags & DPDK_DEVICE_FLAG_PROMISC) != 0; + + if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) + xd->flags |= DPDK_DEVICE_FLAG_PROMISC; + else + xd->flags &= ~DPDK_DEVICE_FLAG_PROMISC; + + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + { + if (xd->flags & DPDK_DEVICE_FLAG_PROMISC) + rte_eth_promiscuous_enable (xd->device_index); + else + rte_eth_promiscuous_disable (xd->device_index); + } + } + else if (ETHERNET_INTERFACE_FLAG_CONFIG_MTU (flags)) + { + /* + * DAW-FIXME: The Cisco VIC firmware does not provide an api for a + * driver to dynamically change the mtu. If/when the + * VIC firmware gets fixed, then this should be removed. + */ + if (xd->pmd == VNET_DPDK_PMD_ENIC) + { + struct rte_eth_dev_info dev_info; + + /* + * Restore mtu to what has been set by CIMC in the firmware cfg. 
+ */ + rte_eth_dev_info_get (xd->device_index, &dev_info); + hi->max_packet_bytes = dev_info.max_rx_pktlen; + + vlib_cli_output (vlib_get_main (), + "Cisco VIC mtu can only be changed " + "using CIMC then rebooting the server!"); + } + else + { + int rv; + + xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes; + + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + rte_eth_dev_stop (xd->device_index); + + rv = rte_eth_dev_configure + (xd->device_index, xd->rx_q_used, xd->tx_q_used, &xd->port_conf); + + if (rv < 0) + vlib_cli_output (vlib_get_main (), + "rte_eth_dev_configure[%d]: err %d", + xd->device_index, rv); + + rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes); + + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + { + int rv = rte_eth_dev_start (xd->device_index); + if (!rv && xd->default_mac_address) + rv = rte_eth_dev_default_mac_addr_set (xd->device_index, + (struct ether_addr *) + xd->default_mac_address); + if (rv < 0) + clib_warning ("rte_eth_dev_start %d returned %d", + xd->device_index, rv); + } + } + } + return old; +} + +void +dpdk_device_lock_init (dpdk_device_t * xd) +{ + int q; + vec_validate (xd->lockp, xd->tx_q_used - 1); + for (q = 0; q < xd->tx_q_used; q++) + { + xd->lockp[q] = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, + CLIB_CACHE_LINE_BYTES); + memset ((void *) xd->lockp[q], 0, CLIB_CACHE_LINE_BYTES); + } +} + +void +dpdk_device_lock_free (dpdk_device_t * xd) +{ + int q; + + for (q = 0; q < vec_len (xd->lockp); q++) + clib_mem_free ((void *) xd->lockp[q]); + vec_free (xd->lockp); + xd->lockp = 0; +} + +static clib_error_t * +dpdk_lib_init (dpdk_main_t * dm) +{ + u32 nports; + u32 nb_desc = 0; + int i; + clib_error_t *error; + vlib_main_t *vm = vlib_get_main (); + vlib_thread_main_t *tm = vlib_get_thread_main (); + vnet_sw_interface_t *sw; + vnet_hw_interface_t *hi; + dpdk_device_t *xd; + vlib_pci_addr_t last_pci_addr; + u32 last_pci_addr_port = 0; + vlib_thread_registration_t *tr, *tr_hqos; + uword *p, *p_hqos; + + u32 next_cpu = 
0, next_hqos_cpu = 0; + u8 af_packet_port_id = 0; + last_pci_addr.as_u32 = ~0; + + dm->input_cpu_first_index = 0; + dm->input_cpu_count = 1; + + /* find out which cpus will be used for input */ + p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + tr = p ? (vlib_thread_registration_t *) p[0] : 0; + + if (tr && tr->count > 0) + { + dm->input_cpu_first_index = tr->first_index; + dm->input_cpu_count = tr->count; + } + + vec_validate_aligned (dm->devices_by_cpu, tm->n_vlib_mains - 1, + CLIB_CACHE_LINE_BYTES); + + dm->hqos_cpu_first_index = 0; + dm->hqos_cpu_count = 0; + + /* find out which cpus will be used for I/O TX */ + p_hqos = hash_get_mem (tm->thread_registrations_by_name, "hqos-threads"); + tr_hqos = p_hqos ? (vlib_thread_registration_t *) p_hqos[0] : 0; + + if (tr_hqos && tr_hqos->count > 0) + { + dm->hqos_cpu_first_index = tr_hqos->first_index; + dm->hqos_cpu_count = tr_hqos->count; + } + + vec_validate_aligned (dm->devices_by_hqos_cpu, tm->n_vlib_mains - 1, + CLIB_CACHE_LINE_BYTES); + + nports = rte_eth_dev_count (); + if (nports < 1) + { + clib_warning ("DPDK drivers found no ports..."); + } + + if (CLIB_DEBUG > 0) + clib_warning ("DPDK drivers found %d ports...", nports); + + /* + * All buffers are all allocated from the same rte_mempool. + * Thus they all have the same number of data bytes. 
+ */ + dm->vlib_buffer_free_list_index = + vlib_buffer_get_or_create_free_list (vm, + VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, + "dpdk rx"); + + if (dm->conf->enable_tcp_udp_checksum) + dm->buffer_flags_template &= ~(IP_BUFFER_L4_CHECKSUM_CORRECT + | IP_BUFFER_L4_CHECKSUM_COMPUTED); + + for (i = 0; i < nports; i++) + { + u8 addr[6]; + u8 vlan_strip = 0; + int j; + struct rte_eth_dev_info dev_info; + clib_error_t *rv; + struct rte_eth_link l; + dpdk_device_config_t *devconf = 0; + vlib_pci_addr_t pci_addr; + uword *p = 0; + + rte_eth_dev_info_get (i, &dev_info); + if (dev_info.pci_dev) /* bonded interface has no pci info */ + { + pci_addr.domain = dev_info.pci_dev->addr.domain; + pci_addr.bus = dev_info.pci_dev->addr.bus; + pci_addr.slot = dev_info.pci_dev->addr.devid; + pci_addr.function = dev_info.pci_dev->addr.function; + p = + hash_get (dm->conf->device_config_index_by_pci_addr, + pci_addr.as_u32); + } + + if (p) + devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]); + else + devconf = &dm->conf->default_devconf; + + /* Create vnet interface */ + vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES); + xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT; + xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT; + xd->cpu_socket = (i8) rte_eth_dev_socket_id (i); + + /* Handle interface naming for devices with multiple ports sharing same PCI ID */ + if (dev_info.pci_dev) + { + struct rte_eth_dev_info di = { 0 }; + rte_eth_dev_info_get (i + 1, &di); + if (di.pci_dev && pci_addr.as_u32 != last_pci_addr.as_u32 && + memcmp (&dev_info.pci_dev->addr, &di.pci_dev->addr, + sizeof (struct rte_pci_addr)) == 0) + { + xd->interface_name_suffix = format (0, "0"); + last_pci_addr.as_u32 = pci_addr.as_u32; + last_pci_addr_port = i; + } + else if (pci_addr.as_u32 == last_pci_addr.as_u32) + { + xd->interface_name_suffix = + format (0, "%u", i - last_pci_addr_port); + } + else + { + last_pci_addr.as_u32 = ~0; + } + } + else + last_pci_addr.as_u32 = ~0; + + clib_memcpy (&xd->tx_conf, 
&dev_info.default_txconf, + sizeof (struct rte_eth_txconf)); + if (dm->conf->no_multi_seg) + { + xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS; + port_conf_template.rxmode.jumbo_frame = 0; + } + else + { + xd->tx_conf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS; + port_conf_template.rxmode.jumbo_frame = 1; + xd->flags |= DPDK_DEVICE_FLAG_MAYBE_MULTISEG; + } + + clib_memcpy (&xd->port_conf, &port_conf_template, + sizeof (struct rte_eth_conf)); + + xd->tx_q_used = clib_min (dev_info.max_tx_queues, tm->n_vlib_mains); + + if (devconf->num_tx_queues > 0 + && devconf->num_tx_queues < xd->tx_q_used) + xd->tx_q_used = clib_min (xd->tx_q_used, devconf->num_tx_queues); + + if (devconf->num_rx_queues > 1 && dm->use_rss == 0) + { + dm->use_rss = 1; + } + + if (devconf->num_rx_queues > 1 + && dev_info.max_rx_queues >= devconf->num_rx_queues) + { + xd->rx_q_used = devconf->num_rx_queues; + xd->port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; + if (devconf->rss_fn == 0) + xd->port_conf.rx_adv_conf.rss_conf.rss_hf = + ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP; + else + xd->port_conf.rx_adv_conf.rss_conf.rss_hf = devconf->rss_fn; + } + else + xd->rx_q_used = 1; + + xd->flags |= DPDK_DEVICE_FLAG_PMD; + + /* workaround for drivers not setting driver_name */ + if ((!dev_info.driver_name) && (dev_info.pci_dev)) + dev_info.driver_name = dev_info.pci_dev->driver->driver.name; + + ASSERT (dev_info.driver_name); + + if (!xd->pmd) + { + + +#define _(s,f) else if (dev_info.driver_name && \ + !strcmp(dev_info.driver_name, s)) \ + xd->pmd = VNET_DPDK_PMD_##f; + if (0) + ; + foreach_dpdk_pmd +#undef _ + else + xd->pmd = VNET_DPDK_PMD_UNKNOWN; + + xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; + xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT; + xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT; + + switch (xd->pmd) + { + /* 1G adapters */ + case VNET_DPDK_PMD_E1000EM: + case VNET_DPDK_PMD_IGB: + case VNET_DPDK_PMD_IGBVF: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G; + break; + + /* 10G adapters */ + case 
VNET_DPDK_PMD_IXGBE: + case VNET_DPDK_PMD_IXGBEVF: + case VNET_DPDK_PMD_THUNDERX: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; + break; + case VNET_DPDK_PMD_DPAA2: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; + break; + + /* Cisco VIC */ + case VNET_DPDK_PMD_ENIC: + rte_eth_link_get_nowait (i, &l); + xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE; + if (l.link_speed == 40000) + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; + else + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; + break; + + /* Intel Fortville */ + case VNET_DPDK_PMD_I40E: + case VNET_DPDK_PMD_I40EVF: + xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE; + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; + + switch (dev_info.pci_dev->id.device_id) + { + case I40E_DEV_ID_10G_BASE_T: + case I40E_DEV_ID_SFP_XL710: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; + break; + case I40E_DEV_ID_QSFP_A: + case I40E_DEV_ID_QSFP_B: + case I40E_DEV_ID_QSFP_C: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; + break; + case I40E_DEV_ID_VF: + rte_eth_link_get_nowait (i, &l); + xd->port_type = l.link_speed == 10000 ? 
+ VNET_DPDK_PORT_TYPE_ETH_10G : VNET_DPDK_PORT_TYPE_ETH_40G; + break; + default: + xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; + } + break; + + case VNET_DPDK_PMD_CXGBE: + switch (dev_info.pci_dev->id.device_id) + { + case 0x540d: /* T580-CR */ + case 0x5410: /* T580-LP-cr */ + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; + break; + case 0x5403: /* T540-CR */ + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; + break; + default: + xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; + } + break; + + case VNET_DPDK_PMD_MLX5: + { + char *pn_100g[] = { "MCX415A-CCAT", "MCX416A-CCAT", 0 }; + char *pn_40g[] = { "MCX413A-BCAT", "MCX414A-BCAT", + "MCX415A-BCAT", "MCX416A-BCAT", "MCX4131A-BCAT", 0 + }; + char *pn_10g[] = { "MCX4111A-XCAT", "MCX4121A-XCAT", 0 }; + + vlib_pci_device_t *pd = vlib_get_pci_device (&pci_addr); + u8 *pn = 0; + char **c; + int found = 0; + pn = format (0, "%U%c", + format_vlib_pci_vpd, pd->vpd_r, "PN", 0); + + if (!pn) + break; + + c = pn_100g; + while (!found && c[0]) + { + if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0) + { + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_100G; + break; + } + c++; + } + + c = pn_40g; + while (!found && c[0]) + { + if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0) + { + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; + break; + } + c++; + } + + c = pn_10g; + while (!found && c[0]) + { + if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0) + { + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; + break; + } + c++; + } + + vec_free (pn); + } + + break; + /* Intel Red Rock Canyon */ + case VNET_DPDK_PMD_FM10K: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_SWITCH; + break; + + /* virtio */ + case VNET_DPDK_PMD_VIRTIO: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G; + xd->nb_rx_desc = DPDK_NB_RX_DESC_VIRTIO; + xd->nb_tx_desc = DPDK_NB_TX_DESC_VIRTIO; + break; + + /* vmxnet3 */ + case VNET_DPDK_PMD_VMXNET3: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G; + xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS; + break; + + case 
VNET_DPDK_PMD_AF_PACKET: + xd->port_type = VNET_DPDK_PORT_TYPE_AF_PACKET; + xd->af_packet_port_id = af_packet_port_id++; + break; + + case VNET_DPDK_PMD_BOND: + xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE; + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_BOND; + break; + + default: + xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; + } + + if (devconf->num_rx_desc) + xd->nb_rx_desc = devconf->num_rx_desc; + + if (devconf->num_tx_desc) + xd->nb_tx_desc = devconf->num_tx_desc; + } + + /* + * Ensure default mtu is not > the mtu read from the hardware. + * Otherwise rte_eth_dev_configure() will fail and the port will + * not be available. + */ + if (ETHERNET_MAX_PACKET_BYTES > dev_info.max_rx_pktlen) + { + /* + * This device does not support the platforms's max frame + * size. Use it's advertised mru instead. + */ + xd->port_conf.rxmode.max_rx_pkt_len = dev_info.max_rx_pktlen; + } + else + { + xd->port_conf.rxmode.max_rx_pkt_len = ETHERNET_MAX_PACKET_BYTES; + + /* + * Some platforms do not account for Ethernet FCS (4 bytes) in + * MTU calculations. To interop with them increase mru but only + * if the device's settings can support it. + */ + if ((dev_info.max_rx_pktlen >= (ETHERNET_MAX_PACKET_BYTES + 4)) && + xd->port_conf.rxmode.hw_strip_crc) + { + /* + * Allow additional 4 bytes (for Ethernet FCS). These bytes are + * stripped by h/w and so will not consume any buffer memory. 
+ */ + xd->port_conf.rxmode.max_rx_pkt_len += 4; + } + } + + if (xd->pmd == VNET_DPDK_PMD_AF_PACKET) + { + f64 now = vlib_time_now (vm); + u32 rnd; + rnd = (u32) (now * 1e6); + rnd = random_u32 (&rnd); + clib_memcpy (addr + 2, &rnd, sizeof (rnd)); + addr[0] = 2; + addr[1] = 0xfe; + } + else + rte_eth_macaddr_get (i, (struct ether_addr *) addr); + + if (xd->tx_q_used < tm->n_vlib_mains) + dpdk_device_lock_init (xd); + + xd->device_index = xd - dm->devices; + ASSERT (i == xd->device_index); + xd->per_interface_next_index = ~0; + + /* assign interface to input thread */ + dpdk_device_and_queue_t *dq; + int q; + + if (devconf->workers) + { + int i; + q = 0; + /* *INDENT-OFF* */ + clib_bitmap_foreach (i, devconf->workers, ({ + int cpu = dm->input_cpu_first_index + i; + unsigned lcore = vlib_worker_threads[cpu].lcore_id; + vec_validate(xd->cpu_socket_id_by_queue, q); + xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id(lcore); + vec_add2(dm->devices_by_cpu[cpu], dq, 1); + dq->device = xd->device_index; + dq->queue_id = q++; + })); + /* *INDENT-ON* */ + } + else + for (q = 0; q < xd->rx_q_used; q++) + { + int cpu = dm->input_cpu_first_index + next_cpu; + unsigned lcore = vlib_worker_threads[cpu].lcore_id; + + /* + * numa node for worker thread handling this queue + * needed for taking buffers from the right mempool + */ + vec_validate (xd->cpu_socket_id_by_queue, q); + xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id (lcore); + + /* + * construct vector of (device,queue) pairs for each worker thread + */ + vec_add2 (dm->devices_by_cpu[cpu], dq, 1); + dq->device = xd->device_index; + dq->queue_id = q; + + next_cpu++; + if (next_cpu == dm->input_cpu_count) + next_cpu = 0; + } + + + if (devconf->hqos_enabled) + { + xd->flags |= DPDK_DEVICE_FLAG_HQOS; + + if (devconf->hqos.hqos_thread_valid) + { + int cpu = dm->hqos_cpu_first_index + devconf->hqos.hqos_thread; + + if (devconf->hqos.hqos_thread >= dm->hqos_cpu_count) + return clib_error_return (0, "invalid HQoS 
thread index"); + + vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1); + dq->device = xd->device_index; + dq->queue_id = 0; + } + else + { + int cpu = dm->hqos_cpu_first_index + next_hqos_cpu; + + if (dm->hqos_cpu_count == 0) + return clib_error_return (0, "no HQoS threads available"); + + vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1); + dq->device = xd->device_index; + dq->queue_id = 0; + + next_hqos_cpu++; + if (next_hqos_cpu == dm->hqos_cpu_count) + next_hqos_cpu = 0; + + devconf->hqos.hqos_thread_valid = 1; + devconf->hqos.hqos_thread = cpu; + } + } + + vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains, + CLIB_CACHE_LINE_BYTES); + for (j = 0; j < tm->n_vlib_mains; j++) + { + vec_validate_ha (xd->tx_vectors[j], xd->nb_tx_desc, + sizeof (tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES); + vec_reset_length (xd->tx_vectors[j]); + } + + vec_validate_aligned (xd->rx_vectors, xd->rx_q_used, + CLIB_CACHE_LINE_BYTES); + for (j = 0; j < xd->rx_q_used; j++) + { + vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE - 1, + CLIB_CACHE_LINE_BYTES); + vec_reset_length (xd->rx_vectors[j]); + } + + vec_validate_aligned (xd->d_trace_buffers, tm->n_vlib_mains, + CLIB_CACHE_LINE_BYTES); + + rv = dpdk_port_setup (dm, xd); + + if (rv) + return rv; + + if (devconf->hqos_enabled) + { + rv = dpdk_port_setup_hqos (xd, &devconf->hqos); + if (rv) + return rv; + } + + /* count the number of descriptors used for this device */ + nb_desc += xd->nb_rx_desc + xd->nb_tx_desc * xd->tx_q_used; + + error = ethernet_register_interface + (dm->vnet_main, dpdk_device_class.index, xd->device_index, + /* ethernet address */ addr, + &xd->vlib_hw_if_index, dpdk_flag_change); + if (error) + return error; + + sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index); + xd->vlib_sw_if_index = sw->sw_if_index; + hi = vnet_get_hw_interface (dm->vnet_main, xd->vlib_hw_if_index); + + /* + * DAW-FIXME: The Cisco VIC firmware does not provide an api for a + * driver to dynamically change the mtu. 
If/when the + * VIC firmware gets fixed, then this should be removed. + */ + if (xd->pmd == VNET_DPDK_PMD_ENIC) + { + /* + * Initialize mtu to what has been set by CIMC in the firmware cfg. + */ + hi->max_packet_bytes = dev_info.max_rx_pktlen; + if (devconf->vlan_strip_offload != DPDK_DEVICE_VLAN_STRIP_OFF) + vlan_strip = 1; /* remove vlan tag from VIC port by default */ + else + clib_warning ("VLAN strip disabled for interface\n"); + } + else if (devconf->vlan_strip_offload == DPDK_DEVICE_VLAN_STRIP_ON) + vlan_strip = 1; + + if (vlan_strip) + { + int vlan_off; + vlan_off = rte_eth_dev_get_vlan_offload (xd->device_index); + vlan_off |= ETH_VLAN_STRIP_OFFLOAD; + xd->port_conf.rxmode.hw_vlan_strip = vlan_off; + if (rte_eth_dev_set_vlan_offload (xd->device_index, vlan_off) == 0) + clib_warning ("VLAN strip enabled for interface\n"); + else + clib_warning ("VLAN strip cannot be supported by interface\n"); + } + + hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] = + xd->port_conf.rxmode.max_rx_pkt_len - sizeof (ethernet_header_t); + + rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes); + } + + if (nb_desc > dm->conf->num_mbufs) + clib_warning ("%d mbufs allocated but total rx/tx ring size is %d\n", + dm->conf->num_mbufs, nb_desc); + + return 0; +} +

/*
 * Walk every PCI device in pci_main and try to bind the eligible ones to
 * the UIO driver named in conf->uio_driver_name.
 *
 * A device is eligible when:
 *   - its class is PCI_CLASS_NETWORK_ETHERNET or PCI_CLASS_PROCESSOR_CO,
 *   - if any per-device config entries exist, its PCI address is one of them,
 *   - its vendor/device id is on the short list tested below.
 *
 * When the bind attempt returns an error the device's config entry is
 * flagged is_blacklisted (an entry is created first if the device had none)
 * and the error is reported.
 */
static void
dpdk_bind_devices_to_uio (dpdk_config_main_t * conf)
{
  vlib_pci_main_t *pm = &pci_main;
  vlib_pci_device_t *d;
  clib_error_t *error;
  u8 *pci_addr = 0;
  int num_whitelisted = vec_len (conf->dev_confs);

  /* *INDENT-OFF* */
  pool_foreach (d, pm->pci_devs, ({
    dpdk_device_config_t *devconf = 0;
    int supported;

    /* printable, NUL-terminated address; only used by the warning below */
    vec_reset_length (pci_addr);
    pci_addr = format (pci_addr, "%U%c", format_vlib_pci_addr, &d->bus_address, 0);

    if (!(d->device_class == PCI_CLASS_NETWORK_ETHERNET
          || d->device_class == PCI_CLASS_PROCESSOR_CO))
      continue;

    if (num_whitelisted)
      {
        uword *p = hash_get (conf->device_config_index_by_pci_addr,
                             d->bus_address.as_u32);

        /* per-device entries exist and this device is not among them */
        if (p == 0)
          continue;

        devconf = pool_elt_at_index (conf->dev_confs, p[0]);
      }

    supported =
      /* virtio */
      (d->vendor_id == 0x1af4 && d->device_id == 0x1000)
      /* vmxnet3 */
      || (d->vendor_id == 0x15ad && d->device_id == 0x07b0)
      /* all Intel devices */
      || (d->vendor_id == 0x8086)
      /* Cisco VIC */
      || (d->vendor_id == 0x1137 && d->device_id == 0x0043)
      /* Chelsio T4/T5 */
      || (d->vendor_id == 0x1425 && (d->device_id & 0xe000) == 0x4000);

    if (!supported)
      {
        clib_warning ("Unsupported Ethernet PCI device 0x%04x:0x%04x found "
                      "at PCI address %s\n", (u16) d->vendor_id, (u16) d->device_id,
                      pci_addr);
        continue;
      }

    error = vlib_pci_bind_to_uio (d, (char *) conf->uio_driver_name);
    if (error == 0)
      continue;

    /* bind failed: make sure the device has a config entry, then blacklist it */
    if (devconf == 0)
      {
        pool_get (conf->dev_confs, devconf);
        hash_set (conf->device_config_index_by_pci_addr, d->bus_address.as_u32,
                  devconf - conf->dev_confs);
        devconf->pci_addr.as_u32 = d->bus_address.as_u32;
      }
    devconf->is_blacklisted = 1;
    clib_error_report (error);
  }));
  /* *INDENT-ON* */

  vec_free (pci_addr);
}

+static clib_error_t * +dpdk_device_config (dpdk_config_main_t * conf, vlib_pci_addr_t pci_addr, + unformat_input_t * input, u8 is_default) +{ + clib_error_t *error = 0; + uword *p; + dpdk_device_config_t *devconf; + unformat_input_t sub_input; + + if (is_default) + { + devconf = &conf->default_devconf; + } + else + { + p = hash_get (conf->device_config_index_by_pci_addr, pci_addr.as_u32); + + if (!p) + { + pool_get (conf->dev_confs, devconf); + hash_set (conf->device_config_index_by_pci_addr, pci_addr.as_u32, + devconf - conf->dev_confs); + } + else + return clib_error_return (0, + "duplicate configuration for PCI address %U", + format_vlib_pci_addr, &pci_addr); + } + + devconf->pci_addr.as_u32 = pci_addr.as_u32; + devconf->hqos_enabled = 0; + dpdk_device_config_hqos_default (&devconf->hqos); + + if (!input) + return 0; + + unformat_skip_white_space (input); + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + 
if (unformat (input, "num-rx-queues %u", &devconf->num_rx_queues)) + ; + else if (unformat (input, "num-tx-queues %u", &devconf->num_tx_queues)) + ; + else if (unformat (input, "num-rx-desc %u", &devconf->num_rx_desc)) + ; + else if (unformat (input, "num-tx-desc %u", &devconf->num_tx_desc)) + ; + else if (unformat (input, "workers %U", unformat_bitmap_list, + &devconf->workers)) + ; + else + if (unformat + (input, "rss %U", unformat_vlib_cli_sub_input, &sub_input)) + { + error = unformat_rss_fn (&sub_input, &devconf->rss_fn); + if (error) + break; + } + else if (unformat (input, "vlan-strip-offload off")) + devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_OFF; + else if (unformat (input, "vlan-strip-offload on")) + devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_ON; + else + if (unformat + (input, "hqos %U", unformat_vlib_cli_sub_input, &sub_input)) + { + devconf->hqos_enabled = 1; + error = unformat_hqos (&sub_input, &devconf->hqos); + if (error) + break; + } + else if (unformat (input, "hqos")) + { + devconf->hqos_enabled = 1; + } + else + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + break; + } + } + + if (error) + return error; + + if (devconf->workers && devconf->num_rx_queues == 0) + devconf->num_rx_queues = clib_bitmap_count_set_bits (devconf->workers); + else if (devconf->workers && + clib_bitmap_count_set_bits (devconf->workers) != + devconf->num_rx_queues) + error = + clib_error_return (0, + "%U: number of worker threadds must be " + "equal to number of rx queues", format_vlib_pci_addr, + &pci_addr); + + return error; +} + +static clib_error_t * +dpdk_config (vlib_main_t * vm, unformat_input_t * input) +{ + clib_error_t *error = 0; + dpdk_main_t *dm = &dpdk_main; + dpdk_config_main_t *conf = &dpdk_config_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + dpdk_device_config_t *devconf; + vlib_pci_addr_t pci_addr; + unformat_input_t sub_input; + u8 *s, *tmp = 0; + u8 *rte_cmd = 0, *ethname = 0; 
+ u32 log_level; + int ret, i; + int num_whitelisted = 0; + u8 no_pci = 0; + u8 no_huge = 0; + u8 huge_dir = 0; + u8 file_prefix = 0; + u8 *socket_mem = 0; + + conf->device_config_index_by_pci_addr = hash_create (0, sizeof (uword)); + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + /* Prime the pump */ + if (unformat (input, "no-hugetlb")) + { + vec_add1 (conf->eal_init_args, (u8 *) "no-huge"); + no_huge = 1; + } + + else if (unformat (input, "enable-tcp-udp-checksum")) + conf->enable_tcp_udp_checksum = 1; + + else if (unformat (input, "decimal-interface-names")) + conf->interface_name_format_decimal = 1; + + else if (unformat (input, "no-multi-seg")) + conf->no_multi_seg = 1; + + else if (unformat (input, "enable-cryptodev")) + conf->cryptodev = 1; + + else if (unformat (input, "dev default %U", unformat_vlib_cli_sub_input, + &sub_input)) + { + error = + dpdk_device_config (conf, (vlib_pci_addr_t) (u32) ~ 1, &sub_input, + 1); + + if (error) + return error; + } + else + if (unformat + (input, "dev %U %U", unformat_vlib_pci_addr, &pci_addr, + unformat_vlib_cli_sub_input, &sub_input)) + { + error = dpdk_device_config (conf, pci_addr, &sub_input, 0); + + if (error) + return error; + + num_whitelisted++; + } + else if (unformat (input, "dev %U", unformat_vlib_pci_addr, &pci_addr)) + { + error = dpdk_device_config (conf, pci_addr, 0, 0); + + if (error) + return error; + + num_whitelisted++; + } + else if (unformat (input, "num-mbufs %d", &conf->num_mbufs)) + ; + else if (unformat (input, "kni %d", &conf->num_kni)) + ; + else if (unformat (input, "uio-driver %s", &conf->uio_driver_name)) + ; + else if (unformat (input, "socket-mem %s", &socket_mem)) + ; + else if (unformat (input, "no-pci")) + { + no_pci = 1; + tmp = format (0, "--no-pci%c", 0); + vec_add1 (conf->eal_init_args, tmp); + } + else if (unformat (input, "poll-sleep %d", &dm->poll_sleep)) + ; + +#define _(a) \ + else if (unformat(input, #a)) \ + { \ + tmp = format (0, "--%s%c", #a, 0); \ 
+ vec_add1 (conf->eal_init_args, tmp); \ + } + foreach_eal_double_hyphen_predicate_arg +#undef _ +#define _(a) \ + else if (unformat(input, #a " %s", &s)) \ + { \ + if (!strncmp(#a, "huge-dir", 8)) \ + huge_dir = 1; \ + else if (!strncmp(#a, "file-prefix", 11)) \ + file_prefix = 1; \ + tmp = format (0, "--%s%c", #a, 0); \ + vec_add1 (conf->eal_init_args, tmp); \ + vec_add1 (s, 0); \ + if (!strncmp(#a, "vdev", 4)) \ + if (strstr((char*)s, "af_packet")) \ + clib_warning ("af_packet obsoleted. Use CLI 'create host-interface'."); \ + vec_add1 (conf->eal_init_args, s); \ + } + foreach_eal_double_hyphen_arg +#undef _ +#define _(a,b) \ + else if (unformat(input, #a " %s", &s)) \ + { \ + tmp = format (0, "-%s%c", #b, 0); \ + vec_add1 (conf->eal_init_args, tmp); \ + vec_add1 (s, 0); \ + vec_add1 (conf->eal_init_args, s); \ + } + foreach_eal_single_hyphen_arg +#undef _ +#define _(a,b) \ + else if (unformat(input, #a " %s", &s)) \ + { \ + tmp = format (0, "-%s%c", #b, 0); \ + vec_add1 (conf->eal_init_args, tmp); \ + vec_add1 (s, 0); \ + vec_add1 (conf->eal_init_args, s); \ + conf->a##_set_manually = 1; \ + } + foreach_eal_single_hyphen_mandatory_arg +#undef _ + else if (unformat (input, "default")) + ; + + else if (unformat_skip_white_space (input)) + ; + else + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + goto done; + } + } + + if (!conf->uio_driver_name) + conf->uio_driver_name = format (0, "uio_pci_generic%c", 0); + + /* + * Use 1G huge pages if available. 
+ */ + if (!no_huge && !huge_dir) + { + u32 x, *mem_by_socket = 0; + uword c = 0; + u8 use_1g = 1; + u8 use_2m = 1; + u8 less_than_1g = 1; + int rv; + + umount (DEFAULT_HUGE_DIR); + + /* Process "socket-mem" parameter value */ + if (vec_len (socket_mem)) + { + unformat_input_t in; + unformat_init_vector (&in, socket_mem); + while (unformat_check_input (&in) != UNFORMAT_END_OF_INPUT) + { + if (unformat (&in, "%u,", &x)) + ; + else if (unformat (&in, "%u", &x)) + ; + else if (unformat (&in, ",")) + x = 0; + else + break; + + vec_add1 (mem_by_socket, x); + + if (x > 1023) + less_than_1g = 0; + } + /* Note: unformat_free vec_frees(in.buffer), aka socket_mem... */ + unformat_free (&in); + socket_mem = 0; + } + else + { + /* *INDENT-OFF* */ + clib_bitmap_foreach (c, tm->cpu_socket_bitmap, ( + { + vec_validate(mem_by_socket, c); + mem_by_socket[c] = 256; /* default per-socket mem */ + } + )); + /* *INDENT-ON* */ + } + + /* check if available enough 1GB pages for each socket */ + /* *INDENT-OFF* */ + clib_bitmap_foreach (c, tm->cpu_socket_bitmap, ( + { + int pages_avail, page_size, mem; + + vec_validate(mem_by_socket, c); + mem = mem_by_socket[c]; + + page_size = 1024; + pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024); + + if (pages_avail < 0 || page_size * pages_avail < mem) + use_1g = 0; + + page_size = 2; + pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024); + + if (pages_avail < 0 || page_size * pages_avail < mem) + use_2m = 0; + })); + /* *INDENT-ON* */ + + if (mem_by_socket == 0) + { + error = clib_error_return (0, "mem_by_socket NULL"); + goto done; + } + _vec_len (mem_by_socket) = c + 1; + + /* regenerate socket_mem string */ + vec_foreach_index (x, mem_by_socket) + socket_mem = format (socket_mem, "%s%u", + socket_mem ? 
"," : "", mem_by_socket[x]); + socket_mem = format (socket_mem, "%c", 0); + + vec_free (mem_by_socket); + + rv = mkdir (VPP_RUN_DIR, 0755); + if (rv && errno != EEXIST) + { + error = clib_error_return (0, "mkdir '%s' failed errno %d", + VPP_RUN_DIR, errno); + goto done; + } + + rv = mkdir (DEFAULT_HUGE_DIR, 0755); + if (rv && errno != EEXIST) + { + error = clib_error_return (0, "mkdir '%s' failed errno %d", + DEFAULT_HUGE_DIR, errno); + goto done; + } + + if (use_1g && !(less_than_1g && use_2m)) + { + rv = + mount ("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, "pagesize=1G"); + } + else if (use_2m) + { + rv = mount ("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, NULL); + } + else + { + return clib_error_return (0, "not enough free huge pages"); + } + + if (rv) + { + error = clib_error_return (0, "mount failed %d", errno); + goto done; + } + + tmp = format (0, "--huge-dir%c", 0); + vec_add1 (conf->eal_init_args, tmp); + tmp = format (0, "%s%c", DEFAULT_HUGE_DIR, 0); + vec_add1 (conf->eal_init_args, tmp); + if (!file_prefix) + { + tmp = format (0, "--file-prefix%c", 0); + vec_add1 (conf->eal_init_args, tmp); + tmp = format (0, "vpp%c", 0); + vec_add1 (conf->eal_init_args, tmp); + } + } + + vec_free (rte_cmd); + vec_free (ethname); + + if (error) + return error; + + /* I'll bet that -c and -n must be the first and second args... 
*/ + if (!conf->coremask_set_manually) + { + vlib_thread_registration_t *tr; + uword *coremask = 0; + int i; + + /* main thread core */ + coremask = clib_bitmap_set (coremask, tm->main_lcore, 1); + + for (i = 0; i < vec_len (tm->registrations); i++) + { + tr = tm->registrations[i]; + coremask = clib_bitmap_or (coremask, tr->coremask); + } + + vec_insert (conf->eal_init_args, 2, 1); + conf->eal_init_args[1] = (u8 *) "-c"; + tmp = format (0, "%U%c", format_bitmap_hex, coremask, 0); + conf->eal_init_args[2] = tmp; + clib_bitmap_free (coremask); + } + + if (!conf->nchannels_set_manually) + { + vec_insert (conf->eal_init_args, 2, 3); + conf->eal_init_args[3] = (u8 *) "-n"; + tmp = format (0, "%d", conf->nchannels); + conf->eal_init_args[4] = tmp; + } + + if (no_pci == 0 && geteuid () == 0) + dpdk_bind_devices_to_uio (conf); + +#define _(x) \ + if (devconf->x == 0 && conf->default_devconf.x > 0) \ + devconf->x = conf->default_devconf.x ; + + /* *INDENT-OFF* */ + pool_foreach (devconf, conf->dev_confs, ({ + + /* default per-device config items */ + foreach_dpdk_device_config_item + + /* add DPDK EAL whitelist/blacklist entry */ + if (num_whitelisted > 0 && devconf->is_blacklisted == 0) + { + tmp = format (0, "-w%c", 0); + vec_add1 (conf->eal_init_args, tmp); + tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0); + vec_add1 (conf->eal_init_args, tmp); + } + else if (num_whitelisted == 0 && devconf->is_blacklisted != 0) + { + tmp = format (0, "-b%c", 0); + vec_add1 (conf->eal_init_args, tmp); + tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0); + vec_add1 (conf->eal_init_args, tmp); + } + })); + /* *INDENT-ON* */ + +#undef _ + + /* set master-lcore */ + tmp = format (0, "--master-lcore%c", 0); + vec_add1 (conf->eal_init_args, tmp); + tmp = format (0, "%u%c", tm->main_lcore, 0); + vec_add1 (conf->eal_init_args, tmp); + + /* set socket-mem */ + tmp = format (0, "--socket-mem%c", 0); + vec_add1 (conf->eal_init_args, tmp); + tmp = format 
(0, "%s%c", socket_mem, 0); + vec_add1 (conf->eal_init_args, tmp); + + /* NULL terminate the "argv" vector, in case of stupidity */ + vec_add1 (conf->eal_init_args, 0); + _vec_len (conf->eal_init_args) -= 1; + + /* Set up DPDK eal and packet mbuf pool early. */ + + log_level = (CLIB_DEBUG > 0) ? RTE_LOG_DEBUG : RTE_LOG_NOTICE; + + rte_set_log_level (log_level); + + vm = vlib_get_main (); + + /* make copy of args as rte_eal_init tends to mess up with arg array */ + for (i = 1; i < vec_len (conf->eal_init_args); i++) + conf->eal_init_args_str = format (conf->eal_init_args_str, "%s ", + conf->eal_init_args[i]); + + ret = + rte_eal_init (vec_len (conf->eal_init_args), + (char **) conf->eal_init_args); + + /* lazy umount hugepages */ + umount2 (DEFAULT_HUGE_DIR, MNT_DETACH); + + if (ret < 0) + return clib_error_return (0, "rte_eal_init returned %d", ret); + + /* Dump the physical memory layout prior to creating the mbuf_pool */ + fprintf (stdout, "DPDK physical memory layout:\n"); + rte_dump_physmem_layout (stdout); + + /* main thread 1st */ + error = vlib_buffer_pool_create (vm, conf->num_mbufs, rte_socket_id ()); + if (error) + return error; + + for (i = 0; i < RTE_MAX_LCORE; i++) + { + error = vlib_buffer_pool_create (vm, conf->num_mbufs, + rte_lcore_to_socket_id (i)); + if (error) + return error; + } + +done: + return error; +} + +VLIB_CONFIG_FUNCTION (dpdk_config, "dpdk"); + +void +dpdk_update_link_state (dpdk_device_t * xd, f64 now) +{ + vnet_main_t *vnm = vnet_get_main (); + struct rte_eth_link prev_link = xd->link; + u32 hw_flags = 0; + u8 hw_flags_chg = 0; + + /* only update link state for PMD interfaces */ + if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0) + return; + + xd->time_last_link_update = now ? 
now : xd->time_last_link_update; + memset (&xd->link, 0, sizeof (xd->link)); + rte_eth_link_get_nowait (xd->device_index, &xd->link); + + if (LINK_STATE_ELOGS) + { + vlib_main_t *vm = vlib_get_main (); + ELOG_TYPE_DECLARE (e) = + { + .format = + "update-link-state: sw_if_index %d, admin_up %d," + "old link_state %d new link_state %d",.format_args = "i4i1i1i1",}; + + struct + { + u32 sw_if_index; + u8 admin_up; + u8 old_link_state; + u8 new_link_state; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->sw_if_index = xd->vlib_sw_if_index; + ed->admin_up = (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0; + ed->old_link_state = (u8) + vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index); + ed->new_link_state = (u8) xd->link.link_status; + } + + if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) && + ((xd->link.link_status != 0) ^ + vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index))) + { + hw_flags_chg = 1; + hw_flags |= (xd->link.link_status ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0); + } + + if (hw_flags_chg || (xd->link.link_duplex != prev_link.link_duplex)) + { + hw_flags_chg = 1; + switch (xd->link.link_duplex) + { + case ETH_LINK_HALF_DUPLEX: + hw_flags |= VNET_HW_INTERFACE_FLAG_HALF_DUPLEX; + break; + case ETH_LINK_FULL_DUPLEX: + hw_flags |= VNET_HW_INTERFACE_FLAG_FULL_DUPLEX; + break; + default: + break; + } + } + if (hw_flags_chg || (xd->link.link_speed != prev_link.link_speed)) + { + hw_flags_chg = 1; + switch (xd->link.link_speed) + { + case ETH_SPEED_NUM_10M: + hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10M; + break; + case ETH_SPEED_NUM_100M: + hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_100M; + break; + case ETH_SPEED_NUM_1G: + hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_1G; + break; + case ETH_SPEED_NUM_10G: + hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10G; + break; + case ETH_SPEED_NUM_40G: + hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_40G; + break; + case 0: + break; + default: + clib_warning ("unknown link speed %d", xd->link.link_speed); + break; + } + } + if 
(hw_flags_chg) + { + if (LINK_STATE_ELOGS) + { + vlib_main_t *vm = vlib_get_main (); + + ELOG_TYPE_DECLARE (e) = + { + .format = + "update-link-state: sw_if_index %d, new flags %d",.format_args + = "i4i4",}; + + struct + { + u32 sw_if_index; + u32 flags; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->sw_if_index = xd->vlib_sw_if_index; + ed->flags = hw_flags; + } + vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, hw_flags); + } +} + +static uword +dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + clib_error_t *error; + vnet_main_t *vnm = vnet_get_main (); + dpdk_main_t *dm = &dpdk_main; + ethernet_main_t *em = ðernet_main; + dpdk_device_t *xd; + vlib_thread_main_t *tm = vlib_get_thread_main (); + int i; + + error = dpdk_lib_init (dm); + + /* + * Turn on the input node if we found some devices to drive + * and we're not running worker threads or i/o threads + */ + + if (error == 0 && vec_len (dm->devices) > 0) + { + if (tm->n_vlib_mains == 1) + vlib_node_set_state (vm, dpdk_input_node.index, + VLIB_NODE_STATE_POLLING); + else + for (i = 0; i < tm->n_vlib_mains; i++) + if (vec_len (dm->devices_by_cpu[i]) > 0) + vlib_node_set_state (vlib_mains[i], dpdk_input_node.index, + VLIB_NODE_STATE_POLLING); + } + + if (error) + clib_error_report (error); + + tm->worker_thread_release = 1; + + f64 now = vlib_time_now (vm); + vec_foreach (xd, dm->devices) + { + dpdk_update_link_state (xd, now); + } + + { + /* + * Extra set up for bond interfaces: + * 1. Setup MACs for bond interfaces and their slave links which was set + * in dpdk_port_setup() but needs to be done again here to take effect. + * 2. Set up info for bond interface related CLI support. 
+ */ + int nports = rte_eth_dev_count (); + if (nports > 0) + { + for (i = 0; i < nports; i++) + { + struct rte_eth_dev_info dev_info; + rte_eth_dev_info_get (i, &dev_info); + if (!dev_info.driver_name) + dev_info.driver_name = dev_info.pci_dev->driver->driver.name; + + ASSERT (dev_info.driver_name); + if (strncmp (dev_info.driver_name, "rte_bond_pmd", 12) == 0) + { + u8 addr[6]; + u8 slink[16]; + int nlink = rte_eth_bond_slaves_get (i, slink, 16); + if (nlink > 0) + { + vnet_hw_interface_t *bhi; + ethernet_interface_t *bei; + int rv; + + /* Get MAC of 1st slave link */ + rte_eth_macaddr_get (slink[0], + (struct ether_addr *) addr); + /* Set MAC of bounded interface to that of 1st slave link */ + rv = + rte_eth_bond_mac_address_set (i, + (struct ether_addr *) + addr); + if (rv < 0) + clib_warning ("Failed to set MAC address"); + + /* Populate MAC of bonded interface in VPP hw tables */ + bhi = + vnet_get_hw_interface (vnm, + dm->devices[i].vlib_hw_if_index); + bei = + pool_elt_at_index (em->interfaces, bhi->hw_instance); + clib_memcpy (bhi->hw_address, addr, 6); + clib_memcpy (bei->address, addr, 6); + /* Init l3 packet size allowed on bonded interface */ + bhi->max_packet_bytes = ETHERNET_MAX_PACKET_BYTES; + bhi->max_l3_packet_bytes[VLIB_RX] = + bhi->max_l3_packet_bytes[VLIB_TX] = + ETHERNET_MAX_PACKET_BYTES - sizeof (ethernet_header_t); + while (nlink >= 1) + { /* for all slave links */ + int slave = slink[--nlink]; + dpdk_device_t *sdev = &dm->devices[slave]; + vnet_hw_interface_t *shi; + vnet_sw_interface_t *ssi; + /* Add MAC to all slave links except the first one */ + if (nlink) + rte_eth_dev_mac_addr_add (slave, + (struct ether_addr *) + addr, 0); + /* Set slaves bitmap for bonded interface */ + bhi->bond_info = + clib_bitmap_set (bhi->bond_info, + sdev->vlib_hw_if_index, 1); + /* Set slave link flags on slave interface */ + shi = + vnet_get_hw_interface (vnm, sdev->vlib_hw_if_index); + ssi = + vnet_get_sw_interface (vnm, sdev->vlib_sw_if_index); + 
shi->bond_info = VNET_HW_INTERFACE_BOND_INFO_SLAVE; + ssi->flags |= VNET_SW_INTERFACE_FLAG_BOND_SLAVE; + + /* Set l3 packet size allowed as the lowest of slave */ + if (bhi->max_l3_packet_bytes[VLIB_RX] > + shi->max_l3_packet_bytes[VLIB_RX]) + bhi->max_l3_packet_bytes[VLIB_RX] = + bhi->max_l3_packet_bytes[VLIB_TX] = + shi->max_l3_packet_bytes[VLIB_RX]; + + /* Set max packet size allowed as the lowest of slave */ + if (bhi->max_packet_bytes > shi->max_packet_bytes) + bhi->max_packet_bytes = shi->max_packet_bytes; + } + } + } + } + } + } + + while (1) + { + /* + * check each time through the loop in case intervals are changed + */ + f64 min_wait = dm->link_state_poll_interval < dm->stat_poll_interval ? + dm->link_state_poll_interval : dm->stat_poll_interval; + + vlib_process_wait_for_event_or_clock (vm, min_wait); + + if (dm->admin_up_down_in_progress) + /* skip the poll if an admin up down is in progress (on any interface) */ + continue; + + vec_foreach (xd, dm->devices) + { + f64 now = vlib_time_now (vm); + if ((now - xd->time_last_stats_update) >= dm->stat_poll_interval) + dpdk_update_counters (xd, now); + if ((now - xd->time_last_link_update) >= dm->link_state_poll_interval) + dpdk_update_link_state (xd, now); + + } + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (dpdk_process_node,static) = { + .function = dpdk_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "dpdk-process", + .process_log2_n_stack_bytes = 17, +}; +/* *INDENT-ON* */ + +int +dpdk_set_stat_poll_interval (f64 interval) +{ + if (interval < DPDK_MIN_STATS_POLL_INTERVAL) + return (VNET_API_ERROR_INVALID_VALUE); + + dpdk_main.stat_poll_interval = interval; + + return 0; +} + +int +dpdk_set_link_state_poll_interval (f64 interval) +{ + if (interval < DPDK_MIN_LINK_POLL_INTERVAL) + return (VNET_API_ERROR_INVALID_VALUE); + + dpdk_main.link_state_poll_interval = interval; + + return 0; +} + +clib_error_t * +dpdk_init (vlib_main_t * vm) +{ + dpdk_main_t *dm = &dpdk_main; + vlib_node_t *ei; 
+ clib_error_t *error = 0; + vlib_thread_main_t *tm = vlib_get_thread_main (); + + /* verify that structs are cacheline aligned */ + STATIC_ASSERT (offsetof (dpdk_device_t, cacheline0) == 0, + "Cache line marker must be 1st element in dpdk_device_t"); + STATIC_ASSERT (offsetof (dpdk_device_t, cacheline1) == + CLIB_CACHE_LINE_BYTES, + "Data in cache line 0 is bigger than cache line size"); + STATIC_ASSERT (offsetof (frame_queue_trace_t, cacheline0) == 0, + "Cache line marker must be 1st element in frame_queue_trace_t"); + + u8 *name; + name = format (0, "dpdk_%08x%c", api_version, 0); + + /* Ask for a correctly-sized block of API message decode slots */ + dm->msg_id_base = vl_msg_api_get_msg_ids + ((char *) name, VL_MSG_FIRST_AVAILABLE); + vec_free (name); + + dm->vlib_main = vm; + dm->vnet_main = vnet_get_main (); + dm->conf = &dpdk_config_main; + + error = dpdk_plugin_api_hookup (vm); + + /* Add our API messages to the global name_crc hash table */ + setup_message_id_table (dm, &api_main); + +// TODO +// plugin_custom_dump_configure (dm); + + ei = vlib_get_node_by_name (vm, (u8 *) "ethernet-input"); + if (ei == 0) + return clib_error_return (0, "ethernet-input node AWOL"); + + dm->ethernet_input_node_index = ei->index; + + dm->conf->nchannels = 4; + dm->conf->num_mbufs = dm->conf->num_mbufs ? dm->conf->num_mbufs : NB_MBUF; + vec_add1 (dm->conf->eal_init_args, (u8 *) "vnet"); + + dm->dpdk_device_by_kni_port_id = hash_create (0, sizeof (uword)); + dm->vu_sw_if_index_by_listener_fd = hash_create (0, sizeof (uword)); + dm->vu_sw_if_index_by_sock_fd = hash_create (0, sizeof (uword)); + + /* $$$ use n_thread_stacks since it's known-good at this point */ + vec_validate (dm->recycle, tm->n_thread_stacks - 1); + + /* Default vlib_buffer_t flags, DISABLES tcp/udp checksumming... 
*/ + dm->buffer_flags_template = + (VLIB_BUFFER_TOTAL_LENGTH_VALID | VLIB_BUFFER_EXT_HDR_VALID + | IP_BUFFER_L4_CHECKSUM_COMPUTED | IP_BUFFER_L4_CHECKSUM_CORRECT); + + dm->stat_poll_interval = DPDK_STATS_POLL_INTERVAL; + dm->link_state_poll_interval = DPDK_LINK_POLL_INTERVAL; + + /* init CLI */ + if ((error = vlib_call_init_function (vm, dpdk_cli_init))) + return error; + + return error; +} + +VLIB_INIT_FUNCTION (dpdk_init); + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/init.c b/src/plugins/dpdk/init.c deleted file mode 100755 index e009ef3e..00000000 --- a/src/plugins/dpdk/init.c +++ /dev/null @@ -1,2074 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -dpdk_main_t dpdk_main; - -#include -#include - -/* define message IDs */ -#include - -#define vl_typedefs /* define message structures */ -#include -#undef vl_typedefs - -#define vl_endianfun /* define message structures */ -#include -#undef vl_endianfun - -#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) - -/* Get the API version number. 
*/ -#define vl_api_version(n,v) static u32 api_version=(v); -#include -#undef vl_api_version - -/* Macro to finish up custom dump fns */ -#define FINISH \ - vec_add1 (s, 0); \ - vl_print (handle, (char *)s); \ - vec_free (s); \ - return handle; - -#include - -static void - vl_api_sw_interface_set_dpdk_hqos_pipe_t_handler - (vl_api_sw_interface_set_dpdk_hqos_pipe_t * mp) -{ - vl_api_sw_interface_set_dpdk_hqos_pipe_reply_t *rmp; - int rv = 0; - - dpdk_main_t *dm = &dpdk_main; - dpdk_device_t *xd; - - u32 sw_if_index = ntohl (mp->sw_if_index); - u32 subport = ntohl (mp->subport); - u32 pipe = ntohl (mp->pipe); - u32 profile = ntohl (mp->profile); - vnet_hw_interface_t *hw; - - VALIDATE_SW_IF_INDEX (mp); - - /* hw_if & dpdk device */ - hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); - - xd = vec_elt_at_index (dm->devices, hw->dev_instance); - - rv = rte_sched_pipe_config (xd->hqos_ht->hqos, subport, pipe, profile); - - BAD_SW_IF_INDEX_LABEL; - - REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_PIPE_REPLY); -} - -static void *vl_api_sw_interface_set_dpdk_hqos_pipe_t_print - (vl_api_sw_interface_set_dpdk_hqos_pipe_t * mp, void *handle) -{ - u8 *s; - - s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_pipe "); - - s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); - - s = format (s, "subport %u pipe %u profile %u ", - ntohl (mp->subport), ntohl (mp->pipe), ntohl (mp->profile)); - - FINISH; -} - -static void - vl_api_sw_interface_set_dpdk_hqos_subport_t_handler - (vl_api_sw_interface_set_dpdk_hqos_subport_t * mp) -{ - vl_api_sw_interface_set_dpdk_hqos_subport_reply_t *rmp; - int rv = 0; - - dpdk_main_t *dm = &dpdk_main; - dpdk_device_t *xd; - struct rte_sched_subport_params p; - - u32 sw_if_index = ntohl (mp->sw_if_index); - u32 subport = ntohl (mp->subport); - p.tb_rate = ntohl (mp->tb_rate); - p.tb_size = ntohl (mp->tb_size); - p.tc_rate[0] = ntohl (mp->tc_rate[0]); - p.tc_rate[1] = ntohl (mp->tc_rate[1]); - p.tc_rate[2] = ntohl (mp->tc_rate[2]); - 
p.tc_rate[3] = ntohl (mp->tc_rate[3]); - p.tc_period = ntohl (mp->tc_period); - - vnet_hw_interface_t *hw; - - VALIDATE_SW_IF_INDEX (mp); - - /* hw_if & dpdk device */ - hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); - - xd = vec_elt_at_index (dm->devices, hw->dev_instance); - - rv = rte_sched_subport_config (xd->hqos_ht->hqos, subport, &p); - - BAD_SW_IF_INDEX_LABEL; - - REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_SUBPORT_REPLY); -} - -static void *vl_api_sw_interface_set_dpdk_hqos_subport_t_print - (vl_api_sw_interface_set_dpdk_hqos_subport_t * mp, void *handle) -{ - u8 *s; - - s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_subport "); - - s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); - - s = - format (s, - "subport %u rate %u bkt_size %u tc0 %u tc1 %u tc2 %u tc3 %u period %u", - ntohl (mp->subport), ntohl (mp->tb_rate), ntohl (mp->tb_size), - ntohl (mp->tc_rate[0]), ntohl (mp->tc_rate[1]), - ntohl (mp->tc_rate[2]), ntohl (mp->tc_rate[3]), - ntohl (mp->tc_period)); - - FINISH; -} - -static void - vl_api_sw_interface_set_dpdk_hqos_tctbl_t_handler - (vl_api_sw_interface_set_dpdk_hqos_tctbl_t * mp) -{ - vl_api_sw_interface_set_dpdk_hqos_tctbl_reply_t *rmp; - int rv = 0; - - dpdk_main_t *dm = &dpdk_main; - vlib_thread_main_t *tm = vlib_get_thread_main (); - dpdk_device_t *xd; - - u32 sw_if_index = ntohl (mp->sw_if_index); - u32 entry = ntohl (mp->entry); - u32 tc = ntohl (mp->tc); - u32 queue = ntohl (mp->queue); - u32 val, i; - - vnet_hw_interface_t *hw; - - VALIDATE_SW_IF_INDEX (mp); - - /* hw_if & dpdk device */ - hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); - - xd = vec_elt_at_index (dm->devices, hw->dev_instance); - - if (tc >= RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE) - { - clib_warning ("invalid traffic class !!"); - rv = VNET_API_ERROR_INVALID_VALUE; - goto done; - } - if (queue >= RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS) - { - clib_warning ("invalid queue !!"); - rv = VNET_API_ERROR_INVALID_VALUE; - goto done; - } - - 
/* Detect the set of worker threads */ - uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers"); - - if (p == 0) - { - clib_warning ("worker thread registration AWOL !!"); - rv = VNET_API_ERROR_INVALID_VALUE_2; - goto done; - } - - vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0]; - int worker_thread_first = tr->first_index; - int worker_thread_count = tr->count; - - val = tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue; - for (i = 0; i < worker_thread_count; i++) - xd->hqos_wt[worker_thread_first + i].hqos_tc_table[entry] = val; - - BAD_SW_IF_INDEX_LABEL; -done: - - REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_TCTBL_REPLY); -} - -static void *vl_api_sw_interface_set_dpdk_hqos_tctbl_t_print - (vl_api_sw_interface_set_dpdk_hqos_tctbl_t * mp, void *handle) -{ - u8 *s; - - s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_tctbl "); - - s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); - - s = format (s, "entry %u tc %u queue %u", - ntohl (mp->entry), ntohl (mp->tc), ntohl (mp->queue)); - - FINISH; -} - -#define foreach_dpdk_plugin_api_msg \ -_(SW_INTERFACE_SET_DPDK_HQOS_PIPE, sw_interface_set_dpdk_hqos_pipe) \ -_(SW_INTERFACE_SET_DPDK_HQOS_SUBPORT, sw_interface_set_dpdk_hqos_subport) \ -_(SW_INTERFACE_SET_DPDK_HQOS_TCTBL, sw_interface_set_dpdk_hqos_tctbl) - -/* Set up the API message handling tables */ -static clib_error_t * -dpdk_plugin_api_hookup (vlib_main_t * vm) -{ - dpdk_main_t *dm __attribute__ ((unused)) = &dpdk_main; -#define _(N,n) \ - vl_msg_api_set_handlers((VL_API_##N + dm->msg_id_base), \ - #n, \ - vl_api_##n##_t_handler, \ - vl_noop_handler, \ - vl_api_##n##_t_endian, \ - vl_api_##n##_t_print, \ - sizeof(vl_api_##n##_t), 1); - foreach_dpdk_plugin_api_msg; -#undef _ - return 0; -} - -#define vl_msg_name_crc_list -#include -#undef vl_msg_name_crc_list - -static void -setup_message_id_table (dpdk_main_t * dm, api_main_t * am) -{ -#define _(id,n,crc) \ - vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + 
dm->msg_id_base); - foreach_vl_msg_name_crc_dpdk; -#undef _ -} - -// TODO -/* -static void plugin_custom_dump_configure (dpdk_main_t * dm) -{ -#define _(n,f) dm->api_main->msg_print_handlers \ - [VL_API_##n + dm->msg_id_base] \ - = (void *) vl_api_##f##_t_print; - foreach_dpdk_plugin_api_msg; -#undef _ -} -*/ -/* force linker to link functions used by vlib and declared weak */ -void *vlib_weakly_linked_functions[] = { - &rte_pktmbuf_init, - &rte_pktmbuf_pool_init, -}; - -#define LINK_STATE_ELOGS 0 - -#define DEFAULT_HUGE_DIR "/run/vpp/hugepages" -#define VPP_RUN_DIR "/run/vpp" - -/* Port configuration, mildly modified Intel app values */ - -static struct rte_eth_conf port_conf_template = { - .rxmode = { - .split_hdr_size = 0, - .header_split = 0, /**< Header Split disabled */ - .hw_ip_checksum = 0, /**< IP checksum offload disabled */ - .hw_vlan_filter = 0, /**< VLAN filtering disabled */ - .hw_strip_crc = 0, /**< CRC stripped by hardware */ - }, - .txmode = { - .mq_mode = ETH_MQ_TX_NONE, - }, -}; - -clib_error_t * -dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) -{ - int rv; - int j; - - ASSERT (os_get_cpu_number () == 0); - - if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) - { - vnet_hw_interface_set_flags (dm->vnet_main, xd->vlib_hw_if_index, 0); - rte_eth_dev_stop (xd->device_index); - } - - rv = rte_eth_dev_configure (xd->device_index, xd->rx_q_used, - xd->tx_q_used, &xd->port_conf); - - if (rv < 0) - return clib_error_return (0, "rte_eth_dev_configure[%d]: err %d", - xd->device_index, rv); - - /* Set up one TX-queue per worker thread */ - for (j = 0; j < xd->tx_q_used; j++) - { - rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc, - xd->cpu_socket, &xd->tx_conf); - - /* retry with any other CPU socket */ - if (rv < 0) - rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc, - SOCKET_ID_ANY, &xd->tx_conf); - if (rv < 0) - break; - } - - if (rv < 0) - return clib_error_return (0, "rte_eth_tx_queue_setup[%d]: err %d", - 
xd->device_index, rv); - - for (j = 0; j < xd->rx_q_used; j++) - { - - rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, - xd->cpu_socket, 0, - dm-> - pktmbuf_pools[xd->cpu_socket_id_by_queue - [j]]); - - /* retry with any other CPU socket */ - if (rv < 0) - rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, - SOCKET_ID_ANY, 0, - dm-> - pktmbuf_pools[xd->cpu_socket_id_by_queue - [j]]); - if (rv < 0) - return clib_error_return (0, "rte_eth_rx_queue_setup[%d]: err %d", - xd->device_index, rv); - } - - if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) - { - int rv; - rv = rte_eth_dev_start (xd->device_index); - if (!rv && xd->default_mac_address) - rv = rte_eth_dev_default_mac_addr_set (xd->device_index, - (struct ether_addr *) - xd->default_mac_address); - if (rv < 0) - clib_warning ("rte_eth_dev_start %d returned %d", - xd->device_index, rv); - } - return 0; -} - -static u32 -dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags) -{ - dpdk_main_t *dm = &dpdk_main; - dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance); - u32 old = 0; - - if (ETHERNET_INTERFACE_FLAG_CONFIG_PROMISC (flags)) - { - old = (xd->flags & DPDK_DEVICE_FLAG_PROMISC) != 0; - - if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) - xd->flags |= DPDK_DEVICE_FLAG_PROMISC; - else - xd->flags &= ~DPDK_DEVICE_FLAG_PROMISC; - - if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) - { - if (xd->flags & DPDK_DEVICE_FLAG_PROMISC) - rte_eth_promiscuous_enable (xd->device_index); - else - rte_eth_promiscuous_disable (xd->device_index); - } - } - else if (ETHERNET_INTERFACE_FLAG_CONFIG_MTU (flags)) - { - /* - * DAW-FIXME: The Cisco VIC firmware does not provide an api for a - * driver to dynamically change the mtu. If/when the - * VIC firmware gets fixed, then this should be removed. - */ - if (xd->pmd == VNET_DPDK_PMD_ENIC) - { - struct rte_eth_dev_info dev_info; - - /* - * Restore mtu to what has been set by CIMC in the firmware cfg. 
- */ - rte_eth_dev_info_get (xd->device_index, &dev_info); - hi->max_packet_bytes = dev_info.max_rx_pktlen; - - vlib_cli_output (vlib_get_main (), - "Cisco VIC mtu can only be changed " - "using CIMC then rebooting the server!"); - } - else - { - int rv; - - xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes; - - if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) - rte_eth_dev_stop (xd->device_index); - - rv = rte_eth_dev_configure - (xd->device_index, xd->rx_q_used, xd->tx_q_used, &xd->port_conf); - - if (rv < 0) - vlib_cli_output (vlib_get_main (), - "rte_eth_dev_configure[%d]: err %d", - xd->device_index, rv); - - rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes); - - if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) - { - int rv = rte_eth_dev_start (xd->device_index); - if (!rv && xd->default_mac_address) - rv = rte_eth_dev_default_mac_addr_set (xd->device_index, - (struct ether_addr *) - xd->default_mac_address); - if (rv < 0) - clib_warning ("rte_eth_dev_start %d returned %d", - xd->device_index, rv); - } - } - } - return old; -} - -void -dpdk_device_lock_init (dpdk_device_t * xd) -{ - int q; - vec_validate (xd->lockp, xd->tx_q_used - 1); - for (q = 0; q < xd->tx_q_used; q++) - { - xd->lockp[q] = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, - CLIB_CACHE_LINE_BYTES); - memset ((void *) xd->lockp[q], 0, CLIB_CACHE_LINE_BYTES); - } -} - -void -dpdk_device_lock_free (dpdk_device_t * xd) -{ - int q; - - for (q = 0; q < vec_len (xd->lockp); q++) - clib_mem_free ((void *) xd->lockp[q]); - vec_free (xd->lockp); - xd->lockp = 0; -} - -static clib_error_t * -dpdk_lib_init (dpdk_main_t * dm) -{ - u32 nports; - u32 nb_desc = 0; - int i; - clib_error_t *error; - vlib_main_t *vm = vlib_get_main (); - vlib_thread_main_t *tm = vlib_get_thread_main (); - vnet_sw_interface_t *sw; - vnet_hw_interface_t *hi; - dpdk_device_t *xd; - vlib_pci_addr_t last_pci_addr; - u32 last_pci_addr_port = 0; - vlib_thread_registration_t *tr, *tr_hqos; - uword *p, *p_hqos; - - u32 next_cpu = 
0, next_hqos_cpu = 0; - u8 af_packet_port_id = 0; - last_pci_addr.as_u32 = ~0; - - dm->input_cpu_first_index = 0; - dm->input_cpu_count = 1; - - /* find out which cpus will be used for input */ - p = hash_get_mem (tm->thread_registrations_by_name, "workers"); - tr = p ? (vlib_thread_registration_t *) p[0] : 0; - - if (tr && tr->count > 0) - { - dm->input_cpu_first_index = tr->first_index; - dm->input_cpu_count = tr->count; - } - - vec_validate_aligned (dm->devices_by_cpu, tm->n_vlib_mains - 1, - CLIB_CACHE_LINE_BYTES); - - dm->hqos_cpu_first_index = 0; - dm->hqos_cpu_count = 0; - - /* find out which cpus will be used for I/O TX */ - p_hqos = hash_get_mem (tm->thread_registrations_by_name, "hqos-threads"); - tr_hqos = p_hqos ? (vlib_thread_registration_t *) p_hqos[0] : 0; - - if (tr_hqos && tr_hqos->count > 0) - { - dm->hqos_cpu_first_index = tr_hqos->first_index; - dm->hqos_cpu_count = tr_hqos->count; - } - - vec_validate_aligned (dm->devices_by_hqos_cpu, tm->n_vlib_mains - 1, - CLIB_CACHE_LINE_BYTES); - - nports = rte_eth_dev_count (); - if (nports < 1) - { - clib_warning ("DPDK drivers found no ports..."); - } - - if (CLIB_DEBUG > 0) - clib_warning ("DPDK drivers found %d ports...", nports); - - /* - * All buffers are all allocated from the same rte_mempool. - * Thus they all have the same number of data bytes. 
- */ - dm->vlib_buffer_free_list_index = - vlib_buffer_get_or_create_free_list (vm, - VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, - "dpdk rx"); - - if (dm->conf->enable_tcp_udp_checksum) - dm->buffer_flags_template &= ~(IP_BUFFER_L4_CHECKSUM_CORRECT - | IP_BUFFER_L4_CHECKSUM_COMPUTED); - - for (i = 0; i < nports; i++) - { - u8 addr[6]; - u8 vlan_strip = 0; - int j; - struct rte_eth_dev_info dev_info; - clib_error_t *rv; - struct rte_eth_link l; - dpdk_device_config_t *devconf = 0; - vlib_pci_addr_t pci_addr; - uword *p = 0; - - rte_eth_dev_info_get (i, &dev_info); - if (dev_info.pci_dev) /* bonded interface has no pci info */ - { - pci_addr.domain = dev_info.pci_dev->addr.domain; - pci_addr.bus = dev_info.pci_dev->addr.bus; - pci_addr.slot = dev_info.pci_dev->addr.devid; - pci_addr.function = dev_info.pci_dev->addr.function; - p = - hash_get (dm->conf->device_config_index_by_pci_addr, - pci_addr.as_u32); - } - - if (p) - devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]); - else - devconf = &dm->conf->default_devconf; - - /* Create vnet interface */ - vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES); - xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT; - xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT; - xd->cpu_socket = (i8) rte_eth_dev_socket_id (i); - - /* Handle interface naming for devices with multiple ports sharing same PCI ID */ - if (dev_info.pci_dev) - { - struct rte_eth_dev_info di = { 0 }; - rte_eth_dev_info_get (i + 1, &di); - if (di.pci_dev && pci_addr.as_u32 != last_pci_addr.as_u32 && - memcmp (&dev_info.pci_dev->addr, &di.pci_dev->addr, - sizeof (struct rte_pci_addr)) == 0) - { - xd->interface_name_suffix = format (0, "0"); - last_pci_addr.as_u32 = pci_addr.as_u32; - last_pci_addr_port = i; - } - else if (pci_addr.as_u32 == last_pci_addr.as_u32) - { - xd->interface_name_suffix = - format (0, "%u", i - last_pci_addr_port); - } - else - { - last_pci_addr.as_u32 = ~0; - } - } - else - last_pci_addr.as_u32 = ~0; - - clib_memcpy (&xd->tx_conf, 
&dev_info.default_txconf, - sizeof (struct rte_eth_txconf)); - if (dm->conf->no_multi_seg) - { - xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS; - port_conf_template.rxmode.jumbo_frame = 0; - } - else - { - xd->tx_conf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS; - port_conf_template.rxmode.jumbo_frame = 1; - xd->flags |= DPDK_DEVICE_FLAG_MAYBE_MULTISEG; - } - - clib_memcpy (&xd->port_conf, &port_conf_template, - sizeof (struct rte_eth_conf)); - - xd->tx_q_used = clib_min (dev_info.max_tx_queues, tm->n_vlib_mains); - - if (devconf->num_tx_queues > 0 - && devconf->num_tx_queues < xd->tx_q_used) - xd->tx_q_used = clib_min (xd->tx_q_used, devconf->num_tx_queues); - - if (devconf->num_rx_queues > 1 && dm->use_rss == 0) - { - dm->use_rss = 1; - } - - if (devconf->num_rx_queues > 1 - && dev_info.max_rx_queues >= devconf->num_rx_queues) - { - xd->rx_q_used = devconf->num_rx_queues; - xd->port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; - if (devconf->rss_fn == 0) - xd->port_conf.rx_adv_conf.rss_conf.rss_hf = - ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP; - else - xd->port_conf.rx_adv_conf.rss_conf.rss_hf = devconf->rss_fn; - } - else - xd->rx_q_used = 1; - - xd->flags |= DPDK_DEVICE_FLAG_PMD; - - /* workaround for drivers not setting driver_name */ - if ((!dev_info.driver_name) && (dev_info.pci_dev)) - dev_info.driver_name = dev_info.pci_dev->driver->driver.name; - - ASSERT (dev_info.driver_name); - - if (!xd->pmd) - { - - -#define _(s,f) else if (dev_info.driver_name && \ - !strcmp(dev_info.driver_name, s)) \ - xd->pmd = VNET_DPDK_PMD_##f; - if (0) - ; - foreach_dpdk_pmd -#undef _ - else - xd->pmd = VNET_DPDK_PMD_UNKNOWN; - - xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; - xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT; - xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT; - - switch (xd->pmd) - { - /* 1G adapters */ - case VNET_DPDK_PMD_E1000EM: - case VNET_DPDK_PMD_IGB: - case VNET_DPDK_PMD_IGBVF: - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G; - break; - - /* 10G adapters */ - case 
VNET_DPDK_PMD_IXGBE: - case VNET_DPDK_PMD_IXGBEVF: - case VNET_DPDK_PMD_THUNDERX: - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; - break; - case VNET_DPDK_PMD_DPAA2: - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; - break; - - /* Cisco VIC */ - case VNET_DPDK_PMD_ENIC: - rte_eth_link_get_nowait (i, &l); - xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE; - if (l.link_speed == 40000) - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; - else - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; - break; - - /* Intel Fortville */ - case VNET_DPDK_PMD_I40E: - case VNET_DPDK_PMD_I40EVF: - xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE; - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; - - switch (dev_info.pci_dev->id.device_id) - { - case I40E_DEV_ID_10G_BASE_T: - case I40E_DEV_ID_SFP_XL710: - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; - break; - case I40E_DEV_ID_QSFP_A: - case I40E_DEV_ID_QSFP_B: - case I40E_DEV_ID_QSFP_C: - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; - break; - case I40E_DEV_ID_VF: - rte_eth_link_get_nowait (i, &l); - xd->port_type = l.link_speed == 10000 ? 
- VNET_DPDK_PORT_TYPE_ETH_10G : VNET_DPDK_PORT_TYPE_ETH_40G; - break; - default: - xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; - } - break; - - case VNET_DPDK_PMD_CXGBE: - switch (dev_info.pci_dev->id.device_id) - { - case 0x540d: /* T580-CR */ - case 0x5410: /* T580-LP-cr */ - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; - break; - case 0x5403: /* T540-CR */ - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; - break; - default: - xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; - } - break; - - case VNET_DPDK_PMD_MLX5: - { - char *pn_100g[] = { "MCX415A-CCAT", "MCX416A-CCAT", 0 }; - char *pn_40g[] = { "MCX413A-BCAT", "MCX414A-BCAT", - "MCX415A-BCAT", "MCX416A-BCAT", "MCX4131A-BCAT", 0 - }; - char *pn_10g[] = { "MCX4111A-XCAT", "MCX4121A-XCAT", 0 }; - - vlib_pci_device_t *pd = vlib_get_pci_device (&pci_addr); - u8 *pn = 0; - char **c; - int found = 0; - pn = format (0, "%U%c", - format_vlib_pci_vpd, pd->vpd_r, "PN", 0); - - if (!pn) - break; - - c = pn_100g; - while (!found && c[0]) - { - if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0) - { - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_100G; - break; - } - c++; - } - - c = pn_40g; - while (!found && c[0]) - { - if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0) - { - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; - break; - } - c++; - } - - c = pn_10g; - while (!found && c[0]) - { - if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0) - { - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; - break; - } - c++; - } - - vec_free (pn); - } - - break; - /* Intel Red Rock Canyon */ - case VNET_DPDK_PMD_FM10K: - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_SWITCH; - break; - - /* virtio */ - case VNET_DPDK_PMD_VIRTIO: - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G; - xd->nb_rx_desc = DPDK_NB_RX_DESC_VIRTIO; - xd->nb_tx_desc = DPDK_NB_TX_DESC_VIRTIO; - break; - - /* vmxnet3 */ - case VNET_DPDK_PMD_VMXNET3: - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G; - xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS; - break; - - case 
VNET_DPDK_PMD_AF_PACKET: - xd->port_type = VNET_DPDK_PORT_TYPE_AF_PACKET; - xd->af_packet_port_id = af_packet_port_id++; - break; - - case VNET_DPDK_PMD_BOND: - xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE; - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_BOND; - break; - - default: - xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; - } - - if (devconf->num_rx_desc) - xd->nb_rx_desc = devconf->num_rx_desc; - - if (devconf->num_tx_desc) - xd->nb_tx_desc = devconf->num_tx_desc; - } - - /* - * Ensure default mtu is not > the mtu read from the hardware. - * Otherwise rte_eth_dev_configure() will fail and the port will - * not be available. - */ - if (ETHERNET_MAX_PACKET_BYTES > dev_info.max_rx_pktlen) - { - /* - * This device does not support the platforms's max frame - * size. Use it's advertised mru instead. - */ - xd->port_conf.rxmode.max_rx_pkt_len = dev_info.max_rx_pktlen; - } - else - { - xd->port_conf.rxmode.max_rx_pkt_len = ETHERNET_MAX_PACKET_BYTES; - - /* - * Some platforms do not account for Ethernet FCS (4 bytes) in - * MTU calculations. To interop with them increase mru but only - * if the device's settings can support it. - */ - if ((dev_info.max_rx_pktlen >= (ETHERNET_MAX_PACKET_BYTES + 4)) && - xd->port_conf.rxmode.hw_strip_crc) - { - /* - * Allow additional 4 bytes (for Ethernet FCS). These bytes are - * stripped by h/w and so will not consume any buffer memory. 
- */ - xd->port_conf.rxmode.max_rx_pkt_len += 4; - } - } - - if (xd->pmd == VNET_DPDK_PMD_AF_PACKET) - { - f64 now = vlib_time_now (vm); - u32 rnd; - rnd = (u32) (now * 1e6); - rnd = random_u32 (&rnd); - clib_memcpy (addr + 2, &rnd, sizeof (rnd)); - addr[0] = 2; - addr[1] = 0xfe; - } - else - rte_eth_macaddr_get (i, (struct ether_addr *) addr); - - if (xd->tx_q_used < tm->n_vlib_mains) - dpdk_device_lock_init (xd); - - xd->device_index = xd - dm->devices; - ASSERT (i == xd->device_index); - xd->per_interface_next_index = ~0; - - /* assign interface to input thread */ - dpdk_device_and_queue_t *dq; - int q; - - if (devconf->workers) - { - int i; - q = 0; - /* *INDENT-OFF* */ - clib_bitmap_foreach (i, devconf->workers, ({ - int cpu = dm->input_cpu_first_index + i; - unsigned lcore = vlib_worker_threads[cpu].lcore_id; - vec_validate(xd->cpu_socket_id_by_queue, q); - xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id(lcore); - vec_add2(dm->devices_by_cpu[cpu], dq, 1); - dq->device = xd->device_index; - dq->queue_id = q++; - })); - /* *INDENT-ON* */ - } - else - for (q = 0; q < xd->rx_q_used; q++) - { - int cpu = dm->input_cpu_first_index + next_cpu; - unsigned lcore = vlib_worker_threads[cpu].lcore_id; - - /* - * numa node for worker thread handling this queue - * needed for taking buffers from the right mempool - */ - vec_validate (xd->cpu_socket_id_by_queue, q); - xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id (lcore); - - /* - * construct vector of (device,queue) pairs for each worker thread - */ - vec_add2 (dm->devices_by_cpu[cpu], dq, 1); - dq->device = xd->device_index; - dq->queue_id = q; - - next_cpu++; - if (next_cpu == dm->input_cpu_count) - next_cpu = 0; - } - - - if (devconf->hqos_enabled) - { - xd->flags |= DPDK_DEVICE_FLAG_HQOS; - - if (devconf->hqos.hqos_thread_valid) - { - int cpu = dm->hqos_cpu_first_index + devconf->hqos.hqos_thread; - - if (devconf->hqos.hqos_thread >= dm->hqos_cpu_count) - return clib_error_return (0, "invalid HQoS 
thread index"); - - vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1); - dq->device = xd->device_index; - dq->queue_id = 0; - } - else - { - int cpu = dm->hqos_cpu_first_index + next_hqos_cpu; - - if (dm->hqos_cpu_count == 0) - return clib_error_return (0, "no HQoS threads available"); - - vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1); - dq->device = xd->device_index; - dq->queue_id = 0; - - next_hqos_cpu++; - if (next_hqos_cpu == dm->hqos_cpu_count) - next_hqos_cpu = 0; - - devconf->hqos.hqos_thread_valid = 1; - devconf->hqos.hqos_thread = cpu; - } - } - - vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains, - CLIB_CACHE_LINE_BYTES); - for (j = 0; j < tm->n_vlib_mains; j++) - { - vec_validate_ha (xd->tx_vectors[j], xd->nb_tx_desc, - sizeof (tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES); - vec_reset_length (xd->tx_vectors[j]); - } - - vec_validate_aligned (xd->rx_vectors, xd->rx_q_used, - CLIB_CACHE_LINE_BYTES); - for (j = 0; j < xd->rx_q_used; j++) - { - vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE - 1, - CLIB_CACHE_LINE_BYTES); - vec_reset_length (xd->rx_vectors[j]); - } - - vec_validate_aligned (xd->d_trace_buffers, tm->n_vlib_mains, - CLIB_CACHE_LINE_BYTES); - - rv = dpdk_port_setup (dm, xd); - - if (rv) - return rv; - - if (devconf->hqos_enabled) - { - rv = dpdk_port_setup_hqos (xd, &devconf->hqos); - if (rv) - return rv; - } - - /* count the number of descriptors used for this device */ - nb_desc += xd->nb_rx_desc + xd->nb_tx_desc * xd->tx_q_used; - - error = ethernet_register_interface - (dm->vnet_main, dpdk_device_class.index, xd->device_index, - /* ethernet address */ addr, - &xd->vlib_hw_if_index, dpdk_flag_change); - if (error) - return error; - - sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index); - xd->vlib_sw_if_index = sw->sw_if_index; - hi = vnet_get_hw_interface (dm->vnet_main, xd->vlib_hw_if_index); - - /* - * DAW-FIXME: The Cisco VIC firmware does not provide an api for a - * driver to dynamically change the mtu. 
If/when the - * VIC firmware gets fixed, then this should be removed. - */ - if (xd->pmd == VNET_DPDK_PMD_ENIC) - { - /* - * Initialize mtu to what has been set by CIMC in the firmware cfg. - */ - hi->max_packet_bytes = dev_info.max_rx_pktlen; - if (devconf->vlan_strip_offload != DPDK_DEVICE_VLAN_STRIP_OFF) - vlan_strip = 1; /* remove vlan tag from VIC port by default */ - else - clib_warning ("VLAN strip disabled for interface\n"); - } - else if (devconf->vlan_strip_offload == DPDK_DEVICE_VLAN_STRIP_ON) - vlan_strip = 1; - - if (vlan_strip) - { - int vlan_off; - vlan_off = rte_eth_dev_get_vlan_offload (xd->device_index); - vlan_off |= ETH_VLAN_STRIP_OFFLOAD; - xd->port_conf.rxmode.hw_vlan_strip = vlan_off; - if (rte_eth_dev_set_vlan_offload (xd->device_index, vlan_off) == 0) - clib_warning ("VLAN strip enabled for interface\n"); - else - clib_warning ("VLAN strip cannot be supported by interface\n"); - } - - hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] = - xd->port_conf.rxmode.max_rx_pkt_len - sizeof (ethernet_header_t); - - rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes); - } - - if (nb_desc > dm->conf->num_mbufs) - clib_warning ("%d mbufs allocated but total rx/tx ring size is %d\n", - dm->conf->num_mbufs, nb_desc); - - return 0; -} - -static void -dpdk_bind_devices_to_uio (dpdk_config_main_t * conf) -{ - vlib_pci_main_t *pm = &pci_main; - clib_error_t *error; - vlib_pci_device_t *d; - u8 *pci_addr = 0; - int num_whitelisted = vec_len (conf->dev_confs); - - /* *INDENT-OFF* */ - pool_foreach (d, pm->pci_devs, ({ - dpdk_device_config_t * devconf = 0; - vec_reset_length (pci_addr); - pci_addr = format (pci_addr, "%U%c", format_vlib_pci_addr, &d->bus_address, 0); - - if (d->device_class != PCI_CLASS_NETWORK_ETHERNET && d->device_class != PCI_CLASS_PROCESSOR_CO) - continue; - - if (num_whitelisted) - { - uword * p = hash_get (conf->device_config_index_by_pci_addr, d->bus_address.as_u32); - - if (!p) - continue; - - devconf = 
pool_elt_at_index (conf->dev_confs, p[0]); - } - - /* virtio */ - if (d->vendor_id == 0x1af4 && d->device_id == 0x1000) - ; - /* vmxnet3 */ - else if (d->vendor_id == 0x15ad && d->device_id == 0x07b0) - ; - /* all Intel devices */ - else if (d->vendor_id == 0x8086) - ; - /* Cisco VIC */ - else if (d->vendor_id == 0x1137 && d->device_id == 0x0043) - ; - /* Chelsio T4/T5 */ - else if (d->vendor_id == 0x1425 && (d->device_id & 0xe000) == 0x4000) - ; - else - { - clib_warning ("Unsupported Ethernet PCI device 0x%04x:0x%04x found " - "at PCI address %s\n", (u16) d->vendor_id, (u16) d->device_id, - pci_addr); - continue; - } - - error = vlib_pci_bind_to_uio (d, (char *) conf->uio_driver_name); - - if (error) - { - if (devconf == 0) - { - pool_get (conf->dev_confs, devconf); - hash_set (conf->device_config_index_by_pci_addr, d->bus_address.as_u32, - devconf - conf->dev_confs); - devconf->pci_addr.as_u32 = d->bus_address.as_u32; - } - devconf->is_blacklisted = 1; - clib_error_report (error); - } - })); - /* *INDENT-ON* */ - vec_free (pci_addr); -} - -static clib_error_t * -dpdk_device_config (dpdk_config_main_t * conf, vlib_pci_addr_t pci_addr, - unformat_input_t * input, u8 is_default) -{ - clib_error_t *error = 0; - uword *p; - dpdk_device_config_t *devconf; - unformat_input_t sub_input; - - if (is_default) - { - devconf = &conf->default_devconf; - } - else - { - p = hash_get (conf->device_config_index_by_pci_addr, pci_addr.as_u32); - - if (!p) - { - pool_get (conf->dev_confs, devconf); - hash_set (conf->device_config_index_by_pci_addr, pci_addr.as_u32, - devconf - conf->dev_confs); - } - else - return clib_error_return (0, - "duplicate configuration for PCI address %U", - format_vlib_pci_addr, &pci_addr); - } - - devconf->pci_addr.as_u32 = pci_addr.as_u32; - devconf->hqos_enabled = 0; - dpdk_device_config_hqos_default (&devconf->hqos); - - if (!input) - return 0; - - unformat_skip_white_space (input); - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - { - 
if (unformat (input, "num-rx-queues %u", &devconf->num_rx_queues)) - ; - else if (unformat (input, "num-tx-queues %u", &devconf->num_tx_queues)) - ; - else if (unformat (input, "num-rx-desc %u", &devconf->num_rx_desc)) - ; - else if (unformat (input, "num-tx-desc %u", &devconf->num_tx_desc)) - ; - else if (unformat (input, "workers %U", unformat_bitmap_list, - &devconf->workers)) - ; - else - if (unformat - (input, "rss %U", unformat_vlib_cli_sub_input, &sub_input)) - { - error = unformat_rss_fn (&sub_input, &devconf->rss_fn); - if (error) - break; - } - else if (unformat (input, "vlan-strip-offload off")) - devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_OFF; - else if (unformat (input, "vlan-strip-offload on")) - devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_ON; - else - if (unformat - (input, "hqos %U", unformat_vlib_cli_sub_input, &sub_input)) - { - devconf->hqos_enabled = 1; - error = unformat_hqos (&sub_input, &devconf->hqos); - if (error) - break; - } - else if (unformat (input, "hqos")) - { - devconf->hqos_enabled = 1; - } - else - { - error = clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); - break; - } - } - - if (error) - return error; - - if (devconf->workers && devconf->num_rx_queues == 0) - devconf->num_rx_queues = clib_bitmap_count_set_bits (devconf->workers); - else if (devconf->workers && - clib_bitmap_count_set_bits (devconf->workers) != - devconf->num_rx_queues) - error = - clib_error_return (0, - "%U: number of worker threadds must be " - "equal to number of rx queues", format_vlib_pci_addr, - &pci_addr); - - return error; -} - -static clib_error_t * -dpdk_config (vlib_main_t * vm, unformat_input_t * input) -{ - clib_error_t *error = 0; - dpdk_main_t *dm = &dpdk_main; - dpdk_config_main_t *conf = &dpdk_config_main; - vlib_thread_main_t *tm = vlib_get_thread_main (); - dpdk_device_config_t *devconf; - vlib_pci_addr_t pci_addr; - unformat_input_t sub_input; - u8 *s, *tmp = 0; - u8 *rte_cmd = 0, *ethname = 0; 
- u32 log_level; - int ret, i; - int num_whitelisted = 0; - u8 no_pci = 0; - u8 no_huge = 0; - u8 huge_dir = 0; - u8 file_prefix = 0; - u8 *socket_mem = 0; - - conf->device_config_index_by_pci_addr = hash_create (0, sizeof (uword)); - - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - { - /* Prime the pump */ - if (unformat (input, "no-hugetlb")) - { - vec_add1 (conf->eal_init_args, (u8 *) "no-huge"); - no_huge = 1; - } - - else if (unformat (input, "enable-tcp-udp-checksum")) - conf->enable_tcp_udp_checksum = 1; - - else if (unformat (input, "decimal-interface-names")) - conf->interface_name_format_decimal = 1; - - else if (unformat (input, "no-multi-seg")) - conf->no_multi_seg = 1; - - else if (unformat (input, "enable-cryptodev")) - conf->cryptodev = 1; - - else if (unformat (input, "dev default %U", unformat_vlib_cli_sub_input, - &sub_input)) - { - error = - dpdk_device_config (conf, (vlib_pci_addr_t) (u32) ~ 1, &sub_input, - 1); - - if (error) - return error; - } - else - if (unformat - (input, "dev %U %U", unformat_vlib_pci_addr, &pci_addr, - unformat_vlib_cli_sub_input, &sub_input)) - { - error = dpdk_device_config (conf, pci_addr, &sub_input, 0); - - if (error) - return error; - - num_whitelisted++; - } - else if (unformat (input, "dev %U", unformat_vlib_pci_addr, &pci_addr)) - { - error = dpdk_device_config (conf, pci_addr, 0, 0); - - if (error) - return error; - - num_whitelisted++; - } - else if (unformat (input, "num-mbufs %d", &conf->num_mbufs)) - ; - else if (unformat (input, "kni %d", &conf->num_kni)) - ; - else if (unformat (input, "uio-driver %s", &conf->uio_driver_name)) - ; - else if (unformat (input, "socket-mem %s", &socket_mem)) - ; - else if (unformat (input, "no-pci")) - { - no_pci = 1; - tmp = format (0, "--no-pci%c", 0); - vec_add1 (conf->eal_init_args, tmp); - } - else if (unformat (input, "poll-sleep %d", &dm->poll_sleep)) - ; - -#define _(a) \ - else if (unformat(input, #a)) \ - { \ - tmp = format (0, "--%s%c", #a, 0); \ 
- vec_add1 (conf->eal_init_args, tmp); \ - } - foreach_eal_double_hyphen_predicate_arg -#undef _ -#define _(a) \ - else if (unformat(input, #a " %s", &s)) \ - { \ - if (!strncmp(#a, "huge-dir", 8)) \ - huge_dir = 1; \ - else if (!strncmp(#a, "file-prefix", 11)) \ - file_prefix = 1; \ - tmp = format (0, "--%s%c", #a, 0); \ - vec_add1 (conf->eal_init_args, tmp); \ - vec_add1 (s, 0); \ - if (!strncmp(#a, "vdev", 4)) \ - if (strstr((char*)s, "af_packet")) \ - clib_warning ("af_packet obsoleted. Use CLI 'create host-interface'."); \ - vec_add1 (conf->eal_init_args, s); \ - } - foreach_eal_double_hyphen_arg -#undef _ -#define _(a,b) \ - else if (unformat(input, #a " %s", &s)) \ - { \ - tmp = format (0, "-%s%c", #b, 0); \ - vec_add1 (conf->eal_init_args, tmp); \ - vec_add1 (s, 0); \ - vec_add1 (conf->eal_init_args, s); \ - } - foreach_eal_single_hyphen_arg -#undef _ -#define _(a,b) \ - else if (unformat(input, #a " %s", &s)) \ - { \ - tmp = format (0, "-%s%c", #b, 0); \ - vec_add1 (conf->eal_init_args, tmp); \ - vec_add1 (s, 0); \ - vec_add1 (conf->eal_init_args, s); \ - conf->a##_set_manually = 1; \ - } - foreach_eal_single_hyphen_mandatory_arg -#undef _ - else if (unformat (input, "default")) - ; - - else if (unformat_skip_white_space (input)) - ; - else - { - error = clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); - goto done; - } - } - - if (!conf->uio_driver_name) - conf->uio_driver_name = format (0, "uio_pci_generic%c", 0); - - /* - * Use 1G huge pages if available. 
- */ - if (!no_huge && !huge_dir) - { - u32 x, *mem_by_socket = 0; - uword c = 0; - u8 use_1g = 1; - u8 use_2m = 1; - u8 less_than_1g = 1; - int rv; - - umount (DEFAULT_HUGE_DIR); - - /* Process "socket-mem" parameter value */ - if (vec_len (socket_mem)) - { - unformat_input_t in; - unformat_init_vector (&in, socket_mem); - while (unformat_check_input (&in) != UNFORMAT_END_OF_INPUT) - { - if (unformat (&in, "%u,", &x)) - ; - else if (unformat (&in, "%u", &x)) - ; - else if (unformat (&in, ",")) - x = 0; - else - break; - - vec_add1 (mem_by_socket, x); - - if (x > 1023) - less_than_1g = 0; - } - /* Note: unformat_free vec_frees(in.buffer), aka socket_mem... */ - unformat_free (&in); - socket_mem = 0; - } - else - { - /* *INDENT-OFF* */ - clib_bitmap_foreach (c, tm->cpu_socket_bitmap, ( - { - vec_validate(mem_by_socket, c); - mem_by_socket[c] = 256; /* default per-socket mem */ - } - )); - /* *INDENT-ON* */ - } - - /* check if available enough 1GB pages for each socket */ - /* *INDENT-OFF* */ - clib_bitmap_foreach (c, tm->cpu_socket_bitmap, ( - { - int pages_avail, page_size, mem; - - vec_validate(mem_by_socket, c); - mem = mem_by_socket[c]; - - page_size = 1024; - pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024); - - if (pages_avail < 0 || page_size * pages_avail < mem) - use_1g = 0; - - page_size = 2; - pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024); - - if (pages_avail < 0 || page_size * pages_avail < mem) - use_2m = 0; - })); - /* *INDENT-ON* */ - - if (mem_by_socket == 0) - { - error = clib_error_return (0, "mem_by_socket NULL"); - goto done; - } - _vec_len (mem_by_socket) = c + 1; - - /* regenerate socket_mem string */ - vec_foreach_index (x, mem_by_socket) - socket_mem = format (socket_mem, "%s%u", - socket_mem ? 
"," : "", mem_by_socket[x]); - socket_mem = format (socket_mem, "%c", 0); - - vec_free (mem_by_socket); - - rv = mkdir (VPP_RUN_DIR, 0755); - if (rv && errno != EEXIST) - { - error = clib_error_return (0, "mkdir '%s' failed errno %d", - VPP_RUN_DIR, errno); - goto done; - } - - rv = mkdir (DEFAULT_HUGE_DIR, 0755); - if (rv && errno != EEXIST) - { - error = clib_error_return (0, "mkdir '%s' failed errno %d", - DEFAULT_HUGE_DIR, errno); - goto done; - } - - if (use_1g && !(less_than_1g && use_2m)) - { - rv = - mount ("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, "pagesize=1G"); - } - else if (use_2m) - { - rv = mount ("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, NULL); - } - else - { - return clib_error_return (0, "not enough free huge pages"); - } - - if (rv) - { - error = clib_error_return (0, "mount failed %d", errno); - goto done; - } - - tmp = format (0, "--huge-dir%c", 0); - vec_add1 (conf->eal_init_args, tmp); - tmp = format (0, "%s%c", DEFAULT_HUGE_DIR, 0); - vec_add1 (conf->eal_init_args, tmp); - if (!file_prefix) - { - tmp = format (0, "--file-prefix%c", 0); - vec_add1 (conf->eal_init_args, tmp); - tmp = format (0, "vpp%c", 0); - vec_add1 (conf->eal_init_args, tmp); - } - } - - vec_free (rte_cmd); - vec_free (ethname); - - if (error) - return error; - - /* I'll bet that -c and -n must be the first and second args... 
*/ - if (!conf->coremask_set_manually) - { - vlib_thread_registration_t *tr; - uword *coremask = 0; - int i; - - /* main thread core */ - coremask = clib_bitmap_set (coremask, tm->main_lcore, 1); - - for (i = 0; i < vec_len (tm->registrations); i++) - { - tr = tm->registrations[i]; - coremask = clib_bitmap_or (coremask, tr->coremask); - } - - vec_insert (conf->eal_init_args, 2, 1); - conf->eal_init_args[1] = (u8 *) "-c"; - tmp = format (0, "%U%c", format_bitmap_hex, coremask, 0); - conf->eal_init_args[2] = tmp; - clib_bitmap_free (coremask); - } - - if (!conf->nchannels_set_manually) - { - vec_insert (conf->eal_init_args, 2, 3); - conf->eal_init_args[3] = (u8 *) "-n"; - tmp = format (0, "%d", conf->nchannels); - conf->eal_init_args[4] = tmp; - } - - if (no_pci == 0 && geteuid () == 0) - dpdk_bind_devices_to_uio (conf); - -#define _(x) \ - if (devconf->x == 0 && conf->default_devconf.x > 0) \ - devconf->x = conf->default_devconf.x ; - - /* *INDENT-OFF* */ - pool_foreach (devconf, conf->dev_confs, ({ - - /* default per-device config items */ - foreach_dpdk_device_config_item - - /* add DPDK EAL whitelist/blacklist entry */ - if (num_whitelisted > 0 && devconf->is_blacklisted == 0) - { - tmp = format (0, "-w%c", 0); - vec_add1 (conf->eal_init_args, tmp); - tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0); - vec_add1 (conf->eal_init_args, tmp); - } - else if (num_whitelisted == 0 && devconf->is_blacklisted != 0) - { - tmp = format (0, "-b%c", 0); - vec_add1 (conf->eal_init_args, tmp); - tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0); - vec_add1 (conf->eal_init_args, tmp); - } - })); - /* *INDENT-ON* */ - -#undef _ - - /* set master-lcore */ - tmp = format (0, "--master-lcore%c", 0); - vec_add1 (conf->eal_init_args, tmp); - tmp = format (0, "%u%c", tm->main_lcore, 0); - vec_add1 (conf->eal_init_args, tmp); - - /* set socket-mem */ - tmp = format (0, "--socket-mem%c", 0); - vec_add1 (conf->eal_init_args, tmp); - tmp = format 
(0, "%s%c", socket_mem, 0); - vec_add1 (conf->eal_init_args, tmp); - - /* NULL terminate the "argv" vector, in case of stupidity */ - vec_add1 (conf->eal_init_args, 0); - _vec_len (conf->eal_init_args) -= 1; - - /* Set up DPDK eal and packet mbuf pool early. */ - - log_level = (CLIB_DEBUG > 0) ? RTE_LOG_DEBUG : RTE_LOG_NOTICE; - - rte_set_log_level (log_level); - - vm = vlib_get_main (); - - /* make copy of args as rte_eal_init tends to mess up with arg array */ - for (i = 1; i < vec_len (conf->eal_init_args); i++) - conf->eal_init_args_str = format (conf->eal_init_args_str, "%s ", - conf->eal_init_args[i]); - - ret = - rte_eal_init (vec_len (conf->eal_init_args), - (char **) conf->eal_init_args); - - /* lazy umount hugepages */ - umount2 (DEFAULT_HUGE_DIR, MNT_DETACH); - - if (ret < 0) - return clib_error_return (0, "rte_eal_init returned %d", ret); - - /* Dump the physical memory layout prior to creating the mbuf_pool */ - fprintf (stdout, "DPDK physical memory layout:\n"); - rte_dump_physmem_layout (stdout); - - /* main thread 1st */ - error = vlib_buffer_pool_create (vm, conf->num_mbufs, rte_socket_id ()); - if (error) - return error; - - for (i = 0; i < RTE_MAX_LCORE; i++) - { - error = vlib_buffer_pool_create (vm, conf->num_mbufs, - rte_lcore_to_socket_id (i)); - if (error) - return error; - } - -done: - return error; -} - -VLIB_CONFIG_FUNCTION (dpdk_config, "dpdk"); - -void -dpdk_update_link_state (dpdk_device_t * xd, f64 now) -{ - vnet_main_t *vnm = vnet_get_main (); - struct rte_eth_link prev_link = xd->link; - u32 hw_flags = 0; - u8 hw_flags_chg = 0; - - /* only update link state for PMD interfaces */ - if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0) - return; - - xd->time_last_link_update = now ? 
now : xd->time_last_link_update; - memset (&xd->link, 0, sizeof (xd->link)); - rte_eth_link_get_nowait (xd->device_index, &xd->link); - - if (LINK_STATE_ELOGS) - { - vlib_main_t *vm = vlib_get_main (); - ELOG_TYPE_DECLARE (e) = - { - .format = - "update-link-state: sw_if_index %d, admin_up %d," - "old link_state %d new link_state %d",.format_args = "i4i1i1i1",}; - - struct - { - u32 sw_if_index; - u8 admin_up; - u8 old_link_state; - u8 new_link_state; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->sw_if_index = xd->vlib_sw_if_index; - ed->admin_up = (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0; - ed->old_link_state = (u8) - vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index); - ed->new_link_state = (u8) xd->link.link_status; - } - - if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) && - ((xd->link.link_status != 0) ^ - vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index))) - { - hw_flags_chg = 1; - hw_flags |= (xd->link.link_status ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0); - } - - if (hw_flags_chg || (xd->link.link_duplex != prev_link.link_duplex)) - { - hw_flags_chg = 1; - switch (xd->link.link_duplex) - { - case ETH_LINK_HALF_DUPLEX: - hw_flags |= VNET_HW_INTERFACE_FLAG_HALF_DUPLEX; - break; - case ETH_LINK_FULL_DUPLEX: - hw_flags |= VNET_HW_INTERFACE_FLAG_FULL_DUPLEX; - break; - default: - break; - } - } - if (hw_flags_chg || (xd->link.link_speed != prev_link.link_speed)) - { - hw_flags_chg = 1; - switch (xd->link.link_speed) - { - case ETH_SPEED_NUM_10M: - hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10M; - break; - case ETH_SPEED_NUM_100M: - hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_100M; - break; - case ETH_SPEED_NUM_1G: - hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_1G; - break; - case ETH_SPEED_NUM_10G: - hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10G; - break; - case ETH_SPEED_NUM_40G: - hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_40G; - break; - case 0: - break; - default: - clib_warning ("unknown link speed %d", xd->link.link_speed); - break; - } - } - if 
(hw_flags_chg) - { - if (LINK_STATE_ELOGS) - { - vlib_main_t *vm = vlib_get_main (); - - ELOG_TYPE_DECLARE (e) = - { - .format = - "update-link-state: sw_if_index %d, new flags %d",.format_args - = "i4i4",}; - - struct - { - u32 sw_if_index; - u32 flags; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->sw_if_index = xd->vlib_sw_if_index; - ed->flags = hw_flags; - } - vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, hw_flags); - } -} - -static uword -dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) -{ - clib_error_t *error; - vnet_main_t *vnm = vnet_get_main (); - dpdk_main_t *dm = &dpdk_main; - ethernet_main_t *em = &ethernet_main; - dpdk_device_t *xd; - vlib_thread_main_t *tm = vlib_get_thread_main (); - int i; - - error = dpdk_lib_init (dm); - - /* - * Turn on the input node if we found some devices to drive - * and we're not running worker threads or i/o threads - */ - - if (error == 0 && vec_len (dm->devices) > 0) - { - if (tm->n_vlib_mains == 1) - vlib_node_set_state (vm, dpdk_input_node.index, - VLIB_NODE_STATE_POLLING); - else - for (i = 0; i < tm->n_vlib_mains; i++) - if (vec_len (dm->devices_by_cpu[i]) > 0) - vlib_node_set_state (vlib_mains[i], dpdk_input_node.index, - VLIB_NODE_STATE_POLLING); - } - - if (error) - clib_error_report (error); - - tm->worker_thread_release = 1; - - f64 now = vlib_time_now (vm); - vec_foreach (xd, dm->devices) - { - dpdk_update_link_state (xd, now); - } - - { - /* - * Extra set up for bond interfaces: - * 1. Setup MACs for bond interfaces and their slave links which was set - * in dpdk_port_setup() but needs to be done again here to take effect. - * 2. Set up info for bond interface related CLI support.
- */ - int nports = rte_eth_dev_count (); - if (nports > 0) - { - for (i = 0; i < nports; i++) - { - struct rte_eth_dev_info dev_info; - rte_eth_dev_info_get (i, &dev_info); - if (!dev_info.driver_name) - dev_info.driver_name = dev_info.pci_dev->driver->driver.name; - - ASSERT (dev_info.driver_name); - if (strncmp (dev_info.driver_name, "rte_bond_pmd", 12) == 0) - { - u8 addr[6]; - u8 slink[16]; - int nlink = rte_eth_bond_slaves_get (i, slink, 16); - if (nlink > 0) - { - vnet_hw_interface_t *bhi; - ethernet_interface_t *bei; - int rv; - - /* Get MAC of 1st slave link */ - rte_eth_macaddr_get (slink[0], - (struct ether_addr *) addr); - /* Set MAC of bounded interface to that of 1st slave link */ - rv = - rte_eth_bond_mac_address_set (i, - (struct ether_addr *) - addr); - if (rv < 0) - clib_warning ("Failed to set MAC address"); - - /* Populate MAC of bonded interface in VPP hw tables */ - bhi = - vnet_get_hw_interface (vnm, - dm->devices[i].vlib_hw_if_index); - bei = - pool_elt_at_index (em->interfaces, bhi->hw_instance); - clib_memcpy (bhi->hw_address, addr, 6); - clib_memcpy (bei->address, addr, 6); - /* Init l3 packet size allowed on bonded interface */ - bhi->max_packet_bytes = ETHERNET_MAX_PACKET_BYTES; - bhi->max_l3_packet_bytes[VLIB_RX] = - bhi->max_l3_packet_bytes[VLIB_TX] = - ETHERNET_MAX_PACKET_BYTES - sizeof (ethernet_header_t); - while (nlink >= 1) - { /* for all slave links */ - int slave = slink[--nlink]; - dpdk_device_t *sdev = &dm->devices[slave]; - vnet_hw_interface_t *shi; - vnet_sw_interface_t *ssi; - /* Add MAC to all slave links except the first one */ - if (nlink) - rte_eth_dev_mac_addr_add (slave, - (struct ether_addr *) - addr, 0); - /* Set slaves bitmap for bonded interface */ - bhi->bond_info = - clib_bitmap_set (bhi->bond_info, - sdev->vlib_hw_if_index, 1); - /* Set slave link flags on slave interface */ - shi = - vnet_get_hw_interface (vnm, sdev->vlib_hw_if_index); - ssi = - vnet_get_sw_interface (vnm, sdev->vlib_sw_if_index); - 
shi->bond_info = VNET_HW_INTERFACE_BOND_INFO_SLAVE; - ssi->flags |= VNET_SW_INTERFACE_FLAG_BOND_SLAVE; - - /* Set l3 packet size allowed as the lowest of slave */ - if (bhi->max_l3_packet_bytes[VLIB_RX] > - shi->max_l3_packet_bytes[VLIB_RX]) - bhi->max_l3_packet_bytes[VLIB_RX] = - bhi->max_l3_packet_bytes[VLIB_TX] = - shi->max_l3_packet_bytes[VLIB_RX]; - - /* Set max packet size allowed as the lowest of slave */ - if (bhi->max_packet_bytes > shi->max_packet_bytes) - bhi->max_packet_bytes = shi->max_packet_bytes; - } - } - } - } - } - } - - while (1) - { - /* - * check each time through the loop in case intervals are changed - */ - f64 min_wait = dm->link_state_poll_interval < dm->stat_poll_interval ? - dm->link_state_poll_interval : dm->stat_poll_interval; - - vlib_process_wait_for_event_or_clock (vm, min_wait); - - if (dm->admin_up_down_in_progress) - /* skip the poll if an admin up down is in progress (on any interface) */ - continue; - - vec_foreach (xd, dm->devices) - { - f64 now = vlib_time_now (vm); - if ((now - xd->time_last_stats_update) >= dm->stat_poll_interval) - dpdk_update_counters (xd, now); - if ((now - xd->time_last_link_update) >= dm->link_state_poll_interval) - dpdk_update_link_state (xd, now); - - } - } - - return 0; -} - -/* *INDENT-OFF* */ -VLIB_REGISTER_NODE (dpdk_process_node,static) = { - .function = dpdk_process, - .type = VLIB_NODE_TYPE_PROCESS, - .name = "dpdk-process", - .process_log2_n_stack_bytes = 17, -}; -/* *INDENT-ON* */ - -int -dpdk_set_stat_poll_interval (f64 interval) -{ - if (interval < DPDK_MIN_STATS_POLL_INTERVAL) - return (VNET_API_ERROR_INVALID_VALUE); - - dpdk_main.stat_poll_interval = interval; - - return 0; -} - -int -dpdk_set_link_state_poll_interval (f64 interval) -{ - if (interval < DPDK_MIN_LINK_POLL_INTERVAL) - return (VNET_API_ERROR_INVALID_VALUE); - - dpdk_main.link_state_poll_interval = interval; - - return 0; -} - -clib_error_t * -dpdk_init (vlib_main_t * vm) -{ - dpdk_main_t *dm = &dpdk_main; - vlib_node_t *ei; 
- clib_error_t *error = 0; - vlib_thread_main_t *tm = vlib_get_thread_main (); - - /* verify that structs are cacheline aligned */ - STATIC_ASSERT (offsetof (dpdk_device_t, cacheline0) == 0, - "Cache line marker must be 1st element in dpdk_device_t"); - STATIC_ASSERT (offsetof (dpdk_device_t, cacheline1) == - CLIB_CACHE_LINE_BYTES, - "Data in cache line 0 is bigger than cache line size"); - STATIC_ASSERT (offsetof (frame_queue_trace_t, cacheline0) == 0, - "Cache line marker must be 1st element in frame_queue_trace_t"); - - u8 *name; - name = format (0, "dpdk_%08x%c", api_version, 0); - - /* Ask for a correctly-sized block of API message decode slots */ - dm->msg_id_base = vl_msg_api_get_msg_ids - ((char *) name, VL_MSG_FIRST_AVAILABLE); - vec_free (name); - - dm->vlib_main = vm; - dm->vnet_main = vnet_get_main (); - dm->conf = &dpdk_config_main; - - error = dpdk_plugin_api_hookup (vm); - - /* Add our API messages to the global name_crc hash table */ - setup_message_id_table (dm, &api_main); - -// TODO -// plugin_custom_dump_configure (dm); - - ei = vlib_get_node_by_name (vm, (u8 *) "ethernet-input"); - if (ei == 0) - return clib_error_return (0, "ethernet-input node AWOL"); - - dm->ethernet_input_node_index = ei->index; - - dm->conf->nchannels = 4; - dm->conf->num_mbufs = dm->conf->num_mbufs ? dm->conf->num_mbufs : NB_MBUF; - vec_add1 (dm->conf->eal_init_args, (u8 *) "vnet"); - - dm->dpdk_device_by_kni_port_id = hash_create (0, sizeof (uword)); - dm->vu_sw_if_index_by_listener_fd = hash_create (0, sizeof (uword)); - dm->vu_sw_if_index_by_sock_fd = hash_create (0, sizeof (uword)); - - /* $$$ use n_thread_stacks since it's known-good at this point */ - vec_validate (dm->recycle, tm->n_thread_stacks - 1); - - /* Default vlib_buffer_t flags, DISABLES tcp/udp checksumming... 
*/ - dm->buffer_flags_template = - (VLIB_BUFFER_TOTAL_LENGTH_VALID | VLIB_BUFFER_EXT_HDR_VALID - | IP_BUFFER_L4_CHECKSUM_COMPUTED | IP_BUFFER_L4_CHECKSUM_CORRECT); - - dm->stat_poll_interval = DPDK_STATS_POLL_INTERVAL; - dm->link_state_poll_interval = DPDK_LINK_POLL_INTERVAL; - - /* init CLI */ - if ((error = vlib_call_init_function (vm, dpdk_cli_init))) - return error; - - return error; -} - -VLIB_INIT_FUNCTION (dpdk_init); - - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ -- cgit 1.2.3-korg From 35af9e50cdbfc73dab963557f4ffbd56b21e2abc Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 6 Mar 2017 12:02:50 +0100 Subject: features: take device-input buffer advance value directly Change-Id: Ifac7d9134d03d79164ce6f06ae9413279bbaadb3 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/node.c | 6 ++-- src/vnet/devices/af_packet/node.c | 3 +- src/vnet/devices/netmap/node.c | 2 +- src/vnet/devices/virtio/vhost-user.c | 2 +- src/vnet/feature/feature.h | 62 ++++++++++++++++++++---------------- src/vnet/unix/tapcli.c | 3 +- src/vnet/unix/tuntap.c | 2 +- 7 files changed, 42 insertions(+), 38 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index 04c41655..ccbfd2f2 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -439,9 +439,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, /* Do we have any driver RX features configured on the interface? */ vnet_feature_start_device_input_x4 (xd->vlib_sw_if_index, &next0, &next1, &next2, &next3, - b0, b1, b2, b3, - l3_offset0, l3_offset1, - l3_offset2, l3_offset3); + b0, b1, b2, b3); vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next, n_left_to_next, @@ -502,7 +500,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, /* Do we have any driver RX features configured on the interface? 
*/ vnet_feature_start_device_input_x1 (xd->vlib_sw_if_index, &next0, - b0, l3_offset0); + b0); vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c index 69fc11c9..ab7fd800 100644 --- a/src/vnet/devices/af_packet/node.c +++ b/src/vnet/devices/af_packet/node.c @@ -216,8 +216,7 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, } /* redirect if feature path enabled */ - vnet_feature_start_device_input_x1 (apif->sw_if_index, &next0, b0, - 0); + vnet_feature_start_device_input_x1 (apif->sw_if_index, &next0, b0); /* enque and take next packet */ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, diff --git a/src/vnet/devices/netmap/node.c b/src/vnet/devices/netmap/node.c index 835209a3..68ea7832 100644 --- a/src/vnet/devices/netmap/node.c +++ b/src/vnet/devices/netmap/node.c @@ -218,7 +218,7 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, /* redirect if feature path enabled */ vnet_feature_start_device_input_x1 (nif->sw_if_index, &next0, - first_b0, 0); + first_b0); /* enque and take next packet */ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c index f490f0c1..c16e9822 100644 --- a/src/vnet/devices/virtio/vhost-user.c +++ b/src/vnet/devices/virtio/vhost-user.c @@ -1747,7 +1747,7 @@ vhost_user_if_input (vlib_main_t * vm, /* redirect if feature path enabled */ vnet_feature_start_device_input_x1 (vui->sw_if_index, &next0, - b_head, 0); + b_head); u32 bi = to_next[-1]; //Cannot use to_next[-1] in the macro vlib_validate_buffer_enqueue_x1 (vm, node, next_index, diff --git a/src/vnet/feature/feature.h b/src/vnet/feature/feature.h index b27aaf17..77b1499d 100644 --- a/src/vnet/feature/feature.h +++ b/src/vnet/feature/feature.h @@ -18,6 +18,7 @@ #include #include +#include /** feature registration object 
*/ typedef struct _vnet_feature_arc_registration @@ -227,7 +228,7 @@ vnet_feature_next (u32 sw_if_index, u32 * next0, vlib_buffer_t * b0) static_always_inline void vnet_feature_start_device_input_x1 (u32 sw_if_index, u32 * next0, - vlib_buffer_t * b0, u16 buffer_advanced0) + vlib_buffer_t * b0) { vnet_feature_main_t *fm = &feature_main; vnet_feature_config_main_t *cm; @@ -242,9 +243,12 @@ vnet_feature_start_device_input_x1 (u32 sw_if_index, u32 * next0, * Save next0 so that the last feature in the chain * can skip ethernet-input if indicated... */ + u16 adv; + vnet_buffer (b0)->device_input_feat.saved_next_index = *next0; - vnet_buffer (b0)->device_input_feat.buffer_advance = buffer_advanced0; - vlib_buffer_advance (b0, -buffer_advanced0); + adv = device_input_next_node_advance[*next0]; + vnet_buffer (b0)->device_input_feat.buffer_advance = adv; + vlib_buffer_advance (b0, -adv); b0->feature_arc_index = feature_arc_index; b0->current_config_index = @@ -258,10 +262,7 @@ static_always_inline void vnet_feature_start_device_input_x2 (u32 sw_if_index, u32 * next0, u32 * next1, - vlib_buffer_t * b0, - vlib_buffer_t * b1, - u16 buffer_advanced0, - u16 buffer_advanced1) + vlib_buffer_t * b0, vlib_buffer_t * b1) { vnet_feature_main_t *fm = &feature_main; vnet_feature_config_main_t *cm; @@ -276,12 +277,17 @@ vnet_feature_start_device_input_x2 (u32 sw_if_index, * Save next0 so that the last feature in the chain * can skip ethernet-input if indicated... 
*/ + u16 adv; + vnet_buffer (b0)->device_input_feat.saved_next_index = *next0; + adv = device_input_next_node_advance[*next0]; + vnet_buffer (b0)->device_input_feat.buffer_advance = adv; + vlib_buffer_advance (b0, -adv); + vnet_buffer (b1)->device_input_feat.saved_next_index = *next1; - vnet_buffer (b0)->device_input_feat.buffer_advance = buffer_advanced0; - vnet_buffer (b1)->device_input_feat.buffer_advance = buffer_advanced1; - vlib_buffer_advance (b0, -buffer_advanced0); - vlib_buffer_advance (b1, -buffer_advanced1); + adv = device_input_next_node_advance[*next1]; + vnet_buffer (b1)->device_input_feat.buffer_advance = adv; + vlib_buffer_advance (b1, -adv); b0->feature_arc_index = feature_arc_index; b1->feature_arc_index = feature_arc_index; @@ -303,12 +309,7 @@ vnet_feature_start_device_input_x4 (u32 sw_if_index, u32 * next3, vlib_buffer_t * b0, vlib_buffer_t * b1, - vlib_buffer_t * b2, - vlib_buffer_t * b3, - u16 buffer_advanced0, - u16 buffer_advanced1, - u16 buffer_advanced2, - u16 buffer_advanced3) + vlib_buffer_t * b2, vlib_buffer_t * b3) { vnet_feature_main_t *fm = &feature_main; vnet_feature_config_main_t *cm; @@ -323,20 +324,27 @@ vnet_feature_start_device_input_x4 (u32 sw_if_index, * Save next0 so that the last feature in the chain * can skip ethernet-input if indicated... 
*/ + u16 adv; + vnet_buffer (b0)->device_input_feat.saved_next_index = *next0; + adv = device_input_next_node_advance[*next0]; + vnet_buffer (b0)->device_input_feat.buffer_advance = adv; + vlib_buffer_advance (b0, -adv); + vnet_buffer (b1)->device_input_feat.saved_next_index = *next1; - vnet_buffer (b2)->device_input_feat.saved_next_index = *next2; - vnet_buffer (b3)->device_input_feat.saved_next_index = *next3; + adv = device_input_next_node_advance[*next1]; + vnet_buffer (b1)->device_input_feat.buffer_advance = adv; + vlib_buffer_advance (b1, -adv); - vnet_buffer (b0)->device_input_feat.buffer_advance = buffer_advanced0; - vnet_buffer (b1)->device_input_feat.buffer_advance = buffer_advanced1; - vnet_buffer (b2)->device_input_feat.buffer_advance = buffer_advanced2; - vnet_buffer (b3)->device_input_feat.buffer_advance = buffer_advanced3; + vnet_buffer (b2)->device_input_feat.saved_next_index = *next2; + adv = device_input_next_node_advance[*next2]; + vnet_buffer (b2)->device_input_feat.buffer_advance = adv; + vlib_buffer_advance (b2, -adv); - vlib_buffer_advance (b0, -buffer_advanced0); - vlib_buffer_advance (b1, -buffer_advanced1); - vlib_buffer_advance (b2, -buffer_advanced2); - vlib_buffer_advance (b3, -buffer_advanced3); + vnet_buffer (b3)->device_input_feat.saved_next_index = *next3; + adv = device_input_next_node_advance[*next3]; + vnet_buffer (b3)->device_input_feat.buffer_advance = adv; + vlib_buffer_advance (b3, -adv); b0->feature_arc_index = feature_arc_index; b1->feature_arc_index = feature_arc_index; diff --git a/src/vnet/unix/tapcli.c b/src/vnet/unix/tapcli.c index 25c930c6..496f3885 100644 --- a/src/vnet/unix/tapcli.c +++ b/src/vnet/unix/tapcli.c @@ -355,8 +355,7 @@ static uword tapcli_rx_iface(vlib_main_t * vm, to_next++; n_left_to_next--; - vnet_feature_start_device_input_x1 (ti->sw_if_index, &next_index, - b_first, 0); + vnet_feature_start_device_input_x1 (ti->sw_if_index, &next_index, b_first); vlib_validate_buffer_enqueue_x1 (vm, node, next, 
to_next, n_left_to_next, diff --git a/src/vnet/unix/tuntap.c b/src/vnet/unix/tuntap.c index 4a5dd676..2cfcc92f 100644 --- a/src/vnet/unix/tuntap.c +++ b/src/vnet/unix/tuntap.c @@ -351,7 +351,7 @@ tuntap_rx (vlib_main_t * vm, next_index = VNET_DEVICE_INPUT_NEXT_DROP; } - vnet_feature_start_device_input_x1 (tm->sw_if_index, &next_index, b, 0); + vnet_feature_start_device_input_x1 (tm->sw_if_index, &next_index, b); vlib_set_next_frame_buffer (vm, node, next_index, bi); -- cgit 1.2.3-korg From 24537ca721371408e2e2b6e2663b7c43a43ecac3 Mon Sep 17 00:00:00 2001 From: Radu Nicolau Date: Wed, 8 Mar 2017 12:53:19 +0000 Subject: dpdk: Fixed QAT devices binding, bind the VFs only Change-Id: I9ff64b6c583e1c513f91f90798298b55d0793f06 Signed-off-by: Radu Nicolau --- src/plugins/dpdk/device/init.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index e009ef3e..0130c0e9 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -1125,8 +1125,12 @@ dpdk_bind_devices_to_uio (dpdk_config_main_t * conf) /* vmxnet3 */ else if (d->vendor_id == 0x15ad && d->device_id == 0x07b0) ; - /* all Intel devices */ - else if (d->vendor_id == 0x8086) + /* all Intel network devices */ + else if (d->vendor_id == 0x8086 && d->device_class == PCI_CLASS_NETWORK_ETHERNET) + ; + /* all Intel QAT devices VFs */ + else if (d->vendor_id == 0x8086 && d->device_class == PCI_CLASS_PROCESSOR_CO && + (d->device_id == 0x0443 || d->device_id == 0x37c9 || d->device_id == 0x19e3)) ; /* Cisco VIC */ else if (d->vendor_id == 0x1137 && d->device_id == 0x0043) @@ -1136,7 +1140,7 @@ dpdk_bind_devices_to_uio (dpdk_config_main_t * conf) ; else { - clib_warning ("Unsupported Ethernet PCI device 0x%04x:0x%04x found " + clib_warning ("Unsupported PCI device 0x%04x:0x%04x found " "at PCI address %s\n", (u16) d->vendor_id, (u16) d->device_id, pci_addr); continue; -- cgit 
1.2.3-korg From c6ec8f39240a166a7e04a7834a28107b6353b46a Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Wed, 8 Mar 2017 10:41:47 -0500 Subject: Change dpdk input node fixed-sleep units to usec Change-Id: I94e8737cad9222d24602db4ad03bf43c12c62b79 Signed-off-by: Dave Barach --- src/plugins/dpdk/device/dpdk.h | 4 ++-- src/plugins/dpdk/device/init.c | 2 +- src/plugins/dpdk/device/node.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 2a1a6205..092c7dca 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -380,8 +380,8 @@ typedef struct f64 link_state_poll_interval; f64 stat_poll_interval; - /* Sleep for this many MS after each device poll */ - u32 poll_sleep; + /* Sleep for this many usec after each device poll */ + u32 poll_sleep_usec; /* convenience */ vlib_main_t *vlib_main; diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 0130c0e9..85ecde25 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -1354,7 +1354,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) tmp = format (0, "--no-pci%c", 0); vec_add1 (conf->eal_init_args, tmp); } - else if (unformat (input, "poll-sleep %d", &dm->poll_sleep)) + else if (unformat (input, "poll-sleep %d", &dm->poll_sleep_usec)) ; #define _(a) \ diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index ccbfd2f2..2120069e 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -533,12 +533,12 @@ static inline void poll_rate_limit (dpdk_main_t * dm) { /* Limit the poll rate by sleeping for N msec between polls */ - if (PREDICT_FALSE (dm->poll_sleep != 0)) + if (PREDICT_FALSE (dm->poll_sleep_usec != 0)) { struct timespec ts, tsrem; ts.tv_sec = 0; - ts.tv_nsec = 1000 * 1000 * dm->poll_sleep; /* 1ms */ + ts.tv_nsec = 1000 * dm->poll_sleep_usec; while 
(nanosleep (&ts, &tsrem) < 0) { -- cgit 1.2.3-korg From 25f635852aee76255f7210c43d43668a80fdccce Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 6 Mar 2017 21:51:00 +0100 Subject: dpdk: dpdk-input optimizations and fixes - fix issue caused by assumption that ethertype starts at 0 - introduce buffer templates to speed-up vlib_buffer_t metadata initialization - avoid check for multiseg buffers inside loop if multiseg is disabled - interleave prefetches to reduce load on L1 cache Change-Id: I3b76e6d3e1e15ed28f01625edb7fbe9f38112e03 Signed-off-by: Damjan Marion --- src/plugins/dpdk.am | 4 + src/plugins/dpdk/device/dpdk.h | 3 + src/plugins/dpdk/device/init.c | 15 ++++ src/plugins/dpdk/device/node.c | 200 +++++++++++++++++++++-------------------- 4 files changed, 126 insertions(+), 96 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk.am b/src/plugins/dpdk.am index 01383de6..f1a37ae2 100644 --- a/src/plugins/dpdk.am +++ b/src/plugins/dpdk.am @@ -16,6 +16,10 @@ vppplugins_LTLIBRARIES += dpdk_plugin.la dpdk_plugin_la_LDFLAGS = $(AM_LDFLAGS) -Wl,--whole-archive,-l:libdpdk.a,--no-whole-archive,-lm,-ldl +# due to internal compiler error in GCC when compiling dpdk/device/node.c +# debug symbols level reduced to 1. 
See GCC PR #79953 for details +CFLAGS += -g1 + dpdk_plugin_la_SOURCES = \ dpdk/main.c \ dpdk/buffer.c \ diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 092c7dca..84f86ae2 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -338,6 +338,9 @@ typedef struct /* per-thread recycle lists */ u32 **recycle; + /* per-thread buffer templates */ + vlib_buffer_t *buffer_templates; + /* buffer flags template, configurable to enable/disable tcp / udp cksum */ u32 buffer_flags_template; diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 85ecde25..110d7457 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -572,6 +572,21 @@ dpdk_lib_init (dpdk_main_t * dm) dm->buffer_flags_template &= ~(IP_BUFFER_L4_CHECKSUM_CORRECT | IP_BUFFER_L4_CHECKSUM_COMPUTED); + /* vlib_buffer_t template */ + vec_validate_aligned (dm->buffer_templates, tm->n_vlib_mains - 1, + CLIB_CACHE_LINE_BYTES); + for (i = 0; i < tm->n_vlib_mains; i++) + { + vlib_buffer_free_list_t *fl; + vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, i); + fl = vlib_buffer_get_free_list (vm, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + vlib_buffer_init_for_free_list (bt, fl); + bt->flags = dm->buffer_flags_template; + bt->current_data = -RTE_PKTMBUF_HEADROOM; + vnet_buffer (bt)->sw_if_index[VLIB_TX] = (u32) ~ 0; + } + for (i = 0; i < nports; i++) { u8 addr[6]; diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index 2120069e..e8d502ca 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -37,21 +37,21 @@ static char *dpdk_error_strings[] = { always_inline int vlib_buffer_is_ip4 (vlib_buffer_t * b) { - ethernet_header_t *h = (ethernet_header_t *) b->data; + ethernet_header_t *h = (ethernet_header_t *) vlib_buffer_get_current (b); return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP4)); } always_inline int vlib_buffer_is_ip6 
(vlib_buffer_t * b) { - ethernet_header_t *h = (ethernet_header_t *) b->data; + ethernet_header_t *h = (ethernet_header_t *) vlib_buffer_get_current (b); return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP6)); } always_inline int vlib_buffer_is_mpls (vlib_buffer_t * b) { - ethernet_header_t *h = (ethernet_header_t *) b->data; + ethernet_header_t *h = (ethernet_header_t *) vlib_buffer_get_current (b); return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS_UNICAST)); } @@ -217,10 +217,35 @@ dpdk_prefetch_buffer (struct rte_mbuf *mb) } static_always_inline void -dpdk_prefetch_buffer_data (struct rte_mbuf *mb) +dpdk_prefetch_ethertype (struct rte_mbuf *mb) { - vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb); - CLIB_PREFETCH (b->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (mb->buf_addr + mb->data_off + + STRUCT_OFFSET_OF (ethernet_header_t, type), + CLIB_CACHE_LINE_BYTES, LOAD); +} + + +/* + This function should fill 1st cacheline of vlib_buffer_t metadata with data + from buffer template. Instead of filling field by field, we construct + template and then use 128/256 bit vector instruction to copy data. + This code first loads whole cacheline into 4 128-bit registers (xmm) + or two 256 bit registers (ymm) and then stores data into all 4 buffers + effectively saving on register load operations. 
+*/ + +static_always_inline void +dpdk_buffer_init_from_template (void *d0, void *d1, void *d2, void *d3, + void *s) +{ + int i; + for (i = 0; i < 2; i++) + { + *(u8x32 *) (((u8 *) d0) + i * 32) = + *(u8x32 *) (((u8 *) d1) + i * 32) = + *(u8x32 *) (((u8 *) d2) + i * 32) = + *(u8x32 *) (((u8 *) d3) + i * 32) = *(u8x32 *) (((u8 *) s) + i * 32); + } } /* @@ -229,7 +254,8 @@ dpdk_prefetch_buffer_data (struct rte_mbuf *mb) */ static_always_inline u32 dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, - vlib_node_runtime_t * node, u32 cpu_index, u16 queue_id) + vlib_node_runtime_t * node, u32 cpu_index, u16 queue_id, + int maybe_multiseg) { u32 n_buffers; u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; @@ -239,7 +265,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, uword n_rx_bytes = 0; u32 n_trace, trace_cnt __attribute__ ((unused)); vlib_buffer_free_list_t *fl; - u32 buffer_flags_template; + vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, cpu_index); if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0) return 0; @@ -251,8 +277,6 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, return 0; } - buffer_flags_template = dm->buffer_flags_template; - vec_reset_length (xd->d_trace_buffers[cpu_index]); trace_cnt = n_trace = vlib_get_trace_count (vm, node); @@ -272,33 +296,44 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + /* Update buffer template */ + vnet_buffer (bt)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + bt->error = node->errors[DPDK_ERROR_NONE]; + mb_index = 0; while (n_buffers > 0) { vlib_buffer_t *b0, *b1, *b2, *b3; - u32 bi0, next0, l3_offset0; - u32 bi1, next1, l3_offset1; - u32 bi2, next2, l3_offset2; - u32 bi3, next3, l3_offset3; + u32 bi0, next0; + u32 bi1, next1; + u32 bi2, next2; + u32 bi3, next3; u8 error0, error1, error2, error3; u64 or_ol_flags; vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - while 
(n_buffers > 8 && n_left_to_next > 4) + while (n_buffers >= 12 && n_left_to_next >= 4) { - struct rte_mbuf *mb0 = xd->rx_vectors[queue_id][mb_index]; - struct rte_mbuf *mb1 = xd->rx_vectors[queue_id][mb_index + 1]; - struct rte_mbuf *mb2 = xd->rx_vectors[queue_id][mb_index + 2]; - struct rte_mbuf *mb3 = xd->rx_vectors[queue_id][mb_index + 3]; + struct rte_mbuf *mb0, *mb1, *mb2, *mb3; - dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 4]); - dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 5]); - dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 6]); - dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 7]); + /* prefetches are interleaved with the rest of the code to reduce + pressure on L1 cache */ + dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 8]); + dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 4]); - if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG) + mb0 = xd->rx_vectors[queue_id][mb_index]; + mb1 = xd->rx_vectors[queue_id][mb_index + 1]; + mb2 = xd->rx_vectors[queue_id][mb_index + 2]; + mb3 = xd->rx_vectors[queue_id][mb_index + 3]; + + ASSERT (mb0); + ASSERT (mb1); + ASSERT (mb2); + ASSERT (mb3); + + if (maybe_multiseg) { if (PREDICT_FALSE (mb0->nb_segs > 1)) dpdk_prefetch_buffer (mb0->next); @@ -310,22 +345,29 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, dpdk_prefetch_buffer (mb3->next); } - ASSERT (mb0); - ASSERT (mb1); - ASSERT (mb2); - ASSERT (mb3); - - or_ol_flags = (mb0->ol_flags | mb1->ol_flags | - mb2->ol_flags | mb3->ol_flags); b0 = vlib_buffer_from_rte_mbuf (mb0); b1 = vlib_buffer_from_rte_mbuf (mb1); b2 = vlib_buffer_from_rte_mbuf (mb2); b3 = vlib_buffer_from_rte_mbuf (mb3); - vlib_buffer_init_for_free_list (b0, fl); - vlib_buffer_init_for_free_list (b1, fl); - vlib_buffer_init_for_free_list (b2, fl); - vlib_buffer_init_for_free_list (b3, fl); + dpdk_buffer_init_from_template (b0, b1, b2, b3, bt); + + dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 9]); + 
dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 5]); + + /* current_data must be set to -RTE_PKTMBUF_HEADROOM in template */ + b0->current_data += mb0->data_off; + b1->current_data += mb1->data_off; + b2->current_data += mb2->data_off; + b3->current_data += mb3->data_off; + + b0->current_length = mb0->data_len; + b1->current_length = mb1->data_len; + b2->current_length = mb2->data_len; + b3->current_length = mb3->data_len; + + dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 10]); + dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 7]); bi0 = vlib_get_buffer_index (vm, b0); bi1 = vlib_get_buffer_index (vm, b1); @@ -345,21 +387,17 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, } else { - /* prefetch packet data for faster access to the ethertype */ - dpdk_prefetch_buffer_data (xd->rx_vectors[queue_id] - [mb_index + 4]); - dpdk_prefetch_buffer_data (xd->rx_vectors[queue_id] - [mb_index + 5]); - dpdk_prefetch_buffer_data (xd->rx_vectors[queue_id] - [mb_index + 6]); - dpdk_prefetch_buffer_data (xd->rx_vectors[queue_id] - [mb_index + 7]); next0 = dpdk_rx_next_from_etype (mb0, b0); next1 = dpdk_rx_next_from_etype (mb1, b1); next2 = dpdk_rx_next_from_etype (mb2, b2); next3 = dpdk_rx_next_from_etype (mb3, b3); } + dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 11]); + dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 6]); + + or_ol_flags = (mb0->ol_flags | mb1->ol_flags | + mb2->ol_flags | mb3->ol_flags); if (PREDICT_FALSE (or_ol_flags & PKT_RX_IP_CKSUM_BAD)) { dpdk_rx_error_from_mb (mb0, &next0, &error0); @@ -371,46 +409,11 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, b2->error = node->errors[error2]; b3->error = node->errors[error3]; } - else - { - b0->error = b1->error = node->errors[DPDK_ERROR_NONE]; - b2->error = b3->error = node->errors[DPDK_ERROR_NONE]; - } - l3_offset0 = device_input_next_node_advance[next0]; - l3_offset1 = device_input_next_node_advance[next1]; - l3_offset2 = 
device_input_next_node_advance[next2]; - l3_offset3 = device_input_next_node_advance[next3]; - - b0->current_data = l3_offset0 + mb0->data_off; - b1->current_data = l3_offset1 + mb1->data_off; - b2->current_data = l3_offset2 + mb2->data_off; - b3->current_data = l3_offset3 + mb3->data_off; - - b0->current_data -= RTE_PKTMBUF_HEADROOM; - b1->current_data -= RTE_PKTMBUF_HEADROOM; - b2->current_data -= RTE_PKTMBUF_HEADROOM; - b3->current_data -= RTE_PKTMBUF_HEADROOM; - - b0->current_length = mb0->data_len - l3_offset0; - b1->current_length = mb1->data_len - l3_offset1; - b2->current_length = mb2->data_len - l3_offset2; - b3->current_length = mb3->data_len - l3_offset3; - - b0->flags = buffer_flags_template; - b1->flags = buffer_flags_template; - b2->flags = buffer_flags_template; - b3->flags = buffer_flags_template; - - vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; - vnet_buffer (b1)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; - vnet_buffer (b2)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; - vnet_buffer (b3)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; - - vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; - vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; - vnet_buffer (b2)->sw_if_index[VLIB_TX] = (u32) ~ 0; - vnet_buffer (b3)->sw_if_index[VLIB_TX] = (u32) ~ 0; + vlib_buffer_advance (b0, device_input_next_node_advance[next0]); + vlib_buffer_advance (b1, device_input_next_node_advance[next1]); + vlib_buffer_advance (b2, device_input_next_node_advance[next2]); + vlib_buffer_advance (b3, device_input_next_node_advance[next3]); n_rx_bytes += mb0->pkt_len; n_rx_bytes += mb1->pkt_len; @@ -418,7 +421,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, n_rx_bytes += mb3->pkt_len; /* Process subsequent segments of multi-segment packets */ - if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG) + if (maybe_multiseg) { dpdk_process_subseq_segs (vm, b0, mb0, fl); dpdk_process_subseq_segs (vm, b1, mb1, fl); @@ -452,6 +455,13 @@ dpdk_device_input 
(dpdk_main_t * dm, dpdk_device_t * xd, { struct rte_mbuf *mb0 = xd->rx_vectors[queue_id][mb_index]; + if (PREDICT_TRUE (n_buffers > 3)) + { + dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 2]); + dpdk_prefetch_ethertype (xd->rx_vectors[queue_id] + [mb_index + 1]); + } + ASSERT (mb0); b0 = vlib_buffer_from_rte_mbuf (mb0); @@ -460,7 +470,11 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, if (PREDICT_FALSE (mb0->nb_segs > 1)) dpdk_prefetch_buffer (mb0->next); - vlib_buffer_init_for_free_list (b0, fl); + clib_memcpy (b0, bt, CLIB_CACHE_LINE_BYTES); + + ASSERT (b0->current_data == -RTE_PKTMBUF_HEADROOM); + b0->current_data += mb0->data_off; + b0->current_length = mb0->data_len; bi0 = vlib_get_buffer_index (vm, b0); @@ -474,18 +488,9 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, next0 = dpdk_rx_next_from_etype (mb0, b0); dpdk_rx_error_from_mb (mb0, &next0, &error0); - b0->error = node->errors[error0]; - - l3_offset0 = device_input_next_node_advance[next0]; - b0->current_data = l3_offset0; - b0->current_data += mb0->data_off - RTE_PKTMBUF_HEADROOM; - b0->current_length = mb0->data_len - l3_offset0; + vlib_buffer_advance (b0, device_input_next_node_advance[next0]); - b0->flags = buffer_flags_template; - - vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; - vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; n_rx_bytes += mb0->pkt_len; /* Process subsequent segments of multi-segment packets */ @@ -604,7 +609,10 @@ dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) vec_foreach (dq, dm->devices_by_cpu[cpu_index]) { xd = vec_elt_at_index(dm->devices, dq->device); - n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id); + if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG) + n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, /* maybe_multiseg */ 1); + else + n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, /* maybe_multiseg */ 0); } /* 
*INDENT-ON* */ -- cgit 1.2.3-korg From dfde53ae553f9d4c54882fb55a832cb0614834ea Mon Sep 17 00:00:00 2001 From: Billy McFall Date: Fri, 10 Mar 2017 14:49:15 -0500 Subject: VPP-279: Document changes for vnet/vnet/devices Add doxygen documentation for DPDK crypto CLI command. The move of DPDK to a plugin invalidated the local dir.dox. So moved dpdk/dir.dox to dpdk/device/dir.dox to fix. Change-Id: I229e2921c4b0cdd380021adb520cd2089a376afa Signed-off-by: Billy McFall --- src/plugins/dpdk/device/dir.dox | 27 +++++++++++ src/plugins/dpdk/dir.dox | 27 ----------- src/plugins/dpdk/ipsec/cli.c | 63 ++++++++++++++++++++++++- src/plugins/dpdk/ipsec/dir.dox | 13 ++++- src/plugins/dpdk/ipsec/dpdk_crypto_ipsec_doc.md | 21 ++++++++- 5 files changed, 120 insertions(+), 31 deletions(-) create mode 100644 src/plugins/dpdk/device/dir.dox delete mode 100644 src/plugins/dpdk/dir.dox (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/dir.dox b/src/plugins/dpdk/device/dir.dox new file mode 100644 index 00000000..43e36753 --- /dev/null +++ b/src/plugins/dpdk/device/dir.dox @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Doxygen directory documentation */ + +/** +@dir +@brief DPDK Abstraction Layer. + +This directory contains the source code for the DPDK abstraction layer. + +*/ +/*? %%clicmd:group_label DPDK and pcap tx %% ?*/ +/*? 
%%syscfg:group_label DPDK and pcap tx %% ?*/ diff --git a/src/plugins/dpdk/dir.dox b/src/plugins/dpdk/dir.dox deleted file mode 100644 index 43e36753..00000000 --- a/src/plugins/dpdk/dir.dox +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2017 Cisco and/or its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Doxygen directory documentation */ - -/** -@dir -@brief DPDK Abstraction Layer. - -This directory contains the source code for the DPDK abstraction layer. - -*/ -/*? %%clicmd:group_label DPDK and pcap tx %% ?*/ -/*? %%syscfg:group_label DPDK and pcap tx %% ?*/ diff --git a/src/plugins/dpdk/ipsec/cli.c b/src/plugins/dpdk/ipsec/cli.c index 40cee39b..cd0a6037 100644 --- a/src/plugins/dpdk/ipsec/cli.c +++ b/src/plugins/dpdk/ipsec/cli.c @@ -136,11 +136,72 @@ done: return error; } +/*? + * This command is used to display the DPDK Crypto device data. See + * @ref dpdk_crypto_ipsec_doc for more details on initializing the + * DPDK Crypto device. 
+ * + * @cliexpar + * Example of displaying the DPDK Crypto device data when disabled: + * @cliexstart{show crypto device mapping} + * DPDK Cryptodev support is disabled + * @cliexend + * Example of displaying the DPDK Crypto device data when enabled: + * @cliexstart{show crypto device mapping} + * worker crypto device id(type) + * 1 1(SW) + * 2 1(SW) + * @cliexend + * Example of displaying the DPDK Crypto device data when enabled with verbose: + * @cliexstart{show crypto device mapping verbose} + * worker cipher auth dir dev qp + * 1 AES_CTR AES-XCBC-MAC in 1 0 + * 1 AES_CTR HMAC-SHA384 in 1 0 + * 1 AES_CTR HMAC-SHA384 out 1 1 + * 1 AES_CBC HMAC-SHA512 in 1 0 + * 1 AES_CBC HMAC-SHA256 in 1 0 + * 1 AES_CBC AES-XCBC-MAC out 1 1 + * 1 AES_CTR AES-XCBC-MAC out 1 1 + * 1 AES_CBC HMAC-SHA256 out 1 1 + * 1 AES_CTR HMAC-SHA512 out 1 1 + * 1 AES_CTR HMAC-SHA256 in 1 0 + * 1 AES_CTR HMAC-SHA1 in 1 0 + * 1 AES_CBC HMAC-SHA512 out 1 1 + * 1 AES_CBC HMAC-SHA384 out 1 1 + * 1 AES_CTR HMAC-SHA1 out 1 1 + * 1 AES_CTR HMAC-SHA256 out 1 1 + * 1 AES_CBC HMAC-SHA1 in 1 0 + * 1 AES_CBC AES-XCBC-MAC in 1 0 + * 1 AES_CTR HMAC-SHA512 in 1 0 + * 1 AES_CBC HMAC-SHA1 out 1 1 + * 1 AES_CBC HMAC-SHA384 in 1 0 + * 2 AES_CTR AES-XCBC-MAC in 1 2 + * 2 AES_CTR HMAC-SHA384 in 1 2 + * 2 AES_CTR HMAC-SHA384 out 1 3 + * 2 AES_CBC HMAC-SHA512 in 1 2 + * 2 AES_CBC HMAC-SHA256 in 1 2 + * 2 AES_CBC AES-XCBC-MAC out 1 3 + * 2 AES_CTR AES-XCBC-MAC out 1 3 + * 2 AES_CBC HMAC-SHA256 out 1 3 + * 2 AES_CTR HMAC-SHA512 out 1 3 + * 2 AES_CTR HMAC-SHA256 in 1 2 + * 2 AES_CTR HMAC-SHA1 in 1 2 + * 2 AES_CBC HMAC-SHA512 out 1 3 + * 2 AES_CBC HMAC-SHA384 out 1 3 + * 2 AES_CTR HMAC-SHA1 out 1 3 + * 2 AES_CTR HMAC-SHA256 out 1 3 + * 2 AES_CBC HMAC-SHA1 in 1 2 + * 2 AES_CBC AES-XCBC-MAC in 1 2 + * 2 AES_CTR HMAC-SHA512 in 1 2 + * 2 AES_CBC HMAC-SHA1 out 1 3 + * 2 AES_CBC HMAC-SHA384 in 1 2 + * @cliexend +?*/ /* *INDENT-OFF* */ VLIB_CLI_COMMAND (lcore_cryptodev_map, static) = { .path = "show crypto device mapping", 
.short_help = - "show cryptodev device mapping ", + "show cryptodev device mapping [verbose]", .function = lcore_cryptodev_map_fn, }; /* *INDENT-ON* */ diff --git a/src/plugins/dpdk/ipsec/dir.dox b/src/plugins/dpdk/ipsec/dir.dox index ffebfc4d..05504541 100644 --- a/src/plugins/dpdk/ipsec/dir.dox +++ b/src/plugins/dpdk/ipsec/dir.dox @@ -1,5 +1,6 @@ /* * Copyright (c) 2016 Intel and/or its affiliates. + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: @@ -12,7 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/* Doxygen directory documentation */ + /** - @dir vnet/vnet/devices/dpdk/ipsec - @brief IPSec ESP encrypt/decrypt using DPDK Cryptodev API +@dir src/plugins/dpdk/ipsec +@brief IPSec ESP encrypt/decrypt using DPDK Cryptodev API. + +This directory contains the source code for the DPDK Crypto abstraction layer. + */ +/*? %%clicmd:group_label DPDK Crypto %% ?*/ +/*? %%syscfg:group_label DPDK Crypto %% ?*/ diff --git a/src/plugins/dpdk/ipsec/dpdk_crypto_ipsec_doc.md b/src/plugins/dpdk/ipsec/dpdk_crypto_ipsec_doc.md index fed2fe0e..b3d3cc48 100644 --- a/src/plugins/dpdk/ipsec/dpdk_crypto_ipsec_doc.md +++ b/src/plugins/dpdk/ipsec/dpdk_crypto_ipsec_doc.md @@ -40,7 +40,7 @@ A couple of ways to achive this: * uncomment/add it in the platforms config (ie. build-data/platforms/vpp.mk) * set the option when building vpp (ie. make vpp_uses_dpdk_cryptodev_sw=yes build-release) -When enabling SW Cryptodev support, it means that you need to pre-build the required crypto libraries needed by those SW Cryptodev PMDs. +When enabling SW Cryptodev support, it means that you need to pre-build the required crypto libraries needed by those SW Cryptodev PMDs. This requires nasm, see nasm section below. 
### Crypto Resources allocation @@ -84,3 +84,22 @@ For further details refer to [DPDK Crypto Device Driver documentation](http://dp The following CLI command displays the Cryptodev/Worker mapping: show crypto device mapping [verbose] + + +### nasm + +Building the DPDK Crypto Libraries requires the open source project nasm (The Netwide +Assembler) to be installed. Recommended version of nasm is 2.12.02. Minimum supported +version of nasm is 2.11.06. Use the following command to determine the current nasm version: + + nasm -v + +CentOS 7.3 and earlier and Fedora 21 and earlier use unsupported versions +of nasm. Use the following set of commands to build a supported version: + + wget http://www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2 + tar -xjvf nasm-2.12.02.tar.bz2 + cd nasm-2.12.02/ + ./configure + make + sudo make install -- cgit 1.2.3-korg From 803c51d7a17109eb0881a0b9da20a45f3a0391ab Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Tue, 14 Mar 2017 12:23:30 -0400 Subject: Fix packet trace output: dpdk_rx_trace must decode the actual ethertype As opposed to rubbish (typically) 14 octets past the ethertype. Also fix buffer error code setup in dpdk-input node single loop. 
Change-Id: Ide7c4097d3bb91e62749ed4e1d69a7b4b90225de Signed-off-by: Dave Barach Signed-off-by: John Lo --- src/plugins/dpdk/device/node.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index e8d502ca..e740fd18 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -73,6 +73,35 @@ dpdk_rx_next_from_etype (struct rte_mbuf * mb, vlib_buffer_t * b0) return VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; } +always_inline u32 +dpdk_rx_next_from_packet_start (struct rte_mbuf * mb, vlib_buffer_t * b0) +{ + word start_delta; + int rv; + + start_delta = b0->current_data - + ((mb->buf_addr + mb->data_off) - (void *) b0->data); + + vlib_buffer_advance (b0, -start_delta); + + if (PREDICT_TRUE (vlib_buffer_is_ip4 (b0))) + { + if (PREDICT_TRUE ((mb->ol_flags & PKT_RX_IP_CKSUM_GOOD) != 0)) + rv = VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT; + else + rv = VNET_DEVICE_INPUT_NEXT_IP4_INPUT; + } + else if (PREDICT_TRUE (vlib_buffer_is_ip6 (b0))) + rv = VNET_DEVICE_INPUT_NEXT_IP6_INPUT; + else if (PREDICT_TRUE (vlib_buffer_is_mpls (b0))) + rv = VNET_DEVICE_INPUT_NEXT_MPLS_INPUT; + else + rv = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; + + vlib_buffer_advance (b0, start_delta); + return rv; +} + always_inline void dpdk_rx_error_from_mb (struct rte_mbuf *mb, u32 * next, u8 * error) { @@ -115,7 +144,7 @@ dpdk_rx_trace (dpdk_main_t * dm, if (PREDICT_FALSE (xd->per_interface_next_index != ~0)) next0 = xd->per_interface_next_index; else - next0 = dpdk_rx_next_from_etype (mb, b0); + next0 = dpdk_rx_next_from_packet_start (mb, b0); dpdk_rx_error_from_mb (mb, &next0, &error0); @@ -488,6 +517,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, next0 = dpdk_rx_next_from_etype (mb0, b0); dpdk_rx_error_from_mb (mb0, &next0, &error0); + b0->error = node->errors[error0]; vlib_buffer_advance (b0, 
device_input_next_node_advance[next0]); -- cgit 1.2.3-korg From ed812458641e8805e98a5bb399248745bae5d6b7 Mon Sep 17 00:00:00 2001 From: "Alexander Popovsky (apopovsk)" Date: Fri, 17 Mar 2017 12:08:16 -0700 Subject: Fix bonded ethernet PMD name for DPDK 17.02 In DPDK 17.02 bonded PMD name is changed from rte_bond_pmd to net_bonding. Change-Id: I1a57a16b0ae68b5fa56a561a4f75981112228572 Signed-off-by: Alexander Popovsky (apopovsk) --- src/plugins/dpdk/device/dpdk.h | 21 +++++++++++++++++++++ src/plugins/dpdk/device/init.c | 10 +++------- 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 84f86ae2..6328d115 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -66,6 +66,26 @@ extern vnet_device_class_t dpdk_device_class; extern vlib_node_registration_t dpdk_input_node; extern vlib_node_registration_t handoff_dispatch_node; +#if RTE_VERSION >= RTE_VERSION_NUM(17, 2, 0, 0) +#define foreach_dpdk_pmd \ + _ ("net_thunderx", THUNDERX) \ + _ ("net_e1000_em", E1000EM) \ + _ ("net_e1000_igb", IGB) \ + _ ("net_e1000_igb_vf", IGBVF) \ + _ ("net_ixgbe", IXGBE) \ + _ ("net_ixgbe_vf", IXGBEVF) \ + _ ("net_i40e", I40E) \ + _ ("net_i40e_vf", I40EVF) \ + _ ("net_virtio", VIRTIO) \ + _ ("net_enic", ENIC) \ + _ ("net_vmxnet3", VMXNET3) \ + _ ("AF_PACKET PMD", AF_PACKET) \ + _ ("net_bonding", BOND) \ + _ ("net_fm10k", FM10K) \ + _ ("net_cxgbe", CXGBE) \ + _ ("net_mlx5", MLX5) \ + _ ("net_dpaa2", DPAA2) +#else #define foreach_dpdk_pmd \ _ ("net_thunderx", THUNDERX) \ _ ("net_e1000_em", E1000EM) \ @@ -84,6 +104,7 @@ extern vlib_node_registration_t handoff_dispatch_node; _ ("net_cxgbe", CXGBE) \ _ ("net_mlx5", MLX5) \ _ ("net_dpaa2", DPAA2) +#endif typedef enum { diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 110d7457..ca905326 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c 
@@ -1872,13 +1872,9 @@ dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) { for (i = 0; i < nports; i++) { - struct rte_eth_dev_info dev_info; - rte_eth_dev_info_get (i, &dev_info); - if (!dev_info.driver_name) - dev_info.driver_name = dev_info.pci_dev->driver->driver.name; - - ASSERT (dev_info.driver_name); - if (strncmp (dev_info.driver_name, "rte_bond_pmd", 12) == 0) + xd = &dm->devices[i]; + ASSERT (i == xd->device_index); + if (xd->pmd == VNET_DPDK_PMD_BOND) { u8 addr[6]; u8 slink[16]; -- cgit 1.2.3-korg From 95475a3661b95150c8d1e60a3942b91c2b5d06bc Mon Sep 17 00:00:00 2001 From: John Lo Date: Fri, 17 Mar 2017 18:05:35 -0400 Subject: Fix bonded interface name to use instance number starting from 0 With DPDK 17.02, bonded interface device_index does not start from 0 and may vary depending on the existence of other interfaces. Implement instance number for bonded interface so the interface name can make use of instance number starting from 0. Change-Id: Ia8eb7bd530446faa02ee7c7d1f6abdc22ac60b62 Signed-off-by: John Lo --- src/plugins/dpdk/device/dpdk.h | 4 ++-- src/plugins/dpdk/device/format.c | 4 ++-- src/plugins/dpdk/device/init.c | 4 +++- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 6328d115..06d89adb 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -220,8 +220,8 @@ typedef struct dpdk_device_hqos_per_worker_thread_t *hqos_wt; dpdk_device_hqos_per_hqos_thread_t *hqos_ht; - /* af_packet */ - u8 af_packet_port_id; + /* af_packet or BondEthernet instance number */ + u8 port_id; struct rte_eth_link link; f64 time_last_link_update; diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c index 25a8c5cb..a09a3f83 100644 --- a/src/plugins/dpdk/device/format.c +++ b/src/plugins/dpdk/device/format.c @@ -177,7 +177,7 @@ format_dpdk_device_name (u8 * s, va_list *
args) break; case VNET_DPDK_PORT_TYPE_ETH_BOND: - return format (s, "BondEthernet%d", dm->devices[i].device_index); + return format (s, "BondEthernet%d", dm->devices[i].port_id); case VNET_DPDK_PORT_TYPE_ETH_SWITCH: device_name = "EthernetSwitch"; @@ -185,7 +185,7 @@ format_dpdk_device_name (u8 * s, va_list * args) case VNET_DPDK_PORT_TYPE_AF_PACKET: rte_eth_dev_info_get (i, &dev_info); - return format (s, "af_packet%d", dm->devices[i].af_packet_port_id); + return format (s, "af_packet%d", dm->devices[i].port_id); default: case VNET_DPDK_PORT_TYPE_UNKNOWN: diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index ca905326..145162eb 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -516,6 +516,7 @@ dpdk_lib_init (dpdk_main_t * dm) u32 next_cpu = 0, next_hqos_cpu = 0; u8 af_packet_port_id = 0; + u8 bond_ether_port_id = 0; last_pci_addr.as_u32 = ~0; dm->input_cpu_first_index = 0; @@ -862,12 +863,13 @@ dpdk_lib_init (dpdk_main_t * dm) case VNET_DPDK_PMD_AF_PACKET: xd->port_type = VNET_DPDK_PORT_TYPE_AF_PACKET; - xd->af_packet_port_id = af_packet_port_id++; + xd->port_id = af_packet_port_id++; break; case VNET_DPDK_PMD_BOND: xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE; xd->port_type = VNET_DPDK_PORT_TYPE_ETH_BOND; + xd->port_id = bond_ether_port_id++; break; default: -- cgit 1.2.3-korg From 24d01367c869f968b9e21c0cae4df8c5941fdf16 Mon Sep 17 00:00:00 2001 From: John Lo Date: Wed, 22 Mar 2017 13:27:27 -0400 Subject: Update MAC address kept for Bonded Interface and its Slaves Change-Id: I9f7f9f840c3c1aad5e8c9a4fa1ba7a58a85cfd9e Signed-off-by: John Lo --- src/plugins/dpdk/device/init.c | 55 ++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 23 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 145162eb..9dc3fcce 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ 
-1888,24 +1888,24 @@ dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) int rv; /* Get MAC of 1st slave link */ - rte_eth_macaddr_get (slink[0], - (struct ether_addr *) addr); + rte_eth_macaddr_get + (slink[0], (struct ether_addr *) addr); + /* Set MAC of bounded interface to that of 1st slave link */ - rv = - rte_eth_bond_mac_address_set (i, - (struct ether_addr *) - addr); - if (rv < 0) - clib_warning ("Failed to set MAC address"); + clib_warning ("Set MAC for bond dev# %d", i); + rv = rte_eth_bond_mac_address_set + (i, (struct ether_addr *) addr); + if (rv) + clib_warning ("Set MAC addr failure rv=%d", rv); /* Populate MAC of bonded interface in VPP hw tables */ - bhi = - vnet_get_hw_interface (vnm, - dm->devices[i].vlib_hw_if_index); - bei = - pool_elt_at_index (em->interfaces, bhi->hw_instance); + bhi = vnet_get_hw_interface + (vnm, dm->devices[i].vlib_hw_if_index); + bei = pool_elt_at_index + (em->interfaces, bhi->hw_instance); clib_memcpy (bhi->hw_address, addr, 6); clib_memcpy (bei->address, addr, 6); + /* Init l3 packet size allowed on bonded interface */ bhi->max_packet_bytes = ETHERNET_MAX_PACKET_BYTES; bhi->max_l3_packet_bytes[VLIB_RX] = @@ -1917,22 +1917,31 @@ dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) dpdk_device_t *sdev = &dm->devices[slave]; vnet_hw_interface_t *shi; vnet_sw_interface_t *ssi; + ethernet_interface_t *sei; /* Add MAC to all slave links except the first one */ if (nlink) - rte_eth_dev_mac_addr_add (slave, - (struct ether_addr *) - addr, 0); + { + clib_warning ("Add MAC for slave dev# %d", slave); + rv = rte_eth_dev_mac_addr_add + (slave, (struct ether_addr *) addr, 0); + if (rv) + clib_warning ("Add MAC addr failure rv=%d", rv); + } /* Set slaves bitmap for bonded interface */ - bhi->bond_info = - clib_bitmap_set (bhi->bond_info, - sdev->vlib_hw_if_index, 1); + bhi->bond_info = clib_bitmap_set + (bhi->bond_info, sdev->vlib_hw_if_index, 1); /* Set slave link flags on slave 
interface */ - shi = - vnet_get_hw_interface (vnm, sdev->vlib_hw_if_index); - ssi = - vnet_get_sw_interface (vnm, sdev->vlib_sw_if_index); + shi = vnet_get_hw_interface + (vnm, sdev->vlib_hw_if_index); + ssi = vnet_get_sw_interface + (vnm, sdev->vlib_sw_if_index); + sei = pool_elt_at_index + (em->interfaces, shi->hw_instance); + shi->bond_info = VNET_HW_INTERFACE_BOND_INFO_SLAVE; ssi->flags |= VNET_SW_INTERFACE_FLAG_BOND_SLAVE; + clib_memcpy (shi->hw_address, addr, 6); + clib_memcpy (sei->address, addr, 6); /* Set l3 packet size allowed as the lowest of slave */ if (bhi->max_l3_packet_bytes[VLIB_RX] > -- cgit 1.2.3-korg From 0f60ff8af3dd72bb1fa8f13886a80d110d78c7b0 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 30 Mar 2017 17:58:42 +0200 Subject: dpdk: add support for Mellanox ConnectX-5 devices Change-Id: I3ed2834a326eac50a7cb4faa592f42fd06325d5a Signed-off-by: Damjan Marion --- dpdk/Makefile | 2 +- ...t-of-buffer-counter-to-extended-statistic.patch | 135 ++++ ...t-mlx5-remove-unused-interface-name-query.patch | 22 + ...mlx5-fix-extended-statistics-wrong-number.patch | 87 +++ ...tended-statistics-counters-identification.patch | 13 + ...5-fix-startup-when-flow-cannot-be-applied.patch | 57 ++ .../0006-net-mlx5-add-hardware-TSO-support.patch | 385 ++++++++++ ...d-hardware-checksum-offload-for-tunnel-pa.patch | 194 +++++ ...d-enhanced-multi-packet-send-for-ConnectX.patch | 809 +++++++++++++++++++++ src/plugins/dpdk/device/init.c | 5 +- 10 files changed, 1707 insertions(+), 2 deletions(-) create mode 100644 dpdk/dpdk-17.02_patches/0001-dpdk-dev-net-mlx5-add-out-of-buffer-counter-to-extended-statistic.patch create mode 100644 dpdk/dpdk-17.02_patches/0002-dpdk-dev-1-2-net-mlx5-remove-unused-interface-name-query.patch create mode 100644 dpdk/dpdk-17.02_patches/0003-dpdk-dev-2-2-net-mlx5-fix-extended-statistics-wrong-number.patch create mode 100644 dpdk/dpdk-17.02_patches/0004-dpdk-dev-net-mlx5-fix-extended-statistics-counters-identification.patch create mode 
100644 dpdk/dpdk-17.02_patches/0005-net-mlx5-fix-startup-when-flow-cannot-be-applied.patch create mode 100644 dpdk/dpdk-17.02_patches/0006-net-mlx5-add-hardware-TSO-support.patch create mode 100644 dpdk/dpdk-17.02_patches/0007-add-hardware-checksum-offload-for-tunnel-pa.patch create mode 100644 dpdk/dpdk-17.02_patches/0008-net-mlx5-add-enhanced-multi-packet-send-for-ConnectX.patch (limited to 'src/plugins/dpdk/device') diff --git a/dpdk/Makefile b/dpdk/Makefile index fc93f9c6..c46ef0f1 100644 --- a/dpdk/Makefile +++ b/dpdk/Makefile @@ -25,7 +25,7 @@ DPDK_MLX5_PMD ?= n B := $(DPDK_BUILD_DIR) I := $(DPDK_INSTALL_DIR) DPDK_VERSION ?= 17.02 -PKG_SUFFIX ?= vpp2 +PKG_SUFFIX ?= vpp3 DPDK_BASE_URL ?= http://fast.dpdk.org/rel DPDK_TARBALL := dpdk-$(DPDK_VERSION).tar.xz DPDK_TAR_URL := $(DPDK_BASE_URL)/$(DPDK_TARBALL) diff --git a/dpdk/dpdk-17.02_patches/0001-dpdk-dev-net-mlx5-add-out-of-buffer-counter-to-extended-statistic.patch b/dpdk/dpdk-17.02_patches/0001-dpdk-dev-net-mlx5-add-out-of-buffer-counter-to-extended-statistic.patch new file mode 100644 index 00000000..3ebf5e8a --- /dev/null +++ b/dpdk/dpdk-17.02_patches/0001-dpdk-dev-net-mlx5-add-out-of-buffer-counter-to-extended-statistic.patch @@ -0,0 +1,135 @@ +diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h +index 879da5e..2b4345a 100644 +--- a/drivers/net/mlx5/mlx5.h ++++ b/drivers/net/mlx5/mlx5.h +@@ -197,6 +197,8 @@ struct mlx5_secondary_data { + int mlx5_is_secondary(void); + int priv_get_ifname(const struct priv *, char (*)[IF_NAMESIZE]); + int priv_ifreq(const struct priv *, int req, struct ifreq *); ++int priv_is_ib_cntr(const char *); ++int priv_get_cntr_sysfs(struct priv *, const char *, uint64_t *); + int priv_get_num_vfs(struct priv *, uint16_t *); + int priv_get_mtu(struct priv *, uint16_t *); + int priv_set_flags(struct priv *, unsigned int, unsigned int); +diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c +index 2145965..6b64f44 100644 +--- 
a/drivers/net/mlx5/mlx5_ethdev.c ++++ b/drivers/net/mlx5/mlx5_ethdev.c +@@ -234,6 +234,23 @@ struct priv * + } + + /** ++ * Check if the counter is located on ib counters file. ++ * ++ * @param[in] cntr ++ * Counter name. ++ * ++ * @return ++ * 1 if counter is located on ib counters file , 0 otherwise. ++ */ ++int ++priv_is_ib_cntr(const char *cntr) ++{ ++ if (!strcmp(cntr, "out_of_buffer")) ++ return 1; ++ return 0; ++} ++ ++/** + * Read from sysfs entry. + * + * @param[in] priv +@@ -260,10 +277,15 @@ struct priv * + if (priv_get_ifname(priv, &ifname)) + return -1; + +- MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, +- ifname, entry); +- +- file = fopen(path, "rb"); ++ if (priv_is_ib_cntr(entry)) { ++ MKSTR(path, "%s/ports/1/hw_counters/%s", ++ priv->ctx->device->ibdev_path, entry); ++ file = fopen(path, "rb"); ++ } else { ++ MKSTR(path, "%s/device/net/%s/%s", ++ priv->ctx->device->ibdev_path, ifname, entry); ++ file = fopen(path, "rb"); ++ } + if (file == NULL) + return -1; + ret = fread(buf, 1, size, file); +@@ -469,6 +491,30 @@ struct priv * + } + + /** ++ * Read device counter from sysfs. ++ * ++ * @param priv ++ * Pointer to private structure. ++ * @param name ++ * Counter name. ++ * @param[out] cntr ++ * Counter output buffer. ++ * ++ * @return ++ * 0 on success, -1 on failure and errno is set. ++ */ ++int ++priv_get_cntr_sysfs(struct priv *priv, const char *name, uint64_t *cntr) ++{ ++ unsigned long ulong_ctr; ++ ++ if (priv_get_sysfs_ulong(priv, name, &ulong_ctr) == -1) ++ return -1; ++ *cntr = ulong_ctr; ++ return 0; ++} ++ ++/** + * Set device MTU. 
+ * + * @param priv +diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c +index 20c957e..a48ebea 100644 +--- a/drivers/net/mlx5/mlx5_stats.c ++++ b/drivers/net/mlx5/mlx5_stats.c +@@ -125,6 +125,10 @@ struct mlx5_counter_ctrl { + .dpdk_name = "tx_errors_phy", + .ctr_name = "tx_errors_phy", + }, ++ { ++ .dpdk_name = "rx_out_of_buffer", ++ .ctr_name = "out_of_buffer", ++ }, + }; + + static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init); +@@ -159,9 +163,15 @@ struct mlx5_counter_ctrl { + WARN("unable to read statistic values from device"); + return -1; + } +- for (i = 0; i != xstats_n; ++i) +- stats[i] = (uint64_t) +- et_stats->data[xstats_ctrl->dev_table_idx[i]]; ++ for (i = 0; i != xstats_n; ++i) { ++ if (priv_is_ib_cntr(mlx5_counters_init[i].ctr_name)) ++ priv_get_cntr_sysfs(priv, ++ mlx5_counters_init[i].ctr_name, ++ &stats[i]); ++ else ++ stats[i] = (uint64_t) ++ et_stats->data[xstats_ctrl->dev_table_idx[i]]; ++ } + return 0; + } + +@@ -233,6 +243,8 @@ struct mlx5_counter_ctrl { + } + } + for (j = 0; j != xstats_n; ++j) { ++ if (priv_is_ib_cntr(mlx5_counters_init[i].ctr_name)) ++ continue; + if (xstats_ctrl->dev_table_idx[j] >= dev_stats_n) { + WARN("counter \"%s\" is not recognized", + mlx5_counters_init[j].dpdk_name); diff --git a/dpdk/dpdk-17.02_patches/0002-dpdk-dev-1-2-net-mlx5-remove-unused-interface-name-query.patch b/dpdk/dpdk-17.02_patches/0002-dpdk-dev-1-2-net-mlx5-remove-unused-interface-name-query.patch new file mode 100644 index 00000000..aa03639b --- /dev/null +++ b/dpdk/dpdk-17.02_patches/0002-dpdk-dev-1-2-net-mlx5-remove-unused-interface-name-query.patch @@ -0,0 +1,22 @@ +diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c +index 20c957e..0c80e4f 100644 +--- a/drivers/net/mlx5/mlx5_stats.c ++++ b/drivers/net/mlx5/mlx5_stats.c +@@ -177,17 +177,12 @@ struct mlx5_counter_ctrl { + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + unsigned int i; + unsigned int j; +- char 
ifname[IF_NAMESIZE]; + struct ifreq ifr; + struct ethtool_drvinfo drvinfo; + struct ethtool_gstrings *strings = NULL; + unsigned int dev_stats_n; + unsigned int str_sz; + +- if (priv_get_ifname(priv, &ifname)) { +- WARN("unable to get interface name"); +- return; +- } + /* How many statistics are available. */ + drvinfo.cmd = ETHTOOL_GDRVINFO; + ifr.ifr_data = (caddr_t)&drvinfo; diff --git a/dpdk/dpdk-17.02_patches/0003-dpdk-dev-2-2-net-mlx5-fix-extended-statistics-wrong-number.patch b/dpdk/dpdk-17.02_patches/0003-dpdk-dev-2-2-net-mlx5-fix-extended-statistics-wrong-number.patch new file mode 100644 index 00000000..05c2e8df --- /dev/null +++ b/dpdk/dpdk-17.02_patches/0003-dpdk-dev-2-2-net-mlx5-fix-extended-statistics-wrong-number.patch @@ -0,0 +1,87 @@ +diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c +index 0c80e4f..60ffbaa 100644 +--- a/drivers/net/mlx5/mlx5_stats.c ++++ b/drivers/net/mlx5/mlx5_stats.c +@@ -166,6 +166,29 @@ struct mlx5_counter_ctrl { + } + + /** ++ * Query the number of statistics provided by ETHTOOL. ++ * ++ * @param priv ++ * Pointer to private structure. ++ * ++ * @return ++ * Number of statistics on success, -1 on error. ++ */ ++static int ++priv_ethtool_get_stats_n(struct priv *priv) { ++ struct ethtool_drvinfo drvinfo; ++ struct ifreq ifr; ++ ++ drvinfo.cmd = ETHTOOL_GDRVINFO; ++ ifr.ifr_data = (caddr_t)&drvinfo; ++ if (priv_ifreq(priv, SIOCETHTOOL, &ifr) != 0) { ++ WARN("unable to query number of statistics"); ++ return -1; ++ } ++ return drvinfo.n_stats; ++} ++ ++/** + * Init the structures to read device counters. + * + * @param priv +@@ -178,19 +201,11 @@ struct mlx5_counter_ctrl { + unsigned int i; + unsigned int j; + struct ifreq ifr; +- struct ethtool_drvinfo drvinfo; + struct ethtool_gstrings *strings = NULL; + unsigned int dev_stats_n; + unsigned int str_sz; + +- /* How many statistics are available. 
*/ +- drvinfo.cmd = ETHTOOL_GDRVINFO; +- ifr.ifr_data = (caddr_t)&drvinfo; +- if (priv_ifreq(priv, SIOCETHTOOL, &ifr) != 0) { +- WARN("unable to get driver info"); +- return; +- } +- dev_stats_n = drvinfo.n_stats; ++ dev_stats_n = priv_ethtool_get_stats_n(priv); + if (dev_stats_n < 1) { + WARN("no extended statistics available"); + return; +@@ -410,7 +425,15 @@ struct mlx5_counter_ctrl { + int ret = xstats_n; + + if (n >= xstats_n && stats) { ++ struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; ++ int stats_n; ++ + priv_lock(priv); ++ stats_n = priv_ethtool_get_stats_n(priv); ++ if (stats_n < 0) ++ return -1; ++ if (xstats_ctrl->stats_n != stats_n) ++ priv_xstats_init(priv); + ret = priv_xstats_get(priv, stats); + priv_unlock(priv); + } +@@ -427,8 +450,15 @@ struct mlx5_counter_ctrl { + mlx5_xstats_reset(struct rte_eth_dev *dev) + { + struct priv *priv = mlx5_get_priv(dev); ++ struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; ++ int stats_n; + + priv_lock(priv); ++ stats_n = priv_ethtool_get_stats_n(priv); ++ if (stats_n < 0) ++ return; ++ if (xstats_ctrl->stats_n != stats_n) ++ priv_xstats_init(priv); + priv_xstats_reset(priv); + priv_unlock(priv); + } diff --git a/dpdk/dpdk-17.02_patches/0004-dpdk-dev-net-mlx5-fix-extended-statistics-counters-identification.patch b/dpdk/dpdk-17.02_patches/0004-dpdk-dev-net-mlx5-fix-extended-statistics-counters-identification.patch new file mode 100644 index 00000000..8c066ad3 --- /dev/null +++ b/dpdk/dpdk-17.02_patches/0004-dpdk-dev-net-mlx5-fix-extended-statistics-counters-identification.patch @@ -0,0 +1,13 @@ +diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c +index 1953293..703f48c 100644 +--- a/drivers/net/mlx5/mlx5_stats.c ++++ b/drivers/net/mlx5/mlx5_stats.c +@@ -253,7 +253,7 @@ struct mlx5_counter_ctrl { + } + } + for (j = 0; j != xstats_n; ++j) { +- if (priv_is_ib_cntr(mlx5_counters_init[i].ctr_name)) ++ if (priv_is_ib_cntr(mlx5_counters_init[j].ctr_name)) + continue; + if 
(xstats_ctrl->dev_table_idx[j] >= dev_stats_n) { + WARN("counter \"%s\" is not recognized", diff --git a/dpdk/dpdk-17.02_patches/0005-net-mlx5-fix-startup-when-flow-cannot-be-applied.patch b/dpdk/dpdk-17.02_patches/0005-net-mlx5-fix-startup-when-flow-cannot-be-applied.patch new file mode 100644 index 00000000..af928bb2 --- /dev/null +++ b/dpdk/dpdk-17.02_patches/0005-net-mlx5-fix-startup-when-flow-cannot-be-applied.patch @@ -0,0 +1,57 @@ +From 0866d640e42d6c54b2b3f15ebde9930e756ba4d5 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?N=C3=A9lio=20Laranjeiro?= +Date: Wed, 22 Feb 2017 10:57:52 +0100 +Subject: [PATCH] net/mlx5: fix startup when flow cannot be applied + +When flows cannot be re-applied due to configuration modifications, the +start function should rollback the configuration done. + +Fixes: 2097d0d1e2cc ("net/mlx5: support basic flow items and actions") +Cc: stable@dpdk.org + +Signed-off-by: Nelio Laranjeiro +--- + drivers/net/mlx5/mlx5_trigger.c | 21 ++++++++++++++++----- + 1 file changed, 16 insertions(+), 5 deletions(-) + +diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c +index 30addd2..0acbf28 100644 +--- a/drivers/net/mlx5/mlx5_trigger.c ++++ b/drivers/net/mlx5/mlx5_trigger.c +@@ -82,17 +82,28 @@ mlx5_dev_start(struct rte_eth_dev *dev) + ERROR("%p: an error occurred while configuring hash RX queues:" + " %s", + (void *)priv, strerror(err)); +- /* Rollback. 
*/ +- priv_special_flow_disable_all(priv); +- priv_mac_addrs_disable(priv); +- priv_destroy_hash_rxqs(priv); ++ goto error; + } + if (dev->data->dev_conf.fdir_conf.mode != RTE_FDIR_MODE_NONE) + priv_fdir_enable(priv); +- priv_dev_interrupt_handler_install(priv, dev); + err = priv_flow_start(priv); ++ if (err) { ++ priv->started = 0; ++ ERROR("%p: an error occurred while configuring flows:" ++ " %s", ++ (void *)priv, strerror(err)); ++ goto error; ++ } ++ priv_dev_interrupt_handler_install(priv, dev); + priv_xstats_init(priv); + priv_unlock(priv); ++ return 0; ++error: ++ /* Rollback. */ ++ priv_special_flow_disable_all(priv); ++ priv_mac_addrs_disable(priv); ++ priv_destroy_hash_rxqs(priv); ++ priv_flow_stop(priv); + return -err; + } + +-- +2.7.4 + diff --git a/dpdk/dpdk-17.02_patches/0006-net-mlx5-add-hardware-TSO-support.patch b/dpdk/dpdk-17.02_patches/0006-net-mlx5-add-hardware-TSO-support.patch new file mode 100644 index 00000000..929a6132 --- /dev/null +++ b/dpdk/dpdk-17.02_patches/0006-net-mlx5-add-hardware-TSO-support.patch @@ -0,0 +1,385 @@ +From e25bad4a287924d26627ffe307f8a12824b87054 Mon Sep 17 00:00:00 2001 +From: Shahaf Shuler +Date: Thu, 2 Mar 2017 11:01:31 +0200 +Subject: [PATCH] net/mlx5: add hardware TSO support + +Implement support for hardware TSO. 
+ +Signed-off-by: Shahaf Shuler +Acked-by: Nelio Laranjeiro +--- + doc/guides/nics/features/mlx5.ini | 1 + + doc/guides/nics/mlx5.rst | 12 ++++ + drivers/net/mlx5/mlx5.c | 18 ++++++ + drivers/net/mlx5/mlx5.h | 2 + + drivers/net/mlx5/mlx5_defs.h | 3 + + drivers/net/mlx5/mlx5_ethdev.c | 2 + + drivers/net/mlx5/mlx5_rxtx.c | 123 +++++++++++++++++++++++++++++++++----- + drivers/net/mlx5/mlx5_rxtx.h | 2 + + drivers/net/mlx5/mlx5_txq.c | 13 ++++ + 9 files changed, 160 insertions(+), 16 deletions(-) + +diff --git a/doc/guides/nics/features/mlx5.ini b/doc/guides/nics/features/mlx5.ini +index f20d214..8df25ce 100644 +--- a/doc/guides/nics/features/mlx5.ini ++++ b/doc/guides/nics/features/mlx5.ini +@@ -11,6 +11,7 @@ Queue start/stop = Y + MTU update = Y + Jumbo frame = Y + Scattered Rx = Y ++TSO = Y + Promiscuous mode = Y + Allmulticast mode = Y + Unicast MAC filter = Y +diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst +index 5f6e594..9b0ba29 100644 +--- a/doc/guides/nics/mlx5.rst ++++ b/doc/guides/nics/mlx5.rst +@@ -90,6 +90,7 @@ Features + - Secondary process TX is supported. + - KVM and VMware ESX SR-IOV modes are supported. + - RSS hash result is supported. ++- Hardware TSO. + + Limitations + ----------- +@@ -186,9 +187,20 @@ Run-time configuration + save PCI bandwidth and improve performance at the cost of a slightly + higher CPU usage. + ++ This option cannot be used in conjunction with ``tso`` below. When ``tso`` ++ is set, ``txq_mpw_en`` is disabled. ++ + It is currently only supported on the ConnectX-4 Lx and ConnectX-5 + families of adapters. Enabled by default. + ++- ``tso`` parameter [int] ++ ++ A nonzero value enables hardware TSO. ++ When hardware TSO is enabled, packets marked with TCP segmentation ++ offload will be divided into segments by the hardware. ++ ++ Disabled by default. 
++ + Prerequisites + ------------- + +diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c +index d4bd469..03ed3b3 100644 +--- a/drivers/net/mlx5/mlx5.c ++++ b/drivers/net/mlx5/mlx5.c +@@ -84,6 +84,9 @@ + /* Device parameter to enable multi-packet send WQEs. */ + #define MLX5_TXQ_MPW_EN "txq_mpw_en" + ++/* Device parameter to enable hardware TSO offload. */ ++#define MLX5_TSO "tso" ++ + /** + * Retrieve integer value from environment variable. + * +@@ -290,6 +293,8 @@ mlx5_args_check(const char *key, const char *val, void *opaque) + priv->txqs_inline = tmp; + } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { + priv->mps &= !!tmp; /* Enable MPW only if HW supports */ ++ } else if (strcmp(MLX5_TSO, key) == 0) { ++ priv->tso = !!tmp; + } else { + WARN("%s: unknown parameter", key); + return -EINVAL; +@@ -316,6 +321,7 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs) + MLX5_TXQ_INLINE, + MLX5_TXQS_MIN_INLINE, + MLX5_TXQ_MPW_EN, ++ MLX5_TSO, + NULL, + }; + struct rte_kvargs *kvlist; +@@ -479,6 +485,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) + IBV_EXP_DEVICE_ATTR_RX_HASH | + IBV_EXP_DEVICE_ATTR_VLAN_OFFLOADS | + IBV_EXP_DEVICE_ATTR_RX_PAD_END_ALIGN | ++ IBV_EXP_DEVICE_ATTR_TSO_CAPS | + 0; + + DEBUG("using port %u (%08" PRIx32 ")", port, test); +@@ -580,11 +587,22 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) + + priv_get_num_vfs(priv, &num_vfs); + priv->sriov = (num_vfs || sriov); ++ priv->tso = ((priv->tso) && ++ (exp_device_attr.tso_caps.max_tso > 0) && ++ (exp_device_attr.tso_caps.supported_qpts & ++ (1 << IBV_QPT_RAW_ETH))); ++ if (priv->tso) ++ priv->max_tso_payload_sz = ++ exp_device_attr.tso_caps.max_tso; + if (priv->mps && !mps) { + ERROR("multi-packet send not supported on this device" + " (" MLX5_TXQ_MPW_EN ")"); + err = ENOTSUP; + goto port_error; ++ } else if (priv->mps && priv->tso) { ++ WARN("multi-packet send not supported in conjunction " ++ "with TSO. 
MPS disabled"); ++ priv->mps = 0; + } + /* Allocate and register default RSS hash keys. */ + priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n, +diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h +index 4c4b9d4..93f129b 100644 +--- a/drivers/net/mlx5/mlx5.h ++++ b/drivers/net/mlx5/mlx5.h +@@ -126,6 +126,8 @@ struct priv { + unsigned int mps:1; /* Whether multi-packet send is supported. */ + unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */ + unsigned int pending_alarm:1; /* An alarm is pending. */ ++ unsigned int tso:1; /* Whether TSO is supported. */ ++ unsigned int max_tso_payload_sz; /* Maximum TCP payload for TSO. */ + unsigned int txq_inline; /* Maximum packet size for inlining. */ + unsigned int txqs_inline; /* Queue number threshold for inlining. */ + /* RX/TX queues. */ +diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h +index e91d245..eecb908 100644 +--- a/drivers/net/mlx5/mlx5_defs.h ++++ b/drivers/net/mlx5/mlx5_defs.h +@@ -79,4 +79,7 @@ + /* Maximum number of extended statistics counters. */ + #define MLX5_MAX_XSTATS 32 + ++/* Maximum Packet headers size (L2+L3+L4) for TSO. 
*/ ++#define MLX5_MAX_TSO_HEADER 128 ++ + #endif /* RTE_PMD_MLX5_DEFS_H_ */ +diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c +index 5677f03..5542193 100644 +--- a/drivers/net/mlx5/mlx5_ethdev.c ++++ b/drivers/net/mlx5/mlx5_ethdev.c +@@ -693,6 +693,8 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) + (DEV_TX_OFFLOAD_IPV4_CKSUM | + DEV_TX_OFFLOAD_UDP_CKSUM | + DEV_TX_OFFLOAD_TCP_CKSUM); ++ if (priv->tso) ++ info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO; + if (priv_get_ifname(priv, &ifname) == 0) + info->if_index = if_nametoindex(ifname); + /* FIXME: RETA update/query API expects the callee to know the size of +diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c +index 4d5455b..98889f6 100644 +--- a/drivers/net/mlx5/mlx5_rxtx.c ++++ b/drivers/net/mlx5/mlx5_rxtx.c +@@ -365,6 +365,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) + const unsigned int elts_n = 1 << txq->elts_n; + unsigned int i = 0; + unsigned int j = 0; ++ unsigned int k = 0; + unsigned int max; + uint16_t max_wqe; + unsigned int comp; +@@ -392,8 +393,10 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) + uintptr_t addr; + uint64_t naddr; + uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2; ++ uint16_t tso_header_sz = 0; + uint16_t ehdr; + uint8_t cs_flags = 0; ++ uint64_t tso = 0; + #ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t total_length = 0; + #endif +@@ -465,14 +468,74 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) + length -= pkt_inline_sz; + addr += pkt_inline_sz; + } ++ if (txq->tso_en) { ++ tso = buf->ol_flags & PKT_TX_TCP_SEG; ++ if (tso) { ++ uintptr_t end = (uintptr_t) ++ (((uintptr_t)txq->wqes) + ++ (1 << txq->wqe_n) * ++ MLX5_WQE_SIZE); ++ unsigned int copy_b; ++ uint8_t vlan_sz = (buf->ol_flags & ++ PKT_TX_VLAN_PKT) ? 
4 : 0; ++ ++ tso_header_sz = buf->l2_len + vlan_sz + ++ buf->l3_len + buf->l4_len; ++ ++ if (unlikely(tso_header_sz > ++ MLX5_MAX_TSO_HEADER)) ++ break; ++ copy_b = tso_header_sz - pkt_inline_sz; ++ /* First seg must contain all headers. */ ++ assert(copy_b <= length); ++ raw += MLX5_WQE_DWORD_SIZE; ++ if (copy_b && ++ ((end - (uintptr_t)raw) > copy_b)) { ++ uint16_t n = (MLX5_WQE_DS(copy_b) - ++ 1 + 3) / 4; ++ ++ if (unlikely(max_wqe < n)) ++ break; ++ max_wqe -= n; ++ rte_memcpy((void *)raw, ++ (void *)addr, copy_b); ++ addr += copy_b; ++ length -= copy_b; ++ pkt_inline_sz += copy_b; ++ /* ++ * Another DWORD will be added ++ * in the inline part. ++ */ ++ raw += MLX5_WQE_DS(copy_b) * ++ MLX5_WQE_DWORD_SIZE - ++ MLX5_WQE_DWORD_SIZE; ++ } else { ++ /* NOP WQE. */ ++ wqe->ctrl = (rte_v128u32_t){ ++ htonl(txq->wqe_ci << 8), ++ htonl(txq->qp_num_8s | 1), ++ 0, ++ 0, ++ }; ++ ds = 1; ++ total_length = 0; ++ pkts--; ++ pkts_n++; ++ elts_head = (elts_head - 1) & ++ (elts_n - 1); ++ k++; ++ goto next_wqe; ++ } ++ } ++ } + /* Inline if enough room. */ +- if (txq->max_inline) { ++ if (txq->inline_en || tso) { + uintptr_t end = (uintptr_t) + (((uintptr_t)txq->wqes) + + (1 << txq->wqe_n) * MLX5_WQE_SIZE); + unsigned int max_inline = txq->max_inline * + RTE_CACHE_LINE_SIZE - +- MLX5_WQE_DWORD_SIZE; ++ (pkt_inline_sz - 2); + uintptr_t addr_end = (addr + max_inline) & + ~(RTE_CACHE_LINE_SIZE - 1); + unsigned int copy_b = (addr_end > addr) ? 
+@@ -491,6 +554,18 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) + if (unlikely(max_wqe < n)) + break; + max_wqe -= n; ++ if (tso) { ++ uint32_t inl = ++ htonl(copy_b | MLX5_INLINE_SEG); ++ ++ pkt_inline_sz = ++ MLX5_WQE_DS(tso_header_sz) * ++ MLX5_WQE_DWORD_SIZE; ++ rte_memcpy((void *)raw, ++ (void *)&inl, sizeof(inl)); ++ raw += sizeof(inl); ++ pkt_inline_sz += sizeof(inl); ++ } + rte_memcpy((void *)raw, (void *)addr, copy_b); + addr += copy_b; + length -= copy_b; +@@ -591,18 +666,34 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) + next_pkt: + ++i; + /* Initialize known and common part of the WQE structure. */ +- wqe->ctrl = (rte_v128u32_t){ +- htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND), +- htonl(txq->qp_num_8s | ds), +- 0, +- 0, +- }; +- wqe->eseg = (rte_v128u32_t){ +- 0, +- cs_flags, +- 0, +- (ehdr << 16) | htons(pkt_inline_sz), +- }; ++ if (tso) { ++ wqe->ctrl = (rte_v128u32_t){ ++ htonl((txq->wqe_ci << 8) | MLX5_OPCODE_TSO), ++ htonl(txq->qp_num_8s | ds), ++ 0, ++ 0, ++ }; ++ wqe->eseg = (rte_v128u32_t){ ++ 0, ++ cs_flags | (htons(buf->tso_segsz) << 16), ++ 0, ++ (ehdr << 16) | htons(tso_header_sz), ++ }; ++ } else { ++ wqe->ctrl = (rte_v128u32_t){ ++ htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND), ++ htonl(txq->qp_num_8s | ds), ++ 0, ++ 0, ++ }; ++ wqe->eseg = (rte_v128u32_t){ ++ 0, ++ cs_flags, ++ 0, ++ (ehdr << 16) | htons(pkt_inline_sz), ++ }; ++ } ++next_wqe: + txq->wqe_ci += (ds + 3) / 4; + #ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent bytes counter. */ +@@ -610,10 +701,10 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) + #endif + } while (pkts_n); + /* Take a shortcut if nothing must be sent. */ +- if (unlikely(i == 0)) ++ if (unlikely((i + k) == 0)) + return 0; + /* Check whether completion threshold has been reached. 
*/ +- comp = txq->elts_comp + i + j; ++ comp = txq->elts_comp + i + j + k; + if (comp >= MLX5_TX_COMP_THRESH) { + volatile struct mlx5_wqe_ctrl *w = + (volatile struct mlx5_wqe_ctrl *)wqe; +diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h +index 41a34d7..6b328cf 100644 +--- a/drivers/net/mlx5/mlx5_rxtx.h ++++ b/drivers/net/mlx5/mlx5_rxtx.h +@@ -254,6 +254,8 @@ struct txq { + uint16_t cqe_n:4; /* Number of CQ elements (in log2). */ + uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */ + uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */ ++ uint16_t inline_en:1; /* When set inline is enabled. */ ++ uint16_t tso_en:1; /* When set hardware TSO is enabled. */ + uint32_t qp_num_8s; /* QP number shifted by 8. */ + volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */ + volatile void *wqes; /* Work queue (use volatile to write into). */ +diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c +index 949035b..995b763 100644 +--- a/drivers/net/mlx5/mlx5_txq.c ++++ b/drivers/net/mlx5/mlx5_txq.c +@@ -342,6 +342,19 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, + RTE_CACHE_LINE_SIZE); + attr.init.cap.max_inline_data = + tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE; ++ tmpl.txq.inline_en = 1; ++ } ++ if (priv->tso) { ++ uint16_t max_tso_inline = ((MLX5_MAX_TSO_HEADER + ++ (RTE_CACHE_LINE_SIZE - 1)) / ++ RTE_CACHE_LINE_SIZE); ++ ++ attr.init.max_tso_header = ++ max_tso_inline * RTE_CACHE_LINE_SIZE; ++ attr.init.comp_mask |= IBV_EXP_QP_INIT_ATTR_MAX_TSO_HEADER; ++ tmpl.txq.max_inline = RTE_MAX(tmpl.txq.max_inline, ++ max_tso_inline); ++ tmpl.txq.tso_en = 1; + } + tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init); + if (tmpl.qp == NULL) { +-- +2.7.4 + diff --git a/dpdk/dpdk-17.02_patches/0007-add-hardware-checksum-offload-for-tunnel-pa.patch b/dpdk/dpdk-17.02_patches/0007-add-hardware-checksum-offload-for-tunnel-pa.patch new file mode 100644 index 00000000..bbcce486 --- /dev/null +++ 
b/dpdk/dpdk-17.02_patches/0007-add-hardware-checksum-offload-for-tunnel-pa.patch @@ -0,0 +1,194 @@ +From f0dda2ab16635894b1e3836d0b960b9270a3b491 Mon Sep 17 00:00:00 2001 +From: Shahaf Shuler +Date: Thu, 2 Mar 2017 11:05:44 +0200 +Subject: [PATCH] net/mlx5: add hardware checksum offload for tunnel packets + +Prior to this commit Tx checksum offload was supported only for the +inner headers. +This commit adds support for the hardware to compute the checksum for the +outer headers as well. + +The support is for tunneling protocols GRE and VXLAN. + +Signed-off-by: Shahaf Shuler +Acked-by: Nelio Laranjeiro +--- + doc/guides/nics/features/mlx5.ini | 2 ++ + doc/guides/nics/mlx5.rst | 3 ++- + drivers/net/mlx5/mlx5.c | 7 +++++++ + drivers/net/mlx5/mlx5.h | 2 ++ + drivers/net/mlx5/mlx5_ethdev.c | 2 ++ + drivers/net/mlx5/mlx5_prm.h | 6 ++++++ + drivers/net/mlx5/mlx5_rxtx.c | 14 +++++++++++++- + drivers/net/mlx5/mlx5_rxtx.h | 2 ++ + drivers/net/mlx5/mlx5_txq.c | 2 ++ + 9 files changed, 38 insertions(+), 2 deletions(-) + +diff --git a/doc/guides/nics/features/mlx5.ini b/doc/guides/nics/features/mlx5.ini +index 8df25ce..1814f82 100644 +--- a/doc/guides/nics/features/mlx5.ini ++++ b/doc/guides/nics/features/mlx5.ini +@@ -27,6 +27,8 @@ CRC offload = Y + VLAN offload = Y + L3 checksum offload = Y + L4 checksum offload = Y ++Inner L3 checksum = Y ++Inner L4 checksum = Y + Packet type parsing = Y + Basic stats = Y + Stats per queue = Y +diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst +index 9b0ba29..41f3a47 100644 +--- a/doc/guides/nics/mlx5.rst ++++ b/doc/guides/nics/mlx5.rst +@@ -91,13 +91,14 @@ Features + - KVM and VMware ESX SR-IOV modes are supported. + - RSS hash result is supported. + - Hardware TSO. ++- Hardware checksum TX offload for VXLAN and GRE. + + Limitations + ----------- + + - Inner RSS for VXLAN frames is not supported yet. + - Port statistics through software counters only. 
+-- Hardware checksum offloads for VXLAN inner header are not supported yet. ++- Hardware checksum RX offloads for VXLAN inner header are not supported yet. + - Secondary process RX is not supported. + + Configuration +diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c +index 03ed3b3..6f42948 100644 +--- a/drivers/net/mlx5/mlx5.c ++++ b/drivers/net/mlx5/mlx5.c +@@ -375,6 +375,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) + struct ibv_device_attr device_attr; + unsigned int sriov; + unsigned int mps; ++ unsigned int tunnel_en; + int idx; + int i; + +@@ -429,12 +430,17 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) + * as all ConnectX-5 devices. + */ + switch (pci_dev->id.device_id) { ++ case PCI_DEVICE_ID_MELLANOX_CONNECTX4: ++ tunnel_en = 1; ++ mps = 0; ++ break; + case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: + mps = 1; ++ tunnel_en = 1; + break; + default: + mps = 0; +@@ -539,6 +545,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) + priv->mtu = ETHER_MTU; + priv->mps = mps; /* Enable MPW by default if supported. */ + priv->cqe_comp = 1; /* Enable compression by default. */ ++ priv->tunnel_en = tunnel_en; + err = mlx5_args(priv, pci_dev->device.devargs); + if (err) { + ERROR("failed to process device arguments: %s", +diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h +index 93f129b..870e01f 100644 +--- a/drivers/net/mlx5/mlx5.h ++++ b/drivers/net/mlx5/mlx5.h +@@ -127,6 +127,8 @@ struct priv { + unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */ + unsigned int pending_alarm:1; /* An alarm is pending. */ + unsigned int tso:1; /* Whether TSO is supported. */ ++ unsigned int tunnel_en:1; ++ /* Whether Tx offloads for tunneled packets are supported. 
*/ + unsigned int max_tso_payload_sz; /* Maximum TCP payload for TSO. */ + unsigned int txq_inline; /* Maximum packet size for inlining. */ + unsigned int txqs_inline; /* Queue number threshold for inlining. */ +diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c +index 5542193..8be9e77 100644 +--- a/drivers/net/mlx5/mlx5_ethdev.c ++++ b/drivers/net/mlx5/mlx5_ethdev.c +@@ -695,6 +695,8 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) + DEV_TX_OFFLOAD_TCP_CKSUM); + if (priv->tso) + info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO; ++ if (priv->tunnel_en) ++ info->tx_offload_capa |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM; + if (priv_get_ifname(priv, &ifname) == 0) + info->if_index = if_nametoindex(ifname); + /* FIXME: RETA update/query API expects the callee to know the size of +diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h +index 3318668..0a77f5b 100644 +--- a/drivers/net/mlx5/mlx5_prm.h ++++ b/drivers/net/mlx5/mlx5_prm.h +@@ -120,6 +120,12 @@ + /* Tunnel packet bit in the CQE. */ + #define MLX5_CQE_RX_TUNNEL_PACKET (1u << 0) + ++/* Inner L3 checksum offload (Tunneled packets only). */ ++#define MLX5_ETH_WQE_L3_INNER_CSUM (1u << 4) ++ ++/* Inner L4 checksum offload (Tunneled packets only). */ ++#define MLX5_ETH_WQE_L4_INNER_CSUM (1u << 5) ++ + /* INVALID is used by packets matching no flow rules. 
*/ + #define MLX5_FLOW_MARK_INVALID 0 + +diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c +index 98889f6..c2eb891 100644 +--- a/drivers/net/mlx5/mlx5_rxtx.c ++++ b/drivers/net/mlx5/mlx5_rxtx.c +@@ -443,7 +443,19 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) + /* Should we enable HW CKSUM offload */ + if (buf->ol_flags & + (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { +- cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; ++ const uint64_t is_tunneled = buf->ol_flags & ++ (PKT_TX_TUNNEL_GRE | ++ PKT_TX_TUNNEL_VXLAN); ++ ++ if (is_tunneled && txq->tunnel_en) { ++ cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM | ++ MLX5_ETH_WQE_L4_INNER_CSUM; ++ if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM) ++ cs_flags |= MLX5_ETH_WQE_L3_CSUM; ++ } else { ++ cs_flags = MLX5_ETH_WQE_L3_CSUM | ++ MLX5_ETH_WQE_L4_CSUM; ++ } + } + raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE; + /* Replace the Ethernet type by the VLAN if necessary. */ +diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h +index 6b328cf..9669564 100644 +--- a/drivers/net/mlx5/mlx5_rxtx.h ++++ b/drivers/net/mlx5/mlx5_rxtx.h +@@ -256,6 +256,8 @@ struct txq { + uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */ + uint16_t inline_en:1; /* When set inline is enabled. */ + uint16_t tso_en:1; /* When set hardware TSO is enabled. */ ++ uint16_t tunnel_en:1; ++ /* When set TX offload for tunneled packets are supported. */ + uint32_t qp_num_8s; /* QP number shifted by 8. */ + volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */ + volatile void *wqes; /* Work queue (use volatile to write into). 
*/ +diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c +index 995b763..9d0c00f 100644 +--- a/drivers/net/mlx5/mlx5_txq.c ++++ b/drivers/net/mlx5/mlx5_txq.c +@@ -356,6 +356,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, + max_tso_inline); + tmpl.txq.tso_en = 1; + } ++ if (priv->tunnel_en) ++ tmpl.txq.tunnel_en = 1; + tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init); + if (tmpl.qp == NULL) { + ret = (errno ? errno : EINVAL); +-- +2.7.4 + diff --git a/dpdk/dpdk-17.02_patches/0008-net-mlx5-add-enhanced-multi-packet-send-for-ConnectX.patch b/dpdk/dpdk-17.02_patches/0008-net-mlx5-add-enhanced-multi-packet-send-for-ConnectX.patch new file mode 100644 index 00000000..6ff076c7 --- /dev/null +++ b/dpdk/dpdk-17.02_patches/0008-net-mlx5-add-enhanced-multi-packet-send-for-ConnectX.patch @@ -0,0 +1,809 @@ +From 7ca5c8de65acabe4cb60960adcfa9247efdd2a5c Mon Sep 17 00:00:00 2001 +From: Yongseok Koh +Date: Wed, 15 Mar 2017 16:55:44 -0700 +Subject: [PATCH] net/mlx5: add enhanced multi-packet send for ConnectX-5 + +ConnectX-5 supports enhanced version of multi-packet send (MPS). An MPS Tx +descriptor can carry multiple packets either by including pointers of +packets or by inlining packets. Inlining packet data can be helpful to +better utilize PCIe bandwidth. In addition, Enhanced MPS supports hybrid +mode - mixing inlined packets and pointers in a descriptor. This feature is +enabled by default if supported by HW. 
+ +Signed-off-by: Yongseok Koh +--- + doc/guides/nics/mlx5.rst | 31 +++- + drivers/net/mlx5/mlx5.c | 37 +++- + drivers/net/mlx5/mlx5.h | 4 +- + drivers/net/mlx5/mlx5_defs.h | 7 + + drivers/net/mlx5/mlx5_ethdev.c | 6 +- + drivers/net/mlx5/mlx5_prm.h | 20 ++ + drivers/net/mlx5/mlx5_rxtx.c | 410 +++++++++++++++++++++++++++++++++++++++++ + drivers/net/mlx5/mlx5_rxtx.h | 7 +- + drivers/net/mlx5/mlx5_txq.c | 29 ++- + 9 files changed, 534 insertions(+), 17 deletions(-) + +diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst +index 41f3a47..0783aeb 100644 +--- a/doc/guides/nics/mlx5.rst ++++ b/doc/guides/nics/mlx5.rst +@@ -183,10 +183,17 @@ Run-time configuration + + - ``txq_mpw_en`` parameter [int] + +- A nonzero value enables multi-packet send. This feature allows the TX +- burst function to pack up to five packets in two descriptors in order to +- save PCI bandwidth and improve performance at the cost of a slightly +- higher CPU usage. ++ A nonzero value enables multi-packet send (MPS) for ConnectX-4 Lx and ++ enhanced multi-packet send (Enhanced MPS) for ConnectX-5. MPS allows the ++ TX burst function to pack up multiple packets in a single descriptor ++ session in order to save PCI bandwidth and improve performance at the ++ cost of a slightly higher CPU usage. When ``txq_inline`` is set along ++ with ``txq_mpw_en``, TX burst function tries to copy entire packet data ++ on to TX descriptor instead of including pointer of packet only if there ++ is enough room remained in the descriptor. ``txq_inline`` sets ++ per-descriptor space for either pointers or inlined packets. In addition, ++ Enhanced MPS supports hybrid mode - mixing inlined packets and pointers ++ in the same descriptor. + + This option cannot be used in conjunction with ``tso`` below. When ``tso`` + is set, ``txq_mpw_en`` is disabled. +@@ -194,6 +201,22 @@ Run-time configuration + It is currently only supported on the ConnectX-4 Lx and ConnectX-5 + families of adapters. Enabled by default. 
+ ++- ``txq_mpw_hdr_dseg_en`` parameter [int] ++ ++ A nonzero value enables including two pointers in the first block of TX ++ descriptor. This can be used to lessen CPU load for memory copy. ++ ++ Effective only when Enhanced MPS is supported. Disabled by default. ++ ++- ``txq_max_inline_len`` parameter [int] ++ ++ Maximum size of packet to be inlined. This limits the size of packet to ++ be inlined. If the size of a packet is larger than configured value, the ++ packet isn't inlined even though there's enough space remained in the ++ descriptor. Instead, the packet is included with pointer. ++ ++ Effective only when Enhanced MPS is supported. The default value is 256. ++ + - ``tso`` parameter [int] + + A nonzero value enables hardware TSO. +diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c +index ebc7984..bc6a34f 100644 +--- a/drivers/net/mlx5/mlx5.c ++++ b/drivers/net/mlx5/mlx5.c +@@ -84,6 +84,12 @@ + /* Device parameter to enable multi-packet send WQEs. */ + #define MLX5_TXQ_MPW_EN "txq_mpw_en" + ++/* Device parameter to include 2 dsegs in the title WQEBB. */ ++#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en" ++ ++/* Device parameter to limit the size of inlining packet. */ ++#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len" ++ + /* Device parameter to enable hardware TSO offload. */ + #define MLX5_TSO "tso" + +@@ -294,7 +300,11 @@ mlx5_args_check(const char *key, const char *val, void *opaque) + } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { + priv->txqs_inline = tmp; + } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { +- priv->mps &= !!tmp; /* Enable MPW only if HW supports */ ++ priv->mps = !!tmp ? 
priv->mps : MLX5_MPW_DISABLED; ++ } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) { ++ priv->mpw_hdr_dseg = !!tmp; ++ } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) { ++ priv->inline_max_packet_sz = tmp; + } else if (strcmp(MLX5_TSO, key) == 0) { + priv->tso = !!tmp; + } else { +@@ -323,6 +333,8 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs) + MLX5_TXQ_INLINE, + MLX5_TXQS_MIN_INLINE, + MLX5_TXQ_MPW_EN, ++ MLX5_TXQ_MPW_HDR_DSEG_EN, ++ MLX5_TXQ_MAX_INLINE_LEN, + MLX5_TSO, + NULL, + }; +@@ -434,24 +446,27 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) + switch (pci_dev->id.device_id) { + case PCI_DEVICE_ID_MELLANOX_CONNECTX4: + tunnel_en = 1; +- mps = 0; ++ mps = MLX5_MPW_DISABLED; + break; + case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX: ++ mps = MLX5_MPW; ++ break; + case PCI_DEVICE_ID_MELLANOX_CONNECTX5: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: +- mps = 1; + tunnel_en = 1; ++ mps = MLX5_MPW_ENHANCED; + break; + default: +- mps = 0; ++ mps = MLX5_MPW_DISABLED; + } + INFO("PCI information matches, using device \"%s\"" +- " (SR-IOV: %s, MPS: %s)", ++ " (SR-IOV: %s, %sMPS: %s)", + list[i]->name, + sriov ? "true" : "false", +- mps ? "true" : "false"); ++ mps == MLX5_MPW_ENHANCED ? "Enhanced " : "", ++ mps != MLX5_MPW_DISABLED ? "true" : "false"); + attr_ctx = ibv_open_device(list[i]); + err = errno; + break; +@@ -546,6 +561,13 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) + priv->pd = pd; + priv->mtu = ETHER_MTU; + priv->mps = mps; /* Enable MPW by default if supported. */ ++ /* Set default values for Enhanced MPW, a.k.a MPWv2. 
*/ ++ if (mps == MLX5_MPW_ENHANCED) { ++ priv->mpw_hdr_dseg = 0; ++ priv->txqs_inline = MLX5_EMPW_MIN_TXQS; ++ priv->inline_max_packet_sz = MLX5_EMPW_MAX_INLINE_LEN; ++ priv->txq_inline = MLX5_WQE_SIZE_MAX - MLX5_WQE_SIZE; ++ } + priv->cqe_comp = 1; /* Enable compression by default. */ + priv->tunnel_en = tunnel_en; + err = mlx5_args(priv, pci_dev->device.devargs); +@@ -613,6 +635,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) + "with TSO. MPS disabled"); + priv->mps = 0; + } ++ INFO("%sMPS is %s", ++ priv->mps == MLX5_MPW_ENHANCED ? "Enhanced " : "", ++ priv->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); + /* Allocate and register default RSS hash keys. */ + priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n, + sizeof((*priv->rss_conf)[0]), 0); +diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h +index 870e01f..d26d465 100644 +--- a/drivers/net/mlx5/mlx5.h ++++ b/drivers/net/mlx5/mlx5.h +@@ -123,7 +123,8 @@ struct priv { + unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */ + unsigned int hw_padding:1; /* End alignment padding is supported. */ + unsigned int sriov:1; /* This is a VF or PF with VF devices. */ +- unsigned int mps:1; /* Whether multi-packet send is supported. */ ++ unsigned int mps:2; /* Multi-packet send mode (0: disabled). */ ++ unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */ + unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */ + unsigned int pending_alarm:1; /* An alarm is pending. */ + unsigned int tso:1; /* Whether TSO is supported. */ +@@ -132,6 +133,7 @@ struct priv { + unsigned int max_tso_payload_sz; /* Maximum TCP payload for TSO. */ + unsigned int txq_inline; /* Maximum packet size for inlining. */ + unsigned int txqs_inline; /* Queue number threshold for inlining. */ ++ unsigned int inline_max_packet_sz; /* Max packet size for inlining. */ + /* RX/TX queues. */ + unsigned int rxqs_n; /* RX queues array size. 
*/ + unsigned int txqs_n; /* TX queues array size. */ +diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h +index eecb908..201bb33 100644 +--- a/drivers/net/mlx5/mlx5_defs.h ++++ b/drivers/net/mlx5/mlx5_defs.h +@@ -55,6 +55,13 @@ + #define MLX5_TX_COMP_THRESH 32 + + /* ++ * Request TX completion every time the total number of WQEBBs used for inlining ++ * packets exceeds the size of WQ divided by this divisor. Better to be power of ++ * two for performance. ++ */ ++#define MLX5_TX_COMP_THRESH_INLINE_DIV (1 << 3) ++ ++/* + * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP + * from which buffers are to be transmitted will have to be mapped by this + * driver to their own Memory Region (MR). This is a slow operation. +diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c +index 5deb6e8..dd5fe5c 100644 +--- a/drivers/net/mlx5/mlx5_ethdev.c ++++ b/drivers/net/mlx5/mlx5_ethdev.c +@@ -1590,7 +1590,11 @@ priv_select_tx_function(struct priv *priv) + { + priv->dev->tx_pkt_burst = mlx5_tx_burst; + /* Select appropriate TX function. */ +- if (priv->mps && priv->txq_inline) { ++ if (priv->mps == MLX5_MPW_ENHANCED) { ++ priv->dev->tx_pkt_burst = ++ mlx5_tx_burst_empw; ++ DEBUG("selected Enhanced MPW TX function"); ++ } else if (priv->mps && priv->txq_inline) { + priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline; + DEBUG("selected MPW inline TX function"); + } else if (priv->mps) { +diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h +index 0a77f5b..155bdba 100644 +--- a/drivers/net/mlx5/mlx5_prm.h ++++ b/drivers/net/mlx5/mlx5_prm.h +@@ -73,6 +73,9 @@ + /* WQE size */ + #define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE) + ++/* Max size of a WQE session. */ ++#define MLX5_WQE_SIZE_MAX 960U ++ + /* Compute the number of DS. */ + #define MLX5_WQE_DS(n) \ + (((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE) +@@ -80,10 +83,19 @@ + /* Room for inline data in multi-packet WQE. 
*/ + #define MLX5_MWQE64_INL_DATA 28 + ++/* Default minimum number of Tx queues for inlining packets. */ ++#define MLX5_EMPW_MIN_TXQS 8 ++ ++/* Default max packet length to be inlined. */ ++#define MLX5_EMPW_MAX_INLINE_LEN (4U * MLX5_WQE_SIZE) ++ + #ifndef HAVE_VERBS_MLX5_OPCODE_TSO + #define MLX5_OPCODE_TSO MLX5_OPCODE_LSO_MPW /* Compat with OFED 3.3. */ + #endif + ++#define MLX5_OPC_MOD_ENHANCED_MPSW 0 ++#define MLX5_OPCODE_ENHANCED_MPSW 0x29 ++ + /* CQE value to inform that VLAN is stripped. */ + #define MLX5_CQE_VLAN_STRIPPED (1u << 0) + +@@ -176,10 +188,18 @@ struct mlx5_wqe64 { + uint8_t raw[32]; + } __rte_aligned(MLX5_WQE_SIZE); + ++/* MPW mode. */ ++enum mlx5_mpw_mode { ++ MLX5_MPW_DISABLED, ++ MLX5_MPW, ++ MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a MPWv2. */ ++}; ++ + /* MPW session status. */ + enum mlx5_mpw_state { + MLX5_MPW_STATE_OPENED, + MLX5_MPW_INL_STATE_OPENED, ++ MLX5_MPW_ENHANCED_STATE_OPENED, + MLX5_MPW_STATE_CLOSED, + }; + +diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c +index 9fc433e..a1dd84a 100644 +--- a/drivers/net/mlx5/mlx5_rxtx.c ++++ b/drivers/net/mlx5/mlx5_rxtx.c +@@ -195,6 +195,62 @@ tx_mlx5_wqe(struct txq *txq, uint16_t ci) + } + + /** ++ * Return the size of tailroom of WQ. ++ * ++ * @param txq ++ * Pointer to TX queue structure. ++ * @param addr ++ * Pointer to tail of WQ. ++ * ++ * @return ++ * Size of tailroom. ++ */ ++static inline size_t ++tx_mlx5_wq_tailroom(struct txq *txq, void *addr) ++{ ++ size_t tailroom; ++ tailroom = (uintptr_t)(txq->wqes) + ++ (1 << txq->wqe_n) * MLX5_WQE_SIZE - ++ (uintptr_t)addr; ++ return tailroom; ++} ++ ++/** ++ * Copy data to tailroom of circular queue. ++ * ++ * @param dst ++ * Pointer to destination. ++ * @param src ++ * Pointer to source. ++ * @param n ++ * Number of bytes to copy. ++ * @param base ++ * Pointer to head of queue. ++ * @param tailroom ++ * Size of tailroom from dst. ++ * ++ * @return ++ * Pointer after copied data. 
++ */ ++static inline void * ++mlx5_copy_to_wq(void *dst, const void *src, size_t n, ++ void *base, size_t tailroom) ++{ ++ void *ret; ++ ++ if (n > tailroom) { ++ rte_memcpy(dst, src, tailroom); ++ rte_memcpy(base, (void *)((uintptr_t)src + tailroom), ++ n - tailroom); ++ ret = (uint8_t *)base + n - tailroom; ++ } else { ++ rte_memcpy(dst, src, n); ++ ret = (n == tailroom) ? base : (uint8_t *)dst + n; ++ } ++ return ret; ++} ++ ++/** + * Manage TX completions. + * + * When sending a burst, mlx5_tx_burst() posts several WRs. +@@ -1269,6 +1325,360 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, + } + + /** ++ * Open an Enhanced MPW session. ++ * ++ * @param txq ++ * Pointer to TX queue structure. ++ * @param mpw ++ * Pointer to MPW session structure. ++ * @param length ++ * Packet length. ++ */ ++static inline void ++mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding) ++{ ++ uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); ++ ++ mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED; ++ mpw->pkts_n = 0; ++ mpw->total_len = sizeof(struct mlx5_wqe); ++ mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); ++ mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_ENHANCED_MPSW << 24) | ++ (txq->wqe_ci << 8) | ++ MLX5_OPCODE_ENHANCED_MPSW); ++ mpw->wqe->ctrl[2] = 0; ++ mpw->wqe->ctrl[3] = 0; ++ memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE); ++ if (unlikely(padding)) { ++ uintptr_t addr = (uintptr_t)(mpw->wqe + 1); ++ ++ /* Pad the first 2 DWORDs with zero-length inline header. */ ++ *(volatile uint32_t *)addr = htonl(MLX5_INLINE_SEG); ++ *(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) = ++ htonl(MLX5_INLINE_SEG); ++ mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE; ++ /* Start from the next WQEBB. */ ++ mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1)); ++ } else { ++ mpw->data.raw = (volatile void *)(mpw->wqe + 1); ++ } ++} ++ ++/** ++ * Close an Enhanced MPW session. 
++ * ++ * @param txq ++ * Pointer to TX queue structure. ++ * @param mpw ++ * Pointer to MPW session structure. ++ * ++ * @return ++ * Number of consumed WQEs. ++ */ ++static inline uint16_t ++mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw) ++{ ++ uint16_t ret; ++ ++ /* Store size in multiple of 16 bytes. Control and Ethernet segments ++ * count as 2. ++ */ ++ mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(mpw->total_len)); ++ mpw->state = MLX5_MPW_STATE_CLOSED; ++ ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE; ++ txq->wqe_ci += ret; ++ return ret; ++} ++ ++/** ++ * DPDK callback for TX with Enhanced MPW support. ++ * ++ * @param dpdk_txq ++ * Generic pointer to TX queue structure. ++ * @param[in] pkts ++ * Packets to transmit. ++ * @param pkts_n ++ * Number of packets in array. ++ * ++ * @return ++ * Number of packets successfully transmitted (<= pkts_n). ++ */ ++uint16_t ++mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) ++{ ++ struct txq *txq = (struct txq *)dpdk_txq; ++ uint16_t elts_head = txq->elts_head; ++ const unsigned int elts_n = 1 << txq->elts_n; ++ unsigned int i = 0; ++ unsigned int j = 0; ++ unsigned int max_elts; ++ uint16_t max_wqe; ++ unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE; ++ unsigned int mpw_room = 0; ++ unsigned int inl_pad = 0; ++ uint32_t inl_hdr; ++ struct mlx5_mpw mpw = { ++ .state = MLX5_MPW_STATE_CLOSED, ++ }; ++ ++ if (unlikely(!pkts_n)) ++ return 0; ++ /* Start processing. */ ++ txq_complete(txq); ++ max_elts = (elts_n - (elts_head - txq->elts_tail)); ++ if (max_elts > elts_n) ++ max_elts -= elts_n; ++ /* A CQE slot must always be available. 
*/ ++ assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); ++ max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); ++ if (unlikely(!max_wqe)) ++ return 0; ++ do { ++ struct rte_mbuf *buf = *(pkts++); ++ unsigned int elts_head_next; ++ uintptr_t addr; ++ uint64_t naddr; ++ unsigned int n; ++ unsigned int do_inline = 0; /* Whether inline is possible. */ ++ uint32_t length; ++ unsigned int segs_n = buf->nb_segs; ++ uint32_t cs_flags = 0; ++ ++ /* ++ * Make sure there is enough room to store this packet and ++ * that one ring entry remains unused. ++ */ ++ assert(segs_n); ++ if (max_elts - j < segs_n + 1) ++ break; ++ /* Do not bother with large packets MPW cannot handle. */ ++ if (segs_n > MLX5_MPW_DSEG_MAX) ++ break; ++ /* Should we enable HW CKSUM offload. */ ++ if (buf->ol_flags & ++ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) ++ cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; ++ /* Retrieve packet information. */ ++ length = PKT_LEN(buf); ++ /* Start new session if: ++ * - multi-segment packet ++ * - no space left even for a dseg ++ * - next packet can be inlined with a new WQE ++ * - cs_flag differs ++ * It can't be MLX5_MPW_STATE_OPENED as always have a single ++ * segmented packet. ++ */ ++ if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) { ++ if ((segs_n != 1) || ++ (inl_pad + sizeof(struct mlx5_wqe_data_seg) > ++ mpw_room) || ++ (length <= txq->inline_max_packet_sz && ++ inl_pad + sizeof(inl_hdr) + length > ++ mpw_room) || ++ (mpw.wqe->eseg.cs_flags != cs_flags)) ++ max_wqe -= mlx5_empw_close(txq, &mpw); ++ } ++ if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) { ++ if (unlikely(segs_n != 1)) { ++ /* Fall back to legacy MPW. ++ * A MPW session consumes 2 WQEs at most to ++ * include MLX5_MPW_DSEG_MAX pointers. ++ */ ++ if (unlikely(max_wqe < 2)) ++ break; ++ mlx5_mpw_new(txq, &mpw, length); ++ } else { ++ /* In Enhanced MPW, inline as much as the budget ++ * is allowed. The remaining space is to be ++ * filled with dsegs. 
If the title WQEBB isn't ++ * padded, it will have 2 dsegs there. ++ */ ++ mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX, ++ (max_inline ? max_inline : ++ pkts_n * MLX5_WQE_DWORD_SIZE) + ++ MLX5_WQE_SIZE); ++ if (unlikely(max_wqe * MLX5_WQE_SIZE < ++ mpw_room)) ++ break; ++ /* Don't pad the title WQEBB to not waste WQ. */ ++ mlx5_empw_new(txq, &mpw, 0); ++ mpw_room -= mpw.total_len; ++ inl_pad = 0; ++ do_inline = ++ length <= txq->inline_max_packet_sz && ++ sizeof(inl_hdr) + length <= mpw_room && ++ !txq->mpw_hdr_dseg; ++ } ++ mpw.wqe->eseg.cs_flags = cs_flags; ++ } else { ++ /* Evaluate whether the next packet can be inlined. ++ * Inlininig is possible when: ++ * - length is less than configured value ++ * - length fits for remaining space ++ * - not required to fill the title WQEBB with dsegs ++ */ ++ do_inline = ++ length <= txq->inline_max_packet_sz && ++ inl_pad + sizeof(inl_hdr) + length <= ++ mpw_room && ++ (!txq->mpw_hdr_dseg || ++ mpw.total_len >= MLX5_WQE_SIZE); ++ } ++ /* Multi-segment packets must be alone in their MPW. */ ++ assert((segs_n == 1) || (mpw.pkts_n == 0)); ++ if (unlikely(mpw.state == MLX5_MPW_STATE_OPENED)) { ++#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) ++ length = 0; ++#endif ++ do { ++ volatile struct mlx5_wqe_data_seg *dseg; ++ ++ elts_head_next = ++ (elts_head + 1) & (elts_n - 1); ++ assert(buf); ++ (*txq->elts)[elts_head] = buf; ++ dseg = mpw.data.dseg[mpw.pkts_n]; ++ addr = rte_pktmbuf_mtod(buf, uintptr_t); ++ *dseg = (struct mlx5_wqe_data_seg){ ++ .byte_count = htonl(DATA_LEN(buf)), ++ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), ++ .addr = htonll(addr), ++ }; ++ elts_head = elts_head_next; ++#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) ++ length += DATA_LEN(buf); ++#endif ++ buf = buf->next; ++ ++j; ++ ++mpw.pkts_n; ++ } while (--segs_n); ++ /* A multi-segmented packet takes one MPW session. ++ * TODO: Pack more multi-segmented packets if possible. 
++ */ ++ mlx5_mpw_close(txq, &mpw); ++ if (mpw.pkts_n < 3) ++ max_wqe--; ++ else ++ max_wqe -= 2; ++ } else if (do_inline) { ++ /* Inline packet into WQE. */ ++ unsigned int max; ++ ++ assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED); ++ assert(length == DATA_LEN(buf)); ++ inl_hdr = htonl(length | MLX5_INLINE_SEG); ++ addr = rte_pktmbuf_mtod(buf, uintptr_t); ++ mpw.data.raw = (volatile void *) ++ ((uintptr_t)mpw.data.raw + inl_pad); ++ max = tx_mlx5_wq_tailroom(txq, ++ (void *)(uintptr_t)mpw.data.raw); ++ /* Copy inline header. */ ++ mpw.data.raw = (volatile void *) ++ mlx5_copy_to_wq( ++ (void *)(uintptr_t)mpw.data.raw, ++ &inl_hdr, ++ sizeof(inl_hdr), ++ (void *)(uintptr_t)txq->wqes, ++ max); ++ max = tx_mlx5_wq_tailroom(txq, ++ (void *)(uintptr_t)mpw.data.raw); ++ /* Copy packet data. */ ++ mpw.data.raw = (volatile void *) ++ mlx5_copy_to_wq( ++ (void *)(uintptr_t)mpw.data.raw, ++ (void *)addr, ++ length, ++ (void *)(uintptr_t)txq->wqes, ++ max); ++ ++mpw.pkts_n; ++ mpw.total_len += (inl_pad + sizeof(inl_hdr) + length); ++ /* No need to get completion as the entire packet is ++ * copied to WQ. Free the buf right away. ++ */ ++ elts_head_next = elts_head; ++ rte_pktmbuf_free_seg(buf); ++ mpw_room -= (inl_pad + sizeof(inl_hdr) + length); ++ /* Add pad in the next packet if any. */ ++ inl_pad = (((uintptr_t)mpw.data.raw + ++ (MLX5_WQE_DWORD_SIZE - 1)) & ++ ~(MLX5_WQE_DWORD_SIZE - 1)) - ++ (uintptr_t)mpw.data.raw; ++ } else { ++ /* No inline. Load a dseg of packet pointer. 
*/ ++ volatile rte_v128u32_t *dseg; ++ ++ assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED); ++ assert((inl_pad + sizeof(*dseg)) <= mpw_room); ++ assert(length == DATA_LEN(buf)); ++ if (!tx_mlx5_wq_tailroom(txq, ++ (void *)((uintptr_t)mpw.data.raw ++ + inl_pad))) ++ dseg = (volatile void *)txq->wqes; ++ else ++ dseg = (volatile void *) ++ ((uintptr_t)mpw.data.raw + ++ inl_pad); ++ elts_head_next = (elts_head + 1) & (elts_n - 1); ++ (*txq->elts)[elts_head] = buf; ++ addr = rte_pktmbuf_mtod(buf, uintptr_t); ++ for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++) ++ rte_prefetch2((void *)(addr + ++ n * RTE_CACHE_LINE_SIZE)); ++ naddr = htonll(addr); ++ *dseg = (rte_v128u32_t) { ++ htonl(length), ++ txq_mp2mr(txq, txq_mb2mp(buf)), ++ naddr, ++ naddr >> 32, ++ }; ++ mpw.data.raw = (volatile void *)(dseg + 1); ++ mpw.total_len += (inl_pad + sizeof(*dseg)); ++ ++j; ++ ++mpw.pkts_n; ++ mpw_room -= (inl_pad + sizeof(*dseg)); ++ inl_pad = 0; ++ } ++ elts_head = elts_head_next; ++#ifdef MLX5_PMD_SOFT_COUNTERS ++ /* Increment sent bytes counter. */ ++ txq->stats.obytes += length; ++#endif ++ ++i; ++ } while (i < pkts_n); ++ /* Take a shortcut if nothing must be sent. */ ++ if (unlikely(i == 0)) ++ return 0; ++ /* Check whether completion threshold has been reached. */ ++ if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH || ++ (uint16_t)(txq->wqe_ci - txq->mpw_comp) >= ++ (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) { ++ volatile struct mlx5_wqe *wqe = mpw.wqe; ++ ++ /* Request completion on last WQE. */ ++ wqe->ctrl[2] = htonl(8); ++ /* Save elts_head in unused "immediate" field of WQE. */ ++ wqe->ctrl[3] = elts_head; ++ txq->elts_comp = 0; ++ txq->mpw_comp = txq->wqe_ci; ++ txq->cq_pi++; ++ } else { ++ txq->elts_comp += j; ++ } ++#ifdef MLX5_PMD_SOFT_COUNTERS ++ /* Increment sent packets counter. 
*/ ++ txq->stats.opackets += i; ++#endif ++ if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) ++ mlx5_empw_close(txq, &mpw); ++ else if (mpw.state == MLX5_MPW_STATE_OPENED) ++ mlx5_mpw_close(txq, &mpw); ++ /* Ring QP doorbell. */ ++ mlx5_tx_dbrec(txq, mpw.wqe); ++ txq->elts_head = elts_head; ++ return i; ++} ++ ++/** + * Translate RX completion flags to packet type. + * + * @param[in] cqe +diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h +index 0db810c..4a4bd84 100644 +--- a/drivers/net/mlx5/mlx5_rxtx.h ++++ b/drivers/net/mlx5/mlx5_rxtx.h +@@ -248,17 +248,21 @@ struct txq { + uint16_t elts_head; /* Current index in (*elts)[]. */ + uint16_t elts_tail; /* First element awaiting completion. */ + uint16_t elts_comp; /* Counter since last completion request. */ ++ uint16_t mpw_comp; /* WQ index since last completion request. */ + uint16_t cq_ci; /* Consumer index for completion queue. */ ++ uint16_t cq_pi; /* Producer index for completion queue. */ + uint16_t wqe_ci; /* Consumer index for work queue. */ + uint16_t wqe_pi; /* Producer index for work queue. */ + uint16_t elts_n:4; /* (*elts)[] length (in log2). */ + uint16_t cqe_n:4; /* Number of CQ elements (in log2). */ + uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */ +- uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */ + uint16_t inline_en:1; /* When set inline is enabled. */ + uint16_t tso_en:1; /* When set hardware TSO is enabled. */ + uint16_t tunnel_en:1; + /* When set TX offload for tunneled packets are supported. */ ++ uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */ ++ uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */ ++ uint16_t inline_max_packet_sz; /* Max packet size for inlining. */ + uint32_t qp_num_8s; /* QP number shifted by 8. */ + volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */ + volatile void *wqes; /* Work queue (use volatile to write into). 
*/ +@@ -329,6 +333,7 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t); + uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t); + uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t); + uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t); ++uint16_t mlx5_tx_burst_empw(void *, struct rte_mbuf **, uint16_t); + uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t); + uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t); + uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t); +diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c +index 9d0c00f..bbfce75 100644 +--- a/drivers/net/mlx5/mlx5_txq.c ++++ b/drivers/net/mlx5/mlx5_txq.c +@@ -266,6 +266,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, + struct ibv_exp_cq_attr cq_attr; + } attr; + enum ibv_exp_query_intf_status status; ++ unsigned int cqe_n; + int ret = 0; + + if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) { +@@ -276,6 +277,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, + (void)conf; /* Thresholds configuration (ignored). */ + assert(desc > MLX5_TX_COMP_THRESH); + tmpl.txq.elts_n = log2above(desc); ++ if (priv->mps == MLX5_MPW_ENHANCED) ++ tmpl.txq.mpw_hdr_dseg = priv->mpw_hdr_dseg; + /* MRs will be registered in mp2mr[] later. */ + attr.rd = (struct ibv_exp_res_domain_init_attr){ + .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL | +@@ -294,9 +297,12 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, + .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN, + .res_domain = tmpl.rd, + }; ++ cqe_n = ((desc / MLX5_TX_COMP_THRESH) - 1) ? ++ ((desc / MLX5_TX_COMP_THRESH) - 1) : 1; ++ if (priv->mps == MLX5_MPW_ENHANCED) ++ cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV; + tmpl.cq = ibv_exp_create_cq(priv->ctx, +- (((desc / MLX5_TX_COMP_THRESH) - 1) ? 
+- ((desc / MLX5_TX_COMP_THRESH) - 1) : 1), ++ cqe_n, + NULL, NULL, 0, &attr.cq); + if (tmpl.cq == NULL) { + ret = ENOMEM; +@@ -340,9 +346,24 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, + tmpl.txq.max_inline = + ((priv->txq_inline + (RTE_CACHE_LINE_SIZE - 1)) / + RTE_CACHE_LINE_SIZE); +- attr.init.cap.max_inline_data = +- tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE; + tmpl.txq.inline_en = 1; ++ /* TSO and MPS can't be enabled concurrently. */ ++ assert(!priv->tso || !priv->mps); ++ if (priv->mps == MLX5_MPW_ENHANCED) { ++ tmpl.txq.inline_max_packet_sz = ++ priv->inline_max_packet_sz; ++ /* To minimize the size of data set, avoid requesting ++ * too large WQ. ++ */ ++ attr.init.cap.max_inline_data = ++ ((RTE_MIN(priv->txq_inline, ++ priv->inline_max_packet_sz) + ++ (RTE_CACHE_LINE_SIZE - 1)) / ++ RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE; ++ } else { ++ attr.init.cap.max_inline_data = ++ tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE; ++ } + } + if (priv->tso) { + uint16_t max_tso_inline = ((MLX5_MAX_TSO_HEADER + +-- +2.7.4 + diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 9dc3fcce..538db6cb 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -790,7 +790,10 @@ dpdk_lib_init (dpdk_main_t * dm) case VNET_DPDK_PMD_MLX5: { - char *pn_100g[] = { "MCX415A-CCAT", "MCX416A-CCAT", 0 }; + char *pn_100g[] = { "MCX415A-CCAT", "MCX416A-CCAT", + "MCX556A-ECAT", "MCX556A-EDAT", "MCX555A-ECAT", + "MCX515A-CCAT", "MCX516A-CCAT", "MCX516A-CDAT", 0 + }; char *pn_40g[] = { "MCX413A-BCAT", "MCX414A-BCAT", "MCX415A-BCAT", "MCX416A-BCAT", "MCX4131A-BCAT", 0 }; -- cgit 1.2.3-korg From 586afd762bfa149f5ca167bd5fd5a0cd59ce94fe Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 5 Apr 2017 19:18:20 +0200 Subject: Use thread local storage for thread index This patch deprecates stack-based thread identification, Also removes requirement that thread stacks are adjacent. 
Finally, possibly annoying for some folks, it renames all occurrences of cpu_index and cpu_number with thread index. Using the word "cpu" is misleading here as a thread can be migrated to a different CPU, and also it is not related to linux cpu index. Change-Id: I68cdaf661e701d2336fc953dcb9978d10a70f7c1 Signed-off-by: Damjan Marion --- src/examples/srv6-sample-localsid/node.c | 4 +- src/plugins/dpdk/buffer.c | 2 +- src/plugins/dpdk/device/device.c | 8 +- src/plugins/dpdk/device/dpdk_priv.h | 8 +- src/plugins/dpdk/device/init.c | 2 +- src/plugins/dpdk/device/node.c | 32 +++--- src/plugins/dpdk/hqos/hqos.c | 16 +-- src/plugins/dpdk/ipsec/cli.c | 8 +- src/plugins/dpdk/ipsec/crypto_node.c | 4 +- src/plugins/dpdk/ipsec/esp.h | 4 +- src/plugins/dpdk/ipsec/esp_decrypt.c | 4 +- src/plugins/dpdk/ipsec/esp_encrypt.c | 5 +- src/plugins/dpdk/ipsec/ipsec.c | 2 +- src/plugins/dpdk/ipsec/ipsec.h | 4 +- src/plugins/dpdk/main.c | 2 +- src/plugins/flowperpkt/l2_node.c | 2 +- src/plugins/flowperpkt/node.c | 2 +- src/plugins/ioam/export-common/ioam_export.h | 6 +- .../ioam/ip6/ioam_cache_tunnel_select_node.c | 16 +-- src/plugins/ixge/ixge.c | 2 +- src/plugins/lb/lb.c | 8 +- src/plugins/lb/node.c | 22 ++-- src/plugins/lb/refcount.c | 8 +- src/plugins/lb/refcount.h | 4 +- src/plugins/memif/node.c | 35 +++--- src/plugins/snat/in2out.c | 110 +++++++++--------- src/plugins/snat/out2in.c | 102 ++++++++--------- src/plugins/snat/snat.h | 10 +- src/vlib/buffer.c | 6 +- src/vlib/buffer_funcs.h | 4 +- src/vlib/cli.c | 6 +- src/vlib/counter.h | 16 +-- src/vlib/error.c | 2 +- src/vlib/global_funcs.h | 2 +- src/vlib/main.c | 14 +-- src/vlib/main.h | 2 +- src/vlib/node.c | 2 +- src/vlib/node.h | 6 +- src/vlib/node_funcs.h | 8 +- src/vlib/threads.c | 69 ++++------- src/vlib/threads.h | 21 ++-- src/vlib/unix/cj.c | 7 +- src/vlib/unix/cj.h | 2 +- src/vlib/unix/main.c | 43 +++---- src/vnet/adj/adj_l2.c | 4 +- src/vnet/adj/adj_midchain.c | 8 +- src/vnet/adj/adj_nsh.c | 4 +- src/vnet/classify/vnet_classify.c | 16 
+-- src/vnet/cop/ip4_whitelist.c | 8 +- src/vnet/cop/ip6_whitelist.c | 8 +- src/vnet/devices/af_packet/node.c | 20 ++-- src/vnet/devices/devices.c | 61 +++++----- src/vnet/devices/devices.h | 18 +-- src/vnet/devices/netmap/node.c | 24 ++-- src/vnet/devices/ssvm/node.c | 6 +- src/vnet/devices/virtio/vhost-user.c | 127 +++++++++++---------- src/vnet/dpo/lookup_dpo.c | 20 ++-- src/vnet/dpo/replicate_dpo.c | 12 +- src/vnet/ethernet/arp.c | 2 +- src/vnet/ethernet/interface.c | 7 +- src/vnet/ethernet/node.c | 14 +-- src/vnet/gre/node.c | 8 +- src/vnet/interface.h | 2 +- src/vnet/interface_output.c | 53 ++++----- src/vnet/ip/ip4_forward.c | 34 +++--- src/vnet/ip/ip4_input.c | 8 +- src/vnet/ip/ip6_forward.c | 24 ++-- src/vnet/ip/ip6_input.c | 8 +- src/vnet/ip/ip6_neighbor.c | 4 +- src/vnet/ipsec/esp.h | 8 +- src/vnet/ipsec/esp_decrypt.c | 13 ++- src/vnet/ipsec/esp_encrypt.c | 13 ++- src/vnet/ipsec/ikev2.c | 64 ++++++----- src/vnet/ipsec/ipsec.h | 12 +- src/vnet/ipsec/ipsec_if.c | 2 +- src/vnet/l2/l2_bvi.h | 2 +- src/vnet/l2/l2_input.c | 14 +-- src/vnet/l2/l2_output.c | 6 +- src/vnet/l2tp/decap.c | 2 +- src/vnet/l2tp/encap.c | 2 +- src/vnet/l2tp/l2tp.c | 6 +- src/vnet/lisp-gpe/decap.c | 16 +-- src/vnet/lldp/lldp_input.c | 2 +- src/vnet/map/ip4_map.c | 14 +-- src/vnet/map/ip4_map_t.c | 12 +- src/vnet/map/ip6_map.c | 19 +-- src/vnet/map/ip6_map_t.c | 12 +- src/vnet/mpls/mpls_input.c | 8 +- src/vnet/mpls/mpls_lookup.c | 20 ++-- src/vnet/mpls/mpls_output.c | 10 +- src/vnet/pg/input.c | 4 +- src/vnet/replication.c | 20 ++-- src/vnet/replication.h | 2 +- src/vnet/session/node.c | 2 +- src/vnet/sr/sr_localsid.c | 44 +++---- src/vnet/tcp/builtin_client.c | 2 +- src/vnet/tcp/tcp.c | 8 +- src/vnet/tcp/tcp_debug.h | 2 +- src/vnet/tcp/tcp_input.c | 10 +- src/vnet/tcp/tcp_output.c | 20 ++-- src/vnet/udp/udp_input.c | 2 +- src/vnet/unix/tapcli.c | 2 +- src/vnet/unix/tuntap.c | 4 +- src/vnet/vxlan-gpe/decap.c | 10 +- src/vnet/vxlan-gpe/encap.c | 12 +- src/vnet/vxlan/decap.c | 10 +- 
src/vnet/vxlan/encap.c | 12 +- src/vpp/stats/stats.c | 14 +-- src/vpp/stats/stats.h | 2 +- 109 files changed, 790 insertions(+), 791 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/examples/srv6-sample-localsid/node.c b/src/examples/srv6-sample-localsid/node.c index 7bae9cd7..e83e2352 100644 --- a/src/examples/srv6-sample-localsid/node.c +++ b/src/examples/srv6-sample-localsid/node.c @@ -114,7 +114,7 @@ srv6_localsid_sample_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_fram from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -168,7 +168,7 @@ srv6_localsid_sample_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_fram /* This increments the SRv6 per LocalSID counters.*/ vlib_increment_combined_counter (((next0 == SRV6_SAMPLE_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : &(sm->sr_ls_valid_counters)), - cpu_index, + thread_index, ls0 - sm->localsids, 1, vlib_buffer_length_in_chain (vm, b0)); diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index 2765c292..c80b3fa8 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -132,7 +132,7 @@ dpdk_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) u32 merge_index; int i; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); f = vlib_buffer_get_free_list (vm, free_list_index); diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 50b26689..91661246 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -243,7 +243,7 @@ static_always_inline ASSERT (ring->tx_tail == 0); n_retry = 16; - queue_id = vm->cpu_index; + queue_id = vm->thread_index; do { @@ -266,7 +266,7 @@ static_always_inline { /* no wrap, transmit in one burst */ dpdk_device_hqos_per_worker_thread_t *hqos = - 
&xd->hqos_wt[vm->cpu_index]; + &xd->hqos_wt[vm->thread_index]; ASSERT (hqos->swq != NULL); @@ -332,7 +332,7 @@ dpdk_buffer_recycle (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b, u32 bi, struct rte_mbuf **mbp) { dpdk_main_t *dm = &dpdk_main; - u32 my_cpu = vm->cpu_index; + u32 my_cpu = vm->thread_index; struct rte_mbuf *mb_new; if (PREDICT_FALSE (b->flags & VLIB_BUFFER_RECYCLE) == 0) @@ -376,7 +376,7 @@ dpdk_interface_tx (vlib_main_t * vm, tx_ring_hdr_t *ring; u32 n_on_ring; - my_cpu = vm->cpu_index; + my_cpu = vm->thread_index; queue_id = my_cpu; diff --git a/src/plugins/dpdk/device/dpdk_priv.h b/src/plugins/dpdk/device/dpdk_priv.h index dd40ff48..52b4ca4b 100644 --- a/src/plugins/dpdk/device/dpdk_priv.h +++ b/src/plugins/dpdk/device/dpdk_priv.h @@ -79,7 +79,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) { vlib_simple_counter_main_t *cm; vnet_main_t *vnm = vnet_get_main (); - u32 my_cpu = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u64 rxerrors, last_rxerrors; /* only update counters for PMD interfaces */ @@ -96,7 +96,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_RX_NO_BUF); - vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, xd->stats.rx_nombuf - xd->last_stats.rx_nombuf); } @@ -107,7 +107,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_RX_MISS); - vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, xd->stats.imissed - xd->last_stats.imissed); } @@ -119,7 +119,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_RX_ERROR); - vlib_increment_simple_counter (cm, my_cpu, 
xd->vlib_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, rxerrors - last_rxerrors); } diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 538db6cb..7eaf8da7 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -324,7 +324,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) int rv; int j; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) { diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index e740fd18..b10e0fad 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -283,7 +283,7 @@ dpdk_buffer_init_from_template (void *d0, void *d1, void *d2, void *d3, */ static_always_inline u32 dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, - vlib_node_runtime_t * node, u32 cpu_index, u16 queue_id, + vlib_node_runtime_t * node, u32 thread_index, u16 queue_id, int maybe_multiseg) { u32 n_buffers; @@ -294,7 +294,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, uword n_rx_bytes = 0; u32 n_trace, trace_cnt __attribute__ ((unused)); vlib_buffer_free_list_t *fl; - vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, cpu_index); + vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, thread_index); if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0) return 0; @@ -306,7 +306,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, return 0; } - vec_reset_length (xd->d_trace_buffers[cpu_index]); + vec_reset_length (xd->d_trace_buffers[thread_index]); trace_cnt = n_trace = vlib_get_trace_count (vm, node); if (n_trace > 0) @@ -318,7 +318,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, { struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index++]; vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb); - vec_add1 (xd->d_trace_buffers[cpu_index], + vec_add1 (xd->d_trace_buffers[thread_index], 
vlib_get_buffer_index (vm, b)); } } @@ -546,20 +546,22 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, vlib_put_next_frame (vm, node, next_index, n_left_to_next); } - if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[cpu_index]) > 0)) + if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[thread_index]) > 0)) { - dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers[cpu_index], - vec_len (xd->d_trace_buffers[cpu_index])); - vlib_set_trace_count (vm, node, n_trace - - vec_len (xd->d_trace_buffers[cpu_index])); + dpdk_rx_trace (dm, node, xd, queue_id, + xd->d_trace_buffers[thread_index], + vec_len (xd->d_trace_buffers[thread_index])); + vlib_set_trace_count (vm, node, + n_trace - + vec_len (xd->d_trace_buffers[thread_index])); } vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes); + thread_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, mb_index); + vnet_device_increment_rx_packets (thread_index, mb_index); return mb_index; } @@ -630,19 +632,19 @@ dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) dpdk_device_t *xd; uword n_rx_packets = 0; dpdk_device_and_queue_t *dq; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); /* * Poll all devices on this cpu for input/interrupts. 
*/ /* *INDENT-OFF* */ - vec_foreach (dq, dm->devices_by_cpu[cpu_index]) + vec_foreach (dq, dm->devices_by_cpu[thread_index]) { xd = vec_elt_at_index(dm->devices, dq->device); if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG) - n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, /* maybe_multiseg */ 1); + n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 1); else - n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, /* maybe_multiseg */ 0); + n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 0); } /* *INDENT-ON* */ diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c index a288fca7..8b251beb 100644 --- a/src/plugins/dpdk/hqos/hqos.c +++ b/src/plugins/dpdk/hqos/hqos.c @@ -397,7 +397,7 @@ static_always_inline void dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm) { dpdk_main_t *dm = &dpdk_main; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; u32 dev_pos; dev_pos = 0; @@ -405,12 +405,12 @@ dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm) { vlib_worker_thread_barrier_check (); - u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]); + u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]); if (dev_pos >= n_devs) dev_pos = 0; dpdk_device_and_queue_t *dq = - vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos); + vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos); dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device); dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht; @@ -479,7 +479,7 @@ static_always_inline void dpdk_hqos_thread_internal (vlib_main_t * vm) { dpdk_main_t *dm = &dpdk_main; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; u32 dev_pos; dev_pos = 0; @@ -487,7 +487,7 @@ dpdk_hqos_thread_internal (vlib_main_t * vm) { vlib_worker_thread_barrier_check (); - u32 n_devs = vec_len 
(dm->devices_by_hqos_cpu[cpu_index]); + u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]); if (PREDICT_FALSE (n_devs == 0)) { dev_pos = 0; @@ -497,7 +497,7 @@ dpdk_hqos_thread_internal (vlib_main_t * vm) dev_pos = 0; dpdk_device_and_queue_t *dq = - vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos); + vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos); dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device); dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht; @@ -586,7 +586,7 @@ dpdk_hqos_thread (vlib_worker_thread_t * w) vm = vlib_get_main (); - ASSERT (vm->cpu_index == os_get_cpu_number ()); + ASSERT (vm->thread_index == vlib_get_thread_index ()); clib_time_init (&vm->clib_time); clib_mem_set_heap (w->thread_mheap); @@ -595,7 +595,7 @@ dpdk_hqos_thread (vlib_worker_thread_t * w) while (tm->worker_thread_release == 0) vlib_worker_thread_barrier_check (); - if (vec_len (dm->devices_by_hqos_cpu[vm->cpu_index]) == 0) + if (vec_len (dm->devices_by_hqos_cpu[vm->thread_index]) == 0) return clib_error ("current I/O TX thread does not have any devices assigned to it"); diff --git a/src/plugins/dpdk/ipsec/cli.c b/src/plugins/dpdk/ipsec/cli.c index cd0a6037..3ae8c9b8 100644 --- a/src/plugins/dpdk/ipsec/cli.c +++ b/src/plugins/dpdk/ipsec/cli.c @@ -42,8 +42,8 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display) for (i = 0; i < tm->n_vlib_mains; i++) { uword key, data; - u32 cpu_index = vlib_mains[i]->cpu_index; - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + u32 thread_index = vlib_mains[i]->thread_index; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; u8 *s = 0; if (skip_master) @@ -57,7 +57,7 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display) i32 last_cdev = -1; crypto_qp_data_t *qpd; - s = format (s, "%u\t", cpu_index); + s = format (s, "%u\t", thread_index); /* *INDENT-OFF* */ vec_foreach (qpd, cwm->qp_data) @@ -95,7 +95,7 @@ dpdk_ipsec_show_mapping (vlib_main_t * 
vm, u16 detail_display) cap.sym.auth.algo = p_key->auth_algo; check_algo_is_supported (&cap, auth_str); vlib_cli_output (vm, "%u\t%10s\t%15s\t%3s\t%u\t%u\n", - vlib_mains[i]->cpu_index, cipher_str, auth_str, + vlib_mains[i]->thread_index, cipher_str, auth_str, p_key->is_outbound ? "out" : "in", cwm->qp_data[data].dev_id, cwm->qp_data[data].qp_id); diff --git a/src/plugins/dpdk/ipsec/crypto_node.c b/src/plugins/dpdk/ipsec/crypto_node.c index dc3452b2..a3c45902 100644 --- a/src/plugins/dpdk/ipsec/crypto_node.c +++ b/src/plugins/dpdk/ipsec/crypto_node.c @@ -171,9 +171,9 @@ static uword dpdk_crypto_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); dpdk_crypto_main_t *dcm = &dpdk_crypto_main; - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; crypto_qp_data_t *qpd; u32 n_deq = 0; diff --git a/src/plugins/dpdk/ipsec/esp.h b/src/plugins/dpdk/ipsec/esp.h index 320295b1..56f0c756 100644 --- a/src/plugins/dpdk/ipsec/esp.h +++ b/src/plugins/dpdk/ipsec/esp.h @@ -170,9 +170,9 @@ static_always_inline int create_sym_sess (ipsec_sa_t * sa, crypto_sa_session_t * sa_sess, u8 is_outbound) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); dpdk_crypto_main_t *dcm = &dpdk_crypto_main; - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; struct rte_crypto_sym_xform cipher_xform = { 0 }; struct rte_crypto_sym_xform auth_xform = { 0 }; struct rte_crypto_sym_xform *xfs; diff --git a/src/plugins/dpdk/ipsec/esp_decrypt.c b/src/plugins/dpdk/ipsec/esp_decrypt.c index 286e03f8..bab76e3b 100644 --- a/src/plugins/dpdk/ipsec/esp_decrypt.c +++ b/src/plugins/dpdk/ipsec/esp_decrypt.c @@ -88,7 +88,7 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm, { u32 n_left_from, *from, *to_next, next_index; ipsec_main_t 
*im = &ipsec_main; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); dpdk_crypto_main_t * dcm = &dpdk_crypto_main; dpdk_esp_main_t * em = &dpdk_esp_main; u32 i; @@ -104,7 +104,7 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm, return n_left_from; } - crypto_worker_main_t *cwm = vec_elt_at_index(dcm->workers_main, cpu_index); + crypto_worker_main_t *cwm = vec_elt_at_index(dcm->workers_main, thread_index); u32 n_qps = vec_len(cwm->qp_data); struct rte_crypto_op ** cops_to_enq[n_qps]; u32 n_cop_qp[n_qps], * bi_to_enq[n_qps]; diff --git a/src/plugins/dpdk/ipsec/esp_encrypt.c b/src/plugins/dpdk/ipsec/esp_encrypt.c index 5b03de73..f996d7df 100644 --- a/src/plugins/dpdk/ipsec/esp_encrypt.c +++ b/src/plugins/dpdk/ipsec/esp_encrypt.c @@ -93,7 +93,7 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, { u32 n_left_from, *from, *to_next, next_index; ipsec_main_t *im = &ipsec_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); dpdk_crypto_main_t *dcm = &dpdk_crypto_main; dpdk_esp_main_t *em = &dpdk_esp_main; u32 i; @@ -111,7 +111,8 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, return n_left_from; } - crypto_worker_main_t *cwm = vec_elt_at_index (dcm->workers_main, cpu_index); + crypto_worker_main_t *cwm = + vec_elt_at_index (dcm->workers_main, thread_index); u32 n_qps = vec_len (cwm->qp_data); struct rte_crypto_op **cops_to_enq[n_qps]; u32 n_cop_qp[n_qps], *bi_to_enq[n_qps]; diff --git a/src/plugins/dpdk/ipsec/ipsec.c b/src/plugins/dpdk/ipsec/ipsec.c index b0aaaaec..5d8f4fba 100644 --- a/src/plugins/dpdk/ipsec/ipsec.c +++ b/src/plugins/dpdk/ipsec/ipsec.c @@ -289,7 +289,7 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, if (!map) { clib_warning ("unable to create hash table for worker %u", - vlib_mains[i]->cpu_index); + vlib_mains[i]->thread_index); goto error; } cwm->algo_qp_map = map; diff --git a/src/plugins/dpdk/ipsec/ipsec.h b/src/plugins/dpdk/ipsec/ipsec.h index 28bffc80..f0f793c0 
100644 --- a/src/plugins/dpdk/ipsec/ipsec.h +++ b/src/plugins/dpdk/ipsec/ipsec.h @@ -95,8 +95,8 @@ static_always_inline void crypto_alloc_cops () { dpdk_crypto_main_t *dcm = &dpdk_crypto_main; - u32 cpu_index = os_get_cpu_number (); - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + u32 thread_index = vlib_get_thread_index (); + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; unsigned socket_id = rte_socket_id (); crypto_qp_data_t *qpd; diff --git a/src/plugins/dpdk/main.c b/src/plugins/dpdk/main.c index 7ee2a785..942b8b2d 100644 --- a/src/plugins/dpdk/main.c +++ b/src/plugins/dpdk/main.c @@ -39,7 +39,7 @@ rte_delay_us_override (unsigned us) * thread then do not intercept. (Must not be called from an * independent pthread). */ - if (os_get_cpu_number () == 0) + if (vlib_get_thread_index () == 0) { /* * We're in the vlib main thread or a vlib process. Make sure diff --git a/src/plugins/flowperpkt/l2_node.c b/src/plugins/flowperpkt/l2_node.c index 1c2f681e..fdaf81d1 100644 --- a/src/plugins/flowperpkt/l2_node.c +++ b/src/plugins/flowperpkt/l2_node.c @@ -102,7 +102,7 @@ add_to_flow_record_l2 (vlib_main_t * vm, u8 * src_mac, u8 * dst_mac, u16 ethertype, u64 timestamp, u16 length, int do_flush) { - u32 my_cpu_number = vm->cpu_index; + u32 my_cpu_number = vm->thread_index; flow_report_main_t *frm = &flow_report_main; ip4_header_t *ip; udp_header_t *udp; diff --git a/src/plugins/flowperpkt/node.c b/src/plugins/flowperpkt/node.c index f77f087d..0277682d 100644 --- a/src/plugins/flowperpkt/node.c +++ b/src/plugins/flowperpkt/node.c @@ -101,7 +101,7 @@ add_to_flow_record_ipv4 (vlib_main_t * vm, u32 src_address, u32 dst_address, u8 tos, u64 timestamp, u16 length, int do_flush) { - u32 my_cpu_number = vm->cpu_index; + u32 my_cpu_number = vm->thread_index; flow_report_main_t *frm = &flow_report_main; ip4_header_t *ip; udp_header_t *udp; diff --git a/src/plugins/ioam/export-common/ioam_export.h b/src/plugins/ioam/export-common/ioam_export.h index 
2bf3fd54..9de0d13b 100644 --- a/src/plugins/ioam/export-common/ioam_export.h +++ b/src/plugins/ioam/export-common/ioam_export.h @@ -477,8 +477,8 @@ do { \ from = vlib_frame_vector_args (F); \ n_left_from = (F)->n_vectors; \ next_index = (N)->cached_next_index; \ - while (__sync_lock_test_and_set ((EM)->lockp[(VM)->cpu_index], 1)); \ - my_buf = ioam_export_get_my_buffer (EM, (VM)->cpu_index); \ + while (__sync_lock_test_and_set ((EM)->lockp[(VM)->thread_index], 1)); \ + my_buf = ioam_export_get_my_buffer (EM, (VM)->thread_index); \ my_buf->touched_at = vlib_time_now (VM); \ while (n_left_from > 0) \ { \ @@ -620,7 +620,7 @@ do { \ } \ vlib_node_increment_counter (VM, export_node.index, \ EXPORT_ERROR_RECORDED, pkts_recorded); \ - *(EM)->lockp[(VM)->cpu_index] = 0; \ + *(EM)->lockp[(VM)->thread_index] = 0; \ } while(0) #endif /* __included_ioam_export_h__ */ diff --git a/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c b/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c index a56dc040..0cf742c9 100644 --- a/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c +++ b/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c @@ -396,7 +396,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, clib_net_to_host_u32 (tcp0->seq_number) + 1, no_of_responses, now, - vm->cpu_index, &pool_index0)) + vm->thread_index, &pool_index0)) { cache_ts_added++; } @@ -419,7 +419,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, e2e = (ioam_e2e_cache_option_t *) ((u8 *) hbh0 + cm->rewrite_pool_index_offset); - e2e->pool_id = (u8) vm->cpu_index; + e2e->pool_id = (u8) vm->thread_index; e2e->pool_index = pool_index0; ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) ((u8 *) e2e + @@ -455,7 +455,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, clib_net_to_host_u32 (tcp1->seq_number) + 1, no_of_responses, now, - vm->cpu_index, &pool_index1)) + vm->thread_index, &pool_index1)) { cache_ts_added++; } @@ -479,7 +479,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, e2e = (ioam_e2e_cache_option_t *) 
((u8 *) hbh1 + cm->rewrite_pool_index_offset); - e2e->pool_id = (u8) vm->cpu_index; + e2e->pool_id = (u8) vm->thread_index; e2e->pool_index = pool_index1; ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) ((u8 *) e2e + @@ -562,7 +562,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, clib_net_to_host_u32 (tcp0->seq_number) + 1, no_of_responses, now, - vm->cpu_index, &pool_index0)) + vm->thread_index, &pool_index0)) { cache_ts_added++; } @@ -585,7 +585,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, e2e = (ioam_e2e_cache_option_t *) ((u8 *) hbh0 + cm->rewrite_pool_index_offset); - e2e->pool_id = (u8) vm->cpu_index; + e2e->pool_id = (u8) vm->thread_index; e2e->pool_index = pool_index0; ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) ((u8 *) e2e + @@ -701,7 +701,7 @@ expired_cache_ts_timer_callback (u32 * expired_timers) ioam_cache_main_t *cm = &ioam_cache_main; int i; u32 pool_index; - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 count = 0; for (i = 0; i < vec_len (expired_timers); i++) @@ -724,7 +724,7 @@ ioam_cache_ts_timer_tick_node_fn (vlib_main_t * vm, vlib_frame_t * f) { ioam_cache_main_t *cm = &ioam_cache_main; - u32 my_thread_index = os_get_cpu_number (); + u32 my_thread_index = vlib_get_thread_index (); struct timespec ts, tsrem; tw_timer_expire_timers_16t_2w_512sl (&cm->timer_wheels[my_thread_index], diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c index f3c5cc09..08f5b692 100644 --- a/src/plugins/ixge/ixge.c +++ b/src/plugins/ixge/ixge.c @@ -1887,7 +1887,7 @@ done: vlib_increment_combined_counter (vnet_main. 
interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - 0 /* cpu_index */ , + 0 /* thread_index */ , xd->vlib_sw_if_index, n_packets, dq->rx.n_bytes); diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c index add81236..addc2a42 100644 --- a/src/plugins/lb/lb.c +++ b/src/plugins/lb/lb.c @@ -63,11 +63,11 @@ u8 *format_lb_main (u8 * s, va_list * args) s = format(s, " #vips: %u\n", pool_elts(lbm->vips)); s = format(s, " #ass: %u\n", pool_elts(lbm->ass) - 1); - u32 cpu_index; - for(cpu_index = 0; cpu_index < tm->n_vlib_mains; cpu_index++ ) { - lb_hash_t *h = lbm->per_cpu[cpu_index].sticky_ht; + u32 thread_index; + for(thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++ ) { + lb_hash_t *h = lbm->per_cpu[thread_index].sticky_ht; if (h) { - s = format(s, "core %d\n", cpu_index); + s = format(s, "core %d\n", thread_index); s = format(s, " timeout: %ds\n", h->timeout); s = format(s, " usage: %d / %d\n", lb_hash_elts(h, lb_hash_time_now(vlib_get_main())), lb_hash_size(h)); } diff --git a/src/plugins/lb/node.c b/src/plugins/lb/node.c index 8b763c53..3171148b 100644 --- a/src/plugins/lb/node.c +++ b/src/plugins/lb/node.c @@ -60,10 +60,10 @@ format_lb_trace (u8 * s, va_list * args) return s; } -lb_hash_t *lb_get_sticky_table(u32 cpu_index) +lb_hash_t *lb_get_sticky_table(u32 thread_index) { lb_main_t *lbm = &lb_main; - lb_hash_t *sticky_ht = lbm->per_cpu[cpu_index].sticky_ht; + lb_hash_t *sticky_ht = lbm->per_cpu[thread_index].sticky_ht; //Check if size changed if (PREDICT_FALSE(sticky_ht && (lbm->per_cpu_sticky_buckets != lb_hash_nbuckets(sticky_ht)))) { @@ -71,8 +71,8 @@ lb_hash_t *lb_get_sticky_table(u32 cpu_index) lb_hash_bucket_t *b; u32 i; lb_hash_foreach_entry(sticky_ht, b, i) { - vlib_refcount_add(&lbm->as_refcount, cpu_index, b->value[i], -1); - vlib_refcount_add(&lbm->as_refcount, cpu_index, 0, 1); + vlib_refcount_add(&lbm->as_refcount, thread_index, b->value[i], -1); + vlib_refcount_add(&lbm->as_refcount, thread_index, 0, 1); } 
lb_hash_free(sticky_ht); @@ -81,8 +81,8 @@ lb_hash_t *lb_get_sticky_table(u32 cpu_index) //Create if necessary if (PREDICT_FALSE(sticky_ht == NULL)) { - lbm->per_cpu[cpu_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout); - sticky_ht = lbm->per_cpu[cpu_index].sticky_ht; + lbm->per_cpu[thread_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout); + sticky_ht = lbm->per_cpu[thread_index].sticky_ht; clib_warning("Regenerated sticky table %p", sticky_ht); } @@ -153,10 +153,10 @@ lb_node_fn (vlib_main_t * vm, { lb_main_t *lbm = &lb_main; u32 n_left_from, *from, next_index, *to_next, n_left_to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 lb_time = lb_hash_time_now(vm); - lb_hash_t *sticky_ht = lb_get_sticky_table(cpu_index); + lb_hash_t *sticky_ht = lb_get_sticky_table(thread_index); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; next_index = node->cached_next_index; @@ -240,9 +240,9 @@ lb_node_fn (vlib_main_t * vm, //Configuration may be changed, vectors resized, etc... 
//Dereference previously used - vlib_refcount_add(&lbm->as_refcount, cpu_index, + vlib_refcount_add(&lbm->as_refcount, thread_index, lb_hash_available_value(sticky_ht, hash0, available_index0), -1); - vlib_refcount_add(&lbm->as_refcount, cpu_index, + vlib_refcount_add(&lbm->as_refcount, thread_index, asindex0, 1); //Add sticky entry @@ -260,7 +260,7 @@ lb_node_fn (vlib_main_t * vm, } vlib_increment_simple_counter(&lbm->vip_counters[counter], - cpu_index, + thread_index, vnet_buffer (p0)->ip.adj_index[VLIB_TX], 1); diff --git a/src/plugins/lb/refcount.c b/src/plugins/lb/refcount.c index 22415c88..6f01ab5a 100644 --- a/src/plugins/lb/refcount.c +++ b/src/plugins/lb/refcount.c @@ -31,10 +31,10 @@ u64 vlib_refcount_get(vlib_refcount_t *r, u32 index) { u64 count = 0; vlib_thread_main_t *tm = vlib_get_thread_main (); - u32 cpu_index; - for (cpu_index = 0; cpu_index < tm->n_vlib_mains; cpu_index++) { - if (r->per_cpu[cpu_index].length > index) - count += r->per_cpu[cpu_index].counters[index]; + u32 thread_index; + for (thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++) { + if (r->per_cpu[thread_index].length > index) + count += r->per_cpu[thread_index].counters[index]; } return count; } diff --git a/src/plugins/lb/refcount.h b/src/plugins/lb/refcount.h index 8c26e7be..dcfcb3fe 100644 --- a/src/plugins/lb/refcount.h +++ b/src/plugins/lb/refcount.h @@ -45,9 +45,9 @@ typedef struct { void __vlib_refcount_resize(vlib_refcount_per_cpu_t *per_cpu, u32 size); static_always_inline -void vlib_refcount_add(vlib_refcount_t *r, u32 cpu_index, u32 counter_index, i32 v) +void vlib_refcount_add(vlib_refcount_t *r, u32 thread_index, u32 counter_index, i32 v) { - vlib_refcount_per_cpu_t *per_cpu = &r->per_cpu[cpu_index]; + vlib_refcount_per_cpu_t *per_cpu = &r->per_cpu[thread_index]; if (PREDICT_FALSE(counter_index >= per_cpu->length)) __vlib_refcount_resize(per_cpu, clib_max(counter_index + 16, per_cpu->length * 2)); diff --git a/src/plugins/memif/node.c 
b/src/plugins/memif/node.c index 659d5dfb..cee1f3d1 100644 --- a/src/plugins/memif/node.c +++ b/src/plugins/memif/node.c @@ -94,7 +94,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_rx_bytes = 0; u32 *to_next = 0; u32 n_free_bufs; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 bi0, bi1; vlib_buffer_t *b0, *b1; u16 ring_size = 1 << mif->log2_ring_size; @@ -105,14 +105,15 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (mif->per_interface_next_index != ~0) next_index = mif->per_interface_next_index; - n_free_bufs = vec_len (nm->rx_buffers[cpu_index]); + n_free_bufs = vec_len (nm->rx_buffers[thread_index]); if (PREDICT_FALSE (n_free_bufs < ring_size)) { - vec_validate (nm->rx_buffers[cpu_index], ring_size + n_free_bufs - 1); + vec_validate (nm->rx_buffers[thread_index], + ring_size + n_free_bufs - 1); n_free_bufs += - vlib_buffer_alloc (vm, &nm->rx_buffers[cpu_index][n_free_bufs], + vlib_buffer_alloc (vm, &nm->rx_buffers[thread_index][n_free_bufs], ring_size); - _vec_len (nm->rx_buffers[cpu_index]) = n_free_bufs; + _vec_len (nm->rx_buffers[thread_index]) = n_free_bufs; } head = ring->head; @@ -158,15 +159,15 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, CLIB_CACHE_LINE_BYTES, LOAD); } /* get empty buffer */ - u32 last_buf = vec_len (nm->rx_buffers[cpu_index]) - 1; - bi0 = nm->rx_buffers[cpu_index][last_buf]; - bi1 = nm->rx_buffers[cpu_index][last_buf - 1]; - _vec_len (nm->rx_buffers[cpu_index]) -= 2; + u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1; + bi0 = nm->rx_buffers[thread_index][last_buf]; + bi1 = nm->rx_buffers[thread_index][last_buf - 1]; + _vec_len (nm->rx_buffers[thread_index]) -= 2; if (last_buf > 4) { - memif_prefetch (vm, nm->rx_buffers[cpu_index][last_buf - 2]); - memif_prefetch (vm, nm->rx_buffers[cpu_index][last_buf - 3]); + memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 2]); + 
memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 3]); } /* enqueue buffer */ @@ -256,9 +257,9 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, while (num_slots && n_left_to_next) { /* get empty buffer */ - u32 last_buf = vec_len (nm->rx_buffers[cpu_index]) - 1; - bi0 = nm->rx_buffers[cpu_index][last_buf]; - _vec_len (nm->rx_buffers[cpu_index]) = last_buf; + u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1; + bi0 = nm->rx_buffers[thread_index][last_buf]; + _vec_len (nm->rx_buffers[thread_index]) = last_buf; /* enqueue buffer */ to_next[0] = bi0; @@ -315,7 +316,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, ring->tail = head; vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters - + VNET_INTERFACE_COUNTER_RX, cpu_index, + + VNET_INTERFACE_COUNTER_RX, thread_index, mif->hw_if_index, n_rx_packets, n_rx_bytes); @@ -327,7 +328,7 @@ memif_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { u32 n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); memif_main_t *nm = &memif_main; memif_if_t *mif; @@ -337,7 +338,7 @@ memif_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, if (mif->flags & MEMIF_IF_FLAG_ADMIN_UP && mif->flags & MEMIF_IF_FLAG_CONNECTED && (mif->if_index % nm->input_cpu_count) == - (cpu_index - nm->input_cpu_first_index)) + (thread_index - nm->input_cpu_first_index)) { if (mif->flags & MEMIF_IF_FLAG_IS_SLAVE) n_rx_packets += diff --git a/src/plugins/snat/in2out.c b/src/plugins/snat/in2out.c index b4961365..e5ee965f 100644 --- a/src/plugins/snat/in2out.c +++ b/src/plugins/snat/in2out.c @@ -212,7 +212,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, snat_session_t ** sessionp, vlib_node_runtime_t * node, u32 next0, - u32 cpu_index) + u32 thread_index) { snat_user_t *u; snat_user_key_t user_key; @@ -246,27 +246,27 @@ static u32 slow_path (snat_main_t *sm, 
vlib_buffer_t *b0, if (clib_bihash_search_8_8 (&sm->user_hash, &kv0, &value0)) { /* no, make a new one */ - pool_get (sm->per_thread_data[cpu_index].users, u); + pool_get (sm->per_thread_data[thread_index].users, u); memset (u, 0, sizeof (*u)); u->addr = ip0->src_address; u->fib_index = rx_fib_index0; - pool_get (sm->per_thread_data[cpu_index].list_pool, per_user_list_head_elt); + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_list_head_elt); u->sessions_per_user_list_head_index = per_user_list_head_elt - - sm->per_thread_data[cpu_index].list_pool; + sm->per_thread_data[thread_index].list_pool; - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index); - kv0.value = u - sm->per_thread_data[cpu_index].users; + kv0.value = u - sm->per_thread_data[thread_index].users; /* add user */ clib_bihash_add_del_8_8 (&sm->user_hash, &kv0, 1 /* is_add */); } else { - u = pool_elt_at_index (sm->per_thread_data[cpu_index].users, + u = pool_elt_at_index (sm->per_thread_data[thread_index].users, value0.value); } @@ -276,25 +276,25 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, /* Remove the oldest dynamic translation */ do { oldest_per_user_translation_list_index = - clib_dlist_remove_head (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove_head (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index); ASSERT (oldest_per_user_translation_list_index != ~0); /* add it back to the end of the LRU list */ - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index, oldest_per_user_translation_list_index); /* Get the list element */ oldest_per_user_translation_list_elt = - pool_elt_at_index (sm->per_thread_data[cpu_index].list_pool, + pool_elt_at_index (sm->per_thread_data[thread_index].list_pool, 
oldest_per_user_translation_list_index); /* Get the session index from the list element */ session_index = oldest_per_user_translation_list_elt->value; /* Get the session */ - s = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, session_index); } while (snat_is_session_static (s)); @@ -346,7 +346,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, } /* Create a new session */ - pool_get (sm->per_thread_data[cpu_index].sessions, s); + pool_get (sm->per_thread_data[thread_index].sessions, s); memset (s, 0, sizeof (*s)); s->outside_address_index = address_index; @@ -362,22 +362,22 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, } /* Create list elts */ - pool_get (sm->per_thread_data[cpu_index].list_pool, + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt); - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); per_user_translation_list_elt->value = - s - sm->per_thread_data[cpu_index].sessions; + s - sm->per_thread_data[thread_index].sessions; s->per_user_index = per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool; + sm->per_thread_data[thread_index].list_pool; s->per_user_list_head_index = u->sessions_per_user_list_head_index; - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s->per_user_list_head_index, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); } s->in2out = *key0; @@ -388,12 +388,12 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, /* Add to translation hashes */ kv0.key = s->in2out.as_u64; - kv0.value = s - 
sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->in2out, &kv0, 1 /* is_add */)) clib_warning ("in2out key add failed"); kv0.key = s->out2in.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->out2in, &kv0, 1 /* is_add */)) clib_warning ("out2in key add failed"); @@ -403,7 +403,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, worker_by_out_key.port = s->out2in.port; worker_by_out_key.fib_index = s->out2in.fib_index; kv0.key = worker_by_out_key.as_u64; - kv0.value = cpu_index; + kv0.value = thread_index; clib_bihash_add_del_8_8 (&sm->worker_by_out, &kv0, 1); /* log NAT event */ @@ -465,7 +465,7 @@ snat_in2out_error_t icmp_get_key(icmp46_header_t *icmp0, * * @param[in,out] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -473,7 +473,7 @@ snat_in2out_error_t icmp_get_key(icmp46_header_t *icmp0, * @param d optional parameter */ u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -524,13 +524,13 @@ u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, } next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, - &s0, node, next0, cpu_index); + &s0, node, next0, thread_index); if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) goto out; } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); out: @@ -548,7 
+548,7 @@ out: * * @param[in] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -556,7 +556,7 @@ out: * @param d optional parameter */ u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -624,7 +624,7 @@ static inline u32 icmp_in2out (snat_main_t *sm, u32 rx_fib_index0, vlib_node_runtime_t * node, u32 next0, - u32 cpu_index, + u32 thread_index, void *d) { snat_session_key_t key0, sm0; @@ -641,7 +641,7 @@ static inline u32 icmp_in2out (snat_main_t *sm, echo0 = (icmp_echo_header_t *)(icmp0+1); - next0_tmp = sm->icmp_match_in2out_cb(sm, node, cpu_index, b0, + next0_tmp = sm->icmp_match_in2out_cb(sm, node, thread_index, b0, &key0, &sm0, &dont_translate, d); if (next0_tmp != ~0) next0 = next0_tmp; @@ -847,11 +847,11 @@ static inline u32 icmp_in2out_slow_path (snat_main_t *sm, vlib_node_runtime_t * node, u32 next0, f64 now, - u32 cpu_index, + u32 thread_index, snat_session_t ** p_s0) { next0 = icmp_in2out(sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, cpu_index, p_s0); + next0, thread_index, p_s0); snat_session_t * s0 = *p_s0; if (PREDICT_TRUE(next0 != SNAT_IN2OUT_NEXT_DROP && s0)) { @@ -862,9 +862,9 @@ static inline u32 icmp_in2out_slow_path (snat_main_t *sm, /* Per-user LRU list maintenance for dynamic translations */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail 
(sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -884,7 +884,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, snat_runtime_t * rt = (snat_runtime_t *)node->runtime_data; f64 now = vlib_time_now (vm); u32 stats_node_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); stats_node_index = is_slow_path ? snat_in2out_slowpath_node.index : snat_in2out_node.index; @@ -977,7 +977,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, { next0 = icmp_in2out_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, - node, next0, now, cpu_index, &s0); + node, next0, now, thread_index, &s0); goto trace00; } } @@ -1006,7 +1006,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, goto trace00; next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, - &s0, node, next0, cpu_index); + &s0, node, next0, thread_index); if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) goto trace00; } @@ -1017,7 +1017,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->src_address.as_u32; @@ -1063,9 +1063,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -1081,7 +1081,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } 
pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; @@ -1117,7 +1117,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, { next1 = icmp_in2out_slow_path (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, node, - next1, now, cpu_index, &s1); + next1, now, thread_index, &s1); goto trace01; } } @@ -1146,7 +1146,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, goto trace01; next1 = slow_path (sm, b1, ip1, rx_fib_index1, &key1, - &s1, node, next1, cpu_index); + &s1, node, next1, thread_index); if (PREDICT_FALSE (next1 == SNAT_IN2OUT_NEXT_DROP)) goto trace01; } @@ -1157,7 +1157,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } } else - s1 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s1 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value1.value); old_addr1 = ip1->src_address.as_u32; @@ -1203,9 +1203,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s1)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s1->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s1->per_user_list_head_index, s1->per_user_index); } @@ -1220,7 +1220,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, t->next_index = next1; t->session_index = ~0; if (s1) - t->session_index = s1 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s1 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next1 != SNAT_IN2OUT_NEXT_DROP; @@ -1292,7 +1292,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, { next0 = icmp_in2out_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, cpu_index, &s0); + next0, now, thread_index, &s0); goto trace0; } } @@ -1321,7 +1321,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, goto trace0; next0 = slow_path (sm, b0, ip0, 
rx_fib_index0, &key0, - &s0, node, next0, cpu_index); + &s0, node, next0, thread_index); if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) goto trace0; @@ -1333,7 +1333,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->src_address.as_u32; @@ -1379,9 +1379,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -1397,7 +1397,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; @@ -2010,7 +2010,7 @@ snat_in2out_worker_handoff_fn (vlib_main_t * vm, u32 n_left_to_next_worker = 0, *to_next_worker = 0; u32 next_worker_index = 0; u32 current_worker_index = ~0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ASSERT (vec_len (sm->workers)); @@ -2048,7 +2048,7 @@ snat_in2out_worker_handoff_fn (vlib_main_t * vm, next_worker_index = sm->worker_in2out_cb(ip0, rx_fib_index0); - if (PREDICT_FALSE (next_worker_index != cpu_index)) + if (PREDICT_FALSE (next_worker_index != thread_index)) { do_handoff = 1; diff --git a/src/plugins/snat/out2in.c b/src/plugins/snat/out2in.c index 656e42db..5d308d78 100644 --- a/src/plugins/snat/out2in.c +++ b/src/plugins/snat/out2in.c @@ -129,7 +129,7 @@ 
create_session_for_static_mapping (snat_main_t *sm, snat_session_key_t in2out, snat_session_key_t out2in, vlib_node_runtime_t * node, - u32 cpu_index) + u32 thread_index) { snat_user_t *u; snat_user_key_t user_key; @@ -146,36 +146,36 @@ create_session_for_static_mapping (snat_main_t *sm, if (clib_bihash_search_8_8 (&sm->user_hash, &kv0, &value0)) { /* no, make a new one */ - pool_get (sm->per_thread_data[cpu_index].users, u); + pool_get (sm->per_thread_data[thread_index].users, u); memset (u, 0, sizeof (*u)); u->addr = in2out.addr; u->fib_index = in2out.fib_index; - pool_get (sm->per_thread_data[cpu_index].list_pool, + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_list_head_elt); u->sessions_per_user_list_head_index = per_user_list_head_elt - - sm->per_thread_data[cpu_index].list_pool; + sm->per_thread_data[thread_index].list_pool; - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index); - kv0.value = u - sm->per_thread_data[cpu_index].users; + kv0.value = u - sm->per_thread_data[thread_index].users; /* add user */ clib_bihash_add_del_8_8 (&sm->user_hash, &kv0, 1 /* is_add */); /* add non-traslated packets worker lookup */ - kv0.value = cpu_index; + kv0.value = thread_index; clib_bihash_add_del_8_8 (&sm->worker_by_in, &kv0, 1); } else { - u = pool_elt_at_index (sm->per_thread_data[cpu_index].users, + u = pool_elt_at_index (sm->per_thread_data[thread_index].users, value0.value); } - pool_get (sm->per_thread_data[cpu_index].sessions, s); + pool_get (sm->per_thread_data[thread_index].sessions, s); memset (s, 0, sizeof (*s)); s->outside_address_index = ~0; @@ -183,22 +183,22 @@ create_session_for_static_mapping (snat_main_t *sm, u->nstaticsessions++; /* Create list elts */ - pool_get (sm->per_thread_data[cpu_index].list_pool, + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt); - clib_dlist_init 
(sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); per_user_translation_list_elt->value = - s - sm->per_thread_data[cpu_index].sessions; + s - sm->per_thread_data[thread_index].sessions; s->per_user_index = - per_user_translation_list_elt - sm->per_thread_data[cpu_index].list_pool; + per_user_translation_list_elt - sm->per_thread_data[thread_index].list_pool; s->per_user_list_head_index = u->sessions_per_user_list_head_index; - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s->per_user_list_head_index, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); s->in2out = in2out; s->out2in = out2in; @@ -206,12 +206,12 @@ create_session_for_static_mapping (snat_main_t *sm, /* Add to translation hashes */ kv0.key = s->in2out.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->in2out, &kv0, 1 /* is_add */)) clib_warning ("in2out key add failed"); kv0.key = s->out2in.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->out2in, &kv0, 1 /* is_add */)) clib_warning ("out2in key add failed"); @@ -298,7 +298,7 @@ is_interface_addr(snat_main_t *sm, vlib_node_runtime_t *node, u32 sw_if_index0, * * @param[in,out] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -306,7 +306,7 
@@ is_interface_addr(snat_main_t *sm, vlib_node_runtime_t *node, u32 sw_if_index0, * @param d optional parameter */ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -366,7 +366,7 @@ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping(sm, b0, sm0, key0, - node, cpu_index); + node, thread_index); if (!s0) { @@ -375,7 +375,7 @@ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); out: @@ -393,7 +393,7 @@ out: * * @param[in] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -401,7 +401,7 @@ out: * @param d optional parameter */ u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -460,7 +460,7 @@ static inline u32 icmp_out2in (snat_main_t *sm, u32 rx_fib_index0, vlib_node_runtime_t * node, u32 next0, - u32 cpu_index, + u32 thread_index, void *d) { snat_session_key_t key0, sm0; @@ -477,7 +477,7 @@ static inline u32 icmp_out2in (snat_main_t *sm, echo0 = (icmp_echo_header_t *)(icmp0+1); - next0_tmp = sm->icmp_match_out2in_cb(sm, node, cpu_index, b0, + next0_tmp = sm->icmp_match_out2in_cb(sm, node, thread_index, b0, &key0, &sm0, &dont_translate, d); if 
(next0_tmp != ~0) next0 = next0_tmp; @@ -589,11 +589,11 @@ static inline u32 icmp_out2in_slow_path (snat_main_t *sm, u32 rx_fib_index0, vlib_node_runtime_t * node, u32 next0, f64 now, - u32 cpu_index, + u32 thread_index, snat_session_t ** p_s0) { next0 = icmp_out2in(sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, cpu_index, p_s0); + next0, thread_index, p_s0); snat_session_t * s0 = *p_s0; if (PREDICT_TRUE(next0 != SNAT_OUT2IN_NEXT_DROP && s0)) { @@ -604,9 +604,9 @@ static inline u32 icmp_out2in_slow_path (snat_main_t *sm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -624,7 +624,7 @@ snat_out2in_node_fn (vlib_main_t * vm, u32 pkts_processed = 0; snat_main_t * sm = &snat_main; f64 now = vlib_time_now (vm); - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -712,7 +712,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { next0 = icmp_out2in_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, cpu_index, &s0); + next0, now, thread_index, &s0); goto trace0; } @@ -743,7 +743,7 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping(sm, b0, sm0, key0, node, - cpu_index); + thread_index); if (!s0) { b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; @@ -752,7 +752,7 @@ snat_out2in_node_fn (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, 
value0.value); old_addr0 = ip0->dst_address.as_u32; @@ -796,9 +796,9 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -813,7 +813,7 @@ snat_out2in_node_fn (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; @@ -847,7 +847,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { next1 = icmp_out2in_slow_path (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, node, - next1, now, cpu_index, &s1); + next1, now, thread_index, &s1); goto trace1; } @@ -878,7 +878,7 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Create session initiated by host from external network */ s1 = create_session_for_static_mapping(sm, b1, sm1, key1, node, - cpu_index); + thread_index); if (!s1) { b1->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; @@ -887,7 +887,7 @@ snat_out2in_node_fn (vlib_main_t * vm, } } else - s1 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s1 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value1.value); old_addr1 = ip1->dst_address.as_u32; @@ -931,9 +931,9 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s1)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s1->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + 
clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s1->per_user_list_head_index, s1->per_user_index); } @@ -948,7 +948,7 @@ snat_out2in_node_fn (vlib_main_t * vm, t->next_index = next1; t->session_index = ~0; if (s1) - t->session_index = s1 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s1 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next1 != SNAT_OUT2IN_NEXT_DROP; @@ -1016,7 +1016,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { next0 = icmp_out2in_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, cpu_index, &s0); + next0, now, thread_index, &s0); goto trace00; } @@ -1048,7 +1048,7 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping(sm, b0, sm0, key0, node, - cpu_index); + thread_index); if (!s0) { b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; @@ -1057,7 +1057,7 @@ snat_out2in_node_fn (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->dst_address.as_u32; @@ -1101,9 +1101,9 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -1118,7 +1118,7 @@ snat_out2in_node_fn (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; @@ 
-1599,7 +1599,7 @@ snat_out2in_worker_handoff_fn (vlib_main_t * vm, u32 n_left_to_next_worker = 0, *to_next_worker = 0; u32 next_worker_index = 0; u32 current_worker_index = ~0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ASSERT (vec_len (sm->workers)); @@ -1637,7 +1637,7 @@ snat_out2in_worker_handoff_fn (vlib_main_t * vm, next_worker_index = sm->worker_out2in_cb(ip0, rx_fib_index0); - if (PREDICT_FALSE (next_worker_index != cpu_index)) + if (PREDICT_FALSE (next_worker_index != thread_index)) { do_handoff = 1; diff --git a/src/plugins/snat/snat.h b/src/plugins/snat/snat.h index 017825c0..f4e1c5c0 100644 --- a/src/plugins/snat/snat.h +++ b/src/plugins/snat/snat.h @@ -221,7 +221,7 @@ struct snat_main_s; typedef u32 snat_icmp_match_function_t (struct snat_main_s *sm, vlib_node_runtime_t *node, - u32 cpu_index, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, @@ -402,22 +402,22 @@ typedef struct { } tcp_udp_header_t; u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); diff --git a/src/vlib/buffer.c 
b/src/vlib/buffer.c index a517a597..be3b41ef 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -299,7 +299,7 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, if (CLIB_DEBUG == 0) return; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); /* smp disaster check */ if (vec_len (vlib_mains) > 1) @@ -355,7 +355,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, vlib_buffer_free_list_t *f; int i; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0) { @@ -474,7 +474,7 @@ vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index) u32 merge_index; int i; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); f = vlib_buffer_get_free_list (vm, free_list_index); diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 394c336a..328660a3 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -209,7 +209,7 @@ always_inline vlib_buffer_known_state_t vlib_buffer_is_known (vlib_main_t * vm, u32 buffer_index) { vlib_buffer_main_t *bm = vm->buffer_main; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); uword *p = hash_get (bm->buffer_known_hash, buffer_index); return p ? 
p[0] : VLIB_BUFFER_UNKNOWN; @@ -221,7 +221,7 @@ vlib_buffer_set_known_state (vlib_main_t * vm, vlib_buffer_known_state_t state) { vlib_buffer_main_t *bm = vm->buffer_main; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); hash_set (bm->buffer_known_hash, buffer_index, state); } diff --git a/src/vlib/cli.c b/src/vlib/cli.c index f853f655..3cc95076 100644 --- a/src/vlib/cli.c +++ b/src/vlib/cli.c @@ -709,7 +709,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input, { /* *INDENT-OFF* */ foreach_vlib_main({ - heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + heap = clib_per_cpu_mheaps[this_vlib_main->thread_index]; mheap = mheap_header(heap); mheap->flags |= MHEAP_FLAG_VALIDATE; // Turn off small object cache because it delays detection of errors @@ -722,7 +722,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input, { /* *INDENT-OFF* */ foreach_vlib_main({ - heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + heap = clib_per_cpu_mheaps[this_vlib_main->thread_index]; mheap = mheap_header(heap); mheap->flags &= ~MHEAP_FLAG_VALIDATE; mheap->flags |= MHEAP_FLAG_SMALL_OBJECT_CACHE; @@ -733,7 +733,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input, { /* *INDENT-OFF* */ foreach_vlib_main({ - heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + heap = clib_per_cpu_mheaps[this_vlib_main->thread_index]; mheap = mheap_header(heap); mheap_validate(heap); }); diff --git a/src/vlib/counter.h b/src/vlib/counter.h index 17a85217..60e2055d 100644 --- a/src/vlib/counter.h +++ b/src/vlib/counter.h @@ -70,17 +70,17 @@ u32 vlib_simple_counter_n_counters (const vlib_simple_counter_main_t * cm); /** Increment a simple counter @param cm - (vlib_simple_counter_main_t *) simple counter main pointer - @param cpu_index - (u32) the current cpu index + @param thread_index - (u32) the current cpu index @param index - (u32) index of the counter to increment @param increment - (u64) quantitiy to add to the 
counter */ always_inline void vlib_increment_simple_counter (vlib_simple_counter_main_t * cm, - u32 cpu_index, u32 index, u64 increment) + u32 thread_index, u32 index, u64 increment) { counter_t *my_counters; - my_counters = cm->counters[cpu_index]; + my_counters = cm->counters[thread_index]; my_counters[index] += increment; } @@ -201,7 +201,7 @@ void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm); /** Increment a combined counter @param cm - (vlib_combined_counter_main_t *) comined counter main pointer - @param cpu_index - (u32) the current cpu index + @param thread_index - (u32) the current cpu index @param index - (u32) index of the counter to increment @param packet_increment - (u64) number of packets to add to the counter @param byte_increment - (u64) number of bytes to add to the counter @@ -209,13 +209,13 @@ void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm); always_inline void vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, - u32 cpu_index, + u32 thread_index, u32 index, u64 n_packets, u64 n_bytes) { vlib_counter_t *my_counters; /* Use this CPU's counter array */ - my_counters = cm->counters[cpu_index]; + my_counters = cm->counters[thread_index]; my_counters[index].packets += n_packets; my_counters[index].bytes += n_bytes; @@ -224,14 +224,14 @@ vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, /** Pre-fetch a per-thread combined counter for the given object index */ always_inline void vlib_prefetch_combined_counter (const vlib_combined_counter_main_t * cm, - u32 cpu_index, u32 index) + u32 thread_index, u32 index) { vlib_counter_t *cpu_counters; /* * This CPU's index is assumed to already be in cache */ - cpu_counters = cm->counters[cpu_index]; + cpu_counters = cm->counters[thread_index]; CLIB_PREFETCH (cpu_counters + index, CLIB_CACHE_LINE_BYTES, STORE); } diff --git a/src/vlib/error.c b/src/vlib/error.c index a2c23176..e4ed4ee3 100644 --- a/src/vlib/error.c +++ b/src/vlib/error.c 
@@ -149,7 +149,7 @@ vlib_register_errors (vlib_main_t * vm, vlib_node_t *n = vlib_get_node (vm, node_index); uword l; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); /* Free up any previous error strings. */ if (n->n_errors > 0) diff --git a/src/vlib/global_funcs.h b/src/vlib/global_funcs.h index f51ec381..9dd01fbf 100644 --- a/src/vlib/global_funcs.h +++ b/src/vlib/global_funcs.h @@ -23,7 +23,7 @@ always_inline vlib_main_t * vlib_get_main (void) { vlib_main_t *vm; - vm = vlib_mains[os_get_cpu_number ()]; + vm = vlib_mains[vlib_get_thread_index ()]; ASSERT (vm); return vm; } diff --git a/src/vlib/main.c b/src/vlib/main.c index b22203f0..422d3e26 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -136,18 +136,18 @@ vlib_frame_alloc_to_node (vlib_main_t * vm, u32 to_node_index, else { f = clib_mem_alloc_aligned_no_fail (n, VLIB_FRAME_ALIGN); - f->cpu_index = vm->cpu_index; + f->thread_index = vm->thread_index; fi = vlib_frame_index_no_check (vm, f); } /* Poison frame when debugging. */ if (CLIB_DEBUG > 0) { - u32 save_cpu_index = f->cpu_index; + u32 save_thread_index = f->thread_index; memset (f, 0xfe, n); - f->cpu_index = save_cpu_index; + f->thread_index = save_thread_index; } /* Insert magic number. */ @@ -517,7 +517,7 @@ vlib_put_next_frame (vlib_main_t * vm, * a dangling frame reference. Each thread has its own copy of * the next_frames vector. */ - if (0 && r->cpu_index != next_runtime->cpu_index) + if (0 && r->thread_index != next_runtime->thread_index) { nf->frame_index = ~0; nf->flags &= ~(VLIB_FRAME_PENDING | VLIB_FRAME_IS_ALLOCATED); @@ -866,7 +866,7 @@ vlib_elog_main_loop_event (vlib_main_t * vm, : evm->node_call_elog_event_types, node_index), /* track */ - (vm->cpu_index ? &vlib_worker_threads[vm->cpu_index]. + (vm->thread_index ? &vlib_worker_threads[vm->thread_index]. 
elog_track : &em->default_track), /* data to log */ n_vectors); } @@ -963,7 +963,7 @@ dispatch_node (vlib_main_t * vm, vm->cpu_time_last_node_dispatch = last_time_stamp; - if (1 /* || vm->cpu_index == node->cpu_index */ ) + if (1 /* || vm->thread_index == node->thread_index */ ) { vlib_main_t *stat_vm; @@ -1029,7 +1029,7 @@ dispatch_node (vlib_main_t * vm, { u32 node_name, vector_length, is_polling; } *ed; - vlib_worker_thread_t *w = vlib_worker_threads + vm->cpu_index; + vlib_worker_thread_t *w = vlib_worker_threads + vm->thread_index; #endif if ((dispatch_state == VLIB_NODE_STATE_INTERRUPT diff --git a/src/vlib/main.h b/src/vlib/main.h index 0197b4f3..329bf073 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -156,7 +156,7 @@ typedef struct vlib_main_t uword *init_functions_called; /* to compare with node runtime */ - u32 cpu_index; + u32 thread_index; void **mbuf_alloc_list; diff --git a/src/vlib/node.c b/src/vlib/node.c index dc0a4de5..bbd3a42e 100644 --- a/src/vlib/node.c +++ b/src/vlib/node.c @@ -99,7 +99,7 @@ vlib_node_runtime_update (vlib_main_t * vm, u32 node_index, u32 next_index) vlib_pending_frame_t *pf; i32 i, j, n_insert; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); vlib_worker_thread_barrier_sync (vm); diff --git a/src/vlib/node.h b/src/vlib/node.h index fc7e7da2..1e2f4c38 100644 --- a/src/vlib/node.h +++ b/src/vlib/node.h @@ -344,8 +344,8 @@ typedef struct vlib_frame_t /* Number of vector elements currently in frame. */ u16 n_vectors; - /* Owner cpuid / heap id */ - u16 cpu_index; + /* Owner thread / heap id */ + u16 thread_index; /* Scalar and vector arguments to next node. */ u8 arguments[0]; @@ -459,7 +459,7 @@ typedef struct vlib_node_runtime_t zero before first run of this node. */ - u16 cpu_index; /**< CPU this node runs on */ + u16 thread_index; /**< thread this node runs on */ u8 runtime_data[0]; /**< Function dependent node-runtime data. 
This data is diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index 1f7d94e1..54e36874 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -201,9 +201,9 @@ always_inline vlib_frame_t * vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index) { vlib_frame_t *f; - u32 cpu_index = frame_index & VLIB_CPU_MASK; + u32 thread_index = frame_index & VLIB_CPU_MASK; u32 offset = frame_index & VLIB_OFFSET_MASK; - vm = vlib_mains[cpu_index]; + vm = vlib_mains[thread_index]; f = vm->heap_base + offset; return f; } @@ -215,10 +215,10 @@ vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f) ASSERT (((uword) f & VLIB_CPU_MASK) == 0); - vm = vlib_mains[f->cpu_index]; + vm = vlib_mains[f->thread_index]; i = ((u8 *) f - (u8 *) vm->heap_base); - return i | f->cpu_index; + return i | f->thread_index; } always_inline vlib_frame_t * diff --git a/src/vlib/threads.c b/src/vlib/threads.c index ef3a24d3..4a111f8d 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -35,27 +35,12 @@ vl (void *p) vlib_worker_thread_t *vlib_worker_threads; vlib_thread_main_t vlib_thread_main; +__thread uword vlib_thread_index = 0; + uword os_get_cpu_number (void) { - void *sp; - uword n; - u32 len; - - len = vec_len (vlib_thread_stacks); - if (len == 0) - return 0; - - /* Get any old stack address. */ - sp = &sp; - - n = ((uword) sp - (uword) vlib_thread_stacks[0]) - >> VLIB_LOG2_THREAD_STACK_SIZE; - - /* "processes" have their own stacks, and they always run in thread 0 */ - n = n >= len ? 0 : n; - - return n; + return vlib_thread_index; } uword @@ -275,21 +260,6 @@ vlib_thread_init (vlib_main_t * vm) return 0; } -vlib_worker_thread_t * -vlib_alloc_thread (vlib_main_t * vm) -{ - vlib_worker_thread_t *w; - - if (vec_len (vlib_worker_threads) >= vec_len (vlib_thread_stacks)) - { - clib_warning ("out of worker threads... 
Quitting..."); - exit (1); - } - vec_add2 (vlib_worker_threads, w, 1); - w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; - return w; -} - vlib_frame_queue_t * vlib_frame_queue_alloc (int nelts) { @@ -427,7 +397,7 @@ vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index, f64 b4 = vlib_time_now_ticks (vm, before); vlib_worker_thread_barrier_check (vm, b4); /* Bad idea. Dequeue -> enqueue -> dequeue -> trouble */ - // vlib_frame_queue_dequeue (vm->cpu_index, vm, nm); + // vlib_frame_queue_dequeue (vm->thread_index, vm, nm); } elt = fq->elts + (new_tail & (fq->nelts - 1)); @@ -497,6 +467,8 @@ vlib_worker_thread_bootstrap_fn (void *arg) w->lwp = syscall (SYS_gettid); w->thread_id = pthread_self (); + vlib_thread_index = w - vlib_worker_threads; + rv = (void *) clib_calljmp ((uword (*)(uword)) w->thread_function, (uword) arg, w->thread_stack + VLIB_THREAD_STACK_SIZE); @@ -610,7 +582,9 @@ start_workers (vlib_main_t * vm) mheap_alloc (0 /* use VM */ , tr->mheap_size); else w->thread_mheap = main_heap; - w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + + w->thread_stack = + vlib_thread_stack_init (w - vlib_worker_threads); w->thread_function = tr->function; w->thread_function_arg = w; w->instance_id = k; @@ -630,7 +604,7 @@ start_workers (vlib_main_t * vm) vm_clone = clib_mem_alloc (sizeof (*vm_clone)); clib_memcpy (vm_clone, vlib_mains[0], sizeof (*vm_clone)); - vm_clone->cpu_index = worker_thread_index; + vm_clone->thread_index = worker_thread_index; vm_clone->heap_base = w->thread_mheap; vm_clone->mbuf_alloc_list = 0; vm_clone->init_functions_called = @@ -679,7 +653,7 @@ start_workers (vlib_main_t * vm) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy initial runtime_data from node */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, 
n->runtime_data, @@ -692,7 +666,7 @@ start_workers (vlib_main_t * vm) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy initial runtime_data from node */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -756,7 +730,8 @@ start_workers (vlib_main_t * vm) mheap_alloc (0 /* use VM */ , tr->mheap_size); else w->thread_mheap = main_heap; - w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + w->thread_stack = + vlib_thread_stack_init (w - vlib_worker_threads); w->thread_function = tr->function; w->thread_function_arg = w; w->instance_id = j; @@ -827,7 +802,7 @@ vlib_worker_thread_node_runtime_update (void) uword n_calls, uword n_vectors, uword n_clocks); - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (vec_len (vlib_mains) == 1) return; @@ -835,7 +810,7 @@ vlib_worker_thread_node_runtime_update (void) vm = vlib_mains[0]; nm = &vm->node_main; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); ASSERT (*vlib_worker_threads->wait_at_barrier == 1); /* @@ -955,7 +930,7 @@ vlib_worker_thread_node_runtime_update (void) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy runtime_data, will be overwritten later for existing rt */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -981,7 +956,7 @@ vlib_worker_thread_node_runtime_update (void) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy runtime_data, will be overwritten later 
for existing rt */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -1180,7 +1155,7 @@ vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which) if (vlib_mains == 0) return; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); vlib_worker_thread_barrier_sync (vm); switch (which) @@ -1212,7 +1187,7 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm) vlib_worker_threads[0].barrier_sync_count++; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; @@ -1260,7 +1235,7 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm) int vlib_frame_queue_dequeue (vlib_main_t * vm, vlib_frame_queue_main_t * fqm) { - u32 thread_id = vm->cpu_index; + u32 thread_id = vm->thread_index; vlib_frame_queue_t *fq = fqm->vlib_frame_queues[thread_id]; vlib_frame_queue_elt_t *elt; u32 *from, *to; @@ -1393,7 +1368,7 @@ vlib_worker_thread_fn (void *arg) vlib_main_t *vm = vlib_get_main (); clib_error_t *e; - ASSERT (vm->cpu_index == os_get_cpu_number ()); + ASSERT (vm->thread_index == vlib_get_thread_index ()); vlib_worker_thread_init (w); clib_time_init (&vm->clib_time); diff --git a/src/vlib/threads.h b/src/vlib/threads.h index eca4fc26..101d3d4a 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -153,8 +153,6 @@ typedef struct /* Called early, in thread 0's context */ clib_error_t *vlib_thread_init (vlib_main_t * vm); -vlib_worker_thread_t *vlib_alloc_thread (vlib_main_t * vm); - int vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index, u32 frame_queue_index, vlib_frame_t * frame, vlib_frame_queue_msg_type_t type); @@ -183,12 +181,19 @@ u32 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts); void vlib_worker_thread_barrier_sync (vlib_main_t * vm); void vlib_worker_thread_barrier_release (vlib_main_t * vm); +extern __thread uword vlib_thread_index; +static_always_inline uword +vlib_get_thread_index 
(void) +{ + return vlib_thread_index; +} + always_inline void vlib_smp_unsafe_warning (void) { if (CLIB_DEBUG > 0) { - if (os_get_cpu_number ()) + if (vlib_get_thread_index ()) fformat (stderr, "%s: SMP unsafe warning...\n", __FUNCTION__); } } @@ -331,21 +336,21 @@ vlib_num_workers () } always_inline u32 -vlib_get_worker_cpu_index (u32 worker_index) +vlib_get_worker_thread_index (u32 worker_index) { return worker_index + 1; } always_inline u32 -vlib_get_worker_index (u32 cpu_index) +vlib_get_worker_index (u32 thread_index) { - return cpu_index - 1; + return thread_index - 1; } always_inline u32 vlib_get_current_worker_index () { - return os_get_cpu_number () - 1; + return vlib_get_thread_index () - 1; } static inline void @@ -467,6 +472,8 @@ vlib_get_worker_handoff_queue_elt (u32 frame_queue_index, return elt; } +u8 *vlib_thread_stack_init (uword thread_index); + int vlib_thread_cb_register (struct vlib_main_t *vm, vlib_thread_callbacks_t * cb); diff --git a/src/vlib/unix/cj.c b/src/vlib/unix/cj.c index 33ba163a..7c1e9475 100644 --- a/src/vlib/unix/cj.c +++ b/src/vlib/unix/cj.c @@ -48,7 +48,7 @@ cj_log (u32 type, void *data0, void *data1) r = (cj_record_t *) & (cjm->records[new_tail & (cjm->num_records - 1)]); r->time = vlib_time_now (cjm->vlib_main); - r->cpu = os_get_cpu_number (); + r->thread_index = vlib_get_thread_index (); r->type = type; r->data[0] = pointer_to_uword (data0); r->data[1] = pointer_to_uword (data1); @@ -133,7 +133,8 @@ static inline void cj_dump_one_record (cj_record_t * r) { fprintf (stderr, "[%d]: %10.6f T%02d %llx %llx\n", - r->cpu, r->time, r->type, (long long unsigned int) r->data[0], + r->thread_index, r->time, r->type, + (long long unsigned int) r->data[0], (long long unsigned int) r->data[1]); } @@ -161,7 +162,7 @@ cj_dump_internal (u8 filter0_enable, u64 filter0, index = (cjm->tail + 1) & (cjm->num_records - 1); r = &(cjm->records[index]); - if (r->cpu != (u32) ~ 0) + if (r->thread_index != (u32) ~ 0) { /* Yes, dump from tail + 1 to 
the end */ for (i = index; i < cjm->num_records; i++) diff --git a/src/vlib/unix/cj.h b/src/vlib/unix/cj.h index 67626afe..d0a1d46e 100644 --- a/src/vlib/unix/cj.h +++ b/src/vlib/unix/cj.h @@ -23,7 +23,7 @@ typedef struct { f64 time; - u32 cpu; + u32 thread_index; u32 type; u64 data[2]; } cj_record_t; diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index 6b96cc0d..db5ddd64 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -510,13 +510,28 @@ thread0 (uword arg) return i; } +u8 * +vlib_thread_stack_init (uword thread_index) +{ + vec_validate (vlib_thread_stacks, thread_index); + vlib_thread_stacks[thread_index] = clib_mem_alloc_aligned + (VLIB_THREAD_STACK_SIZE, VLIB_THREAD_STACK_SIZE); + + /* + * Disallow writes to the bottom page of the stack, to + * catch stack overflows. + */ + if (mprotect (vlib_thread_stacks[thread_index], + clib_mem_get_page_size (), PROT_READ) < 0) + clib_unix_warning ("thread stack"); + return vlib_thread_stacks[thread_index]; +} + int vlib_unix_main (int argc, char *argv[]) { vlib_main_t *vm = &vlib_global_main; /* one and only time for this! */ - vlib_thread_main_t *tm = &vlib_thread_main; unformat_input_t input; - u8 *thread_stacks; clib_error_t *e; int i; @@ -548,29 +563,9 @@ vlib_unix_main (int argc, char *argv[]) } unformat_free (&input); - /* - * allocate n x VLIB_THREAD_STACK_SIZE stacks, aligned to a - * VLIB_THREAD_STACK_SIZE boundary - * See also: os_get_cpu_number() in vlib/vlib/threads.c - */ - thread_stacks = clib_mem_alloc_aligned - ((uword) tm->n_thread_stacks * VLIB_THREAD_STACK_SIZE, - VLIB_THREAD_STACK_SIZE); - - vec_validate (vlib_thread_stacks, tm->n_thread_stacks - 1); - for (i = 0; i < vec_len (vlib_thread_stacks); i++) - { - vlib_thread_stacks[i] = thread_stacks; - - /* - * Disallow writes to the bottom page of the stack, to - * catch stack overflows. 
- */ - if (mprotect (thread_stacks, clib_mem_get_page_size (), PROT_READ) < 0) - clib_unix_warning ("thread stack"); + vlib_thread_stack_init (0); - thread_stacks += VLIB_THREAD_STACK_SIZE; - } + vlib_thread_index = 0; i = clib_calljmp (thread0, (uword) vm, (void *) (vlib_thread_stacks[0] + diff --git a/src/vnet/adj/adj_l2.c b/src/vnet/adj/adj_l2.c index f68e54e0..20d70dd4 100644 --- a/src/vnet/adj/adj_l2.c +++ b/src/vnet/adj/adj_l2.c @@ -52,7 +52,7 @@ adj_l2_rewrite_inline (vlib_main_t * vm, { u32 * from = vlib_frame_vector_args (frame); u32 n_left_from, n_left_to_next, * to_next, next_index; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); ethernet_main_t * em = &ethernet_main; n_left_from = frame->n_vectors; @@ -93,7 +93,7 @@ adj_l2_rewrite_inline (vlib_main_t * vm, vnet_buffer(p0)->sw_if_index[VLIB_TX] = adj0->rewrite_header.sw_if_index; vlib_increment_combined_counter(&adjacency_counters, - cpu_index, + thread_index, adj_index0, /* packet increment */ 0, /* byte increment */ rw_len0); diff --git a/src/vnet/adj/adj_midchain.c b/src/vnet/adj/adj_midchain.c index e8087f08..5756de43 100644 --- a/src/vnet/adj/adj_midchain.c +++ b/src/vnet/adj/adj_midchain.c @@ -49,7 +49,7 @@ adj_midchain_tx_inline (vlib_main_t * vm, u32 next_index; vnet_main_t *vnm = vnet_get_main (); vnet_interface_main_t *im = &vnm->interface_main; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; /* Vector of buffer / pkt indices we're supposed to process */ from = vlib_frame_vector_args (frame); @@ -124,13 +124,13 @@ adj_midchain_tx_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, adj0->rewrite_header.sw_if_index, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, adj1->rewrite_header.sw_if_index, 1, vlib_buffer_length_in_chain (vm, 
b1)); @@ -181,7 +181,7 @@ adj_midchain_tx_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, adj0->rewrite_header.sw_if_index, 1, vlib_buffer_length_in_chain (vm, b0)); diff --git a/src/vnet/adj/adj_nsh.c b/src/vnet/adj/adj_nsh.c index 9a0f9d8b..128570b0 100644 --- a/src/vnet/adj/adj_nsh.c +++ b/src/vnet/adj/adj_nsh.c @@ -53,7 +53,7 @@ adj_nsh_rewrite_inline (vlib_main_t * vm, { u32 * from = vlib_frame_vector_args (frame); u32 n_left_from, n_left_to_next, * to_next, next_index; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); n_left_from = frame->n_vectors; next_index = node->cached_next_index; @@ -94,7 +94,7 @@ adj_nsh_rewrite_inline (vlib_main_t * vm, vnet_buffer(p0)->ip.save_rewrite_length = rw_len0; vlib_increment_combined_counter(&adjacency_counters, - cpu_index, + thread_index, adj_index0, /* packet increment */ 0, /* byte increment */ rw_len0); diff --git a/src/vnet/classify/vnet_classify.c b/src/vnet/classify/vnet_classify.c index 98842a48..70a189b0 100644 --- a/src/vnet/classify/vnet_classify.c +++ b/src/vnet/classify/vnet_classify.c @@ -251,12 +251,12 @@ static inline void make_working_copy vnet_classify_entry_##size##_t * working_copy##size = 0; foreach_size_in_u32x4; #undef _ - u32 cpu_number = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); - if (cpu_number >= vec_len (t->working_copies)) + if (thread_index >= vec_len (t->working_copies)) { oldheap = clib_mem_set_heap (t->mheap); - vec_validate (t->working_copies, cpu_number); + vec_validate (t->working_copies, thread_index); clib_mem_set_heap (oldheap); } @@ -265,7 +265,7 @@ static inline void make_working_copy * updates from multiple threads will not result in sporadic, spurious * lookup failures. 
*/ - working_copy = t->working_copies[cpu_number]; + working_copy = t->working_copies[thread_index]; t->saved_bucket.as_u64 = b->as_u64; oldheap = clib_mem_set_heap (t->mheap); @@ -290,7 +290,7 @@ static inline void make_working_copy default: abort(); } - t->working_copies[cpu_number] = working_copy; + t->working_copies[thread_index] = working_copy; } _vec_len(working_copy) = (1<<b->log2_pages)*t->entries_per_page; @@ -318,7 +318,7 @@ static inline void make_working_copy working_bucket.offset = vnet_classify_get_offset (t, working_copy); CLIB_MEMORY_BARRIER(); b->as_u64 = working_bucket.as_u64; - t->working_copies[cpu_number] = working_copy; + t->working_copies[thread_index] = working_copy; } static vnet_classify_entry_t * @@ -387,7 +387,7 @@ int vnet_classify_add_del (vnet_classify_table_t * t, int i; u64 hash, new_hash; u32 new_log2_pages; - u32 cpu_number = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u8 * key_minus_skip; ASSERT ((add_v->flags & VNET_CLASSIFY_ENTRY_FREE) == 0); @@ -498,7 +498,7 @@ int vnet_classify_add_del (vnet_classify_table_t * t, new_log2_pages = t->saved_bucket.log2_pages + 1; expand_again: - working_copy = t->working_copies[cpu_number]; + working_copy = t->working_copies[thread_index]; new_v = split_and_rehash (t, working_copy, new_log2_pages); if (new_v == 0) diff --git a/src/vnet/cop/ip4_whitelist.c b/src/vnet/cop/ip4_whitelist.c index 6ef3d7d7..1b5e336b 100644 --- a/src/vnet/cop/ip4_whitelist.c +++ b/src/vnet/cop/ip4_whitelist.c @@ -60,7 +60,7 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm, cop_feature_type_t next_index; cop_main_t *cm = &cop_main; vlib_combined_counter_main_t * vcm = &load_balance_main.lbm_via_counters; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -177,12 +177,12 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm, dpo1 = load_balance_get_bucket_i(lb1, 0); vlib_increment_combined_counter - (vcm, cpu_index, 
lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); vlib_increment_combined_counter - (vcm, cpu_index, lb_index1, 1, + (vcm, thread_index, lb_index1, 1, vlib_buffer_length_in_chain (vm, b1) + sizeof(ethernet_header_t)); @@ -273,7 +273,7 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm, dpo0 = load_balance_get_bucket_i(lb0, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); diff --git a/src/vnet/cop/ip6_whitelist.c b/src/vnet/cop/ip6_whitelist.c index c2e16ccf..f3fe62e3 100644 --- a/src/vnet/cop/ip6_whitelist.c +++ b/src/vnet/cop/ip6_whitelist.c @@ -61,7 +61,7 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm, cop_main_t *cm = &cop_main; ip6_main_t * im6 = &ip6_main; vlib_combined_counter_main_t * vcm = &load_balance_main.lbm_via_counters; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -153,12 +153,12 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm, dpo1 = load_balance_get_bucket_i(lb1, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); vlib_increment_combined_counter - (vcm, cpu_index, lb_index1, 1, + (vcm, thread_index, lb_index1, 1, vlib_buffer_length_in_chain (vm, b1) + sizeof(ethernet_header_t)); @@ -233,7 +233,7 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm, dpo0 = load_balance_get_bucket_i(lb0, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c index ba337f3f..76980102 100644 --- a/src/vnet/devices/af_packet/node.c +++ b/src/vnet/devices/af_packet/node.c @@ -124,7 
+124,7 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, u32 frame_num = apif->rx_req->tp_frame_nr; u8 *block_start = apif->rx_ring + block * block_size; uword n_trace = vlib_get_trace_count (vm, node); - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); u32 min_bufs = apif->rx_req->tp_frame_size / n_buffer_bytes; @@ -132,15 +132,15 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, if (apif->per_interface_next_index != ~0) next_index = apif->per_interface_next_index; - n_free_bufs = vec_len (apm->rx_buffers[cpu_index]); + n_free_bufs = vec_len (apm->rx_buffers[thread_index]); if (PREDICT_FALSE (n_free_bufs < VLIB_FRAME_SIZE)) { - vec_validate (apm->rx_buffers[cpu_index], + vec_validate (apm->rx_buffers[thread_index], VLIB_FRAME_SIZE + n_free_bufs - 1); n_free_bufs += - vlib_buffer_alloc (vm, &apm->rx_buffers[cpu_index][n_free_bufs], + vlib_buffer_alloc (vm, &apm->rx_buffers[thread_index][n_free_bufs], VLIB_FRAME_SIZE); - _vec_len (apm->rx_buffers[cpu_index]) = n_free_bufs; + _vec_len (apm->rx_buffers[thread_index]) = n_free_bufs; } rx_frame = apif->next_rx_frame; @@ -163,11 +163,11 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, { /* grab free buffer */ u32 last_empty_buffer = - vec_len (apm->rx_buffers[cpu_index]) - 1; + vec_len (apm->rx_buffers[thread_index]) - 1; prev_bi0 = bi0; - bi0 = apm->rx_buffers[cpu_index][last_empty_buffer]; + bi0 = apm->rx_buffers[thread_index][last_empty_buffer]; b0 = vlib_get_buffer (vm, bi0); - _vec_len (apm->rx_buffers[cpu_index]) = last_empty_buffer; + _vec_len (apm->rx_buffers[thread_index]) = last_empty_buffer; n_free_bufs--; /* copy data */ @@ -236,9 +236,9 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters 
+ VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), apif->hw_if_index, n_rx_packets, n_rx_bytes); + vlib_get_thread_index (), apif->hw_if_index, n_rx_packets, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, n_rx_packets); + vnet_device_increment_rx_packets (thread_index, n_rx_packets); return n_rx_packets; } diff --git a/src/vnet/devices/devices.c b/src/vnet/devices/devices.c index 41645220..5e5e812c 100644 --- a/src/vnet/devices/devices.c +++ b/src/vnet/devices/devices.c @@ -104,7 +104,7 @@ vnet_device_queue_sort (void *a1, void *a2) void vnet_device_input_assign_thread (u32 hw_if_index, - u16 queue_id, uword cpu_index) + u16 queue_id, uword thread_index) { vnet_main_t *vnm = vnet_get_main (); vnet_device_main_t *vdm = &vnet_device_main; @@ -115,19 +115,19 @@ vnet_device_input_assign_thread (u32 hw_if_index, ASSERT (hw->input_node_index > 0); - if (vdm->first_worker_cpu_index == 0) - cpu_index = 0; + if (vdm->first_worker_thread_index == 0) + thread_index = 0; - if (cpu_index != 0 && - (cpu_index < vdm->first_worker_cpu_index || - cpu_index > vdm->last_worker_cpu_index)) + if (thread_index != 0 && + (thread_index < vdm->first_worker_thread_index || + thread_index > vdm->last_worker_thread_index)) { - cpu_index = vdm->next_worker_cpu_index++; - if (vdm->next_worker_cpu_index > vdm->last_worker_cpu_index) - vdm->next_worker_cpu_index = vdm->first_worker_cpu_index; + thread_index = vdm->next_worker_thread_index++; + if (vdm->next_worker_thread_index > vdm->last_worker_thread_index) + vdm->next_worker_thread_index = vdm->first_worker_thread_index; } - vm = vlib_mains[cpu_index]; + vm = vlib_mains[thread_index]; rt = vlib_node_get_runtime_data (vm, hw->input_node_index); vec_add2 (rt->devices_and_queues, dq, 1); @@ -136,33 +136,33 @@ vnet_device_input_assign_thread (u32 hw_if_index, dq->queue_id = queue_id; vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort); - vec_validate (hw->input_node_cpu_index_by_queue, queue_id); - 
hw->input_node_cpu_index_by_queue[queue_id] = cpu_index; + vec_validate (hw->input_node_thread_index_by_queue, queue_id); + hw->input_node_thread_index_by_queue[queue_id] = thread_index; } static int vnet_device_input_unassign_thread (u32 hw_if_index, u16 queue_id, - uword cpu_index) + uword thread_index) { vnet_main_t *vnm = vnet_get_main (); vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); vnet_device_input_runtime_t *rt; vnet_device_and_queue_t *dq; - uword old_cpu_index; + uword old_thread_index; - if (hw->input_node_cpu_index_by_queue == 0) + if (hw->input_node_thread_index_by_queue == 0) return VNET_API_ERROR_INVALID_INTERFACE; - if (vec_len (hw->input_node_cpu_index_by_queue) < queue_id + 1) + if (vec_len (hw->input_node_thread_index_by_queue) < queue_id + 1) return VNET_API_ERROR_INVALID_INTERFACE; - old_cpu_index = hw->input_node_cpu_index_by_queue[queue_id]; + old_thread_index = hw->input_node_thread_index_by_queue[queue_id]; - if (old_cpu_index == cpu_index) + if (old_thread_index == thread_index) return 0; rt = - vlib_node_get_runtime_data (vlib_mains[old_cpu_index], + vlib_node_get_runtime_data (vlib_mains[old_thread_index], hw->input_node_index); vec_foreach (dq, rt->devices_and_queues) @@ -240,7 +240,7 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, vnet_device_main_t *vdm = &vnet_device_main; u32 hw_if_index = (u32) ~ 0; u32 queue_id = (u32) 0; - u32 cpu_index = (u32) ~ 0; + u32 thread_index = (u32) ~ 0; int rv; if (!unformat_user (input, unformat_line_input, line_input)) @@ -253,10 +253,10 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, ; else if (unformat (line_input, "queue %d", &queue_id)) ; - else if (unformat (line_input, "main", &cpu_index)) - cpu_index = 0; - else if (unformat (line_input, "worker %d", &cpu_index)) - cpu_index += vdm->first_worker_cpu_index; + else if (unformat (line_input, "main", &thread_index)) + thread_index = 0; + else if (unformat (line_input, "worker %d", 
&thread_index)) + thread_index += vdm->first_worker_thread_index; else { error = clib_error_return (0, "parse error: '%U'", @@ -271,16 +271,17 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, if (hw_if_index == (u32) ~ 0) return clib_error_return (0, "please specify valid interface name"); - if (cpu_index > vdm->last_worker_cpu_index) + if (thread_index > vdm->last_worker_thread_index) return clib_error_return (0, "please specify valid worker thread or main"); - rv = vnet_device_input_unassign_thread (hw_if_index, queue_id, cpu_index); + rv = + vnet_device_input_unassign_thread (hw_if_index, queue_id, thread_index); if (rv) return clib_error_return (0, "not found"); - vnet_device_input_assign_thread (hw_if_index, queue_id, cpu_index); + vnet_device_input_assign_thread (hw_if_index, queue_id, thread_index); return 0; } @@ -326,9 +327,9 @@ vnet_device_init (vlib_main_t * vm) tr = p ? (vlib_thread_registration_t *) p[0] : 0; if (tr && tr->count > 0) { - vdm->first_worker_cpu_index = tr->first_index; - vdm->next_worker_cpu_index = tr->first_index; - vdm->last_worker_cpu_index = tr->first_index + tr->count - 1; + vdm->first_worker_thread_index = tr->first_index; + vdm->next_worker_thread_index = tr->first_index; + vdm->last_worker_thread_index = tr->first_index + tr->count - 1; } return 0; } diff --git a/src/vnet/devices/devices.h b/src/vnet/devices/devices.h index bbb29fe3..966f8302 100644 --- a/src/vnet/devices/devices.h +++ b/src/vnet/devices/devices.h @@ -50,9 +50,9 @@ typedef struct typedef struct { vnet_device_per_worker_data_t *workers; - uword first_worker_cpu_index; - uword last_worker_cpu_index; - uword next_worker_cpu_index; + uword first_worker_thread_index; + uword last_worker_thread_index; + uword next_worker_thread_index; } vnet_device_main_t; typedef struct @@ -80,7 +80,7 @@ vnet_set_device_input_node (u32 hw_if_index, u32 node_index) } void vnet_device_input_assign_thread (u32 hw_if_index, u16 queue_id, - uword cpu_index); + uword 
thread_index); static inline u64 vnet_get_aggregate_rx_packets (void) @@ -95,12 +95,12 @@ vnet_get_aggregate_rx_packets (void) } static inline void -vnet_device_increment_rx_packets (u32 cpu_index, u64 count) +vnet_device_increment_rx_packets (u32 thread_index, u64 count) { vnet_device_main_t *vdm = &vnet_device_main; vnet_device_per_worker_data_t *pwd; - pwd = vec_elt_at_index (vdm->workers, cpu_index); + pwd = vec_elt_at_index (vdm->workers, thread_index); pwd->aggregate_rx_packets += count; } @@ -117,9 +117,9 @@ vnet_device_input_set_interrupt_pending (vnet_main_t * vnm, u32 hw_if_index, { vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); - ASSERT (queue_id < vec_len (hw->input_node_cpu_index_by_queue)); - u32 cpu_index = hw->input_node_cpu_index_by_queue[queue_id]; - vlib_node_set_interrupt_pending (vlib_mains[cpu_index], + ASSERT (queue_id < vec_len (hw->input_node_thread_index_by_queue)); + u32 thread_index = hw->input_node_thread_index_by_queue[queue_id]; + vlib_node_set_interrupt_pending (vlib_mains[thread_index], hw->input_node_index); } diff --git a/src/vnet/devices/netmap/node.c b/src/vnet/devices/netmap/node.c index 68ea7832..e120eeae 100644 --- a/src/vnet/devices/netmap/node.c +++ b/src/vnet/devices/netmap/node.c @@ -98,22 +98,22 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_free_bufs; struct netmap_ring *ring; int cur_ring; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); if (nif->per_interface_next_index != ~0) next_index = nif->per_interface_next_index; - n_free_bufs = vec_len (nm->rx_buffers[cpu_index]); + n_free_bufs = vec_len (nm->rx_buffers[thread_index]); if (PREDICT_FALSE (n_free_bufs < VLIB_FRAME_SIZE)) { - vec_validate (nm->rx_buffers[cpu_index], + vec_validate (nm->rx_buffers[thread_index], VLIB_FRAME_SIZE + n_free_bufs - 1); n_free_bufs += - 
vlib_buffer_alloc (vm, &nm->rx_buffers[cpu_index][n_free_bufs], + vlib_buffer_alloc (vm, &nm->rx_buffers[thread_index][n_free_bufs], VLIB_FRAME_SIZE); - _vec_len (nm->rx_buffers[cpu_index]) = n_free_bufs; + _vec_len (nm->rx_buffers[thread_index]) = n_free_bufs; } cur_ring = nif->first_rx_ring; @@ -163,11 +163,11 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t *b0; /* grab free buffer */ u32 last_empty_buffer = - vec_len (nm->rx_buffers[cpu_index]) - 1; + vec_len (nm->rx_buffers[thread_index]) - 1; prev_bi0 = bi0; - bi0 = nm->rx_buffers[cpu_index][last_empty_buffer]; + bi0 = nm->rx_buffers[thread_index][last_empty_buffer]; b0 = vlib_get_buffer (vm, bi0); - _vec_len (nm->rx_buffers[cpu_index]) = last_empty_buffer; + _vec_len (nm->rx_buffers[thread_index]) = last_empty_buffer; n_free_bufs--; /* copy data */ @@ -247,9 +247,9 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), nif->hw_if_index, n_rx_packets, n_rx_bytes); + vlib_get_thread_index (), nif->hw_if_index, n_rx_packets, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, n_rx_packets); + vnet_device_increment_rx_packets (thread_index, n_rx_packets); return n_rx_packets; } @@ -260,7 +260,7 @@ netmap_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, { int i; u32 n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); netmap_main_t *nm = &netmap_main; netmap_if_t *nmi; @@ -269,7 +269,7 @@ netmap_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, nmi = vec_elt_at_index (nm->interfaces, i); if (nmi->is_admin_up && (i % nm->input_cpu_count) == - (cpu_index - nm->input_cpu_first_index)) + (thread_index - nm->input_cpu_first_index)) n_rx_packets += netmap_device_input_fn (vm, node, frame, nmi); } diff --git a/src/vnet/devices/ssvm/node.c 
b/src/vnet/devices/ssvm/node.c index a6c9dfd7..539b4161 100644 --- a/src/vnet/devices/ssvm/node.c +++ b/src/vnet/devices/ssvm/node.c @@ -89,7 +89,7 @@ ssvm_eth_device_input (ssvm_eth_main_t * em, ethernet_header_t *eh0; u16 type0; u32 n_rx_bytes = 0, l3_offset0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 trace_cnt __attribute__ ((unused)) = vlib_get_trace_count (vm, node); volatile u32 *lock; u32 *elt_indices; @@ -284,10 +284,10 @@ out: vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters - + VNET_INTERFACE_COUNTER_RX, cpu_index, + + VNET_INTERFACE_COUNTER_RX, thread_index, intfc->vlib_hw_if_index, rx_queue_index, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, rx_queue_index); + vnet_device_increment_rx_packets (thread_index, rx_queue_index); return rx_queue_index; } diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c index 00807dc0..5e720f65 100644 --- a/src/vnet/devices/virtio/vhost-user.c +++ b/src/vnet/devices/virtio/vhost-user.c @@ -331,7 +331,7 @@ vhost_user_tx_thread_placement (vhost_user_intf_t * vui) { //Let's try to assign one queue to each thread u32 qid = 0; - u32 cpu_index = 0; + u32 thread_index = 0; vui->use_tx_spinlock = 0; while (1) { @@ -341,20 +341,21 @@ vhost_user_tx_thread_placement (vhost_user_intf_t * vui) if (!rxvq->started || !rxvq->enabled) continue; - vui->per_cpu_tx_qid[cpu_index] = qid; - cpu_index++; - if (cpu_index == vlib_get_thread_main ()->n_vlib_mains) + vui->per_cpu_tx_qid[thread_index] = qid; + thread_index++; + if (thread_index == vlib_get_thread_main ()->n_vlib_mains) return; } //We need to loop, meaning the spinlock has to be used vui->use_tx_spinlock = 1; - if (cpu_index == 0) + if (thread_index == 0) { //Could not find a single valid one - for (cpu_index = 0; - cpu_index < vlib_get_thread_main ()->n_vlib_mains; cpu_index++) + for (thread_index = 0; + thread_index < vlib_get_thread_main 
()->n_vlib_mains; + thread_index++) { - vui->per_cpu_tx_qid[cpu_index] = 0; + vui->per_cpu_tx_qid[thread_index] = 0; } return; } @@ -368,7 +369,7 @@ vhost_user_rx_thread_placement () vhost_user_intf_t *vui; vhost_cpu_t *vhc; u32 *workers = 0; - u32 cpu_index; + u32 thread_index; vlib_main_t *vm; //Let's list all workers cpu indexes @@ -400,9 +401,9 @@ vhost_user_rx_thread_placement () continue; i %= vec_len (vui_workers); - cpu_index = vui_workers[i]; + thread_index = vui_workers[i]; i++; - vhc = &vum->cpus[cpu_index]; + vhc = &vum->cpus[thread_index]; iaq.qid = qid; iaq.vhost_iface_index = vui - vum->vhost_user_interfaces; @@ -429,14 +430,14 @@ vhost_user_rx_thread_placement () vhc->operation_mode = mode; } - for (cpu_index = vum->input_cpu_first_index; - cpu_index < vum->input_cpu_first_index + vum->input_cpu_count; - cpu_index++) + for (thread_index = vum->input_cpu_first_index; + thread_index < vum->input_cpu_first_index + vum->input_cpu_count; + thread_index++) { vlib_node_state_t state = VLIB_NODE_STATE_POLLING; - vhc = &vum->cpus[cpu_index]; - vm = vlib_mains ? vlib_mains[cpu_index] : &vlib_global_main; + vhc = &vum->cpus[thread_index]; + vm = vlib_mains ? vlib_mains[thread_index] : &vlib_global_main; switch (vhc->operation_mode) { case VHOST_USER_INTERRUPT_MODE: @@ -532,7 +533,7 @@ vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq) { vhost_user_main_t *vum = &vhost_user_main; vhost_cpu_t *vhc; - u32 cpu_index; + u32 thread_index; vhost_iface_and_queue_t *vhiq; vlib_main_t *vm; u32 ifq2; @@ -553,8 +554,8 @@ vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq) if ((vhiq->vhost_iface_index == (ifq >> 8)) && (VHOST_VRING_IDX_TX (vhiq->qid) == (ifq & 0xff))) { - cpu_index = vhc - vum->cpus; - vm = vlib_mains ? vlib_mains[cpu_index] : &vlib_global_main; + thread_index = vhc - vum->cpus; + vm = vlib_mains ? 
vlib_mains[thread_index] : &vlib_global_main; /* * Convert RX virtqueue number in the lower byte to vring * queue index for the input node process. Top bytes contain @@ -1592,7 +1593,7 @@ vhost_user_if_input (vlib_main_t * vm, u32 n_trace = vlib_get_trace_count (vm, node); u16 qsz_mask; u32 map_hint = 0; - u16 cpu_index = os_get_cpu_number (); + u16 thread_index = vlib_get_thread_index (); u16 copy_len = 0; { @@ -1651,32 +1652,32 @@ vhost_user_if_input (vlib_main_t * vm, * in the loop and come back later. This is not an issue as for big packet, * processing cost really comes from the memory copy. */ - if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len < n_left + 1)) + if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len < n_left + 1)) { - u32 curr_len = vum->cpus[cpu_index].rx_buffers_len; - vum->cpus[cpu_index].rx_buffers_len += + u32 curr_len = vum->cpus[thread_index].rx_buffers_len; + vum->cpus[thread_index].rx_buffers_len += vlib_buffer_alloc_from_free_list (vm, - vum->cpus[cpu_index].rx_buffers + + vum->cpus[thread_index].rx_buffers + curr_len, VHOST_USER_RX_BUFFERS_N - curr_len, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); if (PREDICT_FALSE - (vum->cpus[cpu_index].rx_buffers_len < + (vum->cpus[thread_index].rx_buffers_len < VHOST_USER_RX_BUFFER_STARVATION)) { /* In case of buffer starvation, discard some packets from the queue * and log the event. * We keep doing best effort for the remaining packets. */ - u32 flush = (n_left + 1 > vum->cpus[cpu_index].rx_buffers_len) ? - n_left + 1 - vum->cpus[cpu_index].rx_buffers_len : 1; + u32 flush = (n_left + 1 > vum->cpus[thread_index].rx_buffers_len) ? + n_left + 1 - vum->cpus[thread_index].rx_buffers_len : 1; flush = vhost_user_rx_discard_packet (vm, vui, txvq, flush); n_left -= flush; vlib_increment_simple_counter (vnet_main. 
interface_main.sw_if_counters + VNET_INTERFACE_COUNTER_DROP, - os_get_cpu_number (), + vlib_get_thread_index (), vui->sw_if_index, flush); vlib_error_count (vm, vhost_user_input_node.index, @@ -1696,7 +1697,7 @@ vhost_user_if_input (vlib_main_t * vm, u32 desc_data_offset; vring_desc_t *desc_table = txvq->desc; - if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len <= 1)) + if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len <= 1)) { /* Not enough rx_buffers * Note: We yeld on 1 so we don't need to do an additional @@ -1707,17 +1708,18 @@ vhost_user_if_input (vlib_main_t * vm, } desc_current = txvq->avail->ring[txvq->last_avail_idx & qsz_mask]; - vum->cpus[cpu_index].rx_buffers_len--; - bi_current = (vum->cpus[cpu_index].rx_buffers) - [vum->cpus[cpu_index].rx_buffers_len]; + vum->cpus[thread_index].rx_buffers_len--; + bi_current = (vum->cpus[thread_index].rx_buffers) + [vum->cpus[thread_index].rx_buffers_len]; b_head = b_current = vlib_get_buffer (vm, bi_current); to_next[0] = bi_current; //We do that now so we can forget about bi_current to_next++; n_left_to_next--; vlib_prefetch_buffer_with_index (vm, - (vum->cpus[cpu_index].rx_buffers) - [vum->cpus[cpu_index]. + (vum-> + cpus[thread_index].rx_buffers) + [vum->cpus[thread_index]. rx_buffers_len - 1], LOAD); /* Just preset the used descriptor id and length for later */ @@ -1791,7 +1793,7 @@ vhost_user_if_input (vlib_main_t * vm, (b_current->current_length == VLIB_BUFFER_DATA_SIZE)) { if (PREDICT_FALSE - (vum->cpus[cpu_index].rx_buffers_len == 0)) + (vum->cpus[thread_index].rx_buffers_len == 0)) { /* Cancel speculation */ to_next--; @@ -1805,17 +1807,18 @@ vhost_user_if_input (vlib_main_t * vm, * but valid. 
*/ vhost_user_input_rewind_buffers (vm, - &vum->cpus[cpu_index], + &vum->cpus + [thread_index], b_head); n_left = 0; goto stop; } /* Get next output */ - vum->cpus[cpu_index].rx_buffers_len--; + vum->cpus[thread_index].rx_buffers_len--; u32 bi_next = - (vum->cpus[cpu_index].rx_buffers)[vum->cpus - [cpu_index].rx_buffers_len]; + (vum->cpus[thread_index].rx_buffers)[vum->cpus + [thread_index].rx_buffers_len]; b_current->next_buffer = bi_next; b_current->flags |= VLIB_BUFFER_NEXT_PRESENT; bi_current = bi_next; @@ -1823,7 +1826,7 @@ vhost_user_if_input (vlib_main_t * vm, } /* Prepare a copy order executed later for the data */ - vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; u32 desc_data_l = desc_table[desc_current].len - desc_data_offset; @@ -1880,7 +1883,7 @@ vhost_user_if_input (vlib_main_t * vm, if (PREDICT_FALSE (copy_len >= VHOST_USER_RX_COPY_THRESHOLD)) { if (PREDICT_FALSE - (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_input_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { clib_warning @@ -1905,7 +1908,7 @@ vhost_user_if_input (vlib_main_t * vm, /* Do the memory copies */ if (PREDICT_FALSE - (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_input_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { clib_warning ("Memory mapping error on interface hw_if_index=%d " @@ -1933,9 +1936,9 @@ vhost_user_if_input (vlib_main_t * vm, vlib_increment_combined_counter (vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), vui->sw_if_index, n_rx_packets, n_rx_bytes); + vlib_get_thread_index (), vui->sw_if_index, n_rx_packets, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, n_rx_packets); + vnet_device_increment_rx_packets (thread_index, n_rx_packets); return n_rx_packets; } @@ -1946,15 +1949,15 @@ vhost_user_input (vlib_main_t * vm, { vhost_user_main_t *vum = 
&vhost_user_main; uword n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); vhost_iface_and_queue_t *vhiq; vhost_user_intf_t *vui; vhost_cpu_t *vhc; - vhc = &vum->cpus[cpu_index]; + vhc = &vum->cpus[thread_index]; if (PREDICT_TRUE (vhc->operation_mode == VHOST_USER_POLLING_MODE)) { - vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues) + vec_foreach (vhiq, vum->cpus[thread_index].rx_queues) { vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; n_rx_packets += vhost_user_if_input (vm, vum, vui, vhiq->qid, node); @@ -2096,7 +2099,7 @@ vhost_user_tx (vlib_main_t * vm, vhost_user_vring_t *rxvq; u16 qsz_mask; u8 error; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 map_hint = 0; u8 retry = 8; u16 copy_len; @@ -2116,7 +2119,7 @@ vhost_user_tx (vlib_main_t * vm, qid = VHOST_VRING_IDX_RX (*vec_elt_at_index - (vui->per_cpu_tx_qid, os_get_cpu_number ())); + (vui->per_cpu_tx_qid, vlib_get_thread_index ())); rxvq = &vui->vrings[qid]; if (PREDICT_FALSE (vui->use_tx_spinlock)) vhost_user_vring_lock (vui, qid); @@ -2143,10 +2146,10 @@ retry: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - vum->cpus[cpu_index].current_trace = + vum->cpus[thread_index].current_trace = vlib_add_trace (vm, node, b0, - sizeof (*vum->cpus[cpu_index].current_trace)); - vhost_user_tx_trace (vum->cpus[cpu_index].current_trace, + sizeof (*vum->cpus[thread_index].current_trace)); + vhost_user_tx_trace (vum->cpus[thread_index].current_trace, vui, qid / 2, b0, rxvq); } @@ -2188,14 +2191,14 @@ retry: { // Get a header from the header array virtio_net_hdr_mrg_rxbuf_t *hdr = - &vum->cpus[cpu_index].tx_headers[tx_headers_len]; + &vum->cpus[thread_index].tx_headers[tx_headers_len]; tx_headers_len++; hdr->hdr.flags = 0; hdr->hdr.gso_type = 0; hdr->num_buffers = 1; //This is local, no need to check // Prepare a copy order executed later for the header - vhost_copy_t *cpy = 
&vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; cpy->len = vui->virtio_net_hdr_sz; cpy->dst = buffer_map_addr; @@ -2220,7 +2223,7 @@ retry: else if (vui->virtio_net_hdr_sz == 12) //MRG is available { virtio_net_hdr_mrg_rxbuf_t *hdr = - &vum->cpus[cpu_index].tx_headers[tx_headers_len - 1]; + &vum->cpus[thread_index].tx_headers[tx_headers_len - 1]; //Move from available to used buffer rxvq->used->ring[rxvq->last_used_idx & qsz_mask].id = @@ -2282,7 +2285,7 @@ retry: } { - vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; cpy->len = bytes_left; cpy->len = (cpy->len > buffer_len) ? buffer_len : cpy->len; @@ -2325,8 +2328,8 @@ retry: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - vum->cpus[cpu_index].current_trace->hdr = - vum->cpus[cpu_index].tx_headers[tx_headers_len - 1]; + vum->cpus[thread_index].current_trace->hdr = + vum->cpus[thread_index].tx_headers[tx_headers_len - 1]; } n_left--; //At the end for error counting when 'goto done' is invoked @@ -2336,7 +2339,7 @@ retry: done: //Do the memory copies if (PREDICT_FALSE - (vhost_user_tx_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_tx_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { clib_warning ("Memory mapping error on interface hw_if_index=%d " @@ -2386,7 +2389,7 @@ done3: vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters + VNET_INTERFACE_COUNTER_DROP, - os_get_cpu_number (), vui->sw_if_index, n_left); + vlib_get_thread_index (), vui->sw_if_index, n_left); } vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors); @@ -2773,11 +2776,11 @@ vhost_user_send_interrupt_process (vlib_main_t * vm, case ~0: vec_foreach (vhc, vum->cpus) { - u32 cpu_index = vhc - vum->cpus; + u32 thread_index = vhc - vum->cpus; f64 next_timeout; next_timeout = timeout; - vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues) + 
vec_foreach (vhiq, vum->cpus[thread_index].rx_queues) { vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; vhost_user_vring_t *rxvq = diff --git a/src/vnet/dpo/lookup_dpo.c b/src/vnet/dpo/lookup_dpo.c index e94e871c..97ad0a44 100644 --- a/src/vnet/dpo/lookup_dpo.c +++ b/src/vnet/dpo/lookup_dpo.c @@ -266,7 +266,7 @@ lookup_dpo_ip4_inline (vlib_main_t * vm, int table_from_interface) { u32 n_left_from, next_index, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; from = vlib_frame_vector_args (from_frame); @@ -407,10 +407,10 @@ lookup_dpo_ip4_inline (vlib_main_t * vm, vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b1)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -511,7 +511,7 @@ lookup_dpo_ip4_inline (vlib_main_t * vm, vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -606,7 +606,7 @@ lookup_dpo_ip6_inline (vlib_main_t * vm, { vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; u32 n_left_from, next_index, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -749,10 +749,10 @@ lookup_dpo_ip6_inline (vlib_main_t * vm, vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); 
vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b1)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -853,7 +853,7 @@ lookup_dpo_ip6_inline (vlib_main_t * vm, vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -930,7 +930,7 @@ lookup_dpo_mpls_inline (vlib_main_t * vm, int table_from_interface) { u32 n_left_from, next_index, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; from = vlib_frame_vector_args (from_frame); @@ -994,7 +994,7 @@ lookup_dpo_mpls_inline (vlib_main_t * vm, vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/vnet/dpo/replicate_dpo.c b/src/vnet/dpo/replicate_dpo.c index a9f334be..e25ceae9 100644 --- a/src/vnet/dpo/replicate_dpo.c +++ b/src/vnet/dpo/replicate_dpo.c @@ -627,7 +627,7 @@ replicate_inline (vlib_main_t * vm, vlib_combined_counter_main_t * cm = &replicate_main.repm_counters; replicate_main_t * rm = &replicate_main; u32 n_left_from, * from, * to_next, next_index; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -657,12 +657,12 @@ replicate_inline (vlib_main_t * vm, rep0 = replicate_get(repi0); vlib_increment_combined_counter( - cm, cpu_index, repi0, 1, + cm, thread_index, repi0, 1, vlib_buffer_length_in_chain(vm, b0)); - vec_validate (rm->clones[cpu_index], rep0->rep_n_buckets - 1); + vec_validate (rm->clones[thread_index], 
rep0->rep_n_buckets - 1); - num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[cpu_index], rep0->rep_n_buckets, 128); + num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[thread_index], rep0->rep_n_buckets, 128); if (num_cloned != rep0->rep_n_buckets) { @@ -673,7 +673,7 @@ replicate_inline (vlib_main_t * vm, for (bucket = 0; bucket < num_cloned; bucket++) { - ci0 = rm->clones[cpu_index][bucket]; + ci0 = rm->clones[thread_index][bucket]; c0 = vlib_get_buffer(vm, ci0); to_next[0] = ci0; @@ -700,7 +700,7 @@ replicate_inline (vlib_main_t * vm, vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); } } - vec_reset_length (rm->clones[cpu_index]); + vec_reset_length (rm->clones[thread_index]); } vlib_put_next_frame (vm, node, next_index, n_left_to_next); diff --git a/src/vnet/ethernet/arp.c b/src/vnet/ethernet/arp.c index ee757505..c74a097e 100644 --- a/src/vnet/ethernet/arp.c +++ b/src/vnet/ethernet/arp.c @@ -1771,7 +1771,7 @@ set_ip4_over_ethernet_rpc_callback (vnet_arp_set_ip4_over_ethernet_rpc_args_t * a) { vnet_main_t *vm = vnet_get_main (); - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (a->flags & ETHERNET_ARP_ARGS_REMOVE) vnet_arp_unset_ip4_over_ethernet_internal (vm, a); diff --git a/src/vnet/ethernet/interface.c b/src/vnet/ethernet/interface.c index 9894e3c8..335e3f9f 100644 --- a/src/vnet/ethernet/interface.c +++ b/src/vnet/ethernet/interface.c @@ -362,7 +362,7 @@ simulated_ethernet_interface_tx (vlib_main_t * vm, u32 next_index = VNET_SIMULATED_ETHERNET_TX_NEXT_ETHERNET_INPUT; u32 i, next_node_index, bvi_flag, sw_if_index; u32 n_pkts = 0, n_bytes = 0; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; vnet_main_t *vnm = vnet_get_main (); vnet_interface_main_t *im = &vnm->interface_main; vlib_node_main_t *nm = &vm->node_main; @@ -420,8 +420,9 @@ simulated_ethernet_interface_tx (vlib_main_t * vm, /* increment TX interface stat */ vlib_increment_combined_counter (im->combined_sw_if_counters 
+ - VNET_INTERFACE_COUNTER_TX, cpu_index, - sw_if_index, n_pkts, n_bytes); + VNET_INTERFACE_COUNTER_TX, + thread_index, sw_if_index, n_pkts, + n_bytes); } return n_left_from; diff --git a/src/vnet/ethernet/node.c b/src/vnet/ethernet/node.c index b699e381..f7787ed2 100755 --- a/src/vnet/ethernet/node.c +++ b/src/vnet/ethernet/node.c @@ -291,7 +291,7 @@ ethernet_input_inline (vlib_main_t * vm, vlib_node_runtime_t *error_node; u32 n_left_from, next_index, *from, *to_next; u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 cached_sw_if_index = ~0; u32 cached_is_l2 = 0; /* shut up gcc */ vnet_hw_interface_t *hi = NULL; /* used for main interface only */ @@ -510,7 +510,7 @@ ethernet_input_inline (vlib_main_t * vm, interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, new_sw_if_index0, 1, len0); if (new_sw_if_index1 != old_sw_if_index1 @@ -519,7 +519,7 @@ ethernet_input_inline (vlib_main_t * vm, interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, new_sw_if_index1, 1, len1); @@ -530,7 +530,7 @@ ethernet_input_inline (vlib_main_t * vm, vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = stats_n_bytes = 0; @@ -696,13 +696,13 @@ ethernet_input_inline (vlib_main_t * vm, vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, new_sw_if_index0, 1, len0); + thread_index, new_sw_if_index0, 1, len0); if (stats_n_packets > 0) { vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = stats_n_bytes = 0; } @@ -734,7 +734,7 @@ ethernet_input_inline 
(vlib_main_t * vm, vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/gre/node.c b/src/vnet/gre/node.c index 2683586e..acf15f24 100644 --- a/src/vnet/gre/node.c +++ b/src/vnet/gre/node.c @@ -75,7 +75,7 @@ gre_input (vlib_main_t * vm, u64 cached_tunnel_key6[4]; u32 cached_tunnel_sw_if_index = 0, tunnel_sw_if_index = 0; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 len; vnet_interface_main_t *im = &gm->vnet_main->interface_main; @@ -257,7 +257,7 @@ gre_input (vlib_main_t * vm, len = vlib_buffer_length_in_chain (vm, b0); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, tunnel_sw_if_index, 1 /* packets */, len /* bytes */); @@ -324,7 +324,7 @@ drop0: len = vlib_buffer_length_in_chain (vm, b1); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, tunnel_sw_if_index, 1 /* packets */, len /* bytes */); @@ -502,7 +502,7 @@ drop1: len = vlib_buffer_length_in_chain (vm, b0); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, tunnel_sw_if_index, 1 /* packets */, len /* bytes */); diff --git a/src/vnet/interface.h b/src/vnet/interface.h index a1ea2d61..08f08b10 100644 --- a/src/vnet/interface.h +++ b/src/vnet/interface.h @@ -468,7 +468,7 @@ typedef struct vnet_hw_interface_t u32 input_node_index; /* input node cpu index by queue */ - u32 *input_node_cpu_index_by_queue; + u32 *input_node_thread_index_by_queue; } vnet_hw_interface_t; diff --git a/src/vnet/interface_output.c b/src/vnet/interface_output.c index 03f2cdca..663dc309 100644 --- a/src/vnet/interface_output.c +++ 
b/src/vnet/interface_output.c @@ -196,7 +196,7 @@ slow_path (vlib_main_t * vm, */ static_always_inline void incr_output_stats (vnet_main_t * vnm, - u32 cpu_index, + u32 thread_index, u32 length, u32 sw_if_index, u32 * last_sw_if_index, u32 * n_packets, u32 * n_bytes) @@ -216,7 +216,7 @@ incr_output_stats (vnet_main_t * vnm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, *last_sw_if_index, *n_packets, *n_bytes); } @@ -240,7 +240,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, u32 n_left_to_tx, *from, *from_end, *to_tx; u32 n_bytes, n_buffers, n_packets; u32 last_sw_if_index; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; n_buffers = frame->n_vectors; @@ -266,7 +266,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_TX_ERROR); - vlib_increment_simple_counter (cm, cpu_index, + vlib_increment_simple_counter (cm, thread_index, rt->sw_if_index, n_buffers); return vlib_error_drop_buffers (vm, node, from, /* buffer stride */ 1, @@ -341,18 +341,18 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, from += 1; to_tx += n_buffers; n_left_to_tx -= n_buffers; - incr_output_stats (vnm, cpu_index, n_slow_bytes, + incr_output_stats (vnm, thread_index, n_slow_bytes, vnet_buffer (b)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); } } else { - incr_output_stats (vnm, cpu_index, + incr_output_stats (vnm, thread_index, vlib_buffer_length_in_chain (vm, b0), vnet_buffer (b0)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); - incr_output_stats (vnm, cpu_index, + incr_output_stats (vnm, thread_index, vlib_buffer_length_in_chain (vm, b0), vnet_buffer (b1)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); @@ -396,7 +396,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, to_tx += n_buffers; n_left_to_tx -= n_buffers; } - incr_output_stats 
(vnm, cpu_index, + incr_output_stats (vnm, thread_index, vlib_buffer_length_in_chain (vm, b0), vnet_buffer (b0)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); @@ -408,7 +408,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, } /* Final update of interface stats. */ - incr_output_stats (vnm, cpu_index, 0, ~0, /* ~0 will flush stats */ + incr_output_stats (vnm, thread_index, 0, ~0, /* ~0 will flush stats */ &last_sw_if_index, &n_packets, &n_bytes); return n_buffers; @@ -428,7 +428,7 @@ vnet_interface_output_node (vlib_main_t * vm, u32 n_left_to_tx, *from, *from_end, *to_tx; u32 n_bytes, n_buffers, n_packets; u32 n_bytes_b0, n_bytes_b1, n_bytes_b2, n_bytes_b3; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; vnet_interface_main_t *im = &vnm->interface_main; u32 next_index = VNET_INTERFACE_OUTPUT_NEXT_TX; u32 current_config_index = ~0; @@ -458,7 +458,7 @@ vnet_interface_output_node (vlib_main_t * vm, cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_TX_ERROR); - vlib_increment_simple_counter (cm, cpu_index, + vlib_increment_simple_counter (cm, thread_index, rt->sw_if_index, n_buffers); return vlib_error_drop_buffers (vm, node, from, @@ -558,7 +558,7 @@ vnet_interface_output_node (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif0, 1, + thread_index, tx_swif0, 1, n_bytes_b0); } @@ -567,7 +567,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif1, 1, + thread_index, tx_swif1, 1, n_bytes_b1); } @@ -576,7 +576,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif2, 1, + thread_index, tx_swif2, 1, n_bytes_b2); } if (PREDICT_FALSE (tx_swif3 != rt->sw_if_index)) @@ -584,7 +584,7 @@ 
vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif3, 1, + thread_index, tx_swif3, 1, n_bytes_b3); } } @@ -623,7 +623,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif0, 1, + thread_index, tx_swif0, 1, n_bytes_b0); } } @@ -634,7 +634,7 @@ vnet_interface_output_node (vlib_main_t * vm, /* Update main interface stats. */ vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, rt->sw_if_index, n_packets, n_bytes); return n_buffers; } @@ -893,7 +893,7 @@ process_drop_punt (vlib_main_t * vm, u32 current_sw_if_index, n_errors_current_sw_if_index; u64 current_counter; vlib_simple_counter_main_t *cm; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; static vlib_error_t memory[VNET_ERROR_N_DISPOSITION]; static char memory_init[VNET_ERROR_N_DISPOSITION]; @@ -965,19 +965,19 @@ process_drop_punt (vlib_main_t * vm, current_counter -= 2; n_errors_current_sw_if_index -= 2; - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); /* Increment super-interface drop/punt counters for sub-interfaces. 
*/ sw_if0 = vnet_get_sw_interface (vnm, sw_if_index0); vlib_increment_simple_counter - (cm, cpu_index, sw_if0->sup_sw_if_index, + (cm, thread_index, sw_if0->sup_sw_if_index, sw_if0->sup_sw_if_index != sw_if_index0); sw_if1 = vnet_get_sw_interface (vnm, sw_if_index1); vlib_increment_simple_counter - (cm, cpu_index, sw_if1->sup_sw_if_index, + (cm, thread_index, sw_if1->sup_sw_if_index, sw_if1->sup_sw_if_index != sw_if_index1); em->counters[current_counter_index] = current_counter; @@ -1013,11 +1013,12 @@ process_drop_punt (vlib_main_t * vm, sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; /* Increment drop/punt counters. */ - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); /* Increment super-interface drop/punt counters for sub-interfaces. */ sw_if0 = vnet_get_sw_interface (vnm, sw_if_index0); - vlib_increment_simple_counter (cm, cpu_index, sw_if0->sup_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, + sw_if0->sup_sw_if_index, sw_if0->sup_sw_if_index != sw_if_index0); if (PREDICT_FALSE (e0 != current_error)) @@ -1041,12 +1042,12 @@ process_drop_punt (vlib_main_t * vm, { vnet_sw_interface_t *si; - vlib_increment_simple_counter (cm, cpu_index, current_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, current_sw_if_index, n_errors_current_sw_if_index); si = vnet_get_sw_interface (vnm, current_sw_if_index); if (si->sup_sw_if_index != current_sw_if_index) - vlib_increment_simple_counter (cm, cpu_index, si->sup_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, si->sup_sw_if_index, n_errors_current_sw_if_index); } diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index ee1703e7..fdfe7f63 100644 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -75,7 +75,7 @@ ip4_lookup_inline (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_to_counters; u32 n_left_from, n_left_to_next, *from, 
*to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -292,19 +292,19 @@ ip4_lookup_inline (vlib_main_t * vm, vnet_buffer (p3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lb_index0, 1, + (cm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, p0) + sizeof (ethernet_header_t)); vlib_increment_combined_counter - (cm, cpu_index, lb_index1, 1, + (cm, thread_index, lb_index1, 1, vlib_buffer_length_in_chain (vm, p1) + sizeof (ethernet_header_t)); vlib_increment_combined_counter - (cm, cpu_index, lb_index2, 1, + (cm, thread_index, lb_index2, 1, vlib_buffer_length_in_chain (vm, p2) + sizeof (ethernet_header_t)); vlib_increment_combined_counter - (cm, cpu_index, lb_index3, 1, + (cm, thread_index, lb_index3, 1, vlib_buffer_length_in_chain (vm, p3) + sizeof (ethernet_header_t)); @@ -392,7 +392,7 @@ ip4_lookup_inline (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); from += 1; to_next += 1; @@ -479,7 +479,7 @@ ip4_load_balance (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -584,9 +584,9 @@ ip4_load_balance (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, 
vlib_buffer_length_in_chain (vm, p1)); + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); vlib_validate_buffer_enqueue_x2 (vm, node, next, to_next, n_left_to_next, @@ -639,7 +639,7 @@ ip4_load_balance (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_validate_buffer_enqueue_x1 (vm, node, next, to_next, n_left_to_next, @@ -2330,7 +2330,7 @@ ip4_rewrite_inline (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -2379,9 +2379,9 @@ ip4_rewrite_inline (vlib_main_t * vm, if (do_counters) { vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index0); + thread_index, adj_index0); vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index1); + thread_index, adj_index1); } ip0 = vlib_buffer_get_current (p0); @@ -2527,13 +2527,13 @@ ip4_rewrite_inline (vlib_main_t * vm, { vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index1, 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); } @@ -2618,7 +2618,7 @@ ip4_rewrite_inline (vlib_main_t * vm, if (do_counters) vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index0); + thread_index, adj_index0); /* Guess we are only writing on simple Ethernet header. 
*/ vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t)); @@ -2637,7 +2637,7 @@ ip4_rewrite_inline (vlib_main_t * vm, if (do_counters) vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, 1, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); /* Check MTU of outgoing interface. */ diff --git a/src/vnet/ip/ip4_input.c b/src/vnet/ip/ip4_input.c index ba200a9f..3b08f4b0 100644 --- a/src/vnet/ip/ip4_input.c +++ b/src/vnet/ip/ip4_input.c @@ -85,7 +85,7 @@ ip4_input_inline (vlib_main_t * vm, vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip4_input_node.index); vlib_simple_counter_main_t *cm; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -178,8 +178,8 @@ ip4_input_inline (vlib_main_t * vm, vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); vnet_feature_arc_start (arc1, sw_if_index1, &next1, p1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); /* Punt packets with options or wrong version. */ if (PREDICT_FALSE (ip0->ip_version_and_header_length != 0x45)) @@ -299,7 +299,7 @@ ip4_input_inline (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0; vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); /* Punt packets with options or wrong version. 
*/ if (PREDICT_FALSE (ip0->ip_version_and_header_length != 0x45)) diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c index c120f12c..c2fc4f87 100644 --- a/src/vnet/ip/ip6_forward.c +++ b/src/vnet/ip/ip6_forward.c @@ -74,7 +74,7 @@ ip6_lookup_inline (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_to_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -185,9 +185,9 @@ ip6_lookup_inline (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); from += 2; to_next += 2; @@ -291,7 +291,7 @@ ip6_lookup_inline (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); from += 1; to_next += 1; @@ -703,7 +703,7 @@ ip6_load_balance (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ip6_main_t *im = &ip6_main; from = vlib_frame_vector_args (frame); @@ -824,9 +824,9 @@ ip6_load_balance (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain 
(vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); vlib_validate_buffer_enqueue_x2 (vm, node, next, to_next, n_left_to_next, @@ -886,7 +886,7 @@ ip6_load_balance (vlib_main_t * vm, } vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_validate_buffer_enqueue_x1 (vm, node, next, to_next, n_left_to_next, @@ -1897,7 +1897,7 @@ ip6_rewrite_inline (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -2019,11 +2019,11 @@ ip6_rewrite_inline (vlib_main_t * vm, { vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, 1, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index1, 1, + thread_index, adj_index1, 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); } @@ -2156,7 +2156,7 @@ ip6_rewrite_inline (vlib_main_t * vm, { vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, 1, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); } diff --git a/src/vnet/ip/ip6_input.c b/src/vnet/ip/ip6_input.c index 20306088..ffdc4727 100644 --- a/src/vnet/ip/ip6_input.c +++ b/src/vnet/ip/ip6_input.c @@ -82,7 +82,7 @@ ip6_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip6_input_node.index); vlib_simple_counter_main_t *cm; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -171,8 +171,8 @@ ip6_input (vlib_main_t * vm, 
vlib_node_runtime_t * node, vlib_frame_t * frame) vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); vnet_feature_arc_start (arc1, sw_if_index1, &next1, p1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); error0 = error1 = IP6_ERROR_NONE; @@ -270,7 +270,7 @@ ip6_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0; vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); error0 = IP6_ERROR_NONE; /* Version != 6? Drop it. */ diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c index 5d1fb6f8..2af546df 100644 --- a/src/vnet/ip/ip6_neighbor.c +++ b/src/vnet/ip/ip6_neighbor.c @@ -581,7 +581,7 @@ vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm, u32 next_index; pending_resolution_t *pr, *mc; - if (os_get_cpu_number ()) + if (vlib_get_thread_index ()) { set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address, 1 /* set new neighbor */ , is_static, @@ -722,7 +722,7 @@ vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm, uword *p; int rv = 0; - if (os_get_cpu_number ()) + if (vlib_get_thread_index ()) { set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address, 0 /* unset */ , 0, 0); diff --git a/src/vnet/ipsec/esp.h b/src/vnet/ipsec/esp.h index 50cac806..799003b9 100644 --- a/src/vnet/ipsec/esp.h +++ b/src/vnet/ipsec/esp.h @@ -282,8 +282,8 @@ hmac_calc (ipsec_integ_alg_t alg, u8 * data, int data_len, u8 * signature, u8 use_esn, u32 seq_hi) { esp_main_t *em = &esp_main; - u32 cpu_index = os_get_cpu_number (); - HMAC_CTX *ctx = &(em->per_thread_data[cpu_index].hmac_ctx); + u32 thread_index = vlib_get_thread_index (); 
+ HMAC_CTX *ctx = &(em->per_thread_data[thread_index].hmac_ctx); const EVP_MD *md = NULL; unsigned int len; @@ -292,10 +292,10 @@ hmac_calc (ipsec_integ_alg_t alg, if (PREDICT_FALSE (em->esp_integ_algs[alg].md == 0)) return 0; - if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_integ_alg)) + if (PREDICT_FALSE (alg != em->per_thread_data[thread_index].last_integ_alg)) { md = em->esp_integ_algs[alg].md; - em->per_thread_data[cpu_index].last_integ_alg = alg; + em->per_thread_data[thread_index].last_integ_alg = alg; } HMAC_Init (ctx, key, key_len, md); diff --git a/src/vnet/ipsec/esp_decrypt.c b/src/vnet/ipsec/esp_decrypt.c index 7289b260..925d2b45 100644 --- a/src/vnet/ipsec/esp_decrypt.c +++ b/src/vnet/ipsec/esp_decrypt.c @@ -85,8 +85,8 @@ esp_decrypt_aes_cbc (ipsec_crypto_alg_t alg, u8 * in, u8 * out, size_t in_len, u8 * key, u8 * iv) { esp_main_t *em = &esp_main; - u32 cpu_index = os_get_cpu_number (); - EVP_CIPHER_CTX *ctx = &(em->per_thread_data[cpu_index].decrypt_ctx); + u32 thread_index = vlib_get_thread_index (); + EVP_CIPHER_CTX *ctx = &(em->per_thread_data[thread_index].decrypt_ctx); const EVP_CIPHER *cipher = NULL; int out_len; @@ -95,10 +95,11 @@ esp_decrypt_aes_cbc (ipsec_crypto_alg_t alg, if (PREDICT_FALSE (em->esp_crypto_algs[alg].type == 0)) return; - if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_decrypt_alg)) + if (PREDICT_FALSE + (alg != em->per_thread_data[thread_index].last_decrypt_alg)) { cipher = em->esp_crypto_algs[alg].type; - em->per_thread_data[cpu_index].last_decrypt_alg = alg; + em->per_thread_data[thread_index].last_decrypt_alg = alg; } EVP_DecryptInit_ex (ctx, cipher, NULL, key, iv); @@ -117,11 +118,11 @@ esp_decrypt_node_fn (vlib_main_t * vm, u32 *recycle = 0; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ipsec_alloc_empty_buffers (vm, im); - u32 *empty_buffers = 
im->empty_buffers[cpu_index]; + u32 *empty_buffers = im->empty_buffers[thread_index]; if (PREDICT_FALSE (vec_len (empty_buffers) < n_left_from)) { diff --git a/src/vnet/ipsec/esp_encrypt.c b/src/vnet/ipsec/esp_encrypt.c index 44ae2297..b2bc4e0b 100644 --- a/src/vnet/ipsec/esp_encrypt.c +++ b/src/vnet/ipsec/esp_encrypt.c @@ -88,8 +88,8 @@ esp_encrypt_aes_cbc (ipsec_crypto_alg_t alg, u8 * in, u8 * out, size_t in_len, u8 * key, u8 * iv) { esp_main_t *em = &esp_main; - u32 cpu_index = os_get_cpu_number (); - EVP_CIPHER_CTX *ctx = &(em->per_thread_data[cpu_index].encrypt_ctx); + u32 thread_index = vlib_get_thread_index (); + EVP_CIPHER_CTX *ctx = &(em->per_thread_data[thread_index].encrypt_ctx); const EVP_CIPHER *cipher = NULL; int out_len; @@ -98,10 +98,11 @@ esp_encrypt_aes_cbc (ipsec_crypto_alg_t alg, if (PREDICT_FALSE (em->esp_crypto_algs[alg].type == IPSEC_CRYPTO_ALG_NONE)) return; - if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_encrypt_alg)) + if (PREDICT_FALSE + (alg != em->per_thread_data[thread_index].last_encrypt_alg)) { cipher = em->esp_crypto_algs[alg].type; - em->per_thread_data[cpu_index].last_encrypt_alg = alg; + em->per_thread_data[thread_index].last_encrypt_alg = alg; } EVP_EncryptInit_ex (ctx, cipher, NULL, key, iv); @@ -119,11 +120,11 @@ esp_encrypt_node_fn (vlib_main_t * vm, n_left_from = from_frame->n_vectors; ipsec_main_t *im = &ipsec_main; u32 *recycle = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ipsec_alloc_empty_buffers (vm, im); - u32 *empty_buffers = im->empty_buffers[cpu_index]; + u32 *empty_buffers = im->empty_buffers[thread_index]; if (PREDICT_FALSE (vec_len (empty_buffers) < n_left_from)) { diff --git a/src/vnet/ipsec/ikev2.c b/src/vnet/ipsec/ikev2.c index 2c1074d8..3f9978a7 100644 --- a/src/vnet/ipsec/ikev2.c +++ b/src/vnet/ipsec/ikev2.c @@ -303,16 +303,16 @@ static void ikev2_delete_sa (ikev2_sa_t * sa) { ikev2_main_t *km = &ikev2_main; - u32 cpu_index = os_get_cpu_number (); 
+ u32 thread_index = vlib_get_thread_index (); uword *p; ikev2_sa_free_all_vec (sa); - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, sa->rspi); + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, sa->rspi); if (p) { - hash_unset (km->per_thread_data[cpu_index].sa_by_rspi, sa->rspi); - pool_put (km->per_thread_data[cpu_index].sas, sa); + hash_unset (km->per_thread_data[thread_index].sa_by_rspi, sa->rspi); + pool_put (km->per_thread_data[thread_index].sas, sa); } } @@ -776,29 +776,31 @@ ikev2_initial_contact_cleanup (ikev2_sa_t * sa) ikev2_sa_t *tmp; u32 i, *delete = 0; ikev2_child_sa_t *c; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); if (!sa->initial_contact) return; /* find old IKE SAs with the same authenticated identity */ /* *INDENT-OFF* */ - pool_foreach (tmp, km->per_thread_data[cpu_index].sas, ({ + pool_foreach (tmp, km->per_thread_data[thread_index].sas, ({ if (tmp->i_id.type != sa->i_id.type || vec_len(tmp->i_id.data) != vec_len(sa->i_id.data) || memcmp(sa->i_id.data, tmp->i_id.data, vec_len(sa->i_id.data))) continue; if (sa->rspi != tmp->rspi) - vec_add1(delete, tmp - km->per_thread_data[cpu_index].sas); + vec_add1(delete, tmp - km->per_thread_data[thread_index].sas); })); /* *INDENT-ON* */ for (i = 0; i < vec_len (delete); i++) { - tmp = pool_elt_at_index (km->per_thread_data[cpu_index].sas, delete[i]); - vec_foreach (c, tmp->childs) - ikev2_delete_tunnel_interface (km->vnet_main, tmp, c); + tmp = + pool_elt_at_index (km->per_thread_data[thread_index].sas, delete[i]); + vec_foreach (c, + tmp->childs) ikev2_delete_tunnel_interface (km->vnet_main, + tmp, c); ikev2_delete_sa (tmp); } @@ -1922,10 +1924,10 @@ ikev2_retransmit_sa_init (ike_header_t * ike, { ikev2_main_t *km = &ikev2_main; ikev2_sa_t *sa; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); /* *INDENT-OFF* */ - pool_foreach (sa, km->per_thread_data[cpu_index].sas, ({ + pool_foreach (sa, 
km->per_thread_data[thread_index].sas, ({ if (sa->ispi == clib_net_to_host_u64(ike->ispi) && sa->iaddr.as_u32 == iaddr.as_u32 && sa->raddr.as_u32 == raddr.as_u32) @@ -2036,7 +2038,7 @@ ikev2_node_fn (vlib_main_t * vm, u32 n_left_from, *from, *to_next; ikev2_next_t next_index; ikev2_main_t *km = &ikev2_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -2134,11 +2136,14 @@ ikev2_node_fn (vlib_main_t * vm, if (sa0->state == IKEV2_STATE_SA_INIT) { /* add SA to the pool */ - pool_get (km->per_thread_data[cpu_index].sas, sa0); + pool_get (km->per_thread_data[thread_index].sas, + sa0); clib_memcpy (sa0, &sa, sizeof (*sa0)); - hash_set (km->per_thread_data[cpu_index].sa_by_rspi, + hash_set (km-> + per_thread_data[thread_index].sa_by_rspi, sa0->rspi, - sa0 - km->per_thread_data[cpu_index].sas); + sa0 - + km->per_thread_data[thread_index].sas); } else { @@ -2169,11 +2174,11 @@ ikev2_node_fn (vlib_main_t * vm, if (sa0->state == IKEV2_STATE_SA_INIT) { /* add SA to the pool */ - pool_get (km->per_thread_data[cpu_index].sas, sa0); + pool_get (km->per_thread_data[thread_index].sas, sa0); clib_memcpy (sa0, &sa, sizeof (*sa0)); - hash_set (km->per_thread_data[cpu_index].sa_by_rspi, + hash_set (km->per_thread_data[thread_index].sa_by_rspi, sa0->rspi, - sa0 - km->per_thread_data[cpu_index].sas); + sa0 - km->per_thread_data[thread_index].sas); } else { @@ -2184,12 +2189,13 @@ ikev2_node_fn (vlib_main_t * vm, else if (ike0->exchange == IKEV2_EXCHANGE_IKE_AUTH) { uword *p; - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, clib_net_to_host_u64 (ike0->rspi)); if (p) { - sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas, - p[0]); + sa0 = + pool_elt_at_index (km->per_thread_data[thread_index].sas, + p[0]); r = ikev2_retransmit_resp (sa0, ike0); if (r == 1) @@ -2240,12 +2246,13 @@ ikev2_node_fn 
(vlib_main_t * vm, else if (ike0->exchange == IKEV2_EXCHANGE_INFORMATIONAL) { uword *p; - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, clib_net_to_host_u64 (ike0->rspi)); if (p) { - sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas, - p[0]); + sa0 = + pool_elt_at_index (km->per_thread_data[thread_index].sas, + p[0]); r = ikev2_retransmit_resp (sa0, ike0); if (r == 1) @@ -2305,12 +2312,13 @@ ikev2_node_fn (vlib_main_t * vm, else if (ike0->exchange == IKEV2_EXCHANGE_CREATE_CHILD_SA) { uword *p; - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, clib_net_to_host_u64 (ike0->rspi)); if (p) { - sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas, - p[0]); + sa0 = + pool_elt_at_index (km->per_thread_data[thread_index].sas, + p[0]); r = ikev2_retransmit_resp (sa0, ike0); if (r == 1) diff --git a/src/vnet/ipsec/ipsec.h b/src/vnet/ipsec/ipsec.h index 58f0f145..c884e360 100644 --- a/src/vnet/ipsec/ipsec.h +++ b/src/vnet/ipsec/ipsec.h @@ -324,21 +324,21 @@ int ipsec_set_interface_key (vnet_main_t * vnm, u32 hw_if_index, always_inline void ipsec_alloc_empty_buffers (vlib_main_t * vm, ipsec_main_t * im) { - u32 cpu_index = os_get_cpu_number (); - uword l = vec_len (im->empty_buffers[cpu_index]); + u32 thread_index = vlib_get_thread_index (); + uword l = vec_len (im->empty_buffers[thread_index]); uword n_alloc = 0; if (PREDICT_FALSE (l < VLIB_FRAME_SIZE)) { - if (!im->empty_buffers[cpu_index]) + if (!im->empty_buffers[thread_index]) { - vec_alloc (im->empty_buffers[cpu_index], 2 * VLIB_FRAME_SIZE); + vec_alloc (im->empty_buffers[thread_index], 2 * VLIB_FRAME_SIZE); } - n_alloc = vlib_buffer_alloc (vm, im->empty_buffers[cpu_index] + l, + n_alloc = vlib_buffer_alloc (vm, im->empty_buffers[thread_index] + l, 2 * VLIB_FRAME_SIZE - l); - _vec_len (im->empty_buffers[cpu_index]) = l + n_alloc; + _vec_len 
(im->empty_buffers[thread_index]) = l + n_alloc; } } diff --git a/src/vnet/ipsec/ipsec_if.c b/src/vnet/ipsec/ipsec_if.c index dc882004..ed124894 100644 --- a/src/vnet/ipsec/ipsec_if.c +++ b/src/vnet/ipsec/ipsec_if.c @@ -99,7 +99,7 @@ static int ipsec_add_del_tunnel_if_rpc_callback (ipsec_add_del_tunnel_args_t * a) { vnet_main_t *vnm = vnet_get_main (); - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); return ipsec_add_del_tunnel_if_internal (vnm, a); } diff --git a/src/vnet/l2/l2_bvi.h b/src/vnet/l2/l2_bvi.h index dd1130a6..e21a1616 100644 --- a/src/vnet/l2/l2_bvi.h +++ b/src/vnet/l2/l2_bvi.h @@ -97,7 +97,7 @@ l2_to_bvi (vlib_main_t * vlib_main, vlib_increment_combined_counter (vnet_main->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - vlib_main->cpu_index, + vlib_main->thread_index, vnet_buffer (b0)->sw_if_index[VLIB_RX], 1, vlib_buffer_length_in_chain (vlib_main, b0)); return TO_BVI_ERR_OK; diff --git a/src/vnet/l2/l2_input.c b/src/vnet/l2/l2_input.c index 041ff38d..e5d6878a 100644 --- a/src/vnet/l2/l2_input.c +++ b/src/vnet/l2/l2_input.c @@ -117,7 +117,7 @@ typedef enum static_always_inline void classify_and_dispatch (vlib_main_t * vm, vlib_node_runtime_t * node, - u32 cpu_index, + u32 thread_index, l2input_main_t * msm, vlib_buffer_t * b0, u32 * next0) { /* @@ -237,7 +237,7 @@ l2input_node_inline (vlib_main_t * vm, u32 n_left_from, *from, *to_next; l2input_next_t next_index; l2input_main_t *msm = &l2input_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; /* number of packets to process */ @@ -350,10 +350,10 @@ l2input_node_inline (vlib_main_t * vm, vlib_node_increment_counter (vm, l2input_node.index, L2INPUT_ERROR_L2INPUT, 4); - classify_and_dispatch (vm, node, cpu_index, msm, b0, &next0); - classify_and_dispatch (vm, node, cpu_index, msm, b1, &next1); - classify_and_dispatch (vm, node, cpu_index, 
msm, b2, &next2); - classify_and_dispatch (vm, node, cpu_index, msm, b3, &next3); + classify_and_dispatch (vm, node, thread_index, msm, b0, &next0); + classify_and_dispatch (vm, node, thread_index, msm, b1, &next1); + classify_and_dispatch (vm, node, thread_index, msm, b2, &next2); + classify_and_dispatch (vm, node, thread_index, msm, b3, &next3); /* verify speculative enqueues, maybe switch current next frame */ /* if next0==next1==next_index then nothing special needs to be done */ @@ -393,7 +393,7 @@ l2input_node_inline (vlib_main_t * vm, vlib_node_increment_counter (vm, l2input_node.index, L2INPUT_ERROR_L2INPUT, 1); - classify_and_dispatch (vm, node, cpu_index, msm, b0, &next0); + classify_and_dispatch (vm, node, thread_index, msm, b0, &next0); /* verify speculative enqueue, maybe switch current next frame */ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, diff --git a/src/vnet/l2/l2_output.c b/src/vnet/l2/l2_output.c index 00f22571..e17b2a16 100644 --- a/src/vnet/l2/l2_output.c +++ b/src/vnet/l2/l2_output.c @@ -643,11 +643,11 @@ l2output_create_output_node_mapping (vlib_main_t * vlib_main, vnet_main_t * vnet hw0 = vnet_get_sup_hw_interface (vnet_main, sw_if_index); - uword cpu_number; + uword thread_index; - cpu_number = os_get_cpu_number (); + thread_index = vlib_get_thread_index (); - if (cpu_number) + if (thread_index) { u32 oldflags; diff --git a/src/vnet/l2tp/decap.c b/src/vnet/l2tp/decap.c index e8986935..46104129 100644 --- a/src/vnet/l2tp/decap.c +++ b/src/vnet/l2tp/decap.c @@ -149,7 +149,7 @@ last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, u32 bi) /* per-mapping byte stats include the ethernet header */ vlib_increment_combined_counter (&lm->counter_main, - os_get_cpu_number (), + vlib_get_thread_index (), counter_index, 1 /* packet_increment */ , vlib_buffer_length_in_chain (vm, b) + sizeof (ethernet_header_t)); diff --git a/src/vnet/l2tp/encap.c b/src/vnet/l2tp/encap.c index ed7a9580..dcdfde4b 100644 --- a/src/vnet/l2tp/encap.c +++ 
b/src/vnet/l2tp/encap.c @@ -124,7 +124,7 @@ last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, u32 bi) /* per-mapping byte stats include the ethernet header */ vlib_increment_combined_counter (&lm->counter_main, - os_get_cpu_number (), + vlib_get_thread_index (), counter_index, 1 /* packet_increment */ , vlib_buffer_length_in_chain (vm, b)); diff --git a/src/vnet/l2tp/l2tp.c b/src/vnet/l2tp/l2tp.c index cb94d7e7..3dedc447 100644 --- a/src/vnet/l2tp/l2tp.c +++ b/src/vnet/l2tp/l2tp.c @@ -157,7 +157,7 @@ test_counters_command_fn (vlib_main_t * vm, u32 session_index; u32 counter_index; u32 nincr = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); /* *INDENT-OFF* */ pool_foreach (session, lm->sessions, @@ -167,11 +167,11 @@ test_counters_command_fn (vlib_main_t * vm, session_index_to_counter_index (session_index, SESSION_COUNTER_USER_TO_NETWORK); vlib_increment_combined_counter (&lm->counter_main, - cpu_index, + thread_index, counter_index, 1/*pkt*/, 1111 /*bytes*/); vlib_increment_combined_counter (&lm->counter_main, - cpu_index, + thread_index, counter_index+1, 1/*pkt*/, 2222 /*bytes*/); nincr++; diff --git a/src/vnet/lisp-gpe/decap.c b/src/vnet/lisp-gpe/decap.c index d887a95f..68769710 100644 --- a/src/vnet/lisp-gpe/decap.c +++ b/src/vnet/lisp-gpe/decap.c @@ -103,7 +103,7 @@ next_index_to_iface (lisp_gpe_main_t * lgm, u32 next_index) } static_always_inline void -incr_decap_stats (vnet_main_t * vnm, u32 cpu_index, u32 length, +incr_decap_stats (vnet_main_t * vnm, u32 thread_index, u32 length, u32 sw_if_index, u32 * last_sw_if_index, u32 * n_packets, u32 * n_bytes) { @@ -122,7 +122,7 @@ incr_decap_stats (vnet_main_t * vnm, u32 cpu_index, u32 length, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, *last_sw_if_index, + thread_index, *last_sw_if_index, *n_packets, *n_bytes); } *last_sw_if_index = sw_if_index; @@ -150,11 +150,11 @@ static uword lisp_gpe_input_inline 
(vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, u8 is_v4) { - u32 n_left_from, next_index, *from, *to_next, cpu_index; + u32 n_left_from, next_index, *from, *to_next, thread_index; u32 n_bytes = 0, n_packets = 0, last_sw_if_index = ~0, drops = 0; lisp_gpe_main_t *lgm = vnet_lisp_gpe_get_main (); - cpu_index = os_get_cpu_number (); + thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -267,7 +267,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (si0) { - incr_decap_stats (lgm->vnet_main, cpu_index, + incr_decap_stats (lgm->vnet_main, thread_index, vlib_buffer_length_in_chain (vm, b0), si0[0], &last_sw_if_index, &n_packets, &n_bytes); vnet_buffer (b0)->sw_if_index[VLIB_RX] = si0[0]; @@ -282,7 +282,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (si1) { - incr_decap_stats (lgm->vnet_main, cpu_index, + incr_decap_stats (lgm->vnet_main, thread_index, vlib_buffer_length_in_chain (vm, b1), si1[0], &last_sw_if_index, &n_packets, &n_bytes); vnet_buffer (b1)->sw_if_index[VLIB_RX] = si1[0]; @@ -397,7 +397,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (si0) { - incr_decap_stats (lgm->vnet_main, cpu_index, + incr_decap_stats (lgm->vnet_main, thread_index, vlib_buffer_length_in_chain (vm, b0), si0[0], &last_sw_if_index, &n_packets, &n_bytes); vnet_buffer (b0)->sw_if_index[VLIB_RX] = si0[0]; @@ -430,7 +430,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* flush iface stats */ - incr_decap_stats (lgm->vnet_main, cpu_index, 0, ~0, &last_sw_if_index, + incr_decap_stats (lgm->vnet_main, thread_index, 0, ~0, &last_sw_if_index, &n_packets, &n_bytes); vlib_node_increment_counter (vm, lisp_gpe_ip4_input_node.index, LISP_GPE_ERROR_NO_TUNNEL, drops); diff --git a/src/vnet/lldp/lldp_input.c b/src/vnet/lldp/lldp_input.c index 762743d0..e88f6fdb 100644 --- 
a/src/vnet/lldp/lldp_input.c +++ b/src/vnet/lldp/lldp_input.c @@ -35,7 +35,7 @@ typedef struct static void lldp_rpc_update_peer_cb (const lldp_intf_update_t * a) { - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); lldp_intf_t *n = lldp_get_intf (&lldp_main, a->hw_if_index); if (!n) diff --git a/src/vnet/map/ip4_map.c b/src/vnet/map/ip4_map.c index 1a20d704..e39b6f14 100644 --- a/src/vnet/map/ip4_map.c +++ b/src/vnet/map/ip4_map.c @@ -248,7 +248,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) next_index = node->cached_next_index; map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -377,7 +377,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip40) ? IP4_MAP_NEXT_IP6_REWRITE : next0; vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip6h0->payload_length) + @@ -409,7 +409,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip41) ? IP4_MAP_NEXT_IP6_REWRITE : next1; vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, map_domain_index1, 1, clib_net_to_host_u16 (ip6h1->payload_length) + @@ -520,7 +520,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip40) ? 
IP4_MAP_NEXT_IP6_REWRITE : next0; vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip6h0->payload_length) + @@ -564,7 +564,7 @@ ip4_map_reass (vlib_main_t * vm, next_index = node->cached_next_index; map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 *fragments_to_drop = NULL; u32 *fragments_to_loopback = NULL; @@ -694,8 +694,8 @@ ip4_map_reass (vlib_main_t * vm, { if (error0 == MAP_ERROR_NONE) vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, map_domain_index0, - 1, + thread_index, + map_domain_index0, 1, clib_net_to_host_u16 (ip60->payload_length) + 40); next0 = diff --git a/src/vnet/map/ip4_map_t.c b/src/vnet/map/ip4_map_t.c index b63d76bf..5f2bcbf9 100644 --- a/src/vnet/map/ip4_map_t.c +++ b/src/vnet/map/ip4_map_t.c @@ -477,7 +477,7 @@ ip4_map_t_icmp (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -520,7 +520,7 @@ ip4_map_t_icmp (vlib_main_t * vm, if (PREDICT_TRUE (error0 == MAP_ERROR_NONE)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p0)->map_t. 
map_domain_index, 1, len0); } @@ -1051,7 +1051,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) n_left_from = frame->n_vectors; next_index = node->cached_next_index; vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -1158,7 +1158,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP4_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p0)->map_t. map_domain_index, 1, clib_net_to_host_u16 (ip40-> @@ -1169,7 +1169,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error1 == MAP_ERROR_NONE && next1 != IP4_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p1)->map_t. map_domain_index, 1, clib_net_to_host_u16 (ip41-> @@ -1252,7 +1252,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP4_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p0)->map_t. 
map_domain_index, 1, clib_net_to_host_u16 (ip40-> diff --git a/src/vnet/map/ip6_map.c b/src/vnet/map/ip6_map.c index f7eb768f..63ada962 100644 --- a/src/vnet/map/ip6_map.c +++ b/src/vnet/map/ip6_map.c @@ -172,7 +172,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vlib_node_get_runtime (vm, ip6_map_node.index); map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -319,7 +319,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) IP6_MAP_NEXT_IP4_REWRITE : next0; } vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip40->length)); @@ -352,7 +352,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) IP6_MAP_NEXT_IP4_REWRITE : next1; } vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, map_domain_index1, 1, clib_net_to_host_u16 (ip41->length)); @@ -505,7 +505,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) IP6_MAP_NEXT_IP4_REWRITE : next0; } vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip40->length)); @@ -820,7 +820,7 @@ ip6_map_ip4_reass (vlib_main_t * vm, vlib_node_get_runtime (vm, ip6_map_ip4_reass_node.index); map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 *fragments_to_drop = NULL; u32 *fragments_to_loopback = NULL; @@ -958,8 +958,8 @@ ip6_map_ip4_reass (vlib_main_t * vm, { if (error0 == MAP_ERROR_NONE) vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, map_domain_index0, - 1, + thread_index, + 
map_domain_index0, 1, clib_net_to_host_u16 (ip40->length)); next0 = @@ -1015,7 +1015,7 @@ ip6_map_icmp_relay (vlib_main_t * vm, vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip6_map_icmp_relay_node.index); map_main_t *mm = &map_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u16 *fragment_ids, *fid; from = vlib_frame_vector_args (frame); @@ -1143,7 +1143,8 @@ ip6_map_icmp_relay (vlib_main_t * vm, ip_csum_t sum = ip_incremental_checksum (0, new_icmp40, nlen - 20); new_icmp40->checksum = ~ip_csum_fold (sum); - vlib_increment_simple_counter (&mm->icmp_relayed, cpu_index, 0, 1); + vlib_increment_simple_counter (&mm->icmp_relayed, thread_index, 0, + 1); error: if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/vnet/map/ip6_map_t.c b/src/vnet/map/ip6_map_t.c index eb3996c2..99151678 100644 --- a/src/vnet/map/ip6_map_t.c +++ b/src/vnet/map/ip6_map_t.c @@ -448,7 +448,7 @@ ip6_map_t_icmp (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -493,7 +493,7 @@ ip6_map_t_icmp (vlib_main_t * vm, if (PREDICT_TRUE (error0 == MAP_ERROR_NONE)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p0)-> map_t.map_domain_index, 1, len0); @@ -1051,7 +1051,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip6_map_t_node.index); vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -1218,7 +1218,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == 
MAP_ERROR_NONE && next0 != IP6_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p0)-> map_t.map_domain_index, 1, clib_net_to_host_u16 @@ -1229,7 +1229,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error1 == MAP_ERROR_NONE && next1 != IP6_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p1)-> map_t.map_domain_index, 1, clib_net_to_host_u16 @@ -1403,7 +1403,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP6_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p0)-> map_t.map_domain_index, 1, clib_net_to_host_u16 diff --git a/src/vnet/mpls/mpls_input.c b/src/vnet/mpls/mpls_input.c index 893c4511..1b9bdd05 100644 --- a/src/vnet/mpls/mpls_input.c +++ b/src/vnet/mpls/mpls_input.c @@ -76,7 +76,7 @@ mpls_input_inline (vlib_main_t * vm, u32 n_left_from, next_index, * from, * to_next; mpls_input_runtime_t * rt; mpls_main_t * mm; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); vlib_simple_counter_main_t * cm; vnet_main_t * vnm = vnet_get_main(); @@ -151,7 +151,7 @@ mpls_input_inline (vlib_main_t * vm, next0 = MPLS_INPUT_NEXT_LOOKUP; vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index0, &next0, b0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); } if (PREDICT_FALSE(h1[3] == 0)) @@ -164,7 +164,7 @@ mpls_input_inline (vlib_main_t * vm, next1 = MPLS_INPUT_NEXT_LOOKUP; vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index1, &next1, b1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); } if (PREDICT_FALSE(b0->flags 
& VLIB_BUFFER_IS_TRACED)) @@ -215,7 +215,7 @@ mpls_input_inline (vlib_main_t * vm, { next0 = MPLS_INPUT_NEXT_LOOKUP; vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index0, &next0, b0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); } if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/vnet/mpls/mpls_lookup.c b/src/vnet/mpls/mpls_lookup.c index 475bb204..ace6a70f 100644 --- a/src/vnet/mpls/mpls_lookup.c +++ b/src/vnet/mpls/mpls_lookup.c @@ -67,7 +67,7 @@ mpls_lookup (vlib_main_t * vm, vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; u32 n_left_from, next_index, * from, * to_next; mpls_main_t * mm = &mpls_main; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -220,16 +220,16 @@ mpls_lookup (vlib_main_t * vm, vnet_buffer (b3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b1)); vlib_increment_combined_counter - (cm, cpu_index, lbi2, 1, + (cm, thread_index, lbi2, 1, vlib_buffer_length_in_chain (vm, b2)); vlib_increment_combined_counter - (cm, cpu_index, lbi3, 1, + (cm, thread_index, lbi3, 1, vlib_buffer_length_in_chain (vm, b3)); /* @@ -351,7 +351,7 @@ mpls_lookup (vlib_main_t * vm, vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); /* @@ -440,7 +440,7 @@ mpls_load_balance (vlib_main_t * vm, { vlib_combined_counter_main_t * cm = &load_balance_main.lbm_via_counters; u32 n_left_from, n_left_to_next, * from, * to_next; - u32 cpu_index = 
os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 next; from = vlib_frame_vector_args (frame); @@ -536,10 +536,10 @@ mpls_load_balance (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) @@ -597,7 +597,7 @@ mpls_load_balance (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_validate_buffer_enqueue_x1 (vm, node, next, diff --git a/src/vnet/mpls/mpls_output.c b/src/vnet/mpls/mpls_output.c index 08018fd1..d90dec21 100644 --- a/src/vnet/mpls/mpls_output.c +++ b/src/vnet/mpls/mpls_output.c @@ -64,12 +64,12 @@ mpls_output_inline (vlib_main_t * vm, vlib_frame_t * from_frame, int is_midchain) { - u32 n_left_from, next_index, * from, * to_next, cpu_index; + u32 n_left_from, next_index, * from, * to_next, thread_index; vlib_node_runtime_t * error_node; u32 n_left_to_next; mpls_main_t *mm; - cpu_index = os_get_cpu_number(); + thread_index = vlib_get_thread_index(); error_node = vlib_node_get_runtime (vm, mpls_output_node.index); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -137,13 +137,13 @@ mpls_output_inline (vlib_main_t * vm, /* Bump the adj counters for packet and bytes */ vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index1, 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); @@ -245,7 +245,7 @@ mpls_output_inline (vlib_main_t * vm, 
vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); diff --git a/src/vnet/pg/input.c b/src/vnet/pg/input.c index 2649798b..597ae060 100644 --- a/src/vnet/pg/input.c +++ b/src/vnet/pg/input.c @@ -893,7 +893,7 @@ pg_generate_set_lengths (pg_main_t * pg, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), + vlib_get_thread_index (), si->sw_if_index, n_buffers, length_sum); } @@ -1266,7 +1266,7 @@ pg_stream_fill_helper (pg_main_t * pg, l += vlib_buffer_index_length_in_chain (vm, buffers[i]); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), + vlib_get_thread_index (), si->sw_if_index, n_alloc, l); s->current_replay_packet_index += n_alloc; s->current_replay_packet_index %= diff --git a/src/vnet/replication.c b/src/vnet/replication.c index 86d922b5..233a8c2f 100644 --- a/src/vnet/replication.c +++ b/src/vnet/replication.c @@ -31,16 +31,16 @@ replication_prep (vlib_main_t * vm, { replication_main_t *rm = &replication_main; replication_context_t *ctx; - uword cpu_number = vm->cpu_index; + uword thread_index = vm->thread_index; ip4_header_t *ip; u32 ctx_id; /* Allocate a context, reserve context 0 */ - if (PREDICT_FALSE (rm->contexts[cpu_number] == 0)) - pool_get_aligned (rm->contexts[cpu_number], ctx, CLIB_CACHE_LINE_BYTES); + if (PREDICT_FALSE (rm->contexts[thread_index] == 0)) + pool_get_aligned (rm->contexts[thread_index], ctx, CLIB_CACHE_LINE_BYTES); - pool_get_aligned (rm->contexts[cpu_number], ctx, CLIB_CACHE_LINE_BYTES); - ctx_id = ctx - rm->contexts[cpu_number]; + pool_get_aligned (rm->contexts[thread_index], ctx, CLIB_CACHE_LINE_BYTES); + ctx_id = ctx - rm->contexts[thread_index]; /* Save state from vlib buffer */ ctx->saved_free_list_index = b0->free_list_index; @@ -94,11 +94,11 @@ replication_recycle (vlib_main_t * vm, vlib_buffer_t * b0, u32 
is_last) { replication_main_t *rm = &replication_main; replication_context_t *ctx; - uword cpu_number = vm->cpu_index; + uword thread_index = vm->thread_index; ip4_header_t *ip; /* Get access to the replication context */ - ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count); + ctx = pool_elt_at_index (rm->contexts[thread_index], b0->recycle_count); /* Restore vnet buffer state */ clib_memcpy (vnet_buffer (b0), ctx->vnet_buffer, @@ -133,7 +133,7 @@ replication_recycle (vlib_main_t * vm, vlib_buffer_t * b0, u32 is_last) b0->flags &= ~VLIB_BUFFER_RECYCLE; /* Free context back to its pool */ - pool_put (rm->contexts[cpu_number], ctx); + pool_put (rm->contexts[thread_index], ctx); } return ctx; @@ -160,7 +160,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) replication_main_t *rm = &replication_main; replication_context_t *ctx; u32 feature_node_index = 0; - uword cpu_number = vm->cpu_index; + uword thread_index = vm->thread_index; /* * All buffers in the list are destined to the same recycle node. @@ -172,7 +172,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) { bi0 = fl->buffers[0]; b0 = vlib_get_buffer (vm, bi0); - ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count); + ctx = pool_elt_at_index (rm->contexts[thread_index], b0->recycle_count); feature_node_index = ctx->recycle_node_index; } diff --git a/src/vnet/replication.h b/src/vnet/replication.h index 5dc554c9..ce4b3ff1 100644 --- a/src/vnet/replication.h +++ b/src/vnet/replication.h @@ -100,7 +100,7 @@ replication_get_ctx (vlib_buffer_t * b0) replication_main_t *rm = &replication_main; return replication_is_recycled (b0) ? 
- pool_elt_at_index (rm->contexts[os_get_cpu_number ()], + pool_elt_at_index (rm->contexts[vlib_get_thread_index ()], b0->recycle_count) : 0; } diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index b86e87d9..dd211c51 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -311,7 +311,7 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, unix_shared_memory_queue_t *q; application_t *app; int n_tx_packets = 0; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; int i, rv; f64 now = vlib_time_now (vm); diff --git a/src/vnet/sr/sr_localsid.c b/src/vnet/sr/sr_localsid.c index 2e3d56de..6d72a506 100755 --- a/src/vnet/sr/sr_localsid.c +++ b/src/vnet/sr/sr_localsid.c @@ -887,7 +887,7 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node, from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -974,26 +974,26 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter (((next1 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls1 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b1)); + &(sm->sr_ls_valid_counters)), thread_index, ls1 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b1)); vlib_increment_combined_counter (((next2 == SR_LOCALSID_NEXT_ERROR) ? 
&(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls2 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b2)); + &(sm->sr_ls_valid_counters)), thread_index, ls2 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b2)); vlib_increment_combined_counter (((next3 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls3 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b3)); + &(sm->sr_ls_valid_counters)), thread_index, ls3 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b3)); vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next, n_left_to_next, bi0, bi1, bi2, bi3, @@ -1062,8 +1062,8 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); @@ -1103,7 +1103,7 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node, from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -1205,26 +1205,26 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter (((next1 == SR_LOCALSID_NEXT_ERROR) ? 
&(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls1 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b1)); + &(sm->sr_ls_valid_counters)), thread_index, ls1 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b1)); vlib_increment_combined_counter (((next2 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls2 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b2)); + &(sm->sr_ls_valid_counters)), thread_index, ls2 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b2)); vlib_increment_combined_counter (((next3 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls3 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b3)); + &(sm->sr_ls_valid_counters)), thread_index, ls3 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b3)); vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next, n_left_to_next, bi0, bi1, bi2, bi3, @@ -1295,8 +1295,8 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? 
&(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index e3705060..c1567aa0 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -174,7 +174,7 @@ tclient_thread_fn (void *arg) pthread_sigmask (SIG_SETMASK, &s, 0); } - clib_per_cpu_mheaps[os_get_cpu_number ()] = clib_per_cpu_mheaps[0]; + clib_per_cpu_mheaps[vlib_get_thread_index ()] = clib_per_cpu_mheaps[0]; while (1) { diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index b2a371e2..b6c34828 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -646,10 +646,10 @@ const static transport_proto_vft_t tcp6_proto = { void tcp_timer_keep_handler (u32 conn_index) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; - tc = tcp_connection_get (conn_index, cpu_index); + tc = tcp_connection_get (conn_index, thread_index); tc->timers[TCP_TIMER_KEEP] = TCP_TIMER_HANDLE_INVALID; tcp_connection_close (tc); @@ -675,10 +675,10 @@ tcp_timer_establish_handler (u32 conn_index) void tcp_timer_waitclose_handler (u32 conn_index) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; - tc = tcp_connection_get (conn_index, cpu_index); + tc = tcp_connection_get (conn_index, thread_index); tc->timers[TCP_TIMER_WAITCLOSE] = TCP_TIMER_HANDLE_INVALID; /* Session didn't come back with a close(). 
Send FIN either way diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 0090e15e..eaca672c 100644 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -343,7 +343,7 @@ typedef enum _tcp_dbg_evt } \ else \ { \ - u32 _thread_index = os_get_cpu_number (); \ + u32 _thread_index = vlib_get_thread_index (); \ _tc = tcp_connection_get (_tc_index, _thread_index); \ } \ ELOG_TYPE_DECLARE (_e) = \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index a8224dc2..7e9fa47b 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1142,7 +1142,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; tcp_main_t *tm = vnet_get_tcp_main (); from = vlib_frame_vector_args (from_frame); @@ -1332,7 +1332,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { tcp_main_t *tm = vnet_get_tcp_main (); u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; u8 sst = is_ip4 ? 
SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; from = vlib_frame_vector_args (from_frame); @@ -1634,7 +1634,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { tcp_main_t *tm = vnet_get_tcp_main (); u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1989,7 +1989,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; tcp_main_t *tm = vnet_get_tcp_main (); u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; @@ -2243,7 +2243,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; tcp_main_t *tm = vnet_get_tcp_main (); from = vlib_frame_vector_args (from_frame); diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index ea157bd7..e18bfad7 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -387,8 +387,8 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 cpu_index = os_get_cpu_number(); \ - my_tx_buffers = tm->tx_buffers[cpu_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ n_free_buffers = 32; /* TODO config or macro */ \ @@ -396,7 +396,7 @@ do { \ _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ tm->vlib_main, my_tx_buffers, n_free_buffers, \ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[cpu_index] = 
my_tx_buffers; \ + tm->tx_buffers[thread_index] = my_tx_buffers; \ } \ /* buffer shortage */ \ if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ @@ -408,8 +408,8 @@ do { \ #define tcp_return_buffer(tm) \ do { \ u32 *my_tx_buffers; \ - u32 cpu_index = os_get_cpu_number(); \ - my_tx_buffers = tm->tx_buffers[cpu_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ _vec_len (my_tx_buffers) +=1; \ } while (0) @@ -942,7 +942,7 @@ tcp_send_ack (tcp_connection_t * tc) void tcp_timer_delack_handler (u32 index) { - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; tc = tcp_connection_get (index, thread_index); @@ -1022,7 +1022,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; u32 bi, snd_space, n_bytes; @@ -1152,7 +1152,7 @@ tcp_timer_persist_handler (u32 index) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; u32 bi, n_bytes; @@ -1313,7 +1313,7 @@ tcp46_output_inline (vlib_main_t * vm, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1524,7 +1524,7 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, u8 is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; diff --git 
a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c index 4b22109b..810278e6 100644 --- a/src/vnet/udp/udp_input.c +++ b/src/vnet/udp/udp_input.c @@ -70,7 +70,7 @@ udp4_uri_input_node_fn (vlib_main_t * vm, udp4_uri_input_next_t next_index; udp_uri_main_t *um = vnet_get_udp_main (); session_manager_main_t *smm = vnet_get_session_manager_main (); - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; u8 my_enqueue_epoch; u32 *session_indices_to_enqueue; static u32 serial_number; diff --git a/src/vnet/unix/tapcli.c b/src/vnet/unix/tapcli.c index fb1a8bac..0fc62f6c 100644 --- a/src/vnet/unix/tapcli.c +++ b/src/vnet/unix/tapcli.c @@ -366,7 +366,7 @@ static uword tapcli_rx_iface(vlib_main_t * vm, vlib_increment_combined_counter ( vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number(), ti->sw_if_index, + vlib_get_thread_index(), ti->sw_if_index, 1, n_bytes_in_packet); if (PREDICT_FALSE(n_trace > 0)) { diff --git a/src/vnet/unix/tuntap.c b/src/vnet/unix/tuntap.c index 2cfcc92f..ac674653 100644 --- a/src/vnet/unix/tuntap.c +++ b/src/vnet/unix/tuntap.c @@ -189,7 +189,7 @@ tuntap_tx (vlib_main_t * vm, /* Update tuntap interface output stats. 
*/ vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - vm->cpu_index, + vm->thread_index, tm->sw_if_index, n_packets, n_bytes); @@ -297,7 +297,7 @@ tuntap_rx (vlib_main_t * vm, vlib_increment_combined_counter (vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number(), + vlib_get_thread_index(), tm->sw_if_index, 1, n_bytes_in_packet); diff --git a/src/vnet/vxlan-gpe/decap.c b/src/vnet/vxlan-gpe/decap.c index 22ab4b62..d4fe4231 100644 --- a/src/vnet/vxlan-gpe/decap.c +++ b/src/vnet/vxlan-gpe/decap.c @@ -115,7 +115,7 @@ vxlan_gpe_input (vlib_main_t * vm, vxlan4_gpe_tunnel_key_t last_key4; vxlan6_gpe_tunnel_key_t last_key6; u32 pkts_decapsulated = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; if (is_ip4) @@ -342,7 +342,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; stats_sw_if_index = sw_if_index0; @@ -427,7 +427,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len1; stats_sw_if_index = sw_if_index1; @@ -588,7 +588,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; 
stats_sw_if_index = sw_if_index0; @@ -615,7 +615,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) { vlib_increment_combined_counter ( - im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, cpu_index, + im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/vxlan-gpe/encap.c b/src/vnet/vxlan-gpe/encap.c index 3a486e56..67ed94b4 100644 --- a/src/vnet/vxlan-gpe/encap.c +++ b/src/vnet/vxlan-gpe/encap.c @@ -151,7 +151,7 @@ vxlan_gpe_encap (vlib_main_t * vm, vnet_main_t * vnm = ngm->vnet_main; vnet_interface_main_t * im = &vnm->interface_main; u32 pkts_encapsulated = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; from = vlib_frame_vector_args (from_frame); @@ -253,7 +253,7 @@ vxlan_gpe_encap (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_sw_if_index = sw_if_index0; stats_n_packets = 2; stats_n_bytes = len0 + len1; @@ -262,10 +262,10 @@ vxlan_gpe_encap (vlib_main_t * vm, { vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index0, 1, len0); + thread_index, sw_if_index0, 1, len0); vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index1, 1, len1); + thread_index, sw_if_index1, 1, len1); } } @@ -335,7 +335,7 @@ vxlan_gpe_encap (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); 
stats_n_packets = 1; stats_n_bytes = len0; stats_sw_if_index = sw_if_index0; @@ -359,7 +359,7 @@ vxlan_gpe_encap (vlib_main_t * vm, if (stats_n_packets) { vlib_increment_combined_counter ( - im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, cpu_index, + im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/vxlan/decap.c b/src/vnet/vxlan/decap.c index 514b2c99..2acb1f6f 100644 --- a/src/vnet/vxlan/decap.c +++ b/src/vnet/vxlan/decap.c @@ -81,7 +81,7 @@ vxlan_input (vlib_main_t * vm, vxlan4_tunnel_key_t last_key4; vxlan6_tunnel_key_t last_key6; u32 pkts_decapsulated = 0; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; if (is_ip4) @@ -314,7 +314,7 @@ vxlan_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; @@ -468,7 +468,7 @@ vxlan_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len1; @@ -674,7 +674,7 @@ vxlan_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; @@ -711,7 +711,7 @@ vxlan_input (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + 
thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/vxlan/encap.c b/src/vnet/vxlan/encap.c index 5b63064a..4cfbbc23 100644 --- a/src/vnet/vxlan/encap.c +++ b/src/vnet/vxlan/encap.c @@ -77,7 +77,7 @@ vxlan_encap_inline (vlib_main_t * vm, vnet_interface_main_t * im = &vnm->interface_main; u32 pkts_encapsulated = 0; u16 old_l0 = 0, old_l1 = 0; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; u32 sw_if_index0 = 0, sw_if_index1 = 0; u32 next0 = 0, next1 = 0; @@ -301,7 +301,7 @@ vxlan_encap_inline (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_sw_if_index = sw_if_index0; stats_n_packets = 2; @@ -311,10 +311,10 @@ vxlan_encap_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index0, 1, len0); + thread_index, sw_if_index0, 1, len0); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index1, 1, len1); + thread_index, sw_if_index1, 1, len1); } } @@ -464,7 +464,7 @@ vxlan_encap_inline (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; @@ -496,7 +496,7 @@ vxlan_encap_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff 
--git a/src/vpp/stats/stats.c b/src/vpp/stats/stats.c index 042d02e2..4309cd51 100644 --- a/src/vpp/stats/stats.c +++ b/src/vpp/stats/stats.c @@ -66,14 +66,14 @@ _(VNET_IP6_NBR_COUNTERS, vnet_ip6_nbr_counters) void dslock (stats_main_t * sm, int release_hint, int tag) { - u32 thread_id; + u32 thread_index; data_structure_lock_t *l = sm->data_structure_lock; if (PREDICT_FALSE (l == 0)) return; - thread_id = os_get_cpu_number (); - if (l->lock && l->thread_id == thread_id) + thread_index = vlib_get_thread_index (); + if (l->lock && l->thread_index == thread_index) { l->count++; return; @@ -85,7 +85,7 @@ dslock (stats_main_t * sm, int release_hint, int tag) while (__sync_lock_test_and_set (&l->lock, 1)) /* zzzz */ ; l->tag = tag; - l->thread_id = thread_id; + l->thread_index = thread_index; l->count = 1; } @@ -99,14 +99,14 @@ stats_dslock_with_hint (int hint, int tag) void dsunlock (stats_main_t * sm) { - u32 thread_id; + u32 thread_index; data_structure_lock_t *l = sm->data_structure_lock; if (PREDICT_FALSE (l == 0)) return; - thread_id = os_get_cpu_number (); - ASSERT (l->lock && l->thread_id == thread_id); + thread_index = vlib_get_thread_index (); + ASSERT (l->lock && l->thread_index == thread_index); l->count--; if (l->count == 0) { diff --git a/src/vpp/stats/stats.h b/src/vpp/stats/stats.h index 118115be..024dc78e 100644 --- a/src/vpp/stats/stats.h +++ b/src/vpp/stats/stats.h @@ -30,7 +30,7 @@ typedef struct { volatile u32 lock; volatile u32 release_hint; - u32 thread_id; + u32 thread_index; u32 count; int tag; } data_structure_lock_t; -- cgit 1.2.3-korg From 0f26c5a0138ac86d7ebd197c31a09d8d624c35fe Mon Sep 17 00:00:00 2001 From: Neale Ranns Date: Wed, 1 Mar 2017 15:12:11 -0800 Subject: MPLS Mcast 1 - interface-DPO Used in the Data-plane to change a packet's input interface 2 - MPLS multicast FIB entry Same as a unicast entry but it links to a replicate not a load-balance DPO 3 - Multicast MPLS tunnel Update MPLS tunnels to use a FIB path-list to describe the 
endpoint[s]. Use the path-list to generate the forwarding chain (DPOs) to link to . 4 - Resolve a path via a local label (of an mLDP LSP) For IP multicast entries to use an LSP in the replication list, we need to describe the 'resolve-via-label' where the label is that of a multicast LSP. 5 - MPLS disposition path sets RPF-ID For an interface-less LSP (i.e. mLDP not RSVP-TE) at the tail of the LSP we still need to perform an RPF check. An MPLS disposition DPO performs the MPLS pop validation checks and sets the RPF-ID in the packet. 6 - RPF check with per-entry RPF-ID An RPF-ID is used instead of a real interface SW if index in the case the IP traffic arrives from an LSP that does not have an associated interface. Change-Id: Ib92e177be919147bafeb599729abf3d1abc2f4b3 Signed-off-by: Neale Ranns --- src/plugins/dpdk/device/node.c | 2 +- src/vat/api_format.c | 133 ++--- src/vnet.am | 2 + src/vnet/adj/adj.c | 13 +- src/vnet/adj/adj.h | 6 + src/vnet/adj/adj_internal.h | 14 +- src/vnet/adj/adj_mcast.c | 134 ++++- src/vnet/adj/adj_mcast.h | 27 + src/vnet/adj/adj_midchain.c | 62 ++- src/vnet/adj/adj_nbr.c | 2 - src/vnet/buffer.h | 3 + src/vnet/devices/ssvm/node.c | 2 +- src/vnet/dhcp/dhcp6_proxy_node.c | 1 + src/vnet/dpo/dpo.c | 10 + src/vnet/dpo/dpo.h | 8 +- src/vnet/dpo/interface_dpo.c | 416 ++++++++++++++++ src/vnet/dpo/interface_dpo.h | 67 +++ src/vnet/dpo/lookup_dpo.c | 211 +++++++- src/vnet/dpo/lookup_dpo.h | 20 + src/vnet/dpo/mpls_disposition.c | 364 ++++++++++++++ src/vnet/dpo/mpls_disposition.h | 85 ++++ src/vnet/dpo/mpls_label_dpo.c | 6 +- src/vnet/dpo/replicate_dpo.c | 48 +- src/vnet/dpo/replicate_dpo.h | 2 + src/vnet/ethernet/arp.c | 1 + src/vnet/ethernet/interface.c | 2 +- src/vnet/ethernet/node.c | 4 +- src/vnet/ethernet/types.def | 4 +- src/vnet/fib/fib_api.h | 4 + src/vnet/fib/fib_entry.c | 47 +- src/vnet/fib/fib_entry.h | 13 +- src/vnet/fib/fib_entry_src.c | 154 +++--- src/vnet/fib/fib_internal.h | 1 + src/vnet/fib/fib_path.c | 222 +++++++--- 
src/vnet/fib/fib_path.h | 17 +- src/vnet/fib/fib_path_ext.c | 4 +- src/vnet/fib/fib_path_ext.h | 3 +- src/vnet/fib/fib_path_list.c | 270 ++++++---- src/vnet/fib/fib_path_list.h | 22 +- src/vnet/fib/fib_table.c | 47 +- src/vnet/fib/fib_test.c | 345 +++++++++++-- src/vnet/fib/fib_test.h | 111 +++++ src/vnet/fib/fib_types.c | 15 +- src/vnet/fib/fib_types.h | 60 ++- src/vnet/fib/mpls_fib.c | 15 +- src/vnet/handoff.h | 10 +- src/vnet/interface.c | 2 +- src/vnet/ip/ip.api | 3 + src/vnet/ip/ip4_forward.c | 20 + src/vnet/ip/ip6_forward.c | 23 + src/vnet/ip/ip6_neighbor.c | 1 + src/vnet/ip/ip_api.c | 98 ++-- src/vnet/ip/lookup.c | 3 +- src/vnet/lisp-gpe/lisp_gpe_fwd_entry.c | 1 + src/vnet/mfib/ip4_mfib.c | 1 + src/vnet/mfib/ip6_mfib.c | 1 + src/vnet/mfib/mfib_entry.c | 395 +++++++++------ src/vnet/mfib/mfib_entry.h | 20 +- src/vnet/mfib/mfib_forward.c | 29 +- src/vnet/mfib/mfib_table.c | 8 +- src/vnet/mfib/mfib_table.h | 1 + src/vnet/mfib/mfib_test.c | 127 ++++- src/vnet/mpls/mpls.api | 87 ++-- src/vnet/mpls/mpls.c | 17 +- src/vnet/mpls/mpls_api.c | 97 ++-- src/vnet/mpls/mpls_input.c | 2 +- src/vnet/mpls/mpls_lookup.c | 236 ++++++--- src/vnet/mpls/mpls_tunnel.c | 883 ++++++++++++++++++++++----------- src/vnet/mpls/mpls_tunnel.h | 57 ++- src/vnet/mpls/mpls_types.h | 20 + src/vnet/srp/interface.c | 2 +- test/test_ip_mcast.py | 1 + test/test_mpls.py | 277 ++++++++++- test/vpp_ip_route.py | 38 +- test/vpp_mpls_tunnel_interface.py | 46 ++ test/vpp_papi_provider.py | 16 +- 76 files changed, 4393 insertions(+), 1128 deletions(-) create mode 100644 src/vnet/dpo/interface_dpo.c create mode 100644 src/vnet/dpo/interface_dpo.h create mode 100644 src/vnet/dpo/mpls_disposition.c create mode 100644 src/vnet/dpo/mpls_disposition.h create mode 100644 src/vnet/fib/fib_test.h create mode 100644 test/vpp_mpls_tunnel_interface.py (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index b10e0fad..0549ba5d 100644 --- 
a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -52,7 +52,7 @@ always_inline int vlib_buffer_is_mpls (vlib_buffer_t * b) { ethernet_header_t *h = (ethernet_header_t *) vlib_buffer_get_current (b); - return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS_UNICAST)); + return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS)); } always_inline u32 diff --git a/src/vat/api_format.c b/src/vat/api_format.c index 61b8e1d8..107aa012 100644 --- a/src/vat/api_format.c +++ b/src/vat/api_format.c @@ -16369,32 +16369,82 @@ api_netmap_delete (vat_main_t * vam) return ret; } -static void vl_api_mpls_tunnel_details_t_handler - (vl_api_mpls_tunnel_details_t * mp) +static void +vl_api_mpls_fib_path_print (vat_main_t * vam, vl_api_fib_path2_t * fp) +{ + if (fp->afi == IP46_TYPE_IP6) + print (vam->ofp, + " weight %d, sw_if_index %d, is_local %d, is_drop %d, " + "is_unreach %d, is_prohitbit %d, afi %d, next_hop %U", + ntohl (fp->weight), ntohl (fp->sw_if_index), fp->is_local, + fp->is_drop, fp->is_unreach, fp->is_prohibit, fp->afi, + format_ip6_address, fp->next_hop); + else if (fp->afi == IP46_TYPE_IP4) + print (vam->ofp, + " weight %d, sw_if_index %d, is_local %d, is_drop %d, " + "is_unreach %d, is_prohitbit %d, afi %d, next_hop %U", + ntohl (fp->weight), ntohl (fp->sw_if_index), fp->is_local, + fp->is_drop, fp->is_unreach, fp->is_prohibit, fp->afi, + format_ip4_address, fp->next_hop); +} + +static void +vl_api_mpls_fib_path_json_print (vat_json_node_t * node, + vl_api_fib_path2_t * fp) +{ + struct in_addr ip4; + struct in6_addr ip6; + + vat_json_object_add_uint (node, "weight", ntohl (fp->weight)); + vat_json_object_add_uint (node, "sw_if_index", ntohl (fp->sw_if_index)); + vat_json_object_add_uint (node, "is_local", fp->is_local); + vat_json_object_add_uint (node, "is_drop", fp->is_drop); + vat_json_object_add_uint (node, "is_unreach", fp->is_unreach); + vat_json_object_add_uint (node, "is_prohibit", fp->is_prohibit); + vat_json_object_add_uint 
(node, "next_hop_afi", fp->afi); + if (fp->afi == IP46_TYPE_IP4) + { + clib_memcpy (&ip4, &fp->next_hop, sizeof (ip4)); + vat_json_object_add_ip4 (node, "next_hop", ip4); + } + else if (fp->afi == IP46_TYPE_IP6) + { + clib_memcpy (&ip6, &fp->next_hop, sizeof (ip6)); + vat_json_object_add_ip6 (node, "next_hop", ip6); + } +} + +static void +vl_api_mpls_tunnel_details_t_handler (vl_api_mpls_tunnel_details_t * mp) { vat_main_t *vam = &vat_main; - i32 len = mp->mt_next_hop_n_labels; + int count = ntohl (mp->mt_count); + vl_api_fib_path2_t *fp; i32 i; - print (vam->ofp, "[%d]: via %U %d labels ", - mp->tunnel_index, - format_ip4_address, mp->mt_next_hop, - ntohl (mp->mt_next_hop_sw_if_index)); - for (i = 0; i < len; i++) + print (vam->ofp, "[%d]: sw_if_index %d via:", + ntohl (mp->mt_tunnel_index), ntohl (mp->mt_sw_if_index)); + fp = mp->mt_paths; + for (i = 0; i < count; i++) { - print (vam->ofp, "%u ", ntohl (mp->mt_next_hop_out_labels[i])); + vl_api_mpls_fib_path_print (vam, fp); + fp++; } + print (vam->ofp, ""); } -static void vl_api_mpls_tunnel_details_t_handler_json - (vl_api_mpls_tunnel_details_t * mp) +#define vl_api_mpls_tunnel_details_t_endian vl_noop_handler +#define vl_api_mpls_tunnel_details_t_print vl_noop_handler + +static void +vl_api_mpls_tunnel_details_t_handler_json (vl_api_mpls_tunnel_details_t * mp) { vat_main_t *vam = &vat_main; vat_json_node_t *node = NULL; - struct in_addr ip4; + int count = ntohl (mp->mt_count); + vl_api_fib_path2_t *fp; i32 i; - i32 len = mp->mt_next_hop_n_labels; if (VAT_JSON_ARRAY != vam->json_tree.type) { @@ -16404,17 +16454,17 @@ static void vl_api_mpls_tunnel_details_t_handler_json node = vat_json_array_add (&vam->json_tree); vat_json_init_object (node); - vat_json_object_add_uint (node, "tunnel_index", ntohl (mp->tunnel_index)); - clib_memcpy (&ip4, &(mp->mt_next_hop), sizeof (ip4)); - vat_json_object_add_ip4 (node, "next_hop", ip4); - vat_json_object_add_uint (node, "next_hop_sw_if_index", - ntohl 
(mp->mt_next_hop_sw_if_index)); - vat_json_object_add_uint (node, "l2_only", ntohl (mp->mt_l2_only)); - vat_json_object_add_uint (node, "label_count", len); - for (i = 0; i < len; i++) + vat_json_object_add_uint (node, "tunnel_index", + ntohl (mp->mt_tunnel_index)); + vat_json_object_add_uint (node, "sw_if_index", ntohl (mp->mt_sw_if_index)); + + vat_json_object_add_uint (node, "l2_only", mp->mt_l2_only); + + fp = mp->mt_paths; + for (i = 0; i < count; i++) { - vat_json_object_add_uint (node, "label", - ntohl (mp->mt_next_hop_out_labels[i])); + vl_api_mpls_fib_path_json_print (node, fp); + fp++; } } @@ -16453,6 +16503,7 @@ api_mpls_tunnel_dump (vat_main_t * vam) #define vl_api_mpls_fib_details_t_endian vl_noop_handler #define vl_api_mpls_fib_details_t_print vl_noop_handler + static void vl_api_mpls_fib_details_t_handler (vl_api_mpls_fib_details_t * mp) { @@ -16467,20 +16518,7 @@ vl_api_mpls_fib_details_t_handler (vl_api_mpls_fib_details_t * mp) fp = mp->path; for (i = 0; i < count; i++) { - if (fp->afi == IP46_TYPE_IP6) - print (vam->ofp, - " weight %d, sw_if_index %d, is_local %d, is_drop %d, " - "is_unreach %d, is_prohitbit %d, afi %d, next_hop %U", - ntohl (fp->weight), ntohl (fp->sw_if_index), fp->is_local, - fp->is_drop, fp->is_unreach, fp->is_prohibit, fp->afi, - format_ip6_address, fp->next_hop); - else if (fp->afi == IP46_TYPE_IP4) - print (vam->ofp, - " weight %d, sw_if_index %d, is_local %d, is_drop %d, " - "is_unreach %d, is_prohitbit %d, afi %d, next_hop %U", - ntohl (fp->weight), ntohl (fp->sw_if_index), fp->is_local, - fp->is_drop, fp->is_unreach, fp->is_prohibit, fp->afi, - format_ip4_address, fp->next_hop); + vl_api_mpls_fib_path_print (vam, fp); fp++; } } @@ -16491,8 +16529,6 @@ static void vl_api_mpls_fib_details_t_handler_json vat_main_t *vam = &vat_main; int count = ntohl (mp->count); vat_json_node_t *node = NULL; - struct in_addr ip4; - struct in6_addr ip6; vl_api_fib_path2_t *fp; int i; @@ -16511,23 +16547,8 @@ static void 
vl_api_mpls_fib_details_t_handler_json fp = mp->path; for (i = 0; i < count; i++) { - vat_json_object_add_uint (node, "weight", ntohl (fp->weight)); - vat_json_object_add_uint (node, "sw_if_index", ntohl (fp->sw_if_index)); - vat_json_object_add_uint (node, "is_local", fp->is_local); - vat_json_object_add_uint (node, "is_drop", fp->is_drop); - vat_json_object_add_uint (node, "is_unreach", fp->is_unreach); - vat_json_object_add_uint (node, "is_prohibit", fp->is_prohibit); - vat_json_object_add_uint (node, "next_hop_afi", fp->afi); - if (fp->afi == IP46_TYPE_IP4) - { - clib_memcpy (&ip4, &fp->next_hop, sizeof (ip4)); - vat_json_object_add_ip4 (node, "next_hop", ip4); - } - else if (fp->afi == IP46_TYPE_IP6) - { - clib_memcpy (&ip6, &fp->next_hop, sizeof (ip6)); - vat_json_object_add_ip6 (node, "next_hop", ip6); - } + vl_api_mpls_fib_path_json_print (node, fp); + fp++; } } diff --git a/src/vnet.am b/src/vnet.am index 643ae92e..bed4902b 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -990,6 +990,8 @@ libvnet_la_SOURCES += \ vnet/dpo/lookup_dpo.c \ vnet/dpo/classify_dpo.c \ vnet/dpo/replicate_dpo.c \ + vnet/dpo/interface_dpo.c \ + vnet/dpo/mpls_disposition.c \ vnet/dpo/mpls_label_dpo.c nobase_include_HEADERS += \ diff --git a/src/vnet/adj/adj.c b/src/vnet/adj/adj.c index 90182006..36dfe500 100644 --- a/src/vnet/adj/adj.c +++ b/src/vnet/adj/adj.c @@ -67,6 +67,10 @@ adj_alloc (fib_protocol_t proto) adj->lookup_next_index = 0; adj->ia_delegates = NULL; + /* lest it become a midchain in the future */ + memset(&adj->sub_type.midchain.next_dpo, 0, + sizeof(adj->sub_type.midchain.next_dpo)); + ip4_main.lookup_main.adjacency_heap = adj_pool; ip6_main.lookup_main.adjacency_heap = adj_pool; @@ -118,6 +122,9 @@ format_ip_adjacency (u8 * s, va_list * args) case IP_LOOKUP_NEXT_MCAST: s = format (s, "%U", format_adj_mcast, adj_index, 0); break; + case IP_LOOKUP_NEXT_MCAST_MIDCHAIN: + s = format (s, "%U", format_adj_mcast_midchain, adj_index, 0); + break; default: break; } @@ -180,6 
+187,7 @@ adj_last_lock_gone (ip_adjacency_t *adj) adj->rewrite_header.sw_if_index); break; case IP_LOOKUP_NEXT_MCAST: + case IP_LOOKUP_NEXT_MCAST_MIDCHAIN: adj_mcast_remove(adj->ia_nh_proto, adj->rewrite_header.sw_if_index); break; @@ -338,6 +346,7 @@ adj_walk (u32 sw_if_index, FOR_EACH_FIB_IP_PROTOCOL(proto) { adj_nbr_walk(sw_if_index, proto, cb, ctx); + adj_mcast_walk(sw_if_index, proto, cb, ctx); } } @@ -544,9 +553,9 @@ adj_show (vlib_main_t * vm, * [@0] * [@1] glean: loop0 * [@2] ipv4 via 1.0.0.2 loop0: IP4: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc - * [@3] mpls via 1.0.0.2 loop0: MPLS_UNICAST: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc + * [@3] mpls via 1.0.0.2 loop0: MPLS: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc * [@4] ipv4 via 1.0.0.3 loop0: IP4: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc - * [@5] mpls via 1.0.0.3 loop0: MPLS_UNICAST: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc + * [@5] mpls via 1.0.0.3 loop0: MPLS: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc * @cliexend ?*/ VLIB_CLI_COMMAND (adj_show_command, static) = { diff --git a/src/vnet/adj/adj.h b/src/vnet/adj/adj.h index 32997c91..ed5eb1f1 100644 --- a/src/vnet/adj/adj.h +++ b/src/vnet/adj/adj.h @@ -81,6 +81,10 @@ typedef enum /** Multicast Adjacency. */ IP_LOOKUP_NEXT_MCAST, + /** Multicast Midchain Adjacency. 
An Adjacency for sending macst packets + * on a tunnel/virtual interface */ + IP_LOOKUP_NEXT_MCAST_MIDCHAIN, + IP_LOOKUP_N_NEXT, } __attribute__ ((packed)) ip_lookup_next_t; @@ -107,6 +111,7 @@ typedef enum [IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite", \ [IP_LOOKUP_NEXT_MCAST] = "ip4-rewrite-mcast", \ [IP_LOOKUP_NEXT_MIDCHAIN] = "ip4-midchain", \ + [IP_LOOKUP_NEXT_MCAST_MIDCHAIN] = "ip4-mcast-midchain", \ [IP_LOOKUP_NEXT_ICMP_ERROR] = "ip4-icmp-error", \ } @@ -119,6 +124,7 @@ typedef enum [IP_LOOKUP_NEXT_REWRITE] = "ip6-rewrite", \ [IP_LOOKUP_NEXT_MCAST] = "ip6-rewrite-mcast", \ [IP_LOOKUP_NEXT_MIDCHAIN] = "ip6-midchain", \ + [IP_LOOKUP_NEXT_MCAST_MIDCHAIN] = "ip6-mcast-midchain", \ [IP_LOOKUP_NEXT_ICMP_ERROR] = "ip6-icmp-error", \ [IP6_LOOKUP_NEXT_HOP_BY_HOP] = "ip6-hop-by-hop", \ [IP6_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip6-add-hop-by-hop", \ diff --git a/src/vnet/adj/adj_internal.h b/src/vnet/adj/adj_internal.h index 30668625..2c123c54 100644 --- a/src/vnet/adj/adj_internal.h +++ b/src/vnet/adj/adj_internal.h @@ -17,6 +17,7 @@ #define __ADJ_INTERNAL_H__ #include +#include #include #include #include @@ -87,11 +88,14 @@ adj_get_index (ip_adjacency_t *adj) return (adj - adj_pool); } -extern void adj_nbr_update_rewrite_internal (ip_adjacency_t *adj, - ip_lookup_next_t adj_next_index, - u32 complete_next_index, - u32 next_index, - u8 *rewrite); +extern void adj_nbr_update_rewrite_internal(ip_adjacency_t *adj, + ip_lookup_next_t adj_next_index, + u32 complete_next_index, + u32 next_index, + u8 *rewrite); +extern void adj_midchain_setup(adj_index_t adj_index, + adj_midchain_fixup_t fixup, + adj_flags_t flags); extern ip_adjacency_t * adj_alloc(fib_protocol_t proto); diff --git a/src/vnet/adj/adj_mcast.c b/src/vnet/adj/adj_mcast.c index 4f678e43..755abfd4 100644 --- a/src/vnet/adj/adj_mcast.c +++ b/src/vnet/adj/adj_mcast.c @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include +#include #include #include #include @@ -129,6 +129,59 @@ adj_mcast_update_rewrite (adj_index_t adj_index, adj->rewrite_header.dst_mcast_mask = clib_host_to_net_u32(mask); } +/** + * adj_mcast_midchain_update_rewrite + * + * Update the adjacency's rewrite string. A NULL string implies the + * rewirte is reset (i.e. when ARP/ND etnry is gone). + * NB: the adj being updated may be handling traffic in the DP. + */ +void +adj_mcast_midchain_update_rewrite (adj_index_t adj_index, + adj_midchain_fixup_t fixup, + adj_flags_t flags, + u8 *rewrite, + u8 offset, + u32 mask) +{ + ip_adjacency_t *adj; + + ASSERT(ADJ_INDEX_INVALID != adj_index); + + adj = adj_get(adj_index); + + /* + * one time only update. since we don't support chainging the tunnel + * src,dst, this is all we need. + */ + ASSERT(adj->lookup_next_index == IP_LOOKUP_NEXT_MCAST); + /* + * tunnels can always provide a rewrite. + */ + ASSERT(NULL != rewrite); + + adj_midchain_setup(adj_index, fixup, flags); + + /* + * update the adj's rewrite string and build the arc + * from the rewrite node to the interface's TX node + */ + adj_nbr_update_rewrite_internal(adj, IP_LOOKUP_NEXT_MCAST_MIDCHAIN, + adj_get_mcast_node(adj->ia_nh_proto), + vnet_tx_node_index_for_sw_interface( + vnet_get_main(), + adj->rewrite_header.sw_if_index), + rewrite); + + /* + * set the fields corresponding to the mcast IP address rewrite + * The mask must be stored in network byte order, since the packet's + * IP address will also be in network order. 
+ */ + adj->rewrite_header.dst_mcast_offset = offset; + adj->rewrite_header.dst_mcast_mask = clib_host_to_net_u32(mask); +} + void adj_mcast_remove (fib_protocol_t proto, u32 sw_if_index) @@ -260,6 +313,24 @@ adj_mcast_interface_delete (vnet_main_t * vnm, VNET_SW_INTERFACE_ADD_DEL_FUNCTION(adj_mcast_interface_delete); +/** + * @brief Walk the multicast Adjacencies on a given interface + */ +void +adj_mcast_walk (u32 sw_if_index, + fib_protocol_t proto, + adj_walk_cb_t cb, + void *ctx) +{ + if (vec_len(adj_mcasts[proto]) > sw_if_index) + { + if (ADJ_INDEX_INVALID != adj_mcasts[proto][sw_if_index]) + { + cb(adj_mcasts[proto][sw_if_index], ctx); + } + } +} + u8* format_adj_mcast (u8* s, va_list *ap) { @@ -269,6 +340,8 @@ format_adj_mcast (u8* s, va_list *ap) s = format(s, "%U-mcast: ", format_fib_protocol, adj->ia_nh_proto); + if (adj->rewrite_header.flags & VNET_REWRITE_HAS_FEATURES) + s = format(s, "[features] "); s = format (s, "%U", format_vnet_rewrite, &adj->rewrite_header, sizeof (adj->rewrite_data), 0); @@ -276,6 +349,28 @@ format_adj_mcast (u8* s, va_list *ap) return (s); } +u8* +format_adj_mcast_midchain (u8* s, va_list *ap) +{ + index_t index = va_arg(*ap, index_t); + CLIB_UNUSED(u32 indent) = va_arg(*ap, u32); + vnet_main_t * vnm = vnet_get_main(); + ip_adjacency_t * adj = adj_get(index); + + s = format(s, "%U-mcast-midchain: ", + format_fib_protocol, adj->ia_nh_proto); + s = format (s, "%U", + format_vnet_rewrite, + vnm->vlib_main, &adj->rewrite_header, + sizeof (adj->rewrite_data), 0); + s = format (s, "\n%Ustacked-on:\n%U%U", + format_white_space, indent, + format_white_space, indent+2, + format_dpo_id, &adj->sub_type.midchain.next_dpo, indent+2); + + return (s); +} + static void adj_dpo_lock (dpo_id_t *dpo) @@ -293,6 +388,11 @@ const static dpo_vft_t adj_mcast_dpo_vft = { .dv_unlock = adj_dpo_unlock, .dv_format = format_adj_mcast, }; +const static dpo_vft_t adj_mcast_midchain_dpo_vft = { + .dv_lock = adj_dpo_lock, + .dv_unlock = adj_dpo_unlock, + 
.dv_format = format_adj_mcast_midchain, +}; /** * @brief The per-protocol VLIB graph nodes that are assigned to a mcast @@ -319,6 +419,31 @@ const static char* const * const adj_mcast_nodes[DPO_PROTO_NUM] = [DPO_PROTO_MPLS] = NULL, }; +/** + * @brief The per-protocol VLIB graph nodes that are assigned to a mcast + * object. + * + * this means that these graph nodes are ones from which a mcast is the + * parent object in the DPO-graph. + */ +const static char* const adj_mcast_midchain_ip4_nodes[] = +{ + "ip4-mcast-midchain", + NULL, +}; +const static char* const adj_mcast_midchain_ip6_nodes[] = +{ + "ip6-mcast-midchain", + NULL, +}; + +const static char* const * const adj_mcast_midchain_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = adj_mcast_midchain_ip4_nodes, + [DPO_PROTO_IP6] = adj_mcast_midchain_ip6_nodes, + [DPO_PROTO_MPLS] = NULL, +}; + /** * @brief Return the size of the adj DB. * This is only for testing purposes so an efficient implementation is not needed @@ -349,5 +474,10 @@ adj_mcast_db_size (void) void adj_mcast_module_init (void) { - dpo_register(DPO_ADJACENCY_MCAST, &adj_mcast_dpo_vft, adj_mcast_nodes); + dpo_register(DPO_ADJACENCY_MCAST, + &adj_mcast_dpo_vft, + adj_mcast_nodes); + dpo_register(DPO_ADJACENCY_MCAST_MIDCHAIN, + &adj_mcast_midchain_dpo_vft, + adj_mcast_midchain_nodes); } diff --git a/src/vnet/adj/adj_mcast.h b/src/vnet/adj/adj_mcast.h index 40d44313..bfb0d6f6 100644 --- a/src/vnet/adj/adj_mcast.h +++ b/src/vnet/adj/adj_mcast.h @@ -26,6 +26,7 @@ #define __ADJ_MCAST_H__ #include +#include /** * @brief @@ -68,10 +69,36 @@ extern void adj_mcast_update_rewrite(adj_index_t adj_index, u8 offset, u32 mask); +/** + * @brief + * Update the rewrite string for an existing adjacecny and + * Convert the adjacency into a midchain + * + * @param + * The index of the adj to update + * + * @param + * The new rewrite + */ +extern void adj_mcast_midchain_update_rewrite(adj_index_t adj_index, + adj_midchain_fixup_t fixup, + adj_flags_t flags, + u8 *rewrite, 
+ u8 offset, + u32 mask); +/** + * @brief Walk the multicast Adjacencies on a given interface + */ +extern void adj_mcast_walk (u32 sw_if_index, + fib_protocol_t adj_nh_proto, + adj_walk_cb_t cb, + void *ctx); + /** * @brief Format/display a mcast adjacency. */ extern u8* format_adj_mcast(u8* s, va_list *ap); +extern u8* format_adj_mcast_midchain(u8* s, va_list *ap); /** * @brief Get the sze of the mcast adj DB. Test purposes only. diff --git a/src/vnet/adj/adj_midchain.c b/src/vnet/adj/adj_midchain.c index 5756de43..a93a1c3e 100644 --- a/src/vnet/adj/adj_midchain.c +++ b/src/vnet/adj/adj_midchain.c @@ -346,7 +346,7 @@ adj_get_midchain_node (vnet_link_t link) static u8 adj_midchain_get_feature_arc_index_for_link_type (const ip_adjacency_t *adj) { - u8 arc = (u8) ~0; + u8 arc = (u8) ~0; switch (adj->ia_link) { case VNET_LINK_IP4: @@ -393,17 +393,14 @@ adj_nbr_midchain_get_tx_node (ip_adjacency_t *adj) } /** - * adj_nbr_midchain_update_rewrite + * adj_midchain_setup * - * Update the adjacency's rewrite string. A NULL string implies the - * rewrite is reset (i.e. when ARP/ND etnry is gone). - * NB: the adj being updated may be handling traffic in the DP. + * Setup the adj as a mid-chain */ void -adj_nbr_midchain_update_rewrite (adj_index_t adj_index, - adj_midchain_fixup_t fixup, - adj_flags_t flags, - u8 *rewrite) +adj_midchain_setup (adj_index_t adj_index, + adj_midchain_fixup_t fixup, + adj_flags_t flags) { u32 feature_index, tx_node; ip_adjacency_t *adj; @@ -413,16 +410,6 @@ adj_nbr_midchain_update_rewrite (adj_index_t adj_index, adj = adj_get(adj_index); - /* - * one time only update. since we don't support chainging the tunnel - * src,dst, this is all we need. - */ - ASSERT(adj->lookup_next_index == IP_LOOKUP_NEXT_ARP); - /* - * tunnels can always provide a rewrite. 
- */ - ASSERT(NULL != rewrite); - adj->sub_type.midchain.fixup_func = fixup; adj->ia_flags |= flags; @@ -447,6 +434,38 @@ adj_nbr_midchain_update_rewrite (adj_index_t adj_index, dpo_stack_from_node(tx_node, &adj->sub_type.midchain.next_dpo, drop_dpo_get(vnet_link_to_dpo_proto(adj->ia_link))); +} + +/** + * adj_nbr_midchain_update_rewrite + * + * Update the adjacency's rewrite string. A NULL string implies the + * rewrite is reset (i.e. when ARP/ND etnry is gone). + * NB: the adj being updated may be handling traffic in the DP. + */ +void +adj_nbr_midchain_update_rewrite (adj_index_t adj_index, + adj_midchain_fixup_t fixup, + adj_flags_t flags, + u8 *rewrite) +{ + ip_adjacency_t *adj; + + ASSERT(ADJ_INDEX_INVALID != adj_index); + + adj = adj_get(adj_index); + + /* + * one time only update. since we don't support chainging the tunnel + * src,dst, this is all we need. + */ + ASSERT(adj->lookup_next_index == IP_LOOKUP_NEXT_ARP); + /* + * tunnels can always provide a rewrite. + */ + ASSERT(NULL != rewrite); + + adj_midchain_setup(adj_index, fixup, flags); /* * update the rewirte with the workers paused. 
@@ -454,7 +473,7 @@ adj_nbr_midchain_update_rewrite (adj_index_t adj_index, adj_nbr_update_rewrite_internal(adj, IP_LOOKUP_NEXT_MIDCHAIN, adj_get_midchain_node(adj->ia_link), - tx_node, + adj_nbr_midchain_get_tx_node(adj), rewrite); } @@ -496,7 +515,8 @@ adj_nbr_midchain_stack (adj_index_t adj_index, adj = adj_get(adj_index); - ASSERT(IP_LOOKUP_NEXT_MIDCHAIN == adj->lookup_next_index); + ASSERT((IP_LOOKUP_NEXT_MIDCHAIN == adj->lookup_next_index) || + (IP_LOOKUP_NEXT_MCAST_MIDCHAIN == adj->lookup_next_index)); dpo_stack_from_node(adj_nbr_midchain_get_tx_node(adj), &adj->sub_type.midchain.next_dpo, diff --git a/src/vnet/adj/adj_nbr.c b/src/vnet/adj/adj_nbr.c index ddacb030..3d450d1f 100644 --- a/src/vnet/adj/adj_nbr.c +++ b/src/vnet/adj/adj_nbr.c @@ -195,8 +195,6 @@ adj_nbr_alloc (fib_protocol_t nh_proto, adj->ia_link = link_type; adj->ia_nh_proto = nh_proto; adj->rewrite_header.sw_if_index = sw_if_index; - memset(&adj->sub_type.midchain.next_dpo, 0, - sizeof(adj->sub_type.midchain.next_dpo)); adj_nbr_evaluate_feature (adj_get_index(adj)); return (adj); diff --git a/src/vnet/buffer.h b/src/vnet/buffer.h index ea3ce093..ed869d1f 100644 --- a/src/vnet/buffer.h +++ b/src/vnet/buffer.h @@ -130,6 +130,9 @@ typedef struct /* Rewrite length */ u32 save_rewrite_length; + + /* MFIB RPF ID */ + u32 rpf_id; }; /* ICMP */ diff --git a/src/vnet/devices/ssvm/node.c b/src/vnet/devices/ssvm/node.c index 539b4161..b7a8db05 100644 --- a/src/vnet/devices/ssvm/node.c +++ b/src/vnet/devices/ssvm/node.c @@ -210,7 +210,7 @@ ssvm_eth_device_input (ssvm_eth_main_t * em, next0 = SSVM_ETH_INPUT_NEXT_IP4_INPUT; else if (type0 == ETHERNET_TYPE_IP6) next0 = SSVM_ETH_INPUT_NEXT_IP6_INPUT; - else if (type0 == ETHERNET_TYPE_MPLS_UNICAST) + else if (type0 == ETHERNET_TYPE_MPLS) next0 = SSVM_ETH_INPUT_NEXT_MPLS_INPUT; l3_offset0 = ((next0 == SSVM_ETH_INPUT_NEXT_IP4_INPUT || diff --git a/src/vnet/dhcp/dhcp6_proxy_node.c b/src/vnet/dhcp/dhcp6_proxy_node.c index 524cb095..de73154d 100644 --- 
a/src/vnet/dhcp/dhcp6_proxy_node.c +++ b/src/vnet/dhcp/dhcp6_proxy_node.c @@ -883,6 +883,7 @@ dhcp6_proxy_set_server (ip46_address_t *addr, mfib_table_entry_update(rx_fib_index, &all_dhcp_servers, MFIB_SOURCE_DHCP, + MFIB_RPF_ID_NONE, MFIB_ENTRY_FLAG_ACCEPT_ALL_ITF); mfib_table_lock(rx_fib_index, FIB_PROTOCOL_IP6); } diff --git a/src/vnet/dpo/dpo.c b/src/vnet/dpo/dpo.c index d8e075a7..dfc2bd92 100644 --- a/src/vnet/dpo/dpo.c +++ b/src/vnet/dpo/dpo.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include /** * Array of char* names for the DPO types and protos @@ -182,6 +184,12 @@ dpo_set (dpo_id_t *dpo, case IP_LOOKUP_NEXT_MIDCHAIN: dpo->dpoi_type = DPO_ADJACENCY_MIDCHAIN; break; + case IP_LOOKUP_NEXT_MCAST_MIDCHAIN: + dpo->dpoi_type = DPO_ADJACENCY_MCAST_MIDCHAIN; + break; + case IP_LOOKUP_NEXT_MCAST: + dpo->dpoi_type = DPO_ADJACENCY_MCAST; + break; default: break; } @@ -453,6 +461,8 @@ dpo_module_init (vlib_main_t * vm) lookup_dpo_module_init(); ip_null_dpo_module_init(); replicate_module_init(); + interface_dpo_module_init(); + mpls_disp_dpo_module_init(); return (NULL); } diff --git a/src/vnet/dpo/dpo.h b/src/vnet/dpo/dpo.h index 48b92d3d..5aa4e2d2 100644 --- a/src/vnet/dpo/dpo.h +++ b/src/vnet/dpo/dpo.h @@ -108,12 +108,15 @@ typedef enum dpo_type_t_ { DPO_ADJACENCY_MIDCHAIN, DPO_ADJACENCY_GLEAN, DPO_ADJACENCY_MCAST, + DPO_ADJACENCY_MCAST_MIDCHAIN, DPO_RECEIVE, DPO_LOOKUP, DPO_LISP_CP, DPO_CLASSIFY, DPO_MPLS_LABEL, + DPO_MPLS_DISPOSITION, DPO_MFIB_ENTRY, + DPO_INTERFACE, DPO_LAST, } __attribute__((packed)) dpo_type_t; @@ -129,6 +132,7 @@ typedef enum dpo_type_t_ { [DPO_ADJACENCY_MIDCHAIN] = "dpo-adjacency-midcahin", \ [DPO_ADJACENCY_GLEAN] = "dpo-glean", \ [DPO_ADJACENCY_MCAST] = "dpo-adj-mcast", \ + [DPO_ADJACENCY_MCAST_MIDCHAIN] = "dpo-adj-mcast-midchain", \ [DPO_RECEIVE] = "dpo-receive", \ [DPO_LOOKUP] = "dpo-lookup", \ [DPO_LOAD_BALANCE] = "dpo-load-balance", \ @@ -136,7 +140,9 @@ typedef enum dpo_type_t_ { [DPO_LISP_CP] = "dpo-lisp-cp", \ 
[DPO_CLASSIFY] = "dpo-classify", \ [DPO_MPLS_LABEL] = "dpo-mpls-label", \ - [DPO_MFIB_ENTRY] = "dpo-mfib_entry" \ + [DPO_MPLS_DISPOSITION] = "dpo-mpls-diposition", \ + [DPO_MFIB_ENTRY] = "dpo-mfib_entry", \ + [DPO_INTERFACE] = "dpo-interface" \ } /** diff --git a/src/vnet/dpo/interface_dpo.c b/src/vnet/dpo/interface_dpo.c new file mode 100644 index 00000000..50ca756f --- /dev/null +++ b/src/vnet/dpo/interface_dpo.c @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +/* + * The 'DB' of interface DPOs. 
+ * There is only one per-interface per-protocol, so this is a per-interface + * vector + */ +static index_t *interface_dpo_db[DPO_PROTO_NUM]; + +static interface_dpo_t * +interface_dpo_alloc (void) +{ + interface_dpo_t *ido; + + pool_get(interface_dpo_pool, ido); + + return (ido); +} + +static inline interface_dpo_t * +interface_dpo_get_from_dpo (const dpo_id_t *dpo) +{ + ASSERT(DPO_INTERFACE == dpo->dpoi_type); + + return (interface_dpo_get(dpo->dpoi_index)); +} + +static inline index_t +interface_dpo_get_index (interface_dpo_t *ido) +{ + return (ido - interface_dpo_pool); +} + +static void +interface_dpo_lock (dpo_id_t *dpo) +{ + interface_dpo_t *ido; + + ido = interface_dpo_get_from_dpo(dpo); + ido->ido_locks++; +} + +static void +interface_dpo_unlock (dpo_id_t *dpo) +{ + interface_dpo_t *ido; + + ido = interface_dpo_get_from_dpo(dpo); + ido->ido_locks--; + + if (0 == ido->ido_locks) + { + interface_dpo_db[ido->ido_proto][ido->ido_sw_if_index] = + INDEX_INVALID; + pool_put(interface_dpo_pool, ido); + } +} + +/* + * interface_dpo_add_or_lock + * + * Add/create and lock a new or lock an existing for the interface DPO + * on the interface and protocol given + */ +void +interface_dpo_add_or_lock (dpo_proto_t proto, + u32 sw_if_index, + dpo_id_t *dpo) +{ + interface_dpo_t *ido; + + vec_validate_init_empty(interface_dpo_db[proto], + sw_if_index, + INDEX_INVALID); + + if (INDEX_INVALID == interface_dpo_db[proto][sw_if_index]) + { + ido = interface_dpo_alloc(); + + ido->ido_sw_if_index = sw_if_index; + ido->ido_proto = proto; + + interface_dpo_db[proto][sw_if_index] = + interface_dpo_get_index(ido); + } + else + { + ido = interface_dpo_get(interface_dpo_db[proto][sw_if_index]); + } + + dpo_set(dpo, DPO_INTERFACE, proto, interface_dpo_get_index(ido)); +} + + +static clib_error_t * +interface_dpo_interface_state_change (vnet_main_t * vnm, + u32 sw_if_index, + u32 flags) +{ + /* + */ + return (NULL); +} + +VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION( + 
interface_dpo_interface_state_change); + +/** + * @brief Registered callback for HW interface state changes + */ +static clib_error_t * +interface_dpo_hw_interface_state_change (vnet_main_t * vnm, + u32 hw_if_index, + u32 flags) +{ + return (NULL); +} + +VNET_HW_INTERFACE_LINK_UP_DOWN_FUNCTION( + interface_dpo_hw_interface_state_change); + +static clib_error_t * +interface_dpo_interface_delete (vnet_main_t * vnm, + u32 sw_if_index, + u32 is_add) +{ + return (NULL); +} + +VNET_SW_INTERFACE_ADD_DEL_FUNCTION( + interface_dpo_interface_delete); + +u8* +format_interface_dpo (u8* s, va_list *ap) +{ + index_t index = va_arg(*ap, index_t); + CLIB_UNUSED(u32 indent) = va_arg(*ap, u32); + vnet_main_t * vnm = vnet_get_main(); + interface_dpo_t *ido = interface_dpo_get(index); + + return (format(s, "%U-dpo: %U", + format_vnet_sw_interface_name, + vnm, + vnet_get_sw_interface(vnm, ido->ido_sw_if_index), + format_dpo_proto, ido->ido_proto)); +} + +static void +interface_dpo_mem_show (void) +{ + fib_show_memory_usage("Interface", + pool_elts(interface_dpo_pool), + pool_len(interface_dpo_pool), + sizeof(interface_dpo_t)); +} + + +const static dpo_vft_t interface_dpo_vft = { + .dv_lock = interface_dpo_lock, + .dv_unlock = interface_dpo_unlock, + .dv_format = format_interface_dpo, + .dv_mem_show = interface_dpo_mem_show, +}; + +/** + * @brief The per-protocol VLIB graph nodes that are assigned to a glean + * object. + * + * this means that these graph nodes are ones from which a glean is the + * parent object in the DPO-graph. 
+ */ +const static char* const interface_dpo_ip4_nodes[] = +{ + "interface-dpo-ip4", + NULL, +}; +const static char* const interface_dpo_ip6_nodes[] = +{ + "interface-dpo-ip4", + NULL, +}; + +const static char* const * const interface_dpo_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = interface_dpo_ip4_nodes, + [DPO_PROTO_IP6] = interface_dpo_ip6_nodes, + [DPO_PROTO_MPLS] = NULL, +}; + +void +interface_dpo_module_init (void) +{ + dpo_register(DPO_INTERFACE, + &interface_dpo_vft, + interface_dpo_nodes); +} + +/** + * @brief Interface DPO trace data + */ +typedef struct interface_dpo_trace_t_ +{ + u32 sw_if_index; +} interface_dpo_trace_t; + +typedef enum interface_dpo_next_t_ +{ + INTERFACE_DPO_DROP = 0, + INTERFACE_DPO_INPUT = 1, +} interface_dpo_next_t; + +always_inline uword +interface_dpo_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + u32 n_left_from, next_index, * from, * to_next; + u32 cpu_index = os_get_cpu_number(); + vnet_interface_main_t *im; + + im = &vnet_get_main ()->interface_main; + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next > 2) + { + const interface_dpo_t *ido0, *ido1; + u32 bi0, idoi0, bi1, idoi1; + vlib_buffer_t *b0, *b1; + + bi0 = from[0]; + to_next[0] = bi0; + bi1 = from[1]; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + idoi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + idoi1 = vnet_buffer(b1)->ip.adj_index[VLIB_TX]; + ido0 = interface_dpo_get(idoi0); + ido1 = interface_dpo_get(idoi1); + + vnet_buffer(b0)->sw_if_index[VLIB_RX] = ido0->ido_sw_if_index; + vnet_buffer(b1)->sw_if_index[VLIB_RX] = ido1->ido_sw_if_index; + + 
vlib_increment_combined_counter (im->combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + cpu_index, + ido0->ido_sw_if_index, + 1, + vlib_buffer_length_in_chain (vm, b0)); + vlib_increment_combined_counter (im->combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + cpu_index, + ido1->ido_sw_if_index, + 1, + vlib_buffer_length_in_chain (vm, b1)); + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + interface_dpo_trace_t *tr0; + + tr0 = vlib_add_trace (vm, node, b0, sizeof (*tr0)); + tr0->sw_if_index = ido0->ido_sw_if_index; + } + if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED)) + { + interface_dpo_trace_t *tr1; + + tr1 = vlib_add_trace (vm, node, b1, sizeof (*tr1)); + tr1->sw_if_index = ido1->ido_sw_if_index; + } + + vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next, + n_left_to_next, bi0, bi1, + INTERFACE_DPO_INPUT, + INTERFACE_DPO_INPUT); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + const interface_dpo_t * ido0; + vlib_buffer_t * b0; + u32 bi0, idoi0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + idoi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + ido0 = interface_dpo_get(idoi0); + + /* Swap the RX interface of the packet to the one the + * interface DPR represents */ + vnet_buffer(b0)->sw_if_index[VLIB_RX] = ido0->ido_sw_if_index; + + /* Bump the interface's RX coutners */ + vlib_increment_combined_counter (im->combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + cpu_index, + ido0->ido_sw_if_index, + 1, + vlib_buffer_length_in_chain (vm, b0)); + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + interface_dpo_trace_t *tr; + + tr = vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->sw_if_index = ido0->ido_sw_if_index; + } + + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, + n_left_to_next, bi0, + INTERFACE_DPO_INPUT); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + 
} + return from_frame->n_vectors; +} + +static u8 * +format_interface_dpo_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + interface_dpo_trace_t * t = va_arg (*args, interface_dpo_trace_t *); + uword indent = format_get_indent (s); + s = format (s, "%U sw_if_index:%d", + format_white_space, indent, + t->sw_if_index); + return s; +} + +static uword +interface_dpo_ip4 (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (interface_dpo_inline(vm, node, from_frame)); +} + +static uword +interface_dpo_ip6 (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (interface_dpo_inline(vm, node, from_frame)); +} + +VLIB_REGISTER_NODE (interface_dpo_ip4_node) = { + .function = interface_dpo_ip4, + .name = "interface-dpo-ip4", + .vector_size = sizeof (u32), + .format_trace = format_interface_dpo_trace, + + .n_next_nodes = 2, + .next_nodes = { + [INTERFACE_DPO_DROP] = "ip4-drop", + [INTERFACE_DPO_INPUT] = "ip4-input", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (interface_dpo_ip4_node, + interface_dpo_ip4) + +VLIB_REGISTER_NODE (interface_dpo_ip6_node) = { + .function = interface_dpo_ip6, + .name = "interface-dpo-ip6", + .vector_size = sizeof (u32), + .format_trace = format_interface_dpo_trace, + + .n_next_nodes = 2, + .next_nodes = { + [INTERFACE_DPO_DROP] = "ip6-drop", + [INTERFACE_DPO_INPUT] = "ip6-input", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (interface_dpo_ip6_node, + interface_dpo_ip6) + diff --git a/src/vnet/dpo/interface_dpo.h b/src/vnet/dpo/interface_dpo.h new file mode 100644 index 00000000..1538dfbb --- /dev/null +++ b/src/vnet/dpo/interface_dpo.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief + * The data-path object representing interfaceing the packet, i.e. it's for-us + */ + +#ifndef __INTERFACE_DPO_H__ +#define __INTERFACE_DPO_H__ + +#include + +typedef struct interface_dpo_t_ +{ + /** + * The Software interface index that the packets will be given + * as the ingress/rx interface + */ + u32 ido_sw_if_index; + + /** + * next VLIB node. A '-input' node. + */ + u32 ido_next_node; + + /** + * DPO protocol that the packets will have as they 'ingress' + * on this interface + */ + dpo_proto_t ido_proto; + + /** + * number of locks. + */ + u16 ido_locks; +} interface_dpo_t; + +extern void interface_dpo_add_or_lock (dpo_proto_t proto, + u32 sw_if_index, + dpo_id_t *dpo); + +extern void interface_dpo_module_init(void); + +/** + * @brief pool of all interface DPOs + */ +interface_dpo_t *interface_dpo_pool; + +static inline interface_dpo_t * +interface_dpo_get (index_t index) +{ + return (pool_elt_at_index(interface_dpo_pool, index)); +} + +#endif diff --git a/src/vnet/dpo/lookup_dpo.c b/src/vnet/dpo/lookup_dpo.c index 97ad0a44..e5b00a79 100644 --- a/src/vnet/dpo/lookup_dpo.c +++ b/src/vnet/dpo/lookup_dpo.c @@ -21,8 +21,12 @@ #include #include #include +#include +#include +#include static const char *const lookup_input_names[] = LOOKUP_INPUTS; +static const char *const lookup_cast_names[] = LOOKUP_CASTS; /** * @brief Enumeration of the lookup subtypes @@ -31,6 +35,7 @@ typedef enum lookup_sub_type_t_ { LOOKUP_SUB_TYPE_SRC, LOOKUP_SUB_TYPE_DST, + LOOKUP_SUB_TYPE_DST_MCAST, LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE, } 
lookup_sub_type_t; #define LOOKUP_SUB_TYPE_NUM (LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE+1) @@ -67,6 +72,7 @@ lookup_dpo_get_index (lookup_dpo_t *lkd) static void lookup_dpo_add_or_lock_i (fib_node_index_t fib_index, dpo_proto_t proto, + lookup_cast_t cast, lookup_input_t input, lookup_table_t table_config, dpo_id_t *dpo) @@ -79,6 +85,7 @@ lookup_dpo_add_or_lock_i (fib_node_index_t fib_index, lkd->lkd_proto = proto; lkd->lkd_input = input; lkd->lkd_table = table_config; + lkd->lkd_cast = cast; /* * use the input type to select the lookup sub-type @@ -100,6 +107,10 @@ lookup_dpo_add_or_lock_i (fib_node_index_t fib_index, type = lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST]; break; } + if (LOOKUP_MULTICAST == cast) + { + type = lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST_MCAST]; + } } if (0 == type) @@ -115,20 +126,29 @@ lookup_dpo_add_or_lock_i (fib_node_index_t fib_index, void lookup_dpo_add_or_lock_w_fib_index (fib_node_index_t fib_index, dpo_proto_t proto, + lookup_cast_t cast, lookup_input_t input, lookup_table_t table_config, dpo_id_t *dpo) { if (LOOKUP_TABLE_FROM_CONFIG == table_config) { - fib_table_lock(fib_index, dpo_proto_to_fib(proto)); + if (LOOKUP_UNICAST == cast) + { + fib_table_lock(fib_index, dpo_proto_to_fib(proto)); + } + else + { + mfib_table_lock(fib_index, dpo_proto_to_fib(proto)); + } } - lookup_dpo_add_or_lock_i(fib_index, proto, input, table_config, dpo); + lookup_dpo_add_or_lock_i(fib_index, proto, cast, input, table_config, dpo); } void lookup_dpo_add_or_lock_w_table_id (u32 table_id, dpo_proto_t proto, + lookup_cast_t cast, lookup_input_t input, lookup_table_t table_config, dpo_id_t *dpo) @@ -137,13 +157,22 @@ lookup_dpo_add_or_lock_w_table_id (u32 table_id, if (LOOKUP_TABLE_FROM_CONFIG == table_config) { - fib_index = - fib_table_find_or_create_and_lock(dpo_proto_to_fib(proto), - table_id); + if (LOOKUP_UNICAST == cast) + { + fib_index = + fib_table_find_or_create_and_lock(dpo_proto_to_fib(proto), + table_id); + } + else + { + fib_index = + 
mfib_table_find_or_create_and_lock(dpo_proto_to_fib(proto), + table_id); + } } ASSERT(FIB_NODE_INDEX_INVALID != fib_index); - lookup_dpo_add_or_lock_i(fib_index, proto, input, table_config, dpo); + lookup_dpo_add_or_lock_i(fib_index, proto, cast, input, table_config, dpo); } u8* @@ -156,16 +185,29 @@ format_lookup_dpo (u8 *s, va_list *args) if (LOOKUP_TABLE_FROM_INPUT_INTERFACE == lkd->lkd_table) { - s = format(s, "%s lookup in interface's %U table", + s = format(s, "%s,%s lookup in interface's %U table", lookup_input_names[lkd->lkd_input], + lookup_cast_names[lkd->lkd_cast], format_dpo_proto, lkd->lkd_proto); } else { - s = format(s, "%s lookup in %U", - lookup_input_names[lkd->lkd_input], - format_fib_table_name, lkd->lkd_fib_index, - dpo_proto_to_fib(lkd->lkd_proto)); + if (LOOKUP_UNICAST == lkd->lkd_cast) + { + s = format(s, "%s,%s lookup in %U", + lookup_input_names[lkd->lkd_input], + lookup_cast_names[lkd->lkd_cast], + format_fib_table_name, lkd->lkd_fib_index, + dpo_proto_to_fib(lkd->lkd_proto)); + } + else + { + s = format(s, "%s,%s lookup in %U", + lookup_input_names[lkd->lkd_input], + lookup_cast_names[lkd->lkd_cast], + format_mfib_table_name, lkd->lkd_fib_index, + dpo_proto_to_fib(lkd->lkd_proto)); + } } return (s); } @@ -193,8 +235,16 @@ lookup_dpo_unlock (dpo_id_t *dpo) { if (LOOKUP_TABLE_FROM_CONFIG == lkd->lkd_table) { - fib_table_unlock(lkd->lkd_fib_index, - dpo_proto_to_fib(lkd->lkd_proto)); + if (LOOKUP_UNICAST == lkd->lkd_cast) + { + fib_table_unlock(lkd->lkd_fib_index, + dpo_proto_to_fib(lkd->lkd_proto)); + } + else + { + mfib_table_unlock(lkd->lkd_fib_index, + dpo_proto_to_fib(lkd->lkd_proto)); + } } pool_put(lookup_dpo_pool, lkd); } @@ -1069,6 +1119,123 @@ VLIB_REGISTER_NODE (lookup_mpls_dst_itf_node) = { }; VLIB_NODE_FUNCTION_MULTIARCH (lookup_mpls_dst_itf_node, lookup_mpls_dst_itf) +typedef enum lookup_ip_dst_mcast_next_t_ { + LOOKUP_IP_DST_MCAST_NEXT_RPF, + LOOKUP_IP_DST_MCAST_N_NEXT, +} mfib_forward_lookup_next_t; + +always_inline uword 
+lookup_dpo_ip_dst_mcast_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, + int is_v4) +{ + u32 n_left_from, next_index, * from, * to_next; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = LOOKUP_IP_DST_MCAST_NEXT_RPF; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + /* while (n_left_from >= 4 && n_left_to_next >= 2) */ + /* } */ + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, lkdi0, fib_index0, next0; + const lookup_dpo_t * lkd0; + fib_node_index_t mfei0; + vlib_buffer_t * b0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + /* dst lookup was done by mpls lookup */ + lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + lkd0 = lookup_dpo_get(lkdi0); + fib_index0 = lkd0->lkd_fib_index; + next0 = LOOKUP_IP_DST_MCAST_NEXT_RPF; + + if (is_v4) + { + ip4_header_t * ip0; + + ip0 = vlib_buffer_get_current (b0); + mfei0 = ip4_mfib_table_lookup(ip4_mfib_get(fib_index0), + &ip0->src_address, + &ip0->dst_address, + 64); + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->fib_index = fib_index0; + tr->lbi = mfei0; + tr->addr.ip4 = ip0->dst_address; + } + } + else + { + ip6_header_t * ip0; + + ip0 = vlib_buffer_get_current (b0); + mfei0 = ip6_mfib_table_lookup2(ip6_mfib_get(fib_index0), + &ip0->src_address, + &ip0->dst_address); + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->fib_index = fib_index0; + tr->lbi = mfei0; + tr->addr.ip6 = ip0->dst_address; + } + } + + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = mfei0; + + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + 
vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +always_inline uword +lookup_ip4_dst_mcast (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_ip_dst_mcast_inline(vm, node, from_frame, 1)); +} + +VLIB_REGISTER_NODE (lookup_ip4_dst_mcast_node) = { + .function = lookup_ip4_dst_mcast, + .name = "lookup-ip4-dst-mcast", + .vector_size = sizeof (u32), + + .format_trace = format_lookup_trace, + .n_next_nodes = LOOKUP_IP_DST_MCAST_N_NEXT, + .next_nodes = { + [LOOKUP_IP_DST_MCAST_NEXT_RPF] = "ip4-mfib-forward-rpf", + }, +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip4_dst_mcast_node, + lookup_ip4_dst_mcast) + static void lookup_dpo_mem_show (void) { @@ -1129,6 +1296,22 @@ const static char* const * const lookup_dst_nodes[DPO_PROTO_NUM] = [DPO_PROTO_MPLS] = lookup_dst_mpls_nodes, }; +const static char* const lookup_dst_mcast_ip4_nodes[] = +{ + "lookup-ip4-dst-mcast", + NULL, +}; +const static char* const lookup_dst_mcast_ip6_nodes[] = +{ + "lookup-ip6-dst-mcast", + NULL, +}; +const static char* const * const lookup_dst_mcast_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = lookup_dst_mcast_ip4_nodes, + [DPO_PROTO_IP6] = lookup_dst_mcast_ip6_nodes, +}; + const static char* const lookup_dst_from_interface_ip4_nodes[] = { "lookup-ip4-dst-itf", @@ -1168,6 +1351,8 @@ lookup_dpo_module_init (void) dpo_register_new_type(&lkd_vft, lookup_src_nodes); lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST] = dpo_register_new_type(&lkd_vft, lookup_dst_nodes); + lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST_MCAST] = + dpo_register_new_type(&lkd_vft, lookup_dst_mcast_nodes); lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE] = dpo_register_new_type(&lkd_vft, lookup_dst_from_interface_nodes); } diff --git a/src/vnet/dpo/lookup_dpo.h b/src/vnet/dpo/lookup_dpo.h index ff283388..7dfd0385 100644 --- a/src/vnet/dpo/lookup_dpo.h +++ b/src/vnet/dpo/lookup_dpo.h @@ -46,6 +46,19 @@ typedef enum 
lookup_table_t_ { [LOOKUP_INPUT_DST_ADDR] = "table-configured", \ } +/** + * Switch to use the packet's source or destination address for lookup + */ +typedef enum lookup_cast_t_ { + LOOKUP_UNICAST, + LOOKUP_MULTICAST, +} __attribute__ ((packed)) lookup_cast_t; + +#define LOOKUP_CASTS { \ + [LOOKUP_UNICAST] = "unicast", \ + [LOOKUP_MULTICAST] = "multicast", \ +} + /** * A representation of an MPLS label for imposition in the data-path */ @@ -73,6 +86,11 @@ typedef struct lookup_dpo_t */ lookup_table_t lkd_table; + /** + * Unicast of rmulticast FIB lookup + */ + lookup_cast_t lkd_cast; + /** * Number of locks */ @@ -81,11 +99,13 @@ typedef struct lookup_dpo_t extern void lookup_dpo_add_or_lock_w_fib_index(fib_node_index_t fib_index, dpo_proto_t proto, + lookup_cast_t cast, lookup_input_t input, lookup_table_t table, dpo_id_t *dpo); extern void lookup_dpo_add_or_lock_w_table_id(u32 table_id, dpo_proto_t proto, + lookup_cast_t cast, lookup_input_t input, lookup_table_t table, dpo_id_t *dpo); diff --git a/src/vnet/dpo/mpls_disposition.c b/src/vnet/dpo/mpls_disposition.c new file mode 100644 index 00000000..5dc33fcf --- /dev/null +++ b/src/vnet/dpo/mpls_disposition.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +/* + * pool of all MPLS Label DPOs + */ +mpls_disp_dpo_t *mpls_disp_dpo_pool; + +static mpls_disp_dpo_t * +mpls_disp_dpo_alloc (void) +{ + mpls_disp_dpo_t *mdd; + + pool_get_aligned(mpls_disp_dpo_pool, mdd, CLIB_CACHE_LINE_BYTES); + memset(mdd, 0, sizeof(*mdd)); + + dpo_reset(&mdd->mdd_dpo); + + return (mdd); +} + +static index_t +mpls_disp_dpo_get_index (mpls_disp_dpo_t *mdd) +{ + return (mdd - mpls_disp_dpo_pool); +} + +index_t +mpls_disp_dpo_create (dpo_proto_t payload_proto, + fib_rpf_id_t rpf_id, + const dpo_id_t *dpo) +{ + mpls_disp_dpo_t *mdd; + + mdd = mpls_disp_dpo_alloc(); + + mdd->mdd_payload_proto = payload_proto; + mdd->mdd_rpf_id = rpf_id; + + dpo_stack(DPO_MPLS_DISPOSITION, + mdd->mdd_payload_proto, + &mdd->mdd_dpo, + dpo); + + return (mpls_disp_dpo_get_index(mdd)); +} + +u8* +format_mpls_disp_dpo (u8 *s, va_list *args) +{ + index_t index = va_arg (*args, index_t); + u32 indent = va_arg (*args, u32); + mpls_disp_dpo_t *mdd; + + mdd = mpls_disp_dpo_get(index); + + s = format(s, "mpls-disposition:[%d]:[%U]", + index, + format_dpo_proto, mdd->mdd_payload_proto); + + s = format(s, "\n%U", format_white_space, indent); + s = format(s, "%U", format_dpo_id, &mdd->mdd_dpo, indent+2); + + return (s); +} + +static void +mpls_disp_dpo_lock (dpo_id_t *dpo) +{ + mpls_disp_dpo_t *mdd; + + mdd = mpls_disp_dpo_get(dpo->dpoi_index); + + mdd->mdd_locks++; +} + +static void +mpls_disp_dpo_unlock (dpo_id_t *dpo) +{ + mpls_disp_dpo_t *mdd; + + mdd = mpls_disp_dpo_get(dpo->dpoi_index); + + mdd->mdd_locks--; + + if (0 == mdd->mdd_locks) + { + dpo_reset(&mdd->mdd_dpo); + pool_put(mpls_disp_dpo_pool, mdd); + } +} + +/** + * @brief A struct to hold tracing information for the MPLS label disposition + * node. 
+ */ +typedef struct mpls_label_disposition_trace_t_ +{ + index_t mdd; +} mpls_label_disposition_trace_t; + +always_inline uword +mpls_label_disposition_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, + u8 payload_is_ip4, + u8 payload_is_ip6) +{ + u32 n_left_from, next_index, * from, * to_next; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + mpls_disp_dpo_t *mdd0, *mdd1; + u32 bi0, mddi0, bi1, mddi1; + vlib_buffer_t * b0, *b1; + u32 next0, next1; + + bi0 = to_next[0] = from[0]; + bi1 = to_next[1] = from[1]; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, STORE); + vlib_prefetch_buffer_header (p3, STORE); + + CLIB_PREFETCH (p2->data, sizeof (ip6_header_t), STORE); + CLIB_PREFETCH (p3->data, sizeof (ip6_header_t), STORE); + } + + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + /* dst lookup was done by ip4 lookup */ + mddi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + mddi1 = vnet_buffer(b1)->ip.adj_index[VLIB_TX]; + mdd0 = mpls_disp_dpo_get(mddi0); + mdd1 = mpls_disp_dpo_get(mddi1); + + if (payload_is_ip4) + { + /* + * decrement the TTL on ingress to the LSP + */ + } + else if (payload_is_ip6) + { + /* + * decrement the TTL on ingress to the LSP + */ + } + + next0 = mdd0->mdd_dpo.dpoi_next_node; + next1 = mdd1->mdd_dpo.dpoi_next_node; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = mdd0->mdd_dpo.dpoi_index; + vnet_buffer(b1)->ip.adj_index[VLIB_TX] = mdd1->mdd_dpo.dpoi_index; + vnet_buffer(b0)->ip.rpf_id = mdd0->mdd_rpf_id; + 
vnet_buffer(b1)->ip.rpf_id = mdd1->mdd_rpf_id; + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + mpls_label_disposition_trace_t *tr = + vlib_add_trace (vm, node, b0, sizeof (*tr)); + + tr->mdd = mddi0; + } + if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED)) + { + mpls_label_disposition_trace_t *tr = + vlib_add_trace (vm, node, b1, sizeof (*tr)); + tr->mdd = mddi1; + } + + vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next, + n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + mpls_disp_dpo_t *mdd0; + vlib_buffer_t * b0; + u32 bi0, mddi0; + u32 next0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + /* dst lookup was done by ip4 lookup */ + mddi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + mdd0 = mpls_disp_dpo_get(mddi0); + + if (payload_is_ip4) + { + /* + * decrement the TTL on ingress to the LSP + */ + } + else if (payload_is_ip6) + { + /* + * decrement the TTL on ingress to the LSP + */ + } + else + { + } + + next0 = mdd0->mdd_dpo.dpoi_next_node; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = mdd0->mdd_dpo.dpoi_index; + vnet_buffer(b0)->ip.rpf_id = mdd0->mdd_rpf_id; + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + mpls_label_disposition_trace_t *tr = + vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->mdd = mddi0; + } + + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +static u8 * +format_mpls_label_disposition_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + CLIB_UNUSED (mpls_label_disposition_trace_t * t); + + t = va_arg (*args, mpls_label_disposition_trace_t *); + + s = format(s, "disp:%d", t->mdd); 
+ return (s); +} + +static uword +ip4_mpls_label_disposition (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (mpls_label_disposition_inline(vm, node, frame, 1, 0)); +} + +VLIB_REGISTER_NODE (ip4_mpls_label_disposition_node) = { + .function = ip4_mpls_label_disposition, + .name = "ip4-mpls-label-disposition", + .vector_size = sizeof (u32), + + .format_trace = format_mpls_label_disposition_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "ip4-drop", + } +}; +VLIB_NODE_FUNCTION_MULTIARCH (ip4_mpls_label_disposition_node, + ip4_mpls_label_disposition) + +static uword +ip6_mpls_label_disposition (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (mpls_label_disposition_inline(vm, node, frame, 0, 1)); +} + +VLIB_REGISTER_NODE (ip6_mpls_label_disposition_node) = { + .function = ip6_mpls_label_disposition, + .name = "ip6-mpls-label-disposition", + .vector_size = sizeof (u32), + + .format_trace = format_mpls_label_disposition_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "ip6-drop", + } +}; +VLIB_NODE_FUNCTION_MULTIARCH (ip6_mpls_label_disposition_node, + ip6_mpls_label_disposition) + +static void +mpls_disp_dpo_mem_show (void) +{ + fib_show_memory_usage("MPLS label", + pool_elts(mpls_disp_dpo_pool), + pool_len(mpls_disp_dpo_pool), + sizeof(mpls_disp_dpo_t)); +} + +const static dpo_vft_t mdd_vft = { + .dv_lock = mpls_disp_dpo_lock, + .dv_unlock = mpls_disp_dpo_unlock, + .dv_format = format_mpls_disp_dpo, + .dv_mem_show = mpls_disp_dpo_mem_show, +}; + +const static char* const mpls_label_disp_ip4_nodes[] = +{ + "ip4-mpls-label-disposition", + NULL, +}; +const static char* const mpls_label_disp_ip6_nodes[] = +{ + "ip6-mpls-label-disposition", + NULL, +}; +const static char* const * const mpls_label_disp_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = mpls_label_disp_ip4_nodes, + [DPO_PROTO_IP6] = mpls_label_disp_ip6_nodes, +}; + + +void +mpls_disp_dpo_module_init (void) +{ + 
dpo_register(DPO_MPLS_DISPOSITION, &mdd_vft, mpls_label_disp_nodes); +} diff --git a/src/vnet/dpo/mpls_disposition.h b/src/vnet/dpo/mpls_disposition.h new file mode 100644 index 00000000..9c015083 --- /dev/null +++ b/src/vnet/dpo/mpls_disposition.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __MPLS_DISP_DPO_H__ +#define __MPLS_DISP_DPO_H__ + +#include +#include +#include +#include + +/** + * A representation of an MPLS label for disposition in the data-path + */ +typedef struct mpls_disp_dpo_t +{ + /** + * Next DPO in the graph + */ + dpo_id_t mdd_dpo; + + /** + * The protocol of the payload/packets that are being encapped + */ + dpo_proto_t mdd_payload_proto; + + /** + * RPF-ID (if this is an mcast disposition) + */ + fib_rpf_id_t mdd_rpf_id; + + /** + * Number of locks/users of the label + */ + u16 mdd_locks; +} mpls_disp_dpo_t; + +/** + * @brief Assert that the MPLS label object is less than a cache line in size. + * Should this get any bigger then we will need to reconsider how many labels + * can be pushed in one object. + */ +_Static_assert((sizeof(mpls_disp_dpo_t) <= CLIB_CACHE_LINE_BYTES), + "MPLS Disposition DPO is larger than one cache line."); + +/** + * @brief Create an MPLS label object + * + * @param payload_proto The protocol of the payload packets that will + * be imposed with this label header. 
+ * @param dpo The parent of the created MPLS label object + */ +extern index_t mpls_disp_dpo_create(dpo_proto_t payload_proto, + fib_rpf_id_t rpf_id, + const dpo_id_t *dpo); + +extern u8* format_mpls_disp_dpo(u8 *s, va_list *args); + + +/* + * Encapsulation violation for fast data-path access + */ +extern mpls_disp_dpo_t *mpls_disp_dpo_pool; + +static inline mpls_disp_dpo_t * +mpls_disp_dpo_get (index_t index) +{ + return (pool_elt_at_index(mpls_disp_dpo_pool, index)); +} + +extern void mpls_disp_dpo_module_init(void); + +#endif diff --git a/src/vnet/dpo/mpls_label_dpo.c b/src/vnet/dpo/mpls_label_dpo.c index be9b2850..4d84b900 100644 --- a/src/vnet/dpo/mpls_label_dpo.c +++ b/src/vnet/dpo/mpls_label_dpo.c @@ -562,7 +562,7 @@ VLIB_REGISTER_NODE (mpls_label_imposition_node) = { .format_trace = format_mpls_label_imposition_trace, .n_next_nodes = 1, .next_nodes = { - [0] = "error-drop", + [0] = "mpls-drop", } }; VLIB_NODE_FUNCTION_MULTIARCH (mpls_label_imposition_node, @@ -584,7 +584,7 @@ VLIB_REGISTER_NODE (ip4_mpls_label_imposition_node) = { .format_trace = format_mpls_label_imposition_trace, .n_next_nodes = 1, .next_nodes = { - [0] = "error-drop", + [0] = "ip4-drop", } }; VLIB_NODE_FUNCTION_MULTIARCH (ip4_mpls_label_imposition_node, @@ -606,7 +606,7 @@ VLIB_REGISTER_NODE (ip6_mpls_label_imposition_node) = { .format_trace = format_mpls_label_imposition_trace, .n_next_nodes = 1, .next_nodes = { - [0] = "error-drop", + [0] = "ip6-drop", } }; VLIB_NODE_FUNCTION_MULTIARCH (ip6_mpls_label_imposition_node, diff --git a/src/vnet/dpo/replicate_dpo.c b/src/vnet/dpo/replicate_dpo.c index e25ceae9..9fdb9a05 100644 --- a/src/vnet/dpo/replicate_dpo.c +++ b/src/vnet/dpo/replicate_dpo.c @@ -17,6 +17,7 @@ #include #include #include +#include #undef REP_DEBUG @@ -106,6 +107,7 @@ replicate_format (index_t repi, dpo_id_t *buckets; u32 i; + repi &= ~MPLS_IS_REPLICATE; rep = replicate_get(repi); vlib_get_combined_counter(&(replicate_main.repm_counters), repi, &to); buckets = 
replicate_get_buckets(rep); @@ -187,6 +189,7 @@ replicate_set_bucket (index_t repi, replicate_t *rep; dpo_id_t *buckets; + repi &= ~MPLS_IS_REPLICATE; rep = replicate_get(repi); buckets = replicate_get_buckets(rep); @@ -199,11 +202,13 @@ int replicate_is_drop (const dpo_id_t *dpo) { replicate_t *rep; + index_t repi; if (DPO_REPLICATE != dpo->dpoi_type) return (0); - rep = replicate_get(dpo->dpoi_index); + repi = dpo->dpoi_index & ~MPLS_IS_REPLICATE; + rep = replicate_get(repi); if (1 == rep->rep_n_buckets) { @@ -218,6 +223,7 @@ replicate_get_bucket (index_t repi, { replicate_t *rep; + repi &= ~MPLS_IS_REPLICATE; rep = replicate_get(repi); return (replicate_get_bucket_i(rep, bucket)); @@ -288,9 +294,11 @@ replicate_multipath_update (const dpo_id_t *dpo, dpo_id_t *tmp_dpo; u32 ii, n_buckets; replicate_t *rep; + index_t repi; ASSERT(DPO_REPLICATE == dpo->dpoi_type); - rep = replicate_get(dpo->dpoi_index); + repi = dpo->dpoi_index & ~MPLS_IS_REPLICATE; + rep = replicate_get(repi); nhs = replicate_multipath_next_hop_fixup(next_hops, rep->rep_proto); n_buckets = vec_len(nhs); @@ -718,7 +726,7 @@ format_replicate_trace (u8 * s, va_list * args) s = format (s, "replicate: %d via %U", t->rep_index, - format_dpo_id, &t->dpo); + format_dpo_id, &t->dpo, 0); return s; } @@ -731,7 +739,7 @@ ip4_replicate (vlib_main_t * vm, } /** - * @brief + * @brief IP4 replication node */ VLIB_REGISTER_NODE (ip4_replicate_node) = { .function = ip4_replicate, @@ -744,7 +752,7 @@ VLIB_REGISTER_NODE (ip4_replicate_node) = { .format_trace = format_replicate_trace, .n_next_nodes = 1, .next_nodes = { - [0] = "error-drop", + [0] = "ip4-drop", }, }; @@ -757,7 +765,7 @@ ip6_replicate (vlib_main_t * vm, } /** - * @brief + * @brief IPv6 replication node */ VLIB_REGISTER_NODE (ip6_replicate_node) = { .function = ip6_replicate, @@ -770,7 +778,33 @@ VLIB_REGISTER_NODE (ip6_replicate_node) = { .format_trace = format_replicate_trace, .n_next_nodes = 1, .next_nodes = { - [0] = "error-drop", + [0] = "ip6-drop", 
+ }, +}; + +static uword +mpls_replicate (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (replicate_inline (vm, node, frame)); +} + +/** + * @brief MPLS replication node + */ +VLIB_REGISTER_NODE (mpls_replicate_node) = { + .function = mpls_replicate, + .name = "mpls-replicate", + .vector_size = sizeof (u32), + + .n_errors = ARRAY_LEN(replicate_dpo_error_strings), + .error_strings = replicate_dpo_error_strings, + + .format_trace = format_replicate_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "mpls-drop", }, }; diff --git a/src/vnet/dpo/replicate_dpo.h b/src/vnet/dpo/replicate_dpo.h index 77273015..7383184a 100644 --- a/src/vnet/dpo/replicate_dpo.h +++ b/src/vnet/dpo/replicate_dpo.h @@ -25,6 +25,7 @@ #include #include #include +#include /** * replicate main @@ -119,6 +120,7 @@ extern replicate_t *replicate_pool; static inline replicate_t* replicate_get (index_t repi) { + repi &= ~MPLS_IS_REPLICATE; return (pool_elt_at_index(replicate_pool, repi)); } diff --git a/src/vnet/ethernet/arp.c b/src/vnet/ethernet/arp.c index c74a097e..dd509193 100644 --- a/src/vnet/ethernet/arp.c +++ b/src/vnet/ethernet/arp.c @@ -507,6 +507,7 @@ arp_update_adjacency (vnet_main_t * vnm, u32 sw_if_index, u32 ai) case IP_LOOKUP_NEXT_PUNT: case IP_LOOKUP_NEXT_LOCAL: case IP_LOOKUP_NEXT_REWRITE: + case IP_LOOKUP_NEXT_MCAST_MIDCHAIN: case IP_LOOKUP_NEXT_MIDCHAIN: case IP_LOOKUP_NEXT_ICMP_ERROR: case IP_LOOKUP_N_NEXT: diff --git a/src/vnet/ethernet/interface.c b/src/vnet/ethernet/interface.c index 335e3f9f..9ac30bc6 100644 --- a/src/vnet/ethernet/interface.c +++ b/src/vnet/ethernet/interface.c @@ -115,7 +115,7 @@ ethernet_build_rewrite (vnet_main_t * vnm, #define _(a,b) case VNET_LINK_##a: type = ETHERNET_TYPE_##b; break _(IP4, IP4); _(IP6, IP6); - _(MPLS, MPLS_UNICAST); + _(MPLS, MPLS); _(ARP, ARP); #undef _ default: diff --git a/src/vnet/ethernet/node.c b/src/vnet/ethernet/node.c index f7787ed2..5305012f 100755 --- a/src/vnet/ethernet/node.c +++ 
b/src/vnet/ethernet/node.c @@ -249,7 +249,7 @@ determine_next_node (ethernet_main_t * em, { *next0 = em->l3_next.input_next_ip6; } - else if (type0 == ETHERNET_TYPE_MPLS_UNICAST) + else if (type0 == ETHERNET_TYPE_MPLS) { *next0 = em->l3_next.input_next_mpls; @@ -1252,7 +1252,7 @@ next_by_ethertype_register (next_by_ethertype_t * l3_next, { l3_next->input_next_ip6 = next_index; } - else if (ethertype == ETHERNET_TYPE_MPLS_UNICAST) + else if (ethertype == ETHERNET_TYPE_MPLS) { l3_next->input_next_mpls = next_index; } diff --git a/src/vnet/ethernet/types.def b/src/vnet/ethernet/types.def index 643f3152..7dab8ee1 100644 --- a/src/vnet/ethernet/types.def +++ b/src/vnet/ethernet/types.def @@ -85,8 +85,8 @@ ethernet_type (0x876D, SECURE_DATA) ethernet_type (0x8808, MAC_CONTROL) ethernet_type (0x8809, SLOW_PROTOCOLS) ethernet_type (0x880B, PPP) -ethernet_type (0x8847, MPLS_UNICAST) -ethernet_type (0x8848, MPLS_MULTICAST) +ethernet_type (0x8847, MPLS) +ethernet_type (0x8848, MPLS_UPSTREAM_ASSIGNED) ethernet_type (0x8863, PPPOE_DISCOVERY) ethernet_type (0x8864, PPPOE_SESSION) ethernet_type (0x886D, INTEL_ANS) diff --git a/src/vnet/fib/fib_api.h b/src/vnet/fib/fib_api.h index f8275317..10d0cb58 100644 --- a/src/vnet/fib/fib_api.h +++ b/src/vnet/fib/fib_api.h @@ -24,6 +24,7 @@ add_del_route_check (fib_protocol_t table_proto, fib_protocol_t next_hop_table_proto, u32 next_hop_table_id, u8 create_missing_tables, + u8 is_rpf_id, u32 * fib_index, u32 * next_hop_fib_index); int @@ -33,10 +34,13 @@ add_del_route_t_handler (u8 is_multipath, u8 is_unreach, u8 is_prohibit, u8 is_local, + u8 is_multicast, u8 is_classify, u32 classify_table_index, u8 is_resolve_host, u8 is_resolve_attached, + u8 is_interface_rx, + u8 is_rpf_id, u32 fib_index, const fib_prefix_t * prefix, u8 next_hop_proto_is_ip4, diff --git a/src/vnet/fib/fib_entry.c b/src/vnet/fib/fib_entry.c index dac1fce9..6f811aa1 100644 --- a/src/vnet/fib/fib_entry.c +++ b/src/vnet/fib/fib_entry.c @@ -75,13 +75,7 @@ 
fib_entry_get_default_chain_type (const fib_entry_t *fib_entry) return (FIB_FORW_CHAIN_TYPE_UNICAST_IP6); case FIB_PROTOCOL_MPLS: if (MPLS_EOS == fib_entry->fe_prefix.fp_eos) - /* - * If the entry being asked is a eos-MPLS label entry, - * then use the payload-protocol field, that we stashed there - * for just this purpose - */ - return (fib_forw_chain_type_from_dpo_proto( - fib_entry->fe_prefix.fp_payload_proto)); + return (FIB_FORW_CHAIN_TYPE_MPLS_EOS); else return (FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS); } @@ -370,6 +364,35 @@ fib_entry_contribute_urpf (fib_node_index_t entry_index, return (fib_path_list_contribute_urpf(fib_entry->fe_parent, urpf)); } +/* + * If the client is request a chain for multicast forwarding then swap + * the chain type to one that can provide such transport. + */ +static fib_forward_chain_type_t +fib_entry_chain_type_mcast_to_ucast (fib_forward_chain_type_t fct) +{ + switch (fct) + { + case FIB_FORW_CHAIN_TYPE_MCAST_IP4: + case FIB_FORW_CHAIN_TYPE_MCAST_IP6: + /* + * we can only transport IP multicast packets if there is an + * LSP. + */ + fct = FIB_FORW_CHAIN_TYPE_MPLS_EOS; + break; + case FIB_FORW_CHAIN_TYPE_MPLS_EOS: + case FIB_FORW_CHAIN_TYPE_UNICAST_IP4: + case FIB_FORW_CHAIN_TYPE_UNICAST_IP6: + case FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS: + case FIB_FORW_CHAIN_TYPE_ETHERNET: + case FIB_FORW_CHAIN_TYPE_NSH: + break; + } + + return (fct); +} + /* * fib_entry_contribute_forwarding * @@ -385,6 +408,11 @@ fib_entry_contribute_forwarding (fib_node_index_t fib_entry_index, fib_entry = fib_entry_get(fib_entry_index); + /* + * mfib children ask for mcast chains. fix these to the appropriate ucast types. 
+ */ + fct = fib_entry_chain_type_mcast_to_ucast(fct); + if (fct == fib_entry_get_default_chain_type(fib_entry)) { dpo_copy(dpo, &fib_entry->fe_lb); @@ -414,6 +442,11 @@ fib_entry_contribute_forwarding (fib_node_index_t fib_entry_index, dpo_copy(dpo, &fed->fd_dpo); } + /* + * don't allow the special index indicating replicate.vs.load-balance + * to escape to the clients + */ + dpo->dpoi_index &= ~MPLS_IS_REPLICATE; } const dpo_id_t * diff --git a/src/vnet/fib/fib_entry.h b/src/vnet/fib/fib_entry.h index a3f75e60..b17a0b64 100644 --- a/src/vnet/fib/fib_entry.h +++ b/src/vnet/fib/fib_entry.h @@ -192,6 +192,11 @@ typedef enum fib_entry_attribute_t_ { * The prefix/address is local to this device */ FIB_ENTRY_ATTRIBUTE_LOCAL, + /** + * The prefix/address is a multicast prefix. + * this aplies only to MPLS. IP multicast is handled by mfib + */ + FIB_ENTRY_ATTRIBUTE_MULTICAST, /** * The prefix/address exempted from loose uRPF check * To be used with caution @@ -200,7 +205,7 @@ typedef enum fib_entry_attribute_t_ { /** * Marker. add new entries before this one. 
*/ - FIB_ENTRY_ATTRIBUTE_LAST = FIB_ENTRY_ATTRIBUTE_URPF_EXEMPT, + FIB_ENTRY_ATTRIBUTE_LAST = FIB_ENTRY_ATTRIBUTE_MULTICAST, } fib_entry_attribute_t; /** @@ -215,7 +220,8 @@ typedef enum fib_entry_attribute_t_ { [FIB_ENTRY_ATTRIBUTE_DROP] = "drop", \ [FIB_ENTRY_ATTRIBUTE_EXCLUSIVE] = "exclusive", \ [FIB_ENTRY_ATTRIBUTE_LOCAL] = "local", \ - [FIB_ENTRY_ATTRIBUTE_URPF_EXEMPT] = "uRPF-exempt" \ + [FIB_ENTRY_ATTRIBUTE_URPF_EXEMPT] = "uRPF-exempt", \ + [FIB_ENTRY_ATTRIBUTE_MULTICAST] = "multicast", \ } #define FOR_EACH_FIB_ATTRIBUTE(_item) \ @@ -232,6 +238,7 @@ typedef enum fib_entry_flag_t_ { FIB_ENTRY_FLAG_LOCAL = (1 << FIB_ENTRY_ATTRIBUTE_LOCAL), FIB_ENTRY_FLAG_IMPORT = (1 << FIB_ENTRY_ATTRIBUTE_IMPORT), FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT = (1 << FIB_ENTRY_ATTRIBUTE_URPF_EXEMPT), + FIB_ENTRY_FLAG_MULTICAST = (1 << FIB_ENTRY_ATTRIBUTE_MULTICAST), } __attribute__((packed)) fib_entry_flag_t; /** @@ -396,7 +403,7 @@ typedef struct fib_entry_t_ { * paint the header straight on without the need to check the packet * type to derive the EOS bit value. */ - dpo_id_t fe_lb; // [FIB_FORW_CHAIN_MPLS_NUM]; + dpo_id_t fe_lb; /** * Vector of source infos. * Most entries will only have 1 source. 
So we optimise for memory usage, diff --git a/src/vnet/fib/fib_entry_src.c b/src/vnet/fib/fib_entry_src.c index aa1d5a24..a700282e 100644 --- a/src/vnet/fib/fib_entry_src.c +++ b/src/vnet/fib/fib_entry_src.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -229,8 +230,6 @@ fib_forward_chain_type_t fib_entry_chain_type_fixup (const fib_entry_t *entry, fib_forward_chain_type_t fct) { - ASSERT(FIB_FORW_CHAIN_TYPE_MPLS_EOS == fct); - /* * The EOS chain is a tricky since one cannot know the adjacency * to link to without knowing what the packets payload protocol @@ -238,6 +237,11 @@ fib_entry_chain_type_fixup (const fib_entry_t *entry, */ fib_forward_chain_type_t dfct; + if (FIB_FORW_CHAIN_TYPE_MPLS_EOS != fct) + { + return (fct); + } + dfct = fib_entry_get_default_chain_type(entry); if (FIB_FORW_CHAIN_TYPE_MPLS_EOS == dfct) @@ -303,7 +307,12 @@ fib_entry_src_collect_forwarding (fib_node_index_t pl_index, * found a matching extension. stack it to obtain the forwarding * info for this path. 
*/ - ctx->next_hops = fib_path_ext_stack(path_ext, ctx->fib_entry, ctx->fct, ctx->next_hops); + ctx->next_hops = + fib_path_ext_stack(path_ext, + ctx->fct, + fib_entry_chain_type_fixup(ctx->fib_entry, + ctx->fct), + ctx->next_hops); } else { @@ -355,6 +364,9 @@ fib_entry_src_collect_forwarding (fib_node_index_t pl_index, fib_entry_chain_type_fixup(ctx->fib_entry, ctx->fct), &nh->path_dpo); + fib_path_stack_mpls_disp(path_index, + ctx->fib_entry->fe_prefix.fp_payload_proto, + &nh->path_dpo); break; } @@ -424,50 +436,70 @@ fib_entry_src_mk_lb (fib_entry_t *fib_entry, /* * first time create */ - flow_hash_config_t fhc; - - fhc = fib_table_get_flow_hash_config(fib_entry->fe_fib_index, - dpo_proto_to_fib(lb_proto)); - dpo_set(dpo_lb, - DPO_LOAD_BALANCE, - lb_proto, - load_balance_create(0, lb_proto, fhc)); + if (esrc->fes_entry_flags & FIB_ENTRY_FLAG_MULTICAST) + { + dpo_set(dpo_lb, + DPO_REPLICATE, + lb_proto, + MPLS_IS_REPLICATE | replicate_create(0, lb_proto)); + } + else + { + flow_hash_config_t fhc; + + fhc = fib_table_get_flow_hash_config(fib_entry->fe_fib_index, + dpo_proto_to_fib(lb_proto)); + dpo_set(dpo_lb, + DPO_LOAD_BALANCE, + lb_proto, + load_balance_create(0, lb_proto, fhc)); + } } - load_balance_multipath_update(dpo_lb, - ctx.next_hops, - fib_entry_calc_lb_flags(&ctx)); - vec_free(ctx.next_hops); - - /* - * if this entry is sourced by the uRPF-exempt source then we - * append the always present local0 interface (index 0) to the - * uRPF list so it is not empty. that way packets pass the loose check. - */ - index_t ui = fib_path_list_get_urpf(esrc->fes_pl); - - if ((fib_entry_is_sourced(fib_entry_get_index(fib_entry), - FIB_SOURCE_URPF_EXEMPT) || - (esrc->fes_entry_flags & FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT))&& - (0 == fib_urpf_check_size(ui))) + if (esrc->fes_entry_flags & FIB_ENTRY_FLAG_MULTICAST) { - /* - * The uRPF list we get from the path-list is shared by all - * other users of the list, but the uRPF exemption applies - * only to this prefix. 
So we need our own list. - */ - ui = fib_urpf_list_alloc_and_lock(); - fib_urpf_list_append(ui, 0); - fib_urpf_list_bake(ui); - load_balance_set_urpf(dpo_lb->dpoi_index, ui); - fib_urpf_list_unlock(ui); + /* + * MPLS multicast + */ + replicate_multipath_update(dpo_lb, ctx.next_hops); } else { - load_balance_set_urpf(dpo_lb->dpoi_index, ui); + load_balance_multipath_update(dpo_lb, + ctx.next_hops, + fib_entry_calc_lb_flags(&ctx)); + vec_free(ctx.next_hops); + + /* + * if this entry is sourced by the uRPF-exempt source then we + * append the always present local0 interface (index 0) to the + * uRPF list so it is not empty. that way packets pass the loose check. + */ + index_t ui = fib_path_list_get_urpf(esrc->fes_pl); + + if ((fib_entry_is_sourced(fib_entry_get_index(fib_entry), + FIB_SOURCE_URPF_EXEMPT) || + (esrc->fes_entry_flags & FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT))&& + (0 == fib_urpf_check_size(ui))) + { + /* + * The uRPF list we get from the path-list is shared by all + * other users of the list, but the uRPF exemption applies + * only to this prefix. So we need our own list. 
+ */ + ui = fib_urpf_list_alloc_and_lock(); + fib_urpf_list_append(ui, 0); + fib_urpf_list_bake(ui); + load_balance_set_urpf(dpo_lb->dpoi_index, ui); + fib_urpf_list_unlock(ui); + } + else + { + load_balance_set_urpf(dpo_lb->dpoi_index, ui); + } + load_balance_set_fib_entry_flags(dpo_lb->dpoi_index, + fib_entry_get_flags_i(fib_entry)); } - load_balance_set_fib_entry_flags(dpo_lb->dpoi_index, - fib_entry_get_flags_i(fib_entry)); } void @@ -887,21 +919,6 @@ fib_entry_src_action_remove (fib_entry_t *fib_entry, return (sflags); } -static inline int -fib_route_recurses_via_self (const fib_prefix_t *prefix, - const fib_route_path_t *rpath) -{ - /* - * not all zeros next hop && - * is recursive path && - * nexthop is same as the route's address - */ - return ((!ip46_address_is_zero(&rpath->frp_addr)) && - (~0 == rpath->frp_sw_if_index) && - (0 == ip46_address_cmp(&rpath->frp_addr, &prefix->fp_addr))); - -} - /* * fib_route_attached_cross_table * @@ -962,14 +979,14 @@ fib_entry_src_flags_2_path_list_flags (fib_entry_flag_t eflags) { plf |= FIB_PATH_LIST_FLAG_DROP; } - if (eflags & FIB_ENTRY_FLAG_LOCAL) - { - plf |= FIB_PATH_LIST_FLAG_LOCAL; - } if (eflags & FIB_ENTRY_FLAG_EXCLUSIVE) { plf |= FIB_PATH_LIST_FLAG_EXCLUSIVE; } + if (eflags & FIB_ENTRY_FLAG_LOCAL) + { + plf |= FIB_PATH_LIST_FLAG_LOCAL; + } return (plf); } @@ -980,25 +997,6 @@ fib_entry_flags_update (const fib_entry_t *fib_entry, fib_path_list_flags_t *pl_flags, fib_entry_src_t *esrc) { - /* - * don't allow the addition of a recursive looped path for prefix - * via itself. - */ - if (fib_route_recurses_via_self(&fib_entry->fe_prefix, rpath)) - { - /* - * force the install of a drop path-list. - * we want the entry to have some path-list, mainly so - * the dodgy path can be rmeoved when the source stops playing - * silly buggers. 
- */ - *pl_flags |= FIB_PATH_LIST_FLAG_DROP; - } - else - { - *pl_flags &= ~FIB_PATH_LIST_FLAG_DROP; - } - if ((esrc->fes_src == FIB_SOURCE_API) || (esrc->fes_src == FIB_SOURCE_CLI)) { diff --git a/src/vnet/fib/fib_internal.h b/src/vnet/fib/fib_internal.h index 2d980bcc..8abc0e07 100644 --- a/src/vnet/fib/fib_internal.h +++ b/src/vnet/fib/fib_internal.h @@ -25,6 +25,7 @@ #undef FIB_DEBUG extern void fib_prefix_from_mpls_label(mpls_label_t label, + mpls_eos_bit_t eos, fib_prefix_t *prf); extern int fib_route_path_cmp(const fib_route_path_t *rpath1, diff --git a/src/vnet/fib/fib_path.c b/src/vnet/fib/fib_path.c index 6b202a97..f81f4170 100644 --- a/src/vnet/fib/fib_path.c +++ b/src/vnet/fib/fib_path.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include @@ -66,6 +68,10 @@ typedef enum fib_path_type_t_ { * deag. Link to a lookup adj in the next table */ FIB_PATH_TYPE_DEAG, + /** + * interface receive. + */ + FIB_PATH_TYPE_INTF_RX, /** * receive. it's for-us. */ @@ -88,6 +94,7 @@ typedef enum fib_path_type_t_ { [FIB_PATH_TYPE_SPECIAL] = "special", \ [FIB_PATH_TYPE_EXCLUSIVE] = "exclusive", \ [FIB_PATH_TYPE_DEAG] = "deag", \ + [FIB_PATH_TYPE_INTF_RX] = "intf-rx", \ [FIB_PATH_TYPE_RECEIVE] = "receive", \ } @@ -220,10 +227,16 @@ typedef struct fib_path_t_ { * The next-hop */ ip46_address_t fp_ip; - /** - * The local label to resolve through. - */ - mpls_label_t fp_local_label; + struct { + /** + * The local label to resolve through. + */ + mpls_label_t fp_local_label; + /** + * The EOS bit of the resolving label + */ + mpls_eos_bit_t fp_eos; + }; } fp_nh; /** * The FIB table index in which to find the next-hop. 
@@ -254,6 +267,10 @@ typedef struct fib_path_t_ { * The FIB index in which to perfom the next lookup */ fib_node_index_t fp_tbl_id; + /** + * The RPF-ID to tag the packets with + */ + fib_rpf_id_t fp_rpf_id; } deag; struct { } special; @@ -273,6 +290,12 @@ typedef struct fib_path_t_ { */ ip46_address_t fp_addr; } receive; + struct { + /** + * The interface on which the packets will be input. + */ + u32 fp_interface; + } intf_rx; }; STRUCT_MARK(path_hash_end); @@ -444,9 +467,11 @@ format_fib_path (u8 * s, va_list * args) case FIB_PATH_TYPE_RECURSIVE: if (FIB_PROTOCOL_MPLS == path->fp_nh_proto) { - s = format (s, "via %U", + s = format (s, "via %U %U", format_mpls_unicast_label, - path->recursive.fp_nh.fp_local_label); + path->recursive.fp_nh.fp_local_label, + format_mpls_eos_bit, + path->recursive.fp_nh.fp_eos); } else { @@ -465,6 +490,7 @@ format_fib_path (u8 * s, va_list * args) break; case FIB_PATH_TYPE_RECEIVE: + case FIB_PATH_TYPE_INTF_RX: case FIB_PATH_TYPE_SPECIAL: case FIB_PATH_TYPE_DEAG: case FIB_PATH_TYPE_EXCLUSIVE: @@ -736,6 +762,7 @@ fib_path_unresolve (fib_path_t *path) break; case FIB_PATH_TYPE_SPECIAL: case FIB_PATH_TYPE_RECEIVE: + case FIB_PATH_TYPE_INTF_RX: case FIB_PATH_TYPE_DEAG: /* * these hold only the path's DPO, which is reset below. 
@@ -754,16 +781,24 @@ fib_path_unresolve (fib_path_t *path) } static fib_forward_chain_type_t -fib_path_proto_to_chain_type (fib_protocol_t proto) +fib_path_to_chain_type (const fib_path_t *path) { - switch (proto) + switch (path->fp_nh_proto) { case FIB_PROTOCOL_IP4: return (FIB_FORW_CHAIN_TYPE_UNICAST_IP4); case FIB_PROTOCOL_IP6: return (FIB_FORW_CHAIN_TYPE_UNICAST_IP6); case FIB_PROTOCOL_MPLS: - return (FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS); + if (FIB_PATH_TYPE_RECURSIVE == path->fp_type && + MPLS_EOS == path->recursive.fp_nh.fp_eos) + { + return (FIB_FORW_CHAIN_TYPE_MPLS_EOS); + } + else + { + return (FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS); /* only a recursive path via an EOS label uses the EOS chain */ + } } return (FIB_FORW_CHAIN_TYPE_UNICAST_IP4); } @@ -793,7 +828,7 @@ fib_path_back_walk_notify (fib_node_t *node, */ fib_path_recursive_adj_update( path, - fib_path_proto_to_chain_type(path->fp_nh_proto), + fib_path_to_chain_type(path), &path->fp_dpo); } if ((FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE & ctx->fnbw_reason) || @@ -931,6 +966,8 @@ FIXME comment path->fp_oper_flags |= FIB_PATH_OPER_FLAG_DROP; } break; + case FIB_PATH_TYPE_INTF_RX: + ASSERT(0); case FIB_PATH_TYPE_DEAG: /* * FIXME When VRF delete is allowed this will need a poke. 
@@ -986,6 +1023,14 @@ fib_path_route_flags_to_cfg_flags (const fib_route_path_t *rpath) cfg_flags |= FIB_PATH_CFG_FLAG_LOCAL; if (rpath->frp_flags & FIB_ROUTE_PATH_ATTACHED) cfg_flags |= FIB_PATH_CFG_FLAG_ATTACHED; + if (rpath->frp_flags & FIB_ROUTE_PATH_INTF_RX) + cfg_flags |= FIB_PATH_CFG_FLAG_INTF_RX; + if (rpath->frp_flags & FIB_ROUTE_PATH_RPF_ID) + cfg_flags |= FIB_PATH_CFG_FLAG_RPF_ID; + if (rpath->frp_flags & FIB_ROUTE_PATH_EXCLUSIVE) + cfg_flags |= FIB_PATH_CFG_FLAG_EXCLUSIVE; + if (rpath->frp_flags & FIB_ROUTE_PATH_DROP) + cfg_flags |= FIB_PATH_CFG_FLAG_DROP; return (cfg_flags); } @@ -998,8 +1043,6 @@ fib_path_route_flags_to_cfg_flags (const fib_route_path_t *rpath) */ fib_node_index_t fib_path_create (fib_node_index_t pl_index, - fib_protocol_t nh_proto, - fib_path_cfg_flags_t flags, const fib_route_path_t *rpath) { fib_path_t *path; @@ -1012,7 +1055,7 @@ fib_path_create (fib_node_index_t pl_index, dpo_reset(&path->fp_dpo); path->fp_pl_index = pl_index; - path->fp_nh_proto = nh_proto; + path->fp_nh_proto = rpath->frp_proto; path->fp_via_fib = FIB_NODE_INDEX_INVALID; path->fp_weight = rpath->frp_weight; if (0 == path->fp_weight) @@ -1023,8 +1066,7 @@ fib_path_create (fib_node_index_t pl_index, */ path->fp_weight = 1; } - path->fp_cfg_flags = flags; - path->fp_cfg_flags |= fib_path_route_flags_to_cfg_flags(rpath); + path->fp_cfg_flags = fib_path_route_flags_to_cfg_flags(rpath); /* * deduce the path's tpye from the parementers and save what is needed. 
@@ -1035,6 +1077,17 @@ fib_path_create (fib_node_index_t pl_index, path->receive.fp_interface = rpath->frp_sw_if_index; path->receive.fp_addr = rpath->frp_addr; } + else if (path->fp_cfg_flags & FIB_PATH_CFG_FLAG_INTF_RX) + { + path->fp_type = FIB_PATH_TYPE_INTF_RX; + path->intf_rx.fp_interface = rpath->frp_sw_if_index; + } + else if (path->fp_cfg_flags & FIB_PATH_CFG_FLAG_RPF_ID) + { + path->fp_type = FIB_PATH_TYPE_DEAG; + path->deag.fp_tbl_id = rpath->frp_fib_index; + path->deag.fp_rpf_id = rpath->frp_rpf_id; + } else if (~0 != rpath->frp_sw_if_index) { if (ip46_address_is_zero(&rpath->frp_addr)) @@ -1069,6 +1122,7 @@ fib_path_create (fib_node_index_t pl_index, if (FIB_PROTOCOL_MPLS == path->fp_nh_proto) { path->recursive.fp_nh.fp_local_label = rpath->frp_local_label; + path->recursive.fp_nh.fp_eos = rpath->frp_eos; } else { @@ -1238,17 +1292,13 @@ fib_path_cmp_i (const fib_path_t *path1, res = ip46_address_cmp(&path1->attached_next_hop.fp_nh, &path2->attached_next_hop.fp_nh); if (0 == res) { - res = vnet_sw_interface_compare( - vnet_get_main(), - path1->attached_next_hop.fp_interface, - path2->attached_next_hop.fp_interface); + res = (path1->attached_next_hop.fp_interface - + path2->attached_next_hop.fp_interface); } break; case FIB_PATH_TYPE_ATTACHED: - res = vnet_sw_interface_compare( - vnet_get_main(), - path1->attached.fp_interface, - path2->attached.fp_interface); + res = (path1->attached.fp_interface - + path2->attached.fp_interface); break; case FIB_PATH_TYPE_RECURSIVE: res = ip46_address_cmp(&path1->recursive.fp_nh, @@ -1261,6 +1311,13 @@ fib_path_cmp_i (const fib_path_t *path1, break; case FIB_PATH_TYPE_DEAG: res = (path1->deag.fp_tbl_id - path2->deag.fp_tbl_id); + if (0 == res) + { + res = (path1->deag.fp_rpf_id - path2->deag.fp_rpf_id); + } + break; + case FIB_PATH_TYPE_INTF_RX: + res = (path1->intf_rx.fp_interface - path2->intf_rx.fp_interface); break; case FIB_PATH_TYPE_SPECIAL: case FIB_PATH_TYPE_RECEIVE: @@ -1336,22 +1393,22 @@ 
fib_path_cmp_w_route_path (fib_node_index_t path_index, &rpath->frp_addr); if (0 == res) { - res = vnet_sw_interface_compare( - vnet_get_main(), - path->attached_next_hop.fp_interface, - rpath->frp_sw_if_index); + res = (path->attached_next_hop.fp_interface - + rpath->frp_sw_if_index); } break; case FIB_PATH_TYPE_ATTACHED: - res = vnet_sw_interface_compare( - vnet_get_main(), - path->attached.fp_interface, - rpath->frp_sw_if_index); + res = (path->attached.fp_interface - rpath->frp_sw_if_index); break; case FIB_PATH_TYPE_RECURSIVE: if (FIB_PROTOCOL_MPLS == path->fp_nh_proto) { res = path->recursive.fp_nh.fp_local_label - rpath->frp_local_label; + + if (res == 0) + { + res = path->recursive.fp_nh.fp_eos - rpath->frp_eos; + } } else { @@ -1364,9 +1421,16 @@ fib_path_cmp_w_route_path (fib_node_index_t path_index, res = (path->recursive.fp_tbl_id - rpath->frp_fib_index); } break; + case FIB_PATH_TYPE_INTF_RX: + res = (path->intf_rx.fp_interface - rpath->frp_sw_if_index); + break; case FIB_PATH_TYPE_DEAG: res = (path->deag.fp_tbl_id - rpath->frp_fib_index); - break; + if (0 == res) + { + res = (path->deag.fp_rpf_id - rpath->frp_rpf_id); + } + break; case FIB_PATH_TYPE_SPECIAL: case FIB_PATH_TYPE_RECEIVE: case FIB_PATH_TYPE_EXCLUSIVE: @@ -1465,6 +1529,7 @@ fib_path_recursive_loop_detect (fib_node_index_t path_index, case FIB_PATH_TYPE_SPECIAL: case FIB_PATH_TYPE_DEAG: case FIB_PATH_TYPE_RECEIVE: + case FIB_PATH_TYPE_INTF_RX: case FIB_PATH_TYPE_EXCLUSIVE: /* * these path types cannot be part of a loop, since they are the leaves @@ -1563,7 +1628,9 @@ fib_path_resolve (fib_node_index_t path_index) if (FIB_PROTOCOL_MPLS == path->fp_nh_proto) { - fib_prefix_from_mpls_label(path->recursive.fp_nh.fp_local_label, &pfx); + fib_prefix_from_mpls_label(path->recursive.fp_nh.fp_local_label, + path->recursive.fp_nh.fp_eos, + &pfx); } else { @@ -1592,7 +1659,7 @@ fib_path_resolve (fib_node_index_t path_index) */ fib_path_recursive_adj_update( path, - 
fib_path_proto_to_chain_type(path->fp_nh_proto), + fib_path_to_chain_type(path), &path->fp_dpo); break; @@ -1605,16 +1672,25 @@ fib_path_resolve (fib_node_index_t path_index) drop_dpo_get(fib_proto_to_dpo(path->fp_nh_proto))); break; case FIB_PATH_TYPE_DEAG: + { /* * Resolve via a lookup DPO. * FIXME. control plane should add routes with a table ID */ - lookup_dpo_add_or_lock_w_fib_index(path->deag.fp_tbl_id, - fib_proto_to_dpo(path->fp_nh_proto), - LOOKUP_INPUT_DST_ADDR, - LOOKUP_TABLE_FROM_CONFIG, - &path->fp_dpo); + lookup_cast_t cast; + + cast = (path->fp_cfg_flags & FIB_PATH_CFG_FLAG_RPF_ID ? + LOOKUP_MULTICAST : + LOOKUP_UNICAST); + + lookup_dpo_add_or_lock_w_fib_index(path->deag.fp_tbl_id, + fib_proto_to_dpo(path->fp_nh_proto), + cast, + LOOKUP_INPUT_DST_ADDR, + LOOKUP_TABLE_FROM_CONFIG, + &path->fp_dpo); break; + } case FIB_PATH_TYPE_RECEIVE: /* * Resolve via a receive DPO. @@ -1624,6 +1700,15 @@ fib_path_resolve (fib_node_index_t path_index) &path->receive.fp_addr, &path->fp_dpo); break; + case FIB_PATH_TYPE_INTF_RX: { + /* + * Resolve via a receive DPO. 
+ */ + interface_dpo_add_or_lock(fib_proto_to_dpo(path->fp_nh_proto), + path->intf_rx.fp_interface, + &path->fp_dpo); + break; + } case FIB_PATH_TYPE_EXCLUSIVE: /* * Resolve via the user provided DPO @@ -1652,6 +1737,7 @@ fib_path_get_resolving_interface (fib_node_index_t path_index) return (path->receive.fp_interface); case FIB_PATH_TYPE_RECURSIVE: return (fib_entry_get_resolving_interface(path->fp_via_fib)); + case FIB_PATH_TYPE_INTF_RX: case FIB_PATH_TYPE_SPECIAL: case FIB_PATH_TYPE_DEAG: case FIB_PATH_TYPE_EXCLUSIVE: @@ -1743,6 +1829,7 @@ fib_path_contribute_urpf (fib_node_index_t path_index, case FIB_PATH_TYPE_DEAG: case FIB_PATH_TYPE_RECEIVE: + case FIB_PATH_TYPE_INTF_RX: /* * these path types don't link to an adj */ @@ -1750,6 +1837,44 @@ fib_path_contribute_urpf (fib_node_index_t path_index, } } +void +fib_path_stack_mpls_disp (fib_node_index_t path_index, + dpo_proto_t payload_proto, + dpo_id_t *dpo) +{ + fib_path_t *path; + + path = fib_path_get(path_index); + + ASSERT(path); + + switch (path->fp_type) + { + case FIB_PATH_TYPE_DEAG: + { + dpo_id_t tmp = DPO_INVALID; + + dpo_copy(&tmp, dpo); + dpo_set(dpo, + DPO_MPLS_DISPOSITION, + payload_proto, + mpls_disp_dpo_create(payload_proto, + path->deag.fp_rpf_id, + &tmp)); + dpo_reset(&tmp); + break; + } + case FIB_PATH_TYPE_RECEIVE: + case FIB_PATH_TYPE_ATTACHED: + case FIB_PATH_TYPE_ATTACHED_NEXT_HOP: + case FIB_PATH_TYPE_RECURSIVE: + case FIB_PATH_TYPE_INTF_RX: + case FIB_PATH_TYPE_EXCLUSIVE: + case FIB_PATH_TYPE_SPECIAL: + break; + } +} + void fib_path_contribute_forwarding (fib_node_index_t path_index, fib_forward_chain_type_t fct, @@ -1769,7 +1894,7 @@ fib_path_contribute_forwarding (fib_node_index_t path_index, * This then represents the path's 'native' protocol; IP. * For all others will need to go find something else. 
*/ - if (fib_path_proto_to_chain_type(path->fp_nh_proto) == fct) + if (fib_path_to_chain_type(path) == fct) { dpo_copy(dpo, &path->fp_dpo); } @@ -1813,10 +1938,10 @@ fib_path_contribute_forwarding (fib_node_index_t path_index, case FIB_FORW_CHAIN_TYPE_UNICAST_IP4: case FIB_FORW_CHAIN_TYPE_UNICAST_IP6: case FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS: - fib_path_recursive_adj_update(path, fct, dpo); - break; case FIB_FORW_CHAIN_TYPE_MCAST_IP4: case FIB_FORW_CHAIN_TYPE_MCAST_IP6: + fib_path_recursive_adj_update(path, fct, dpo); + break; case FIB_FORW_CHAIN_TYPE_ETHERNET: case FIB_FORW_CHAIN_TYPE_NSH: ASSERT(0); @@ -1829,13 +1954,14 @@ fib_path_contribute_forwarding (fib_node_index_t path_index, case FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS: lookup_dpo_add_or_lock_w_table_id(MPLS_FIB_DEFAULT_TABLE_ID, DPO_PROTO_MPLS, + LOOKUP_UNICAST, LOOKUP_INPUT_DST_ADDR, LOOKUP_TABLE_FROM_CONFIG, dpo); break; + case FIB_FORW_CHAIN_TYPE_MPLS_EOS: case FIB_FORW_CHAIN_TYPE_UNICAST_IP4: case FIB_FORW_CHAIN_TYPE_UNICAST_IP6: - case FIB_FORW_CHAIN_TYPE_MPLS_EOS: dpo_copy(dpo, &path->fp_dpo); break; case FIB_FORW_CHAIN_TYPE_MCAST_IP4: @@ -1870,7 +1996,7 @@ fib_path_contribute_forwarding (fib_node_index_t path_index, ai = adj_mcast_add_or_lock(path->fp_nh_proto, fib_forw_chain_type_to_link_type(fct), path->attached.fp_interface); - dpo_set(dpo, DPO_ADJACENCY_MCAST, + dpo_set(dpo, DPO_ADJACENCY, fib_forw_chain_type_to_dpo_proto(fct), ai); adj_unlock(ai); @@ -1878,6 +2004,14 @@ fib_path_contribute_forwarding (fib_node_index_t path_index, break; } break; + case FIB_PATH_TYPE_INTF_RX: + /* + * Create the adj needed for sending IP multicast traffic + */ + interface_dpo_add_or_lock(fib_forw_chain_type_to_dpo_proto(fct), + path->attached.fp_interface, + dpo); + break; case FIB_PATH_TYPE_RECEIVE: case FIB_PATH_TYPE_SPECIAL: dpo_copy(dpo, &path->fp_dpo); diff --git a/src/vnet/fib/fib_path.h b/src/vnet/fib/fib_path.h index 14efc1ab..334be6f5 100644 --- a/src/vnet/fib/fib_path.h +++ b/src/vnet/fib/fib_path.h @@ -69,6 
+69,14 @@ typedef enum fib_path_cfg_attribute_t_ { /** * The path is a for-us path */ + FIB_PATH_CFG_ATTRIBUTE_INTF_RX, + /** + * The path is a deag with rpf-id + */ + FIB_PATH_CFG_ATTRIBUTE_RPF_ID, + /** + * The path is an interface recieve + */ FIB_PATH_CFG_ATTRIBUTE_LOCAL, /** * Marker. Add new types before this one, then update it. @@ -88,6 +96,8 @@ typedef enum fib_path_cfg_attribute_t_ { [FIB_PATH_CFG_ATTRIBUTE_RESOLVE_ATTACHED] = "resolve-attached", \ [FIB_PATH_CFG_ATTRIBUTE_LOCAL] = "local", \ [FIB_PATH_CFG_ATTRIBUTE_ATTACHED] = "attached", \ + [FIB_PATH_CFG_ATTRIBUTE_INTF_RX] = "interface-rx", \ + [FIB_PATH_CFG_ATTRIBUTE_RPF_ID] = "rpf-id", \ } #define FOR_EACH_FIB_PATH_CFG_ATTRIBUTE(_item) \ @@ -106,6 +116,8 @@ typedef enum fib_path_cfg_flags_t_ { FIB_PATH_CFG_FLAG_RESOLVE_ATTACHED = (1 << FIB_PATH_CFG_ATTRIBUTE_RESOLVE_ATTACHED), FIB_PATH_CFG_FLAG_LOCAL = (1 << FIB_PATH_CFG_ATTRIBUTE_LOCAL), FIB_PATH_CFG_FLAG_ATTACHED = (1 << FIB_PATH_CFG_ATTRIBUTE_ATTACHED), + FIB_PATH_CFG_FLAG_INTF_RX = (1 << FIB_PATH_CFG_ATTRIBUTE_INTF_RX), + FIB_PATH_CFG_FLAG_RPF_ID = (1 << FIB_PATH_CFG_ATTRIBUTE_RPF_ID), } __attribute__ ((packed)) fib_path_cfg_flags_t; @@ -117,8 +129,6 @@ extern u8 *fib_path_adj_format(fib_node_index_t pi, extern u8 * format_fib_path(u8 * s, va_list * args); extern fib_node_index_t fib_path_create(fib_node_index_t pl_index, - fib_protocol_t nh_proto, - fib_path_cfg_flags_t flags, const fib_route_path_t *path); extern fib_node_index_t fib_path_create_special(fib_node_index_t pl_index, fib_protocol_t nh_proto, @@ -145,6 +155,9 @@ extern load_balance_path_t * fib_path_append_nh_for_multipath_hash( fib_node_index_t path_index, fib_forward_chain_type_t fct, load_balance_path_t *hash_key); +extern void fib_path_stack_mpls_disp(fib_node_index_t path_index, + dpo_proto_t payload_proto, + dpo_id_t *dpo); extern void fib_path_contribute_forwarding(fib_node_index_t path_index, fib_forward_chain_type_t type, dpo_id_t *dpo); diff --git 
a/src/vnet/fib/fib_path_ext.c b/src/vnet/fib/fib_path_ext.c index f75b5626..08293bcf 100644 --- a/src/vnet/fib/fib_path_ext.c +++ b/src/vnet/fib/fib_path_ext.c @@ -103,8 +103,8 @@ fib_path_ext_is_imp_null (fib_path_ext_t *path_ext) load_balance_path_t * fib_path_ext_stack (fib_path_ext_t *path_ext, - const fib_entry_t *entry, fib_forward_chain_type_t child_fct, + fib_forward_chain_type_t imp_null_fct, load_balance_path_t *nhs) { fib_forward_chain_type_t parent_fct; @@ -129,7 +129,7 @@ fib_path_ext_stack (fib_path_ext_t *path_ext, */ if (fib_path_ext_is_imp_null(path_ext)) { - parent_fct = fib_entry_chain_type_fixup(entry, child_fct); + parent_fct = imp_null_fct; } else { diff --git a/src/vnet/fib/fib_path_ext.h b/src/vnet/fib/fib_path_ext.h index cf8f8df0..d617700d 100644 --- a/src/vnet/fib/fib_path_ext.h +++ b/src/vnet/fib/fib_path_ext.h @@ -18,6 +18,7 @@ #include #include +#include /** * A path extension is a per-entry addition to the forwarding information @@ -61,8 +62,8 @@ extern void fib_path_ext_resolve(fib_path_ext_t *path_ext, fib_node_index_t path_list_index); extern load_balance_path_t *fib_path_ext_stack(fib_path_ext_t *path_ext, - const struct fib_entry_t_ *entry, fib_forward_chain_type_t fct, + fib_forward_chain_type_t imp_null_fct, load_balance_path_t *nhs); #endif diff --git a/src/vnet/fib/fib_path_list.c b/src/vnet/fib/fib_path_list.c index b9a391b3..ea6565dd 100644 --- a/src/vnet/fib/fib_path_list.c +++ b/src/vnet/fib/fib_path_list.c @@ -40,13 +40,6 @@ typedef struct fib_path_list_t_ { */ fib_path_list_flags_t fpl_flags; - /** - * The next-hop protocol for the paths in this path list. - * Note that fixing the proto here means we don't support a mix of - * v4 and v6 paths. ho hum. - */ - fib_protocol_t fpl_nh_proto; - /** * Vector of paths indicies for all configured paths. * For shareable path-lists this list MUST not change. 
@@ -57,6 +50,11 @@ typedef struct fib_path_list_t_ { * the RPF list calculated for this path list */ fib_node_index_t fpl_urpf; + + /** + * Hash table of paths. valid only with INDEXED flag + */ + uword *fpl_db; } fib_path_list_t; /* @@ -131,7 +129,6 @@ format_fib_path_list (u8 * s, va_list * args) s = format (s, " index:%u", fib_path_list_get_index(path_list)); s = format (s, " locks:%u", path_list->fpl_node.fn_locks); - s = format (s, " proto:%U", format_fib_protocol, path_list->fpl_nh_proto); if (FIB_PATH_LIST_FLAG_NONE != path_list->fpl_flags) { @@ -155,26 +152,6 @@ format_fib_path_list (u8 * s, va_list * args) return (s); } -u8 * -fib_path_list_adjs_format (fib_node_index_t path_list_index, - u32 indent, - u8 * s) -{ - fib_path_list_t *path_list; - u32 i; - - path_list = fib_path_list_get(path_list_index); - - vec_foreach_index (i, path_list->fpl_paths) - { - s = fib_path_adj_format(path_list->fpl_paths[i], - indent, s); - } - - return (s); -} - - u8 * fib_path_list_format (fib_node_index_t path_list_index, u8 * s) @@ -648,27 +625,6 @@ fib_path_list_is_looped (fib_node_index_t path_list_index) return (path_list->fpl_flags & FIB_PATH_LIST_FLAG_LOOPED); } -static fib_path_cfg_flags_t -fib_path_list_flags_2_path_flags (fib_path_list_flags_t plf) -{ - fib_path_cfg_flags_t pf = FIB_PATH_CFG_FLAG_NONE; - - if (plf & FIB_PATH_LIST_FLAG_LOCAL) - { - pf |= FIB_PATH_CFG_FLAG_LOCAL; - } - if (plf & FIB_PATH_LIST_FLAG_DROP) - { - pf |= FIB_PATH_CFG_FLAG_DROP; - } - if (plf & FIB_PATH_LIST_FLAG_EXCLUSIVE) - { - pf |= FIB_PATH_CFG_FLAG_EXCLUSIVE; - } - - return (pf); -} - static fib_path_list_flags_t fib_path_list_flags_fixup (fib_path_list_flags_t flags) { @@ -695,18 +651,15 @@ fib_path_list_create (fib_path_list_flags_t flags, flags = fib_path_list_flags_fixup(flags); path_list = fib_path_list_alloc(&path_list_index); path_list->fpl_flags = flags; - /* - * we'll assume for now all paths are the same next-hop protocol - */ - path_list->fpl_nh_proto = rpaths[0].frp_proto; - 
vec_foreach_index(i, rpaths) + if (NULL != rpaths) { - vec_add1(path_list->fpl_paths, - fib_path_create(path_list_index, - path_list->fpl_nh_proto, - fib_path_list_flags_2_path_flags(flags), - &rpaths[i])); + vec_foreach_index(i, rpaths) + { + vec_add1(path_list->fpl_paths, + fib_path_create(path_list_index, + &rpaths[i])); + } } /* @@ -748,6 +701,27 @@ fib_path_list_create (fib_path_list_flags_t flags, return (path_list_index); } +static fib_path_cfg_flags_t +fib_path_list_flags_2_path_flags (fib_path_list_flags_t plf) +{ + fib_path_cfg_flags_t pf = FIB_PATH_CFG_FLAG_NONE; + + if (plf & FIB_PATH_LIST_FLAG_DROP) + { + pf |= FIB_PATH_CFG_FLAG_DROP; + } + if (plf & FIB_PATH_LIST_FLAG_EXCLUSIVE) + { + pf |= FIB_PATH_CFG_FLAG_EXCLUSIVE; + } + if (plf & FIB_PATH_LIST_FLAG_LOCAL) + { + pf |= FIB_PATH_CFG_FLAG_LOCAL; + } + + return (pf); +} + fib_node_index_t fib_path_list_create_special (fib_protocol_t nh_proto, fib_path_list_flags_t flags, @@ -758,11 +732,10 @@ fib_path_list_create_special (fib_protocol_t nh_proto, path_list = fib_path_list_alloc(&path_list_index); path_list->fpl_flags = flags; - path_list->fpl_nh_proto = nh_proto; path_index = fib_path_create_special(path_list_index, - path_list->fpl_nh_proto, + nh_proto, fib_path_list_flags_2_path_flags(flags), dpo); vec_add1(path_list->fpl_paths, path_index); @@ -775,6 +748,30 @@ fib_path_list_create_special (fib_protocol_t nh_proto, return (path_list_index); } +/* + * return the index info the path-lists's vector of paths, of the matching path. 
+ * ~0 if not found + */ +u32 +fib_path_list_find_rpath (fib_node_index_t path_list_index, + const fib_route_path_t *rpath) +{ + fib_path_list_t *path_list; + u32 ii; + + path_list = fib_path_list_get(path_list_index); + + vec_foreach_index (ii, path_list->fpl_paths) + { + if (!fib_path_cmp_w_route_path(path_list->fpl_paths[ii], rpath)) + { + return (ii); + } + } + return (~0); +} + + /* * fib_path_list_copy_and_path_add * @@ -782,13 +779,62 @@ fib_path_list_create_special (fib_protocol_t nh_proto, * The path-list returned could either have been newly created, or * can be a shared path-list from the data-base. */ +fib_node_index_t +fib_path_list_path_add (fib_node_index_t path_list_index, + const fib_route_path_t *rpaths) +{ + fib_node_index_t new_path_index, *orig_path_index; + fib_path_list_t *path_list; + + /* + * alloc the new list before we retrieve the old one, lest + * the alloc result in a realloc + */ + path_list = fib_path_list_get(path_list_index); + + ASSERT(1 == vec_len(rpaths)); + ASSERT(!(path_list->fpl_flags & FIB_PATH_LIST_FLAG_SHARED)); + + FIB_PATH_LIST_DBG(orig_path_list, "path-add"); + + new_path_index = fib_path_create(path_list_index, + rpaths); + + vec_foreach (orig_path_index, path_list->fpl_paths) + { + /* + * don't add duplicate paths + */ + if (0 == fib_path_cmp(new_path_index, *orig_path_index)) + { + return (*orig_path_index); + } + } + + /* + * Add the new path - no sort, no sharing, no key.. + */ + vec_add1(path_list->fpl_paths, new_path_index); + + FIB_PATH_LIST_DBG(path_list, "path-added"); + + /* + * no shared path list requested. resolve and use the one + * just created. 
+ */ + fib_path_resolve(new_path_index); + + return (new_path_index); +} + fib_node_index_t fib_path_list_copy_and_path_add (fib_node_index_t orig_path_list_index, - fib_path_list_flags_t flags, - const fib_route_path_t *rpaths) + fib_path_list_flags_t flags, + const fib_route_path_t *rpaths) { fib_node_index_t path_index, new_path_index, *orig_path_index; fib_path_list_t *path_list, *orig_path_list; + fib_node_index_t exist_path_list_index; fib_node_index_t path_list_index; fib_node_index_t pi; @@ -806,13 +852,11 @@ fib_path_list_copy_and_path_add (fib_node_index_t orig_path_list_index, flags = fib_path_list_flags_fixup(flags); path_list->fpl_flags = flags; - path_list->fpl_nh_proto = orig_path_list->fpl_nh_proto; + vec_validate(path_list->fpl_paths, vec_len(orig_path_list->fpl_paths)); pi = 0; new_path_index = fib_path_create(path_list_index, - path_list->fpl_nh_proto, - fib_path_list_flags_2_path_flags(flags), rpaths); vec_foreach (orig_path_index, orig_path_list->fpl_paths) @@ -845,46 +889,79 @@ fib_path_list_copy_and_path_add (fib_node_index_t orig_path_list_index, FIB_PATH_LIST_DBG(path_list, "path-added"); /* - * If a shared path list is requested, consult the DB for a match + * check for a matching path-list in the DB. + * If we find one then we can return the existing one and destroy the + * new one just created. */ - if (path_list->fpl_flags & FIB_PATH_LIST_FLAG_SHARED) + exist_path_list_index = fib_path_list_db_find(path_list); + if (FIB_NODE_INDEX_INVALID != exist_path_list_index) { - fib_node_index_t exist_path_list_index; - /* - * check for a matching path-list in the DB. - * If we find one then we can return the existing one and destroy the - * new one just created. 
- */ - exist_path_list_index = fib_path_list_db_find(path_list); - if (FIB_NODE_INDEX_INVALID != exist_path_list_index) - { - fib_path_list_destroy(path_list); + fib_path_list_destroy(path_list); - path_list_index = exist_path_list_index; - } - else - { - /* - * if there was not a matching path-list, then this - * new one will need inserting into the DB and resolving. - */ - fib_path_list_db_insert(path_list_index); - - path_list = fib_path_list_resolve(path_list); - } + path_list_index = exist_path_list_index; } else { - /* - * no shared path list requested. resolve and use the one - * just created. - */ - path_list = fib_path_list_resolve(path_list); + /* + * if there was not a matching path-list, then this + * new one will need inserting into the DB and resolving. + */ + fib_path_list_db_insert(path_list_index); + + path_list = fib_path_list_resolve(path_list); } return (path_list_index); } +/* + * fib_path_list_path_remove + */ +fib_node_index_t +fib_path_list_path_remove (fib_node_index_t path_list_index, + const fib_route_path_t *rpaths) +{ + fib_node_index_t match_path_index, tmp_path_index; + fib_path_list_t *path_list; + fib_node_index_t pi; + + path_list = fib_path_list_get(path_list_index); + + ASSERT(1 == vec_len(rpaths)); + ASSERT(!(path_list->fpl_flags & FIB_PATH_LIST_FLAG_SHARED)); + + FIB_PATH_LIST_DBG(orig_path_list, "path-remove"); + + /* + * create a representation of the path to be removed, so it + * can be used as a comparison object during the copy. 
+ */ + tmp_path_index = fib_path_create(path_list_index, + rpaths); + match_path_index = FIB_NODE_INDEX_INVALID; + + vec_foreach_index (pi, path_list->fpl_paths) + { + if (0 == fib_path_cmp(tmp_path_index, + path_list->fpl_paths[pi])) + { + /* + * match - remove it + */ + match_path_index = path_list->fpl_paths[pi]; + fib_path_destroy(match_path_index); + vec_del1(path_list->fpl_paths, pi); + } + } + + /* + * done with the temporary now + */ + fib_path_destroy(tmp_path_index); + + return (match_path_index); +} + /* * fib_path_list_copy_and_path_remove * @@ -911,7 +988,6 @@ fib_path_list_copy_and_path_remove (fib_node_index_t orig_path_list_index, FIB_PATH_LIST_DBG(orig_path_list, "copy-remove"); path_list->fpl_flags = flags; - path_list->fpl_nh_proto = orig_path_list->fpl_nh_proto; /* * allocate as many paths as we might need in one go, rather than * using vec_add to do a few at a time. @@ -927,8 +1003,6 @@ fib_path_list_copy_and_path_remove (fib_node_index_t orig_path_list_index, * can be used as a comparison object during the copy. */ tmp_path_index = fib_path_create(path_list_index, - path_list->fpl_nh_proto, - fib_path_list_flags_2_path_flags(flags), rpaths); vec_foreach (orig_path_index, orig_path_list->fpl_paths) diff --git a/src/vnet/fib/fib_path_list.h b/src/vnet/fib/fib_path_list.h index b4971add..9d246211 100644 --- a/src/vnet/fib/fib_path_list.h +++ b/src/vnet/fib/fib_path_list.h @@ -38,6 +38,11 @@ typedef enum fib_path_list_attribute_t_ { * be searched for each route update. */ FIB_PATH_LIST_ATTRIBUTE_SHARED = FIB_PATH_LIST_ATTRIBUTE_FIRST, + /** + * Indexed means the path-list keeps a hash table of all paths for + * fast lookup. The lookup result is the fib_node_index of the path. + */ + FIB_PATH_LIST_ATTRIBUTE_INDEXED, /** * explicit drop path-list. Used when the entry source needs to * force a drop, despite the fact the path info is present. 
@@ -73,6 +78,7 @@ typedef enum fib_path_list_attribute_t_ { typedef enum fib_path_list_flags_t_ { FIB_PATH_LIST_FLAG_NONE = 0, FIB_PATH_LIST_FLAG_SHARED = (1 << FIB_PATH_LIST_ATTRIBUTE_SHARED), + FIB_PATH_LIST_FLAG_INDEXED = (1 << FIB_PATH_LIST_ATTRIBUTE_INDEXED), FIB_PATH_LIST_FLAG_DROP = (1 << FIB_PATH_LIST_ATTRIBUTE_DROP), FIB_PATH_LIST_FLAG_LOCAL = (1 << FIB_PATH_LIST_ATTRIBUTE_LOCAL), FIB_PATH_LIST_FLAG_EXCLUSIVE = (1 << FIB_PATH_LIST_ATTRIBUTE_EXCLUSIVE), @@ -83,10 +89,11 @@ typedef enum fib_path_list_flags_t_ { #define FIB_PATH_LIST_ATTRIBUTES { \ [FIB_PATH_LIST_ATTRIBUTE_SHARED] = "shared", \ + [FIB_PATH_LIST_ATTRIBUTE_INDEXED] = "indexed", \ [FIB_PATH_LIST_ATTRIBUTE_RESOLVED] = "resolved", \ [FIB_PATH_LIST_ATTRIBUTE_DROP] = "drop", \ [FIB_PATH_LIST_ATTRIBUTE_EXCLUSIVE] = "exclusive", \ - [FIB_PATH_LIST_ATTRIBUTE_LOCAL] = "local", \ + [FIB_PATH_LIST_ATTRIBUTE_LOCAL] = "local", \ [FIB_PATH_LIST_ATTRIBUTE_LOOPED] = "looped", \ [FIB_PATH_LIST_ATTRIBUTE_NO_URPF] = "no-uRPF", \ } @@ -110,6 +117,13 @@ extern fib_node_index_t fib_path_list_copy_and_path_remove( fib_node_index_t pl_index, fib_path_list_flags_t flags, const fib_route_path_t *path); +extern fib_node_index_t fib_path_list_path_add ( + fib_node_index_t path_list_index, + const fib_route_path_t *rpaths); +extern fib_node_index_t fib_path_list_path_remove ( + fib_node_index_t path_list_index, + const fib_route_path_t *rpaths); + extern u32 fib_path_list_get_n_paths(fib_node_index_t pl_index); extern void fib_path_list_contribute_forwarding(fib_node_index_t path_list_index, @@ -137,11 +151,11 @@ extern int fib_path_list_is_looped(fib_node_index_t path_list_index); extern fib_protocol_t fib_path_list_get_proto(fib_node_index_t path_list_index); extern u8 * fib_path_list_format(fib_node_index_t pl_index, u8 * s); -extern u8 * fib_path_list_adjs_format(fib_node_index_t pl_index, - u32 indent, - u8 * s); extern index_t fib_path_list_lb_map_add_or_lock(fib_node_index_t pl_index, const fib_node_index_t *pis); 
+extern u32 fib_path_list_find_rpath (fib_node_index_t path_list_index, + const fib_route_path_t *rpath); + /** * A callback function type for walking a path-list's paths */ diff --git a/src/vnet/fib/fib_table.c b/src/vnet/fib/fib_table.c index 6c3162e7..b31f35e3 100644 --- a/src/vnet/fib/fib_table.c +++ b/src/vnet/fib/fib_table.c @@ -475,8 +475,21 @@ fib_table_entry_special_remove (u32 fib_index, */ static void fib_table_route_path_fixup (const fib_prefix_t *prefix, + fib_entry_flag_t eflags, fib_route_path_t *path) { + /* + * not all zeros next hop && + * is recursive path && + * nexthop is same as the route's address + */ + if ((!ip46_address_is_zero(&path->frp_addr)) && + (~0 == path->frp_sw_if_index) && + (0 == ip46_address_cmp(&path->frp_addr, &prefix->fp_addr))) + { + /* Prefix recurses via itse;f */ + path->frp_flags |= FIB_ROUTE_PATH_DROP; + } if (fib_prefix_is_host(prefix) && ip46_address_is_zero(&path->frp_addr) && path->frp_sw_if_index != ~0) @@ -484,7 +497,19 @@ fib_table_route_path_fixup (const fib_prefix_t *prefix, path->frp_addr = prefix->fp_addr; path->frp_flags |= FIB_ROUTE_PATH_ATTACHED; } -} + if (eflags & FIB_ENTRY_FLAG_DROP) + { + path->frp_flags |= FIB_ROUTE_PATH_DROP; + } + if (eflags & FIB_ENTRY_FLAG_LOCAL) + { + path->frp_flags |= FIB_ROUTE_PATH_LOCAL; + } + if (eflags & FIB_ENTRY_FLAG_EXCLUSIVE) + { + path->frp_flags |= FIB_ROUTE_PATH_EXCLUSIVE; + } +} fib_node_index_t fib_table_entry_path_add (u32 fib_index, @@ -536,7 +561,7 @@ fib_table_entry_path_add2 (u32 fib_index, for (ii = 0; ii < vec_len(rpath); ii++) { - fib_table_route_path_fixup(prefix, &rpath[ii]); + fib_table_route_path_fixup(prefix, flags, &rpath[ii]); } if (FIB_NODE_INDEX_INVALID == fib_entry_index) @@ -583,11 +608,6 @@ fib_table_entry_path_remove2 (u32 fib_index, fib_table = fib_table_get(fib_index, prefix->fp_proto); fib_entry_index = fib_table_lookup_exact_match_i(fib_table, prefix); - for (ii = 0; ii < vec_len(rpath); ii++) - { - fib_table_route_path_fixup(prefix, 
&rpath[ii]); - } - if (FIB_NODE_INDEX_INVALID == fib_entry_index) { /* @@ -605,6 +625,15 @@ fib_table_entry_path_remove2 (u32 fib_index, fib_entry_lock(fib_entry_index); was_sourced = fib_entry_is_sourced(fib_entry_index, source); + for (ii = 0; ii < vec_len(rpath); ii++) + { + fib_table_route_path_fixup( + prefix, + fib_entry_get_flags_for_source(fib_entry_index, + source), + &rpath[ii]); + } + src_flag = fib_entry_path_remove(fib_entry_index, source, rpath); if (!(FIB_ENTRY_SRC_FLAG_ADDED & src_flag)) @@ -661,7 +690,6 @@ fib_table_entry_path_remove (u32 fib_index, }; fib_route_path_t *paths = NULL; - fib_table_route_path_fixup(prefix, &path); vec_add1(paths, path); fib_table_entry_path_remove2(fib_index, prefix, source, paths); @@ -692,7 +720,7 @@ fib_table_entry_update (u32 fib_index, for (ii = 0; ii < vec_len(paths); ii++) { - fib_table_route_path_fixup(prefix, &paths[ii]); + fib_table_route_path_fixup(prefix, flags, &paths[ii]); } /* * sort the paths provided by the control plane. this means @@ -750,7 +778,6 @@ fib_table_entry_update_one_path (u32 fib_index, }; fib_route_path_t *paths = NULL; - fib_table_route_path_fixup(prefix, &path); vec_add1(paths, path); fib_entry_index = diff --git a/src/vnet/fib/fib_test.c b/src/vnet/fib/fib_test.c index 3c9b8a38..e4a8a70e 100644 --- a/src/vnet/fib/fib_test.c +++ b/src/vnet/fib/fib_test.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include @@ -271,6 +273,7 @@ typedef enum fib_test_lb_bucket_type_t_ { FT_LB_O_LB, FT_LB_SPECIAL, FT_LB_ADJ, + FT_LB_INTF, } fib_test_lb_bucket_type_t; typedef struct fib_test_lb_bucket_t_ { @@ -315,6 +318,31 @@ typedef struct fib_test_lb_bucket_t_ { }; } fib_test_lb_bucket_t; +typedef enum fib_test_rep_bucket_type_t_ { + FT_REP_LABEL_O_ADJ, + FT_REP_DISP_MFIB_LOOKUP, + FT_REP_INTF, +} fib_test_rep_bucket_type_t; + +typedef struct fib_test_rep_bucket_t_ { + fib_test_rep_bucket_type_t type; + + union + { + struct + { + mpls_eos_bit_t eos; + mpls_label_t label; + u8 ttl; + 
adj_index_t adj; + } label_o_adj; + struct + { + adj_index_t adj; + } adj; + }; +} fib_test_rep_bucket_t; + #define FIB_TEST_LB(_cond, _comment, _args...) \ { \ if (!FIB_TEST_I(_cond, _comment, ##_args)) { \ @@ -322,7 +350,83 @@ typedef struct fib_test_lb_bucket_t_ { } \ } -static int +int +fib_test_validate_rep_v (const replicate_t *rep, + u16 n_buckets, + va_list ap) +{ + const fib_test_rep_bucket_t *exp; + const dpo_id_t *dpo; + int bucket; + + FIB_TEST_LB((n_buckets == rep->rep_n_buckets), + "n_buckets = %d", rep->rep_n_buckets); + + for (bucket = 0; bucket < n_buckets; bucket++) + { + exp = va_arg(ap, fib_test_rep_bucket_t*); + + dpo = replicate_get_bucket_i(rep, bucket); + + switch (exp->type) + { + case FT_REP_LABEL_O_ADJ: + { + const mpls_label_dpo_t *mld; + mpls_label_t hdr; + FIB_TEST_LB((DPO_MPLS_LABEL == dpo->dpoi_type), + "bucket %d stacks on %U", + bucket, + format_dpo_type, dpo->dpoi_type); + + mld = mpls_label_dpo_get(dpo->dpoi_index); + hdr = clib_net_to_host_u32(mld->mld_hdr[0].label_exp_s_ttl); + + FIB_TEST_LB((vnet_mpls_uc_get_label(hdr) == + exp->label_o_adj.label), + "bucket %d stacks on label %d", + bucket, + exp->label_o_adj.label); + + FIB_TEST_LB((vnet_mpls_uc_get_s(hdr) == + exp->label_o_adj.eos), + "bucket %d stacks on label %d %U", + bucket, + exp->label_o_adj.label, + format_mpls_eos_bit, exp->label_o_adj.eos); + + FIB_TEST_LB((DPO_ADJACENCY_INCOMPLETE == mld->mld_dpo.dpoi_type), + "bucket %d label stacks on %U", + bucket, + format_dpo_type, mld->mld_dpo.dpoi_type); + + FIB_TEST_LB((exp->label_o_adj.adj == mld->mld_dpo.dpoi_index), + "bucket %d label stacks on adj %d", + bucket, + exp->label_o_adj.adj); + } + break; + case FT_REP_INTF: + FIB_TEST_LB((DPO_INTERFACE == dpo->dpoi_type), + "bucket %d stacks on %U", + bucket, + format_dpo_type, dpo->dpoi_type); + + FIB_TEST_LB((exp->adj.adj == dpo->dpoi_index), + "bucket %d stacks on adj %d", + bucket, + exp->adj.adj); + break; + case FT_REP_DISP_MFIB_LOOKUP: +// ASSERT(0); + break; + } + } 
+ + return (!0); +} + +int fib_test_validate_lb_v (const load_balance_t *lb, u16 n_buckets, va_list ap) @@ -484,6 +588,16 @@ fib_test_validate_lb_v (const load_balance_t *lb, bucket, exp->adj.adj); break; + case FT_LB_INTF: + FIB_TEST_I((DPO_INTERFACE == dpo->dpoi_type), + "bucket %d stacks on %U", + bucket, + format_dpo_type, dpo->dpoi_type); + FIB_TEST_LB((exp->adj.adj == dpo->dpoi_index), + "bucket %d stacks on adj %d", + bucket, + exp->adj.adj); + break; case FT_LB_O_LB: FIB_TEST_I((DPO_LOAD_BALANCE == dpo->dpoi_type), "bucket %d stacks on %U", @@ -509,14 +623,13 @@ fib_test_validate_lb_v (const load_balance_t *lb, return (!0); } -static int +int fib_test_validate_entry (fib_node_index_t fei, fib_forward_chain_type_t fct, u16 n_buckets, ...) { dpo_id_t dpo = DPO_INVALID; - const load_balance_t *lb; fib_prefix_t pfx; index_t fw_lbi; u32 fib_index; @@ -529,47 +642,59 @@ fib_test_validate_entry (fib_node_index_t fei, fib_index = fib_entry_get_fib_index(fei); fib_entry_contribute_forwarding(fei, fct, &dpo); - FIB_TEST_LB((DPO_LOAD_BALANCE == dpo.dpoi_type), - "Entry links to %U", - format_dpo_type, dpo.dpoi_type); - lb = load_balance_get(dpo.dpoi_index); - - res = fib_test_validate_lb_v(lb, n_buckets, ap); + if (DPO_REPLICATE == dpo.dpoi_type) + { + const replicate_t *rep; - /* - * ensure that the LB contributed by the entry is the - * same as the LB in the forwarding tables - */ - if (fct == fib_entry_get_default_chain_type(fib_entry_get(fei))) + rep = replicate_get(dpo.dpoi_index); + res = fib_test_validate_rep_v(rep, n_buckets, ap); + } + else { - switch (pfx.fp_proto) - { - case FIB_PROTOCOL_IP4: - fw_lbi = ip4_fib_forwarding_lookup(fib_index, &pfx.fp_addr.ip4); - break; - case FIB_PROTOCOL_IP6: - fw_lbi = ip6_fib_table_fwding_lookup(&ip6_main, fib_index, &pfx.fp_addr.ip6); - break; - case FIB_PROTOCOL_MPLS: - { - mpls_unicast_header_t hdr = { - .label_exp_s_ttl = 0, - }; + const load_balance_t *lb; + + FIB_TEST_LB((DPO_LOAD_BALANCE == dpo.dpoi_type), + "Entry 
links to %U", + format_dpo_type, dpo.dpoi_type); - vnet_mpls_uc_set_label(&hdr.label_exp_s_ttl, pfx.fp_label); - vnet_mpls_uc_set_s(&hdr.label_exp_s_ttl, pfx.fp_eos); - hdr.label_exp_s_ttl = clib_host_to_net_u32(hdr.label_exp_s_ttl); + lb = load_balance_get(dpo.dpoi_index); + res = fib_test_validate_lb_v(lb, n_buckets, ap); - fw_lbi = mpls_fib_table_forwarding_lookup(fib_index, &hdr); + /* + * ensure that the LB contributed by the entry is the + * same as the LB in the forwarding tables + */ + if (fct == fib_entry_get_default_chain_type(fib_entry_get(fei))) + { + switch (pfx.fp_proto) + { + case FIB_PROTOCOL_IP4: + fw_lbi = ip4_fib_forwarding_lookup(fib_index, &pfx.fp_addr.ip4); + break; + case FIB_PROTOCOL_IP6: + fw_lbi = ip6_fib_table_fwding_lookup(&ip6_main, fib_index, &pfx.fp_addr.ip6); break; + case FIB_PROTOCOL_MPLS: + { + mpls_unicast_header_t hdr = { + .label_exp_s_ttl = 0, + }; + + vnet_mpls_uc_set_label(&hdr.label_exp_s_ttl, pfx.fp_label); + vnet_mpls_uc_set_s(&hdr.label_exp_s_ttl, pfx.fp_eos); + hdr.label_exp_s_ttl = clib_host_to_net_u32(hdr.label_exp_s_ttl); + + fw_lbi = mpls_fib_table_forwarding_lookup(fib_index, &hdr); + break; + } + default: + fw_lbi = 0; } - default: - fw_lbi = 0; + FIB_TEST_LB((fw_lbi == dpo.dpoi_index), + "Contributed LB = FW LB: %U\n %U", + format_load_balance, fw_lbi, 0, + format_load_balance, dpo.dpoi_index, 0); } - FIB_TEST_LB((fw_lbi == dpo.dpoi_index), - "Contributed LB = FW LB: %U\n %U", - format_load_balance, fw_lbi, 0, - format_load_balance, dpo.dpoi_index, 0); } dpo_reset(&dpo); @@ -1289,6 +1414,7 @@ fib_test_v4 (void) lookup_dpo_add_or_lock_w_fib_index(fib_index, DPO_PROTO_IP4, + LOOKUP_UNICAST, LOOKUP_INPUT_DST_ADDR, LOOKUP_TABLE_FROM_CONFIG, &ex_dpo); @@ -2605,7 +2731,6 @@ fib_test_v4 (void) NULL, FIB_ROUTE_PATH_FLAG_NONE); - fei = fib_table_lookup(fib_index, &pfx_5_5_5_6_s_32); dpo1 = fib_entry_contribute_ip_forwarding(fei); @@ -7493,6 +7618,7 @@ lfib_test (void) fib_route_path_t *rpaths = NULL, rpath = { .frp_proto 
= FIB_PROTOCOL_MPLS, .frp_local_label = 1200, + .frp_eos = MPLS_NON_EOS, .frp_sw_if_index = ~0, // recurive .frp_fib_index = 0, // Default MPLS fib .frp_weight = 1, @@ -7607,6 +7733,146 @@ lfib_test (void) dpo_reset(&ip_1200); + /* + * An rx-interface route. + * like the tail of an mcast LSP + */ + dpo_id_t idpo = DPO_INVALID; + + interface_dpo_add_or_lock(DPO_PROTO_IP4, + tm->hw[0]->sw_if_index, + &idpo); + + fib_prefix_t pfx_2500 = { + .fp_len = 21, + .fp_proto = FIB_PROTOCOL_MPLS, + .fp_label = 2500, + .fp_eos = MPLS_EOS, + .fp_payload_proto = DPO_PROTO_IP4, + }; + fib_test_lb_bucket_t rx_intf_0 = { + .type = FT_LB_INTF, + .adj = { + .adj = idpo.dpoi_index, + }, + }; + + lfe = fib_table_entry_update_one_path(fib_index, + &pfx_2500, + FIB_SOURCE_API, + FIB_ENTRY_FLAG_NONE, + FIB_PROTOCOL_IP4, + NULL, + tm->hw[0]->sw_if_index, + ~0, // invalid fib index + 0, + NULL, + FIB_ROUTE_PATH_INTF_RX); + FIB_TEST(fib_test_validate_entry(lfe, + FIB_FORW_CHAIN_TYPE_MPLS_EOS, + 1, + &rx_intf_0), + "2500 rx-interface 0"); + fib_table_entry_delete(fib_index, &pfx_2500, FIB_SOURCE_API); + + /* + * An MPLS mulicast entry + */ + fib_prefix_t pfx_3500 = { + .fp_len = 21, + .fp_proto = FIB_PROTOCOL_MPLS, + .fp_label = 3500, + .fp_eos = MPLS_EOS, + .fp_payload_proto = DPO_PROTO_IP4, + }; + fib_test_rep_bucket_t mc_0 = { + .type = FT_REP_LABEL_O_ADJ, + .label_o_adj = { + .adj = ai_mpls_10_10_10_1, + .label = 3300, + .eos = MPLS_EOS, + }, + }; + fib_test_rep_bucket_t mc_intf_0 = { + .type = FT_REP_INTF, + .adj = { + .adj = idpo.dpoi_index, + }, + }; + mpls_label_t *l3300 = NULL; + vec_add1(l3300, 3300); + + lfe = fib_table_entry_update_one_path(lfib_index, + &pfx_3500, + FIB_SOURCE_API, + FIB_ENTRY_FLAG_MULTICAST, + FIB_PROTOCOL_IP4, + &nh_10_10_10_1, + tm->hw[0]->sw_if_index, + ~0, // invalid fib index + 1, + l3300, + FIB_ROUTE_PATH_FLAG_NONE); + FIB_TEST(fib_test_validate_entry(lfe, + FIB_FORW_CHAIN_TYPE_MPLS_EOS, + 1, + &mc_0), + "3500 via replicate over 10.10.10.1"); + + /* + * MPLS 
Bud-node. Add a replication via an interface-receieve path + */ + lfe = fib_table_entry_path_add(lfib_index, + &pfx_3500, + FIB_SOURCE_API, + FIB_ENTRY_FLAG_MULTICAST, + FIB_PROTOCOL_IP4, + NULL, + tm->hw[0]->sw_if_index, + ~0, // invalid fib index + 0, + NULL, + FIB_ROUTE_PATH_INTF_RX); + FIB_TEST(fib_test_validate_entry(lfe, + FIB_FORW_CHAIN_TYPE_MPLS_EOS, + 2, + &mc_0, + &mc_intf_0), + "3500 via replicate over 10.10.10.1 and interface-rx"); + + /* + * Add a replication via an interface-free for-us path + */ + fib_test_rep_bucket_t mc_disp = { + .type = FT_REP_DISP_MFIB_LOOKUP, + .adj = { + .adj = idpo.dpoi_index, + }, + }; + lfe = fib_table_entry_path_add(lfib_index, + &pfx_3500, + FIB_SOURCE_API, + FIB_ENTRY_FLAG_MULTICAST, + FIB_PROTOCOL_IP4, + NULL, + 5, // rpf-id + 0, // default table + 0, + NULL, + FIB_ROUTE_PATH_RPF_ID); + FIB_TEST(fib_test_validate_entry(lfe, + FIB_FORW_CHAIN_TYPE_MPLS_EOS, + 3, + &mc_0, + &mc_disp, + &mc_intf_0), + "3500 via replicate over 10.10.10.1 and interface-rx"); + + + + fib_table_entry_delete(fib_index, &pfx_3500, FIB_SOURCE_API); + dpo_reset(&idpo); + /* * cleanup */ @@ -7617,6 +7883,9 @@ lfib_test (void) FIB_TEST(lb_count == pool_elts(load_balance_pool), "Load-balance resources freed %d of %d", lb_count, pool_elts(load_balance_pool)); + FIB_TEST(0 == pool_elts(interface_dpo_pool), + "interface_dpo resources freed %d of %d", + 0, pool_elts(interface_dpo_pool)); return (0); } diff --git a/src/vnet/fib/fib_test.h b/src/vnet/fib/fib_test.h new file mode 100644 index 00000000..b98680bf --- /dev/null +++ b/src/vnet/fib/fib_test.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __FIB_TEST_H__ +#define __FIB_TEST_H__ + +#include + +typedef enum fib_test_lb_bucket_type_t_ { + FT_LB_LABEL_O_ADJ, + FT_LB_LABEL_STACK_O_ADJ, + FT_LB_LABEL_O_LB, + FT_LB_O_LB, + FT_LB_SPECIAL, + FT_LB_ADJ, + FT_LB_INTF, +} fib_test_lb_bucket_type_t; + +typedef struct fib_test_lb_bucket_t_ { + fib_test_lb_bucket_type_t type; + + union + { + struct + { + mpls_eos_bit_t eos; + mpls_label_t label; + u8 ttl; + adj_index_t adj; + } label_o_adj; + struct + { + mpls_eos_bit_t eos; + mpls_label_t label_stack[8]; + u8 label_stack_size; + u8 ttl; + adj_index_t adj; + } label_stack_o_adj; + struct + { + mpls_eos_bit_t eos; + mpls_label_t label; + u8 ttl; + index_t lb; + } label_o_lb; + struct + { + index_t adj; + } adj; + struct + { + index_t lb; + } lb; + struct + { + index_t adj; + } special; + }; +} fib_test_lb_bucket_t; + +typedef enum fib_test_rep_bucket_type_t_ { + FT_REP_LABEL_O_ADJ, + FT_REP_INTF, +} fib_test_rep_bucket_type_t; + +typedef struct fib_test_rep_bucket_t_ { + fib_test_rep_bucket_type_t type; + + union + { + struct + { + mpls_eos_bit_t eos; + mpls_label_t label; + u8 ttl; + adj_index_t adj; + } label_o_adj; + struct + { + adj_index_t adj; + } adj; + }; +} fib_test_rep_bucket_t; + + +extern int fib_test_validate_rep_v(const replicate_t *rep, + u16 n_buckets, + va_list ap); + +extern int fib_test_validate_lb_v(const load_balance_t *lb, + u16 n_buckets, + va_list ap); + +extern int fib_test_validate_entry(fib_node_index_t fei, + fib_forward_chain_type_t fct, + u16 n_buckets, + ...); + +#endif diff --git 
a/src/vnet/fib/fib_types.c b/src/vnet/fib/fib_types.c index 2837a59d..8165f3eb 100644 --- a/src/vnet/fib/fib_types.c +++ b/src/vnet/fib/fib_types.c @@ -66,12 +66,13 @@ fib_prefix_from_ip46_addr (const ip46_address_t *addr, void fib_prefix_from_mpls_label (mpls_label_t label, + mpls_eos_bit_t eos, fib_prefix_t *pfx) { pfx->fp_proto = FIB_PROTOCOL_MPLS; pfx->fp_len = 21; pfx->fp_label = label; - pfx->fp_eos = MPLS_NON_EOS; + pfx->fp_eos = eos; } int @@ -194,17 +195,7 @@ fib_route_path_cmp (const fib_route_path_t *rpath1, if (0 != res) return (res); - if (~0 != rpath1->frp_sw_if_index && - ~0 != rpath2->frp_sw_if_index) - { - res = vnet_sw_interface_compare(vnet_get_main(), - rpath1->frp_sw_if_index, - rpath2->frp_sw_if_index); - } - else - { - res = rpath1->frp_sw_if_index - rpath2->frp_sw_if_index; - } + res = (rpath1->frp_sw_if_index - rpath2->frp_sw_if_index); if (0 != res) return (res); diff --git a/src/vnet/fib/fib_types.h b/src/vnet/fib/fib_types.h index 1c5299a9..4cb73e8a 100644 --- a/src/vnet/fib/fib_types.h +++ b/src/vnet/fib/fib_types.h @@ -286,8 +286,36 @@ typedef enum fib_route_path_flags_t_ * Attached path */ FIB_ROUTE_PATH_ATTACHED = (1 << 3), + /** + * A Drop path - resolve the path on the drop DPO + */ + FIB_ROUTE_PATH_DROP = (1 << 4), + /** + * Don't resolve the path, use the DPO the client provides + */ + FIB_ROUTE_PATH_EXCLUSIVE = (1 << 5), + /** + * A path that result in received traffic being recieved/recirculated + * so that it appears to have arrived on the new interface + */ + FIB_ROUTE_PATH_INTF_RX = (1 << 6), + /** + * A local path with a RPF-ID => multicast traffic + */ + FIB_ROUTE_PATH_RPF_ID = (1 << 7), } fib_route_path_flags_t; +/** + * An RPF-ID is numerical value that is used RPF validate. An entry + * has-a RPF-ID, when a packet egress from (e.g. an LSP) it gains an + * RPF-ID, these two are compared for the RPF check. + * This replaces the interfce based chack (since the LSP has no associated + * interface. 
+ */ +typedef u32 fib_rpf_id_t; + +#define MFIB_RPF_ID_NONE (0) + /** * @brief * A representation of a path as described by a route producer. @@ -321,17 +349,29 @@ typedef struct fib_route_path_t_ { */ ip46_address_t frp_addr; - /** - * The MPLS local Label to reursively resolve through. - * This is valid when the path type is MPLS. - */ - mpls_label_t frp_local_label; + struct { + /** + * The MPLS local Label to reursively resolve through. + * This is valid when the path type is MPLS. + */ + mpls_label_t frp_local_label; + /** + * EOS bit for the resolving label + */ + mpls_eos_bit_t frp_eos; + }; + }; + union { + /** + * The interface. + * Will be invalid for recursive paths. + */ + u32 frp_sw_if_index; + /** + * The RPF-ID + */ + fib_rpf_id_t frp_rpf_id; }; - /** - * The interface. - * Will be invalid for recursive paths. - */ - u32 frp_sw_if_index; /** * The FIB index to lookup the nexthop * Only valid for recursive paths. diff --git a/src/vnet/fib/mpls_fib.c b/src/vnet/fib/mpls_fib.c index 4b2b76ea..19f9f3c1 100644 --- a/src/vnet/fib/mpls_fib.c +++ b/src/vnet/fib/mpls_fib.c @@ -165,6 +165,7 @@ mpls_fib_create_with_table_id (u32 table_id) lookup_dpo_add_or_lock_w_fib_index(0, // unused DPO_PROTO_IP4, + LOOKUP_UNICAST, LOOKUP_INPUT_DST_ADDR, LOOKUP_TABLE_FROM_INPUT_INTERFACE, &dpo); @@ -179,6 +180,7 @@ mpls_fib_create_with_table_id (u32 table_id) lookup_dpo_add_or_lock_w_fib_index(0, //unsued DPO_PROTO_MPLS, + LOOKUP_UNICAST, LOOKUP_INPUT_DST_ADDR, LOOKUP_TABLE_FROM_INPUT_INTERFACE, &dpo); @@ -197,6 +199,7 @@ mpls_fib_create_with_table_id (u32 table_id) lookup_dpo_add_or_lock_w_fib_index(0, //unused DPO_PROTO_IP6, + LOOKUP_UNICAST, LOOKUP_INPUT_DST_ADDR, LOOKUP_TABLE_FROM_INPUT_INTERFACE, &dpo); @@ -210,6 +213,7 @@ mpls_fib_create_with_table_id (u32 table_id) prefix.fp_eos = MPLS_NON_EOS; lookup_dpo_add_or_lock_w_fib_index(0, // unsued DPO_PROTO_MPLS, + LOOKUP_UNICAST, LOOKUP_INPUT_DST_ADDR, LOOKUP_TABLE_FROM_INPUT_INTERFACE, &dpo); @@ -320,8 +324,15 @@ 
mpls_fib_forwarding_table_update (mpls_fib_t *mf, { mpls_label_t key; - ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type); - + ASSERT((DPO_LOAD_BALANCE == dpo->dpoi_type) || + (DPO_REPLICATE == dpo->dpoi_type)); + if (CLIB_DEBUG > 0) + { + if (DPO_REPLICATE == dpo->dpoi_type) + ASSERT(dpo->dpoi_index & MPLS_IS_REPLICATE); + if (DPO_LOAD_BALANCE == dpo->dpoi_type) + ASSERT(!(dpo->dpoi_index & MPLS_IS_REPLICATE)); + } key = mpls_fib_entry_mk_key(label, eos); mf->mf_lbs[key] = dpo->dpoi_index; diff --git a/src/vnet/handoff.h b/src/vnet/handoff.h index 815206a9..04ba8bfb 100644 --- a/src/vnet/handoff.h +++ b/src/vnet/handoff.h @@ -150,7 +150,7 @@ eth_get_sym_key (ethernet_header_t * h0) ip->dst_address.as_u64[0] ^ ip->dst_address.as_u64[1] ^ ip->protocol); } - else if (h0->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS_UNICAST)) + else if (h0->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS)) { hash_key = mpls_get_key ((mpls_unicast_header_t *) (h0 + 1)); } @@ -179,8 +179,7 @@ eth_get_sym_key (ethernet_header_t * h0) ip->dst_address.as_u64[0] ^ ip->dst_address.as_u64[1] ^ ip->protocol); } - else if (outer->type == - clib_host_to_net_u16 (ETHERNET_TYPE_MPLS_UNICAST)) + else if (outer->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS)) { hash_key = mpls_get_key ((mpls_unicast_header_t *) (outer + 1)); } @@ -210,7 +209,7 @@ eth_get_key (ethernet_header_t * h0) { hash_key = ipv6_get_key ((ip6_header_t *) (h0 + 1)); } - else if (h0->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS_UNICAST)) + else if (h0->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS)) { hash_key = mpls_get_key ((mpls_unicast_header_t *) (h0 + 1)); } @@ -230,8 +229,7 @@ eth_get_key (ethernet_header_t * h0) { hash_key = ipv6_get_key ((ip6_header_t *) (outer + 1)); } - else if (outer->type == - clib_host_to_net_u16 (ETHERNET_TYPE_MPLS_UNICAST)) + else if (outer->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS)) { hash_key = mpls_get_key ((mpls_unicast_header_t *) (outer + 1)); } diff --git 
a/src/vnet/interface.c b/src/vnet/interface.c index 2a1e70e8..45417b2f 100644 --- a/src/vnet/interface.c +++ b/src/vnet/interface.c @@ -1360,7 +1360,7 @@ vnet_link_to_l3_proto (vnet_link_t link) case VNET_LINK_IP6: return (VNET_L3_PACKET_TYPE_IP6); case VNET_LINK_MPLS: - return (VNET_L3_PACKET_TYPE_MPLS_UNICAST); + return (VNET_L3_PACKET_TYPE_MPLS); case VNET_LINK_ARP: return (VNET_L3_PACKET_TYPE_ARP); case VNET_LINK_ETHERNET: diff --git a/src/vnet/ip/ip.api b/src/vnet/ip/ip.api index 5c2df32c..6af1714f 100644 --- a/src/vnet/ip/ip.api +++ b/src/vnet/ip/ip.api @@ -478,6 +478,7 @@ define ip_mroute_add_del u32 table_id; u32 entry_flags; u32 itf_flags; + u32 rpf_id; u16 grp_address_length; u8 create_vrf_if_needed; u8 is_add; @@ -518,6 +519,8 @@ manual_endian manual_print define ip_mfib_details { u32 context; u32 table_id; + u32 entry_flags; + u32 rpf_id; u8 address_length; u8 grp_address[4]; u8 src_address[4]; diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index fdfe7f63..9fdf9b3c 100644 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -2752,6 +2752,16 @@ ip4_rewrite_mcast (vlib_main_t * vm, return ip4_rewrite_inline (vm, node, frame, 0, 0, 1); } +static uword +ip4_mcast_midchain (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + if (adj_are_counters_enabled ()) + return ip4_rewrite_inline (vm, node, frame, 1, 1, 1); + else + return ip4_rewrite_inline (vm, node, frame, 0, 1, 1); +} + /* *INDENT-OFF* */ VLIB_REGISTER_NODE (ip4_rewrite_node) = { .function = ip4_rewrite, @@ -2778,6 +2788,16 @@ VLIB_REGISTER_NODE (ip4_rewrite_mcast_node) = { }; VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_mcast_node, ip4_rewrite_mcast) +VLIB_REGISTER_NODE (ip4_mcast_midchain_node, static) = { + .function = ip4_mcast_midchain, + .name = "ip4-mcast-midchain", + .vector_size = sizeof (u32), + + .format_trace = format_ip4_rewrite_trace, + .sibling_of = "ip4-rewrite", +}; +VLIB_NODE_FUNCTION_MULTIARCH (ip4_mcast_midchain_node, 
ip4_mcast_midchain) + VLIB_REGISTER_NODE (ip4_midchain_node) = { .function = ip4_midchain, .name = "ip4-midchain", diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c index c2fc4f87..a369f79f 100644 --- a/src/vnet/ip/ip6_forward.c +++ b/src/vnet/ip/ip6_forward.c @@ -2246,6 +2246,16 @@ ip6_midchain (vlib_main_t * vm, return ip6_rewrite_inline (vm, node, frame, 0, 1, 0); } +static uword +ip6_mcast_midchain (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + if (adj_are_counters_enabled ()) + return ip6_rewrite_inline (vm, node, frame, 1, 1, 1); + else + return ip6_rewrite_inline (vm, node, frame, 1, 1, 1); +} + /* *INDENT-OFF* */ VLIB_REGISTER_NODE (ip6_midchain_node) = { @@ -2290,6 +2300,19 @@ VLIB_REGISTER_NODE (ip6_rewrite_mcast_node) = VLIB_NODE_FUNCTION_MULTIARCH (ip6_rewrite_mcast_node, ip6_rewrite_mcast); +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ip6_mcast_midchain_node, static) = +{ + .function = ip6_mcast_midchain, + .name = "ip6-mcast-midchain", + .vector_size = sizeof (u32), + .format_trace = format_ip6_rewrite_trace, + .sibling_of = "ip6-rewrite", +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (ip6_mcast_midchain_node, ip6_mcast_midchain); + /* * Hop-by-Hop handling */ diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c index 2af546df..58b997aa 100644 --- a/src/vnet/ip/ip6_neighbor.c +++ b/src/vnet/ip/ip6_neighbor.c @@ -557,6 +557,7 @@ ip6_ethernet_update_adjacency (vnet_main_t * vnm, u32 sw_if_index, u32 ai) case IP_LOOKUP_NEXT_PUNT: case IP_LOOKUP_NEXT_LOCAL: case IP_LOOKUP_NEXT_REWRITE: + case IP_LOOKUP_NEXT_MCAST_MIDCHAIN: case IP_LOOKUP_NEXT_MIDCHAIN: case IP_LOOKUP_NEXT_ICMP_ERROR: case IP_LOOKUP_N_NEXT: diff --git a/src/vnet/ip/ip_api.c b/src/vnet/ip/ip_api.c index b9f1782b..9c9cb4a4 100644 --- a/src/vnet/ip/ip_api.c +++ b/src/vnet/ip/ip_api.c @@ -438,17 +438,20 @@ vl_api_ip6_fib_dump_t_handler (vl_api_ip6_fib_dump_t * mp) } static void -send_ip_mfib_details (vpe_api_main_t * am, - 
unix_shared_memory_queue_t * q, - u32 table_id, - mfib_prefix_t * pfx, - fib_route_path_encode_t * api_rpaths, u32 context) +send_ip_mfib_details (unix_shared_memory_queue_t * q, + u32 context, u32 table_id, fib_node_index_t mfei) { + fib_route_path_encode_t *api_rpath, *api_rpaths = NULL; vl_api_ip_mfib_details_t *mp; - fib_route_path_encode_t *api_rpath; + mfib_entry_t *mfib_entry; vl_api_fib_path_t *fp; + mfib_prefix_t pfx; int path_count; + mfib_entry = mfib_entry_get (mfei); + mfib_entry_get_prefix (mfei, &pfx); + mfib_entry_encode (mfei, &api_rpaths); + path_count = vec_len (api_rpaths); mp = vl_msg_api_alloc (sizeof (*mp) + path_count * sizeof (*fp)); if (!mp) @@ -457,12 +460,14 @@ send_ip_mfib_details (vpe_api_main_t * am, mp->_vl_msg_id = ntohs (VL_API_IP_FIB_DETAILS); mp->context = context; + mp->rpf_id = mfib_entry->mfe_rpf_id; + mp->entry_flags = mfib_entry->mfe_flags; mp->table_id = htonl (table_id); - mp->address_length = pfx->fp_len; - memcpy (mp->grp_address, &pfx->fp_grp_addr.ip4, - sizeof (pfx->fp_grp_addr.ip4)); - memcpy (mp->src_address, &pfx->fp_src_addr.ip4, - sizeof (pfx->fp_src_addr.ip4)); + mp->address_length = pfx.fp_len; + memcpy (mp->grp_address, &pfx.fp_grp_addr.ip4, + sizeof (pfx.fp_grp_addr.ip4)); + memcpy (mp->src_address, &pfx.fp_src_addr.ip4, + sizeof (pfx.fp_src_addr.ip4)); mp->count = htonl (path_count); fp = mp->path; @@ -475,6 +480,7 @@ send_ip_mfib_details (vpe_api_main_t * am, copy_fib_next_hop (api_rpath, fp); fp++; } + vec_free (api_rpaths); vl_msg_api_send_shmem (q, (u8 *) & mp); } @@ -497,13 +503,10 @@ vl_api_ip_mfib_table_dump_walk (fib_node_index_t fei, void *arg) static void vl_api_ip_mfib_dump_t_handler (vl_api_ip_mfib_dump_t * mp) { - vpe_api_main_t *am = &vpe_api_main; unix_shared_memory_queue_t *q; ip4_main_t *im = &ip4_main; mfib_table_t *mfib_table; fib_node_index_t *mfeip; - mfib_prefix_t pfx; - fib_route_path_encode_t *api_rpaths = NULL; vl_api_ip_mfib_dump_ctc_t ctx = { .entries = NULL, }; @@ -524,21 +527,16 
@@ vl_api_ip_mfib_dump_t_handler (vl_api_ip_mfib_dump_t * mp) vec_foreach (mfeip, ctx.entries) { - mfib_entry_get_prefix (*mfeip, &pfx); - mfib_entry_encode (*mfeip, &api_rpaths); - send_ip_mfib_details (am, q, + send_ip_mfib_details (q, mp->context, mfib_table->mft_table_id, - &pfx, api_rpaths, - mp->context); + *mfeip); } - vec_reset_length (api_rpaths); vec_reset_length (ctx.entries); })); /* *INDENT-ON* */ vec_free (ctx.entries); - vec_free (api_rpaths); } static void @@ -705,10 +703,13 @@ add_del_route_t_handler (u8 is_multipath, u8 is_unreach, u8 is_prohibit, u8 is_local, + u8 is_multicast, u8 is_classify, u32 classify_table_index, u8 is_resolve_host, u8 is_resolve_attached, + u8 is_interface_rx, + u8 is_rpf_id, u32 fib_index, const fib_prefix_t * prefix, u8 next_hop_proto_is_ip4, @@ -731,16 +732,24 @@ add_del_route_t_handler (u8 is_multipath, .frp_label_stack = next_hop_out_label_stack, }; fib_route_path_t *paths = NULL; + fib_entry_flag_t entry_flags = FIB_ENTRY_FLAG_NONE; if (MPLS_LABEL_INVALID != next_hop_via_label) { path.frp_proto = FIB_PROTOCOL_MPLS; path.frp_local_label = next_hop_via_label; + path.frp_eos = MPLS_NON_EOS; } if (is_resolve_host) path_flags |= FIB_ROUTE_PATH_RESOLVE_VIA_HOST; if (is_resolve_attached) path_flags |= FIB_ROUTE_PATH_RESOLVE_VIA_ATTACHED; + if (is_interface_rx) + path_flags |= FIB_ROUTE_PATH_INTF_RX; + if (is_rpf_id) + path_flags |= FIB_ROUTE_PATH_RPF_ID; + if (is_multicast) + entry_flags |= FIB_ENTRY_FLAG_MULTICAST; path.frp_flags = path_flags; @@ -754,8 +763,7 @@ add_del_route_t_handler (u8 is_multipath, if (is_add) fib_table_entry_path_add2 (fib_index, prefix, - FIB_SOURCE_API, - FIB_ENTRY_FLAG_NONE, paths); + FIB_SOURCE_API, entry_flags, paths); else fib_table_entry_path_remove2 (fib_index, prefix, FIB_SOURCE_API, paths); @@ -826,8 +834,7 @@ add_del_route_t_handler (u8 is_multipath, { vec_add1 (paths, path); fib_table_entry_update (fib_index, - prefix, - FIB_SOURCE_API, FIB_ENTRY_FLAG_NONE, paths); + prefix, 
FIB_SOURCE_API, entry_flags, paths); vec_free (paths); } else @@ -847,7 +854,7 @@ add_del_route_check (fib_protocol_t table_proto, fib_protocol_t next_hop_table_proto, u32 next_hop_table_id, u8 create_missing_tables, - u32 * fib_index, u32 * next_hop_fib_index) + u8 is_rpf_id, u32 * fib_index, u32 * next_hop_fib_index) { vnet_main_t *vnm = vnet_get_main (); @@ -866,7 +873,7 @@ add_del_route_check (fib_protocol_t table_proto, } } - if (~0 != ntohl (next_hop_sw_if_index)) + if (!is_rpf_id && ~0 != ntohl (next_hop_sw_if_index)) { if (pool_is_free_index (vnm->interface_main.sw_interfaces, ntohl (next_hop_sw_if_index))) @@ -876,16 +883,27 @@ add_del_route_check (fib_protocol_t table_proto, } else { - *next_hop_fib_index = fib_table_find (next_hop_table_proto, - ntohl (next_hop_table_id)); + if (is_rpf_id) + *next_hop_fib_index = mfib_table_find (next_hop_table_proto, + ntohl (next_hop_table_id)); + else + *next_hop_fib_index = fib_table_find (next_hop_table_proto, + ntohl (next_hop_table_id)); if (~0 == *next_hop_fib_index) { if (create_missing_tables) { - *next_hop_fib_index = - fib_table_find_or_create_and_lock (next_hop_table_proto, - ntohl (next_hop_table_id)); + if (is_rpf_id) + *next_hop_fib_index = + mfib_table_find_or_create_and_lock (next_hop_table_proto, + ntohl + (next_hop_table_id)); + else + *next_hop_fib_index = + fib_table_find_or_create_and_lock (next_hop_table_proto, + ntohl + (next_hop_table_id)); } else { @@ -910,7 +928,7 @@ ip4_add_del_route_t_handler (vl_api_ip_add_del_route_t * mp) mp->next_hop_sw_if_index, FIB_PROTOCOL_IP4, mp->next_hop_table_id, - mp->create_vrf_if_needed, + mp->create_vrf_if_needed, 0, &fib_index, &next_hop_fib_index); if (0 != rv) @@ -943,11 +961,11 @@ ip4_add_del_route_t_handler (vl_api_ip_add_del_route_t * mp) mp->is_drop, mp->is_unreach, mp->is_prohibit, - mp->is_local, + mp->is_local, 0, mp->is_classify, mp->classify_table_index, mp->is_resolve_host, - mp->is_resolve_attached, + mp->is_resolve_attached, 0, 0, fib_index, 
&pfx, 1, &nh, ntohl (mp->next_hop_sw_if_index), @@ -969,7 +987,7 @@ ip6_add_del_route_t_handler (vl_api_ip_add_del_route_t * mp) mp->next_hop_sw_if_index, FIB_PROTOCOL_IP6, mp->next_hop_table_id, - mp->create_vrf_if_needed, + mp->create_vrf_if_needed, 0, &fib_index, &next_hop_fib_index); if (0 != rv) @@ -1002,11 +1020,11 @@ ip6_add_del_route_t_handler (vl_api_ip_add_del_route_t * mp) mp->is_drop, mp->is_unreach, mp->is_prohibit, - mp->is_local, + mp->is_local, 0, mp->is_classify, mp->classify_table_index, mp->is_resolve_host, - mp->is_resolve_attached, + mp->is_resolve_attached, 0, 0, fib_index, &pfx, 0, &nh, ntohl (mp->next_hop_sw_if_index), next_hop_fib_index, @@ -1075,6 +1093,7 @@ mroute_add_del_handler (u8 is_add, u32 fib_index, const mfib_prefix_t * prefix, u32 entry_flags, + fib_rpf_id_t rpf_id, u32 next_hop_sw_if_index, u32 itf_flags) { stats_dslock_with_hint (1 /* release hint */ , 2 /* tag */ ); @@ -1091,7 +1110,7 @@ mroute_add_del_handler (u8 is_add, if (!is_local && ~0 == next_hop_sw_if_index) { mfib_table_entry_update (fib_index, prefix, - MFIB_SOURCE_API, entry_flags); + MFIB_SOURCE_API, rpf_id, entry_flags); } else { @@ -1152,6 +1171,7 @@ api_mroute_add_del_t_handler (vl_api_ip_mroute_add_del_t * mp) mp->is_local, fib_index, &pfx, ntohl (mp->entry_flags), + ntohl (mp->rpf_id), ntohl (mp->next_hop_sw_if_index), ntohl (mp->itf_flags))); } diff --git a/src/vnet/ip/lookup.c b/src/vnet/ip/lookup.c index ec9a1f97..597de06b 100755 --- a/src/vnet/ip/lookup.c +++ b/src/vnet/ip/lookup.c @@ -450,6 +450,7 @@ vnet_ip_route_cmd (vlib_main_t * vm, unformat_mpls_unicast_label, &rpath.frp_local_label)) { rpath.frp_weight = 1; + rpath.frp_eos = MPLS_NON_EOS; rpath.frp_proto = FIB_PROTOCOL_MPLS; rpath.frp_sw_if_index = ~0; vec_add1 (rpaths, rpath); @@ -923,7 +924,7 @@ vnet_ip_mroute_cmd (vlib_main_t * vm, else if (eflags) { mfib_table_entry_update (fib_index, &pfx, MFIB_SOURCE_CLI, - eflags); + MFIB_RPF_ID_NONE, eflags); } else { diff --git 
a/src/vnet/lisp-gpe/lisp_gpe_fwd_entry.c b/src/vnet/lisp-gpe/lisp_gpe_fwd_entry.c index efa724e0..d2954e96 100644 --- a/src/vnet/lisp-gpe/lisp_gpe_fwd_entry.c +++ b/src/vnet/lisp-gpe/lisp_gpe_fwd_entry.c @@ -88,6 +88,7 @@ ip_dst_fib_add_route (u32 dst_fib_index, const ip_prefix_t * dst_prefix) (ip_prefix_version (dst_prefix) == IP6 ? DPO_PROTO_IP6 : DPO_PROTO_IP4), + LOOKUP_UNICAST, LOOKUP_INPUT_SRC_ADDR, LOOKUP_TABLE_FROM_CONFIG, &src_lkup_dpo); diff --git a/src/vnet/mfib/ip4_mfib.c b/src/vnet/mfib/ip4_mfib.c index 164cafa1..3ed7cba7 100644 --- a/src/vnet/mfib/ip4_mfib.c +++ b/src/vnet/mfib/ip4_mfib.c @@ -72,6 +72,7 @@ ip4_create_mfib_with_table_id (u32 table_id) mfib_table_entry_update(mfib_table->mft_index, &prefix, MFIB_SOURCE_DEFAULT_ROUTE, + MFIB_RPF_ID_NONE, MFIB_ENTRY_FLAG_DROP); } diff --git a/src/vnet/mfib/ip6_mfib.c b/src/vnet/mfib/ip6_mfib.c index 991b91c6..116fee22 100644 --- a/src/vnet/mfib/ip6_mfib.c +++ b/src/vnet/mfib/ip6_mfib.c @@ -195,6 +195,7 @@ ip6_create_mfib_with_table_id (u32 table_id) mfib_table_entry_update(mfib_table->mft_index, &all_zeros, MFIB_SOURCE_DEFAULT_ROUTE, + MFIB_RPF_ID_NONE, MFIB_ENTRY_FLAG_DROP); /* diff --git a/src/vnet/mfib/mfib_entry.c b/src/vnet/mfib/mfib_entry.c index 1aa8e086..847f25e7 100644 --- a/src/vnet/mfib/mfib_entry.c +++ b/src/vnet/mfib/mfib_entry.c @@ -48,6 +48,15 @@ #define MFIB_ENTRY_DBG(_e, _fmt, _args...) 
#endif +/** + * MFIB extensions to each path + */ +typedef struct mfib_path_ext_t_ +{ + mfib_itf_flags_t mfpe_flags; + fib_node_index_t mfpe_path; +} mfib_path_ext_t; + /** * The source of an MFIB entry */ @@ -58,22 +67,39 @@ typedef struct mfib_entry_src_t_ */ mfib_source_t mfes_src; + /** + * Route flags + */ + mfib_entry_flags_t mfes_flags; + /** * The path-list of forwarding interfaces */ fib_node_index_t mfes_pl; /** - * Route flags + * RPF-ID */ - mfib_entry_flags_t mfes_flags; + fib_rpf_id_t mfes_rpf_id; + + /** + * Hash table of path extensions + */ + mfib_path_ext_t *mfes_exts; /** - * The hash table of all interfaces + * The hash table of all interfaces. + * This is forwarding time information derived from the paths + * and their extensions. */ mfib_itf_t *mfes_itfs; } mfib_entry_src_t; +/** + * Pool of path extensions + */ +static mfib_path_ext_t *mfib_path_ext_pool; + /** * String names for each source */ @@ -123,6 +149,24 @@ format_mfib_entry_dpo (u8 * s, va_list * args) MFIB_ENTRY_FORMAT_BRIEF)); } +static inline mfib_path_ext_t * +mfib_entry_path_ext_get (index_t mi) +{ + return (pool_elt_at_index(mfib_path_ext_pool, mi)); +} + +static u8 * +format_mfib_entry_path_ext (u8 * s, va_list * args) +{ + mfib_path_ext_t *path_ext; + index_t mpi = va_arg(*args, index_t); + + path_ext = mfib_entry_path_ext_get(mpi); + return (format(s, "path:%d flags:%U", + path_ext->mfpe_path, + format_mfib_itf_flags, path_ext->mfpe_flags)); +} + u8 * format_mfib_entry (u8 * s, va_list * args) { @@ -141,6 +185,8 @@ format_mfib_entry (u8 * s, va_list * args) if (level >= MFIB_ENTRY_FORMAT_DETAIL) { + fib_node_index_t path_index, mpi; + s = format (s, "\n"); s = format (s, " fib:%d", mfib_entry->mfe_fib_index); s = format (s, " index:%d", mfib_entry_get_index(mfib_entry)); @@ -153,6 +199,14 @@ format_mfib_entry (u8 * s, va_list * args) { s = fib_path_list_format(msrc->mfes_pl, s); } + s = format (s, " Extensions:\n", + mfib_source_names[msrc->mfes_src]); + 
hash_foreach(path_index, mpi, msrc->mfes_exts, + ({ + s = format(s, " %U\n", format_mfib_entry_path_ext, mpi); + })); + s = format (s, " Interface-Forwarding:\n", + mfib_source_names[msrc->mfes_src]); hash_foreach(sw_if_index, mfi, msrc->mfes_itfs, ({ s = format(s, " %U\n", format_mfib_itf, mfi); @@ -165,7 +219,7 @@ format_mfib_entry (u8 * s, va_list * args) ({ s = format(s, "\n %U", format_mfib_itf, mfi); })); - + s = format(s, "\n RPF-ID:%d", mfib_entry->mfe_rpf_id); s = format(s, "\n %U-chain\n %U", format_fib_forw_chain_type, mfib_entry_get_default_chain_type(mfib_entry), @@ -314,13 +368,6 @@ mfib_entry_src_remove (mfib_entry_t *mfib_entry, } } -static int -mfib_entry_src_n_itfs (const mfib_entry_src_t *msrc) -{ - return (hash_elts(msrc->mfes_itfs)); -} - - static void mfib_entry_last_lock_gone (fib_node_t *node) { @@ -338,7 +385,6 @@ mfib_entry_last_lock_gone (fib_node_t *node) mfib_entry_src_flush(msrc); } - fib_path_list_unlock(mfib_entry->mfe_parent); vec_free(mfib_entry->mfe_srcs); fib_node_deinit(&mfib_entry->mfe_node); @@ -417,10 +463,9 @@ mfib_entry_alloc (u32 fib_index, mfib_entry->mfe_flags = 0; mfib_entry->mfe_fib_index = fib_index; mfib_entry->mfe_prefix = *prefix; - mfib_entry->mfe_parent = FIB_NODE_INDEX_INVALID; - mfib_entry->mfe_sibling = FIB_NODE_INDEX_INVALID; mfib_entry->mfe_srcs = NULL; mfib_entry->mfe_itfs = NULL; + mfib_entry->mfe_rpf_id = MFIB_RPF_ID_NONE; dpo_reset(&mfib_entry->mfe_rep); @@ -431,10 +476,57 @@ mfib_entry_alloc (u32 fib_index, return (mfib_entry); } +static inline mfib_path_ext_t * +mfib_entry_path_ext_find (mfib_path_ext_t *exts, + fib_node_index_t path_index) +{ + uword *p; + + p = hash_get(exts, path_index); + + if (NULL != p) + { + return (mfib_entry_path_ext_get(p[0])); + } + + return (NULL); +} + +static mfib_path_ext_t* +mfib_path_ext_add (mfib_entry_src_t *msrc, + fib_node_index_t path_index, + mfib_itf_flags_t mfi_flags) +{ + mfib_path_ext_t *path_ext; + + pool_get(mfib_path_ext_pool, path_ext); + + 
path_ext->mfpe_flags = mfi_flags; + path_ext->mfpe_path = path_index; + + hash_set(msrc->mfes_exts, path_index, + path_ext - mfib_path_ext_pool); + + return (path_ext); +} + +static void +mfib_path_ext_remove (mfib_entry_src_t *msrc, + fib_node_index_t path_index) +{ + mfib_path_ext_t *path_ext; + + path_ext = mfib_entry_path_ext_find(msrc->mfes_exts, path_index); + + hash_unset(msrc->mfes_exts, path_index); + pool_put(mfib_path_ext_pool, path_ext); +} + typedef struct mfib_entry_collect_forwarding_ctx_t_ { load_balance_path_t * next_hops; fib_forward_chain_type_t fct; + mfib_entry_src_t *msrc; } mfib_entry_collect_forwarding_ctx_t; static int @@ -455,6 +547,20 @@ mfib_entry_src_collect_forwarding (fib_node_index_t pl_index, return (!0); } + /* + * If the path is not forwarding to use it + */ + mfib_path_ext_t *path_ext; + + path_ext = mfib_entry_path_ext_find(ctx->msrc->mfes_exts, + path_index); + + if (NULL != path_ext && + !(path_ext->mfpe_flags & MFIB_ITF_FLAG_FORWARD)) + { + return (!0); + } + switch (ctx->fct) { case FIB_FORW_CHAIN_TYPE_MCAST_IP4: @@ -483,46 +589,61 @@ mfib_entry_src_collect_forwarding (fib_node_index_t pl_index, } static void -mfib_entry_stack (mfib_entry_t *mfib_entry) +mfib_entry_stack (mfib_entry_t *mfib_entry, + mfib_entry_src_t *msrc) { dpo_proto_t dp; dp = fib_proto_to_dpo(mfib_entry_get_proto(mfib_entry)); - if (FIB_NODE_INDEX_INVALID != mfib_entry->mfe_parent) + if (NULL != msrc && + FIB_NODE_INDEX_INVALID != msrc->mfes_pl) { mfib_entry_collect_forwarding_ctx_t ctx = { .next_hops = NULL, .fct = mfib_entry_get_default_chain_type(mfib_entry), + .msrc = msrc, }; - fib_path_list_walk(mfib_entry->mfe_parent, + fib_path_list_walk(msrc->mfes_pl, mfib_entry_src_collect_forwarding, &ctx); if (!(MFIB_ENTRY_FLAG_EXCLUSIVE & mfib_entry->mfe_flags)) { - /* - * each path contirbutes a next-hop. form a replicate - * from those choices. 
- */ - if (!dpo_id_is_valid(&mfib_entry->mfe_rep) || - dpo_is_drop(&mfib_entry->mfe_rep)) + if (NULL == ctx.next_hops) { - dpo_id_t tmp_dpo = DPO_INVALID; - - dpo_set(&tmp_dpo, - DPO_REPLICATE, dp, - replicate_create(0, dp)); - + /* + * no next-hops, stack directly on the drop + */ dpo_stack(DPO_MFIB_ENTRY, dp, &mfib_entry->mfe_rep, - &tmp_dpo); - - dpo_reset(&tmp_dpo); + drop_dpo_get(dp)); + } + else + { + /* + * each path contirbutes a next-hop. form a replicate + * from those choices. + */ + if (!dpo_id_is_valid(&mfib_entry->mfe_rep) || + dpo_is_drop(&mfib_entry->mfe_rep)) + { + dpo_id_t tmp_dpo = DPO_INVALID; + + dpo_set(&tmp_dpo, + DPO_REPLICATE, dp, + replicate_create(0, dp)); + + dpo_stack(DPO_MFIB_ENTRY, dp, + &mfib_entry->mfe_rep, + &tmp_dpo); + + dpo_reset(&tmp_dpo); + } + replicate_multipath_update(&mfib_entry->mfe_rep, + ctx.next_hops); } - replicate_multipath_update(&mfib_entry->mfe_rep, - ctx.next_hops); } else { @@ -548,11 +669,11 @@ mfib_entry_stack (mfib_entry_t *mfib_entry) } } -static void -mfib_entry_forwarding_path_add (mfib_entry_src_t *msrc, - const fib_route_path_t *rpath) +static fib_node_index_t +mfib_entry_src_path_add (mfib_entry_src_t *msrc, + const fib_route_path_t *rpath) { - fib_node_index_t old_pl_index; + fib_node_index_t path_index; fib_route_path_t *rpaths; ASSERT(!(MFIB_ENTRY_FLAG_EXCLUSIVE & msrc->mfes_flags)); @@ -563,32 +684,26 @@ mfib_entry_forwarding_path_add (mfib_entry_src_t *msrc, rpaths = NULL; vec_add1(rpaths, rpath[0]); - old_pl_index = msrc->mfes_pl; - if (FIB_NODE_INDEX_INVALID == msrc->mfes_pl) { - msrc->mfes_pl = - fib_path_list_create(FIB_PATH_LIST_FLAG_NO_URPF, - rpaths); - } - else - { - msrc->mfes_pl = - fib_path_list_copy_and_path_add(msrc->mfes_pl, - FIB_PATH_LIST_FLAG_NO_URPF, - rpaths); + /* A non-shared path-list */ + msrc->mfes_pl = fib_path_list_create(FIB_PATH_LIST_FLAG_NO_URPF, + NULL); + fib_path_list_lock(msrc->mfes_pl); } - fib_path_list_lock(msrc->mfes_pl); - fib_path_list_unlock(old_pl_index); + 
+ path_index = fib_path_list_path_add(msrc->mfes_pl, rpaths); vec_free(rpaths); + + return (path_index); } -static int -mfib_entry_forwarding_path_remove (mfib_entry_src_t *msrc, - const fib_route_path_t *rpath) +static fib_node_index_t +mfib_entry_src_path_remove (mfib_entry_src_t *msrc, + const fib_route_path_t *rpath) { - fib_node_index_t old_pl_index; + fib_node_index_t path_index; fib_route_path_t *rpaths; ASSERT(!(MFIB_ENTRY_FLAG_EXCLUSIVE & msrc->mfes_flags)); @@ -599,56 +714,31 @@ mfib_entry_forwarding_path_remove (mfib_entry_src_t *msrc, rpaths = NULL; vec_add1(rpaths, rpath[0]); - old_pl_index = msrc->mfes_pl; - - msrc->mfes_pl = - fib_path_list_copy_and_path_remove(msrc->mfes_pl, - FIB_PATH_LIST_FLAG_NONE, - rpaths); - - fib_path_list_lock(msrc->mfes_pl); - fib_path_list_unlock(old_pl_index); + path_index = fib_path_list_path_remove(msrc->mfes_pl, rpaths); vec_free(rpaths); - return (FIB_NODE_INDEX_INVALID != msrc->mfes_pl); + return (path_index); } static void mfib_entry_recalculate_forwarding (mfib_entry_t *mfib_entry) { - fib_node_index_t old_pl_index; mfib_entry_src_t *bsrc; - old_pl_index = mfib_entry->mfe_parent; - /* * copy the forwarding data from the bast source */ bsrc = mfib_entry_get_best_src(mfib_entry); - if (NULL == bsrc) - { - mfib_entry->mfe_parent = FIB_NODE_INDEX_INVALID; - } - else + if (NULL != bsrc) { - mfib_entry->mfe_parent = bsrc->mfes_pl; mfib_entry->mfe_flags = bsrc->mfes_flags; mfib_entry->mfe_itfs = bsrc->mfes_itfs; + mfib_entry->mfe_rpf_id = bsrc->mfes_rpf_id; } - /* - * re-stack the entry on the best forwarding info. 
- */ - if (old_pl_index != mfib_entry->mfe_parent || - FIB_NODE_INDEX_INVALID == old_pl_index) - { - mfib_entry_stack(mfib_entry); - - fib_path_list_lock(mfib_entry->mfe_parent); - fib_path_list_unlock(old_pl_index); - } + mfib_entry_stack(mfib_entry, bsrc); } @@ -656,6 +746,7 @@ fib_node_index_t mfib_entry_create (u32 fib_index, mfib_source_t source, const mfib_prefix_t *prefix, + fib_rpf_id_t rpf_id, mfib_entry_flags_t entry_flags) { fib_node_index_t mfib_entry_index; @@ -666,6 +757,7 @@ mfib_entry_create (u32 fib_index, &mfib_entry_index); msrc = mfib_entry_src_find_or_create(mfib_entry, source); msrc->mfes_flags = entry_flags; + msrc->mfes_rpf_id = rpf_id; mfib_entry_recalculate_forwarding(mfib_entry); @@ -682,13 +774,14 @@ static int mfib_entry_src_ok_for_delete (const mfib_entry_src_t *msrc) { return ((MFIB_ENTRY_FLAG_NONE == msrc->mfes_flags && - 0 == mfib_entry_src_n_itfs(msrc))); + 0 == fib_path_list_get_n_paths(msrc->mfes_pl))); } int mfib_entry_update (fib_node_index_t mfib_entry_index, mfib_source_t source, mfib_entry_flags_t entry_flags, + fib_rpf_id_t rpf_id, index_t repi) { mfib_entry_t *mfib_entry; @@ -697,6 +790,7 @@ mfib_entry_update (fib_node_index_t mfib_entry_index, mfib_entry = mfib_entry_get(mfib_entry_index); msrc = mfib_entry_src_find_or_create(mfib_entry, source); msrc->mfes_flags = entry_flags; + msrc->mfes_rpf_id = rpf_id; if (INDEX_INVALID != repi) { @@ -768,55 +862,79 @@ mfib_entry_path_update (fib_node_index_t mfib_entry_index, const fib_route_path_t *rpath, mfib_itf_flags_t itf_flags) { + fib_node_index_t path_index; + mfib_path_ext_t *path_ext; + mfib_itf_flags_t old, new; mfib_entry_t *mfib_entry; mfib_entry_src_t *msrc; - mfib_itf_t *mfib_itf; mfib_entry = mfib_entry_get(mfib_entry_index); ASSERT(NULL != mfib_entry); msrc = mfib_entry_src_find_or_create(mfib_entry, source); /* - * search for the interface in the current set + * add the path to the path-list. If it's a duplicate we'll get + * back the original path. 
+ */ + path_index = mfib_entry_src_path_add(msrc, rpath); + + /* + * find the path extension for that path */ - mfib_itf = mfib_entry_itf_find(msrc->mfes_itfs, - rpath[0].frp_sw_if_index); + path_ext = mfib_entry_path_ext_find(msrc->mfes_exts, path_index); - if (NULL == mfib_itf) + if (NULL == path_ext) { - /* - * this is a path we do not yet have. If it is forwarding then we - * add it to the replication set - */ - if (itf_flags & MFIB_ITF_FLAG_FORWARD) - { - mfib_entry_forwarding_path_add(msrc, rpath); - } - /* - * construct a new ITF for this entry's list - */ - mfib_entry_itf_add(msrc, - rpath[0].frp_sw_if_index, - mfib_itf_create(rpath[0].frp_sw_if_index, - itf_flags)); + old = MFIB_ITF_FLAG_NONE; + path_ext = mfib_path_ext_add(msrc, path_index, itf_flags); } else { - int was_forwarding = !!(mfib_itf->mfi_flags & MFIB_ITF_FLAG_FORWARD); - int is_forwarding = !!(itf_flags & MFIB_ITF_FLAG_FORWARD); + old = path_ext->mfpe_flags; + path_ext->mfpe_flags = itf_flags; + } - if (!was_forwarding && is_forwarding) - { - mfib_entry_forwarding_path_add(msrc, rpath); - } - else if (was_forwarding && !is_forwarding) + /* + * Has the path changed its contribution to the input interface set. + * Which only paths with interfaces can do... 
+ */ + if (~0 != rpath[0].frp_sw_if_index) + { + mfib_itf_t *mfib_itf; + + new = itf_flags; + + if (old != new) { - mfib_entry_forwarding_path_remove(msrc, rpath); + if (MFIB_ITF_FLAG_NONE == new) + { + /* + * no more interface flags on this path, remove + * from the data-plane set + */ + mfib_entry_itf_remove(msrc, rpath[0].frp_sw_if_index); + } + else if (MFIB_ITF_FLAG_NONE == old) + { + /* + * This interface is now contributing + */ + mfib_entry_itf_add(msrc, + rpath[0].frp_sw_if_index, + mfib_itf_create(rpath[0].frp_sw_if_index, + itf_flags)); + } + else + { + /* + * change of flag contributions + */ + mfib_itf = mfib_entry_itf_find(msrc->mfes_itfs, + rpath[0].frp_sw_if_index); + /* Seen by packets inflight */ + mfib_itf->mfi_flags = new; + } } - /* - * packets in flight see these updates. - */ - mfib_itf->mfi_flags = itf_flags; } mfib_entry_recalculate_forwarding(mfib_entry); @@ -833,9 +951,9 @@ mfib_entry_path_remove (fib_node_index_t mfib_entry_index, mfib_source_t source, const fib_route_path_t *rpath) { + fib_node_index_t path_index; mfib_entry_t *mfib_entry; mfib_entry_src_t *msrc; - mfib_itf_t *mfib_itf; mfib_entry = mfib_entry_get(mfib_entry_index); ASSERT(NULL != mfib_entry); @@ -850,33 +968,23 @@ mfib_entry_path_remove (fib_node_index_t mfib_entry_index, } /* - * search for the interface in the current set + * remove the path from the path-list. If it's not there we'll get + * back invalid */ - mfib_itf = mfib_entry_itf_find(msrc->mfes_itfs, - rpath[0].frp_sw_if_index); + path_index = mfib_entry_src_path_remove(msrc, rpath); - if (NULL == mfib_itf) + if (FIB_NODE_INDEX_INVALID != path_index) { /* - * removing a path that does not exist + * don't need the extension, nor the interface anymore */ - return (mfib_entry_ok_for_delete(mfib_entry)); - } - - /* - * we have this path. 
If it is forwarding then we - * remove it to the replication set - */ - if (mfib_itf->mfi_flags & MFIB_ITF_FLAG_FORWARD) - { - mfib_entry_forwarding_path_remove(msrc, rpath); + mfib_path_ext_remove(msrc, path_index); + if (~0 != rpath[0].frp_sw_if_index) + { + mfib_entry_itf_remove(msrc, rpath[0].frp_sw_if_index); + } } - /* - * remove the interface/path from this entry's list - */ - mfib_entry_itf_remove(msrc, rpath[0].frp_sw_if_index); - if (mfib_entry_src_ok_for_delete(msrc)) { /* @@ -1057,11 +1165,14 @@ mfib_entry_encode (fib_node_index_t mfib_entry_index, fib_route_path_encode_t **api_rpaths) { mfib_entry_t *mfib_entry; + mfib_entry_src_t *bsrc; mfib_entry = mfib_entry_get(mfib_entry_index); - if (FIB_NODE_INDEX_INVALID != mfib_entry->mfe_parent) + bsrc = mfib_entry_get_best_src(mfib_entry); + + if (FIB_NODE_INDEX_INVALID != bsrc->mfes_pl) { - fib_path_list_walk(mfib_entry->mfe_parent, + fib_path_list_walk(bsrc->mfes_pl, fib_path_encode, api_rpaths); } diff --git a/src/vnet/mfib/mfib_entry.h b/src/vnet/mfib/mfib_entry.h index dc8f49aa..4f62b18e 100644 --- a/src/vnet/mfib/mfib_entry.h +++ b/src/vnet/mfib/mfib_entry.h @@ -42,17 +42,6 @@ typedef struct mfib_entry_t_ { * The index of the FIB table this entry is in */ u32 mfe_fib_index; - /** - * the path-list for which this entry is a child. This is also the path-list - * that is contributing forwarding for this entry. - */ - fib_node_index_t mfe_parent; - /** - * index of this entry in the parent's child list. - * This is set when this entry is added as a child, but can also - * be changed by the parent as it manages its list. - */ - u32 mfe_sibling; /** * A vector of sources contributing forwarding @@ -65,7 +54,7 @@ typedef struct mfib_entry_t_ { CLIB_CACHE_LINE_ALIGN_MARK(cacheline1); /** - * The Replicate DPO used for forwarding. + * The DPO used for forwarding; replicate, drop, etc.. 
*/ dpo_id_t mfe_rep; @@ -74,6 +63,11 @@ typedef struct mfib_entry_t_ { */ mfib_entry_flags_t mfe_flags; + /** + * RPF-ID used when the packets ingress not from an interface + */ + fib_rpf_id_t mfe_rpf_id; + /** * A hash table of interfaces */ @@ -90,11 +84,13 @@ extern u8 *format_mfib_entry(u8 * s, va_list * args); extern fib_node_index_t mfib_entry_create(u32 fib_index, mfib_source_t source, const mfib_prefix_t *prefix, + fib_rpf_id_t rpf_id, mfib_entry_flags_t entry_flags); extern int mfib_entry_update(fib_node_index_t fib_entry_index, mfib_source_t source, mfib_entry_flags_t entry_flags, + fib_rpf_id_t rpf_id, index_t rep_dpo); extern void mfib_entry_path_update(fib_node_index_t fib_entry_index, diff --git a/src/vnet/mfib/mfib_forward.c b/src/vnet/mfib/mfib_forward.c index 5fe0a57c..3d8f4f98 100644 --- a/src/vnet/mfib/mfib_forward.c +++ b/src/vnet/mfib/mfib_forward.c @@ -380,13 +380,27 @@ mfib_forward_rpf (vlib_main_t * vm, * for the case of throughput traffic that is not replicated * to the host stack nor sets local flags */ - if (PREDICT_TRUE(NULL != mfi0)) + + /* + * If the mfib entry has a configured RPF-ID check that + * in preference to an interface based RPF + */ + if (MFIB_RPF_ID_NONE != mfe0->mfe_rpf_id) { - iflags0 = mfi0->mfi_flags; + iflags0 = (mfe0->mfe_rpf_id == vnet_buffer(b0)->ip.rpf_id ? 
+ MFIB_ITF_FLAG_ACCEPT : + MFIB_ITF_FLAG_NONE); } else { - iflags0 = MFIB_ITF_FLAG_NONE; + if (PREDICT_TRUE(NULL != mfi0)) + { + iflags0 = mfi0->mfi_flags; + } + else + { + iflags0 = MFIB_ITF_FLAG_NONE; + } } eflags0 = mfe0->mfe_flags; @@ -436,17 +450,16 @@ mfib_forward_rpf (vlib_main_t * vm, { mfib_forward_rpf_trace_t *t0; - t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); t0->entry_index = mfei0; + t0->itf_flags = iflags0; if (NULL == mfi0) { t0->sw_if_index = ~0; - t0->itf_flags = MFIB_ITF_FLAG_NONE; } else { t0->sw_if_index = mfi0->mfi_sw_if_index; - t0->itf_flags = mfi0->mfi_flags; } } vlib_validate_buffer_enqueue_x1 (vm, node, next, @@ -478,7 +491,7 @@ VLIB_REGISTER_NODE (ip4_mfib_forward_rpf_node, static) = { .n_next_nodes = MFIB_FORWARD_RPF_N_NEXT, .next_nodes = { - [MFIB_FORWARD_RPF_NEXT_DROP] = "error-drop", + [MFIB_FORWARD_RPF_NEXT_DROP] = "ip4-drop", }, }; @@ -503,7 +516,7 @@ VLIB_REGISTER_NODE (ip6_mfib_forward_rpf_node, static) = { .n_next_nodes = MFIB_FORWARD_RPF_N_NEXT, .next_nodes = { - [MFIB_FORWARD_RPF_NEXT_DROP] = "error-drop", + [MFIB_FORWARD_RPF_NEXT_DROP] = "ip6-drop", }, }; diff --git a/src/vnet/mfib/mfib_table.c b/src/vnet/mfib/mfib_table.c index 3b4bd985..7ffe8941 100644 --- a/src/vnet/mfib/mfib_table.c +++ b/src/vnet/mfib/mfib_table.c @@ -165,6 +165,7 @@ fib_node_index_t mfib_table_entry_update (u32 fib_index, const mfib_prefix_t *prefix, mfib_source_t source, + fib_rpf_id_t rpf_id, mfib_entry_flags_t entry_flags) { fib_node_index_t mfib_entry_index; @@ -181,7 +182,8 @@ mfib_table_entry_update (u32 fib_index, * update to a non-existing entry with non-zero flags */ mfib_entry_index = mfib_entry_create(fib_index, source, - prefix, entry_flags); + prefix, rpf_id, + entry_flags); mfib_table_entry_insert(mfib_table, prefix, mfib_entry_index); } @@ -198,6 +200,7 @@ mfib_table_entry_update (u32 fib_index, if (mfib_entry_update(mfib_entry_index, source, entry_flags, + rpf_id, INDEX_INVALID)) 
{ /* @@ -230,6 +233,7 @@ mfib_table_entry_path_update (u32 fib_index, mfib_entry_index = mfib_entry_create(fib_index, source, prefix, + MFIB_RPF_ID_NONE, MFIB_ENTRY_FLAG_NONE); mfib_table_entry_insert(mfib_table, prefix, mfib_entry_index); @@ -304,6 +308,7 @@ mfib_table_entry_special_add (u32 fib_index, mfib_entry_index = mfib_entry_create(fib_index, source, prefix, + MFIB_RPF_ID_NONE, MFIB_ENTRY_FLAG_NONE); mfib_table_entry_insert(mfib_table, prefix, mfib_entry_index); @@ -311,6 +316,7 @@ mfib_table_entry_special_add (u32 fib_index, mfib_entry_update(mfib_entry_index, source, (MFIB_ENTRY_FLAG_EXCLUSIVE | entry_flags), + MFIB_RPF_ID_NONE, rep_dpo); return (mfib_entry_index); diff --git a/src/vnet/mfib/mfib_table.h b/src/vnet/mfib/mfib_table.h index 95239f7c..83aa04ef 100644 --- a/src/vnet/mfib/mfib_table.h +++ b/src/vnet/mfib/mfib_table.h @@ -122,6 +122,7 @@ extern fib_node_index_t mfib_table_lookup_exact_match(u32 fib_index, extern fib_node_index_t mfib_table_entry_update(u32 fib_index, const mfib_prefix_t *prefix, mfib_source_t source, + fib_rpf_id_t rpf_id, mfib_entry_flags_t flags); /** diff --git a/src/vnet/mfib/mfib_test.c b/src/vnet/mfib/mfib_test.c index 36a303e8..7c92ae99 100644 --- a/src/vnet/mfib/mfib_test.c +++ b/src/vnet/mfib/mfib_test.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include @@ -201,8 +203,8 @@ mfib_test_validate_rep_v (const replicate_t *rep, if (DPO_RECEIVE != dt) { MFIB_TEST_REP((ai == dpo->dpoi_index), - "bucket %d stacks on %U", - bucket, + "bucket %d [exp:%d] stacks on %U", + bucket, ai, format_dpo_id, dpo, 0); } } @@ -734,6 +736,7 @@ mfib_test_i (fib_protocol_t PROTO, mfib_table_entry_update(fib_index, pfx_s_g, MFIB_SOURCE_API, + MFIB_RPF_ID_NONE, MFIB_ENTRY_FLAG_SIGNAL); MFIB_TEST(mfib_test_entry(mfei, MFIB_ENTRY_FLAG_SIGNAL, @@ -824,6 +827,7 @@ mfib_test_i (fib_protocol_t PROTO, mfib_table_entry_update(fib_index, pfx_s_g, MFIB_SOURCE_API, + MFIB_RPF_ID_NONE, (MFIB_ENTRY_FLAG_SIGNAL | 
MFIB_ENTRY_FLAG_CONNECTED)); MFIB_TEST(mfib_test_entry(mfei, @@ -965,6 +969,7 @@ mfib_test_i (fib_protocol_t PROTO, mfib_table_entry_update(fib_index, pfx_s_g, MFIB_SOURCE_API, + MFIB_RPF_ID_NONE, MFIB_ENTRY_FLAG_NONE); mfei = mfib_table_lookup_exact_match(fib_index, pfx_s_g); @@ -1073,6 +1078,117 @@ mfib_test_i (fib_protocol_t PROTO, MFIB_SOURCE_SRv6); dpo_reset(&td); + /* + * A Multicast LSP. This a mLDP head-end + */ + fib_node_index_t ai_mpls_10_10_10_1, lfei; + ip46_address_t nh_10_10_10_1 = { + .ip4 = { + .as_u32 = clib_host_to_net_u32(0x0a0a0a01), + }, + }; + ai_mpls_10_10_10_1 = adj_nbr_add_or_lock(FIB_PROTOCOL_IP4, + VNET_LINK_MPLS, + &nh_10_10_10_1, + tm->hw[0]->sw_if_index); + + fib_prefix_t pfx_3500 = { + .fp_len = 21, + .fp_proto = FIB_PROTOCOL_MPLS, + .fp_label = 3500, + .fp_eos = MPLS_EOS, + .fp_payload_proto = DPO_PROTO_IP4, + }; + fib_test_rep_bucket_t mc_0 = { + .type = FT_REP_LABEL_O_ADJ, + .label_o_adj = { + .adj = ai_mpls_10_10_10_1, + .label = 3300, + .eos = MPLS_EOS, + }, + }; + mpls_label_t *l3300 = NULL; + vec_add1(l3300, 3300); + + /* + * MPLS enable an interface so we get the MPLS table created + */ + mpls_sw_interface_enable_disable(&mpls_main, + tm->hw[0]->sw_if_index, + 1); + + lfei = fib_table_entry_update_one_path(0, // default MPLS Table + &pfx_3500, + FIB_SOURCE_API, + FIB_ENTRY_FLAG_MULTICAST, + FIB_PROTOCOL_IP4, + &nh_10_10_10_1, + tm->hw[0]->sw_if_index, + ~0, // invalid fib index + 1, + l3300, + FIB_ROUTE_PATH_FLAG_NONE); + MFIB_TEST(fib_test_validate_entry(lfei, + FIB_FORW_CHAIN_TYPE_MPLS_EOS, + 1, + &mc_0), + "3500 via replicate over 10.10.10.1"); + + /* + * An (S,G) that resolves via the mLDP head-end + */ + fib_route_path_t path_via_mldp = { + .frp_proto = FIB_PROTOCOL_MPLS, + .frp_local_label = pfx_3500.fp_label, + .frp_eos = MPLS_EOS, + .frp_sw_if_index = 0xffffffff, + .frp_fib_index = 0, + .frp_weight = 1, + .frp_flags = FIB_ROUTE_PATH_FLAG_NONE, + }; + dpo_id_t mldp_dpo = DPO_INVALID; + + 
fib_entry_contribute_forwarding(lfei, + FIB_FORW_CHAIN_TYPE_MPLS_EOS, + &mldp_dpo); + + mfei = mfib_table_entry_path_update(fib_index, + pfx_s_g, + MFIB_SOURCE_API, + &path_via_mldp, + MFIB_ITF_FLAG_FORWARD); + + MFIB_TEST(mfib_test_entry(mfei, + MFIB_ENTRY_FLAG_NONE, + 1, + DPO_REPLICATE, mldp_dpo.dpoi_index), + "%U over-mLDP replicate OK", + format_mfib_prefix, pfx_s_g); + + /* + * add a for-us path. this tests two types of non-attached paths on one entry + */ + mfei = mfib_table_entry_path_update(fib_index, + pfx_s_g, + MFIB_SOURCE_API, + &path_for_us, + MFIB_ITF_FLAG_FORWARD); + MFIB_TEST(mfib_test_entry(mfei, + MFIB_ENTRY_FLAG_NONE, + 2, + DPO_REPLICATE, mldp_dpo.dpoi_index, + DPO_RECEIVE, 0), + "%U mLDP+for-us replicate OK", + format_mfib_prefix, pfx_s_g); + + mfib_table_entry_delete(fib_index, + pfx_s_g, + MFIB_SOURCE_API); + fib_table_entry_delete(0, + &pfx_3500, + FIB_SOURCE_API); + dpo_reset(&mldp_dpo); + /* * Unlock the table - it's the last lock so should be gone thereafter */ @@ -1086,6 +1202,13 @@ mfib_test_i (fib_protocol_t PROTO, adj_unlock(ai_2); adj_unlock(ai_3); + /* + * MPLS disable the interface + */ + mpls_sw_interface_enable_disable(&mpls_main, + tm->hw[0]->sw_if_index, + 0); + /* * test we've leaked no resources */ diff --git a/src/vnet/mpls/mpls.api b/src/vnet/mpls/mpls.api index 2e3bfaf5..a1e1270a 100644 --- a/src/vnet/mpls/mpls.api +++ b/src/vnet/mpls/mpls.api @@ -55,6 +55,7 @@ define mpls_ip_bind_unbind_reply @param context - sender context, to match reply w/ request @param mt_is_add - Is this a route add or delete @param mt_sw_if_index - The SW interface index of the tunnel to delete + @param mt_is_multicast - Is the tunnel's underlying LSP multicast @param mt_next_hop_proto_is_ip4 - The next-hop is IPV4 @param mt_next_hop_weight - The weight, for UCMP @param mt_next_hop[16] - the nextop address @@ -70,6 +71,7 @@ define mpls_tunnel_add_del u32 mt_sw_if_index; u8 mt_is_add; u8 mt_l2_only; + u8 mt_is_multicast; u8 
mt_next_hop_proto_is_ip4; u8 mt_next_hop_weight; u8 mt_next_hop[16]; @@ -102,30 +104,43 @@ define mpls_tunnel_dump i32 tunnel_index; }; -/** \brief mpls eth tunnel operational state response - @param tunnel_index - eth tunnel identifier - @param intfc_address - interface ipv4 addr - @param mask_width - interface ipv4 addr mask - @param hw_if_index - interface id - @param l2_only - - @param tunnel_dst_mac - - @param tx_sw_if_index - - @param encap_index - reference to mpls label table - @param nlabels - number of resolved labels - @param labels - resolved labels +/** \brief FIB path + @param sw_if_index - index of the interface + @param weight - The weight, for UCMP + @param is_local - local if non-zero, else remote + @param is_drop - Drop the packet + @param is_unreach - Drop the packet and rate limit send ICMP unreachable + @param is_prohibit - Drop the packet and rate limit send ICMP prohibited + @param afi - the afi of the next hop, IP46_TYPE_IP4=1, IP46_TYPE_IP6=2 + @param next_hop[16] - the next hop address + + WARNING: this type is replicated, pending cleanup completion + +*/ +typeonly manual_print manual_endian define fib_path2 +{ + u32 sw_if_index; + u32 weight; + u8 is_local; + u8 is_drop; + u8 is_unreach; + u8 is_prohibit; + u8 afi; + u8 next_hop[16]; + u32 labels[16]; +}; + +/** \brief mpls tunnel details */ -define mpls_tunnel_details +manual_endian manual_print define mpls_tunnel_details { u32 context; - u32 tunnel_index; - u8 mt_l2_only; u8 mt_sw_if_index; - u8 mt_next_hop_proto_is_ip4; - u8 mt_next_hop[16]; - u32 mt_next_hop_sw_if_index; - u32 mt_next_hop_table_id; - u32 mt_next_hop_n_labels; - u32 mt_next_hop_out_labels[mt_next_hop_n_labels]; + u8 mt_tunnel_index; + u8 mt_l2_only; + u8 mt_is_multicast; + u32 mt_count; + vl_api_fib_path2_t mt_paths[mt_count]; }; /** \brief MPLS Route Add / del route @@ -140,10 +155,14 @@ define mpls_tunnel_details create them @param mr_is_add - Is this a route add or delete @param mr_is_classify - Is this route 
result a classify + @param mr_is_multicast - Is this a multicast route @param mr_is_multipath - Is this route update a multipath - i.e. is this a path addition to an existing route @param mr_is_resolve_host - Recurse resolution constraint via a host prefix @param mr_is_resolve_attached - Recurse resolution constraint via attached prefix + @param mr_is_interface_rx - Interface Receive path + @param mr_is_interface_rx - RPF-ID Receive path. The next-hop interface + is used as the RPF-ID @param mr_next_hop_proto_is_ip4 - The next-hop is IPV4 @param mr_next_hop_weight - The weight, for UCMP @param mr_next_hop[16] - the nextop address @@ -164,9 +183,12 @@ define mpls_route_add_del u8 mr_create_table_if_needed; u8 mr_is_add; u8 mr_is_classify; + u8 mr_is_multicast; u8 mr_is_multipath; u8 mr_is_resolve_host; u8 mr_is_resolve_attached; + u8 mr_is_interface_rx; + u8 mr_is_rpf_id; u8 mr_next_hop_proto_is_ip4; u8 mr_next_hop_weight; u8 mr_next_hop[16]; @@ -187,31 +209,6 @@ define mpls_route_add_del_reply i32 retval; }; -/** \brief FIB path - @param sw_if_index - index of the interface - @param weight - The weight, for UCMP - @param is_local - local if non-zero, else remote - @param is_drop - Drop the packet - @param is_unreach - Drop the packet and rate limit send ICMP unreachable - @param is_prohibit - Drop the packet and rate limit send ICMP prohibited - @param afi - the afi of the next hop, IP46_TYPE_IP4=1, IP46_TYPE_IP6=2 - @param next_hop[16] - the next hop address - - WARNING: this type is replicated, pending cleanup completion - -*/ -typeonly manual_print manual_endian define fib_path2 -{ - u32 sw_if_index; - u32 weight; - u8 is_local; - u8 is_drop; - u8 is_unreach; - u8 is_prohibit; - u8 afi; - u8 next_hop[16]; -}; - /** \brief Dump MPLS fib table @param client_index - opaque cookie to identify the sender */ diff --git a/src/vnet/mpls/mpls.c b/src/vnet/mpls/mpls.c index 482577b1..451b15cf 100644 --- a/src/vnet/mpls/mpls.c +++ b/src/vnet/mpls/mpls.c @@ -286,7 +286,15 
@@ vnet_mpls_local_label (vlib_main_t * vm, rpath.frp_proto = FIB_PROTOCOL_IP4; vec_add1(rpaths, rpath); } - + else if (unformat (line_input, "rx-ip4 %U", + unformat_vnet_sw_interface, vnm, + &rpath.frp_sw_if_index)) + { + rpath.frp_weight = 1; + rpath.frp_proto = FIB_PROTOCOL_IP4; + rpath.frp_flags = FIB_ROUTE_PATH_INTF_RX; + vec_add1(rpaths, rpath); + } else if (unformat (line_input, "via %U %U", unformat_ip6_address, &rpath.frp_addr.ip6, @@ -512,10 +520,3 @@ mpls_init (vlib_main_t * vm) } VLIB_INIT_FUNCTION (mpls_init); - -mpls_main_t * mpls_get_main (vlib_main_t * vm) -{ - vlib_call_init_function (vm, mpls_init); - return &mpls_main; -} - diff --git a/src/vnet/mpls/mpls_api.c b/src/vnet/mpls/mpls_api.c index f1aef6c9..6bfc491d 100644 --- a/src/vnet/mpls/mpls_api.c +++ b/src/vnet/mpls/mpls_api.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -163,6 +164,7 @@ mpls_route_add_del_t_handler (vnet_main_t * vnm, dpo_proto_to_fib (pfx.fp_payload_proto), mp->mr_next_hop_table_id, mp->mr_create_table_if_needed, + mp->mr_is_rpf_id, &fib_index, &next_hop_fib_index); if (0 != rv) @@ -192,10 +194,13 @@ mpls_route_add_del_t_handler (vnet_main_t * vnm, 0, // mp->is_unreach, 0, // mp->is_prohibit, 0, // mp->is_local, + mp->mr_is_multicast, mp->mr_is_classify, mp->mr_classify_table_index, mp->mr_is_resolve_host, mp->mr_is_resolve_attached, + mp->mr_is_interface_rx, + mp->mr_is_rpf_id, fib_index, &pfx, mp->mr_next_hop_proto_is_ip4, &nh, ntohl (mp->mr_next_hop_sw_if_index), @@ -229,46 +234,54 @@ vl_api_mpls_tunnel_add_del_t_handler (vl_api_mpls_tunnel_add_del_t * mp) int rv = 0; u32 tunnel_sw_if_index; int ii; + fib_route_path_t rpath, *rpaths = NULL; + + memset (&rpath, 0, sizeof (rpath)); stats_dslock_with_hint (1 /* release hint */ , 5 /* tag */ ); - if (mp->mt_is_add) + if (mp->mt_next_hop_proto_is_ip4) { - fib_route_path_t rpath, *rpaths = NULL; - mpls_label_t *label_stack = NULL; - - memset (&rpath, 0, sizeof (rpath)); - - if 
(mp->mt_next_hop_proto_is_ip4) - { - rpath.frp_proto = FIB_PROTOCOL_IP4; - clib_memcpy (&rpath.frp_addr.ip4, - mp->mt_next_hop, sizeof (rpath.frp_addr.ip4)); - } - else - { - rpath.frp_proto = FIB_PROTOCOL_IP6; - clib_memcpy (&rpath.frp_addr.ip6, - mp->mt_next_hop, sizeof (rpath.frp_addr.ip6)); - } - rpath.frp_sw_if_index = ntohl (mp->mt_next_hop_sw_if_index); + rpath.frp_proto = FIB_PROTOCOL_IP4; + clib_memcpy (&rpath.frp_addr.ip4, + mp->mt_next_hop, sizeof (rpath.frp_addr.ip4)); + } + else + { + rpath.frp_proto = FIB_PROTOCOL_IP6; + clib_memcpy (&rpath.frp_addr.ip6, + mp->mt_next_hop, sizeof (rpath.frp_addr.ip6)); + } + rpath.frp_sw_if_index = ntohl (mp->mt_next_hop_sw_if_index); + rpath.frp_weight = 1; + if (mp->mt_is_add) + { for (ii = 0; ii < mp->mt_next_hop_n_out_labels; ii++) - vec_add1 (label_stack, ntohl (mp->mt_next_hop_out_label_stack[ii])); + vec_add1 (rpath.frp_label_stack, + ntohl (mp->mt_next_hop_out_label_stack[ii])); + } - vec_add1 (rpaths, rpath); + vec_add1 (rpaths, rpath); - vnet_mpls_tunnel_add (rpaths, label_stack, - mp->mt_l2_only, &tunnel_sw_if_index); - vec_free (rpaths); - vec_free (label_stack); + tunnel_sw_if_index = ntohl (mp->mt_sw_if_index); + + if (mp->mt_is_add) + { + if (~0 == tunnel_sw_if_index) + tunnel_sw_if_index = vnet_mpls_tunnel_create (mp->mt_l2_only, + mp->mt_is_multicast); + vnet_mpls_tunnel_path_add (tunnel_sw_if_index, rpaths); } else { tunnel_sw_if_index = ntohl (mp->mt_sw_if_index); - vnet_mpls_tunnel_del (tunnel_sw_if_index); + if (!vnet_mpls_tunnel_path_remove (tunnel_sw_if_index, rpaths)) + vnet_mpls_tunnel_del (tunnel_sw_if_index); } + vec_free (rpaths); + stats_dsunlock (); /* *INDENT-OFF* */ @@ -289,10 +302,12 @@ typedef struct mpls_tunnel_send_walk_ctx_t_ static void send_mpls_tunnel_entry (u32 mti, void *arg) { + fib_route_path_encode_t *api_rpaths, *api_rpath; mpls_tunnel_send_walk_ctx_t *ctx; vl_api_mpls_tunnel_details_t *mp; const mpls_tunnel_t *mt; - u32 nlabels; + vl_api_fib_path2_t *fp; + u32 n; ctx = 
arg; @@ -300,18 +315,34 @@ send_mpls_tunnel_entry (u32 mti, void *arg) return; mt = mpls_tunnel_get (mti); - nlabels = vec_len (mt->mt_label_stack); + n = fib_path_list_get_n_paths (mt->mt_path_list); + + mp = vl_msg_api_alloc (sizeof (*mp) + n * sizeof (vl_api_fib_path2_t)); + memset (mp, 0, sizeof (*mp) + n * sizeof (vl_api_fib_path2_t)); - mp = vl_msg_api_alloc (sizeof (*mp) + nlabels * sizeof (u32)); - memset (mp, 0, sizeof (*mp)); mp->_vl_msg_id = ntohs (VL_API_MPLS_TUNNEL_DETAILS); mp->context = ctx->context; - mp->tunnel_index = ntohl (mti); - memcpy (mp->mt_next_hop_out_labels, - mt->mt_label_stack, nlabels * sizeof (u32)); + mp->mt_tunnel_index = ntohl (mti); + mp->mt_count = ntohl (n); + + fib_path_list_walk (mt->mt_path_list, fib_path_encode, &api_rpaths); + + fp = mp->mt_paths; + vec_foreach (api_rpath, api_rpaths) + { + memset (fp, 0, sizeof (*fp)); + + fp->weight = htonl (api_rpath->rpath.frp_weight); + fp->sw_if_index = htonl (api_rpath->rpath.frp_sw_if_index); + copy_fib_next_hop (api_rpath, fp); + fp++; + } // FIXME + // memcpy (mp->mt_next_hop_out_labels, + // mt->mt_label_stack, nlabels * sizeof (u32)); + vl_msg_api_send_shmem (ctx->q, (u8 *) & mp); } diff --git a/src/vnet/mpls/mpls_input.c b/src/vnet/mpls/mpls_input.c index 1b9bdd05..86ad8bba 100644 --- a/src/vnet/mpls/mpls_input.c +++ b/src/vnet/mpls/mpls_input.c @@ -291,7 +291,7 @@ mpls_setup_nodes (vlib_main_t * vm) rt->last_outer_fib_index = 0; rt->mpls_main = &mpls_main; - ethernet_register_input_type (vm, ETHERNET_TYPE_MPLS_UNICAST, + ethernet_register_input_type (vm, ETHERNET_TYPE_MPLS, mpls_input_node.index); } diff --git a/src/vnet/mpls/mpls_lookup.c b/src/vnet/mpls/mpls_lookup.c index ace6a70f..3c6be7e8 100644 --- a/src/vnet/mpls/mpls_lookup.c +++ b/src/vnet/mpls/mpls_lookup.c @@ -20,8 +20,17 @@ #include #include #include +#include -vlib_node_registration_t mpls_lookup_node; +/** + * Static MPLS VLIB forwarding node + */ +static vlib_node_registration_t mpls_lookup_node; + +/** + * The 
arc/edge from the MPLS lookup node to the MPLS replicate node + */ +static u32 mpls_lookup_to_replicate_edge; typedef struct { u32 next_index; @@ -156,81 +165,123 @@ mpls_lookup (vlib_main_t * vm, lbi2 = mpls_fib_table_forwarding_lookup (lfib_index2, h2); lbi3 = mpls_fib_table_forwarding_lookup (lfib_index3, h3); - lb0 = load_balance_get(lbi0); - lb1 = load_balance_get(lbi1); - lb2 = load_balance_get(lbi2); - lb3 = load_balance_get(lbi3); - hash_c0 = vnet_buffer(b0)->ip.flow_hash = 0; hash_c1 = vnet_buffer(b1)->ip.flow_hash = 0; hash_c2 = vnet_buffer(b2)->ip.flow_hash = 0; hash_c3 = vnet_buffer(b3)->ip.flow_hash = 0; - if (PREDICT_FALSE(lb0->lb_n_buckets > 1)) + if (MPLS_IS_REPLICATE & lbi0) { - hash_c0 = vnet_buffer (b0)->ip.flow_hash = - mpls_compute_flow_hash(h0, lb0->lb_hash_config); + next0 = mpls_lookup_to_replicate_edge; + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = + (lbi0 & ~MPLS_IS_REPLICATE); } - if (PREDICT_FALSE(lb1->lb_n_buckets > 1)) + else { - hash_c1 = vnet_buffer (b1)->ip.flow_hash = - mpls_compute_flow_hash(h1, lb1->lb_hash_config); + lb0 = load_balance_get(lbi0); + + if (PREDICT_FALSE(lb0->lb_n_buckets > 1)) + { + hash_c0 = vnet_buffer (b0)->ip.flow_hash = + mpls_compute_flow_hash(h0, lb0->lb_hash_config); + } + ASSERT (lb0->lb_n_buckets > 0); + ASSERT (is_pow2 (lb0->lb_n_buckets)); + dpo0 = load_balance_get_bucket_i(lb0, + (hash_c0 & + (lb0->lb_n_buckets_minus_1))); + next0 = dpo0->dpoi_next_node; + + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + + vlib_increment_combined_counter + (cm, thread_index, lbi0, 1, + vlib_buffer_length_in_chain (vm, b0)); } - if (PREDICT_FALSE(lb2->lb_n_buckets > 1)) + if (MPLS_IS_REPLICATE & lbi1) { - hash_c2 = vnet_buffer (b2)->ip.flow_hash = - mpls_compute_flow_hash(h2, lb2->lb_hash_config); + next1 = mpls_lookup_to_replicate_edge; + vnet_buffer (b1)->ip.adj_index[VLIB_TX] = + (lbi1 & ~MPLS_IS_REPLICATE); } - if (PREDICT_FALSE(lb3->lb_n_buckets > 1)) + else { - hash_c3 = vnet_buffer 
(b3)->ip.flow_hash = - mpls_compute_flow_hash(h3, lb3->lb_hash_config); - } - - ASSERT (lb0->lb_n_buckets > 0); - ASSERT (is_pow2 (lb0->lb_n_buckets)); - ASSERT (lb1->lb_n_buckets > 0); - ASSERT (is_pow2 (lb1->lb_n_buckets)); - ASSERT (lb2->lb_n_buckets > 0); - ASSERT (is_pow2 (lb2->lb_n_buckets)); - ASSERT (lb3->lb_n_buckets > 0); - ASSERT (is_pow2 (lb3->lb_n_buckets)); - - dpo0 = load_balance_get_bucket_i(lb0, - (hash_c0 & - (lb0->lb_n_buckets_minus_1))); - dpo1 = load_balance_get_bucket_i(lb1, - (hash_c1 & - (lb1->lb_n_buckets_minus_1))); - dpo2 = load_balance_get_bucket_i(lb2, - (hash_c2 & - (lb2->lb_n_buckets_minus_1))); - dpo3 = load_balance_get_bucket_i(lb3, - (hash_c3 & - (lb3->lb_n_buckets_minus_1))); + lb1 = load_balance_get(lbi1); - next0 = dpo0->dpoi_next_node; - next1 = dpo1->dpoi_next_node; - next2 = dpo2->dpoi_next_node; - next3 = dpo3->dpoi_next_node; + if (PREDICT_FALSE(lb1->lb_n_buckets > 1)) + { + hash_c1 = vnet_buffer (b1)->ip.flow_hash = + mpls_compute_flow_hash(h1, lb1->lb_hash_config); + } + ASSERT (lb1->lb_n_buckets > 0); + ASSERT (is_pow2 (lb1->lb_n_buckets)); + dpo1 = load_balance_get_bucket_i(lb1, + (hash_c1 & + (lb1->lb_n_buckets_minus_1))); + next1 = dpo1->dpoi_next_node; + + vnet_buffer (b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; + + vlib_increment_combined_counter + (cm, thread_index, lbi1, 1, + vlib_buffer_length_in_chain (vm, b1)); + } + if (MPLS_IS_REPLICATE & lbi2) + { + next2 = mpls_lookup_to_replicate_edge; + vnet_buffer (b2)->ip.adj_index[VLIB_TX] = + (lbi2 & ~MPLS_IS_REPLICATE); + } + else + { + lb2 = load_balance_get(lbi2); - vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; - vnet_buffer (b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; - vnet_buffer (b2)->ip.adj_index[VLIB_TX] = dpo2->dpoi_index; - vnet_buffer (b3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index; + if (PREDICT_FALSE(lb2->lb_n_buckets > 1)) + { + hash_c2 = vnet_buffer (b2)->ip.flow_hash = + mpls_compute_flow_hash(h2, lb2->lb_hash_config); + } + ASSERT 
(lb2->lb_n_buckets > 0); + ASSERT (is_pow2 (lb2->lb_n_buckets)); + dpo2 = load_balance_get_bucket_i(lb2, + (hash_c2 & + (lb2->lb_n_buckets_minus_1))); + next2 = dpo2->dpoi_next_node; + + vnet_buffer (b2)->ip.adj_index[VLIB_TX] = dpo2->dpoi_index; + + vlib_increment_combined_counter + (cm, thread_index, lbi2, 1, + vlib_buffer_length_in_chain (vm, b2)); + } + if (MPLS_IS_REPLICATE & lbi3) + { + next3 = mpls_lookup_to_replicate_edge; + vnet_buffer (b3)->ip.adj_index[VLIB_TX] = + (lbi3 & ~MPLS_IS_REPLICATE); + } + else + { + lb3 = load_balance_get(lbi3); - vlib_increment_combined_counter - (cm, thread_index, lbi0, 1, - vlib_buffer_length_in_chain (vm, b0)); - vlib_increment_combined_counter - (cm, thread_index, lbi1, 1, - vlib_buffer_length_in_chain (vm, b1)); - vlib_increment_combined_counter - (cm, thread_index, lbi2, 1, - vlib_buffer_length_in_chain (vm, b2)); - vlib_increment_combined_counter - (cm, thread_index, lbi3, 1, - vlib_buffer_length_in_chain (vm, b3)); + if (PREDICT_FALSE(lb3->lb_n_buckets > 1)) + { + hash_c3 = vnet_buffer (b3)->ip.flow_hash = + mpls_compute_flow_hash(h3, lb3->lb_hash_config); + } + ASSERT (lb3->lb_n_buckets > 0); + ASSERT (is_pow2 (lb3->lb_n_buckets)); + dpo3 = load_balance_get_bucket_i(lb3, + (hash_c3 & + (lb3->lb_n_buckets_minus_1))); + next3 = dpo3->dpoi_next_node; + + vnet_buffer (b3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index; + + vlib_increment_combined_counter + (cm, thread_index, lbi3, 1, + vlib_buffer_length_in_chain (vm, b3)); + } /* * before we pop the label copy th values we need to maintain. 
@@ -331,31 +382,41 @@ mpls_lookup (vlib_main_t * vm, vnet_buffer(b0)->sw_if_index[VLIB_RX]); lbi0 = mpls_fib_table_forwarding_lookup(lfib_index0, h0); - lb0 = load_balance_get(lbi0); - hash_c0 = vnet_buffer(b0)->ip.flow_hash = 0; - if (PREDICT_FALSE(lb0->lb_n_buckets > 1)) + + if (MPLS_IS_REPLICATE & lbi0) { - hash_c0 = vnet_buffer (b0)->ip.flow_hash = - mpls_compute_flow_hash(h0, lb0->lb_hash_config); + next0 = mpls_lookup_to_replicate_edge; + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = + (lbi0 & ~MPLS_IS_REPLICATE); } + else + { + lb0 = load_balance_get(lbi0); - ASSERT (lb0->lb_n_buckets > 0); - ASSERT (is_pow2 (lb0->lb_n_buckets)); + if (PREDICT_FALSE(lb0->lb_n_buckets > 1)) + { + hash_c0 = vnet_buffer (b0)->ip.flow_hash = + mpls_compute_flow_hash(h0, lb0->lb_hash_config); + } - dpo0 = load_balance_get_bucket_i(lb0, - (hash_c0 & - (lb0->lb_n_buckets_minus_1))); + ASSERT (lb0->lb_n_buckets > 0); + ASSERT (is_pow2 (lb0->lb_n_buckets)); - next0 = dpo0->dpoi_next_node; - vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + dpo0 = load_balance_get_bucket_i(lb0, + (hash_c0 & + (lb0->lb_n_buckets_minus_1))); - vlib_increment_combined_counter - (cm, thread_index, lbi0, 1, - vlib_buffer_length_in_chain (vm, b0)); + next0 = dpo0->dpoi_next_node; + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + + vlib_increment_combined_counter + (cm, thread_index, lbi0, 1, + vlib_buffer_length_in_chain (vm, b0)); + } /* - * before we pop the label copy th values we need to maintain. + * before we pop the label copy, values we need to maintain. * The label header is in network byte order. * last byte is the TTL. * bits 2 to 4 inclusive are the EXP bits @@ -398,7 +459,7 @@ static char * mpls_error_strings[] = { #undef mpls_error }; -VLIB_REGISTER_NODE (mpls_lookup_node) = { +VLIB_REGISTER_NODE (mpls_lookup_node, static) = { .function = mpls_lookup, .name = "mpls-lookup", /* Takes a vector of packets. 
*/ @@ -621,3 +682,22 @@ VLIB_REGISTER_NODE (mpls_load_balance_node) = { }; VLIB_NODE_FUNCTION_MULTIARCH (mpls_load_balance_node, mpls_load_balance) + + +static clib_error_t * +mpls_lookup_init (vlib_main_t * vm) +{ + clib_error_t * error; + + if ((error = vlib_call_init_function (vm, mpls_init))) + return error; + + mpls_lookup_to_replicate_edge = + vlib_node_add_named_next(vm, + mpls_lookup_node.index, + "mpls-replicate"); + + return (NULL); +} + +VLIB_INIT_FUNCTION (mpls_lookup_init); diff --git a/src/vnet/mpls/mpls_tunnel.c b/src/vnet/mpls/mpls_tunnel.c index ac6fdcdf..1254dd9d 100644 --- a/src/vnet/mpls/mpls_tunnel.c +++ b/src/vnet/mpls/mpls_tunnel.c @@ -18,9 +18,12 @@ #include #include #include +#include #include #include #include +#include +#include /** * @brief pool of tunnel instances @@ -37,6 +40,11 @@ static u32 * mpls_tunnel_free_hw_if_indices; */ static u32 *mpls_tunnel_db; +/** + * @brief MPLS tunnel flags strings + */ +static const char *mpls_tunnel_attribute_names[] = MPLS_TUNNEL_ATTRIBUTES; + /** * @brief Get a tunnel object from a SW interface index */ @@ -44,103 +52,178 @@ static mpls_tunnel_t* mpls_tunnel_get_from_sw_if_index (u32 sw_if_index) { if ((vec_len(mpls_tunnel_db) < sw_if_index) || - (~0 == mpls_tunnel_db[sw_if_index])) - return (NULL); + (~0 == mpls_tunnel_db[sw_if_index])) + return (NULL); return (pool_elt_at_index(mpls_tunnel_pool, - mpls_tunnel_db[sw_if_index])); + mpls_tunnel_db[sw_if_index])); } /** - * @brief Return true if the label stack is imp-null only + * @brief Build a rewrite string for the MPLS tunnel. */ -static fib_forward_chain_type_t -mpls_tunnel_get_fwd_chain_type (const mpls_tunnel_t *mt) +static u8* +mpls_tunnel_build_rewrite_i (void) { - if ((1 == vec_len(mt->mt_label_stack)) && - (mt->mt_label_stack[0] == MPLS_IETF_IMPLICIT_NULL_LABEL)) - { - /* - * the only label in the label stack is implicit null - * we need to build an IP chain. 
- */ - if (FIB_PROTOCOL_IP4 == fib_path_list_get_proto(mt->mt_path_list)) - { - return (FIB_FORW_CHAIN_TYPE_UNICAST_IP4); - } - else - { - return (FIB_FORW_CHAIN_TYPE_UNICAST_IP6); - } - } - else - { - return (FIB_FORW_CHAIN_TYPE_MPLS_NON_EOS); - } + /* + * passing the adj code a NULL rewirte means 'i don't have one cos + * t'other end is unresolved'. That's not the case here. For the mpls + * tunnel there are just no bytes of encap to apply in the adj. We'll impose + * the label stack once we choose a path. So return a zero length rewrite. + */ + u8 *rewrite = NULL; + + vec_validate(rewrite, 0); + vec_reset_length(rewrite); + + return (rewrite); } /** * @brief Build a rewrite string for the MPLS tunnel. - * - * We have choices here; - * 1 - have an Adjacency with a zero length string and stack it on - * MPLS label objects - * 2 - put the label header rewrites in the adjacency string. - * - * We choose 2 since it results in fewer graph nodes in the egress path */ static u8* mpls_tunnel_build_rewrite (vnet_main_t * vnm, - u32 sw_if_index, - vnet_link_t link_type, - const void *dst_address) + u32 sw_if_index, + vnet_link_t link_type, + const void *dst_address) { - mpls_unicast_header_t *muh; - mpls_tunnel_t *mt; - u8 *rewrite; - u32 mti, ii; + return (mpls_tunnel_build_rewrite_i()); +} - rewrite = NULL; - mti = mpls_tunnel_db[sw_if_index]; - mt = pool_elt_at_index(mpls_tunnel_pool, mti); +typedef struct mpls_tunnel_collect_forwarding_ctx_t_ +{ + load_balance_path_t * next_hops; + const mpls_tunnel_t *mt; + fib_forward_chain_type_t fct; +} mpls_tunnel_collect_forwarding_ctx_t; + +static int +mpls_tunnel_collect_forwarding (fib_node_index_t pl_index, + fib_node_index_t path_index, + void *arg) +{ + mpls_tunnel_collect_forwarding_ctx_t *ctx; + fib_path_ext_t *path_ext; + int have_path_ext; + + ctx = arg; /* - * The vector must be allocated as u8 so the length is correct + * if the path is not resolved, don't include it. 
*/ - ASSERT(0 < vec_len(mt->mt_label_stack)); - vec_validate(rewrite, (sizeof(*muh) * vec_len(mt->mt_label_stack)) - 1); - ASSERT(rewrite); - muh = (mpls_unicast_header_t *)rewrite; + if (!fib_path_is_resolved(path_index)) + { + return (!0); + } /* - * The last (inner most) label in the stack may be EOS, all the rest Non-EOS + * get the matching path-extension for the path being visited. */ - for (ii = 0; ii < vec_len(mt->mt_label_stack)-1; ii++) + have_path_ext = 0; + vec_foreach(path_ext, ctx->mt->mt_path_exts) { - vnet_mpls_uc_set_label(&muh[ii].label_exp_s_ttl, mt->mt_label_stack[ii]); - vnet_mpls_uc_set_ttl(&muh[ii].label_exp_s_ttl, 255); - vnet_mpls_uc_set_exp(&muh[ii].label_exp_s_ttl, 0); - vnet_mpls_uc_set_s(&muh[ii].label_exp_s_ttl, MPLS_NON_EOS); - muh[ii].label_exp_s_ttl = clib_host_to_net_u32(muh[ii].label_exp_s_ttl); + if (path_ext->fpe_path_index == path_index) + { + have_path_ext = 1; + break; + } } - vnet_mpls_uc_set_label(&muh[ii].label_exp_s_ttl, mt->mt_label_stack[ii]); - vnet_mpls_uc_set_ttl(&muh[ii].label_exp_s_ttl, 255); - vnet_mpls_uc_set_exp(&muh[ii].label_exp_s_ttl, 0); - - if ((VNET_LINK_MPLS == link_type) && - (mt->mt_label_stack[ii] != MPLS_IETF_IMPLICIT_NULL_LABEL)) + if (have_path_ext) { - vnet_mpls_uc_set_s(&muh[ii].label_exp_s_ttl, MPLS_NON_EOS); + /* + * found a matching extension. stack it to obtain the forwarding + * info for this path. + */ + ctx->next_hops = fib_path_ext_stack(path_ext, + ctx->fct, + ctx->fct, + ctx->next_hops); } else + ASSERT(0); + /* + * else + * There should be a path-extenios associated with each path + */ + + return (!0); +} + +static void +mpls_tunnel_mk_lb (mpls_tunnel_t *mt, + vnet_link_t linkt, + fib_forward_chain_type_t fct, + dpo_id_t *dpo_lb) +{ + dpo_proto_t lb_proto; + + /* + * If the entry has path extensions then we construct a load-balance + * by stacking the extensions on the forwarding chains of the paths. 
+ * Otherwise we use the load-balance of the path-list + */ + mpls_tunnel_collect_forwarding_ctx_t ctx = { + .mt = mt, + .next_hops = NULL, + .fct = fct, + }; + + /* + * As an optimisation we allocate the vector of next-hops to be sized + * equal to the maximum nuber of paths we will need, which is also the + * most likely number we will need, since in most cases the paths are 'up'. + */ + vec_validate(ctx.next_hops, fib_path_list_get_n_paths(mt->mt_path_list)); + vec_reset_length(ctx.next_hops); + + lb_proto = vnet_link_to_dpo_proto(linkt); + + fib_path_list_walk(mt->mt_path_list, + mpls_tunnel_collect_forwarding, + &ctx); + + if (!dpo_id_is_valid(dpo_lb)) { - vnet_mpls_uc_set_s(&muh[ii].label_exp_s_ttl, MPLS_EOS); + /* + * first time create + */ + if (mt->mt_flags & MPLS_TUNNEL_FLAG_MCAST) + { + dpo_set(dpo_lb, + DPO_REPLICATE, + lb_proto, + replicate_create(0, lb_proto)); + } + else + { + flow_hash_config_t fhc; + + fhc = 0; // FIXME + /* fhc = fib_table_get_flow_hash_config(fib_entry->fe_fib_index, */ + /* dpo_proto_to_fib(lb_proto)); */ + dpo_set(dpo_lb, + DPO_LOAD_BALANCE, + lb_proto, + load_balance_create(0, lb_proto, fhc)); + } } - muh[ii].label_exp_s_ttl = clib_host_to_net_u32(muh[ii].label_exp_s_ttl); - - return (rewrite); + if (mt->mt_flags & MPLS_TUNNEL_FLAG_MCAST) + { + /* + * MPLS multicast + */ + replicate_multipath_update(dpo_lb, ctx.next_hops); + } + else + { + load_balance_multipath_update(dpo_lb, + ctx.next_hops, + LOAD_BALANCE_FLAG_NONE); + vec_free(ctx.next_hops); + } } /** @@ -161,45 +244,47 @@ mpls_tunnel_stack (adj_index_t ai) mt = mpls_tunnel_get_from_sw_if_index(sw_if_index); if (NULL == mt) - return; + return; /* - * find the adjacency that is contributed by the FIB path-list - * that this tunnel resovles via, and use it as the next adj - * in the midchain + * while we're stacking the adj, remove the tunnel from the child list + * of the path list. 
this breaks a circular dependency of walk updates + * where the create of adjacencies in the children can lead to walks + * that get back here. */ - if (vnet_hw_interface_get_flags(vnet_get_main(), - mt->mt_hw_if_index) & - VNET_HW_INTERFACE_FLAG_LINK_UP) - { - dpo_id_t dpo = DPO_INVALID; + fib_path_list_lock(mt->mt_path_list); - fib_path_list_contribute_forwarding(mt->mt_path_list, - mpls_tunnel_get_fwd_chain_type(mt), - &dpo); - - if (DPO_LOAD_BALANCE == dpo.dpoi_type) - { - /* - * we don't support multiple paths, so no need to load-balance. - * pull the first and only choice and stack directly on that. - */ - load_balance_t *lb; - - lb = load_balance_get (dpo.dpoi_index); + fib_path_list_child_remove(mt->mt_path_list, + mt->mt_sibling_index); - ASSERT(1 == lb->lb_n_buckets); + /* + * Construct the DPO (load-balance or replicate) that we can stack + * the tunnel's midchain on + */ + if (vnet_hw_interface_get_flags(vnet_get_main(), + mt->mt_hw_if_index) & + VNET_HW_INTERFACE_FLAG_LINK_UP) + { + dpo_id_t dpo = DPO_INVALID; - dpo_copy(&dpo, load_balance_get_bucket_i (lb, 0)); - } + mpls_tunnel_mk_lb(mt, + adj->ia_link, + FIB_FORW_CHAIN_TYPE_MPLS_EOS, + &dpo); - adj_nbr_midchain_stack(ai, &dpo); - dpo_reset(&dpo); + adj_nbr_midchain_stack(ai, &dpo); + dpo_reset(&dpo); } else { - adj_nbr_midchain_unstack(ai); + adj_nbr_midchain_unstack(ai); } + + mt->mt_sibling_index = fib_path_list_child_add(mt->mt_path_list, + FIB_NODE_TYPE_MPLS_TUNNEL, + mt - mpls_tunnel_pool); + + fib_path_list_lock(mt->mt_path_list); } /** @@ -207,7 +292,7 @@ mpls_tunnel_stack (adj_index_t ai) */ static adj_walk_rc_t mpls_adj_walk_cb (adj_index_t ai, - void *ctx) + void *ctx) { mpls_tunnel_stack(ai); @@ -224,17 +309,17 @@ mpls_tunnel_restack (mpls_tunnel_t *mt) */ FOR_EACH_FIB_PROTOCOL(proto) { - adj_nbr_walk(mt->mt_sw_if_index, - proto, - mpls_adj_walk_cb, - NULL); + adj_nbr_walk(mt->mt_sw_if_index, + proto, + mpls_adj_walk_cb, + NULL); } } static clib_error_t * mpls_tunnel_admin_up_down 
(vnet_main_t * vnm, - u32 hw_if_index, - u32 flags) + u32 hw_if_index, + u32 flags) { vnet_hw_interface_t * hi; mpls_tunnel_t *mt; @@ -244,13 +329,13 @@ mpls_tunnel_admin_up_down (vnet_main_t * vnm, mt = mpls_tunnel_get_from_sw_if_index(hi->sw_if_index); if (NULL == mt) - return (NULL); + return (NULL); if (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) - vnet_hw_interface_set_flags (vnm, hw_if_index, - VNET_HW_INTERFACE_FLAG_LINK_UP); + vnet_hw_interface_set_flags (vnm, hw_if_index, + VNET_HW_INTERFACE_FLAG_LINK_UP); else - vnet_hw_interface_set_flags (vnm, hw_if_index, 0 /* down */); + vnet_hw_interface_set_flags (vnm, hw_if_index, 0 /* down */); mpls_tunnel_restack(mt); @@ -263,22 +348,58 @@ mpls_tunnel_admin_up_down (vnet_main_t * vnm, */ static void mpls_tunnel_fixup (vlib_main_t *vm, - ip_adjacency_t *adj, - vlib_buffer_t *b0) + ip_adjacency_t *adj, + vlib_buffer_t *b0) { + /* + * A no-op w.r.t. the header. but reset the 'have we pushed any + * MPLS labels onto the packet' flag. That way when we enter the + * tunnel we'll get a TTL set to 255 + */ + vnet_buffer(b0)->mpls.first = 0; } static void mpls_tunnel_update_adj (vnet_main_t * vnm, - u32 sw_if_index, - adj_index_t ai) + u32 sw_if_index, + adj_index_t ai) { - adj_nbr_midchain_update_rewrite( - ai, mpls_tunnel_fixup, - ADJ_FLAG_NONE, - mpls_tunnel_build_rewrite(vnm, sw_if_index, - adj_get_link_type(ai), - NULL)); + ip_adjacency_t *adj; + + ASSERT(ADJ_INDEX_INVALID != ai); + + adj = adj_get(ai); + + switch (adj->lookup_next_index) + { + case IP_LOOKUP_NEXT_ARP: + case IP_LOOKUP_NEXT_GLEAN: + adj_nbr_midchain_update_rewrite(ai, mpls_tunnel_fixup, + ADJ_FLAG_NONE, + mpls_tunnel_build_rewrite_i()); + break; + case IP_LOOKUP_NEXT_MCAST: + /* + * Construct a partial rewrite from the known ethernet mcast dest MAC + * There's no MAC fixup, so the last 2 parameters are 0 + */ + adj_mcast_midchain_update_rewrite(ai, mpls_tunnel_fixup, + ADJ_FLAG_NONE, + mpls_tunnel_build_rewrite_i(), + 0, 0); + break; + + case 
IP_LOOKUP_NEXT_DROP: + case IP_LOOKUP_NEXT_PUNT: + case IP_LOOKUP_NEXT_LOCAL: + case IP_LOOKUP_NEXT_REWRITE: + case IP_LOOKUP_NEXT_MIDCHAIN: + case IP_LOOKUP_NEXT_MCAST_MIDCHAIN: + case IP_LOOKUP_NEXT_ICMP_ERROR: + case IP_LOOKUP_N_NEXT: + ASSERT (0); + break; + } mpls_tunnel_stack(ai); } @@ -312,7 +433,7 @@ typedef struct mpls_tunnel_trace_t_ static u8 * format_mpls_tunnel_tx_trace (u8 * s, - va_list * args) + va_list * args) { CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); @@ -327,8 +448,8 @@ format_mpls_tunnel_tx_trace (u8 * s, */ static uword mpls_tunnel_tx (vlib_main_t * vm, - vlib_node_runtime_t * node, - vlib_frame_t * frame) + vlib_node_runtime_t * node, + vlib_frame_t * frame) { u32 next_index; u32 * from, * to_next, n_left_from, n_left_to_next; @@ -355,32 +476,32 @@ mpls_tunnel_tx (vlib_main_t * vm, * FIXME DUAL LOOP */ while (n_left_from > 0 && n_left_to_next > 0) - { - vlib_buffer_t * b0; - u32 bi0; + { + vlib_buffer_t * b0; + u32 bi0; - bi0 = from[0]; - to_next[0] = bi0; - from += 1; - to_next += 1; - n_left_from -= 1; - n_left_to_next -= 1; + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; - b0 = vlib_get_buffer(vm, bi0); + b0 = vlib_get_buffer(vm, bi0); - vnet_buffer(b0)->ip.adj_index[VLIB_TX] = mt->mt_l2_adj; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = mt->mt_l2_adj; - if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) - { - mpls_tunnel_trace_t *tr = vlib_add_trace (vm, node, - b0, sizeof (*tr)); - tr->tunnel_id = rd->dev_instance; - } + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + mpls_tunnel_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->tunnel_id = rd->dev_instance; + } - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - bi0, mt->mt_l2_tx_arc); - } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, 
mt->mt_l2_tx_arc); + } vlib_put_next_frame (vm, node, next_index, n_left_to_next); } @@ -417,13 +538,13 @@ mpls_tunnel_get (u32 mti) */ void mpls_tunnel_walk (mpls_tunnel_walk_cb_t cb, - void *ctx) + void *ctx) { u32 mti; pool_foreach_index(mti, mpls_tunnel_pool, ({ - cb(mti, ctx); + cb(mti, ctx); })); } @@ -435,25 +556,22 @@ vnet_mpls_tunnel_del (u32 sw_if_index) mt = mpls_tunnel_get_from_sw_if_index(sw_if_index); if (NULL == mt) - return; - - fib_path_list_child_remove(mt->mt_path_list, - mt->mt_sibling_index); - if (ADJ_INDEX_INVALID != mt->mt_l2_adj) - adj_unlock(mt->mt_l2_adj); + return; - vec_free(mt->mt_label_stack); + if (FIB_NODE_INDEX_INVALID != mt->mt_path_list) + fib_path_list_child_remove(mt->mt_path_list, + mt->mt_sibling_index); + if (ADJ_INDEX_INVALID != mt->mt_l2_adj) + adj_unlock(mt->mt_l2_adj); vec_add1 (mpls_tunnel_free_hw_if_indices, mt->mt_hw_if_index); pool_put(mpls_tunnel_pool, mt); mpls_tunnel_db[sw_if_index] = ~0; } -void -vnet_mpls_tunnel_add (fib_route_path_t *rpaths, - mpls_label_t *label_stack, - u8 l2_only, - u32 *sw_if_index) +u32 +vnet_mpls_tunnel_create (u8 l2_only, + u8 is_multicast) { vnet_hw_interface_t * hi; mpls_tunnel_t *mt; @@ -466,28 +584,33 @@ vnet_mpls_tunnel_add (fib_route_path_t *rpaths, mti = mt - mpls_tunnel_pool; fib_node_init(&mt->mt_node, FIB_NODE_TYPE_MPLS_TUNNEL); mt->mt_l2_adj = ADJ_INDEX_INVALID; + mt->mt_path_list = FIB_NODE_INDEX_INVALID; + mt->mt_sibling_index = FIB_NODE_INDEX_INVALID; + + if (is_multicast) + mt->mt_flags |= MPLS_TUNNEL_FLAG_MCAST; /* * Create a new, or re=use and old, tunnel HW interface */ if (vec_len (mpls_tunnel_free_hw_if_indices) > 0) { - mt->mt_hw_if_index = - mpls_tunnel_free_hw_if_indices[vec_len(mpls_tunnel_free_hw_if_indices)-1]; - _vec_len (mpls_tunnel_free_hw_if_indices) -= 1; - hi = vnet_get_hw_interface (vnm, mt->mt_hw_if_index); - hi->hw_instance = mti; - hi->dev_instance = mti; + mt->mt_hw_if_index = + 
mpls_tunnel_free_hw_if_indices[vec_len(mpls_tunnel_free_hw_if_indices)-1]; + _vec_len (mpls_tunnel_free_hw_if_indices) -= 1; + hi = vnet_get_hw_interface (vnm, mt->mt_hw_if_index); + hi->hw_instance = mti; + hi->dev_instance = mti; } - else + else { - mt->mt_hw_if_index = vnet_register_interface( - vnm, - mpls_tunnel_class.index, - mti, - mpls_tunnel_hw_interface_class.index, - mti); - hi = vnet_get_hw_interface(vnm, mt->mt_hw_if_index); + mt->mt_hw_if_index = vnet_register_interface( + vnm, + mpls_tunnel_class.index, + mti, + mpls_tunnel_hw_interface_class.index, + mti); + hi = vnet_get_hw_interface(vnm, mt->mt_hw_if_index); } /* @@ -497,43 +620,218 @@ vnet_mpls_tunnel_add (fib_route_path_t *rpaths, vec_validate_init_empty(mpls_tunnel_db, mt->mt_sw_if_index, ~0); mpls_tunnel_db[mt->mt_sw_if_index] = mti; + if (l2_only) + { + mt->mt_l2_adj = + adj_nbr_add_or_lock(fib_path_list_get_proto(mt->mt_path_list), + VNET_LINK_ETHERNET, + &zero_addr, + mt->mt_sw_if_index); + + mt->mt_l2_tx_arc = vlib_node_add_named_next(vlib_get_main(), + hi->tx_node_index, + "adj-l2-midchain"); + } + + return (mt->mt_sw_if_index); +} + +/* + * mpls_tunnel_path_ext_add + * + * append a path extension to the entry's list + */ +static void +mpls_tunnel_path_ext_append (mpls_tunnel_t *mt, + const fib_route_path_t *rpath) +{ + if (NULL != rpath->frp_label_stack) + { + fib_path_ext_t *path_ext; + + vec_add2(mt->mt_path_exts, path_ext, 1); + + fib_path_ext_init(path_ext, mt->mt_path_list, rpath); + } +} + +/* + * mpls_tunnel_path_ext_insert + * + * insert, sorted, a path extension to the entry's list. + * It's not strictly necessary in sort the path extensions, since each + * extension has the path index to which it resolves. However, by being + * sorted the load-balance produced has a deterministic order, not an order + * based on the sequence of extension additions. this is a considerable benefit. 
+ */ +static void +mpls_tunnel_path_ext_insert (mpls_tunnel_t *mt, + const fib_route_path_t *rpath) +{ + if (0 == vec_len(mt->mt_path_exts)) + return (mpls_tunnel_path_ext_append(mt, rpath)); + + if (NULL != rpath->frp_label_stack) + { + fib_path_ext_t path_ext; + int i = 0; + + fib_path_ext_init(&path_ext, mt->mt_path_list, rpath); + + while (i < vec_len(mt->mt_path_exts) && + (fib_path_ext_cmp(&mt->mt_path_exts[i], rpath) < 0)) + { + i++; + } + + vec_insert_elts(mt->mt_path_exts, &path_ext, 1, i); + } +} + +void +vnet_mpls_tunnel_path_add (u32 sw_if_index, + fib_route_path_t *rpaths) +{ + mpls_tunnel_t *mt; + u32 mti; + + mt = mpls_tunnel_get_from_sw_if_index(sw_if_index); + + if (NULL == mt) + return; + + mti = mt - mpls_tunnel_pool; + /* * construct a path-list from the path provided */ - mt->mt_path_list = fib_path_list_create(FIB_PATH_LIST_FLAG_SHARED, rpaths); - mt->mt_sibling_index = fib_path_list_child_add(mt->mt_path_list, - FIB_NODE_TYPE_MPLS_TUNNEL, - mti); + if (FIB_NODE_INDEX_INVALID == mt->mt_path_list) + { + mt->mt_path_list = fib_path_list_create(FIB_PATH_LIST_FLAG_SHARED, rpaths); + mt->mt_sibling_index = fib_path_list_child_add(mt->mt_path_list, + FIB_NODE_TYPE_MPLS_TUNNEL, + mti); + } + else + { + fib_node_index_t old_pl_index; + fib_path_ext_t *path_ext; + + old_pl_index = mt->mt_path_list; + + mt->mt_path_list = + fib_path_list_copy_and_path_add(old_pl_index, + FIB_PATH_LIST_FLAG_SHARED, + rpaths); + + fib_path_list_child_remove(old_pl_index, + mt->mt_sibling_index); + mt->mt_sibling_index = fib_path_list_child_add(mt->mt_path_list, + FIB_NODE_TYPE_MPLS_TUNNEL, + mti); + /* + * re-resolve all the path-extensions with the new path-list + */ + vec_foreach(path_ext, mt->mt_path_exts) + { + fib_path_ext_resolve(path_ext, mt->mt_path_list); + } + } + mpls_tunnel_path_ext_insert(mt, rpaths); + mpls_tunnel_restack(mt); +} + +int +vnet_mpls_tunnel_path_remove (u32 sw_if_index, + fib_route_path_t *rpaths) +{ + mpls_tunnel_t *mt; + u32 mti; - 
mt->mt_label_stack = vec_dup(label_stack); + mt = mpls_tunnel_get_from_sw_if_index(sw_if_index); - if (l2_only) + if (NULL == mt) + return (0); + + mti = mt - mpls_tunnel_pool; + + /* + * construct a path-list from the path provided + */ + if (FIB_NODE_INDEX_INVALID == mt->mt_path_list) { - mt->mt_l2_adj = - adj_nbr_add_or_lock(fib_path_list_get_proto(mt->mt_path_list), - VNET_LINK_ETHERNET, - &zero_addr, - mt->mt_sw_if_index); - - mt->mt_l2_tx_arc = vlib_node_add_named_next(vlib_get_main(), - hi->tx_node_index, - "adj-l2-midchain"); + /* can't remove a path if we have onoe */ + return (0); } - - *sw_if_index = mt->mt_sw_if_index; + else + { + fib_node_index_t old_pl_index; + fib_path_ext_t *path_ext; + + old_pl_index = mt->mt_path_list; + + mt->mt_path_list = + fib_path_list_copy_and_path_remove(old_pl_index, + FIB_PATH_LIST_FLAG_SHARED, + rpaths); + + fib_path_list_child_remove(old_pl_index, + mt->mt_sibling_index); + + if (FIB_NODE_INDEX_INVALID == mt->mt_path_list) + { + /* no paths left */ + return (0); + } + else + { + mt->mt_sibling_index = + fib_path_list_child_add(mt->mt_path_list, + FIB_NODE_TYPE_MPLS_TUNNEL, + mti); + } + /* + * find the matching path extension and remove it + */ + vec_foreach(path_ext, mt->mt_path_exts) + { + if (!fib_path_ext_cmp(path_ext, rpaths)) + { + /* + * delete the element moving the remaining elements down 1 position. + * this preserves the sorted order. 
+ */ + vec_free(path_ext->fpe_label_stack); + vec_delete(mt->mt_path_exts, 1, + (path_ext - mt->mt_path_exts)); + break; + } + } + /* + * re-resolve all the path-extensions with the new path-list + */ + vec_foreach(path_ext, mt->mt_path_exts) + { + fib_path_ext_resolve(path_ext, mt->mt_path_list); + } + + mpls_tunnel_restack(mt); + } + + return (fib_path_list_get_n_paths(mt->mt_path_list)); } + static clib_error_t * vnet_create_mpls_tunnel_command_fn (vlib_main_t * vm, - unformat_input_t * input, - vlib_cli_command_t * cmd) + unformat_input_t * input, + vlib_cli_command_t * cmd) { unformat_input_t _line_input, * line_input = &_line_input; vnet_main_t * vnm = vnet_get_main(); - u8 is_del = 0; - u8 l2_only = 0; + u8 is_del = 0, l2_only = 0, is_multicast =0; fib_route_path_t rpath, *rpaths = NULL; - mpls_label_t out_label = MPLS_LABEL_INVALID, *labels = NULL; + mpls_label_t out_label = MPLS_LABEL_INVALID; u32 sw_if_index; clib_error_t *error = NULL; @@ -541,87 +839,89 @@ vnet_create_mpls_tunnel_command_fn (vlib_main_t * vm, /* Get a line of input. */ if (! 
unformat_user (input, unformat_line_input, line_input)) - return 0; + return 0; while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { - if (unformat (line_input, "del %U", - unformat_vnet_sw_interface, vnm, - &sw_if_index)) - is_del = 1; - else if (unformat (line_input, "add")) - is_del = 0; - else if (unformat (line_input, "out-label %U", - unformat_mpls_unicast_label, &out_label)) - { - vec_add1(labels, out_label); - } - else if (unformat (line_input, "via %U %U", - unformat_ip4_address, - &rpath.frp_addr.ip4, - unformat_vnet_sw_interface, vnm, - &rpath.frp_sw_if_index)) - { - rpath.frp_weight = 1; - rpath.frp_proto = FIB_PROTOCOL_IP4; - } - - else if (unformat (line_input, "via %U %U", - unformat_ip6_address, - &rpath.frp_addr.ip6, - unformat_vnet_sw_interface, vnm, - &rpath.frp_sw_if_index)) - { - rpath.frp_weight = 1; - rpath.frp_proto = FIB_PROTOCOL_IP6; - } - else if (unformat (line_input, "via %U", - unformat_ip6_address, - &rpath.frp_addr.ip6)) - { - rpath.frp_fib_index = 0; - rpath.frp_weight = 1; - rpath.frp_sw_if_index = ~0; - rpath.frp_proto = FIB_PROTOCOL_IP6; - } - else if (unformat (line_input, "via %U", - unformat_ip4_address, - &rpath.frp_addr.ip4)) - { - rpath.frp_fib_index = 0; - rpath.frp_weight = 1; - rpath.frp_sw_if_index = ~0; - rpath.frp_proto = FIB_PROTOCOL_IP4; - } - else if (unformat (line_input, "l2-only")) - l2_only = 1; - else - { - error = clib_error_return (0, "unknown input '%U'", - format_unformat_error, line_input); - goto done; - } + if (unformat (line_input, "del %U", + unformat_vnet_sw_interface, vnm, + &sw_if_index)) + is_del = 1; + else if (unformat (line_input, "add")) + is_del = 0; + else if (unformat (line_input, "out-label %U", + unformat_mpls_unicast_label, &out_label)) + { + vec_add1(rpath.frp_label_stack, out_label); + } + else if (unformat (line_input, "via %U %U", + unformat_ip4_address, + &rpath.frp_addr.ip4, + unformat_vnet_sw_interface, vnm, + &rpath.frp_sw_if_index)) + { + rpath.frp_weight = 1; + 
rpath.frp_proto = FIB_PROTOCOL_IP4; + } + + else if (unformat (line_input, "via %U %U", + unformat_ip6_address, + &rpath.frp_addr.ip6, + unformat_vnet_sw_interface, vnm, + &rpath.frp_sw_if_index)) + { + rpath.frp_weight = 1; + rpath.frp_proto = FIB_PROTOCOL_IP6; + } + else if (unformat (line_input, "via %U", + unformat_ip6_address, + &rpath.frp_addr.ip6)) + { + rpath.frp_fib_index = 0; + rpath.frp_weight = 1; + rpath.frp_sw_if_index = ~0; + rpath.frp_proto = FIB_PROTOCOL_IP6; + } + else if (unformat (line_input, "via %U", + unformat_ip4_address, + &rpath.frp_addr.ip4)) + { + rpath.frp_fib_index = 0; + rpath.frp_weight = 1; + rpath.frp_sw_if_index = ~0; + rpath.frp_proto = FIB_PROTOCOL_IP4; + } + else if (unformat (line_input, "l2-only")) + l2_only = 1; + else if (unformat (line_input, "multicast")) + is_multicast = 1; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } } if (is_del) { - vnet_mpls_tunnel_del(sw_if_index); + vnet_mpls_tunnel_del(sw_if_index); } else { - if (0 == vec_len(labels)) - { - error = clib_error_return (0, "No Output Labels '%U'", - format_unformat_error, line_input); - goto done; - } - - vec_add1(rpaths, rpath); - vnet_mpls_tunnel_add(rpaths, labels, l2_only, &sw_if_index); + if (0 == vec_len(rpath.frp_label_stack)) + { + error = clib_error_return (0, "No Output Labels '%U'", + format_unformat_error, line_input); + goto done; + } + + vec_add1(rpaths, rpath); + sw_if_index = vnet_mpls_tunnel_create(l2_only, is_multicast); + vnet_mpls_tunnel_path_add(sw_if_index, rpaths); } done: - vec_free(labels); vec_free(rpaths); unformat_free (line_input); @@ -638,7 +938,7 @@ done: ?*/ VLIB_CLI_COMMAND (create_mpls_tunnel_command, static) = { .path = "mpls tunnel", - .short_help = + .short_help = "mpls tunnel via [addr] [interface] [out-labels]", .function = vnet_create_mpls_tunnel_command_fn, }; @@ -647,19 +947,28 @@ static u8 * format_mpls_tunnel (u8 * s, va_list * args) { mpls_tunnel_t 
*mt = va_arg (*args, mpls_tunnel_t *); - int ii; + mpls_tunnel_attribute_t attr; + fib_path_ext_t *path_ext; s = format(s, "mpls_tunnel%d: sw_if_index:%d hw_if_index:%d", - mt - mpls_tunnel_pool, - mt->mt_sw_if_index, - mt->mt_hw_if_index); - s = format(s, "\n label-stack:\n "); - for (ii = 0; ii < vec_len(mt->mt_label_stack); ii++) - { - s = format(s, "%d, ", mt->mt_label_stack[ii]); + mt - mpls_tunnel_pool, + mt->mt_sw_if_index, + mt->mt_hw_if_index); + if (MPLS_TUNNEL_FLAG_NONE != mt->mt_flags) { + s = format(s, " \n flags:"); + FOR_EACH_MPLS_TUNNEL_ATTRIBUTE(attr) { + if ((1<mt_flags) { + s = format (s, "%s,", mpls_tunnel_attribute_names[attr]); + } + } } s = format(s, "\n via:\n"); s = fib_path_list_format(mt->mt_path_list, s); + s = format(s, " Extensions:"); + vec_foreach(path_ext, mt->mt_path_exts) + { + s = format(s, "\n %U", format_fib_path_ext, path_ext); + } s = format(s, "\n"); return (s); @@ -667,42 +976,42 @@ format_mpls_tunnel (u8 * s, va_list * args) static clib_error_t * show_mpls_tunnel_command_fn (vlib_main_t * vm, - unformat_input_t * input, - vlib_cli_command_t * cmd) + unformat_input_t * input, + vlib_cli_command_t * cmd) { mpls_tunnel_t * mt; u32 mti = ~0; if (pool_elts (mpls_tunnel_pool) == 0) - vlib_cli_output (vm, "No MPLS tunnels configured..."); + vlib_cli_output (vm, "No MPLS tunnels configured..."); while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { - if (unformat (input, "%d", &mti)) - ; - else - break; + if (unformat (input, "%d", &mti)) + ; + else + break; } if (~0 == mti) { - pool_foreach (mt, mpls_tunnel_pool, - ({ - vlib_cli_output (vm, "[@%d] %U", - mt - mpls_tunnel_pool, - format_mpls_tunnel, mt); - })); + pool_foreach (mt, mpls_tunnel_pool, + ({ + vlib_cli_output (vm, "[@%d] %U", + mt - mpls_tunnel_pool, + format_mpls_tunnel, mt); + })); } else { - if (pool_is_free_index(mpls_tunnel_pool, mti)) - return clib_error_return (0, "Not atunnel index %d", mti); + if (pool_is_free_index(mpls_tunnel_pool, mti)) + return 
clib_error_return (0, "Not atunnel index %d", mti); - mt = pool_elt_at_index(mpls_tunnel_pool, mti); + mt = pool_elt_at_index(mpls_tunnel_pool, mti); - vlib_cli_output (vm, "[@%d] %U", - mt - mpls_tunnel_pool, - format_mpls_tunnel, mt); + vlib_cli_output (vm, "[@%d] %U", + mt - mpls_tunnel_pool, + format_mpls_tunnel, mt); } return 0; @@ -715,7 +1024,7 @@ show_mpls_tunnel_command_fn (vlib_main_t * vm, * @cliexstart{sh mpls tunnel 2} * [@2] mpls_tunnel2: sw_if_index:5 hw_if_index:5 * label-stack: - * 3, + * 3, * via: * index:26 locks:1 proto:ipv4 uPRF-list:26 len:1 itfs:[2, ] * index:26 pl-index:26 ipv4 weight=1 attached-nexthop: oper-flags:resolved, @@ -743,7 +1052,7 @@ mpls_tunnel_from_fib_node (fib_node_t *node) */ static fib_node_back_walk_rc_t mpls_tunnel_back_walk (fib_node_t *node, - fib_node_back_walk_ctx_t *ctx) + fib_node_back_walk_ctx_t *ctx) { mpls_tunnel_restack(mpls_tunnel_from_fib_node(node)); diff --git a/src/vnet/mpls/mpls_tunnel.h b/src/vnet/mpls/mpls_tunnel.h index ee56c0fc..0b55d0db 100644 --- a/src/vnet/mpls/mpls_tunnel.h +++ b/src/vnet/mpls/mpls_tunnel.h @@ -17,6 +17,31 @@ #define __MPLS_TUNNEL_H__ #include +#include + +typedef enum mpls_tunnel_attribute_t_ +{ + MPLS_TUNNEL_ATTRIBUTE_FIRST = 0, + /** + * @brief The tunnel has an underlying multicast LSP + */ + MPLS_TUNNEL_ATTRIBUTE_MCAST = MPLS_TUNNEL_ATTRIBUTE_FIRST, + MPLS_TUNNEL_ATTRIBUTE_LAST = MPLS_TUNNEL_ATTRIBUTE_MCAST, +} mpls_tunnel_attribute_t; + +#define MPLS_TUNNEL_ATTRIBUTES { \ + [MPLS_TUNNEL_ATTRIBUTE_MCAST] = "multicast", \ +} +#define FOR_EACH_MPLS_TUNNEL_ATTRIBUTE(_item) \ + for (_item = MPLS_TUNNEL_ATTRIBUTE_FIRST; \ + _item < MPLS_TUNNEL_ATTRIBUTE_LAST; \ + _item++) + +typedef enum mpls_tunnel_flag_t_ { + MPLS_TUNNEL_FLAG_NONE = 0, + MPLS_TUNNEL_FLAG_MCAST = (1 << MPLS_TUNNEL_ATTRIBUTE_MCAST), +} __attribute__ ((packed)) mpls_tunnel_flags_t; + /** * @brief A uni-directional MPLS tunnel @@ -28,6 +53,11 @@ typedef struct mpls_tunnel_t_ */ fib_node_t mt_node; + /** + * @brief 
Tunnel flags + */ + mpls_tunnel_flags_t mt_flags; + /** * @brief If the tunnel is an L2 tunnel, this is the link type ETHERNET * adjacency @@ -50,9 +80,9 @@ typedef struct mpls_tunnel_t_ u32 mt_sibling_index; /** - * @brief The Label stack to apply to egress packets + * A vector of path extensions o hold the label stack for each path */ - mpls_label_t *mt_label_stack; + fib_path_ext_t *mt_path_exts; /** * @brief Flag to indicate the tunnel is only for L2 traffic, that is @@ -74,12 +104,27 @@ typedef struct mpls_tunnel_t_ /** * @brief Create a new MPLS tunnel + * @return the SW Interface index of the newly created tuneel */ -extern void vnet_mpls_tunnel_add (fib_route_path_t *rpath, - mpls_label_t *label_stack, - u8 l2_only, - u32 *sw_if_index); +extern u32 vnet_mpls_tunnel_create (u8 l2_only, + u8 is_multicast); +/** + * @brief Add a path to an MPLS tunnel + */ +extern void vnet_mpls_tunnel_path_add (u32 sw_if_index, + fib_route_path_t *rpath); + +/** + * @brief remove a path from a tunnel. + * @return the number of remaining paths. 0 implies the tunnel can be deleted + */ +extern int vnet_mpls_tunnel_path_remove (u32 sw_if_index, + fib_route_path_t *rpath); + +/** + * @brief Delete an MPLS tunnel + */ extern void vnet_mpls_tunnel_del (u32 sw_if_index); extern const mpls_tunnel_t *mpls_tunnel_get(u32 index); diff --git a/src/vnet/mpls/mpls_types.h b/src/vnet/mpls/mpls_types.h index d7c629df..b1075cdd 100644 --- a/src/vnet/mpls/mpls_types.h +++ b/src/vnet/mpls/mpls_types.h @@ -1,3 +1,17 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef __MPLS_TYPES_H__ #define __MPLS_TYPES_H__ @@ -36,4 +50,10 @@ (((_lbl) > MPLS_IETF_MIN_UNRES_LABEL) && \ ((_lbl) <= MPLS_IETF_MAX_UNRES_LABEL)) +/** + * The top bit of the index, which is the result of the MPLS lookup + * is used to determine if the DPO is a load-balance or a replicate + */ +#define MPLS_IS_REPLICATE 0x80000000 + #endif diff --git a/src/vnet/srp/interface.c b/src/vnet/srp/interface.c index d427cc3c..44e2b0d6 100644 --- a/src/vnet/srp/interface.c +++ b/src/vnet/srp/interface.c @@ -58,7 +58,7 @@ srp_build_rewrite (vnet_main_t * vnm, #define _(a,b) case VNET_LINK_##a: type = ETHERNET_TYPE_##b; break _ (IP4, IP4); _ (IP6, IP6); - _ (MPLS, MPLS_UNICAST); + _ (MPLS, MPLS); _ (ARP, ARP); #undef _ default: diff --git a/test/test_ip_mcast.py b/test/test_ip_mcast.py index 36d597a7..c1397d70 100644 --- a/test/test_ip_mcast.py +++ b/test/test_ip_mcast.py @@ -622,6 +622,7 @@ class TestIPMcast(VppTestCase): (MRouteItfFlags.MFIB_ITF_FLAG_ACCEPT | MRouteItfFlags.MFIB_ITF_FLAG_NEGATE_SIGNAL)) + self.vapi.cli("clear trace") tx = self._mcast_connected_send_stream("232.1.1.1") signals = self.vapi.mfib_signal_dump() diff --git a/test/test_mpls.py b/test/test_mpls.py index fc832644..700b7091 100644 --- a/test/test_mpls.py +++ b/test/test_mpls.py @@ -5,7 +5,9 @@ import socket from framework import VppTestCase, VppTestRunner from vpp_ip_route import VppIpRoute, VppRoutePath, VppMplsRoute, \ - VppMplsIpBind + VppMplsIpBind, VppIpMRoute, VppMRoutePath, \ + MRouteItfFlags, MRouteEntryFlags +from vpp_mpls_tunnel_interface import 
VppMPLSTunnelInterface from scapy.packet import Raw from scapy.layers.l2 import Ether @@ -21,7 +23,7 @@ class TestMPLS(VppTestCase): super(TestMPLS, self).setUp() # create 2 pg interfaces - self.create_pg_interfaces(range(2)) + self.create_pg_interfaces(range(4)) # setup both interfaces # assign them different tables. @@ -53,10 +55,12 @@ class TestMPLS(VppTestCase): mpls_labels, mpls_ttl=255, ping=0, - ip_itf=None): + ip_itf=None, + dst_ip=None, + n=257): self.reset_packet_infos() pkts = [] - for i in range(0, 257): + for i in range(0, n): info = self.create_packet_info(src_if, src_if) payload = self.info_to_payload(info) p = Ether(dst=src_if.local_mac, src=src_if.remote_mac) @@ -67,9 +71,14 @@ class TestMPLS(VppTestCase): else: p = p / MPLS(label=mpls_labels[ii], ttl=mpls_ttl, s=0) if not ping: - p = (p / IP(src=src_if.local_ip4, dst=src_if.remote_ip4) / - UDP(sport=1234, dport=1234) / - Raw(payload)) + if not dst_ip: + p = (p / IP(src=src_if.local_ip4, dst=src_if.remote_ip4) / + UDP(sport=1234, dport=1234) / + Raw(payload)) + else: + p = (p / IP(src=src_if.local_ip4, dst=dst_ip) / + UDP(sport=1234, dport=1234) / + Raw(payload)) else: p = (p / IP(src=ip_itf.remote_ip4, dst=ip_itf.local_ip4) / @@ -254,6 +263,13 @@ class TestMPLS(VppTestCase): except: raise + def send_and_assert_no_replies(self, intf, pkts, remark): + intf.add_stream(pkts) + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + for i in self.pg_interfaces: + i.assert_nothing_captured(remark=remark) + def test_swap(self): """ MPLS label swap tests """ @@ -278,7 +294,7 @@ class TestMPLS(VppTestCase): self.pg_start() rx = self.pg0.get_capture() - self.verify_capture_labelled_ip4(self.pg0, rx, tx, [33]) + self.verify_capture_labelled(self.pg0, rx, tx, [33]) # # A simple MPLS xconnect - non-eos label in label out @@ -358,7 +374,7 @@ class TestMPLS(VppTestCase): self.pg_start() rx = self.pg0.get_capture() - self.verify_capture_labelled_ip4(self.pg0, rx, tx, [33, 44, 45]) + 
self.verify_capture_labelled(self.pg0, rx, tx, [33, 44, 45], num=2) # # A recursive non-EOS x-connect, which resolves through another @@ -576,25 +592,19 @@ class TestMPLS(VppTestCase): # # Create a tunnel with a single out label # - nh_addr = socket.inet_pton(socket.AF_INET, self.pg0.remote_ip4) - - reply = self.vapi.mpls_tunnel_add_del( - 0xffffffff, # don't know the if index yet - 1, # IPv4 next-hop - nh_addr, - self.pg0.sw_if_index, - 0, # next-hop-table-id - 1, # next-hop-weight - 2, # num-out-labels, - [44, 46]) - self.vapi.sw_interface_set_flags(reply.sw_if_index, admin_up_down=1) + mpls_tun = VppMPLSTunnelInterface(self, + [VppRoutePath(self.pg0.remote_ip4, + self.pg0.sw_if_index, + labels=[44, 46])]) + mpls_tun.add_vpp_config() + mpls_tun.admin_up() # # add an unlabelled route through the new tunnel # route_10_0_0_3 = VppIpRoute(self, "10.0.0.3", 32, [VppRoutePath("0.0.0.0", - reply.sw_if_index)]) + mpls_tun._sw_if_index)]) route_10_0_0_3.add_vpp_config() self.vapi.cli("clear trace") @@ -738,6 +748,229 @@ class TestMPLS(VppTestCase): route_35_eos.remove_vpp_config() route_34_eos.remove_vpp_config() + def test_interface_rx(self): + """ MPLS Interface Receive """ + + # + # Add a non-recursive route that will forward the traffic + # post-interface-rx + # + route_10_0_0_1 = VppIpRoute(self, "10.0.0.1", 32, + table_id=1, + paths=[VppRoutePath(self.pg1.remote_ip4, + self.pg1.sw_if_index)]) + route_10_0_0_1.add_vpp_config() + + # + # An interface receive label that maps traffic to RX on interface + # pg1 + # by injecting the packet in on pg0, which is in table 0 + # doing an interface-rx on pg1 and matching a route in table 1 + # if the packet egresses, then we must have swapped to pg1 + # so as to have matched the route in table 1 + # + route_34_eos = VppMplsRoute(self, 34, 1, + [VppRoutePath("0.0.0.0", + self.pg1.sw_if_index, + is_interface_rx=1)]) + route_34_eos.add_vpp_config() + + # + # ping an interface in the default table + # PG0 is in the default table + 
# + self.vapi.cli("clear trace") + tx = self.create_stream_labelled_ip4(self.pg0, [34], n=257, + dst_ip="10.0.0.1") + self.pg0.add_stream(tx) + + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx = self.pg1.get_capture(257) + self.verify_capture_ip4(self.pg1, rx, tx) + + def test_mcast_mid_point(self): + """ MPLS Multicast Mid Point """ + + # + # Add a non-recursive route that will forward the traffic + # post-interface-rx + # + route_10_0_0_1 = VppIpRoute(self, "10.0.0.1", 32, + table_id=1, + paths=[VppRoutePath(self.pg1.remote_ip4, + self.pg1.sw_if_index)]) + route_10_0_0_1.add_vpp_config() + + # + # Add a mcast entry that replicate to pg2 and pg3 + # and replicate to a interface-rx (like a bud node would) + # + route_3400_eos = VppMplsRoute(self, 3400, 1, + [VppRoutePath(self.pg2.remote_ip4, + self.pg2.sw_if_index, + labels=[3401]), + VppRoutePath(self.pg3.remote_ip4, + self.pg3.sw_if_index, + labels=[3402]), + VppRoutePath("0.0.0.0", + self.pg1.sw_if_index, + is_interface_rx=1)], + is_multicast=1) + route_3400_eos.add_vpp_config() + + # + # ping an interface in the default table + # PG0 is in the default table + # + self.vapi.cli("clear trace") + tx = self.create_stream_labelled_ip4(self.pg0, [3400], n=257, + dst_ip="10.0.0.1") + self.pg0.add_stream(tx) + + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx = self.pg1.get_capture(257) + self.verify_capture_ip4(self.pg1, rx, tx) + + rx = self.pg2.get_capture(257) + self.verify_capture_labelled(self.pg2, rx, tx, [3401]) + rx = self.pg3.get_capture(257) + self.verify_capture_labelled(self.pg3, rx, tx, [3402]) + + def test_mcast_head(self): + """ MPLS Multicast Head-end """ + + # + # Create a multicast tunnel with two replications + # + mpls_tun = VppMPLSTunnelInterface(self, + [VppRoutePath(self.pg2.remote_ip4, + self.pg2.sw_if_index, + labels=[42]), + VppRoutePath(self.pg3.remote_ip4, + self.pg3.sw_if_index, + labels=[43])], + is_multicast=1) + mpls_tun.add_vpp_config() + 
mpls_tun.admin_up() + + # + # add an unlabelled route through the new tunnel + # + route_10_0_0_3 = VppIpRoute(self, "10.0.0.3", 32, + [VppRoutePath("0.0.0.0", + mpls_tun._sw_if_index)]) + route_10_0_0_3.add_vpp_config() + + self.vapi.cli("clear trace") + tx = self.create_stream_ip4(self.pg0, "10.0.0.3") + self.pg0.add_stream(tx) + + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx = self.pg2.get_capture(257) + self.verify_capture_tunneled_ip4(self.pg0, rx, tx, [42]) + rx = self.pg3.get_capture(257) + self.verify_capture_tunneled_ip4(self.pg0, rx, tx, [43]) + + # + # An an IP multicast route via the tunnel + # A (*,G). + # one accepting interface, pg0, 1 forwarding interface via the tunnel + # + route_232_1_1_1 = VppIpMRoute( + self, + "0.0.0.0", + "232.1.1.1", 32, + MRouteEntryFlags.MFIB_ENTRY_FLAG_NONE, + [VppMRoutePath(self.pg0.sw_if_index, + MRouteItfFlags.MFIB_ITF_FLAG_ACCEPT), + VppMRoutePath(mpls_tun._sw_if_index, + MRouteItfFlags.MFIB_ITF_FLAG_FORWARD)]) + route_232_1_1_1.add_vpp_config() + + self.vapi.cli("clear trace") + tx = self.create_stream_ip4(self.pg0, "232.1.1.1") + self.pg0.add_stream(tx) + + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx = self.pg2.get_capture(257) + self.verify_capture_tunneled_ip4(self.pg0, rx, tx, [42]) + rx = self.pg3.get_capture(257) + self.verify_capture_tunneled_ip4(self.pg0, rx, tx, [43]) + + def test_mcast_tail(self): + """ MPLS Multicast Tail """ + + # + # Add a multicast route that will forward the traffic + # post-disposition + # + route_232_1_1_1 = VppIpMRoute( + self, + "0.0.0.0", + "232.1.1.1", 32, + MRouteEntryFlags.MFIB_ENTRY_FLAG_NONE, + table_id=1, + paths=[VppMRoutePath(self.pg1.sw_if_index, + MRouteItfFlags.MFIB_ITF_FLAG_FORWARD)]) + route_232_1_1_1.add_vpp_config() + + # + # An interface receive label that maps traffic to RX on interface + # pg1 + # by injecting the packet in on pg0, which is in table 0 + # doing an rpf-id and matching a route in table 1 + # if the 
packet egresses, then we must have matched the route in + # table 1 + # + route_34_eos = VppMplsRoute(self, 34, 1, + [VppRoutePath("0.0.0.0", + self.pg1.sw_if_index, + nh_table_id=1, + rpf_id=55)], + is_multicast=1) + + route_34_eos.add_vpp_config() + + # + # Drop due to interface lookup miss + # + self.vapi.cli("clear trace") + tx = self.create_stream_labelled_ip4(self.pg0, [34], + dst_ip="232.1.1.1", n=1) + self.send_and_assert_no_replies(self.pg0, tx, "RPF-ID drop none") + + # + # set the RPF-ID of the enrtry to match the input packet's + # + route_232_1_1_1.update_rpf_id(55) + + self.vapi.cli("clear trace") + tx = self.create_stream_labelled_ip4(self.pg0, [34], + dst_ip="232.1.1.1", n=257) + self.pg0.add_stream(tx) + + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + rx = self.pg1.get_capture(257) + self.verify_capture_ip4(self.pg1, rx, tx) + + # + # set the RPF-ID of the enrtry to not match the input packet's + # + route_232_1_1_1.update_rpf_id(56) + tx = self.create_stream_labelled_ip4(self.pg0, [34], + dst_ip="232.1.1.1") + self.send_and_assert_no_replies(self.pg0, tx, "RPF-ID drop 56") + class TestMPLSDisabled(VppTestCase): """ MPLS disabled """ diff --git a/test/vpp_ip_route.py b/test/vpp_ip_route.py index faf5f801..d6146f28 100644 --- a/test/vpp_ip_route.py +++ b/test/vpp_ip_route.py @@ -55,15 +55,24 @@ class VppRoutePath(object): nh_table_id=0, labels=[], nh_via_label=MPLS_LABEL_INVALID, - is_ip6=0): + is_ip6=0, + rpf_id=0, + is_interface_rx=0): self.nh_itf = nh_sw_if_index self.nh_table_id = nh_table_id self.nh_via_label = nh_via_label self.nh_labels = labels + self.weight = 1 + self.rpf_id = rpf_id if is_ip6: self.nh_addr = inet_pton(AF_INET6, nh_addr) else: self.nh_addr = inet_pton(AF_INET, nh_addr) + self.is_interface_rx = is_interface_rx + self.is_rpf_id = 0 + if rpf_id != 0: + self.is_rpf_id = 1 + self.nh_itf = rpf_id class VppMRoutePath(VppRoutePath): @@ -176,13 +185,15 @@ class VppIpMRoute(VppObject): """ def __init__(self, test, 
src_addr, grp_addr, - grp_addr_len, e_flags, paths, table_id=0, is_ip6=0): + grp_addr_len, e_flags, paths, table_id=0, + rpf_id=0, is_ip6=0): self._test = test self.paths = paths self.grp_addr_len = grp_addr_len self.table_id = table_id self.e_flags = e_flags self.is_ip6 = is_ip6 + self.rpf_id = rpf_id if is_ip6: self.grp_addr = inet_pton(AF_INET6, grp_addr) @@ -199,6 +210,7 @@ class VppIpMRoute(VppObject): self.e_flags, path.nh_itf, path.nh_i_flags, + rpf_id=self.rpf_id, table_id=self.table_id, is_ipv6=self.is_ip6) self._test.registry.register(self, self._test.logger) @@ -226,6 +238,18 @@ class VppIpMRoute(VppObject): table_id=self.table_id, is_ipv6=self.is_ip6) + def update_rpf_id(self, rpf_id): + self.rpf_id = rpf_id + self._test.vapi.ip_mroute_add_del(self.src_addr, + self.grp_addr, + self.grp_addr_len, + self.e_flags, + 0xffffffff, + 0, + rpf_id=self.rpf_id, + table_id=self.table_id, + is_ipv6=self.is_ip6) + def update_path_flags(self, itf, flags): for path in self.paths: if path.nh_itf == itf: @@ -342,14 +366,17 @@ class VppMplsRoute(VppObject): MPLS Route/LSP """ - def __init__(self, test, local_label, eos_bit, paths, table_id=0): + def __init__(self, test, local_label, eos_bit, paths, table_id=0, + is_multicast=0): self._test = test self.paths = paths self.local_label = local_label self.eos_bit = eos_bit self.table_id = table_id + self.is_multicast = is_multicast def add_vpp_config(self): + is_multipath = len(self.paths) > 1 for path in self.paths: self._test.vapi.mpls_route_add_del( self.local_label, @@ -357,7 +384,11 @@ class VppMplsRoute(VppObject): 1, path.nh_addr, path.nh_itf, + is_multicast=self.is_multicast, + is_multipath=is_multipath, table_id=self.table_id, + is_interface_rx=path.is_interface_rx, + is_rpf_id=path.is_rpf_id, next_hop_out_label_stack=path.nh_labels, next_hop_n_out_labels=len( path.nh_labels), @@ -372,6 +403,7 @@ class VppMplsRoute(VppObject): 1, path.nh_addr, path.nh_itf, + is_rpf_id=path.is_rpf_id, table_id=self.table_id, is_add=0) 
diff --git a/test/vpp_mpls_tunnel_interface.py b/test/vpp_mpls_tunnel_interface.py new file mode 100644 index 00000000..f2001574 --- /dev/null +++ b/test/vpp_mpls_tunnel_interface.py @@ -0,0 +1,46 @@ + +from vpp_interface import VppInterface +from vpp_ip_route import VppRoutePath +import socket + + +class VppMPLSTunnelInterface(VppInterface): + """ + VPP MPLS Tunnel interface + """ + + def __init__(self, test, paths, is_multicast=0): + """ Create MPLS Tunnel interface """ + self._sw_if_index = 0 + super(VppMPLSTunnelInterface, self).__init__(test) + self._test = test + self.t_paths = paths + self.is_multicast = is_multicast + + def add_vpp_config(self): + self._sw_if_index = 0xffffffff + for path in self.t_paths: + reply = self.test.vapi.mpls_tunnel_add_del( + self._sw_if_index, + 1, # IPv4 next-hop + path.nh_addr, + path.nh_itf, + path.nh_table_id, + path.weight, + next_hop_out_label_stack=path.nh_labels, + next_hop_n_out_labels=len(path.nh_labels), + is_multicast=self.is_multicast) + self._sw_if_index = reply.sw_if_index + + def remove_vpp_config(self): + for path in self.t_paths: + reply = self.test.vapi.mpls_tunnel_add_del( + self.sw_if_index, + 1, # IPv4 next-hop + path.nh_addr, + path.nh_itf, + path.nh_table_id, + path.weight, + next_hop_out_label_stack=path.nh_labels, + next_hop_n_out_labels=len(path.nh_labels), + is_add=0) diff --git a/test/vpp_papi_provider.py b/test/vpp_papi_provider.py index e8025dff..ceb684b7 100644 --- a/test/vpp_papi_provider.py +++ b/test/vpp_papi_provider.py @@ -849,6 +849,9 @@ class VppPapiProvider(object): create_vrf_if_needed=0, is_resolve_host=0, is_resolve_attached=0, + is_interface_rx=0, + is_rpf_id=0, + is_multicast=0, is_add=1, is_drop=0, is_multipath=0, @@ -872,6 +875,7 @@ class VppPapiProvider(object): :param is_local: (Default value = 0) :param is_classify: (Default value = 0) :param is_multipath: (Default value = 0) + :param is_multicast: (Default value = 0) :param is_resolve_host: (Default value = 0) :param 
is_resolve_attached: (Default value = 0) :param not_last: (Default value = 0) @@ -889,8 +893,11 @@ class VppPapiProvider(object): 'mr_is_add': is_add, 'mr_is_classify': is_classify, 'mr_is_multipath': is_multipath, + 'mr_is_multicast': is_multicast, 'mr_is_resolve_host': is_resolve_host, 'mr_is_resolve_attached': is_resolve_attached, + 'mr_is_interface_rx': is_interface_rx, + 'mr_is_rpf_id': is_rpf_id, 'mr_next_hop_proto_is_ip4': next_hop_proto_is_ip4, 'mr_next_hop_weight': next_hop_weight, 'mr_next_hop': next_hop_address, @@ -936,7 +943,8 @@ class VppPapiProvider(object): next_hop_via_label=MPLS_LABEL_INVALID, create_vrf_if_needed=0, is_add=1, - l2_only=0): + l2_only=0, + is_multicast=0): """ :param dst_address_length: @@ -956,8 +964,8 @@ class VppPapiProvider(object): :param is_multipath: (Default value = 0) :param is_resolve_host: (Default value = 0) :param is_resolve_attached: (Default value = 0) - :param not_last: (Default value = 0) :param next_hop_weight: (Default value = 1) + :param is_multicast: (Default value = 0) """ return self.api( @@ -965,6 +973,7 @@ class VppPapiProvider(object): {'mt_sw_if_index': tun_sw_if_index, 'mt_is_add': is_add, 'mt_l2_only': l2_only, + 'mt_is_multicast': is_multicast, 'mt_next_hop_proto_is_ip4': next_hop_proto_is_ip4, 'mt_next_hop_weight': next_hop_weight, 'mt_next_hop': next_hop_address, @@ -1469,6 +1478,7 @@ class VppPapiProvider(object): e_flags, next_hop_sw_if_index, i_flags, + rpf_id=0, table_id=0, create_vrf_if_needed=0, is_add=1, @@ -1481,6 +1491,8 @@ class VppPapiProvider(object): {'next_hop_sw_if_index': next_hop_sw_if_index, 'entry_flags': e_flags, 'itf_flags': i_flags, + 'table_id': table_id, + 'rpf_id': rpf_id, 'create_vrf_if_needed': create_vrf_if_needed, 'is_add': is_add, 'is_ipv6': is_ipv6, -- cgit 1.2.3-korg From 250b95b71babdfb558554c788a82cf45ccc34ab8 Mon Sep 17 00:00:00 2001 From: John Lo Date: Fri, 7 Apr 2017 23:32:32 -0400 Subject: Fix init of ENIC driver to allow receiving of jumbo packets Set 
rxmode.enable_scatter field in DPDK port config parameter so ENIC driver will use multiple mbuf's for receiving jumbo packets. Also remove ENIC driver check to disable setting ENIC MTU as this capability is now working with the new ENIC driver, subject to 9002B limit. Change-Id: I563976201c4968d4538c0759505cef2de876934a Signed-off-by: John Lo --- src/plugins/dpdk/device/init.c | 76 +++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 49 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 7eaf8da7..39d919e2 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -419,56 +419,35 @@ dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags) } else if (ETHERNET_INTERFACE_FLAG_CONFIG_MTU (flags)) { - /* - * DAW-FIXME: The Cisco VIC firmware does not provide an api for a - * driver to dynamically change the mtu. If/when the - * VIC firmware gets fixed, then this should be removed. - */ - if (xd->pmd == VNET_DPDK_PMD_ENIC) - { - struct rte_eth_dev_info dev_info; + int rv; - /* - * Restore mtu to what has been set by CIMC in the firmware cfg. 
- */ - rte_eth_dev_info_get (xd->device_index, &dev_info); - hi->max_packet_bytes = dev_info.max_rx_pktlen; + xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes; - vlib_cli_output (vlib_get_main (), - "Cisco VIC mtu can only be changed " - "using CIMC then rebooting the server!"); - } - else - { - int rv; + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + rte_eth_dev_stop (xd->device_index); - xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes; + rv = rte_eth_dev_configure + (xd->device_index, xd->rx_q_used, xd->tx_q_used, &xd->port_conf); - if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) - rte_eth_dev_stop (xd->device_index); + if (rv < 0) + vlib_cli_output (vlib_get_main (), + "rte_eth_dev_configure[%d]: err %d", + xd->device_index, rv); - rv = rte_eth_dev_configure - (xd->device_index, xd->rx_q_used, xd->tx_q_used, &xd->port_conf); + rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes); + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + { + int rv = rte_eth_dev_start (xd->device_index); + if (!rv && xd->default_mac_address) + rv = rte_eth_dev_default_mac_addr_set (xd->device_index, + (struct ether_addr *) + xd->default_mac_address); if (rv < 0) - vlib_cli_output (vlib_get_main (), - "rte_eth_dev_configure[%d]: err %d", - xd->device_index, rv); - - rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes); - - if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) - { - int rv = rte_eth_dev_start (xd->device_index); - if (!rv && xd->default_mac_address) - rv = rte_eth_dev_default_mac_addr_set (xd->device_index, - (struct ether_addr *) - xd->default_mac_address); - if (rv < 0) - clib_warning ("rte_eth_dev_start %d returned %d", - xd->device_index, rv); - } + clib_warning ("rte_eth_dev_start %d returned %d", + xd->device_index, rv); } + } return old; } @@ -655,11 +634,13 @@ dpdk_lib_init (dpdk_main_t * dm) { xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS; port_conf_template.rxmode.jumbo_frame = 0; + port_conf_template.rxmode.enable_scatter = 0; } else { 
xd->tx_conf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS; port_conf_template.rxmode.jumbo_frame = 1; + port_conf_template.rxmode.enable_scatter = 1; xd->flags |= DPDK_DEVICE_FLAG_MAYBE_MULTISEG; } @@ -1068,16 +1049,13 @@ dpdk_lib_init (dpdk_main_t * dm) hi = vnet_get_hw_interface (dm->vnet_main, xd->vlib_hw_if_index); /* - * DAW-FIXME: The Cisco VIC firmware does not provide an api for a - * driver to dynamically change the mtu. If/when the - * VIC firmware gets fixed, then this should be removed. + * For cisco VIC vNIC, set default to VLAN strip enabled, unless + * specified otherwise in the startup config. + * For other NICs default to VLAN strip disabled, unless specified + * otherwis in the startup config. */ if (xd->pmd == VNET_DPDK_PMD_ENIC) { - /* - * Initialize mtu to what has been set by CIMC in the firmware cfg. - */ - hi->max_packet_bytes = dev_info.max_rx_pktlen; if (devconf->vlan_strip_offload != DPDK_DEVICE_VLAN_STRIP_OFF) vlan_strip = 1; /* remove vlan tag from VIC port by default */ else -- cgit 1.2.3-korg From 3a7956383420a1d2f5f28b5bd3d3b3f5dda0420d Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 5 Apr 2017 12:28:38 +0200 Subject: dpdk: use common interface placement infra This patch deprecates "show dpdk placement" and "set dpdk placement" CLI commands.
Change-Id: I4e052ec3e8b8e6c54b4816e1e689e5b7a24892db Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/cli.c | 184 +-------------------------------------- src/plugins/dpdk/device/device.c | 4 +- src/plugins/dpdk/device/dpdk.h | 7 +- src/plugins/dpdk/device/format.c | 2 +- src/plugins/dpdk/device/init.c | 159 +++++++++++---------------------- src/plugins/dpdk/device/node.c | 9 +- 6 files changed, 64 insertions(+), 301 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c index d2def2fc..17d1bfe1 100644 --- a/src/plugins/dpdk/device/cli.c +++ b/src/plugins/dpdk/device/cli.c @@ -563,61 +563,6 @@ VLIB_CLI_COMMAND (cmd_set_dpdk_if_desc,static) = { }; /* *INDENT-ON* */ -static clib_error_t * -show_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input, - vlib_cli_command_t * cmd) -{ - vlib_thread_main_t *tm = vlib_get_thread_main (); - dpdk_main_t *dm = &dpdk_main; - dpdk_device_and_queue_t *dq; - int cpu; - - if (tm->n_vlib_mains == 1) - vlib_cli_output (vm, "All interfaces are handled by main thread"); - - for (cpu = 0; cpu < vec_len (dm->devices_by_cpu); cpu++) - { - if (cpu >= dm->input_cpu_first_index && - cpu < (dm->input_cpu_first_index + dm->input_cpu_count)) - vlib_cli_output (vm, "Thread %u (%s at lcore %u):", cpu, - vlib_worker_threads[cpu].name, - vlib_worker_threads[cpu].lcore_id); - - /* *INDENT-OFF* */ - vec_foreach(dq, dm->devices_by_cpu[cpu]) - { - u32 hw_if_index = dm->devices[dq->device].vlib_hw_if_index; - vnet_hw_interface_t * hi = vnet_get_hw_interface(dm->vnet_main, hw_if_index); - vlib_cli_output(vm, " %v queue %u", hi->name, dq->queue_id); - } - /* *INDENT-ON* */ - } - return 0; -} - -/*? - * This command is used to display the thread and core each - * DPDK interface and queue is assigned too. 
- * - * @cliexpar - * Example of how to display the DPDK interface placement: - * @cliexstart{show dpdk interface placement} - * Thread 1 (vpp_wk_0 at lcore 1): - * GigabitEthernet0/8/0 queue 0 - * GigabitEthernet0/9/0 queue 0 - * Thread 2 (vpp_wk_1 at lcore 2): - * GigabitEthernet0/8/0 queue 1 - * GigabitEthernet0/9/0 queue 1 - * @cliexend -?*/ -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (cmd_show_dpdk_if_placement,static) = { - .path = "show dpdk interface placement", - .short_help = "show dpdk interface placement", - .function = show_dpdk_if_placement, -}; -/* *INDENT-ON* */ - static int dpdk_device_queue_sort (void *a1, void *a2) { @@ -636,131 +581,6 @@ dpdk_device_queue_sort (void *a1, void *a2) return 0; } -static clib_error_t * -set_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input, - vlib_cli_command_t * cmd) -{ - unformat_input_t _line_input, *line_input = &_line_input; - dpdk_main_t *dm = &dpdk_main; - dpdk_device_and_queue_t *dq; - vnet_hw_interface_t *hw; - dpdk_device_t *xd; - u32 hw_if_index = (u32) ~ 0; - u32 queue = (u32) 0; - u32 cpu = (u32) ~ 0; - int i; - clib_error_t *error = NULL; - - if (!unformat_user (input, unformat_line_input, line_input)) - return 0; - - while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) - { - if (unformat - (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, - &hw_if_index)) - ; - else if (unformat (line_input, "queue %d", &queue)) - ; - else if (unformat (line_input, "thread %d", &cpu)) - ; - else - { - error = clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); - goto done; - } - } - - if (hw_if_index == (u32) ~ 0) - { - error = clib_error_return (0, "please specify valid interface name"); - goto done; - } - - if (cpu < dm->input_cpu_first_index || - cpu >= (dm->input_cpu_first_index + dm->input_cpu_count)) - { - error = clib_error_return (0, "please specify valid thread id"); - goto done; - } - - hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); - xd 
= vec_elt_at_index (dm->devices, hw->dev_instance); - - for (i = 0; i < vec_len (dm->devices_by_cpu); i++) - { - /* *INDENT-OFF* */ - vec_foreach(dq, dm->devices_by_cpu[i]) - { - if (hw_if_index == dm->devices[dq->device].vlib_hw_if_index && - queue == dq->queue_id) - { - if (cpu == i) /* nothing to do */ - goto done; - - vec_del1(dm->devices_by_cpu[i], dq - dm->devices_by_cpu[i]); - vec_add2(dm->devices_by_cpu[cpu], dq, 1); - dq->queue_id = queue; - dq->device = xd->device_index; - xd->cpu_socket_id_by_queue[queue] = - rte_lcore_to_socket_id(vlib_worker_threads[cpu].lcore_id); - - vec_sort_with_function(dm->devices_by_cpu[i], - dpdk_device_queue_sort); - - vec_sort_with_function(dm->devices_by_cpu[cpu], - dpdk_device_queue_sort); - - if (vec_len(dm->devices_by_cpu[i]) == 0) - vlib_node_set_state (vlib_mains[i], dpdk_input_node.index, - VLIB_NODE_STATE_DISABLED); - - if (vec_len(dm->devices_by_cpu[cpu]) == 1) - vlib_node_set_state (vlib_mains[cpu], dpdk_input_node.index, - VLIB_NODE_STATE_POLLING); - - goto done; - } - } - /* *INDENT-ON* */ - } - - error = clib_error_return (0, "not found"); - -done: - unformat_free (line_input); - - return error; -} - -/*? - * This command is used to assign a given interface, and optionally a - * given queue, to a different thread. This will not create a thread, - * so the thread must already exist. Use '/etc/vpp/startup.conf' - * for the initial thread creation. If the 'queue' is not provided, - * it defaults to 0. 
- * - * @cliexpar - * Example of how to display the DPDK interface placement: - * @cliexstart{show dpdk interface placement} - * Thread 1 (vpp_wk_0 at lcore 1): - * GigabitEthernet0/8/0 queue 0 - * GigabitEthernet0/9/0 queue 0 - * Thread 2 (vpp_wk_1 at lcore 2): - * GigabitEthernet0/8/0 queue 1 - * GigabitEthernet0/9/0 queue 1 - * @cliexend - * Example of how to assign a DPDK interface and queue to a thread: - * @cliexcmd{set dpdk interface placement GigabitEthernet0/8/0 queue 1 thread 1} -?*/ -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (cmd_set_dpdk_if_placement,static) = { - .path = "set dpdk interface placement", - .short_help = "set dpdk interface placement [queue ] thread ", - .function = set_dpdk_if_placement, -}; -/* *INDENT-ON* */ static clib_error_t * show_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input, @@ -784,7 +604,7 @@ show_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input, vec_foreach (dq, dm->devices_by_hqos_cpu[cpu]) { - u32 hw_if_index = dm->devices[dq->device].vlib_hw_if_index; + u32 hw_if_index = dm->devices[dq->device].hw_if_index; vnet_hw_interface_t *hi = vnet_get_hw_interface (dm->vnet_main, hw_if_index); vlib_cli_output (vm, " %v queue %u", hi->name, dq->queue_id); @@ -864,7 +684,7 @@ set_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input, { vec_foreach (dq, dm->devices_by_hqos_cpu[i]) { - if (hw_if_index == dm->devices[dq->device].vlib_hw_if_index) + if (hw_if_index == dm->devices[dq->device].hw_if_index) { if (cpu == i) /* nothing to do */ goto done; diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 91661246..e84d524f 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -301,7 +301,7 @@ static_always_inline u32 node_index; node_index = vec_elt_at_index (im->hw_interfaces, - xd->vlib_hw_if_index)->tx_node_index; + xd->hw_if_index)->tx_node_index; vlib_error_count (vm, node_index, DPDK_TX_FUNC_ERROR_BAD_RETVAL, 1); 
clib_warning ("rte_eth_tx_burst[%d]: error %d", xd->device_index, @@ -658,7 +658,7 @@ dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) xd->flags &= ~DPDK_DEVICE_FLAG_ADMIN_UP; rte_eth_allmulticast_disable (xd->device_index); - vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0); + vnet_hw_interface_set_flags (vnm, xd->hw_if_index, 0); rte_eth_dev_stop (xd->device_index); /* For bonded interface, stop slave links */ diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 06d89adb..9afd0b35 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -175,7 +175,7 @@ typedef struct /* Instance ID */ u32 device_index; - u32 vlib_hw_if_index; + u32 hw_if_index; u32 vlib_sw_if_index; /* next node index if we decide to steal the rx graph arc */ @@ -353,7 +353,6 @@ typedef struct /* Devices */ dpdk_device_t *devices; - dpdk_device_and_queue_t **devices_by_cpu; dpdk_device_and_queue_t **devices_by_hqos_cpu; /* per-thread recycle lists */ @@ -392,10 +391,6 @@ typedef struct u8 use_rss; - /* which cpus are running dpdk-input */ - int input_cpu_first_index; - int input_cpu_count; - /* which cpus are running I/O TX */ int hqos_cpu_first_index; int hqos_cpu_count; diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c index a09a3f83..f1cca3f7 100644 --- a/src/plugins/dpdk/device/format.c +++ b/src/plugins/dpdk/device/format.c @@ -295,7 +295,7 @@ format_dpdk_link_status (u8 * s, va_list * args) dpdk_device_t *xd = va_arg (*args, dpdk_device_t *); struct rte_eth_link *l = &xd->link; vnet_main_t *vnm = vnet_get_main (); - vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, xd->vlib_hw_if_index); + vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, xd->hw_if_index); s = format (s, "%s ", l->link_status ? 
"up" : "down"); if (l->link_status) diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 39d919e2..d3763e0b 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -328,7 +328,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) { - vnet_hw_interface_set_flags (dm->vnet_main, xd->vlib_hw_if_index, 0); + vnet_hw_interface_set_flags (dm->vnet_main, xd->hw_if_index, 0); rte_eth_dev_stop (xd->device_index); } @@ -359,20 +359,20 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) for (j = 0; j < xd->rx_q_used; j++) { + uword tidx = vnet_get_device_input_thread_index (dm->vnet_main, + xd->hw_if_index, j); + unsigned lcore = vlib_worker_threads[tidx].lcore_id; + u16 socket_id = rte_lcore_to_socket_id (lcore); rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, xd->cpu_socket, 0, - dm-> - pktmbuf_pools[xd->cpu_socket_id_by_queue - [j]]); + dm->pktmbuf_pools[socket_id]); /* retry with any other CPU socket */ if (rv < 0) rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, SOCKET_ID_ANY, 0, - dm-> - pktmbuf_pools[xd->cpu_socket_id_by_queue - [j]]); + dm->pktmbuf_pools[socket_id]); if (rv < 0) return clib_error_return (0, "rte_eth_rx_queue_setup[%d]: err %d", xd->device_index, rv); @@ -485,35 +485,20 @@ dpdk_lib_init (dpdk_main_t * dm) clib_error_t *error; vlib_main_t *vm = vlib_get_main (); vlib_thread_main_t *tm = vlib_get_thread_main (); + vnet_device_main_t *vdm = &vnet_device_main; vnet_sw_interface_t *sw; vnet_hw_interface_t *hi; dpdk_device_t *xd; vlib_pci_addr_t last_pci_addr; u32 last_pci_addr_port = 0; - vlib_thread_registration_t *tr, *tr_hqos; - uword *p, *p_hqos; + vlib_thread_registration_t *tr_hqos; + uword *p_hqos; - u32 next_cpu = 0, next_hqos_cpu = 0; + u32 next_hqos_cpu = 0; u8 af_packet_port_id = 0; u8 bond_ether_port_id = 0; last_pci_addr.as_u32 = ~0; - dm->input_cpu_first_index = 0; - dm->input_cpu_count = 1; - - /* 
find out which cpus will be used for input */ - p = hash_get_mem (tm->thread_registrations_by_name, "workers"); - tr = p ? (vlib_thread_registration_t *) p[0] : 0; - - if (tr && tr->count > 0) - { - dm->input_cpu_first_index = tr->first_index; - dm->input_cpu_count = tr->count; - } - - vec_validate_aligned (dm->devices_by_cpu, tm->n_vlib_mains - 1, - CLIB_CACHE_LINE_BYTES); - dm->hqos_cpu_first_index = 0; dm->hqos_cpu_count = 0; @@ -924,48 +909,6 @@ dpdk_lib_init (dpdk_main_t * dm) dpdk_device_and_queue_t *dq; int q; - if (devconf->workers) - { - int i; - q = 0; - /* *INDENT-OFF* */ - clib_bitmap_foreach (i, devconf->workers, ({ - int cpu = dm->input_cpu_first_index + i; - unsigned lcore = vlib_worker_threads[cpu].lcore_id; - vec_validate(xd->cpu_socket_id_by_queue, q); - xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id(lcore); - vec_add2(dm->devices_by_cpu[cpu], dq, 1); - dq->device = xd->device_index; - dq->queue_id = q++; - })); - /* *INDENT-ON* */ - } - else - for (q = 0; q < xd->rx_q_used; q++) - { - int cpu = dm->input_cpu_first_index + next_cpu; - unsigned lcore = vlib_worker_threads[cpu].lcore_id; - - /* - * numa node for worker thread handling this queue - * needed for taking buffers from the right mempool - */ - vec_validate (xd->cpu_socket_id_by_queue, q); - xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id (lcore); - - /* - * construct vector of (device,queue) pairs for each worker thread - */ - vec_add2 (dm->devices_by_cpu[cpu], dq, 1); - dq->device = xd->device_index; - dq->queue_id = q; - - next_cpu++; - if (next_cpu == dm->input_cpu_count) - next_cpu = 0; - } - - if (devconf->hqos_enabled) { xd->flags |= DPDK_DEVICE_FLAG_HQOS; @@ -1022,17 +965,6 @@ dpdk_lib_init (dpdk_main_t * dm) vec_validate_aligned (xd->d_trace_buffers, tm->n_vlib_mains, CLIB_CACHE_LINE_BYTES); - rv = dpdk_port_setup (dm, xd); - - if (rv) - return rv; - - if (devconf->hqos_enabled) - { - rv = dpdk_port_setup_hqos (xd, &devconf->hqos); - if (rv) - return rv; - } /* 
count the number of descriptors used for this device */ nb_desc += xd->nb_rx_desc + xd->nb_tx_desc * xd->tx_q_used; @@ -1040,13 +972,46 @@ dpdk_lib_init (dpdk_main_t * dm) error = ethernet_register_interface (dm->vnet_main, dpdk_device_class.index, xd->device_index, /* ethernet address */ addr, - &xd->vlib_hw_if_index, dpdk_flag_change); + &xd->hw_if_index, dpdk_flag_change); if (error) return error; - sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index); + sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->hw_if_index); xd->vlib_sw_if_index = sw->sw_if_index; - hi = vnet_get_hw_interface (dm->vnet_main, xd->vlib_hw_if_index); + vnet_set_device_input_node (dm->vnet_main, xd->hw_if_index, + dpdk_input_node.index); + + if (devconf->workers) + { + int i; + q = 0; + /* *INDENT-OFF* */ + clib_bitmap_foreach (i, devconf->workers, ({ + vnet_device_input_assign_thread (dm->vnet_main, xd->hw_if_index, q++, + vdm->first_worker_thread_index + i); + })); + /* *INDENT-ON* */ + } + else + for (q = 0; q < xd->rx_q_used; q++) + { + vnet_device_input_assign_thread (dm->vnet_main, xd->hw_if_index, q, /* any */ + ~1); + } + + hi = vnet_get_hw_interface (dm->vnet_main, xd->hw_if_index); + + rv = dpdk_port_setup (dm, xd); + + if (rv) + return rv; + + if (devconf->hqos_enabled) + { + rv = dpdk_port_setup_hqos (xd, &devconf->hqos); + if (rv) + return rv; + } /* * For cisco VIC vNIC, set default to VLAN strip enabled, unless @@ -1723,13 +1688,13 @@ dpdk_update_link_state (dpdk_device_t * xd, f64 now) ed->sw_if_index = xd->vlib_sw_if_index; ed->admin_up = (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0; ed->old_link_state = (u8) - vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index); + vnet_hw_interface_is_link_up (vnm, xd->hw_if_index); ed->new_link_state = (u8) xd->link.link_status; } if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) && ((xd->link.link_status != 0) ^ - vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index))) + vnet_hw_interface_is_link_up (vnm, 
xd->hw_if_index))) { hw_flags_chg = 1; hw_flags |= (xd->link.link_status ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0); @@ -1798,7 +1763,7 @@ dpdk_update_link_state (dpdk_device_t * xd, f64 now) ed->sw_if_index = xd->vlib_sw_if_index; ed->flags = hw_flags; } - vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, hw_flags); + vnet_hw_interface_set_flags (vnm, xd->hw_if_index, hw_flags); } } @@ -1815,23 +1780,6 @@ dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) error = dpdk_lib_init (dm); - /* - * Turn on the input node if we found some devices to drive - * and we're not running worker threads or i/o threads - */ - - if (error == 0 && vec_len (dm->devices) > 0) - { - if (tm->n_vlib_mains == 1) - vlib_node_set_state (vm, dpdk_input_node.index, - VLIB_NODE_STATE_POLLING); - else - for (i = 0; i < tm->n_vlib_mains; i++) - if (vec_len (dm->devices_by_cpu[i]) > 0) - vlib_node_set_state (vlib_mains[i], dpdk_input_node.index, - VLIB_NODE_STATE_POLLING); - } - if (error) clib_error_report (error); @@ -1881,7 +1829,7 @@ dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) /* Populate MAC of bonded interface in VPP hw tables */ bhi = vnet_get_hw_interface - (vnm, dm->devices[i].vlib_hw_if_index); + (vnm, dm->devices[i].hw_if_index); bei = pool_elt_at_index (em->interfaces, bhi->hw_instance); clib_memcpy (bhi->hw_address, addr, 6); @@ -1910,10 +1858,9 @@ dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) } /* Set slaves bitmap for bonded interface */ bhi->bond_info = clib_bitmap_set - (bhi->bond_info, sdev->vlib_hw_if_index, 1); + (bhi->bond_info, sdev->hw_if_index, 1); /* Set slave link flags on slave interface */ - shi = vnet_get_hw_interface - (vnm, sdev->vlib_hw_if_index); + shi = vnet_get_hw_interface (vnm, sdev->hw_if_index); ssi = vnet_get_sw_interface (vnm, sdev->vlib_sw_if_index); sei = pool_elt_at_index diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index 
0549ba5d..5cc611cd 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -631,16 +631,17 @@ dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) dpdk_main_t *dm = &dpdk_main; dpdk_device_t *xd; uword n_rx_packets = 0; - dpdk_device_and_queue_t *dq; - u32 thread_index = vlib_get_thread_index (); + vnet_device_input_runtime_t *rt = (void *) node->runtime_data; + vnet_device_and_queue_t *dq; + u32 thread_index = node->thread_index; /* * Poll all devices on this cpu for input/interrupts. */ /* *INDENT-OFF* */ - vec_foreach (dq, dm->devices_by_cpu[thread_index]) + foreach_device_and_queue (dq, rt->devices_and_queues) { - xd = vec_elt_at_index(dm->devices, dq->device); + xd = vec_elt_at_index(dm->devices, dq->dev_instance); if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG) n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 1); else -- cgit 1.2.3-korg From bc66a9122f73b97ca1ae60f1df47b39c141be3ae Mon Sep 17 00:00:00 2001 From: Steve Shin Date: Sat, 22 Apr 2017 06:58:23 -0700 Subject: Fix multicast enable on the bonded interface Multicast enable flag should be set when the bonded interface is up. This flag allows multicast packets to be processed from the slave devices of the bonded interface. Also promiscuous mode for all-multicast should be correctly displayed as part of 'show hardware detail' output. 
Change-Id: Ief0157c4c030a28afb9c45ebf3d6a12710083724 Signed-off-by: Steve Shin --- src/plugins/dpdk/device/device.c | 12 ++++++++++++ src/plugins/dpdk/device/format.c | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index e84d524f..48f3237b 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -649,6 +649,18 @@ dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) rte_eth_promiscuous_disable (xd->device_index); rte_eth_allmulticast_enable (xd->device_index); + + if (xd->pmd == VNET_DPDK_PMD_BOND) + { + u8 slink[16]; + int nlink = rte_eth_bond_slaves_get (xd->device_index, slink, 16); + while (nlink >= 1) + { + u8 dpdk_port = slink[--nlink]; + rte_eth_allmulticast_enable (dpdk_port); + } + } + xd->flags |= DPDK_DEVICE_FLAG_ADMIN_UP; dpdk_update_counters (xd, now); dpdk_update_link_state (xd, now); diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c index f1cca3f7..93eca524 100644 --- a/src/plugins/dpdk/device/format.c +++ b/src/plugins/dpdk/device/format.c @@ -417,7 +417,7 @@ format_dpdk_device (u8 * s, va_list * args) format (s, "%Upromiscuous: unicast %s all-multicast %s\n", format_white_space, indent + 2, rte_eth_promiscuous_get (xd->device_index) ? "on" : "off", - rte_eth_promiscuous_get (xd->device_index) ? "on" : "off"); + rte_eth_allmulticast_get (xd->device_index) ? 
"on" : "off"); vlan_off = rte_eth_dev_get_vlan_offload (xd->device_index); s = format (s, "%Uvlan offload: strip %s filter %s qinq %s\n", format_white_space, indent + 2, -- cgit 1.2.3-korg From c903793662e16309a67161a58500f6a1a15d37f6 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 27 Apr 2017 21:12:17 +0200 Subject: dpdk: cleanup, move APIs to separate .c file Change-Id: Id632ff1b30be808d9f270e2f77260391569fbda2 Signed-off-by: Damjan Marion --- src/plugins/dpdk.am | 1 + src/plugins/dpdk/api/dpdk_api.c | 333 ++++++++++++++++++++++++++++++++++++++++ src/plugins/dpdk/device/dpdk.h | 12 -- src/plugins/dpdk/device/init.c | 316 +------------------------------------- src/plugins/dpdk/device/node.c | 2 +- 5 files changed, 337 insertions(+), 327 deletions(-) create mode 100755 src/plugins/dpdk/api/dpdk_api.c (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk.am b/src/plugins/dpdk.am index a233be7a..bb46ae6e 100644 --- a/src/plugins/dpdk.am +++ b/src/plugins/dpdk.am @@ -28,6 +28,7 @@ dpdk_plugin_la_SOURCES = \ dpdk/main.c \ dpdk/buffer.c \ dpdk/thread.c \ + dpdk/api/dpdk_api.c \ dpdk/device/cli.c \ dpdk/device/dpdk_priv.h \ dpdk/device/device.c \ diff --git a/src/plugins/dpdk/api/dpdk_api.c b/src/plugins/dpdk/api/dpdk_api.c new file mode 100755 index 00000000..08afdd70 --- /dev/null +++ b/src/plugins/dpdk/api/dpdk_api.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +/* define message IDs */ +#include + +#define vl_typedefs /* define message structures */ +#include +#undef vl_typedefs + +#define vl_endianfun /* define message structures */ +#include +#undef vl_endianfun + +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) + +/* Get the API version number. */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include +#undef vl_api_version + +/* Macro to finish up custom dump fns */ +#define FINISH \ + vec_add1 (s, 0); \ + vl_print (handle, (char *)s); \ + vec_free (s); \ + return handle; + +#include + +static void + vl_api_sw_interface_set_dpdk_hqos_pipe_t_handler + (vl_api_sw_interface_set_dpdk_hqos_pipe_t * mp) +{ + vl_api_sw_interface_set_dpdk_hqos_pipe_reply_t *rmp; + int rv = 0; + + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd; + + u32 sw_if_index = ntohl (mp->sw_if_index); + u32 subport = ntohl (mp->subport); + u32 pipe = ntohl (mp->pipe); + u32 profile = ntohl (mp->profile); + vnet_hw_interface_t *hw; + + VALIDATE_SW_IF_INDEX (mp); + + /* hw_if & dpdk device */ + hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); + + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rv = rte_sched_pipe_config (xd->hqos_ht->hqos, subport, pipe, profile); + + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_PIPE_REPLY); +} + +static void *vl_api_sw_interface_set_dpdk_hqos_pipe_t_print + (vl_api_sw_interface_set_dpdk_hqos_pipe_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_pipe "); + + s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); + + s = format (s, "subport %u pipe %u profile %u ", + ntohl (mp->subport), ntohl (mp->pipe), ntohl (mp->profile)); + + FINISH; +} + +static void + 
vl_api_sw_interface_set_dpdk_hqos_subport_t_handler + (vl_api_sw_interface_set_dpdk_hqos_subport_t * mp) +{ + vl_api_sw_interface_set_dpdk_hqos_subport_reply_t *rmp; + int rv = 0; + + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd; + struct rte_sched_subport_params p; + + u32 sw_if_index = ntohl (mp->sw_if_index); + u32 subport = ntohl (mp->subport); + p.tb_rate = ntohl (mp->tb_rate); + p.tb_size = ntohl (mp->tb_size); + p.tc_rate[0] = ntohl (mp->tc_rate[0]); + p.tc_rate[1] = ntohl (mp->tc_rate[1]); + p.tc_rate[2] = ntohl (mp->tc_rate[2]); + p.tc_rate[3] = ntohl (mp->tc_rate[3]); + p.tc_period = ntohl (mp->tc_period); + + vnet_hw_interface_t *hw; + + VALIDATE_SW_IF_INDEX (mp); + + /* hw_if & dpdk device */ + hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); + + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rv = rte_sched_subport_config (xd->hqos_ht->hqos, subport, &p); + + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_SUBPORT_REPLY); +} + +static void *vl_api_sw_interface_set_dpdk_hqos_subport_t_print + (vl_api_sw_interface_set_dpdk_hqos_subport_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_subport "); + + s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); + + s = + format (s, + "subport %u rate %u bkt_size %u tc0 %u tc1 %u tc2 %u tc3 %u period %u", + ntohl (mp->subport), ntohl (mp->tb_rate), ntohl (mp->tb_size), + ntohl (mp->tc_rate[0]), ntohl (mp->tc_rate[1]), + ntohl (mp->tc_rate[2]), ntohl (mp->tc_rate[3]), + ntohl (mp->tc_period)); + + FINISH; +} + +static void + vl_api_sw_interface_set_dpdk_hqos_tctbl_t_handler + (vl_api_sw_interface_set_dpdk_hqos_tctbl_t * mp) +{ + vl_api_sw_interface_set_dpdk_hqos_tctbl_reply_t *rmp; + int rv = 0; + + dpdk_main_t *dm = &dpdk_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + dpdk_device_t *xd; + + u32 sw_if_index = ntohl (mp->sw_if_index); + u32 entry = ntohl (mp->entry); + u32 tc = ntohl (mp->tc); + u32 queue 
= ntohl (mp->queue); + u32 val, i; + + vnet_hw_interface_t *hw; + + VALIDATE_SW_IF_INDEX (mp); + + /* hw_if & dpdk device */ + hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); + + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + if (tc >= RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE) + { + clib_warning ("invalid traffic class !!"); + rv = VNET_API_ERROR_INVALID_VALUE; + goto done; + } + if (queue >= RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS) + { + clib_warning ("invalid queue !!"); + rv = VNET_API_ERROR_INVALID_VALUE; + goto done; + } + + /* Detect the set of worker threads */ + uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + + if (p == 0) + { + clib_warning ("worker thread registration AWOL !!"); + rv = VNET_API_ERROR_INVALID_VALUE_2; + goto done; + } + + vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0]; + int worker_thread_first = tr->first_index; + int worker_thread_count = tr->count; + + val = tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue; + for (i = 0; i < worker_thread_count; i++) + xd->hqos_wt[worker_thread_first + i].hqos_tc_table[entry] = val; + + BAD_SW_IF_INDEX_LABEL; +done: + + REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_TCTBL_REPLY); +} + +static void *vl_api_sw_interface_set_dpdk_hqos_tctbl_t_print + (vl_api_sw_interface_set_dpdk_hqos_tctbl_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_tctbl "); + + s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); + + s = format (s, "entry %u tc %u queue %u", + ntohl (mp->entry), ntohl (mp->tc), ntohl (mp->queue)); + + FINISH; +} + +#define foreach_dpdk_plugin_api_msg \ +_(SW_INTERFACE_SET_DPDK_HQOS_PIPE, sw_interface_set_dpdk_hqos_pipe) \ +_(SW_INTERFACE_SET_DPDK_HQOS_SUBPORT, sw_interface_set_dpdk_hqos_subport) \ +_(SW_INTERFACE_SET_DPDK_HQOS_TCTBL, sw_interface_set_dpdk_hqos_tctbl) + +/* Set up the API message handling tables */ +static clib_error_t * +dpdk_plugin_api_hookup (vlib_main_t * vm) +{ + 
dpdk_main_t *dm __attribute__ ((unused)) = &dpdk_main; +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + dm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_dpdk_plugin_api_msg; +#undef _ + return 0; +} + +#define vl_msg_name_crc_list +#include +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (dpdk_main_t * dm, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + dm->msg_id_base); + foreach_vl_msg_name_crc_dpdk; +#undef _ +} + +// TODO +/* +static void plugin_custom_dump_configure (dpdk_main_t * dm) +{ +#define _(n,f) dm->api_main->msg_print_handlers \ + [VL_API_##n + dm->msg_id_base] \ + = (void *) vl_api_##f##_t_print; + foreach_dpdk_plugin_api_msg; +#undef _ +} +*/ +/* force linker to link functions used by vlib and declared weak */ + +static clib_error_t * +dpdk_api_init (vlib_main_t * vm) +{ + dpdk_main_t *dm = &dpdk_main; + clib_error_t *error = 0; + + /* init CLI */ + if ((error = vlib_call_init_function (vm, dpdk_init))) + return error; + + u8 *name; + name = format (0, "dpdk_%08x%c", api_version, 0); + + /* Ask for a correctly-sized block of API message decode slots */ + dm->msg_id_base = vl_msg_api_get_msg_ids + ((char *) name, VL_MSG_FIRST_AVAILABLE); + vec_free (name); + + error = dpdk_plugin_api_hookup (vm); + + /* Add our API messages to the global name_crc hash table */ + setup_message_id_table (dm, &api_main); + +// TODO +// plugin_custom_dump_configure (dm); + + return error; +} + +VLIB_INIT_FUNCTION (dpdk_api_init); + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 9afd0b35..90e93e75 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -469,19 +469,7 @@ typedef enum DPDK_N_ERROR, 
} dpdk_error_t; -int dpdk_set_stat_poll_interval (f64 interval); -int dpdk_set_link_state_poll_interval (f64 interval); void dpdk_update_link_state (dpdk_device_t * xd, f64 now); -void dpdk_device_lock_init (dpdk_device_t * xd); -void dpdk_device_lock_free (dpdk_device_t * xd); - -void dpdk_rx_trace (dpdk_main_t * dm, - vlib_node_runtime_t * node, - dpdk_device_t * xd, - u16 queue_id, u32 * buffers, uword n_buffers); - -#define EFD_OPERATION_LESS_THAN 0 -#define EFD_OPERATION_GREATER_OR_EQUAL 1 format_function_t format_dpdk_device_name; format_function_t format_dpdk_device; diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index d3763e0b..33ecde44 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -35,269 +35,6 @@ dpdk_main_t dpdk_main; -#include -#include - -/* define message IDs */ -#include - -#define vl_typedefs /* define message structures */ -#include -#undef vl_typedefs - -#define vl_endianfun /* define message structures */ -#include -#undef vl_endianfun - -#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) - -/* Get the API version number. 
*/ -#define vl_api_version(n,v) static u32 api_version=(v); -#include -#undef vl_api_version - -/* Macro to finish up custom dump fns */ -#define FINISH \ - vec_add1 (s, 0); \ - vl_print (handle, (char *)s); \ - vec_free (s); \ - return handle; - -#include - -static void - vl_api_sw_interface_set_dpdk_hqos_pipe_t_handler - (vl_api_sw_interface_set_dpdk_hqos_pipe_t * mp) -{ - vl_api_sw_interface_set_dpdk_hqos_pipe_reply_t *rmp; - int rv = 0; - - dpdk_main_t *dm = &dpdk_main; - dpdk_device_t *xd; - - u32 sw_if_index = ntohl (mp->sw_if_index); - u32 subport = ntohl (mp->subport); - u32 pipe = ntohl (mp->pipe); - u32 profile = ntohl (mp->profile); - vnet_hw_interface_t *hw; - - VALIDATE_SW_IF_INDEX (mp); - - /* hw_if & dpdk device */ - hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); - - xd = vec_elt_at_index (dm->devices, hw->dev_instance); - - rv = rte_sched_pipe_config (xd->hqos_ht->hqos, subport, pipe, profile); - - BAD_SW_IF_INDEX_LABEL; - - REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_PIPE_REPLY); -} - -static void *vl_api_sw_interface_set_dpdk_hqos_pipe_t_print - (vl_api_sw_interface_set_dpdk_hqos_pipe_t * mp, void *handle) -{ - u8 *s; - - s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_pipe "); - - s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); - - s = format (s, "subport %u pipe %u profile %u ", - ntohl (mp->subport), ntohl (mp->pipe), ntohl (mp->profile)); - - FINISH; -} - -static void - vl_api_sw_interface_set_dpdk_hqos_subport_t_handler - (vl_api_sw_interface_set_dpdk_hqos_subport_t * mp) -{ - vl_api_sw_interface_set_dpdk_hqos_subport_reply_t *rmp; - int rv = 0; - - dpdk_main_t *dm = &dpdk_main; - dpdk_device_t *xd; - struct rte_sched_subport_params p; - - u32 sw_if_index = ntohl (mp->sw_if_index); - u32 subport = ntohl (mp->subport); - p.tb_rate = ntohl (mp->tb_rate); - p.tb_size = ntohl (mp->tb_size); - p.tc_rate[0] = ntohl (mp->tc_rate[0]); - p.tc_rate[1] = ntohl (mp->tc_rate[1]); - p.tc_rate[2] = ntohl (mp->tc_rate[2]); - 
p.tc_rate[3] = ntohl (mp->tc_rate[3]); - p.tc_period = ntohl (mp->tc_period); - - vnet_hw_interface_t *hw; - - VALIDATE_SW_IF_INDEX (mp); - - /* hw_if & dpdk device */ - hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); - - xd = vec_elt_at_index (dm->devices, hw->dev_instance); - - rv = rte_sched_subport_config (xd->hqos_ht->hqos, subport, &p); - - BAD_SW_IF_INDEX_LABEL; - - REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_SUBPORT_REPLY); -} - -static void *vl_api_sw_interface_set_dpdk_hqos_subport_t_print - (vl_api_sw_interface_set_dpdk_hqos_subport_t * mp, void *handle) -{ - u8 *s; - - s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_subport "); - - s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); - - s = - format (s, - "subport %u rate %u bkt_size %u tc0 %u tc1 %u tc2 %u tc3 %u period %u", - ntohl (mp->subport), ntohl (mp->tb_rate), ntohl (mp->tb_size), - ntohl (mp->tc_rate[0]), ntohl (mp->tc_rate[1]), - ntohl (mp->tc_rate[2]), ntohl (mp->tc_rate[3]), - ntohl (mp->tc_period)); - - FINISH; -} - -static void - vl_api_sw_interface_set_dpdk_hqos_tctbl_t_handler - (vl_api_sw_interface_set_dpdk_hqos_tctbl_t * mp) -{ - vl_api_sw_interface_set_dpdk_hqos_tctbl_reply_t *rmp; - int rv = 0; - - dpdk_main_t *dm = &dpdk_main; - vlib_thread_main_t *tm = vlib_get_thread_main (); - dpdk_device_t *xd; - - u32 sw_if_index = ntohl (mp->sw_if_index); - u32 entry = ntohl (mp->entry); - u32 tc = ntohl (mp->tc); - u32 queue = ntohl (mp->queue); - u32 val, i; - - vnet_hw_interface_t *hw; - - VALIDATE_SW_IF_INDEX (mp); - - /* hw_if & dpdk device */ - hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); - - xd = vec_elt_at_index (dm->devices, hw->dev_instance); - - if (tc >= RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE) - { - clib_warning ("invalid traffic class !!"); - rv = VNET_API_ERROR_INVALID_VALUE; - goto done; - } - if (queue >= RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS) - { - clib_warning ("invalid queue !!"); - rv = VNET_API_ERROR_INVALID_VALUE; - goto done; - } - - 
/* Detect the set of worker threads */ - uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers"); - - if (p == 0) - { - clib_warning ("worker thread registration AWOL !!"); - rv = VNET_API_ERROR_INVALID_VALUE_2; - goto done; - } - - vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0]; - int worker_thread_first = tr->first_index; - int worker_thread_count = tr->count; - - val = tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue; - for (i = 0; i < worker_thread_count; i++) - xd->hqos_wt[worker_thread_first + i].hqos_tc_table[entry] = val; - - BAD_SW_IF_INDEX_LABEL; -done: - - REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_TCTBL_REPLY); -} - -static void *vl_api_sw_interface_set_dpdk_hqos_tctbl_t_print - (vl_api_sw_interface_set_dpdk_hqos_tctbl_t * mp, void *handle) -{ - u8 *s; - - s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_tctbl "); - - s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); - - s = format (s, "entry %u tc %u queue %u", - ntohl (mp->entry), ntohl (mp->tc), ntohl (mp->queue)); - - FINISH; -} - -#define foreach_dpdk_plugin_api_msg \ -_(SW_INTERFACE_SET_DPDK_HQOS_PIPE, sw_interface_set_dpdk_hqos_pipe) \ -_(SW_INTERFACE_SET_DPDK_HQOS_SUBPORT, sw_interface_set_dpdk_hqos_subport) \ -_(SW_INTERFACE_SET_DPDK_HQOS_TCTBL, sw_interface_set_dpdk_hqos_tctbl) - -/* Set up the API message handling tables */ -static clib_error_t * -dpdk_plugin_api_hookup (vlib_main_t * vm) -{ - dpdk_main_t *dm __attribute__ ((unused)) = &dpdk_main; -#define _(N,n) \ - vl_msg_api_set_handlers((VL_API_##N + dm->msg_id_base), \ - #n, \ - vl_api_##n##_t_handler, \ - vl_noop_handler, \ - vl_api_##n##_t_endian, \ - vl_api_##n##_t_print, \ - sizeof(vl_api_##n##_t), 1); - foreach_dpdk_plugin_api_msg; -#undef _ - return 0; -} - -#define vl_msg_name_crc_list -#include -#undef vl_msg_name_crc_list - -static void -setup_message_id_table (dpdk_main_t * dm, api_main_t * am) -{ -#define _(id,n,crc) \ - vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + 
dm->msg_id_base); - foreach_vl_msg_name_crc_dpdk; -#undef _ -} - -// TODO -/* -static void plugin_custom_dump_configure (dpdk_main_t * dm) -{ -#define _(n,f) dm->api_main->msg_print_handlers \ - [VL_API_##n + dm->msg_id_base] \ - = (void *) vl_api_##f##_t_print; - foreach_dpdk_plugin_api_msg; -#undef _ -} -*/ -/* force linker to link functions used by vlib and declared weak */ -void *vlib_weakly_linked_functions[] = { - &rte_pktmbuf_init, - &rte_pktmbuf_pool_init, -}; - #define LINK_STATE_ELOGS 0 #define DEFAULT_HUGE_DIR "/run/vpp/hugepages" @@ -452,7 +189,7 @@ dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags) return old; } -void +static void dpdk_device_lock_init (dpdk_device_t * xd) { int q; @@ -465,17 +202,6 @@ dpdk_device_lock_init (dpdk_device_t * xd) } } -void -dpdk_device_lock_free (dpdk_device_t * xd) -{ - int q; - - for (q = 0; q < vec_len (xd->lockp); q++) - clib_mem_free ((void *) xd->lockp[q]); - vec_free (xd->lockp); - xd->lockp = 0; -} - static clib_error_t * dpdk_lib_init (dpdk_main_t * dm) { @@ -1925,29 +1651,7 @@ VLIB_REGISTER_NODE (dpdk_process_node,static) = { }; /* *INDENT-ON* */ -int -dpdk_set_stat_poll_interval (f64 interval) -{ - if (interval < DPDK_MIN_STATS_POLL_INTERVAL) - return (VNET_API_ERROR_INVALID_VALUE); - - dpdk_main.stat_poll_interval = interval; - - return 0; -} - -int -dpdk_set_link_state_poll_interval (f64 interval) -{ - if (interval < DPDK_MIN_LINK_POLL_INTERVAL) - return (VNET_API_ERROR_INVALID_VALUE); - - dpdk_main.link_state_poll_interval = interval; - - return 0; -} - -clib_error_t * +static clib_error_t * dpdk_init (vlib_main_t * vm) { dpdk_main_t *dm = &dpdk_main; @@ -1964,26 +1668,10 @@ dpdk_init (vlib_main_t * vm) STATIC_ASSERT (offsetof (frame_queue_trace_t, cacheline0) == 0, "Cache line marker must be 1st element in frame_queue_trace_t"); - u8 *name; - name = format (0, "dpdk_%08x%c", api_version, 0); - - /* Ask for a correctly-sized block of API message decode slots */ - dm->msg_id_base = 
vl_msg_api_get_msg_ids - ((char *) name, VL_MSG_FIRST_AVAILABLE); - vec_free (name); - dm->vlib_main = vm; dm->vnet_main = vnet_get_main (); dm->conf = &dpdk_config_main; - error = dpdk_plugin_api_hookup (vm); - - /* Add our API messages to the global name_crc hash table */ - setup_message_id_table (dm, &api_main); - -// TODO -// plugin_custom_dump_configure (dm); - ei = vlib_get_node_by_name (vm, (u8 *) "ethernet-input"); if (ei == 0) return clib_error_return (0, "ethernet-input node AWOL"); diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index 5cc611cd..0562b48a 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -114,7 +114,7 @@ dpdk_rx_error_from_mb (struct rte_mbuf *mb, u32 * next, u8 * error) *error = DPDK_ERROR_NONE; } -void +static void dpdk_rx_trace (dpdk_main_t * dm, vlib_node_runtime_t * node, dpdk_device_t * xd, -- cgit 1.2.3-korg From 63c7e14f2e62caa1246349cfe341a93176ec4a43 Mon Sep 17 00:00:00 2001 From: Sergio Gonzalez Monroy Date: Wed, 22 Mar 2017 16:11:06 +0000 Subject: dpdk: cryptodev support enabled by default This patch slightly modifies how DPDK Cryptodev is enabled. The startup option 'enable-cryptodev' has been removed; DPDK cryptodev is now enabled by default unless not enough cryptodevs are found.
Change-Id: Ic0ac507802cdc0eeb51f065e04ec43a1885617cf Signed-off-by: Sergio Gonzalez Monroy --- src/plugins/dpdk/device/dpdk.h | 1 - src/plugins/dpdk/device/init.c | 3 --- src/plugins/dpdk/ipsec/cli.c | 3 +-- src/plugins/dpdk/ipsec/dpdk_crypto_ipsec_doc.md | 18 +++++------------- src/plugins/dpdk/ipsec/ipsec.c | 10 +--------- src/plugins/dpdk/ipsec/ipsec.h | 1 + 6 files changed, 8 insertions(+), 28 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 90e93e75..82e5938a 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -323,7 +323,6 @@ typedef struct u8 *uio_driver_name; u8 no_multi_seg; u8 enable_tcp_udp_checksum; - u8 cryptodev; /* Required config parameters */ u8 coremask_set_manually; diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 33ecde44..e20b2585 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -995,9 +995,6 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) else if (unformat (input, "no-multi-seg")) conf->no_multi_seg = 1; - else if (unformat (input, "enable-cryptodev")) - conf->cryptodev = 1; - else if (unformat (input, "dev default %U", unformat_vlib_cli_sub_input, &sub_input)) { diff --git a/src/plugins/dpdk/ipsec/cli.c b/src/plugins/dpdk/ipsec/cli.c index 3ae8c9b8..a9314065 100644 --- a/src/plugins/dpdk/ipsec/cli.c +++ b/src/plugins/dpdk/ipsec/cli.c @@ -20,12 +20,11 @@ static void dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display) { - dpdk_config_main_t *conf = &dpdk_config_main; dpdk_crypto_main_t *dcm = &dpdk_crypto_main; vlib_thread_main_t *tm = vlib_get_thread_main (); u32 i, skip_master; - if (!conf->cryptodev) + if (!dcm->enabled) { vlib_cli_output (vm, "DPDK Cryptodev support is disabled\n"); return; diff --git a/src/plugins/dpdk/ipsec/dpdk_crypto_ipsec_doc.md b/src/plugins/dpdk/ipsec/dpdk_crypto_ipsec_doc.md index b3d3cc48..5a9f9c6e 100644 --- 
a/src/plugins/dpdk/ipsec/dpdk_crypto_ipsec_doc.md +++ b/src/plugins/dpdk/ipsec/dpdk_crypto_ipsec_doc.md @@ -23,15 +23,7 @@ Set new default next nodes: ### How to enable VPP IPSec with DPDK Cryptodev support -DPDK Cryptodev is supported in DPDK enabled VPP. -By default, only HW Cryptodev is supported but needs to be explicetly enabled with the following config option: - -``` -dpdk { - enable-cryptodev -} -``` - +DPDK Cryptodev is supported in DPDK enabled VPP and by default only HW Cryptodev is supported. To enable SW Cryptodev support (AESNI-MB-PMD and GCM-PMD), we need the following env option: vpp_uses_dpdk_cryptodev_sw=yes @@ -47,15 +39,15 @@ When enabling SW Cryptodev support, it means that you need to pre-build the requ VPP allocates crypto resources based on a best effort approach: * first allocate Hardware crypto resources, then Software. -* if there are not enough crypto resources for all workers, the graph node is not modifed, therefore the default VPP IPsec implementation based in OpenSSL is used. The following message is displayed: +* if there are not enough crypto resources for all workers, the graph node is not modified and the default VPP IPsec implementation based on OpenSSL is used. The following message is displayed: - 0: dpdk_ipsec_init: not enough cryptodevs for ipsec + 0: dpdk_ipsec_init: not enough Cryptodevs, default to OpenSSL IPsec ### Configuration example -To enable DPDK Cryptodev the user just need to provide the startup.conf option -as mentioned previously. +To enable DPDK Cryptodev the user just needs to provide cryptodevs in the +
Example startup.conf: diff --git a/src/plugins/dpdk/ipsec/ipsec.c b/src/plugins/dpdk/ipsec/ipsec.c index 5d8f4fba..7066564d 100644 --- a/src/plugins/dpdk/ipsec/ipsec.c +++ b/src/plugins/dpdk/ipsec/ipsec.c @@ -224,7 +224,6 @@ static uword dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) { - dpdk_config_main_t *conf = &dpdk_config_main; ipsec_main_t *im = &ipsec_main; dpdk_crypto_main_t *dcm = &dpdk_crypto_main; vlib_thread_main_t *tm = vlib_get_thread_main (); @@ -235,19 +234,12 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, i32 dev_id, ret; u32 i, skip_master; - if (!conf->cryptodev) - { - clib_warning ("DPDK Cryptodev support is disabled, " - "default to OpenSSL IPsec"); - return 0; - } - if (check_cryptodev_queues () < 0) { - conf->cryptodev = 0; clib_warning ("not enough Cryptodevs, default to OpenSSL IPsec"); return 0; } + dcm->enabled = 1; vec_alloc (dcm->workers_main, tm->n_vlib_mains); _vec_len (dcm->workers_main) = tm->n_vlib_mains; diff --git a/src/plugins/dpdk/ipsec/ipsec.h b/src/plugins/dpdk/ipsec/ipsec.h index f0f793c0..d7940345 100644 --- a/src/plugins/dpdk/ipsec/ipsec.h +++ b/src/plugins/dpdk/ipsec/ipsec.h @@ -83,6 +83,7 @@ typedef struct { struct rte_mempool **cop_pools; crypto_worker_main_t *workers_main; + u8 enabled; } dpdk_crypto_main_t; dpdk_crypto_main_t dpdk_crypto_main; -- cgit 1.2.3-korg From c12311b86fb27114cdd3fa4ad9a5897a98448184 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 28 Apr 2017 17:19:16 +0200 Subject: dpdk: remove unused code Change-Id: I16dcc0de2553c6c1eb87dd1ec4c8d3c649e6f285 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/device.c | 22 +--------------------- src/plugins/dpdk/device/dpdk.h | 34 +--------------------------------- src/plugins/dpdk/device/init.c | 7 ------- src/plugins/dpdk/hqos/hqos.c | 2 -- src/plugins/dpdk/main.c | 2 +- 5 files changed, 3 insertions(+), 64 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git 
a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 48f3237b..465a5874 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -44,7 +44,7 @@ static char *dpdk_tx_func_error_strings[] = { #undef _ }; -clib_error_t * +static clib_error_t * dpdk_set_mac_address (vnet_hw_interface_t * hi, char *address) { int error; @@ -66,26 +66,6 @@ dpdk_set_mac_address (vnet_hw_interface_t * hi, char *address) } } -clib_error_t * -dpdk_set_mc_filter (vnet_hw_interface_t * hi, - struct ether_addr mc_addr_vec[], int naddr) -{ - int error; - dpdk_main_t *dm = &dpdk_main; - dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance); - - error = rte_eth_dev_set_mc_addr_list (xd->device_index, mc_addr_vec, naddr); - - if (error) - { - return clib_error_return (0, "mc addr list failed: %d", error); - } - else - { - return NULL; - } -} - struct rte_mbuf * dpdk_replicate_packet_mb (vlib_buffer_t * b) { diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 82e5938a..53f79ff2 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -22,29 +22,19 @@ #include #include -#include #include -#include -#include #include #include -#include -#include #include -#include #include #include -#include #include #include -#include -#include #include #include #include #include #include -#include #include #include #include @@ -64,7 +54,6 @@ extern vnet_device_class_t dpdk_device_class; extern vlib_node_registration_t dpdk_input_node; -extern vlib_node_registration_t handoff_dispatch_node; #if RTE_VERSION >= RTE_VERSION_NUM(17, 2, 0, 0) #define foreach_dpdk_pmd \ @@ -330,7 +319,6 @@ typedef struct u32 coremask; u32 nchannels; u32 num_mbufs; - u8 num_kni; /* while kni_init allows u32, port_id in callback fn is only u8 */ /* * format interface names ala xxxEthernet%d/%d/%d instead of @@ -376,12 +364,6 @@ typedef struct u32 pcap_sw_if_index; u32 pcap_pkts_to_capture; - /* hashes */ - uword 
*dpdk_device_by_kni_port_id; - uword *vu_sw_if_index_by_listener_fd; - uword *vu_sw_if_index_by_sock_fd; - u32 *vu_inactive_interfaces_device_index; - /* * flag indicating that a posted admin up/down * (via post_sw_interface_set_flags) is in progress @@ -413,7 +395,7 @@ typedef struct u16 msg_id_base; } dpdk_main_t; -dpdk_main_t dpdk_main; +extern dpdk_main_t dpdk_main; typedef struct { @@ -435,22 +417,8 @@ typedef struct u8 data[256]; /* First 256 data bytes, used for hexdump */ } dpdk_rx_dma_trace_t; -void vnet_buffer_needs_dpdk_mb (vlib_buffer_t * b); - -clib_error_t *dpdk_set_mac_address (vnet_hw_interface_t * hi, char *address); - -clib_error_t *dpdk_set_mc_filter (vnet_hw_interface_t * hi, - struct ether_addr mc_addr_vec[], int naddr); - -void dpdk_thread_input (dpdk_main_t * dm, dpdk_device_t * xd); - clib_error_t *dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd); -u32 dpdk_interface_tx_vector (vlib_main_t * vm, u32 dev_instance); - -struct rte_mbuf *dpdk_replicate_packet_mb (vlib_buffer_t * b); -struct rte_mbuf *dpdk_zerocopy_replicate_packet_mb (vlib_buffer_t * b); - #define foreach_dpdk_error \ _(NONE, "no error") \ _(RX_PACKET_ERROR, "Rx packet errors") \ diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index e20b2585..6f51ff64 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -1028,8 +1028,6 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) } else if (unformat (input, "num-mbufs %d", &conf->num_mbufs)) ; - else if (unformat (input, "kni %d", &conf->num_kni)) - ; else if (unformat (input, "uio-driver %s", &conf->uio_driver_name)) ; else if (unformat (input, "socket-mem %s", &socket_mem)) @@ -1679,11 +1677,6 @@ dpdk_init (vlib_main_t * vm) dm->conf->num_mbufs = dm->conf->num_mbufs ? 
dm->conf->num_mbufs : NB_MBUF; vec_add1 (dm->conf->eal_init_args, (u8 *) "vnet"); - dm->dpdk_device_by_kni_port_id = hash_create (0, sizeof (uword)); - dm->vu_sw_if_index_by_listener_fd = hash_create (0, sizeof (uword)); - dm->vu_sw_if_index_by_sock_fd = hash_create (0, sizeof (uword)); - - /* $$$ use n_thread_stacks since it's known-good at this point */ vec_validate (dm->recycle, tm->n_thread_stacks - 1); /* Default vlib_buffer_t flags, DISABLES tcp/udp checksumming... */ diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c index 8b251beb..ca1bdafa 100644 --- a/src/plugins/dpdk/hqos/hqos.c +++ b/src/plugins/dpdk/hqos/hqos.c @@ -46,8 +46,6 @@ #include -dpdk_main_t dpdk_main; - /*** * * HQoS default configuration values diff --git a/src/plugins/dpdk/main.c b/src/plugins/dpdk/main.c index 942b8b2d..f2f1ba22 100644 --- a/src/plugins/dpdk/main.c +++ b/src/plugins/dpdk/main.c @@ -24,7 +24,7 @@ * Return 1 if to skip the delay loop because we are suspending * the calling vlib process instead. 
*/ -int +static int rte_delay_us_override (unsigned us) { vlib_main_t *vm; -- cgit 1.2.3-korg From 4403690cda44134af3b9ea78d33a5cbf78a5acc9 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 28 Apr 2017 12:29:15 +0200 Subject: Add interface rx mode commands, unify rx mode and placement CLI Change-Id: Ib506c3e9d66170f29e3266ad6dc4d32b829befba Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/init.c | 10 +- src/vnet/api_errno.h | 3 +- src/vnet/devices/af_packet/af_packet.c | 18 ++- src/vnet/devices/devices.c | 172 +++--------------------- src/vnet/devices/devices.h | 34 ++--- src/vnet/interface.h | 15 +++ src/vnet/interface_cli.c | 238 +++++++++++++++++++++++++++++++++ src/vnet/interface_format.c | 17 +++ src/vnet/interface_funcs.h | 1 + 9 files changed, 326 insertions(+), 182 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 6f51ff64..2d21bfd9 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -704,8 +704,8 @@ dpdk_lib_init (dpdk_main_t * dm) sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->hw_if_index); xd->vlib_sw_if_index = sw->sw_if_index; - vnet_set_device_input_node (dm->vnet_main, xd->hw_if_index, - dpdk_input_node.index); + vnet_hw_interface_set_input_node (dm->vnet_main, xd->hw_if_index, + dpdk_input_node.index); if (devconf->workers) { @@ -713,7 +713,7 @@ dpdk_lib_init (dpdk_main_t * dm) q = 0; /* *INDENT-OFF* */ clib_bitmap_foreach (i, devconf->workers, ({ - vnet_device_input_assign_thread (dm->vnet_main, xd->hw_if_index, q++, + vnet_hw_interface_assign_rx_thread (dm->vnet_main, xd->hw_if_index, q++, vdm->first_worker_thread_index + i); })); /* *INDENT-ON* */ @@ -721,8 +721,8 @@ dpdk_lib_init (dpdk_main_t * dm) else for (q = 0; q < xd->rx_q_used; q++) { - vnet_device_input_assign_thread (dm->vnet_main, xd->hw_if_index, q, /* any */ - ~1); + vnet_hw_interface_assign_rx_thread (dm->vnet_main, xd->hw_if_index, q, /* any */ + ~1); 
} hi = vnet_get_hw_interface (dm->vnet_main, xd->hw_if_index); diff --git a/src/vnet/api_errno.h b/src/vnet/api_errno.h index 0d5b2227..b87c197f 100644 --- a/src/vnet/api_errno.h +++ b/src/vnet/api_errno.h @@ -109,7 +109,8 @@ _(ENTRY_ALREADY_EXISTS, -116, "Entry already exists") \ _(SVM_SEGMENT_CREATE_FAIL, -117, "svm segment create fail") \ _(APPLICATION_NOT_ATTACHED, -118, "application not attached") \ _(BD_ALREADY_EXISTS, -119, "Bridge domain already exists") \ -_(BD_IN_USE, -120, "Bridge domain has member interfaces") +_(BD_IN_USE, -120, "Bridge domain has member interfaces") \ +_(UNSUPPORTED, -121, "Unsupported") typedef enum { diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c index 92bd1092..cb52e6da 100644 --- a/src/vnet/devices/af_packet/af_packet.c +++ b/src/vnet/devices/af_packet/af_packet.c @@ -195,6 +195,7 @@ af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set, u8 hw_addr[6]; clib_error_t *error; vnet_sw_interface_t *sw; + vnet_hw_interface_t *hw; vlib_thread_main_t *tm = vlib_get_thread_main (); vnet_main_t *vnm = vnet_get_main (); uword *p; @@ -294,17 +295,21 @@ af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set, } sw = vnet_get_hw_sw_interface (vnm, apif->hw_if_index); + hw = vnet_get_hw_interface (vnm, apif->hw_if_index); apif->sw_if_index = sw->sw_if_index; - vnet_set_device_input_node (vnm, apif->hw_if_index, - af_packet_input_node.index); - vnet_device_input_assign_thread (vnm, apif->hw_if_index, 0, /* queue */ - ~0 /* any cpu */ ); - vnet_device_input_set_mode (vnm, apif->hw_if_index, 0, - VNET_DEVICE_INPUT_MODE_INTERRUPT); + vnet_hw_interface_set_input_node (vnm, apif->hw_if_index, + af_packet_input_node.index); + vnet_hw_interface_assign_rx_thread (vnm, apif->hw_if_index, 0, /* queue */ + ~0 /* any cpu */ ); + + hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE; vnet_hw_interface_set_flags (vnm, apif->hw_if_index, VNET_HW_INTERFACE_FLAG_LINK_UP); + 
vnet_hw_interface_set_rx_mode (vnm, apif->hw_if_index, 0, + VNET_HW_INTERFACE_RX_MODE_INTERRUPT); + mhash_set_mem (&apm->if_index_by_host_if_name, host_if_name_dup, &if_index, 0); if (sw_if_index) @@ -340,6 +345,7 @@ af_packet_delete_if (vlib_main_t * vm, u8 * host_if_name) /* bring down the interface */ vnet_hw_interface_set_flags (vnm, apif->hw_if_index, 0); + vnet_hw_interface_unassign_rx_thread (vnm, apif->hw_if_index, 0); /* clean up */ if (apif->unix_file_index != ~0) diff --git a/src/vnet/devices/devices.c b/src/vnet/devices/devices.c index 2f55adcb..d75d905a 100644 --- a/src/vnet/devices/devices.c +++ b/src/vnet/devices/devices.c @@ -119,8 +119,8 @@ vnet_device_queue_update (vnet_main_t * vnm, vnet_device_input_runtime_t * rt) } void -vnet_device_input_assign_thread (vnet_main_t * vnm, u32 hw_if_index, - u16 queue_id, uword thread_index) +vnet_hw_interface_assign_rx_thread (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id, uword thread_index) { vnet_device_main_t *vdm = &vnet_device_main; vlib_main_t *vm; @@ -149,16 +149,19 @@ vnet_device_input_assign_thread (vnet_main_t * vnm, u32 hw_if_index, dq->hw_if_index = hw_if_index; dq->dev_instance = hw->dev_instance; dq->queue_id = queue_id; + dq->mode = VNET_HW_INTERFACE_RX_MODE_POLLING; vnet_device_queue_update (vnm, rt); vec_validate (hw->input_node_thread_index_by_queue, queue_id); + vec_validate (hw->rx_mode_by_queue, queue_id); hw->input_node_thread_index_by_queue[queue_id] = thread_index; + hw->rx_mode_by_queue[queue_id] = VNET_HW_INTERFACE_RX_MODE_POLLING; vlib_node_set_state (vm, hw->input_node_index, rt->enabled_node_state); } int -vnet_device_input_unassign_thread (vnet_main_t * vnm, u32 hw_if_index, - u16 queue_id, uword thread_index) +vnet_hw_interface_unassign_rx_thread (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id) { vlib_main_t *vm; vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); @@ -190,6 +193,7 @@ vnet_device_input_unassign_thread (vnet_main_t * vnm, u32 hw_if_index, 
deleted: vnet_device_queue_update (vnm, rt); + hw->rx_mode_by_queue[queue_id] = VNET_HW_INTERFACE_RX_MODE_UNKNOWN; if (vec_len (rt->devices_and_queues) == 0) vlib_node_set_state (vm, hw->input_node_index, VLIB_NODE_STATE_DISABLED); @@ -199,21 +203,28 @@ deleted: int -vnet_device_input_set_mode (vnet_main_t * vnm, u32 hw_if_index, u16 queue_id, - vnet_device_input_mode_t mode) +vnet_hw_interface_set_rx_mode (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id, vnet_hw_interface_rx_mode mode) { vlib_main_t *vm; uword thread_index; vnet_device_and_queue_t *dq; vlib_node_state_t enabled_node_state; - ASSERT (mode < VNET_DEVICE_INPUT_N_MODES); + ASSERT (mode < VNET_HW_INTERFACE_NUM_RX_MODES); vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); vnet_device_input_runtime_t *rt; int is_polling = 0; - if (hw->input_node_thread_index_by_queue == 0) + if (hw->input_node_thread_index_by_queue == 0 || hw->rx_mode_by_queue == 0) return VNET_API_ERROR_INVALID_INTERFACE; + if (hw->rx_mode_by_queue[queue_id] == mode) + return 0; + + if (mode != VNET_HW_INTERFACE_RX_MODE_POLLING && + (hw->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE) == 0) + return VNET_API_ERROR_UNSUPPORTED; + thread_index = hw->input_node_thread_index_by_queue[queue_id]; vm = vlib_mains[thread_index]; @@ -223,7 +234,7 @@ vnet_device_input_set_mode (vnet_main_t * vnm, u32 hw_if_index, u16 queue_id, { if (dq->hw_if_index == hw_if_index && dq->queue_id == queue_id) dq->mode = mode; - if (dq->mode == VNET_DEVICE_INPUT_MODE_POLLING) + if (dq->mode == VNET_HW_INTERFACE_RX_MODE_POLLING) is_polling = 1; } @@ -244,8 +255,8 @@ vnet_device_input_set_mode (vnet_main_t * vnm, u32 hw_if_index, u16 queue_id, } int -vnet_device_input_get_mode (vnet_main_t * vnm, u32 hw_if_index, u16 queue_id, - vnet_device_input_mode_t * mode) +vnet_hw_interface_get_rx_mode (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id, vnet_hw_interface_rx_mode * mode) { vlib_main_t *vm; uword thread_index; @@ -271,146 +282,7 @@ 
vnet_device_input_get_mode (vnet_main_t * vnm, u32 hw_if_index, u16 queue_id, return VNET_API_ERROR_INVALID_INTERFACE; } -static clib_error_t * -show_device_placement_fn (vlib_main_t * vm, unformat_input_t * input, - vlib_cli_command_t * cmd) -{ - u8 *s = 0; - vnet_main_t *vnm = vnet_get_main (); - vnet_device_input_runtime_t *rt; - vnet_device_and_queue_t *dq; - vlib_node_t *pn = vlib_get_node_by_name (vm, (u8 *) "device-input"); - uword si; - int index = 0; - - /* *INDENT-OFF* */ - foreach_vlib_main (({ - clib_bitmap_foreach (si, pn->sibling_bitmap, - ({ - rt = vlib_node_get_runtime_data (this_vlib_main, si); - - if (vec_len (rt->devices_and_queues)) - s = format (s, " node %U:\n", format_vlib_node_name, vm, si); - - vec_foreach (dq, rt->devices_and_queues) - { - s = format (s, " %U queue %u (%s)\n", - format_vnet_sw_if_index_name, vnm, dq->hw_if_index, - dq->queue_id, - dq->mode == VNET_DEVICE_INPUT_MODE_POLLING ? - "polling" : "interrupt"); - } - })); - if (vec_len (s) > 0) - { - vlib_cli_output(vm, "Thread %u (%v):\n%v", index, - vlib_worker_threads[index].name, s); - vec_reset_length (s); - } - index++; - })); - /* *INDENT-ON* */ - - vec_free (s); - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (memif_delete_command, static) = { - .path = "show interface placement", - .short_help = "show interface placement", - .function = show_device_placement_fn, -}; -/* *INDENT-ON* */ - -static clib_error_t * -set_device_placement (vlib_main_t * vm, unformat_input_t * input, - vlib_cli_command_t * cmd) -{ - clib_error_t *error = 0; - unformat_input_t _line_input, *line_input = &_line_input; - vnet_main_t *vnm = vnet_get_main (); - vnet_device_main_t *vdm = &vnet_device_main; - vnet_device_input_mode_t mode; - u32 hw_if_index = (u32) ~ 0; - u32 queue_id = (u32) 0; - u32 thread_index = (u32) ~ 0; - int rv; - - if (!unformat_user (input, unformat_line_input, line_input)) - return 0; - - while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) - { - if 
(unformat - (line_input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index)) - ; - else if (unformat (line_input, "queue %d", &queue_id)) - ; - else if (unformat (line_input, "main", &thread_index)) - thread_index = 0; - else if (unformat (line_input, "worker %d", &thread_index)) - thread_index += vdm->first_worker_thread_index; - else - { - error = clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); - unformat_free (line_input); - return error; - } - } - - unformat_free (line_input); - - if (hw_if_index == (u32) ~ 0) - return clib_error_return (0, "please specify valid interface name"); - - if (thread_index > vdm->last_worker_thread_index) - return clib_error_return (0, - "please specify valid worker thread or main"); - - rv = vnet_device_input_get_mode (vnm, hw_if_index, queue_id, &mode); - - if (rv) - return clib_error_return (0, "not found"); - - rv = vnet_device_input_unassign_thread (vnm, hw_if_index, queue_id, - thread_index); - if (rv) - return clib_error_return (0, "not found"); - - vnet_device_input_assign_thread (vnm, hw_if_index, queue_id, thread_index); - vnet_device_input_set_mode (vnm, hw_if_index, queue_id, mode); - - return 0; -} - -/*? - * This command is used to assign a given interface, and optionally a - * given queue, to a different thread. If the 'queue' is not provided, - * it defaults to 0. 
- * - * @cliexpar - * Example of how to display the interface placement: - * @cliexstart{show interface placement} - * Thread 1 (vpp_wk_0): - * GigabitEthernet0/8/0 queue 0 - * GigabitEthernet0/9/0 queue 0 - * Thread 2 (vpp_wk_1): - * GigabitEthernet0/8/0 queue 1 - * GigabitEthernet0/9/0 queue 1 - * @cliexend - * Example of how to assign a interface and queue to a thread: - * @cliexcmd{set interface placement GigabitEthernet0/8/0 queue 1 thread 1} -?*/ -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (cmd_set_dpdk_if_placement,static) = { - .path = "set interface placement", - .short_help = "set interface placement [queue ] [thread | main]", - .function = set_device_placement, -}; -/* *INDENT-ON* */ static clib_error_t * vnet_device_init (vlib_main_t * vm) diff --git a/src/vnet/devices/devices.h b/src/vnet/devices/devices.h index baf03b7c..f1f7e778 100644 --- a/src/vnet/devices/devices.h +++ b/src/vnet/devices/devices.h @@ -55,19 +55,12 @@ typedef struct uword next_worker_thread_index; } vnet_device_main_t; -typedef enum -{ - VNET_DEVICE_INPUT_MODE_POLLING = 0, - VNET_DEVICE_INPUT_MODE_INTERRUPT, - VNET_DEVICE_INPUT_N_MODES, -} vnet_device_input_mode_t; - typedef struct { u32 hw_if_index; u32 dev_instance; u16 queue_id; - vnet_device_input_mode_t mode; + vnet_hw_interface_rx_mode mode; uword interrupt_pending; } vnet_device_and_queue_t; @@ -82,22 +75,23 @@ extern vlib_node_registration_t device_input_node; extern const u32 device_input_next_node_advance[]; static inline void -vnet_set_device_input_node (vnet_main_t * vnm, u32 hw_if_index, - u32 node_index) +vnet_hw_interface_set_input_node (vnet_main_t * vnm, u32 hw_if_index, + u32 node_index) { vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); hw->input_node_index = node_index; } -void vnet_device_input_assign_thread (vnet_main_t * vnm, u32 hw_if_index, - u16 queue_id, uword thread_index); -int vnet_device_input_unassign_thread (vnet_main_t * vnm, u32 hw_if_index, - u16 queue_id, uword thread_index); -int 
vnet_device_input_set_mode (vnet_main_t * vnm, u32 hw_if_index, - u16 queue_id, vnet_device_input_mode_t mode); -int vnet_device_input_get_mode (vnet_main_t * vnm, u32 hw_if_index, - u16 queue_id, - vnet_device_input_mode_t * mode); +void vnet_hw_interface_assign_rx_thread (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id, uword thread_index); +int vnet_hw_interface_unassign_rx_thread (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id); +int vnet_hw_interface_set_rx_mode (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id, + vnet_hw_interface_rx_mode mode); +int vnet_hw_interface_get_rx_mode (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id, + vnet_hw_interface_rx_mode * mode); static inline u64 vnet_get_aggregate_rx_packets (void) @@ -161,7 +155,7 @@ vnet_device_input_set_interrupt_pending (vnet_main_t * vnm, u32 hw_if_index, #define foreach_device_and_queue(var,vec) \ for (var = (vec); var < vec_end (vec); var++) \ if (clib_smp_swap (&((var)->interrupt_pending), 0) || \ - var->mode == VNET_DEVICE_INPUT_MODE_POLLING) + var->mode == VNET_HW_INTERFACE_RX_MODE_POLLING) #endif /* included_vnet_vnet_device_h */ diff --git a/src/vnet/interface.h b/src/vnet/interface.h index 9c223040..2344348b 100644 --- a/src/vnet/interface.h +++ b/src/vnet/interface.h @@ -405,6 +405,9 @@ typedef struct vnet_hw_interface_t #define VNET_HW_INTERFACE_FLAG_L2OUTPUT_SHIFT 9 #define VNET_HW_INTERFACE_FLAG_L2OUTPUT_MAPPED (1 << 9) + /* rx mode flags */ +#define VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE (1 << 10) + /* Hardware address as vector. Zero (e.g. zero-length vector) if no address for this class (e.g. PPP). 
*/ u8 *hw_address; @@ -470,6 +473,9 @@ typedef struct vnet_hw_interface_t /* input node cpu index by queue */ u32 *input_node_thread_index_by_queue; + /* vnet_hw_interface_rx_mode by queue */ + u8 *rx_mode_by_queue; + /* device input device_and_queue runtime index */ uword *dq_runtime_index_by_queue; @@ -486,6 +492,15 @@ typedef enum VNET_SW_INTERFACE_TYPE_SUB, } vnet_sw_interface_type_t; +typedef enum +{ + VNET_HW_INTERFACE_RX_MODE_UNKNOWN, + VNET_HW_INTERFACE_RX_MODE_POLLING, + VNET_HW_INTERFACE_RX_MODE_INTERRUPT, + VNET_HW_INTERFACE_RX_MODE_ADAPTIVE, + VNET_HW_INTERFACE_NUM_RX_MODES, +} vnet_hw_interface_rx_mode; + typedef struct { /* diff --git a/src/vnet/interface_cli.c b/src/vnet/interface_cli.c index 94eb7ea9..bfce03e1 100644 --- a/src/vnet/interface_cli.c +++ b/src/vnet/interface_cli.c @@ -1175,7 +1175,245 @@ VLIB_CLI_COMMAND (clear_tag_command, static) = { }; /* *INDENT-ON* */ +static clib_error_t * +set_interface_rx_mode (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + clib_error_t *error = 0; + unformat_input_t _line_input, *line_input = &_line_input; + vnet_main_t *vnm = vnet_get_main (); + vnet_hw_interface_t *hw; + u32 hw_if_index = (u32) ~ 0; + u32 queue_id = (u32) ~ 0; + vnet_hw_interface_rx_mode mode = VNET_HW_INTERFACE_RX_MODE_UNKNOWN; + int i, rv = 0; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index)) + ; + else if (unformat (line_input, "queue %d", &queue_id)) + ; + else if (unformat (line_input, "polling")) + mode = VNET_HW_INTERFACE_RX_MODE_POLLING; + else if (unformat (line_input, "interrupt")) + mode = VNET_HW_INTERFACE_RX_MODE_INTERRUPT; + else if (unformat (line_input, "adaptive")) + mode = VNET_HW_INTERFACE_RX_MODE_ADAPTIVE; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + 
unformat_free (line_input); + return error; + } + } + + unformat_free (line_input); + + if (hw_if_index == (u32) ~ 0) + return clib_error_return (0, "please specify valid interface name"); + if (mode == VNET_HW_INTERFACE_RX_MODE_UNKNOWN) + return clib_error_return (0, "please specify valid rx-mode"); + + hw = vnet_get_hw_interface (vnm, hw_if_index); + + if (queue_id == ~0) + for (i = 0; i < vec_len (hw->dq_runtime_index_by_queue); i++) + { + rv = vnet_hw_interface_set_rx_mode (vnm, hw_if_index, i, mode); + if (rv) + goto error; + } + else + rv = vnet_hw_interface_set_rx_mode (vnm, hw_if_index, queue_id, mode); + + if (rv) + goto error; + + return 0; + +error: + if (rv == VNET_API_ERROR_UNSUPPORTED) + return clib_error_return (0, "unsupported"); + + if (rv == VNET_API_ERROR_INVALID_INTERFACE) + return clib_error_return (0, "invalid interfaace"); + + return clib_error_return (0, "unknown error"); +} + +/*? + * This command is used to assign a given interface, and optionally a + * given queue, to a different thread. If the 'queue' is not provided, + * it defaults to 0. 
+ * + * @cliexpar + * Example of how to display the interface placement: + * @cliexstart{show interface rx-placement} + * Thread 1 (vpp_wk_0): + * GigabitEthernet0/8/0 queue 0 + * GigabitEthernet0/9/0 queue 0 + * Thread 2 (vpp_wk_1): + * GigabitEthernet0/8/0 queue 1 + * GigabitEthernet0/9/0 queue 1 + * @cliexend + * Example of how to assign a interface and queue to a thread: + * @cliexcmd{set interface placement GigabitEthernet0/8/0 queue 1 thread 1} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_if_rx_mode,static) = { + .path = "set interface rx-mode", + .short_help = "set interface rx-mode [queue ] [polling | interrupt | adaptive]", + .function = set_interface_rx_mode, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_interface_rx_placement_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 *s = 0; + vnet_main_t *vnm = vnet_get_main (); + vnet_device_input_runtime_t *rt; + vnet_device_and_queue_t *dq; + vlib_node_t *pn = vlib_get_node_by_name (vm, (u8 *) "device-input"); + uword si; + int index = 0; + + /* *INDENT-OFF* */ + foreach_vlib_main (({ + clib_bitmap_foreach (si, pn->sibling_bitmap, + ({ + rt = vlib_node_get_runtime_data (this_vlib_main, si); + + if (vec_len (rt->devices_and_queues)) + s = format (s, " node %U:\n", format_vlib_node_name, vm, si); + + vec_foreach (dq, rt->devices_and_queues) + { + s = format (s, " %U queue %u (%U)\n", + format_vnet_sw_if_index_name, vnm, dq->hw_if_index, + dq->queue_id, + format_vnet_hw_interface_rx_mode, dq->mode); + } + })); + if (vec_len (s) > 0) + { + vlib_cli_output(vm, "Thread %u (%v):\n%v", index, + vlib_worker_threads[index].name, s); + vec_reset_length (s); + } + index++; + })); + /* *INDENT-ON* */ + + vec_free (s); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_interface_rx_placement, static) = { + .path = "show interface rx-placement", + .short_help = "show interface rx-placement", + .function = show_interface_rx_placement_fn, +}; +/* *INDENT-ON* */ + 
+static clib_error_t * +set_interface_rx_placement (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + clib_error_t *error = 0; + unformat_input_t _line_input, *line_input = &_line_input; + vnet_main_t *vnm = vnet_get_main (); + vnet_device_main_t *vdm = &vnet_device_main; + vnet_hw_interface_rx_mode mode; + u32 hw_if_index = (u32) ~ 0; + u32 queue_id = (u32) 0; + u32 thread_index = (u32) ~ 0; + int rv; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index)) + ; + else if (unformat (line_input, "queue %d", &queue_id)) + ; + else if (unformat (line_input, "main", &thread_index)) + thread_index = 0; + else if (unformat (line_input, "worker %d", &thread_index)) + thread_index += vdm->first_worker_thread_index; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + unformat_free (line_input); + return error; + } + } + + unformat_free (line_input); + + if (hw_if_index == (u32) ~ 0) + return clib_error_return (0, "please specify valid interface name"); + + if (thread_index > vdm->last_worker_thread_index) + return clib_error_return (0, + "please specify valid worker thread or main"); + + rv = vnet_hw_interface_get_rx_mode (vnm, hw_if_index, queue_id, &mode); + + if (rv) + return clib_error_return (0, "not found"); + + rv = vnet_hw_interface_unassign_rx_thread (vnm, hw_if_index, queue_id); + + if (rv) + return clib_error_return (0, "not found"); + + vnet_hw_interface_assign_rx_thread (vnm, hw_if_index, queue_id, + thread_index); + vnet_hw_interface_set_rx_mode (vnm, hw_if_index, queue_id, mode); + + return 0; +} + +/*? + * This command is used to assign a given interface, and optionally a + * given queue, to a different thread. If the 'queue' is not provided, + * it defaults to 0. 
+ * + * @cliexpar + * Example of how to display the interface placement: + * @cliexstart{show interface placement} + * Thread 1 (vpp_wk_0): + * GigabitEthernet0/8/0 queue 0 + * GigabitEthernet0/9/0 queue 0 + * Thread 2 (vpp_wk_1): + * GigabitEthernet0/8/0 queue 1 + * GigabitEthernet0/9/0 queue 1 + * @cliexend + * Example of how to assign a interface and queue to a thread: + * @cliexcmd{set interface placement GigabitEthernet0/8/0 queue 1 thread 1} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_if_rx_placement,static) = { + .path = "set interface rx-placement", + .short_help = "set interface rx-placement [queue ] [thread | main]", + .function = set_interface_rx_placement, +}; + +/* *INDENT-ON* */ /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/interface_format.c b/src/vnet/interface_format.c index b961c778..03caf5c6 100644 --- a/src/vnet/interface_format.c +++ b/src/vnet/interface_format.c @@ -58,6 +58,23 @@ format_vnet_sw_interface_flags (u8 * s, va_list * args) return s; } +u8 * +format_vnet_hw_interface_rx_mode (u8 * s, va_list * args) +{ + vnet_hw_interface_rx_mode mode = va_arg (*args, vnet_hw_interface_rx_mode); + + if (mode == VNET_HW_INTERFACE_RX_MODE_POLLING) + return format (s, "polling"); + + if (mode == VNET_HW_INTERFACE_RX_MODE_INTERRUPT) + return format (s, "interrupt"); + + if (mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE) + return format (s, "adaptive"); + + return format (s, "unknown"); +} + u8 * format_vnet_hw_interface (u8 * s, va_list * args) { diff --git a/src/vnet/interface_funcs.h b/src/vnet/interface_funcs.h index b3aca2fd..999b72e5 100644 --- a/src/vnet/interface_funcs.h +++ b/src/vnet/interface_funcs.h @@ -277,6 +277,7 @@ clib_error_t *vnet_hw_interface_change_mac_address (vnet_main_t * vnm, /* Formats sw/hw interface. 
*/ format_function_t format_vnet_hw_interface; +format_function_t format_vnet_hw_interface_rx_mode; format_function_t format_vnet_sw_interface; format_function_t format_vnet_sw_interface_name; format_function_t format_vnet_sw_interface_name_override; -- cgit 1.2.3-korg From 1ea82dfe5dabb0f9bbfa5ba953ef31328c987b89 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 8 May 2017 18:19:45 +0200 Subject: dpdk: use speed_capa to detect i40e device speed Change-Id: I1decca6fb5ce6de74ccbcd5a262ebd3db6f55cdc Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/dpdk.h | 1 + src/plugins/dpdk/device/format.c | 4 ++++ src/plugins/dpdk/device/init.c | 27 ++++++++------------------- 3 files changed, 13 insertions(+), 19 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 53f79ff2..8dbc0915 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -108,6 +108,7 @@ typedef enum { VNET_DPDK_PORT_TYPE_ETH_1G, VNET_DPDK_PORT_TYPE_ETH_10G, + VNET_DPDK_PORT_TYPE_ETH_25G, VNET_DPDK_PORT_TYPE_ETH_40G, VNET_DPDK_PORT_TYPE_ETH_100G, VNET_DPDK_PORT_TYPE_ETH_BOND, diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c index 93eca524..389cd8b6 100644 --- a/src/plugins/dpdk/device/format.c +++ b/src/plugins/dpdk/device/format.c @@ -168,6 +168,10 @@ format_dpdk_device_name (u8 * s, va_list * args) device_name = "TenGigabitEthernet"; break; + case VNET_DPDK_PORT_TYPE_ETH_25G: + device_name = "TwentyFiveGigabitEthernet"; + break; + case VNET_DPDK_PORT_TYPE_ETH_40G: device_name = "FortyGigabitEthernet"; break; diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 2d21bfd9..c45edc9b 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -444,25 +444,14 @@ dpdk_lib_init (dpdk_main_t * dm) xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE; xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; - switch 
(dev_info.pci_dev->id.device_id) - { - case I40E_DEV_ID_10G_BASE_T: - case I40E_DEV_ID_SFP_XL710: - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; - break; - case I40E_DEV_ID_QSFP_A: - case I40E_DEV_ID_QSFP_B: - case I40E_DEV_ID_QSFP_C: - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; - break; - case I40E_DEV_ID_VF: - rte_eth_link_get_nowait (i, &l); - xd->port_type = l.link_speed == 10000 ? - VNET_DPDK_PORT_TYPE_ETH_10G : VNET_DPDK_PORT_TYPE_ETH_40G; - break; - default: - xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; - } + if (dev_info.speed_capa & ETH_LINK_SPEED_40G) + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; + else if (dev_info.speed_capa & ETH_LINK_SPEED_25G) + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_25G; + else if (dev_info.speed_capa & ETH_LINK_SPEED_10G) + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; + else + xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; break; case VNET_DPDK_PMD_CXGBE: -- cgit 1.2.3-korg From e6c5941b982083840a48e1de06f5324893153d8f Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 28 Apr 2017 17:10:38 +0200 Subject: dpdk: refactor setup, start, stop code Change-Id: I0fec86914ec027383ff511b7092beac2363f55f7 Signed-off-by: Damjan Marion --- src/plugins/dpdk.am | 1 + src/plugins/dpdk/device/cli.c | 2 +- src/plugins/dpdk/device/common.c | 156 +++++++++++++++++++++++++++++++++++++++ src/plugins/dpdk/device/dpdk.h | 4 +- src/plugins/dpdk/device/init.c | 91 ++--------------------- 5 files changed, 167 insertions(+), 87 deletions(-) create mode 100644 src/plugins/dpdk/device/common.c (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk.am b/src/plugins/dpdk.am index bb46ae6e..0b242502 100644 --- a/src/plugins/dpdk.am +++ b/src/plugins/dpdk.am @@ -30,6 +30,7 @@ dpdk_plugin_la_SOURCES = \ dpdk/thread.c \ dpdk/api/dpdk_api.c \ dpdk/device/cli.c \ + dpdk/device/common.c \ dpdk/device/dpdk_priv.h \ dpdk/device/device.c \ dpdk/device/format.c \ diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c index 
17d1bfe1..c7e5090d 100644 --- a/src/plugins/dpdk/device/cli.c +++ b/src/plugins/dpdk/device/cli.c @@ -537,7 +537,7 @@ set_dpdk_if_desc (vlib_main_t * vm, unformat_input_t * input, if (nb_tx_desc != (u32) ~ 0) xd->nb_tx_desc = nb_tx_desc; - error = dpdk_port_setup (dm, xd); + error = dpdk_device_setup (xd); done: unformat_free (line_input); diff --git a/src/plugins/dpdk/device/common.c b/src/plugins/dpdk/device/common.c new file mode 100644 index 00000000..79c5888d --- /dev/null +++ b/src/plugins/dpdk/device/common.c @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +clib_error_t * +dpdk_error_return (clib_error_t * error, char *str, dpdk_device_t * xd, + int rv) +{ + return clib_error_return (error, "%s[%d]: %s(%d)", str, xd->device_index, + rte_strerror (rv), rv); +} + +clib_error_t * +dpdk_device_setup (dpdk_device_t * xd) +{ + dpdk_main_t *dm = &dpdk_main; + clib_error_t *err = 0; + int rv; + int j; + + ASSERT (vlib_get_thread_index () == 0); + + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + { + vnet_hw_interface_set_flags (dm->vnet_main, xd->hw_if_index, 0); + dpdk_device_stop (xd); + } + + rv = rte_eth_dev_configure (xd->device_index, xd->rx_q_used, + xd->tx_q_used, &xd->port_conf); + + if (rv < 0) + return dpdk_error_return (err, "rte_eth_dev_configure", xd, rv); + + /* Set up one TX-queue per worker thread */ + for (j = 0; j < xd->tx_q_used; j++) + { + rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc, + xd->cpu_socket, &xd->tx_conf); + + /* retry with any other CPU socket */ + if (rv < 0) + rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc, + SOCKET_ID_ANY, &xd->tx_conf); + if (rv < 0) + err = dpdk_error_return (err, "rte_eth_tx_queue_setup", xd, rv); + } + + for (j = 0; j < xd->rx_q_used; j++) + { + uword tidx = vnet_get_device_input_thread_index (dm->vnet_main, + xd->hw_if_index, j); + unsigned lcore = vlib_worker_threads[tidx].lcore_id; + u16 socket_id = rte_lcore_to_socket_id (lcore); + + rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, + xd->cpu_socket, 0, + dm->pktmbuf_pools[socket_id]); + + /* retry with any other CPU socket */ + if (rv < 0) + rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, + SOCKET_ID_ANY, 0, + dm->pktmbuf_pools[socket_id]); + + if (rv < 0) + err = dpdk_error_return (err, "rte_eth_rx_queue_setup", xd, rv); + } + + if (err) + return err; + + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + err = dpdk_device_start (xd); + + return err; 
+} + +clib_error_t * +dpdk_device_start (dpdk_device_t * xd) +{ + int rv; + clib_error_t *err = 0; + + rv = rte_eth_dev_start (xd->device_index); + + if (rv) + return dpdk_error_return (err, "rte_eth_dev_start", xd, rv); + + if (xd->default_mac_address) + rv = + rte_eth_dev_default_mac_addr_set (xd->device_index, + (struct ether_addr *) + xd->default_mac_address); + + if (rv) + err = dpdk_error_return (err, "rte_eth_dev_default_mac_addr_set", xd, rv); + + if (xd->flags & DPDK_DEVICE_FLAG_PROMISC) + rte_eth_promiscuous_enable (xd->device_index); + else + rte_eth_promiscuous_disable (xd->device_index); + + rte_eth_allmulticast_enable (xd->device_index); + + if (xd->pmd == VNET_DPDK_PMD_BOND) + { + u8 slink[16]; + int nlink = rte_eth_bond_slaves_get (xd->device_index, slink, 16); + while (nlink >= 1) + { + u8 dpdk_port = slink[--nlink]; + rte_eth_allmulticast_enable (dpdk_port); + } + } + + return err; +} + +clib_error_t * +dpdk_device_stop (dpdk_device_t * xd) +{ + rte_eth_dev_stop (xd->device_index); + + return 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 8dbc0915..583d2cd9 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -418,7 +418,9 @@ typedef struct u8 data[256]; /* First 256 data bytes, used for hexdump */ } dpdk_rx_dma_trace_t; -clib_error_t *dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd); +clib_error_t *dpdk_device_setup (dpdk_device_t * xd); +clib_error_t *dpdk_device_start (dpdk_device_t * xd); +clib_error_t *dpdk_device_stop (dpdk_device_t * xd); #define foreach_dpdk_error \ _(NONE, "no error") \ diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index c45edc9b..3aba031c 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -55,80 +55,6 @@ static struct rte_eth_conf port_conf_template = { }, }; 
-clib_error_t * -dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) -{ - int rv; - int j; - - ASSERT (vlib_get_thread_index () == 0); - - if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) - { - vnet_hw_interface_set_flags (dm->vnet_main, xd->hw_if_index, 0); - rte_eth_dev_stop (xd->device_index); - } - - rv = rte_eth_dev_configure (xd->device_index, xd->rx_q_used, - xd->tx_q_used, &xd->port_conf); - - if (rv < 0) - return clib_error_return (0, "rte_eth_dev_configure[%d]: err %d", - xd->device_index, rv); - - /* Set up one TX-queue per worker thread */ - for (j = 0; j < xd->tx_q_used; j++) - { - rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc, - xd->cpu_socket, &xd->tx_conf); - - /* retry with any other CPU socket */ - if (rv < 0) - rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc, - SOCKET_ID_ANY, &xd->tx_conf); - if (rv < 0) - break; - } - - if (rv < 0) - return clib_error_return (0, "rte_eth_tx_queue_setup[%d]: err %d", - xd->device_index, rv); - - for (j = 0; j < xd->rx_q_used; j++) - { - uword tidx = vnet_get_device_input_thread_index (dm->vnet_main, - xd->hw_if_index, j); - unsigned lcore = vlib_worker_threads[tidx].lcore_id; - u16 socket_id = rte_lcore_to_socket_id (lcore); - - rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, - xd->cpu_socket, 0, - dm->pktmbuf_pools[socket_id]); - - /* retry with any other CPU socket */ - if (rv < 0) - rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, - SOCKET_ID_ANY, 0, - dm->pktmbuf_pools[socket_id]); - if (rv < 0) - return clib_error_return (0, "rte_eth_rx_queue_setup[%d]: err %d", - xd->device_index, rv); - } - - if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) - { - int rv; - rv = rte_eth_dev_start (xd->device_index); - if (!rv && xd->default_mac_address) - rv = rte_eth_dev_default_mac_addr_set (xd->device_index, - (struct ether_addr *) - xd->default_mac_address); - if (rv < 0) - clib_warning ("rte_eth_dev_start %d returned %d", - xd->device_index, rv); - } - 
return 0; -} static u32 dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags) @@ -161,7 +87,7 @@ dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags) xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes; if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) - rte_eth_dev_stop (xd->device_index); + dpdk_device_stop (xd); rv = rte_eth_dev_configure (xd->device_index, xd->rx_q_used, xd->tx_q_used, &xd->port_conf); @@ -175,14 +101,9 @@ dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags) if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) { - int rv = rte_eth_dev_start (xd->device_index); - if (!rv && xd->default_mac_address) - rv = rte_eth_dev_default_mac_addr_set (xd->device_index, - (struct ether_addr *) - xd->default_mac_address); - if (rv < 0) - clib_warning ("rte_eth_dev_start %d returned %d", - xd->device_index, rv); + clib_error_t *error; + error = dpdk_device_start (xd); + clib_error_report (error); } } @@ -716,7 +637,7 @@ dpdk_lib_init (dpdk_main_t * dm) hi = vnet_get_hw_interface (dm->vnet_main, xd->hw_if_index); - rv = dpdk_port_setup (dm, xd); + rv = dpdk_device_setup (xd); if (rv) return rv; @@ -1505,7 +1426,7 @@ dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) /* * Extra set up for bond interfaces: * 1. Setup MACs for bond interfaces and their slave links which was set - * in dpdk_port_setup() but needs to be done again here to take effect. + * in dpdk_device_setup() but needs to be done again here to take effect. * 2. Set up info for bond interface related CLI support. 
*/ int nports = rte_eth_dev_count (); -- cgit 1.2.3-korg From 4fb27eed7e1f986394e20035a24476237c7e6f60 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 9 May 2017 19:50:26 +0200 Subject: dpdk: refactor interface speed detection - Adds VirtualFucntionEthernet for VFs - Enables MLX4 driver Change-Id: I163300e68edbe033227f641bdfcfe5918cbe58cf Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/dpdk.h | 5 +- src/plugins/dpdk/device/format.c | 8 +++ src/plugins/dpdk/device/init.c | 127 +++++++++------------------------------ 3 files changed, 42 insertions(+), 98 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 583d2cd9..7e2901bb 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -72,6 +72,7 @@ extern vlib_node_registration_t dpdk_input_node; _ ("net_bonding", BOND) \ _ ("net_fm10k", FM10K) \ _ ("net_cxgbe", CXGBE) \ + _ ("net_mlx4", MLX4) \ _ ("net_mlx5", MLX5) \ _ ("net_dpaa2", DPAA2) #else @@ -91,6 +92,7 @@ extern vlib_node_registration_t dpdk_input_node; _ ("rte_bond_pmd", BOND) \ _ ("net_fm10k", FM10K) \ _ ("net_cxgbe", CXGBE) \ + _ ("net_mlx4", MLX4) \ _ ("net_mlx5", MLX5) \ _ ("net_dpaa2", DPAA2) #endif @@ -114,6 +116,7 @@ typedef enum VNET_DPDK_PORT_TYPE_ETH_BOND, VNET_DPDK_PORT_TYPE_ETH_SWITCH, VNET_DPDK_PORT_TYPE_AF_PACKET, + VNET_DPDK_PORT_TYPE_ETH_VF, VNET_DPDK_PORT_TYPE_UNKNOWN, } dpdk_port_type_t; @@ -185,7 +188,7 @@ typedef struct #define DPDK_DEVICE_FLAG_ADMIN_UP (1 << 0) #define DPDK_DEVICE_FLAG_PROMISC (1 << 1) #define DPDK_DEVICE_FLAG_PMD (1 << 2) -#define DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE (1 << 3) + #define DPDK_DEVICE_FLAG_MAYBE_MULTISEG (1 << 4) #define DPDK_DEVICE_FLAG_HAVE_SUBIF (1 << 5) #define DPDK_DEVICE_FLAG_HQOS (1 << 6) diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c index 389cd8b6..3b75563d 100644 --- a/src/plugins/dpdk/device/format.c +++ b/src/plugins/dpdk/device/format.c @@ 
-187,6 +187,10 @@ format_dpdk_device_name (u8 * s, va_list * args) device_name = "EthernetSwitch"; break; + case VNET_DPDK_PORT_TYPE_ETH_VF: + device_name = "VirtualFunctionEthernet"; + break; + case VNET_DPDK_PORT_TYPE_AF_PACKET: rte_eth_dev_info_get (i, &dev_info); return format (s, "af_packet%d", dm->devices[i].port_id); @@ -264,6 +268,10 @@ format_dpdk_device_type (u8 * s, va_list * args) dev_type = "Chelsio T4/T5"; break; + case VNET_DPDK_PMD_MLX4: + dev_type = "Mellanox ConnectX-3 Family"; + break; + case VNET_DPDK_PMD_MLX5: dev_type = "Mellanox ConnectX-4 Family"; break; diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 3aba031c..69959c05 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -55,6 +55,24 @@ static struct rte_eth_conf port_conf_template = { }, }; +static dpdk_port_type_t +port_type_from_speed_capa (struct rte_eth_dev_info *dev_info) +{ + + if (dev_info->speed_capa & ETH_LINK_SPEED_100G) + return VNET_DPDK_PORT_TYPE_ETH_100G; + else if (dev_info->speed_capa & ETH_LINK_SPEED_40G) + return VNET_DPDK_PORT_TYPE_ETH_40G; + else if (dev_info->speed_capa & ETH_LINK_SPEED_25G) + return VNET_DPDK_PORT_TYPE_ETH_25G; + else if (dev_info->speed_capa & ETH_LINK_SPEED_10G) + return VNET_DPDK_PORT_TYPE_ETH_10G; + else if (dev_info->speed_capa & ETH_LINK_SPEED_1G) + return VNET_DPDK_PORT_TYPE_ETH_1G; + + return VNET_DPDK_PORT_TYPE_UNKNOWN; +} + static u32 dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags) @@ -332,19 +350,25 @@ dpdk_lib_init (dpdk_main_t * dm) switch (xd->pmd) { - /* 1G adapters */ + /* Drivers with valid speed_capa set */ case VNET_DPDK_PMD_E1000EM: case VNET_DPDK_PMD_IGB: - case VNET_DPDK_PMD_IGBVF: - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G; + case VNET_DPDK_PMD_IXGBE: + case VNET_DPDK_PMD_I40E: + case VNET_DPDK_PMD_CXGBE: + case VNET_DPDK_PMD_MLX4: + case VNET_DPDK_PMD_MLX5: + xd->port_type = port_type_from_speed_capa (&dev_info); break; - /* 10G 
adapters */ - case VNET_DPDK_PMD_IXGBE: + /* SR-IOV VFs */ + case VNET_DPDK_PMD_IGBVF: case VNET_DPDK_PMD_IXGBEVF: + case VNET_DPDK_PMD_I40EVF: case VNET_DPDK_PMD_THUNDERX: - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF; break; + case VNET_DPDK_PMD_DPAA2: xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; break; @@ -352,102 +376,12 @@ dpdk_lib_init (dpdk_main_t * dm) /* Cisco VIC */ case VNET_DPDK_PMD_ENIC: rte_eth_link_get_nowait (i, &l); - xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE; if (l.link_speed == 40000) xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; else xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; break; - /* Intel Fortville */ - case VNET_DPDK_PMD_I40E: - case VNET_DPDK_PMD_I40EVF: - xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE; - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; - - if (dev_info.speed_capa & ETH_LINK_SPEED_40G) - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; - else if (dev_info.speed_capa & ETH_LINK_SPEED_25G) - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_25G; - else if (dev_info.speed_capa & ETH_LINK_SPEED_10G) - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; - else - xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; - break; - - case VNET_DPDK_PMD_CXGBE: - switch (dev_info.pci_dev->id.device_id) - { - case 0x540d: /* T580-CR */ - case 0x5410: /* T580-LP-cr */ - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; - break; - case 0x5403: /* T540-CR */ - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; - break; - default: - xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; - } - break; - - case VNET_DPDK_PMD_MLX5: - { - char *pn_100g[] = { "MCX415A-CCAT", "MCX416A-CCAT", - "MCX556A-ECAT", "MCX556A-EDAT", "MCX555A-ECAT", - "MCX515A-CCAT", "MCX516A-CCAT", "MCX516A-CDAT", 0 - }; - char *pn_40g[] = { "MCX413A-BCAT", "MCX414A-BCAT", - "MCX415A-BCAT", "MCX416A-BCAT", "MCX4131A-BCAT", 0 - }; - char *pn_10g[] = { "MCX4111A-XCAT", "MCX4121A-XCAT", 0 }; - - vlib_pci_device_t *pd = vlib_get_pci_device (&pci_addr); - u8 *pn = 0; - char 
**c; - int found = 0; - pn = format (0, "%U%c", - format_vlib_pci_vpd, pd->vpd_r, "PN", 0); - - if (!pn) - break; - - c = pn_100g; - while (!found && c[0]) - { - if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0) - { - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_100G; - break; - } - c++; - } - - c = pn_40g; - while (!found && c[0]) - { - if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0) - { - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; - break; - } - c++; - } - - c = pn_10g; - while (!found && c[0]) - { - if (strncmp ((char *) pn, c[0], strlen (c[0])) == 0) - { - xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; - break; - } - c++; - } - - vec_free (pn); - } - - break; /* Intel Red Rock Canyon */ case VNET_DPDK_PMD_FM10K: xd->port_type = VNET_DPDK_PORT_TYPE_ETH_SWITCH; @@ -472,7 +406,6 @@ dpdk_lib_init (dpdk_main_t * dm) break; case VNET_DPDK_PMD_BOND: - xd->flags |= DPDK_DEVICE_FLAG_PMD_SUPPORTS_PTYPE; xd->port_type = VNET_DPDK_PORT_TYPE_ETH_BOND; xd->port_id = bond_ether_port_id++; break; -- cgit 1.2.3-korg From 6a5be214b95f3ec9f30eab46aa1a21a62a7bbc85 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 11 May 2017 14:55:43 +0200 Subject: dpdk: bump to dpdk 17.05 Change-Id: I19744387859129c6b8dc104041af158bf5f1d988 Signed-off-by: Damjan Marion --- dpdk/Makefile | 9 +++++---- src/plugins/dpdk/device/device.c | 4 ++++ src/plugins/dpdk/device/dpdk.h | 22 ---------------------- src/plugins/dpdk/device/init.c | 4 ++++ src/plugins/dpdk/hqos/hqos.c | 8 ++++++++ 5 files changed, 21 insertions(+), 26 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/dpdk/Makefile b/dpdk/Makefile index 23e84ef4..cd79e394 100644 --- a/dpdk/Makefile +++ b/dpdk/Makefile @@ -20,17 +20,18 @@ DPDK_PKTMBUF_HEADROOM ?= 128 DPDK_DOWNLOAD_DIR ?= $(HOME)/Downloads DPDK_DEBUG ?= n DPDK_CRYPTO_SW_PMD ?= n +DPDK_MLX4_PMD ?= n DPDK_MLX5_PMD ?= n B := $(DPDK_BUILD_DIR) I := $(DPDK_INSTALL_DIR) -DPDK_VERSION ?= 17.02 -PKG_SUFFIX ?= vpp3 +DPDK_VERSION ?= 17.05 +PKG_SUFFIX ?= vpp1 
DPDK_BASE_URL ?= http://fast.dpdk.org/rel DPDK_TARBALL := dpdk-$(DPDK_VERSION).tar.xz DPDK_TAR_URL := $(DPDK_BASE_URL)/$(DPDK_TARBALL) -DPDK_16.11_TARBALL_MD5_CKSUM := 06c1c577795360719d0b4fafaeee21e9 DPDK_17.02_TARBALL_MD5_CKSUM := 6b9f7387c35641f4e8dbba3e528f2376 +DPDK_17.05_TARBALL_MD5_CKSUM := 0a68c31cd6a6cabeed0a4331073e4c05 DPDK_SOURCE := $(B)/dpdk-$(DPDK_VERSION) ifeq ($(DPDK_CRYPTO_SW_PMD),y) @@ -139,6 +140,7 @@ $(B)/custom-config: $(B)/.patch.ok Makefile $(call set,RTE_LIBRTE_PMD_QAT,y) $(call set,RTE_LIBRTE_PMD_AESNI_MB,$(DPDK_CRYPTO_SW_PMD)) $(call set,RTE_LIBRTE_PMD_AESNI_GCM,$(DPDK_CRYPTO_SW_PMD)) + $(call set,RTE_LIBRTE_MLX4_PMD,$(DPDK_MLX4_PMD)) $(call set,RTE_LIBRTE_MLX5_PMD,$(DPDK_MLX5_PMD)) @# not needed $(call set,RTE_LIBRTE_TIMER,n) @@ -147,7 +149,6 @@ $(B)/custom-config: $(B)/.patch.ok Makefile $(call set,RTE_LIBRTE_ACL,n) $(call set,RTE_LIBRTE_POWER,n) $(call set,RTE_LIBRTE_DISTRIBUTOR,n) - $(call set,RTE_LIBRTE_REORDER,n) $(call set,RTE_LIBRTE_PORT,n) $(call set,RTE_LIBRTE_TABLE,n) $(call set,RTE_LIBRTE_PIPELINE,n) diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 465a5874..51d6eacb 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -254,7 +254,11 @@ static_always_inline &tx_vector[tx_tail], tx_head - tx_tail); rv = rte_ring_sp_enqueue_burst (hqos->swq, (void **) &tx_vector[tx_tail], +#if RTE_VERSION >= RTE_VERSION_NUM(17, 5, 0, 0) + (uint16_t) (tx_head - tx_tail), 0); +#else (uint16_t) (tx_head - tx_tail)); +#endif } else if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_PMD)) { diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 7e2901bb..7e974491 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -55,7 +55,6 @@ extern vnet_device_class_t dpdk_device_class; extern vlib_node_registration_t dpdk_input_node; -#if RTE_VERSION >= RTE_VERSION_NUM(17, 2, 0, 0) #define foreach_dpdk_pmd \ _ ("net_thunderx", THUNDERX) 
\ _ ("net_e1000_em", E1000EM) \ @@ -75,27 +74,6 @@ extern vlib_node_registration_t dpdk_input_node; _ ("net_mlx4", MLX4) \ _ ("net_mlx5", MLX5) \ _ ("net_dpaa2", DPAA2) -#else -#define foreach_dpdk_pmd \ - _ ("net_thunderx", THUNDERX) \ - _ ("net_e1000_em", E1000EM) \ - _ ("net_e1000_igb", IGB) \ - _ ("net_e1000_igb_vf", IGBVF) \ - _ ("net_ixgbe", IXGBE) \ - _ ("net_ixgbe_vf", IXGBEVF) \ - _ ("net_i40e", I40E) \ - _ ("net_i40e_vf", I40EVF) \ - _ ("net_virtio", VIRTIO) \ - _ ("net_enic", ENIC) \ - _ ("net_vmxnet3", VMXNET3) \ - _ ("AF_PACKET PMD", AF_PACKET) \ - _ ("rte_bond_pmd", BOND) \ - _ ("net_fm10k", FM10K) \ - _ ("net_cxgbe", CXGBE) \ - _ ("net_mlx4", MLX4) \ - _ ("net_mlx5", MLX5) \ - _ ("net_dpaa2", DPAA2) -#endif typedef enum { diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 69959c05..0ee28db5 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -1174,7 +1174,11 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) log_level = (CLIB_DEBUG > 0) ? 
RTE_LOG_DEBUG : RTE_LOG_NOTICE; +#if RTE_VERSION >= RTE_VERSION_NUM(17, 5, 0, 0) + rte_log_set_global_level (log_level); +#else rte_set_log_level (log_level); +#endif vm = vlib_get_main (); diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c index ca1bdafa..2f2504d6 100644 --- a/src/plugins/dpdk/hqos/hqos.c +++ b/src/plugins/dpdk/hqos/hqos.c @@ -430,7 +430,11 @@ dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm) pkts_enq_len += rte_ring_sc_dequeue_burst (swq, (void **) &pkts_enq[pkts_enq_len], +#if RTE_VERSION >= RTE_VERSION_NUM(17, 5, 0, 0) + hqos->hqos_burst_enq, 0); +#else hqos->hqos_burst_enq); +#endif /* Get next SWQ for this device */ swq_pos++; @@ -521,7 +525,11 @@ dpdk_hqos_thread_internal (vlib_main_t * vm) pkts_enq_len += rte_ring_sc_dequeue_burst (swq, (void **) &pkts_enq[pkts_enq_len], +#if RTE_VERSION >= RTE_VERSION_NUM(17, 5, 0, 0) + hqos->hqos_burst_enq, 0); +#else hqos->hqos_burst_enq); +#endif /* Get next SWQ for this device */ swq_pos++; -- cgit 1.2.3-korg From f195a5693450083c7c6840c5aab02bfe5088cb40 Mon Sep 17 00:00:00 2001 From: Christophe Fontaine Date: Wed, 5 Apr 2017 11:49:07 +0200 Subject: [plugins/dpdk] Support for CLIB_HAVE_64 (aarch64) For platforms which do not support u8x32, such as aarch64, add an alternative implementation of 'dpdk_buffer_init_from_template'. 
Change-Id: Ia7e8d0a5985fa5925e063ed6e890208c73e39933 Signed-off-by: Christophe Fontaine --- src/plugins/dpdk/device/node.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index 0562b48a..69acc529 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -267,6 +267,7 @@ static_always_inline void dpdk_buffer_init_from_template (void *d0, void *d1, void *d2, void *d3, void *s) { +#if defined(CLIB_HAVE_VEC128) int i; for (i = 0; i < 2; i++) { @@ -275,6 +276,18 @@ dpdk_buffer_init_from_template (void *d0, void *d1, void *d2, void *d3, *(u8x32 *) (((u8 *) d2) + i * 32) = *(u8x32 *) (((u8 *) d3) + i * 32) = *(u8x32 *) (((u8 *) s) + i * 32); } +#elif defined(CLIB_HAVE_VEC64) + int i; + for (i = 0; i < 4; i++) + { + *(u8x16 *) (((u8 *) d0) + i * 16) = + *(u8x16 *) (((u8 *) d1) + i * 16) = + *(u8x16 *) (((u8 *) d2) + i * 16) = + *(u8x16 *) (((u8 *) d3) + i * 16) = *(u8x16 *) (((u8 *) s) + i * 16); + } +#else +#error "Either CLIB_HAVE_VEC128 or CLIB_HAVE_VEC64 has to be defined" +#endif } /* -- cgit 1.2.3-korg From 12059c9b6da0536a74d3003cebed61225a8a8ee7 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 11 May 2017 16:53:02 +0200 Subject: dpdk: improve error handling during device initialization Change-Id: Ib390164abb07ca0d38fd49e7e2e6b4e9ea856405 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/cli.c | 5 ++- src/plugins/dpdk/device/common.c | 75 ++++++++++++++++++++++++++++------------ src/plugins/dpdk/device/device.c | 49 ++++---------------------- src/plugins/dpdk/device/dpdk.h | 12 ++++--- src/plugins/dpdk/device/format.c | 20 +++++++++++ src/plugins/dpdk/device/init.c | 16 ++++----- 6 files changed, 98 insertions(+), 79 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c index c7e5090d..aeeb772d 100644 --- a/src/plugins/dpdk/device/cli.c 
+++ b/src/plugins/dpdk/device/cli.c @@ -537,7 +537,10 @@ set_dpdk_if_desc (vlib_main_t * vm, unformat_input_t * input, if (nb_tx_desc != (u32) ~ 0) xd->nb_tx_desc = nb_tx_desc; - error = dpdk_device_setup (xd); + dpdk_device_setup (xd); + + if (vec_len (xd->errors)) + return clib_error_return (0, "%U", format_dpdk_device_errors, xd); done: unformat_free (line_input); diff --git a/src/plugins/dpdk/device/common.c b/src/plugins/dpdk/device/common.c index 79c5888d..1a9688e7 100644 --- a/src/plugins/dpdk/device/common.c +++ b/src/plugins/dpdk/device/common.c @@ -24,24 +24,28 @@ #include #include -clib_error_t * -dpdk_error_return (clib_error_t * error, char *str, dpdk_device_t * xd, - int rv) +void +dpdk_device_error (dpdk_device_t * xd, char *str, int rv) { - return clib_error_return (error, "%s[%d]: %s(%d)", str, xd->device_index, - rte_strerror (rv), rv); + xd->errors = clib_error_return (xd->errors, "%s[port:%d, errno:%d]: %s", + str, xd->device_index, rv, + rte_strerror (rv)); } -clib_error_t * +void dpdk_device_setup (dpdk_device_t * xd) { dpdk_main_t *dm = &dpdk_main; - clib_error_t *err = 0; + vnet_main_t *vnm = vnet_get_main (); + vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); int rv; int j; ASSERT (vlib_get_thread_index () == 0); + clib_error_free (xd->errors); + sw->flags &= ~VNET_SW_INTERFACE_FLAG_ERROR; + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) { vnet_hw_interface_set_flags (dm->vnet_main, xd->hw_if_index, 0); @@ -52,7 +56,10 @@ dpdk_device_setup (dpdk_device_t * xd) xd->tx_q_used, &xd->port_conf); if (rv < 0) - return dpdk_error_return (err, "rte_eth_dev_configure", xd, rv); + { + dpdk_device_error (xd, "rte_eth_dev_configure", rv); + goto error; + } /* Set up one TX-queue per worker thread */ for (j = 0; j < xd->tx_q_used; j++) @@ -65,7 +72,7 @@ dpdk_device_setup (dpdk_device_t * xd) rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc, SOCKET_ID_ANY, &xd->tx_conf); if (rv < 0) - err = dpdk_error_return (err, 
"rte_eth_tx_queue_setup", xd, rv); + dpdk_device_error (xd, "rte_eth_tx_queue_setup", rv); } for (j = 0; j < xd->rx_q_used; j++) @@ -86,28 +93,40 @@ dpdk_device_setup (dpdk_device_t * xd) dm->pktmbuf_pools[socket_id]); if (rv < 0) - err = dpdk_error_return (err, "rte_eth_rx_queue_setup", xd, rv); + dpdk_device_error (xd, "rte_eth_rx_queue_setup", rv); } - if (err) - return err; + if (vec_len (xd->errors)) + goto error; if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) - err = dpdk_device_start (xd); + dpdk_device_start (xd); + + if (vec_len (xd->errors)) + goto error; + + return; - return err; +error: + xd->flags |= DPDK_DEVICE_FLAG_PMD_INIT_FAIL; + sw->flags |= VNET_SW_INTERFACE_FLAG_ERROR; } -clib_error_t * +void dpdk_device_start (dpdk_device_t * xd) { int rv; - clib_error_t *err = 0; + + if (xd->flags & DPDK_DEVICE_FLAG_PMD_INIT_FAIL) + return; rv = rte_eth_dev_start (xd->device_index); if (rv) - return dpdk_error_return (err, "rte_eth_dev_start", xd, rv); + { + dpdk_device_error (xd, "rte_eth_dev_start", rv); + return; + } if (xd->default_mac_address) rv = @@ -116,7 +135,7 @@ dpdk_device_start (dpdk_device_t * xd) xd->default_mac_address); if (rv) - err = dpdk_error_return (err, "rte_eth_dev_default_mac_addr_set", xd, rv); + dpdk_device_error (xd, "rte_eth_dev_default_mac_addr_set", rv); if (xd->flags & DPDK_DEVICE_FLAG_PROMISC) rte_eth_promiscuous_enable (xd->device_index); @@ -135,16 +154,28 @@ dpdk_device_start (dpdk_device_t * xd) rte_eth_allmulticast_enable (dpdk_port); } } - - return err; } -clib_error_t * +void dpdk_device_stop (dpdk_device_t * xd) { + if (xd->flags & DPDK_DEVICE_FLAG_PMD_INIT_FAIL) + return; + + rte_eth_allmulticast_disable (xd->device_index); rte_eth_dev_stop (xd->device_index); - return 0; + /* For bonded interface, stop slave links */ + if (xd->pmd == VNET_DPDK_PMD_BOND) + { + u8 slink[16]; + int nlink = rte_eth_bond_slaves_get (xd->device_index, slink, 16); + while (nlink >= 1) + { + u8 dpdk_port = slink[--nlink]; + rte_eth_dev_stop 
(dpdk_port); + } + } } /* diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 51d6eacb..6a0b1acf 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -612,38 +612,16 @@ dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; dpdk_main_t *dm = &dpdk_main; dpdk_device_t *xd = vec_elt_at_index (dm->devices, hif->dev_instance); - int rv = 0; + + if (xd->flags & DPDK_DEVICE_FLAG_PMD_INIT_FAIL) + return clib_error_return (0, "Interface not initialized"); if (is_up) { f64 now = vlib_time_now (dm->vlib_main); if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0) - { - rv = rte_eth_dev_start (xd->device_index); - if (!rv && xd->default_mac_address) - rv = rte_eth_dev_default_mac_addr_set (xd->device_index, - (struct ether_addr *) - xd->default_mac_address); - } - - if (xd->flags & DPDK_DEVICE_FLAG_PROMISC) - rte_eth_promiscuous_enable (xd->device_index); - else - rte_eth_promiscuous_disable (xd->device_index); - - rte_eth_allmulticast_enable (xd->device_index); - - if (xd->pmd == VNET_DPDK_PMD_BOND) - { - u8 slink[16]; - int nlink = rte_eth_bond_slaves_get (xd->device_index, slink, 16); - while (nlink >= 1) - { - u8 dpdk_port = slink[--nlink]; - rte_eth_allmulticast_enable (dpdk_port); - } - } + dpdk_device_start (xd); xd->flags |= DPDK_DEVICE_FLAG_ADMIN_UP; dpdk_update_counters (xd, now); @@ -652,27 +630,12 @@ dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) else { xd->flags &= ~DPDK_DEVICE_FLAG_ADMIN_UP; - - rte_eth_allmulticast_disable (xd->device_index); vnet_hw_interface_set_flags (vnm, xd->hw_if_index, 0); - rte_eth_dev_stop (xd->device_index); - /* For bonded interface, stop slave links */ - if (xd->pmd == VNET_DPDK_PMD_BOND) - { - u8 slink[16]; - int nlink = rte_eth_bond_slaves_get (xd->device_index, slink, 16); - while (nlink >= 1) - { - u8 dpdk_port = slink[--nlink]; - rte_eth_dev_stop 
(dpdk_port); - } - } + if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0) + dpdk_device_stop (xd); } - if (rv < 0) - clib_warning ("rte_eth_dev_%s error: %d", is_up ? "start" : "stop", rv); - return /* no error */ 0; } diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 7e974491..6364d163 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -166,7 +166,7 @@ typedef struct #define DPDK_DEVICE_FLAG_ADMIN_UP (1 << 0) #define DPDK_DEVICE_FLAG_PROMISC (1 << 1) #define DPDK_DEVICE_FLAG_PMD (1 << 2) - +#define DPDK_DEVICE_FLAG_PMD_INIT_FAIL (1 << 3) #define DPDK_DEVICE_FLAG_MAYBE_MULTISEG (1 << 4) #define DPDK_DEVICE_FLAG_HAVE_SUBIF (1 << 5) #define DPDK_DEVICE_FLAG_HQOS (1 << 6) @@ -207,6 +207,9 @@ typedef struct /* mac address */ u8 *default_mac_address; + + /* error string */ + clib_error_t *errors; } dpdk_device_t; #define DPDK_STATS_POLL_INTERVAL (10.0) @@ -399,9 +402,9 @@ typedef struct u8 data[256]; /* First 256 data bytes, used for hexdump */ } dpdk_rx_dma_trace_t; -clib_error_t *dpdk_device_setup (dpdk_device_t * xd); -clib_error_t *dpdk_device_start (dpdk_device_t * xd); -clib_error_t *dpdk_device_stop (dpdk_device_t * xd); +void dpdk_device_setup (dpdk_device_t * xd); +void dpdk_device_start (dpdk_device_t * xd); +void dpdk_device_stop (dpdk_device_t * xd); #define foreach_dpdk_error \ _(NONE, "no error") \ @@ -424,6 +427,7 @@ void dpdk_update_link_state (dpdk_device_t * xd, f64 now); format_function_t format_dpdk_device_name; format_function_t format_dpdk_device; +format_function_t format_dpdk_device_errors; format_function_t format_dpdk_tx_dma_trace; format_function_t format_dpdk_rx_dma_trace; format_function_t format_dpdk_rte_mbuf; diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c index 3b75563d..0b67eae3 100644 --- a/src/plugins/dpdk/device/format.c +++ b/src/plugins/dpdk/device/format.c @@ -376,6 +376,20 @@ format_dpdk_tx_offload_caps (u8 * s, va_list * args) #undef 
_line_len #undef _ +u8 * +format_dpdk_device_errors (u8 * s, va_list * args) +{ + dpdk_device_t *xd = va_arg (*args, dpdk_device_t *); + clib_error_t *e; + uword indent = format_get_indent (s); + + vec_foreach (e, xd->errors) + { + s = format (s, "%U%v\n", format_white_space, indent, e->what); + } + return s; +} + u8 * format_dpdk_device (u8 * s, va_list * args) { @@ -511,6 +525,12 @@ format_dpdk_device (u8 * s, va_list * args) vec_free (xs); } + if (vec_len (xd->errors)) + { + s = format (s, "%UErrors:\n %U", format_white_space, indent, + format_dpdk_device_errors, xd); + } + return s; } diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 0ee28db5..bcb787c8 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -118,11 +118,7 @@ dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags) rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes); if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) - { - clib_error_t *error; - error = dpdk_device_start (xd); - clib_error_report (error); - } + dpdk_device_start (xd); } return old; @@ -223,7 +219,6 @@ dpdk_lib_init (dpdk_main_t * dm) u8 vlan_strip = 0; int j; struct rte_eth_dev_info dev_info; - clib_error_t *rv; struct rte_eth_link l; dpdk_device_config_t *devconf = 0; vlib_pci_addr_t pci_addr; @@ -570,13 +565,16 @@ dpdk_lib_init (dpdk_main_t * dm) hi = vnet_get_hw_interface (dm->vnet_main, xd->hw_if_index); - rv = dpdk_device_setup (xd); + dpdk_device_setup (xd); - if (rv) - return rv; + if (vec_len (xd->errors)) + clib_warning ("setup failed for device %U. 
Errors:\n %U", + format_dpdk_device_name, i, + format_dpdk_device_errors, xd); if (devconf->hqos_enabled) { + clib_error_t *rv; rv = dpdk_port_setup_hqos (xd, &devconf->hqos); if (rv) return rv; -- cgit 1.2.3-korg From 90b241536653f8acb1401588a2535dead75e5799 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 15 May 2017 18:35:37 +0200 Subject: dpdk: Enable hardware CRC strip for Intel VFs Change-Id: I60a7795761fe74ae5d57dacd03c343ddb77434d4 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/init.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index bcb787c8..545a15e8 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -360,6 +360,10 @@ dpdk_lib_init (dpdk_main_t * dm) case VNET_DPDK_PMD_IGBVF: case VNET_DPDK_PMD_IXGBEVF: case VNET_DPDK_PMD_I40EVF: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF; + xd->port_conf.rxmode.hw_strip_crc = 1; + break; + case VNET_DPDK_PMD_THUNDERX: xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF; break; @@ -380,6 +384,7 @@ dpdk_lib_init (dpdk_main_t * dm) /* Intel Red Rock Canyon */ case VNET_DPDK_PMD_FM10K: xd->port_type = VNET_DPDK_PORT_TYPE_ETH_SWITCH; + xd->port_conf.rxmode.hw_strip_crc = 1; break; /* virtio */ -- cgit 1.2.3-korg From 0a69734dfe9ea35e3983f0c6a71e7853037adb74 Mon Sep 17 00:00:00 2001 From: John Lo Date: Mon, 15 May 2017 19:21:15 -0400 Subject: Fix regression of setting device to admin-down state Change-Id: I0ffa572839405efe1170d6ddb073e53e9af02db7 Signed-off-by: John Lo --- src/plugins/dpdk/device/device.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 6a0b1acf..c68ecbda 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -618,22 +618,21 @@ dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 
hw_if_index, u32 flags) if (is_up) { - f64 now = vlib_time_now (dm->vlib_main); - + vnet_hw_interface_set_flags (vnm, xd->hw_if_index, + VNET_HW_INTERFACE_FLAG_LINK_UP); if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0) dpdk_device_start (xd); - xd->flags |= DPDK_DEVICE_FLAG_ADMIN_UP; + f64 now = vlib_time_now (dm->vlib_main); dpdk_update_counters (xd, now); dpdk_update_link_state (xd, now); } else { - xd->flags &= ~DPDK_DEVICE_FLAG_ADMIN_UP; vnet_hw_interface_set_flags (vnm, xd->hw_if_index, 0); - if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0) dpdk_device_stop (xd); + xd->flags &= ~DPDK_DEVICE_FLAG_ADMIN_UP; } return /* no error */ 0; -- cgit 1.2.3-korg From 10980465ce97eceff05ac94a69a13d63d3cfa70a Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 22 May 2017 15:32:04 +0200 Subject: dpdk: make dpdk global debug level configurable Adds startup.conf knob: dpdk { log-level debug } Change-Id: I80dfbc00559528d7b0970958fba9f08d97aa7118 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/dpdk.h | 2 +- src/plugins/dpdk/device/format.c | 54 ++++++++++++++++++---------------------- src/plugins/dpdk/device/init.c | 7 ++++-- 3 files changed, 30 insertions(+), 33 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 6364d163..ab5da567 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -432,7 +432,7 @@ format_function_t format_dpdk_tx_dma_trace; format_function_t format_dpdk_rx_dma_trace; format_function_t format_dpdk_rte_mbuf; format_function_t format_dpdk_rx_rte_mbuf; -unformat_function_t unformat_socket_mem; +unformat_function_t unformat_dpdk_log_level; clib_error_t *unformat_rss_fn (unformat_input_t * input, uword * rss_fn); clib_error_t *unformat_hqos (unformat_input_t * input, dpdk_device_config_hqos_t * hqos); diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c index 0b67eae3..8fe66891 100644 --- 
a/src/plugins/dpdk/device/format.c +++ b/src/plugins/dpdk/device/format.c @@ -143,6 +143,16 @@ foreach_dpdk_pkt_rx_offload_flag \ foreach_dpdk_pkt_tx_offload_flag +#define foreach_dpdk_log_level \ + _ (EMERG, "emergency") \ + _ (ALERT, "alert") \ + _ (CRIT, "critical") \ + _ (ERR, "error") \ + _ (WARNING, "warning") \ + _ (NOTICE, "notice") \ + _ (INFO, "info") \ + _ (DEBUG, "debug") + u8 * format_dpdk_device_name (u8 * s, va_list * args) { @@ -704,36 +714,6 @@ format_dpdk_rte_mbuf (u8 * s, va_list * va) return s; } -/* FIXME is this function used? */ -#if 0 -uword -unformat_socket_mem (unformat_input_t * input, va_list * va) -{ - uword **r = va_arg (*va, uword **); - int i = 0; - u32 mem; - - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - { - if (unformat (input, ",")) - hash_set (*r, i, 1024); - else if (unformat (input, "%u,", &mem)) - hash_set (*r, i, mem); - else if (unformat (input, "%u", &mem)) - hash_set (*r, i, mem); - else - { - unformat_put_input (input); - goto done; - } - i++; - } - -done: - return 1; -} -#endif - clib_error_t * unformat_rss_fn (unformat_input_t * input, uword * rss_fn) { @@ -757,6 +737,20 @@ unformat_rss_fn (unformat_input_t * input, uword * rss_fn) return 0; } +uword +unformat_dpdk_log_level (unformat_input_t * input, va_list * args) +{ + u32 *r = va_arg (*args, u32 *); + + if (0); +#define _(v,s) else if (unformat (input, s)) *r = RTE_LOG_##v; + foreach_dpdk_log_level +#undef _ + else + return 0; + return 1; +} + clib_error_t * unformat_hqos (unformat_input_t * input, dpdk_device_config_hqos_t * hqos) { diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 545a15e8..a972394c 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -810,6 +810,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) dpdk_device_config_t *devconf; vlib_pci_addr_t pci_addr; unformat_input_t sub_input; + uword x; u8 *s, *tmp = 0; u8 *rte_cmd = 0, *ethname = 0; u32 log_level; 
@@ -822,6 +823,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) u8 *socket_mem = 0; conf->device_config_index_by_pci_addr = hash_create (0, sizeof (uword)); + log_level = RTE_LOG_NOTICE; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -838,6 +840,9 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) else if (unformat (input, "decimal-interface-names")) conf->interface_name_format_decimal = 1; + else if (unformat (input, "log-level %U", unformat_dpdk_log_level, &x)) + log_level = x; + else if (unformat (input, "no-multi-seg")) conf->no_multi_seg = 1; @@ -1175,8 +1180,6 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) /* Set up DPDK eal and packet mbuf pool early. */ - log_level = (CLIB_DEBUG > 0) ? RTE_LOG_DEBUG : RTE_LOG_NOTICE; - #if RTE_VERSION >= RTE_VERSION_NUM(17, 5, 0, 0) rte_log_set_global_level (log_level); #else -- cgit 1.2.3-korg From 5c89c420861ab938151f0bb0a298bb04d590ff2b Mon Sep 17 00:00:00 2001 From: Mohammed Hawari Date: Wed, 17 May 2017 15:52:02 +0200 Subject: Added support for Virtio-user interfaces by means of a new VNET_DPDK_PORT_TYPE. 
Change-Id: I101e32cee8d9de51227b39ec2639c9fb44da1e6c Signed-off-by: Mohammed Hawari --- src/plugins/dpdk/device/dpdk.h | 5 ++++- src/plugins/dpdk/device/format.c | 8 ++++++++ src/plugins/dpdk/device/init.c | 4 ++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index ab5da567..d82ba5dd 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -73,7 +73,9 @@ extern vlib_node_registration_t dpdk_input_node; _ ("net_cxgbe", CXGBE) \ _ ("net_mlx4", MLX4) \ _ ("net_mlx5", MLX5) \ - _ ("net_dpaa2", DPAA2) + _ ("net_dpaa2", DPAA2) \ + _ ("net_virtio_user", VIRTIO_USER) + typedef enum { @@ -95,6 +97,7 @@ typedef enum VNET_DPDK_PORT_TYPE_ETH_SWITCH, VNET_DPDK_PORT_TYPE_AF_PACKET, VNET_DPDK_PORT_TYPE_ETH_VF, + VNET_DPDK_PORT_TYPE_VIRTIO_USER, VNET_DPDK_PORT_TYPE_UNKNOWN, } dpdk_port_type_t; diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c index 8fe66891..c4ddbe24 100644 --- a/src/plugins/dpdk/device/format.c +++ b/src/plugins/dpdk/device/format.c @@ -205,6 +205,10 @@ format_dpdk_device_name (u8 * s, va_list * args) rte_eth_dev_info_get (i, &dev_info); return format (s, "af_packet%d", dm->devices[i].port_id); + case VNET_DPDK_PORT_TYPE_VIRTIO_USER: + device_name = "VirtioUser"; + break; + default: case VNET_DPDK_PORT_TYPE_UNKNOWN: device_name = "UnknownEthernet"; @@ -302,6 +306,10 @@ format_dpdk_device_type (u8 * s, va_list * args) dev_type = "NXP DPAA2 Mac"; break; + case VNET_DPDK_PMD_VIRTIO_USER: + dev_type = "Virtio User"; + break; + default: case VNET_DPDK_PMD_UNKNOWN: dev_type = "### UNKNOWN ###"; diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index a972394c..90968075 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -410,6 +410,10 @@ dpdk_lib_init (dpdk_main_t * dm) xd->port_id = bond_ether_port_id++; break; + case 
VNET_DPDK_PMD_VIRTIO_USER: + xd->port_type = VNET_DPDK_PORT_TYPE_VIRTIO_USER; + break; + default: xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; } -- cgit 1.2.3-korg From 8b81cb43359380e50d3fc216d93ff05894149939 Mon Sep 17 00:00:00 2001 From: John Lo Date: Mon, 26 Jun 2017 01:40:20 -0400 Subject: Send GARP/NA on bonded intf slave up/down if in active-backup mode If a bonded interface is in active-backup mode and configured with IPv4 and/or IPv6 addresses, on slave interface link up/down, send a GARP packet if configured with an IPv4 address and an unsolicited NA if configured with an IPv6 address. These packets can help with faster route convergence in the next hop router/switch. Change-Id: I68ccb11a4a40cda414704fa08ee0171c952befa2 Signed-off-by: John Lo --- src/plugins/dpdk/device/common.c | 62 ++++++++++++++++++++++++++++++ src/plugins/dpdk/device/dpdk.h | 8 ++++ src/plugins/dpdk/device/init.c | 23 +++++++---- src/vnet/ethernet/arp.c | 83 ++++++++++++++++++++++++++++++++++++++++ src/vnet/ethernet/arp_packet.h | 9 +++++ src/vnet/ip/ip6.h | 2 + src/vnet/ip/ip6_neighbor.c | 53 +++++++++++++++++++++++++ 7 files changed, 232 insertions(+), 8 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/common.c b/src/plugins/dpdk/device/common.c index 1a9688e7..df52c58f 100644 --- a/src/plugins/dpdk/device/common.c +++ b/src/plugins/dpdk/device/common.c @@ -12,13 +12,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include #include #include #include #include +#include #include +#include #include #include @@ -178,6 +181,65 @@ dpdk_device_stop (dpdk_device_t * xd) } } +void +dpdk_port_state_callback (uint8_t port_id, + enum rte_eth_event_type type, void *param) +{ + struct rte_eth_link link; + vlib_main_t *vm = vlib_get_main (); + dpdk_device_t *xd = &dpdk_main.devices[port_id]; + + RTE_SET_USED (param); + if (type != RTE_ETH_EVENT_INTR_LSC) + { + clib_warning ("Unknown event %d received for port %d", type, port_id); + return; + } + + rte_eth_link_get_nowait (port_id, &link); + u8 link_up = link.link_status; + + if (xd->flags & DPDK_DEVICE_FLAG_BOND_SLAVE) + { + u8 bd_port = xd->bond_port; + int bd_mode = rte_eth_bond_mode_get (bd_port); + + if ((link_up && !(xd->flags & DPDK_DEVICE_FLAG_BOND_SLAVE_UP)) || + (!link_up && (xd->flags & DPDK_DEVICE_FLAG_BOND_SLAVE_UP))) + { + clib_warning ("Port %d state to %s, " + "slave of port %d BondEthernet%d in mode %d", + port_id, (link_up) ? "UP" : "DOWN", + bd_port, xd->port_id, bd_mode); + if (bd_mode == BONDING_MODE_ACTIVE_BACKUP) + { + rte_eth_link_get_nowait (bd_port, &link); + if (link.link_status) /* bonded interface up */ + { + u32 hw_if_index = dpdk_main.devices[bd_port].hw_if_index; + vlib_process_signal_event + (vm, send_garp_na_process_node_index, SEND_GARP_NA, + hw_if_index); + } + } + } + if (link_up) /* Update slave link status */ + xd->flags |= DPDK_DEVICE_FLAG_BOND_SLAVE_UP; + else + xd->flags &= ~DPDK_DEVICE_FLAG_BOND_SLAVE_UP; + } + else /* Should not happen as callback not setup for "normal" links */ + { + if (link_up) + clib_warning ("Port %d Link Up - speed %u Mbps - %s", + port_id, (unsigned) link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 
+ "full-duplex" : "half-duplex"); + else + clib_warning ("Port %d Link Down\n\n", port_id); + } +} + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index d82ba5dd..c6fd7388 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -173,6 +173,8 @@ typedef struct #define DPDK_DEVICE_FLAG_MAYBE_MULTISEG (1 << 4) #define DPDK_DEVICE_FLAG_HAVE_SUBIF (1 << 5) #define DPDK_DEVICE_FLAG_HQOS (1 << 6) +#define DPDK_DEVICE_FLAG_BOND_SLAVE (1 << 7) +#define DPDK_DEVICE_FLAG_BOND_SLAVE_UP (1 << 8) u16 nb_tx_desc; CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); @@ -197,6 +199,10 @@ typedef struct /* af_packet or BondEthernet instance number */ u8 port_id; + /* Bonded interface port# of a slave - + only valid if DPDK_DEVICE_FLAG_BOND_SLAVE bit is set */ + u8 bond_port; + struct rte_eth_link link; f64 time_last_link_update; @@ -408,6 +414,8 @@ typedef struct void dpdk_device_setup (dpdk_device_t * xd); void dpdk_device_start (dpdk_device_t * xd); void dpdk_device_stop (dpdk_device_t * xd); +void dpdk_port_state_callback (uint8_t port_id, + enum rte_eth_event_type type, void *param); #define foreach_dpdk_error \ _(NONE, "no error") \ diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 90968075..d9ab0756 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -1373,8 +1373,10 @@ dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) /* * Extra set up for bond interfaces: * 1. Setup MACs for bond interfaces and their slave links which was set - * in dpdk_device_setup() but needs to be done again here to take effect. - * 2. Set up info for bond interface related CLI support. + * in dpdk_device_setup() but needs to be done again here to take + * effect. + * 2. Set up info and register slave link state change callback handling. + * 3. Set up info for bond interface related CLI support. 
*/ int nports = rte_eth_dev_count (); if (nports > 0) @@ -1399,7 +1401,8 @@ dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) (slink[0], (struct ether_addr *) addr); /* Set MAC of bounded interface to that of 1st slave link */ - clib_warning ("Set MAC for bond dev# %d", i); + clib_warning ("Set MAC for bond port %d BondEthernet%d", + i, xd->port_id); rv = rte_eth_bond_mac_address_set (i, (struct ether_addr *) addr); if (rv) @@ -1428,34 +1431,38 @@ dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) /* Add MAC to all slave links except the first one */ if (nlink) { - clib_warning ("Add MAC for slave dev# %d", slave); + clib_warning ("Add MAC for slave port %d", slave); rv = rte_eth_dev_mac_addr_add (slave, (struct ether_addr *) addr, 0); if (rv) clib_warning ("Add MAC addr failure rv=%d", rv); } + /* Setup slave link state change callback handling */ + rte_eth_dev_callback_register + (slave, RTE_ETH_EVENT_INTR_LSC, + dpdk_port_state_callback, NULL); + dpdk_device_t *sxd = &dm->devices[slave]; + sxd->flags |= DPDK_DEVICE_FLAG_BOND_SLAVE; + sxd->bond_port = i; /* Set slaves bitmap for bonded interface */ bhi->bond_info = clib_bitmap_set (bhi->bond_info, sdev->hw_if_index, 1); - /* Set slave link flags on slave interface */ + /* Set MACs and slave link flags on slave interface */ shi = vnet_get_hw_interface (vnm, sdev->hw_if_index); ssi = vnet_get_sw_interface (vnm, sdev->vlib_sw_if_index); sei = pool_elt_at_index (em->interfaces, shi->hw_instance); - shi->bond_info = VNET_HW_INTERFACE_BOND_INFO_SLAVE; ssi->flags |= VNET_SW_INTERFACE_FLAG_BOND_SLAVE; clib_memcpy (shi->hw_address, addr, 6); clib_memcpy (sei->address, addr, 6); - /* Set l3 packet size allowed as the lowest of slave */ if (bhi->max_l3_packet_bytes[VLIB_RX] > shi->max_l3_packet_bytes[VLIB_RX]) bhi->max_l3_packet_bytes[VLIB_RX] = bhi->max_l3_packet_bytes[VLIB_TX] = shi->max_l3_packet_bytes[VLIB_RX]; - /* Set max packet size allowed as the lowest of slave */ 
if (bhi->max_packet_bytes > shi->max_packet_bytes) bhi->max_packet_bytes = shi->max_packet_bytes; diff --git a/src/vnet/ethernet/arp.c b/src/vnet/ethernet/arp.c index d5dc9cce..df681750 100644 --- a/src/vnet/ethernet/arp.c +++ b/src/vnet/ethernet/arp.c @@ -110,6 +110,9 @@ typedef struct static const u8 vrrp_prefix[] = { 0x00, 0x00, 0x5E, 0x00, 0x01 }; +/* Node index for send_garp_na_process */ +u32 send_garp_na_process_node_index; + static void set_ip4_over_ethernet_rpc_callback (vnet_arp_set_ip4_over_ethernet_rpc_args_t * a); @@ -2378,6 +2381,86 @@ ethernet_arp_change_mac (u32 sw_if_index) /* *INDENT-ON* */ } +void static +send_ip4_garp (vlib_main_t * vm, vnet_hw_interface_t * hi) +{ + ip4_main_t *i4m = &ip4_main; + u32 sw_if_index = hi->sw_if_index; + ip4_address_t *ip4_addr = ip4_interface_first_address (i4m, sw_if_index, 0); + + if (ip4_addr) + { + clib_warning ("Sending GARP for IP4 address %U on sw_if_idex %d", + format_ip4_address, ip4_addr, sw_if_index); + + /* Form GARP packet for output - Gratuitous ARP is an ARP request packet + where the interface IP/MAC pair is used for both source and request + MAC/IP pairs in the request */ + u32 bi = 0; + ethernet_arp_header_t *h = vlib_packet_template_get_packet + (vm, &i4m->ip4_arp_request_packet_template, &bi); + clib_memcpy (h->ip4_over_ethernet[0].ethernet, hi->hw_address, + sizeof (h->ip4_over_ethernet[0].ethernet)); + clib_memcpy (h->ip4_over_ethernet[1].ethernet, hi->hw_address, + sizeof (h->ip4_over_ethernet[1].ethernet)); + h->ip4_over_ethernet[0].ip4 = ip4_addr[0]; + h->ip4_over_ethernet[1].ip4 = ip4_addr[0]; + + /* Setup MAC header with ARP Etype and broadcast DMAC */ + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + vlib_buffer_advance (b, -sizeof (ethernet_header_t)); + ethernet_header_t *e = vlib_buffer_get_current (b); + e->type = clib_host_to_net_u16 (ETHERNET_TYPE_ARP); + clib_memcpy (e->src_address, hi->hw_address, sizeof (e->src_address)); + memset (e->dst_address, 0xff, sizeof (e->dst_address)); 
+ + /* Send GARP packet out the specified interface */ + vnet_buffer (b)->sw_if_index[VLIB_RX] = + vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index; + vlib_frame_t *f = vlib_get_frame_to_node (vm, hi->output_node_index); + u32 *to_next = vlib_frame_vector_args (f); + to_next[0] = bi; + f->n_vectors = 1; + vlib_put_frame_to_node (vm, hi->output_node_index, f); + } +} + +static vlib_node_registration_t send_garp_na_proc_node; + +static uword +send_garp_na_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + vnet_main_t *vnm = vnet_get_main (); + uword event_type, *event_data = 0; + + send_garp_na_process_node_index = send_garp_na_proc_node.index; + + while (1) + { + vlib_process_wait_for_event (vm); + event_type = vlib_process_get_events (vm, &event_data); + if ((event_type == SEND_GARP_NA) && (vec_len (event_data) >= 1)) + { + u32 hw_if_index = event_data[0]; + vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index); + send_ip4_garp (vm, hi); + send_ip6_na (vm, hi); + } + vec_reset_length (event_data); + } + return 0; +} + + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (send_garp_na_proc_node, static) = { + .function = send_garp_na_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "send-garp-na-process", +}; +/* *INDENT-ON* */ + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/ethernet/arp_packet.h b/src/vnet/ethernet/arp_packet.h index 17e64f43..d740b844 100644 --- a/src/vnet/ethernet/arp_packet.h +++ b/src/vnet/ethernet/arp_packet.h @@ -167,6 +167,15 @@ typedef struct ethernet_arp_ip4_entry_t *ip4_neighbor_entries (u32 sw_if_index); u8 *format_ethernet_arp_ip4_entry (u8 * s, va_list * va); +/* Node index for send_garp_na_process */ +extern u32 send_garp_na_process_node_index; + +/* Even type for send_garp_na_process */ +enum +{ + SEND_GARP_NA = 1, +} dpdk_send_garp_na_process_event_t; + #endif /* included_ethernet_arp_packet_h */ /* diff --git a/src/vnet/ip/ip6.h b/src/vnet/ip/ip6.h index 
d623c95f..cf52994e 100644 --- a/src/vnet/ip/ip6.h +++ b/src/vnet/ip/ip6.h @@ -375,6 +375,8 @@ int vnet_ip6_nd_term (vlib_main_t * vm, ethernet_header_t * eth, ip6_header_t * ip, u32 sw_if_index, u16 bd_index); +void send_ip6_na (vlib_main_t * vm, vnet_hw_interface_t * hi); + u8 *format_ip6_forward_next_trace (u8 * s, va_list * args); u32 ip6_tcp_udp_icmp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0); diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c index ba7ea143..b8f6f9b1 100644 --- a/src/vnet/ip/ip6_neighbor.c +++ b/src/vnet/ip/ip6_neighbor.c @@ -4192,6 +4192,59 @@ ethernet_ndp_change_mac (u32 sw_if_index) /* *INDENT-ON* */ } +void +send_ip6_na (vlib_main_t * vm, vnet_hw_interface_t * hi) +{ + ip6_main_t *i6m = &ip6_main; + u32 sw_if_index = hi->sw_if_index; + ip6_address_t *ip6_addr = ip6_interface_first_address (i6m, sw_if_index); + if (ip6_addr) + { + clib_warning + ("Sending unsolicitated NA IP6 address %U on sw_if_idex %d", + format_ip6_address, ip6_addr, sw_if_index); + + /* Form unsolicited neighbor advertisement packet from NS pkt template */ + int bogus_length; + u32 bi = 0; + icmp6_neighbor_solicitation_header_t *h = + vlib_packet_template_get_packet (vm, + &i6m->discover_neighbor_packet_template, + &bi); + ip6_set_reserved_multicast_address (&h->ip.dst_address, + IP6_MULTICAST_SCOPE_link_local, + IP6_MULTICAST_GROUP_ID_all_hosts); + h->ip.src_address = ip6_addr[0]; + h->neighbor.icmp.type = ICMP6_neighbor_advertisement; + h->neighbor.target_address = ip6_addr[0]; + h->neighbor.advertisement_flags = clib_host_to_net_u32 + (ICMP6_NEIGHBOR_ADVERTISEMENT_FLAG_OVERRIDE); + clib_memcpy (h->link_layer_option.ethernet_address, + hi->hw_address, vec_len (hi->hw_address)); + h->neighbor.icmp.checksum = + ip6_tcp_udp_icmp_compute_checksum (vm, 0, &h->ip, &bogus_length); + ASSERT (bogus_length == 0); + + /* Setup MAC header with IP6 Etype and mcast DMAC */ + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + vlib_buffer_advance (b, 
-sizeof (ethernet_header_t)); + ethernet_header_t *e = vlib_buffer_get_current (b); + e->type = clib_host_to_net_u16 (ETHERNET_TYPE_IP6); + clib_memcpy (e->src_address, hi->hw_address, sizeof (e->src_address)); + ip6_multicast_ethernet_address (e->dst_address, + IP6_MULTICAST_GROUP_ID_all_hosts); + + /* Send unsolicited ND advertisement packet out the specified interface */ + vnet_buffer (b)->sw_if_index[VLIB_RX] = + vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index; + vlib_frame_t *f = vlib_get_frame_to_node (vm, hi->output_node_index); + u32 *to_next = vlib_frame_vector_args (f); + to_next[0] = bi; + f->n_vectors = 1; + vlib_put_frame_to_node (vm, hi->output_node_index, f); + } +} + /* * fd.io coding-style-patch-verification: ON * -- cgit 1.2.3-korg From 475674ee5aa4e130a0ac0caf08ef9d579b8604b7 Mon Sep 17 00:00:00 2001 From: Chris Luke Date: Wed, 5 Jul 2017 18:02:53 -0400 Subject: unix: mkdir VPP_RUN_DIR before opening a socket in it Change https://gerrit.fd.io/r/#/c/7230/ added a Unix domain CLI socket in the default startup.conf; however unless you had previously run VPP with the DPDK plugin enabled the directory that it is created in. /run/vpp, would not exist and startup would fail. This directory is typically hosted in a tmpfs ramdisk and is thus ephemeral. This patch adds a function that attempts to mkdir VPP_RUN_DIR and uses it in both the DPDK plugin and the CLI code if the CLI socket is to be created in that directory. 
Change-Id: Ibbf925819099dce2b5eb0fa238b9edca1036d6fd Signed-off-by: Chris Luke --- src/plugins/dpdk/device/init.c | 14 +++++--------- src/vlib/unix/cli.c | 11 +++++++++++ src/vlib/unix/unix.h | 6 ++++++ src/vlib/unix/util.c | 13 +++++++++++++ 4 files changed, 35 insertions(+), 9 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index d9ab0756..04344f74 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -37,8 +37,7 @@ dpdk_main_t dpdk_main; #define LINK_STATE_ELOGS 0 -#define DEFAULT_HUGE_DIR "/run/vpp/hugepages" -#define VPP_RUN_DIR "/run/vpp" +#define DEFAULT_HUGE_DIR (VPP_RUN_DIR "/hugepages") /* Port configuration, mildly modified Intel app values */ @@ -1047,13 +1046,10 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) vec_free (mem_by_socket); - rv = mkdir (VPP_RUN_DIR, 0755); - if (rv && errno != EEXIST) - { - error = clib_error_return (0, "mkdir '%s' failed errno %d", - VPP_RUN_DIR, errno); - goto done; - } + /* Make sure VPP_RUN_DIR exists */ + error = unix_make_vpp_run_dir (); + if (error) + goto done; rv = mkdir (DEFAULT_HUGE_DIR, 0755); if (rv && errno != EEXIST) diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index 953d133c..1befa25d 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -2642,6 +2642,17 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) /* CLI listen. */ unix_file_t template = { 0 }; + /* If our listen address looks like a path and it starts with + * VPP_RUN_DIR, go make sure VPP_RUN_DIR exists before trying to open + * a socket in it. 
+ */ + if (strncmp (s->config, VPP_RUN_DIR "/", strlen (VPP_RUN_DIR) + 1) == 0) + { + error = unix_make_vpp_run_dir (); + if (error) + return error; + } + s->flags = SOCKET_IS_SERVER | /* listen, don't connect */ SOCKET_ALLOW_GROUP_WRITE; /* PF_LOCAL socket only */ error = clib_socket_init (s); diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index de607c0f..ffa92bba 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -43,6 +43,10 @@ #include #include + +/** VPP runtime ephemeral directory. Typically stored in a tmpfs. */ +#define VPP_RUN_DIR "/run/vpp" + struct unix_file; typedef clib_error_t *(unix_file_function_t) (struct unix_file * f); @@ -229,6 +233,8 @@ clib_error_t *foreach_directory_file (char *dir_name, u8 * file_name), void *arg, int scan_dirs); +clib_error_t *unix_make_vpp_run_dir (void); + #endif /* included_unix_unix_h */ /* diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c index edc3e591..51b4a4ed 100644 --- a/src/vlib/unix/util.c +++ b/src/vlib/unix/util.c @@ -222,6 +222,19 @@ done: return r; } +clib_error_t * +unix_make_vpp_run_dir (void) +{ + int rv; + + rv = mkdir (VPP_RUN_DIR, 0755); + if (rv && errno != EEXIST) + return clib_error_return (0, "mkdir '%s' failed errno %d", + VPP_RUN_DIR, errno); + + return 0; +} + /* * fd.io coding-style-patch-verification: ON * -- cgit 1.2.3-korg From 9b2cca99e28ef60ac8f10894711e49683ac7bb2a Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 5 Jul 2017 02:04:36 +0200 Subject: dpdk: add FiftyGigabitEtherenet interface support Change-Id: Ied8b26179cdf4add34440a9c396cb821716cfb8e Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/dpdk.h | 1 + src/plugins/dpdk/device/format.c | 4 ++++ src/plugins/dpdk/device/init.c | 2 ++ 3 files changed, 7 insertions(+) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index c6fd7388..a2686978 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ 
-92,6 +92,7 @@ typedef enum VNET_DPDK_PORT_TYPE_ETH_10G, VNET_DPDK_PORT_TYPE_ETH_25G, VNET_DPDK_PORT_TYPE_ETH_40G, + VNET_DPDK_PORT_TYPE_ETH_50G, VNET_DPDK_PORT_TYPE_ETH_100G, VNET_DPDK_PORT_TYPE_ETH_BOND, VNET_DPDK_PORT_TYPE_ETH_SWITCH, diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c index c4ddbe24..403d7204 100644 --- a/src/plugins/dpdk/device/format.c +++ b/src/plugins/dpdk/device/format.c @@ -186,6 +186,10 @@ format_dpdk_device_name (u8 * s, va_list * args) device_name = "FortyGigabitEthernet"; break; + case VNET_DPDK_PORT_TYPE_ETH_50G: + device_name = "FiftyGigabitEthernet"; + break; + case VNET_DPDK_PORT_TYPE_ETH_100G: device_name = "HundredGigabitEthernet"; break; diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 04344f74..9602e93c 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -60,6 +60,8 @@ port_type_from_speed_capa (struct rte_eth_dev_info *dev_info) if (dev_info->speed_capa & ETH_LINK_SPEED_100G) return VNET_DPDK_PORT_TYPE_ETH_100G; + else if (dev_info->speed_capa & ETH_LINK_SPEED_50G) + return VNET_DPDK_PORT_TYPE_ETH_50G; else if (dev_info->speed_capa & ETH_LINK_SPEED_40G) return VNET_DPDK_PORT_TYPE_ETH_40G; else if (dev_info->speed_capa & ETH_LINK_SPEED_25G) -- cgit 1.2.3-korg From 042a621b90c9f521b546cbbf724bb908e36f3b25 Mon Sep 17 00:00:00 2001 From: Steve Shin Date: Fri, 7 Jul 2017 14:57:46 -0700 Subject: lldp packet transmission on a bonded interface LLDP packets are dropped at interface output node if each slave's link is configured as the LLDP interface. The admin state is configured and managed by the bonded interface, so slave link's state is down by default. The checking for the admin state UP should be ignored for the slave link. 
Change-Id: I06ca250f42fcb8cc50e0ea3a3817a2c5b56865df Signed-off-by: Steve Shin --- src/plugins/dpdk/device/init.c | 6 +++--- src/vnet/interface_output.c | 6 ++++-- src/vnet/lldp/lldp_cli.c | 6 ++++-- 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 9602e93c..17e77618 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -1268,9 +1268,9 @@ dpdk_update_link_state (dpdk_device_t * xd, f64 now) ed->new_link_state = (u8) xd->link.link_status; } - if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) && - ((xd->link.link_status != 0) ^ - vnet_hw_interface_is_link_up (vnm, xd->hw_if_index))) + if ((xd->flags & (DPDK_DEVICE_FLAG_ADMIN_UP | DPDK_DEVICE_FLAG_BOND_SLAVE)) + && ((xd->link.link_status != 0) ^ + vnet_hw_interface_is_link_up (vnm, xd->hw_if_index))) { hw_flags_chg = 1; hw_flags |= (xd->link.link_status ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0); diff --git a/src/vnet/interface_output.c b/src/vnet/interface_output.c index 3f45cb9f..846eb57b 100644 --- a/src/vnet/interface_output.c +++ b/src/vnet/interface_output.c @@ -259,7 +259,8 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, si = vnet_get_sw_interface (vnm, rt->sw_if_index); hi = vnet_get_sup_hw_interface (vnm, rt->sw_if_index); - if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) || + if (!(si->flags & (VNET_SW_INTERFACE_FLAG_ADMIN_UP | + VNET_SW_INTERFACE_FLAG_BOND_SLAVE)) || !(hi->flags & VNET_HW_INTERFACE_FLAG_LINK_UP)) { vlib_simple_counter_main_t *cm; @@ -451,7 +452,8 @@ vnet_interface_output_node (vlib_main_t * vm, si = vnet_get_sw_interface (vnm, rt->sw_if_index); hi = vnet_get_sup_hw_interface (vnm, rt->sw_if_index); - if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) || + if (!(si->flags & (VNET_SW_INTERFACE_FLAG_ADMIN_UP | + VNET_SW_INTERFACE_FLAG_BOND_SLAVE)) || !(hi->flags & VNET_HW_INTERFACE_FLAG_LINK_UP)) { vlib_simple_counter_main_t *cm; diff --git 
a/src/vnet/lldp/lldp_cli.c b/src/vnet/lldp/lldp_cli.c index af18f90f..1933ca8c 100644 --- a/src/vnet/lldp/lldp_cli.c +++ b/src/vnet/lldp/lldp_cli.c @@ -74,7 +74,8 @@ lldp_cfg_intf_set (u32 hw_if_index, u8 ** port_desc, int enable) const vnet_sw_interface_t *sw = vnet_get_sw_interface (lm->vnet_main, hi->sw_if_index); - if (sw->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) + if (sw->flags & (VNET_SW_INTERFACE_FLAG_ADMIN_UP | + VNET_SW_INTERFACE_FLAG_BOND_SLAVE)) { lldp_schedule_intf (lm, n); } @@ -501,7 +502,8 @@ format_lldp_intfs_detail (u8 * s, vlib_main_t * vm, const lldp_main_t * lm) hw = vnet_get_hw_interface(vnm, n->hw_if_index); sw = vnet_get_sw_interface(lm->vnet_main, hw->sw_if_index); /* Interface shutdown */ - if (!(sw->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)) + if (!(sw->flags & (VNET_SW_INTERFACE_FLAG_ADMIN_UP | + VNET_SW_INTERFACE_FLAG_BOND_SLAVE))) { s = format(s, "\nInterface name: %s\nInterface/peer state: " "interface down\nLast packet sent: %U\n", -- cgit 1.2.3-korg From 690d26c6b9ddbd1a252e0eff61a28a62fc740432 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 11 Jul 2017 17:13:37 +0200 Subject: dpdk: prefetch 2nd cacheline of rte_mbuf during tx Change-Id: I0db02dd0147dbd47d4296fdb84280d0e7d321f3c Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index c68ecbda..8801bfd3 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -307,7 +307,7 @@ dpdk_prefetch_buffer_by_index (vlib_main_t * vm, u32 bi) struct rte_mbuf *mb; b = vlib_get_buffer (vm, bi); mb = rte_mbuf_from_vlib_buffer (b); - CLIB_PREFETCH (mb, CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (mb, 2 * CLIB_CACHE_LINE_BYTES, STORE); CLIB_PREFETCH (b, CLIB_CACHE_LINE_BYTES, LOAD); } -- cgit 1.2.3-korg From a2522f6fd57eb93f57dfcc27c59862d4cc32879a Mon Sep 17 00:00:00 2001 
From: Chris Luke Date: Fri, 7 Jul 2017 14:57:07 -0400 Subject: dpdk: fix dpdk_buffer_pool_create name - vnet_buffer_pool_create should probably be named dpdk_buffer_pool_create since that is what it does. - Its prototype should also be in a DPDK plugin header, not in vlib/buffer_funcs.h, since the implementation is in the plugin and nobody else should be calling it. Change-Id: I7ba259afa4b888bc94f3ad257305e286b41e7370 Signed-off-by: Chris Luke --- src/plugins/dpdk/buffer.c | 2 +- src/plugins/dpdk/device/dpdk.h | 3 +++ src/plugins/dpdk/device/init.c | 4 ++-- src/vlib/buffer_funcs.h | 3 --- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index aa73eb6c..b0f247e1 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -427,7 +427,7 @@ dpdk_packet_template_init (vlib_main_t * vm, } clib_error_t * -vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, +dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, unsigned socket_id) { dpdk_main_t *dm = &dpdk_main; diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index a2686978..55f63b37 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -453,6 +453,9 @@ uword admin_up_down_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f); +clib_error_t *dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, + unsigned socket_id); + #endif /* __included_dpdk_h__ */ /* diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 17e77618..2ec1664b 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -1210,13 +1210,13 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) rte_dump_physmem_layout (stdout); /* main thread 1st */ - error = vlib_buffer_pool_create (vm, conf->num_mbufs, rte_socket_id ()); + error = dpdk_buffer_pool_create (vm, conf->num_mbufs, 
rte_socket_id ()); if (error) return error; for (i = 0; i < RTE_MAX_LCORE; i++) { - error = vlib_buffer_pool_create (vm, conf->num_mbufs, + error = dpdk_buffer_pool_create (vm, conf->num_mbufs, rte_lcore_to_socket_id (i)); if (error) return error; diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 79e3e69c..97442e12 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -237,9 +237,6 @@ vlib_buffer_set_known_state (vlib_main_t * vm, u8 *vlib_validate_buffer (vlib_main_t * vm, u32 buffer_index, uword follow_chain); -clib_error_t *vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, - unsigned socket_id); - /** \brief Allocate buffers into supplied array @param vm - (vlib_main_t *) vlib main data structure pointer -- cgit 1.2.3-korg From 072401e8096c648b91f958bd911f64ce24fecff9 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 13 Jul 2017 18:53:27 +0200 Subject: Introduce l{2,3,4}_hdr_offset fields in the buffer metadata To save space in the first cacheline following is changed: - total_length_not_including_first_buffer moved to the 2nd cacheline. This field is used only when VLIB_BUFFER_TOTAL_LENGTH_VALID and VLIB_BUFFER_NEXT_PRESENT are both set. - free_list_index is now stored in 4bits inside flags, which allows up to 16 free lists. 
In case we need more we can store index in the 2nd cacheline. Change-Id: Ic8521350819391af470d31d3fa1013e67ecb7681 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/node.c | 8 ++++++- src/vlib/buffer.c | 16 ++++++++----- src/vlib/buffer.h | 40 +++++++++++++++++--------------- src/vlib/buffer_funcs.h | 50 +++++++++++++++++++++++++++++----------- src/vnet/bfd/bfd_udp.c | 4 ++-- src/vnet/buffer.h | 14 +++-------- src/vnet/dhcp/dhcp4_proxy_node.c | 2 +- src/vnet/dhcp/dhcp6_proxy_node.c | 2 +- src/vnet/ethernet/ethernet.h | 3 +-- src/vnet/ethernet/node.c | 23 ++++++++---------- src/vnet/ip/ip4_forward.c | 6 ++--- src/vnet/ip/ip6_forward.c | 6 ++--- src/vnet/ip/ip6_neighbor.c | 19 +++++++-------- src/vnet/l2/l2_bvi.h | 2 +- src/vnet/lisp-cp/control.c | 2 +- src/vnet/replication.c | 6 ++--- 16 files changed, 111 insertions(+), 92 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index 69acc529..74fb8da1 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -208,7 +208,13 @@ dpdk_process_subseq_segs (vlib_main_t * vm, vlib_buffer_t * b, mb_seg = mb->next; b_chain = b; - while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs)) + if (mb->nb_segs < 2) + return; + + b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + b->total_length_not_including_first_buffer = 0; + + while (nb_seg < mb->nb_segs) { ASSERT (mb_seg != 0); diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index b2a095cf..53b60c16 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -72,8 +72,8 @@ format_vlib_buffer (u8 * s, va_list * args) uword indent = format_get_indent (s); s = format (s, "current data %d, length %d, free-list %d, clone-count %u", - b->current_data, b->current_length, b->free_list_index, - b->n_add_refs); + b->current_data, b->current_length, + vlib_buffer_get_free_list_index (b), b->n_add_refs); if (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID) s = format (s, ", totlen-nifb %d", @@
-163,10 +163,14 @@ vlib_validate_buffer_helper (vlib_main_t * vm, vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *fl; - if (pool_is_free_index (bm->buffer_free_list_pool, b->free_list_index)) - return format (0, "unknown free list 0x%x", b->free_list_index); + if (pool_is_free_index + (bm->buffer_free_list_pool, vlib_buffer_get_free_list_index (b))) + return format (0, "unknown free list 0x%x", + vlib_buffer_get_free_list_index (b)); - fl = pool_elt_at_index (bm->buffer_free_list_pool, b->free_list_index); + fl = + pool_elt_at_index (bm->buffer_free_list_pool, + vlib_buffer_get_free_list_index (b)); if ((signed) b->current_data < (signed) -VLIB_BUFFER_PRE_DATA_SIZE) return format (0, "current data %d before pre-data", b->current_data); @@ -388,7 +392,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, f->name = clib_mem_is_vec (name) ? name : format (0, "%s", name); /* Setup free buffer template. */ - f->buffer_init_template.free_list_index = f->index; + vlib_buffer_set_free_list_index (&f->buffer_init_template, f->index); f->buffer_init_template.n_add_refs = 0; if (is_public) diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index b20538b7..c810db4e 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -72,6 +72,7 @@ typedef struct the end of this buffer. */ u32 flags; /**< buffer flags: +
VLIB_BUFFER_FREE_LIST_INDEX_MASK: bits used to store free list index,
VLIB_BUFFER_IS_TRACED: trace this buffer.
VLIB_BUFFER_NEXT_PRESENT: this is a multi-chunk buffer.
VLIB_BUFFER_TOTAL_LENGTH_VALID: as it says @@ -82,28 +83,26 @@ typedef struct set to avoid adding it to a flow report
VLIB_BUFFER_FLAG_USER(n): user-defined bit N */ -#define VLIB_BUFFER_IS_TRACED (1 << 0) -#define VLIB_BUFFER_LOG2_NEXT_PRESENT (1) + +/* any change to the following line requres update of + * vlib_buffer_get_free_list_index(...) and + * vlib_buffer_set_free_list_index(...) functions */ +#define VLIB_BUFFER_FREE_LIST_INDEX_MASK ((1 << 4) - 1) + +#define VLIB_BUFFER_IS_TRACED (1 << 4) +#define VLIB_BUFFER_LOG2_NEXT_PRESENT (5) #define VLIB_BUFFER_NEXT_PRESENT (1 << VLIB_BUFFER_LOG2_NEXT_PRESENT) -#define VLIB_BUFFER_IS_RECYCLED (1 << 2) -#define VLIB_BUFFER_TOTAL_LENGTH_VALID (1 << 3) -#define VLIB_BUFFER_REPL_FAIL (1 << 4) -#define VLIB_BUFFER_RECYCLE (1 << 5) -#define VLIB_BUFFER_FLOW_REPORT (1 << 6) -#define VLIB_BUFFER_EXT_HDR_VALID (1 << 7) +#define VLIB_BUFFER_IS_RECYCLED (1 << 6) +#define VLIB_BUFFER_TOTAL_LENGTH_VALID (1 << 7) +#define VLIB_BUFFER_REPL_FAIL (1 << 8) +#define VLIB_BUFFER_RECYCLE (1 << 9) +#define VLIB_BUFFER_FLOW_REPORT (1 << 10) +#define VLIB_BUFFER_EXT_HDR_VALID (1 << 11) /* User defined buffer flags. */ #define LOG2_VLIB_BUFFER_FLAG_USER(n) (32 - (n)) #define VLIB_BUFFER_FLAG_USER(n) (1 << LOG2_VLIB_BUFFER_FLAG_USER(n)) - u32 free_list_index; /**< Buffer free list that this buffer was - allocated from and will be freed to. - */ - - u32 total_length_not_including_first_buffer; - /**< Only valid for first buffer in chain. Current length plus - total length given here give total number of bytes in buffer chain. - */ STRUCT_MARK (template_end); u32 next_buffer; /**< Next buffer for this linked-list of buffers. @@ -128,7 +127,7 @@ typedef struct Before allocating any of it, discussion required! */ - u32 opaque[8]; /**< Opaque data used by sub-graphs for their own purposes. + u32 opaque[10]; /**< Opaque data used by sub-graphs for their own purposes. See .../vnet/vnet/buffer.h */ CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); @@ -137,7 +136,12 @@ typedef struct if VLIB_PACKET_IS_TRACED flag is set. 
*/ u32 recycle_count; /**< Used by L2 path recycle code */ - u32 opaque2[14]; /**< More opaque data, currently unused */ + + u32 total_length_not_including_first_buffer; + /**< Only valid for first buffer in chain. Current length plus + total length given here give total number of bytes in buffer chain. + */ + u32 opaque2[13]; /**< More opaque data, currently unused */ /***** end of second cache line */ CLIB_CACHE_LINE_ALIGN_MARK (cacheline2); diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 97442e12..1aaac0b2 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -106,12 +106,15 @@ uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, always_inline uword vlib_buffer_length_in_chain (vlib_main_t * vm, vlib_buffer_t * b) { - uword l = b->current_length + b->total_length_not_including_first_buffer; - if (PREDICT_FALSE ((b->flags & (VLIB_BUFFER_NEXT_PRESENT - | VLIB_BUFFER_TOTAL_LENGTH_VALID)) - == VLIB_BUFFER_NEXT_PRESENT)) - return vlib_buffer_length_in_chain_slow_path (vm, b); - return l; + uword len = b->current_length; + + if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0)) + return len; + + if (PREDICT_TRUE (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID)) + return len + b->total_length_not_including_first_buffer; + + return vlib_buffer_length_in_chain_slow_path (vm, b); } /** \brief Get length in bytes of the buffer index buffer chain @@ -261,6 +264,24 @@ vlib_buffer_round_size (u32 size) return round_pow2 (size, sizeof (vlib_buffer_t)); } +always_inline u32 +vlib_buffer_get_free_list_index (vlib_buffer_t * b) +{ + return b->flags & VLIB_BUFFER_FREE_LIST_INDEX_MASK; +} + +always_inline void +vlib_buffer_set_free_list_index (vlib_buffer_t * b, u32 index) +{ + /* if there is an need for more free lists we should consider + storig data in the 2nd cacheline */ + ASSERT (VLIB_BUFFER_FREE_LIST_INDEX_MASK & 1); + ASSERT (index <= VLIB_BUFFER_FREE_LIST_INDEX_MASK); + + b->flags &= ~VLIB_BUFFER_FREE_LIST_INDEX_MASK; + 
b->flags |= index & VLIB_BUFFER_FREE_LIST_INDEX_MASK; +} + /** \brief Allocate buffers from specific freelist into supplied array @param vm - (vlib_main_t *) vlib main data structure pointer @@ -381,7 +402,7 @@ vlib_buffer_get_buffer_free_list (vlib_main_t * vm, vlib_buffer_t * b, vlib_buffer_main_t *bm = vm->buffer_main; u32 i; - *index = i = b->free_list_index; + *index = i = vlib_buffer_get_free_list_index (b); return pool_elt_at_index (bm->buffer_free_list_pool, i); } @@ -569,7 +590,8 @@ vlib_buffer_clone (vlib_main_t * vm, u32 src_buffer, u32 * buffers, } n_buffers = vlib_buffer_alloc_from_free_list (vm, buffers, n_buffers, - s->free_list_index); + vlib_buffer_get_free_list_index + (s)); if (PREDICT_FALSE (n_buffers == 0)) { buffers[0] = src_buffer; @@ -581,7 +603,8 @@ vlib_buffer_clone (vlib_main_t * vm, u32 src_buffer, u32 * buffers, vlib_buffer_t *d = vlib_get_buffer (vm, buffers[i]); d->current_data = s->current_data; d->current_length = head_end_offset; - d->free_list_index = s->free_list_index; + vlib_buffer_set_free_list_index (d, + vlib_buffer_get_free_list_index (s)); d->total_length_not_including_first_buffer = s->total_length_not_including_first_buffer + s->current_length - head_end_offset; @@ -615,7 +638,8 @@ vlib_buffer_attach_clone (vlib_main_t * vm, vlib_buffer_t * head, vlib_buffer_t * tail) { ASSERT ((head->flags & VLIB_BUFFER_NEXT_PRESENT) == 0); - ASSERT (head->free_list_index == tail->free_list_index); + ASSERT (vlib_buffer_get_free_list_index (head) == + vlib_buffer_get_free_list_index (tail)); head->flags |= VLIB_BUFFER_NEXT_PRESENT; head->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID; @@ -791,7 +815,7 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst, CLIB_CACHE_LINE_BYTES * 2); /* Make sure buffer template is sane. 
*/ - ASSERT (fl->index == fl->buffer_init_template.free_list_index); + ASSERT (fl->index == vlib_buffer_get_free_list_index (src)); clib_memcpy (STRUCT_MARK_PTR (dst, template_start), STRUCT_MARK_PTR (src, template_start), @@ -806,7 +830,6 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst, _(current_data); _(current_length); _(flags); - _(free_list_index); #undef _ ASSERT (dst->total_length_not_including_first_buffer == 0); ASSERT (dst->n_add_refs == 0); @@ -832,7 +855,7 @@ vlib_buffer_init_two_for_free_list (vlib_buffer_t * dst0, vlib_buffer_t *src = &fl->buffer_init_template; /* Make sure buffer template is sane. */ - ASSERT (fl->index == fl->buffer_init_template.free_list_index); + ASSERT (fl->index == vlib_buffer_get_free_list_index (src)); clib_memcpy (STRUCT_MARK_PTR (dst0, template_start), STRUCT_MARK_PTR (src, template_start), @@ -853,7 +876,6 @@ vlib_buffer_init_two_for_free_list (vlib_buffer_t * dst0, _(current_data); _(current_length); _(flags); - _(free_list_index); #undef _ ASSERT (dst0->total_length_not_including_first_buffer == 0); diff --git a/src/vnet/bfd/bfd_udp.c b/src/vnet/bfd/bfd_udp.c index 346c5495..06b843c6 100644 --- a/src/vnet/bfd/bfd_udp.c +++ b/src/vnet/bfd/bfd_udp.c @@ -843,7 +843,7 @@ bfd_udp4_find_headers (vlib_buffer_t * b, ip4_header_t ** ip4, udp_header_t ** udp) { /* sanity check first */ - const i32 start = vnet_buffer (b)->ip.start_of_ip_header; + const i32 start = vnet_buffer (b)->l3_hdr_offset; if (start < 0 && start < sizeof (b->pre_data)) { BFD_ERR ("Start of ip header is before pre_data, ignoring"); @@ -1000,7 +1000,7 @@ bfd_udp6_find_headers (vlib_buffer_t * b, ip6_header_t ** ip6, udp_header_t ** udp) { /* sanity check first */ - const i32 start = vnet_buffer (b)->ip.start_of_ip_header; + const i32 start = vnet_buffer (b)->l3_hdr_offset; if (start < 0 && start < sizeof (b->pre_data)) { BFD_ERR ("Start of ip header is before pre_data, ignoring"); diff --git a/src/vnet/buffer.h b/src/vnet/buffer.h index 
9aba34da..8647db00 100644 --- a/src/vnet/buffer.h +++ b/src/vnet/buffer.h @@ -71,7 +71,6 @@ #define VNET_BUFFER_SPAN_CLONE (1 << LOG2_VNET_BUFFER_SPAN_CLONE) #define foreach_buffer_opaque_union_subtype \ -_(ethernet) \ _(ip) \ _(swt) \ _(l2) \ @@ -100,16 +99,12 @@ _(tcp) typedef struct { u32 sw_if_index[VLIB_N_RX_TX]; + i16 l2_hdr_offset; + i16 l3_hdr_offset; + i16 l4_hdr_offset; union { - /* Ethernet. */ - struct - { - /* Saved value of current header by ethernet-input. */ - i32 start_of_ethernet_header; - } ethernet; - /* IP4/6 buffer opaque. */ struct { @@ -143,9 +138,6 @@ typedef struct u8 code; u32 data; } icmp; - - /* IP header offset from vlib_buffer.data - saved by ip*_local nodes */ - i32 start_of_ip_header; }; } ip; diff --git a/src/vnet/dhcp/dhcp4_proxy_node.c b/src/vnet/dhcp/dhcp4_proxy_node.c index 26e1e65c..1b59cdea 100644 --- a/src/vnet/dhcp/dhcp4_proxy_node.c +++ b/src/vnet/dhcp/dhcp4_proxy_node.c @@ -231,7 +231,7 @@ dhcp_proxy_to_server_input (vlib_main_t * vm, o = (dhcp_option_t *) (((uword) o) + (o->length + 2)); } - fl = vlib_buffer_get_free_list (vm, b0->free_list_index); + fl = vlib_buffer_get_free_list (vm, vlib_buffer_get_free_list_index (b0)); // start write at (option*)o, some packets have padding if (((u8 *)o - (u8 *)b0->data + VPP_DHCP_OPTION82_SIZE) > fl->n_data_bytes) { diff --git a/src/vnet/dhcp/dhcp6_proxy_node.c b/src/vnet/dhcp/dhcp6_proxy_node.c index 885313a5..e109cc4c 100644 --- a/src/vnet/dhcp/dhcp6_proxy_node.c +++ b/src/vnet/dhcp/dhcp6_proxy_node.c @@ -306,7 +306,7 @@ dhcpv6_proxy_to_server_input (vlib_main_t * vm, copy_ip6_address(&r1->link_addr, ia0); link_address_set: - fl = vlib_buffer_get_free_list (vm, b0->free_list_index); + fl = vlib_buffer_get_free_list (vm, vlib_buffer_get_free_list_index (b0)); if ((b0->current_length+sizeof(*id1)+sizeof(*vss1)+sizeof(*cmac)) > fl->n_data_bytes) diff --git a/src/vnet/ethernet/ethernet.h b/src/vnet/ethernet/ethernet.h index dcc656a7..2fc5b804 100644 --- a/src/vnet/ethernet/ethernet.h 
+++ b/src/vnet/ethernet/ethernet.h @@ -344,8 +344,7 @@ ethernet_setup_node (vlib_main_t * vm, u32 node_index) always_inline ethernet_header_t * ethernet_buffer_get_header (vlib_buffer_t * b) { - return (void *) - (b->data + vnet_buffer (b)->ethernet.start_of_ethernet_header); + return (void *) (b->data + vnet_buffer (b)->l2_hdr_offset); } /** Returns the number of VLAN headers in the current Ethernet frame in the diff --git a/src/vnet/ethernet/node.c b/src/vnet/ethernet/node.c index d9fdff48..421d501a 100755 --- a/src/vnet/ethernet/node.c +++ b/src/vnet/ethernet/node.c @@ -101,7 +101,7 @@ parse_header (ethernet_input_variant_t variant, e0 = (void *) (b0->data + b0->current_data); - vnet_buffer (b0)->ethernet.start_of_ethernet_header = b0->current_data; + vnet_buffer (b0)->l2_hdr_offset = b0->current_data; vlib_buffer_advance (b0, sizeof (e0[0])); @@ -205,9 +205,7 @@ identify_subint (vnet_hw_interface_t * hi, if (!(*is_l2)) { ethernet_header_t *e0; - e0 = - (void *) (b0->data + - vnet_buffer (b0)->ethernet.start_of_ethernet_header); + e0 = (void *) (b0->data + vnet_buffer (b0)->l2_hdr_offset); if (!(ethernet_address_cast (e0->dst_address))) { @@ -238,7 +236,7 @@ determine_next_node (ethernet_main_t * em, { *next0 = em->l2_next; // record the L2 len and reset the buffer so the L2 header is preserved - u32 eth_start = vnet_buffer (b0)->ethernet.start_of_ethernet_header; + u32 eth_start = vnet_buffer (b0)->l2_hdr_offset; vnet_buffer (b0)->l2.l2_len = b0->current_data - eth_start; ASSERT (vnet_buffer (b0)->l2.l2_len == ethernet_buffer_header_size (b0)); @@ -424,10 +422,8 @@ ethernet_input_inline (vlib_main_t * vm, cached_is_l2 = is_l20 = subint0->flags & SUBINT_CONFIG_L2; } - vnet_buffer (b0)->ethernet.start_of_ethernet_header = - b0->current_data; - vnet_buffer (b1)->ethernet.start_of_ethernet_header = - b1->current_data; + vnet_buffer (b0)->l2_hdr_offset = b0->current_data; + vnet_buffer (b1)->l2_hdr_offset = b1->current_data; if (PREDICT_TRUE (is_l20 != 0)) { @@ 
-519,9 +515,9 @@ ethernet_input_inline (vlib_main_t * vm, { len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data - - vnet_buffer (b0)->ethernet.start_of_ethernet_header; + - vnet_buffer (b0)->l2_hdr_offset; len1 = vlib_buffer_length_in_chain (vm, b1) + b1->current_data - - vnet_buffer (b1)->ethernet.start_of_ethernet_header; + - vnet_buffer (b1)->l2_hdr_offset; stats_n_packets += 2; stats_n_bytes += len0 + len1; @@ -646,8 +642,7 @@ ethernet_input_inline (vlib_main_t * vm, cached_is_l2 = is_l20 = subint0->flags & SUBINT_CONFIG_L2; } - vnet_buffer (b0)->ethernet.start_of_ethernet_header = - b0->current_data; + vnet_buffer (b0)->l2_hdr_offset = b0->current_data; if (PREDICT_TRUE (is_l20 != 0)) { @@ -710,7 +705,7 @@ ethernet_input_inline (vlib_main_t * vm, { len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data - - vnet_buffer (b0)->ethernet.start_of_ethernet_header; + - vnet_buffer (b0)->l2_hdr_offset; stats_n_packets += 1; stats_n_bytes += len0; diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 8263e01c..b8dfa847 100755 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -1585,8 +1585,8 @@ ip4_local_inline (vlib_main_t * vm, ip0 = vlib_buffer_get_current (p0); ip1 = vlib_buffer_get_current (p1); - vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; - vnet_buffer (p1)->ip.start_of_ip_header = p1->current_data; + vnet_buffer (p0)->l3_hdr_offset = p0->current_data; + vnet_buffer (p1)->l3_hdr_offset = p1->current_data; sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; sw_if_index1 = vnet_buffer (p1)->sw_if_index[VLIB_RX]; @@ -1788,7 +1788,7 @@ ip4_local_inline (vlib_main_t * vm, ip0 = vlib_buffer_get_current (p0); - vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; + vnet_buffer (p0)->l3_hdr_offset = p0->current_data; sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c index 4b574b9a..2b8c2bd2 100644 --- 
a/src/vnet/ip/ip6_forward.c +++ b/src/vnet/ip/ip6_forward.c @@ -1362,8 +1362,8 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip0 = vlib_buffer_get_current (p0); ip1 = vlib_buffer_get_current (p1); - vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; - vnet_buffer (p1)->ip.start_of_ip_header = p1->current_data; + vnet_buffer (p0)->l3_hdr_offset = p0->current_data; + vnet_buffer (p1)->l3_hdr_offset = p1->current_data; type0 = lm->builtin_protocol_by_ip_protocol[ip0->protocol]; type1 = lm->builtin_protocol_by_ip_protocol[ip1->protocol]; @@ -1493,7 +1493,7 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip0 = vlib_buffer_get_current (p0); - vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; + vnet_buffer (p0)->l3_hdr_offset = p0->current_data; type0 = lm->builtin_protocol_by_ip_protocol[ip0->protocol]; next0 = lm->local_next_by_ip_protocol[ip0->protocol]; diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c index b8f6f9b1..68a8cbbc 100644 --- a/src/vnet/ip/ip6_neighbor.c +++ b/src/vnet/ip/ip6_neighbor.c @@ -1479,9 +1479,8 @@ icmp6_router_solicitation (vlib_main_t * vm, sizeof (icmp6_router_advertisement_header_t); vlib_buffer_add_data (vm, - p0->free_list_index, - bi0, - (void *) &rh, + vlib_buffer_get_free_list_index + (p0), bi0, (void *) &rh, sizeof (icmp6_router_advertisement_header_t)); @@ -1499,9 +1498,8 @@ icmp6_router_solicitation (vlib_main_t * vm, eth_if0->address, 6); vlib_buffer_add_data (vm, - p0->free_list_index, - bi0, - (void *) &h, + vlib_buffer_get_free_list_index + (p0), bi0, (void *) &h, sizeof (icmp6_neighbor_discovery_ethernet_link_layer_address_option_t)); @@ -1525,9 +1523,8 @@ icmp6_router_solicitation (vlib_main_t * vm, sizeof (icmp6_neighbor_discovery_mtu_option_t); vlib_buffer_add_data (vm, - p0->free_list_index, - bi0, - (void *) &h, + vlib_buffer_get_free_list_index + (p0), bi0, (void *) &h, sizeof (icmp6_neighbor_discovery_mtu_option_t)); 
} @@ -1579,7 +1576,7 @@ icmp6_router_solicitation (vlib_main_t * vm, payload_length += sizeof( icmp6_neighbor_discovery_prefix_information_option_t); vlib_buffer_add_data (vm, - p0->free_list_index, + vlib_buffer_get_free_list_index (p0), bi0, (void *)&h, sizeof(icmp6_neighbor_discovery_prefix_information_option_t)); @@ -2326,7 +2323,7 @@ ip6_neighbor_send_mldpv2_report (u32 sw_if_index) num_addr_records++; vlib_buffer_add_data - (vm, b0->free_list_index, bo0, + (vm, vlib_buffer_get_free_list_index (b0), bo0, (void *)&rr, sizeof(icmp6_multicast_address_record_t)); payload_length += sizeof( icmp6_multicast_address_record_t); diff --git a/src/vnet/l2/l2_bvi.h b/src/vnet/l2/l2_bvi.h index e21a1616..662ec402 100644 --- a/src/vnet/l2/l2_bvi.h +++ b/src/vnet/l2/l2_bvi.h @@ -57,7 +57,7 @@ l2_to_bvi (vlib_main_t * vlib_main, } /* Save L2 header position which may be changed due to packet replication */ - vnet_buffer (b0)->ethernet.start_of_ethernet_header = b0->current_data; + vnet_buffer (b0)->l2_hdr_offset = b0->current_data; /* Strip L2 header */ l2_len = vnet_buffer (b0)->l2.l2_len; diff --git a/src/vnet/lisp-cp/control.c b/src/vnet/lisp-cp/control.c index 22b5c82c..d8a1372d 100644 --- a/src/vnet/lisp-cp/control.c +++ b/src/vnet/lisp-cp/control.c @@ -3706,7 +3706,7 @@ send_map_reply (lisp_cp_main_t * lcm, u32 mi, ip_address_t * dst, static void find_ip_header (vlib_buffer_t * b, u8 ** ip_hdr) { - const i32 start = vnet_buffer (b)->ip.start_of_ip_header; + const i32 start = vnet_buffer (b)->l3_hdr_offset; if (start < 0 && start < -sizeof (b->pre_data)) { *ip_hdr = 0; diff --git a/src/vnet/replication.c b/src/vnet/replication.c index 1c6f28d2..0fdca0bf 100644 --- a/src/vnet/replication.c +++ b/src/vnet/replication.c @@ -43,12 +43,12 @@ replication_prep (vlib_main_t * vm, ctx_id = ctx - rm->contexts[thread_index]; /* Save state from vlib buffer */ - ctx->saved_free_list_index = b0->free_list_index; + ctx->saved_free_list_index = vlib_buffer_get_free_list_index (b0); 
ctx->current_data = b0->current_data; /* Set up vlib buffer hooks */ b0->recycle_count = ctx_id; - b0->free_list_index = rm->recycle_list_index; + vlib_buffer_set_free_list_index (b0, rm->recycle_list_index); b0->flags |= VLIB_BUFFER_RECYCLE; /* Save feature state */ @@ -129,7 +129,7 @@ replication_recycle (vlib_main_t * vm, vlib_buffer_t * b0, u32 is_last) * This is the last replication in the list. * Restore original buffer free functionality. */ - b0->free_list_index = ctx->saved_free_list_index; + vlib_buffer_set_free_list_index (b0, ctx->saved_free_list_index); b0->flags &= ~VLIB_BUFFER_RECYCLE; /* Free context back to its pool */ -- cgit 1.2.3-korg From 213b5aae860c2a9d5de8d4d070d0d2091af699f5 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 13 Jul 2017 21:19:27 +0200 Subject: vnet_buffer_t flags cleanup Change-Id: I123eccea98abafeb31f25d2a162501e2eded60d4 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/init.c | 7 +++--- src/plugins/gtpu/gtpu_decap.c | 18 ++++++++-------- src/plugins/ixge/ixge.c | 12 +++++------ src/vnet/bfd/bfd_udp.c | 4 ++-- src/vnet/buffer.h | 49 +++++++++++++++++++++--------------------- src/vnet/ethernet/ethernet.h | 14 ++++++------ src/vnet/handoff.c | 2 +- src/vnet/ip/icmp4.c | 6 +++--- src/vnet/ip/ip4_forward.c | 38 ++++++++++++++++---------------- src/vnet/ip/ip6_forward.c | 44 +++++++++++++++++++++---------------- src/vnet/ip/ip6_neighbor.c | 4 ++-- src/vnet/session/node.c | 2 +- src/vnet/span/node.c | 4 ++-- src/vnet/tcp/tcp_output.c | 8 +++---- src/vnet/vxlan-gpe/decap.c | 18 ++++++++-------- src/vnet/vxlan/decap.c | 18 ++++++++-------- 16 files changed, 127 insertions(+), 121 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 2ec1664b..7ca3d358 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -196,8 +196,8 @@ dpdk_lib_init (dpdk_main_t * dm) "dpdk rx"); if (dm->conf->enable_tcp_udp_checksum) - 
dm->buffer_flags_template &= ~(IP_BUFFER_L4_CHECKSUM_CORRECT - | IP_BUFFER_L4_CHECKSUM_COMPUTED); + dm->buffer_flags_template &= ~(VNET_BUFFER_F_L4_CHECKSUM_CORRECT + | VNET_BUFFER_F_L4_CHECKSUM_COMPUTED); /* vlib_buffer_t template */ vec_validate_aligned (dm->buffer_templates, tm->n_vlib_mains - 1, @@ -1544,7 +1544,8 @@ dpdk_init (vlib_main_t * vm) /* Default vlib_buffer_t flags, DISABLES tcp/udp checksumming... */ dm->buffer_flags_template = (VLIB_BUFFER_TOTAL_LENGTH_VALID | VLIB_BUFFER_EXT_HDR_VALID - | IP_BUFFER_L4_CHECKSUM_COMPUTED | IP_BUFFER_L4_CHECKSUM_CORRECT); + | VNET_BUFFER_F_L4_CHECKSUM_COMPUTED | + VNET_BUFFER_F_L4_CHECKSUM_CORRECT); dm->stat_poll_interval = DPDK_STATS_POLL_INTERVAL; dm->link_state_poll_interval = DPDK_LINK_POLL_INTERVAL; diff --git a/src/plugins/gtpu/gtpu_decap.c b/src/plugins/gtpu/gtpu_decap.c index fc74e7cb..de235889 100644 --- a/src/plugins/gtpu/gtpu_decap.c +++ b/src/plugins/gtpu/gtpu_decap.c @@ -982,7 +982,7 @@ ip_gtpu_bypass_inline (vlib_main_t * vm, } flags0 = b0->flags; - good_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; /* Don't verify UDP checksum for packets with explicit zero checksum. 
*/ good_udp0 |= udp0->checksum == 0; @@ -998,14 +998,14 @@ ip_gtpu_bypass_inline (vlib_main_t * vm, /* Verify UDP checksum */ if (PREDICT_FALSE (!good_udp0)) { - if ((flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED) == 0) + if ((flags0 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) == 0) { if (is_ip4) flags0 = ip4_tcp_udp_validate_checksum (vm, b0); else flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, b0); good_udp0 = - (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } } @@ -1064,7 +1064,7 @@ ip_gtpu_bypass_inline (vlib_main_t * vm, } flags1 = b1->flags; - good_udp1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_udp1 = (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; /* Don't verify UDP checksum for packets with explicit zero checksum. */ good_udp1 |= udp1->checksum == 0; @@ -1080,14 +1080,14 @@ ip_gtpu_bypass_inline (vlib_main_t * vm, /* Verify UDP checksum */ if (PREDICT_FALSE (!good_udp1)) { - if ((flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED) == 0) + if ((flags1 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) == 0) { if (is_ip4) flags1 = ip4_tcp_udp_validate_checksum (vm, b1); else flags1 = ip6_tcp_udp_icmp_validate_checksum (vm, b1); good_udp1 = - (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } } @@ -1182,7 +1182,7 @@ ip_gtpu_bypass_inline (vlib_main_t * vm, } flags0 = b0->flags; - good_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; /* Don't verify UDP checksum for packets with explicit zero checksum. 
*/ good_udp0 |= udp0->checksum == 0; @@ -1198,14 +1198,14 @@ ip_gtpu_bypass_inline (vlib_main_t * vm, /* Verify UDP checksum */ if (PREDICT_FALSE (!good_udp0)) { - if ((flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED) == 0) + if ((flags0 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) == 0) { if (is_ip4) flags0 = ip4_tcp_udp_validate_checksum (vm, b0); else flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, b0); good_udp0 = - (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } } diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c index 628d6d71..e0150f41 100644 --- a/src/plugins/ixge/ixge.c +++ b/src/plugins/ixge/ixge.c @@ -656,11 +656,11 @@ ixge_rx_next_and_error_from_status_x1 (ixge_device_t * xd, f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) - ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); + ? VNET_BUFFER_F_L4_CHECKSUM_COMPUTED : 0); f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) - ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); + ? 0 : VNET_BUFFER_F_L4_CHECKSUM_CORRECT); *error0 = e0; *next0 = n0; @@ -715,17 +715,17 @@ ixge_rx_next_and_error_from_status_x2 (ixge_device_t * xd, f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) - ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); + ? VNET_BUFFER_F_L4_CHECKSUM_COMPUTED : 0); f1 = ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) - ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); + ? VNET_BUFFER_F_L4_CHECKSUM_COMPUTED : 0); f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) - ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); + ? 0 : VNET_BUFFER_F_L4_CHECKSUM_CORRECT); f1 |= ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) - ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); + ? 
0 : VNET_BUFFER_F_L4_CHECKSUM_CORRECT); *flags0 = f0; *flags1 = f1; diff --git a/src/vnet/bfd/bfd_udp.c b/src/vnet/bfd/bfd_udp.c index 06b843c6..533d98d6 100644 --- a/src/vnet/bfd/bfd_udp.c +++ b/src/vnet/bfd/bfd_udp.c @@ -246,7 +246,7 @@ bfd_add_udp4_transport (vlib_main_t * vm, u32 bi, const bfd_session_t * bs, const bfd_udp_key_t *key = &bus->key; vlib_buffer_t *b = vlib_get_buffer (vm, bi); - b->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; vnet_buffer (b)->ip.adj_index[VLIB_RX] = bus->adj_index; vnet_buffer (b)->ip.adj_index[VLIB_TX] = bus->adj_index; vnet_buffer (b)->sw_if_index[VLIB_RX] = 0; @@ -301,7 +301,7 @@ bfd_add_udp6_transport (vlib_main_t * vm, u32 bi, const bfd_session_t * bs, const bfd_udp_key_t *key = &bus->key; vlib_buffer_t *b = vlib_get_buffer (vm, bi); - b->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; vnet_buffer (b)->ip.adj_index[VLIB_RX] = bus->adj_index; vnet_buffer (b)->ip.adj_index[VLIB_TX] = bus->adj_index; vnet_buffer (b)->sw_if_index[VLIB_RX] = 0; diff --git a/src/vnet/buffer.h b/src/vnet/buffer.h index 8647db00..52dada30 100644 --- a/src/vnet/buffer.h +++ b/src/vnet/buffer.h @@ -42,33 +42,32 @@ #include -/* VLIB buffer flags for ip4/ip6 packets. Set by input interfaces for ip4/ip6 - tcp/udp packets with hardware computed checksums. */ -#define LOG2_IP_BUFFER_L4_CHECKSUM_COMPUTED LOG2_VLIB_BUFFER_FLAG_USER(1) -#define LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT LOG2_VLIB_BUFFER_FLAG_USER(2) -#define IP_BUFFER_L4_CHECKSUM_COMPUTED (1 << LOG2_IP_BUFFER_L4_CHECKSUM_COMPUTED) -#define IP_BUFFER_L4_CHECKSUM_CORRECT (1 << LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT) - -/* VLAN header flags. - * These bits are zeroed in vlib_buffer_init_for_free_list() - * meaning wherever the buffer comes from they have a reasonable - * value (eg, if ip4/ip6 generates the packet.) 
- */ -#define LOG2_ETH_BUFFER_VLAN_2_DEEP LOG2_VLIB_BUFFER_FLAG_USER(3) -#define LOG2_ETH_BUFFER_VLAN_1_DEEP LOG2_VLIB_BUFFER_FLAG_USER(4) -#define ETH_BUFFER_VLAN_2_DEEP (1 << LOG2_ETH_BUFFER_VLAN_2_DEEP) -#define ETH_BUFFER_VLAN_1_DEEP (1 << LOG2_ETH_BUFFER_VLAN_1_DEEP) -#define ETH_BUFFER_VLAN_BITS (ETH_BUFFER_VLAN_1_DEEP | \ - ETH_BUFFER_VLAN_2_DEEP) - -#define LOG2_BUFFER_HANDOFF_NEXT_VALID LOG2_VLIB_BUFFER_FLAG_USER(6) -#define BUFFER_HANDOFF_NEXT_VALID (1 << LOG2_BUFFER_HANDOFF_NEXT_VALID) +#define foreach_vnet_buffer_field \ + _( 1, L4_CHECKSUM_COMPUTED) \ + _( 2, L4_CHECKSUM_CORRECT) \ + _( 3, VLAN_2_DEEP) \ + _( 4, VLAN_1_DEEP) \ + _( 6, HANDOFF_NEXT_VALID) \ + _( 7, LOCALLY_ORIGINATED) \ + _( 8, SPAN_CLONE) + +#define VNET_BUFFER_FLAGS_VLAN_BITS \ + (VNET_BUFFER_F_VLAN_1_DEEP | VNET_BUFFER_F_VLAN_2_DEEP) + +enum +{ +#define _(bit, name) VNET_BUFFER_F_##name = (1 << LOG2_VLIB_BUFFER_FLAG_USER(bit)), + foreach_vnet_buffer_field +#undef _ +}; -#define LOG2_VNET_BUFFER_LOCALLY_ORIGINATED LOG2_VLIB_BUFFER_FLAG_USER(7) -#define VNET_BUFFER_LOCALLY_ORIGINATED (1 << LOG2_VNET_BUFFER_LOCALLY_ORIGINATED) +enum +{ +#define _(bit, name) VNET_BUFFER_F_LOG2_##name = LOG2_VLIB_BUFFER_FLAG_USER(bit), + foreach_vnet_buffer_field +#undef _ +}; -#define LOG2_VNET_BUFFER_SPAN_CLONE LOG2_VLIB_BUFFER_FLAG_USER(8) -#define VNET_BUFFER_SPAN_CLONE (1 << LOG2_VNET_BUFFER_SPAN_CLONE) #define foreach_buffer_opaque_union_subtype \ _(ip) \ diff --git a/src/vnet/ethernet/ethernet.h b/src/vnet/ethernet/ethernet.h index 2fc5b804..d9ab8c10 100644 --- a/src/vnet/ethernet/ethernet.h +++ b/src/vnet/ethernet/ethernet.h @@ -352,7 +352,7 @@ ethernet_buffer_get_header (vlib_buffer_t * b) * the number of headers is not known. 
*/ #define ethernet_buffer_get_vlan_count(b) ( \ - ((b)->flags & ETH_BUFFER_VLAN_BITS) >> LOG2_ETH_BUFFER_VLAN_1_DEEP \ + ((b)->flags & VNET_BUFFER_FLAGS_VLAN_BITS) >> VNET_BUFFER_F_LOG2_VLAN_1_DEEP \ ) /** Sets the number of VLAN headers in the current Ethernet frame in the @@ -360,8 +360,8 @@ ethernet_buffer_get_header (vlib_buffer_t * b) * the number of headers is not known. */ #define ethernet_buffer_set_vlan_count(b, v) ( \ - (b)->flags = ((b)->flags & ~ETH_BUFFER_VLAN_BITS) | \ - (((v) << LOG2_ETH_BUFFER_VLAN_1_DEEP) & ETH_BUFFER_VLAN_BITS) \ + (b)->flags = ((b)->flags & ~VNET_BUFFER_FLAGS_VLAN_BITS) | \ + (((v) << VNET_BUFFER_F_LOG2_VLAN_1_DEEP) & VNET_BUFFER_FLAGS_VLAN_BITS) \ ) /** Adjusts the vlan count by the delta in 'v' */ @@ -372,10 +372,10 @@ ethernet_buffer_get_header (vlib_buffer_t * b) /** Adjusts the vlan count by the header size byte delta in 'v' */ #define ethernet_buffer_adjust_vlan_count_by_bytes(b, v) ( \ - (b)->flags = ((b)->flags & ~ETH_BUFFER_VLAN_BITS) | (( \ - ((b)->flags & ETH_BUFFER_VLAN_BITS) + \ - ((v) << (LOG2_ETH_BUFFER_VLAN_1_DEEP - 2)) \ - ) & ETH_BUFFER_VLAN_BITS) \ + (b)->flags = ((b)->flags & ~VNET_BUFFER_FLAGS_VLAN_BITS) | (( \ + ((b)->flags & VNET_BUFFER_FLAGS_VLAN_BITS) + \ + ((v) << (VNET_BUFFER_F_LOG2_VLAN_1_DEEP - 2)) \ + ) & VNET_BUFFER_FLAGS_VLAN_BITS) \ ) /** diff --git a/src/vnet/handoff.c b/src/vnet/handoff.c index 9f3c93b4..81cb9f55 100644 --- a/src/vnet/handoff.c +++ b/src/vnet/handoff.c @@ -130,7 +130,7 @@ worker_handoff_node_fn (vlib_main_t * vm, /* if input node did not specify next index, then packet should go to eternet-input */ - if (PREDICT_FALSE ((b0->flags & BUFFER_HANDOFF_NEXT_VALID) == 0)) + if (PREDICT_FALSE ((b0->flags & VNET_BUFFER_F_HANDOFF_NEXT_VALID) == 0)) vnet_buffer (b0)->handoff.next_index = HANDOFF_DISPATCH_NEXT_ETHERNET_INPUT; else if (vnet_buffer (b0)->handoff.next_index == diff --git a/src/vnet/ip/icmp4.c b/src/vnet/ip/icmp4.c index c3afff72..bbeab32b 100644 --- a/src/vnet/ip/icmp4.c +++ 
b/src/vnet/ip/icmp4.c @@ -336,8 +336,8 @@ ip4_icmp_echo_request (vlib_main_t * vm, ASSERT (ip0->checksum == ip4_header_checksum (ip0)); ASSERT (ip1->checksum == ip4_header_checksum (ip1)); - p0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; - p1->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + p0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; + p1->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; } while (n_left_from > 0 && n_left_to_next > 0) @@ -392,7 +392,7 @@ ip4_icmp_echo_request (vlib_main_t * vm, ASSERT (ip0->checksum == ip4_header_checksum (ip0)); - p0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + p0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; } vlib_put_next_frame (vm, node, next, n_left_to_next); diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index b8dfa847..8dd927d4 100755 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -1509,15 +1509,15 @@ ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0) udp0 = (void *) (ip0 + 1); if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0) { - p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED - | IP_BUFFER_L4_CHECKSUM_CORRECT); + p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED + | VNET_BUFFER_F_L4_CHECKSUM_CORRECT); return p0->flags; } sum16 = ip4_tcp_udp_compute_checksum (vm, p0, ip0); - p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED - | ((sum16 == 0) << LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT)); + p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED + | ((sum16 == 0) << VNET_BUFFER_F_LOG2_L4_CHECKSUM_CORRECT)); return p0->flags; } @@ -1629,8 +1629,8 @@ ip4_local_inline (vlib_main_t * vm, flags0 = p0->flags; flags1 = p1->flags; - good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; - good_tcp_udp1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_tcp_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + good_tcp_udp1 = (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; udp0 = ip4_next_header (ip0); udp1 = ip4_next_header (ip1); @@ -1657,19 +1657,19 @@ ip4_local_inline 
(vlib_main_t * vm, if (is_tcp_udp0) { if (is_tcp_udp0 - && !(flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED)) + && !(flags0 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED)) flags0 = ip4_tcp_udp_validate_checksum (vm, p0); good_tcp_udp0 = - (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; good_tcp_udp0 |= is_udp0 && udp0->checksum == 0; } if (is_tcp_udp1) { if (is_tcp_udp1 - && !(flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED)) + && !(flags1 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED)) flags1 = ip4_tcp_udp_validate_checksum (vm, p1); good_tcp_udp1 = - (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; good_tcp_udp1 |= is_udp1 && udp1->checksum == 0; } } @@ -1817,7 +1817,7 @@ ip4_local_inline (vlib_main_t * vm, flags0 = p0->flags; - good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_tcp_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; udp0 = ip4_next_header (ip0); @@ -1837,10 +1837,10 @@ ip4_local_inline (vlib_main_t * vm, if (is_tcp_udp0) { if (is_tcp_udp0 - && !(flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED)) + && !(flags0 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED)) flags0 = ip4_tcp_udp_validate_checksum (vm, p0); good_tcp_udp0 = - (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; good_tcp_udp0 |= is_udp0 && udp0->checksum == 0; } } @@ -2428,7 +2428,7 @@ ip4_rewrite_inline (vlib_main_t * vm, /* Decrement TTL & update checksum. Works either endian, so no need for byte swap. 
*/ - if (PREDICT_TRUE (!(p0->flags & VNET_BUFFER_LOCALLY_ORIGINATED))) + if (PREDICT_TRUE (!(p0->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))) { i32 ttl0 = ip0->ttl; @@ -2461,9 +2461,9 @@ ip4_rewrite_inline (vlib_main_t * vm, } else { - p0->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED; + p0->flags &= ~VNET_BUFFER_F_LOCALLY_ORIGINATED; } - if (PREDICT_TRUE (!(p1->flags & VNET_BUFFER_LOCALLY_ORIGINATED))) + if (PREDICT_TRUE (!(p1->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))) { i32 ttl1 = ip1->ttl; @@ -2496,7 +2496,7 @@ ip4_rewrite_inline (vlib_main_t * vm, } else { - p1->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED; + p1->flags &= ~VNET_BUFFER_F_LOCALLY_ORIGINATED; } /* Rewrite packet header and updates lengths. */ @@ -2614,7 +2614,7 @@ ip4_rewrite_inline (vlib_main_t * vm, next0 = IP4_REWRITE_NEXT_DROP; /* drop on error */ /* Decrement TTL & update checksum. */ - if (PREDICT_TRUE (!(p0->flags & VNET_BUFFER_LOCALLY_ORIGINATED))) + if (PREDICT_TRUE (!(p0->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))) { i32 ttl0 = ip0->ttl; @@ -2648,7 +2648,7 @@ ip4_rewrite_inline (vlib_main_t * vm, } else { - p0->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED; + p0->flags &= ~VNET_BUFFER_F_LOCALLY_ORIGINATED; } if (do_counters) diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c index 2b8c2bd2..604e1492 100644 --- a/src/vnet/ip/ip6_forward.c +++ b/src/vnet/ip/ip6_forward.c @@ -1284,15 +1284,15 @@ ip6_tcp_udp_icmp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0) udp0 = (void *) (ip0 + 1); if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0) { - p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED - | IP_BUFFER_L4_CHECKSUM_CORRECT); + p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED + | VNET_BUFFER_F_L4_CHECKSUM_CORRECT); return p0->flags; } sum16 = ip6_tcp_udp_icmp_compute_checksum (vm, p0, ip0, &bogus_length); - p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED - | ((sum16 == 0) << LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT)); + p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED + | ((sum16 == 
0) << VNET_BUFFER_F_LOG2_L4_CHECKSUM_CORRECT)); return p0->flags; } @@ -1374,8 +1374,10 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) flags0 = p0->flags; flags1 = p1->flags; - good_l4_checksum0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; - good_l4_checksum1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_l4_checksum0 = + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + good_l4_checksum1 = + (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; len_diff0 = 0; len_diff1 = 0; @@ -1414,19 +1416,21 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) if (PREDICT_FALSE (type0 != IP_BUILTIN_PROTOCOL_UNKNOWN && !good_l4_checksum0 - && !(flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))) + && !(flags0 & + VNET_BUFFER_F_L4_CHECKSUM_COMPUTED))) { flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, p0); good_l4_checksum0 = - (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } if (PREDICT_FALSE (type1 != IP_BUILTIN_PROTOCOL_UNKNOWN && !good_l4_checksum1 - && !(flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED))) + && !(flags1 & + VNET_BUFFER_F_L4_CHECKSUM_COMPUTED))) { flags1 = ip6_tcp_udp_icmp_validate_checksum (vm, p1); good_l4_checksum1 = - (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } error0 = error1 = IP6_ERROR_UNKNOWN_PROTOCOL; @@ -1500,7 +1504,8 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) flags0 = p0->flags; - good_l4_checksum0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_l4_checksum0 = + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; len_diff0 = 0; if (PREDICT_TRUE (IP_PROTOCOL_UDP == ip6_locate_header (p0, ip0, @@ -1522,11 +1527,12 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) if (PREDICT_FALSE (type0 != IP_BUILTIN_PROTOCOL_UNKNOWN && !good_l4_checksum0 - && !(flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))) + && !(flags0 & + 
VNET_BUFFER_F_L4_CHECKSUM_COMPUTED))) { flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, p0); good_l4_checksum0 = - (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } error0 = IP6_ERROR_UNKNOWN_PROTOCOL; @@ -2019,7 +2025,7 @@ ip6_rewrite_inline (vlib_main_t * vm, error0 = error1 = IP6_ERROR_NONE; next0 = next1 = IP6_REWRITE_NEXT_DROP; - if (PREDICT_TRUE (!(p0->flags & VNET_BUFFER_LOCALLY_ORIGINATED))) + if (PREDICT_TRUE (!(p0->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))) { i32 hop_limit0 = ip0->hop_limit; @@ -2046,9 +2052,9 @@ ip6_rewrite_inline (vlib_main_t * vm, } else { - p0->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED; + p0->flags &= ~VNET_BUFFER_F_LOCALLY_ORIGINATED; } - if (PREDICT_TRUE (!(p1->flags & VNET_BUFFER_LOCALLY_ORIGINATED))) + if (PREDICT_TRUE (!(p1->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))) { i32 hop_limit1 = ip1->hop_limit; @@ -2075,7 +2081,7 @@ ip6_rewrite_inline (vlib_main_t * vm, } else { - p1->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED; + p1->flags &= ~VNET_BUFFER_F_LOCALLY_ORIGINATED; } adj0 = adj_get (adj_index0); adj1 = adj_get (adj_index1); @@ -2186,7 +2192,7 @@ ip6_rewrite_inline (vlib_main_t * vm, next0 = IP6_REWRITE_NEXT_DROP; /* Check hop limit */ - if (PREDICT_TRUE (!(p0->flags & VNET_BUFFER_LOCALLY_ORIGINATED))) + if (PREDICT_TRUE (!(p0->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))) { i32 hop_limit0 = ip0->hop_limit; @@ -2212,7 +2218,7 @@ ip6_rewrite_inline (vlib_main_t * vm, } else { - p0->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED; + p0->flags &= ~VNET_BUFFER_F_LOCALLY_ORIGINATED; } /* Guess we are only writing on simple Ethernet header. 
*/ diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c index 68a8cbbc..e8eebd4e 100644 --- a/src/vnet/ip/ip6_neighbor.c +++ b/src/vnet/ip/ip6_neighbor.c @@ -1649,7 +1649,7 @@ icmp6_router_solicitation (vlib_main_t * vm, adj_index0; } } - p0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + p0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; radv_info->n_solicitations_dropped += is_dropped; radv_info->n_solicitations_rcvd += is_solicitation; @@ -2348,7 +2348,7 @@ ip6_neighbor_send_mldpv2_report (u32 sw_if_index) vnet_main.local_interface_sw_if_index; vnet_buffer (b0)->ip.adj_index[VLIB_TX] = radv_info->mcast_adj_index; - b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; vlib_node_t *node = vlib_get_node_by_name (vm, (u8 *) "ip6-rewrite-mcast"); diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index 56e62637..983b78b8 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -256,7 +256,7 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, b0 = vlib_get_buffer (vm, bi0); b0->error = 0; b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID - | VNET_BUFFER_LOCALLY_ORIGINATED; + | VNET_BUFFER_F_LOCALLY_ORIGINATED; b0->current_data = 0; b0->total_length_not_including_first_buffer = 0; diff --git a/src/vnet/span/node.c b/src/vnet/span/node.c index 2a43b6e3..3a461b0a 100644 --- a/src/vnet/span/node.c +++ b/src/vnet/span/node.c @@ -77,7 +77,7 @@ span_mirror (vlib_main_t * vm, vlib_node_runtime_t * node, u32 sw_if_index0, return; /* Don't do it again */ - if (PREDICT_FALSE (b0->flags & VNET_BUFFER_SPAN_CLONE)) + if (PREDICT_FALSE (b0->flags & VNET_BUFFER_F_SPAN_CLONE)) return; /* *INDENT-OFF* */ @@ -92,7 +92,7 @@ span_mirror (vlib_main_t * vm, vlib_node_runtime_t * node, u32 sw_if_index0, if (PREDICT_TRUE(c0 != 0)) { vnet_buffer (c0)->sw_if_index[VLIB_TX] = i; - c0->flags |= VNET_BUFFER_SPAN_CLONE; + c0->flags |= VNET_BUFFER_F_SPAN_CLONE; to_mirror_next[0] = vlib_get_buffer_index 
(vm, c0); mirror_frames[i]->n_vectors++; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index b418e8ba..f34eb797 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -585,7 +585,7 @@ tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u32 *to_next, next_index; vlib_frame_t *f; - b->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; b->error = 0; /* Default FIB for now */ @@ -847,7 +847,7 @@ tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) u32 *to_next, next_index; vlib_frame_t *f; - b->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; b->error = 0; /* Decide where to send the packet */ @@ -1563,7 +1563,7 @@ tcp46_output_inline (vlib_main_t * vm, vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0; - b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; done: b0->error = node->errors[error0]; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -1728,7 +1728,7 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, done: b0->error = node->errors[error0]; - b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { th0 = vlib_buffer_get_current (b0); diff --git a/src/vnet/vxlan-gpe/decap.c b/src/vnet/vxlan-gpe/decap.c index 075b0f51..1b3a8b00 100644 --- a/src/vnet/vxlan-gpe/decap.c +++ b/src/vnet/vxlan-gpe/decap.c @@ -857,7 +857,7 @@ ip_vxlan_gpe_bypass_inline (vlib_main_t * vm, } flags0 = b0->flags; - good_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; /* Don't verify UDP checksum for packets with explicit zero checksum. 
*/ good_udp0 |= udp0->checksum == 0; @@ -873,14 +873,14 @@ ip_vxlan_gpe_bypass_inline (vlib_main_t * vm, /* Verify UDP checksum */ if (PREDICT_FALSE (!good_udp0)) { - if ((flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED) == 0) + if ((flags0 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) == 0) { if (is_ip4) flags0 = ip4_tcp_udp_validate_checksum (vm, b0); else flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, b0); good_udp0 = - (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } } @@ -939,7 +939,7 @@ ip_vxlan_gpe_bypass_inline (vlib_main_t * vm, } flags1 = b1->flags; - good_udp1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_udp1 = (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; /* Don't verify UDP checksum for packets with explicit zero checksum. */ good_udp1 |= udp1->checksum == 0; @@ -955,14 +955,14 @@ ip_vxlan_gpe_bypass_inline (vlib_main_t * vm, /* Verify UDP checksum */ if (PREDICT_FALSE (!good_udp1)) { - if ((flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED) == 0) + if ((flags1 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) == 0) { if (is_ip4) flags1 = ip4_tcp_udp_validate_checksum (vm, b1); else flags1 = ip6_tcp_udp_icmp_validate_checksum (vm, b1); good_udp1 = - (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } } @@ -1055,7 +1055,7 @@ ip_vxlan_gpe_bypass_inline (vlib_main_t * vm, } flags0 = b0->flags; - good_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; /* Don't verify UDP checksum for packets with explicit zero checksum. 
*/ good_udp0 |= udp0->checksum == 0; @@ -1071,14 +1071,14 @@ ip_vxlan_gpe_bypass_inline (vlib_main_t * vm, /* Verify UDP checksum */ if (PREDICT_FALSE (!good_udp0)) { - if ((flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED) == 0) + if ((flags0 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) == 0) { if (is_ip4) flags0 = ip4_tcp_udp_validate_checksum (vm, b0); else flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, b0); good_udp0 = - (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } } diff --git a/src/vnet/vxlan/decap.c b/src/vnet/vxlan/decap.c index 2acb1f6f..0dc89d3f 100644 --- a/src/vnet/vxlan/decap.c +++ b/src/vnet/vxlan/decap.c @@ -916,7 +916,7 @@ ip_vxlan_bypass_inline (vlib_main_t * vm, } flags0 = b0->flags; - good_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; /* Don't verify UDP checksum for packets with explicit zero checksum. */ good_udp0 |= udp0->checksum == 0; @@ -932,14 +932,14 @@ ip_vxlan_bypass_inline (vlib_main_t * vm, /* Verify UDP checksum */ if (PREDICT_FALSE (!good_udp0)) { - if ((flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED) == 0) + if ((flags0 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) == 0) { if (is_ip4) flags0 = ip4_tcp_udp_validate_checksum (vm, b0); else flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, b0); good_udp0 = - (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } } @@ -998,7 +998,7 @@ ip_vxlan_bypass_inline (vlib_main_t * vm, } flags1 = b1->flags; - good_udp1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_udp1 = (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; /* Don't verify UDP checksum for packets with explicit zero checksum. 
*/ good_udp1 |= udp1->checksum == 0; @@ -1014,14 +1014,14 @@ ip_vxlan_bypass_inline (vlib_main_t * vm, /* Verify UDP checksum */ if (PREDICT_FALSE (!good_udp1)) { - if ((flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED) == 0) + if ((flags1 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) == 0) { if (is_ip4) flags1 = ip4_tcp_udp_validate_checksum (vm, b1); else flags1 = ip6_tcp_udp_icmp_validate_checksum (vm, b1); good_udp1 = - (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } } @@ -1116,7 +1116,7 @@ ip_vxlan_bypass_inline (vlib_main_t * vm, } flags0 = b0->flags; - good_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; /* Don't verify UDP checksum for packets with explicit zero checksum. */ good_udp0 |= udp0->checksum == 0; @@ -1132,14 +1132,14 @@ ip_vxlan_bypass_inline (vlib_main_t * vm, /* Verify UDP checksum */ if (PREDICT_FALSE (!good_udp0)) { - if ((flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED) == 0) + if ((flags0 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) == 0) { if (is_ip4) flags0 = ip4_tcp_udp_validate_checksum (vm, b0); else flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, b0); good_udp0 = - (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } } -- cgit 1.2.3-korg From 2c0a4f407f565d8dd33ff3a9fada346860d30ad2 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Thu, 29 Jun 2017 09:30:15 -0400 Subject: TCP/UDP checksum offload API Change-Id: I2cb6ce4e29813f6602b14e6e61713fb381fbcef8 Signed-off-by: Dave Barach --- src/plugins/dpdk/device/device.c | 41 +++++++++++++++++ src/plugins/dpdk/device/dpdk.h | 3 ++ src/plugins/dpdk/device/init.c | 8 ++++ src/vnet/buffer.h | 7 ++- src/vnet/interface.h | 3 ++ src/vnet/interface_output.c | 95 +++++++++++++++++++++++++++++++++++++--- src/vnet/ip/ip4.h | 4 +- src/vnet/ip/ip4_forward.c | 9 ++-- src/vnet/tcp/tcp_output.c | 16 +++++-- 9 files changed, 173 insertions(+), 13 deletions(-) 
(limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 8801bfd3..c755060d 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -335,6 +335,37 @@ dpdk_buffer_recycle (vlib_main_t * vm, vlib_node_runtime_t * node, vec_add1 (dm->recycle[my_cpu], bi); } +static_always_inline void +dpdk_buffer_tx_offload (dpdk_device_t * xd, vlib_buffer_t * b, + struct rte_mbuf *mb) +{ + u32 ip_cksum = b->flags & VNET_BUFFER_F_OFFLOAD_IP_CKSUM; + u32 tcp_cksum = b->flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + u32 udp_cksum = b->flags & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM; + int is_ip4 = b->flags & VNET_BUFFER_F_IS_IP4; + u64 ol_flags; + + /* Is there any work for us? */ + if (PREDICT_TRUE ((ip_cksum | tcp_cksum | udp_cksum) == 0)) + return; + + mb->l2_len = vnet_buffer (b)->l3_hdr_offset - b->current_data; + mb->l3_len = vnet_buffer (b)->l4_hdr_offset - + vnet_buffer (b)->l3_hdr_offset; + mb->outer_l3_len = 0; + mb->outer_l2_len = 0; + ol_flags = is_ip4 ? PKT_TX_IPV4 : PKT_TX_IPV6; + ol_flags |= ip_cksum ? PKT_TX_IP_CKSUM : 0; + ol_flags |= tcp_cksum ? PKT_TX_TCP_CKSUM : 0; + ol_flags |= udp_cksum ? PKT_TX_UDP_CKSUM : 0; + mb->ol_flags |= ol_flags; + + /* we are trying to help compiler here by using local ol_flags with known + state of all flags */ + if (xd->flags & DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM) + rte_net_intel_cksum_flags_prepare (mb, ol_flags); +} + /* * Transmits the packets on the frame to the interface associated with the * node. 
It first copies packets on the frame to a tx_vector containing the @@ -455,6 +486,15 @@ dpdk_interface_tx (vlib_main_t * vm, mb2 = rte_mbuf_from_vlib_buffer (b2); mb3 = rte_mbuf_from_vlib_buffer (b3); + if (PREDICT_FALSE ((xd->flags & DPDK_DEVICE_FLAG_TX_OFFLOAD) && + (or_flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM))) + { + dpdk_buffer_tx_offload (xd, b0, mb0); + dpdk_buffer_tx_offload (xd, b1, mb1); + dpdk_buffer_tx_offload (xd, b2, mb2); + dpdk_buffer_tx_offload (xd, b3, mb3); + } + if (PREDICT_FALSE (or_flags & VLIB_BUFFER_RECYCLE)) { dpdk_buffer_recycle (vm, node, b0, bi0, &mb0); @@ -521,6 +561,7 @@ dpdk_interface_tx (vlib_main_t * vm, dpdk_validate_rte_mbuf (vm, b0, 1); mb0 = rte_mbuf_from_vlib_buffer (b0); + dpdk_buffer_tx_offload (xd, b0, mb0); dpdk_buffer_recycle (vm, node, b0, bi0, &mb0); if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE)) diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 55f63b37..29a2c760 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -176,6 +177,8 @@ typedef struct #define DPDK_DEVICE_FLAG_HQOS (1 << 6) #define DPDK_DEVICE_FLAG_BOND_SLAVE (1 << 7) #define DPDK_DEVICE_FLAG_BOND_SLAVE_UP (1 << 8) +#define DPDK_DEVICE_FLAG_TX_OFFLOAD (1 << 9) +#define DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM (1 << 10) u16 nb_tx_desc; CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 7ca3d358..8a708035 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -351,6 +351,11 @@ dpdk_lib_init (dpdk_main_t * dm) case VNET_DPDK_PMD_IGB: case VNET_DPDK_PMD_IXGBE: case VNET_DPDK_PMD_I40E: + xd->port_type = port_type_from_speed_capa (&dev_info); + xd->flags |= DPDK_DEVICE_FLAG_TX_OFFLOAD | + DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM; + + break; case VNET_DPDK_PMD_CXGBE: case VNET_DPDK_PMD_MLX4: case VNET_DPDK_PMD_MLX5: @@ -575,6 +580,9 @@ 
dpdk_lib_init (dpdk_main_t * dm) hi = vnet_get_hw_interface (dm->vnet_main, xd->hw_if_index); + if (xd->flags & DPDK_DEVICE_FLAG_TX_OFFLOAD) + hi->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD; + dpdk_device_setup (xd); if (vec_len (xd->errors)) diff --git a/src/vnet/buffer.h b/src/vnet/buffer.h index 52dada30..56cb07ae 100644 --- a/src/vnet/buffer.h +++ b/src/vnet/buffer.h @@ -47,9 +47,14 @@ _( 2, L4_CHECKSUM_CORRECT) \ _( 3, VLAN_2_DEEP) \ _( 4, VLAN_1_DEEP) \ + _( 8, SPAN_CLONE) \ _( 6, HANDOFF_NEXT_VALID) \ _( 7, LOCALLY_ORIGINATED) \ - _( 8, SPAN_CLONE) + _( 8, IS_IP4) \ + _( 9, IS_IP6) \ + _(10, OFFLOAD_IP_CKSUM) \ + _(11, OFFLOAD_TCP_CKSUM) \ + _(12, OFFLOAD_UDP_CKSUM) #define VNET_BUFFER_FLAGS_VLAN_BITS \ (VNET_BUFFER_F_VLAN_1_DEEP | VNET_BUFFER_F_VLAN_2_DEEP) diff --git a/src/vnet/interface.h b/src/vnet/interface.h index 9d64fc28..fb75ff34 100644 --- a/src/vnet/interface.h +++ b/src/vnet/interface.h @@ -419,6 +419,9 @@ typedef struct vnet_hw_interface_t /* rx mode flags */ #define VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE (1 << 10) + /* tx checksum offload */ +#define VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD (1 << 11) + /* Hardware address as vector. Zero (e.g. zero-length vector) if no address for this class (e.g. PPP). 
*/ u8 *hw_address; diff --git a/src/vnet/interface_output.c b/src/vnet/interface_output.c index cdf18738..c95f08b1 100644 --- a/src/vnet/interface_output.c +++ b/src/vnet/interface_output.c @@ -38,6 +38,10 @@ */ #include +#include +#include +#include +#include #include typedef struct @@ -153,14 +157,58 @@ vnet_interface_output_trace (vlib_main_t * vm, } } -uword -vnet_interface_output_node (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * frame) +static_always_inline void +calc_checksums (vlib_main_t * vm, vlib_buffer_t * b) +{ + ip4_header_t *ip4; + ip6_header_t *ip6; + tcp_header_t *th; + udp_header_t *uh; + + int is_ip4 = (b->flags & VNET_BUFFER_F_IS_IP4) != 0; + int is_ip6 = (b->flags & VNET_BUFFER_F_IS_IP6) != 0; + + ASSERT (!(is_ip4 && is_ip6)); + + ip4 = (ip4_header_t *) (b->data + vnet_buffer (b)->l3_hdr_offset); + ip6 = (ip6_header_t *) (b->data + vnet_buffer (b)->l3_hdr_offset); + th = (tcp_header_t *) (b->data + vnet_buffer (b)->l4_hdr_offset); + uh = (udp_header_t *) (b->data + vnet_buffer (b)->l4_hdr_offset); + + if (is_ip4) + { + ip4 = (ip4_header_t *) (b->data + vnet_buffer (b)->l3_hdr_offset); + if (b->flags & VNET_BUFFER_F_OFFLOAD_IP_CKSUM) + ip4->checksum = ip4_header_checksum (ip4); + if (b->flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM) + th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ip4); + if (b->flags & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM) + uh->checksum = ip4_tcp_udp_compute_checksum (vm, b, ip4); + } + if (is_ip6) + { + int bogus; + ASSERT (b->flags & VNET_BUFFER_F_OFFLOAD_IP_CKSUM); + if (b->flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM) + th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ip6, &bogus); + if (b->flags & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM) + uh->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ip6, &bogus); + } + + b->flags &= ~VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + b->flags &= ~VNET_BUFFER_F_OFFLOAD_UDP_CKSUM; + b->flags &= ~VNET_BUFFER_F_OFFLOAD_IP_CKSUM; +} + +static_always_inline uword 
+vnet_interface_output_node_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, vnet_main_t * vnm, + vnet_hw_interface_t * hi, + int do_tx_offloads) { - vnet_main_t *vnm = vnet_get_main (); vnet_interface_output_runtime_t *rt = (void *) node->runtime_data; vnet_sw_interface_t *si; - vnet_hw_interface_t *hi; u32 n_left_to_tx, *from, *from_end, *to_tx; u32 n_bytes, n_buffers, n_packets; u32 n_bytes_b0, n_bytes_b1, n_bytes_b2, n_bytes_b3; @@ -234,6 +282,7 @@ vnet_interface_output_node (vlib_main_t * vm, u32 bi0, bi1, bi2, bi3; vlib_buffer_t *b0, *b1, *b2, *b3; u32 tx_swif0, tx_swif1, tx_swif2, tx_swif3; + u32 or_flags; /* Prefetch next iteration. */ vlib_prefetch_buffer_with_index (vm, from[4], LOAD); @@ -324,6 +373,22 @@ vnet_interface_output_node (vlib_main_t * vm, thread_index, tx_swif3, 1, n_bytes_b3); } + + or_flags = b0->flags | b1->flags | b2->flags | b3->flags; + + if (do_tx_offloads) + { + if (or_flags & + (VNET_BUFFER_F_OFFLOAD_TCP_CKSUM | + VNET_BUFFER_F_OFFLOAD_UDP_CKSUM | + VNET_BUFFER_F_OFFLOAD_IP_CKSUM)) + { + calc_checksums (vm, b0); + calc_checksums (vm, b1); + calc_checksums (vm, b2); + calc_checksums (vm, b3); + } + } } while (from + 1 <= from_end && n_left_to_tx >= 1) @@ -363,6 +428,9 @@ vnet_interface_output_node (vlib_main_t * vm, thread_index, tx_swif0, 1, n_bytes_b0); } + + if (do_tx_offloads) + calc_checksums (vm, b0); } vlib_put_next_frame (vm, node, next_index, n_left_to_tx); @@ -376,6 +444,23 @@ vnet_interface_output_node (vlib_main_t * vm, return n_buffers; } +static_always_inline uword +vnet_interface_output_node (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + vnet_main_t *vnm = vnet_get_main (); + vnet_hw_interface_t *hi; + vnet_interface_output_runtime_t *rt = (void *) node->runtime_data; + hi = vnet_get_sup_hw_interface (vnm, rt->sw_if_index); + + if (hi->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD) + return vnet_interface_output_node_inline (vm, node, frame, vnm, 
hi, + /* do_tx_offloads */ 0); + else + return vnet_interface_output_node_inline (vm, node, frame, vnm, hi, + /* do_tx_offloads */ 1); +} + VLIB_NODE_FUNCTION_MULTIARCH_CLONE (vnet_interface_output_node); CLIB_MULTIARCH_SELECT_FN (vnet_interface_output_node); diff --git a/src/vnet/ip/ip4.h b/src/vnet/ip/ip4.h index 71640def..19198bcb 100644 --- a/src/vnet/ip/ip4.h +++ b/src/vnet/ip/ip4.h @@ -42,6 +42,7 @@ #include #include +#include #include typedef struct ip4_mfib_t @@ -346,7 +347,8 @@ vlib_buffer_push_ip4 (vlib_main_t * vm, vlib_buffer_t * b, ih->src_address.as_u32 = src->as_u32; ih->dst_address.as_u32 = dst->as_u32; - ih->checksum = ip4_header_checksum (ih); + ih->checksum = 0; + b->flags |= VNET_BUFFER_F_OFFLOAD_IP_CKSUM | VNET_BUFFER_F_IS_IP4; return ih; } #endif /* included_ip_ip4_h */ diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 8dd927d4..4a9e7919 100755 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -2457,7 +2457,8 @@ ip4_rewrite_inline (vlib_main_t * vm, } /* Verify checksum. */ - ASSERT (ip0->checksum == ip4_header_checksum (ip0)); + ASSERT ((ip0->checksum == ip4_header_checksum (ip0)) || + (p0->flags | VNET_BUFFER_F_OFFLOAD_IP_CKSUM)); } else { @@ -2492,7 +2493,8 @@ ip4_rewrite_inline (vlib_main_t * vm, } /* Verify checksum. 
*/ - ASSERT (ip1->checksum == ip4_header_checksum (ip1)); + ASSERT ((ip1->checksum == ip4_header_checksum (ip1)) || + (p1->flags | VNET_BUFFER_F_OFFLOAD_IP_CKSUM)); } else { @@ -2630,7 +2632,8 @@ ip4_rewrite_inline (vlib_main_t * vm, ip0->ttl = ttl0; - ASSERT (ip0->checksum == ip4_header_checksum (ip0)); + ASSERT ((ip0->checksum == ip4_header_checksum (ip0)) || + (p0->flags | VNET_BUFFER_F_OFFLOAD_IP_CKSUM)); if (PREDICT_FALSE (ttl0 <= 0)) { diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index f34eb797..35f3eba1 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1485,7 +1485,12 @@ tcp46_output_inline (vlib_main_t * vm, ip4_header_t *ih0; ih0 = vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, &tc0->c_rmt_ip4, IP_PROTOCOL_TCP); - th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih0); + b0->flags |= VNET_BUFFER_F_IS_IP4 | + VNET_BUFFER_F_OFFLOAD_IP_CKSUM | + VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data; + vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data; + th0->checksum = 0; } else { @@ -1494,8 +1499,13 @@ tcp46_output_inline (vlib_main_t * vm, ih0 = vlib_buffer_push_ip6 (vm, b0, &tc0->c_lcl_ip6, &tc0->c_rmt_ip6, IP_PROTOCOL_TCP); - th0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ih0, - &bogus); + + b0->flags |= VNET_BUFFER_F_IS_IP6 | + VNET_BUFFER_F_OFFLOAD_IP_CKSUM | + VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data; + vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data; + th0->checksum = 0; ASSERT (!bogus); } -- cgit 1.2.3-korg From 66b11318a1e5f24880e3ec77c95d70647732a4a8 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Mon, 31 Jul 2017 17:18:03 -0700 Subject: Fix tcp tx buffer allocation - Make tcp output buffer allocation macro an inline function - Use per ip version per thread tx frames for retransmits and timer events - Fix / parameterize tcp data structure preallocation - Add a couple of gdb-callable 
show commands - Fix local endpoint cleanup Change-Id: I67b47b7570aa14cb4634b6fd93c57cd2eacbfa29 Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- src/plugins/dpdk/device/cli.c | 1 + src/vlib/error.c | 2 +- src/vlib/node_funcs.h | 16 ++-- src/vnet/ip/ip4.h | 2 - src/vnet/session/session.c | 82 +++++++++++++++-- src/vnet/session/session.h | 10 ++ src/vnet/session/session_cli.c | 2 +- src/vnet/session/session_lookup.c | 40 ++++++-- src/vnet/tcp/builtin_client.c | 39 ++++++-- src/vnet/tcp/tcp.c | 52 +++++++---- src/vnet/tcp/tcp.h | 12 ++- src/vnet/tcp/tcp_input.c | 2 + src/vnet/tcp/tcp_output.c | 188 +++++++++++++++++++++++++------------- src/vnet/unix/gdb_funcs.c | 45 ++++++++- src/vppinfra/pool.h | 2 +- 15 files changed, 375 insertions(+), 120 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c index aeeb772d..fe1c41c2 100644 --- a/src/plugins/dpdk/device/cli.c +++ b/src/plugins/dpdk/device/cli.c @@ -357,6 +357,7 @@ show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input, "name=\"%s\" available = %7d allocated = %7d total = %7d\n", rmp->name, (u32) count, (u32) free_count, (u32) (count + free_count)); + rte_mempool_dump (stderr, rmp); } else { diff --git a/src/vlib/error.c b/src/vlib/error.c index e4ed4ee3..dec90bbe 100644 --- a/src/vlib/error.c +++ b/src/vlib/error.c @@ -280,7 +280,7 @@ show_errors (vlib_main_t * vm, } /* *INDENT-OFF* */ -VLIB_CLI_COMMAND (cli_show_errors, static) = { +VLIB_CLI_COMMAND (vlib_cli_show_errors) = { .path = "show errors", .short_help = "Show error counts", .function = show_errors, diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index c0389b2f..c4c06454 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -410,19 +410,21 @@ vlib_frame_t *vlib_get_frame_to_node (vlib_main_t * vm, u32 to_node_index); void vlib_put_frame_to_node (vlib_main_t * vm, u32 to_node_index, vlib_frame_t * f); -always_inline vlib_process_t * 
-vlib_get_current_process (vlib_main_t * vm) -{ - vlib_node_main_t *nm = &vm->node_main; - return vec_elt (nm->processes, nm->current_process_index); -} - always_inline uword vlib_in_process_context (vlib_main_t * vm) { return vm->node_main.current_process_index != ~0; } +always_inline vlib_process_t * +vlib_get_current_process (vlib_main_t * vm) +{ + vlib_node_main_t *nm = &vm->node_main; + if (vlib_in_process_context (vm)) + return vec_elt (nm->processes, nm->current_process_index); + return 0; +} + always_inline uword vlib_current_process (vlib_main_t * vm) { diff --git a/src/vnet/ip/ip4.h b/src/vnet/ip/ip4.h index 8f9a8e27..74faa059 100644 --- a/src/vnet/ip/ip4.h +++ b/src/vnet/ip/ip4.h @@ -354,8 +354,6 @@ vlib_buffer_push_ip4 (vlib_main_t * vm, vlib_buffer_t * b, ih->checksum = 0; b->flags |= VNET_BUFFER_F_OFFLOAD_IP_CKSUM | VNET_BUFFER_F_IS_IP4; vnet_buffer (b)->l3_hdr_offset = (u8 *) ih - b->data; - vnet_buffer (b)->l4_hdr_offset = vnet_buffer (b)->l3_hdr_offset + - sizeof (*ih); } else ih->checksum = ip4_header_checksum (ih); diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 004c7193..4ba15291 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -759,6 +759,7 @@ session_manager_main_enable (vlib_main_t * vm) session_manager_main_t *smm = &session_manager_main; vlib_thread_main_t *vtm = vlib_get_thread_main (); u32 num_threads; + u32 preallocated_sessions_per_worker; int i; num_threads = 1 /* main thread */ + vtm->n_threads; @@ -795,15 +796,35 @@ session_manager_main_enable (vlib_main_t * vm) for (i = 0; i < vec_len (smm->vpp_event_queues); i++) session_vpp_event_queue_allocate (smm, i); - /* $$$$ preallocate hack config parameter */ - for (i = 0; i < smm->preallocated_sessions; i++) + /* Preallocate sessions */ + if (num_threads == 1) { - stream_session_t *ss __attribute__ ((unused)); - pool_get_aligned (smm->sessions[0], ss, CLIB_CACHE_LINE_BYTES); + for (i = 0; i < smm->preallocated_sessions; i++) + { + 
stream_session_t *ss __attribute__ ((unused)); + pool_get_aligned (smm->sessions[0], ss, CLIB_CACHE_LINE_BYTES); + } + + for (i = 0; i < smm->preallocated_sessions; i++) + pool_put_index (smm->sessions[0], i); } + else + { + int j; + preallocated_sessions_per_worker = smm->preallocated_sessions / + (num_threads - 1); - for (i = 0; i < smm->preallocated_sessions; i++) - pool_put_index (smm->sessions[0], i); + for (j = 1; j < num_threads; j++) + { + for (i = 0; i < preallocated_sessions_per_worker; i++) + { + stream_session_t *ss __attribute__ ((unused)); + pool_get_aligned (smm->sessions[j], ss, CLIB_CACHE_LINE_BYTES); + } + for (i = 0; i < preallocated_sessions_per_worker; i++) + pool_put_index (smm->sessions[j], i); + } + } session_lookup_init (); @@ -863,6 +884,7 @@ session_config_fn (vlib_main_t * vm, unformat_input_t * input) { session_manager_main_t *smm = &session_manager_main; u32 nitems; + uword tmp; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -873,9 +895,53 @@ session_config_fn (vlib_main_t * vm, unformat_input_t * input) else clib_warning ("event queue length %d too small, ignored", nitems); } - if (unformat (input, "preallocated-sessions %d", - &smm->preallocated_sessions)) + else if (unformat (input, "preallocated-sessions %d", + &smm->preallocated_sessions)) + ; + else if (unformat (input, "v4-session-table-buckets %d", + &smm->configured_v4_session_table_buckets)) ; + else if (unformat (input, "v4-halfopen-table-buckets %d", + &smm->configured_v4_halfopen_table_buckets)) + ; + else if (unformat (input, "v6-session-table-buckets %d", + &smm->configured_v6_session_table_buckets)) + ; + else if (unformat (input, "v6-halfopen-table-buckets %d", + &smm->configured_v6_halfopen_table_buckets)) + ; + else if (unformat (input, "v4-session-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + smm->configured_v4_session_table_memory 
= tmp; + } + else if (unformat (input, "v4-halfopen-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + smm->configured_v4_halfopen_table_memory = tmp; + } + else if (unformat (input, "v6-session-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + smm->configured_v6_session_table_memory = tmp; + } + else if (unformat (input, "v6-halfopen-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + smm->configured_v6_halfopen_table_memory = tmp; + } else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 180b9f8a..538433da 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -133,6 +133,16 @@ struct _session_manager_main /** vpp fifo event queue configured length */ u32 configured_event_queue_length; + /** session table size parameters */ + u32 configured_v4_session_table_buckets; + u32 configured_v4_session_table_memory; + u32 configured_v4_halfopen_table_buckets; + u32 configured_v4_halfopen_table_memory; + u32 configured_v6_session_table_buckets; + u32 configured_v6_session_table_memory; + u32 configured_v6_halfopen_table_buckets; + u32 configured_v6_halfopen_table_memory; + /** Unique segment name counter */ u32 unique_segment_name_counter; diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index de564ea7..9f3d217c 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -312,7 +312,7 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, } /* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_session_command, static) = +VLIB_CLI_COMMAND 
(vlib_cli_show_session_command) = { .path = "show session", .short_help = "show session [verbose]", diff --git a/src/vnet/session/session_lookup.c b/src/vnet/session/session_lookup.c index 1ce22f80..41f9dbf0 100644 --- a/src/vnet/session/session_lookup.c +++ b/src/vnet/session/session_lookup.c @@ -569,23 +569,45 @@ stream_session_lookup_transport6 (ip6_address_t * lcl, ip6_address_t * rmt, return 0; } +#define foreach_hash_table_parameter \ + _(v4,session,buckets,20000) \ + _(v4,session,memory,(64<<20)) \ + _(v6,session,buckets,20000) \ + _(v6,session,memory,(64<<20)) \ + _(v4,halfopen,buckets,20000) \ + _(v4,halfopen,memory,(64<<20)) \ + _(v6,halfopen,buckets,20000) \ + _(v6,halfopen,memory,(64<<20)) + void session_lookup_init (void) { session_lookup_t *sl = &session_lookup; + +#define _(af,table,parm,value) \ + u32 configured_##af##_##table##_table_##parm = value; + foreach_hash_table_parameter; +#undef _ + +#define _(af,table,parm,value) \ + if (session_manager_main.configured_##af##_##table##_table_##parm) \ + configured_##af##_##table##_table_##parm = \ + session_manager_main.configured_##af##_##table##_table_##parm; + foreach_hash_table_parameter; +#undef _ + clib_bihash_init_16_8 (&sl->v4_session_hash, "v4 session table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); + configured_v4_session_table_buckets, + configured_v4_session_table_memory); clib_bihash_init_48_8 (&sl->v6_session_hash, "v6 session table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); - + configured_v6_session_table_buckets, + configured_v6_session_table_memory); clib_bihash_init_16_8 (&sl->v4_half_open_hash, "v4 half-open table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); + configured_v4_halfopen_table_buckets, + configured_v4_halfopen_table_memory); clib_bihash_init_48_8 (&sl->v6_half_open_hash, "v6 half-open table", - 
200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); + configured_v6_halfopen_table_buckets, + configured_v6_halfopen_table_memory); } /* diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 27e20f8e..48daffb4 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -597,8 +597,9 @@ clients_connect (vlib_main_t * vm, u8 * uri, u32 n_clients) a->mp = 0; vnet_connect_uri (a); - /* Crude pacing for call setups, 100k/sec */ - vlib_process_suspend (vm, 10e-6); + /* Crude pacing for call setups */ + if ((i % 4) == 0) + vlib_process_suspend (vm, 10e-6); } } @@ -612,8 +613,10 @@ test_tcp_clients_command_fn (vlib_main_t * vm, uword *event_data = 0, event_type; u8 *default_connect_uri = (u8 *) "tcp://6.0.1.1/1234", *uri; u64 tmp, total_bytes; - f64 cli_timeout = 20.0, delta; + f64 test_timeout = 20.0, syn_timeout = 20.0, delta; + f64 time_before_connects; u32 n_clients = 1; + int preallocate_sessions = 0; char *transfer_type; int i; @@ -640,7 +643,9 @@ test_tcp_clients_command_fn (vlib_main_t * vm, ; else if (unformat (input, "uri %s", &tm->connect_uri)) ; - else if (unformat (input, "cli-timeout %f", &cli_timeout)) + else if (unformat (input, "test-timeout %f", &test_timeout)) + ; + else if (unformat (input, "syn-timeout %f", &syn_timeout)) ; else if (unformat (input, "no-return")) tm->no_return = 1; @@ -657,6 +662,8 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->private_segment_size = tmp; else if (unformat (input, "preallocate-fifos")) tm->prealloc_fifos = 1; + else if (unformat (input, "preallocate-sessions")) + preallocate_sessions = 1; else if (unformat (input, "client-batch %d", &tm->connections_per_batch)) ; @@ -674,6 +681,7 @@ test_tcp_clients_command_fn (vlib_main_t * vm, return clib_error_return (0, "failed init"); } + tm->ready_connections = 0; tm->expected_connections = n_clients; tm->rx_total = 0; @@ -705,11 +713,21 @@ test_tcp_clients_command_fn 
(vlib_main_t * vm, vlib_node_set_state (vlib_mains[i], builtin_client_node.index, VLIB_NODE_STATE_POLLING); + if (preallocate_sessions) + { + session_t *sp __attribute__ ((unused)); + for (i = 0; i < n_clients; i++) + pool_get (tm->sessions, sp); + for (i = 0; i < n_clients; i++) + pool_put_index (tm->sessions, i); + } + /* Fire off connect requests */ + time_before_connects = vlib_time_now (vm); clients_connect (vm, uri, n_clients); /* Park until the sessions come up, or ten seconds elapse... */ - vlib_process_wait_for_event_or_clock (vm, 10 /* timeout, seconds */ ); + vlib_process_wait_for_event_or_clock (vm, syn_timeout); event_type = vlib_process_get_events (vm, &event_data); switch (event_type) { @@ -719,6 +737,15 @@ test_tcp_clients_command_fn (vlib_main_t * vm, goto cleanup; case 1: + delta = vlib_time_now (vm) - time_before_connects; + + if (delta != 0.0) + { + vlib_cli_output + (vm, "%d three-way handshakes in %.2f seconds, %.2f/sec", + n_clients, delta, ((f64) n_clients) / delta); + } + tm->test_start_time = vlib_time_now (tm->vlib_main); vlib_cli_output (vm, "Test started at %.6f", tm->test_start_time); break; @@ -729,7 +756,7 @@ test_tcp_clients_command_fn (vlib_main_t * vm, } /* Now wait for the sessions to finish... 
*/ - vlib_process_wait_for_event_or_clock (vm, cli_timeout); + vlib_process_wait_for_event_or_clock (vm, test_timeout); event_type = vlib_process_get_events (vm, &event_data); switch (event_type) { diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 59b20747..8e2eb9f4 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -173,7 +173,7 @@ tcp_connection_cleanup (tcp_connection_t * tc) /* Cleanup local endpoint if this was an active connect */ tepi = transport_endpoint_lookup (&tm->local_endpoints_table, &tc->c_lcl_ip, - tc->c_lcl_port); + clib_net_to_host_u16 (tc->c_lcl_port)); if (tepi != TRANSPORT_ENDPOINT_INVALID_INDEX) { tep = pool_elt_at_index (tm->local_endpoints, tepi); @@ -367,25 +367,24 @@ tcp_allocate_local_port (ip46_address_t * ip) { tcp_main_t *tm = vnet_get_tcp_main (); transport_endpoint_t *tep; - u32 time_now, tei; + u32 tei; u16 min = 1024, max = 65535; /* XXX configurable ? */ - int tries; + int tries, limit; - tries = max - min; - time_now = tcp_time_now (); + limit = max - min; /* Only support active opens from thread 0 */ ASSERT (vlib_get_thread_index () == 0); /* Search for first free slot */ - for (; tries >= 0; tries--) + for (tries = 0; tries < limit; tries++) { u16 port = 0; /* Find a port in the specified range */ while (1) { - port = random_u32 (&time_now) & PORT_MASK; + port = random_u32 (&tm->port_allocator_seed) & PORT_MASK; if (PREDICT_TRUE (port >= min && port < max)) break; } @@ -1189,8 +1188,9 @@ tcp_main_enable (vlib_main_t * vm) vlib_thread_main_t *vtm = vlib_get_thread_main (); clib_error_t *error = 0; u32 num_threads; - int thread, i; + int i, thread; tcp_connection_t *tc __attribute__ ((unused)); + u32 preallocated_connections_per_thread; if ((error = vlib_call_init_function (vm, ip_main_init))) return error; @@ -1224,14 +1224,26 @@ tcp_main_enable (vlib_main_t * vm) vec_validate (tm->connections, num_threads - 1); /* - * Preallocate connections + * Preallocate connections. 
Assume that thread 0 won't + * use preallocated threads when running multi-core */ - for (thread = 0; thread < num_threads; thread++) + if (num_threads == 1) { - for (i = 0; i < tm->preallocated_connections; i++) + thread = 0; + preallocated_connections_per_thread = tm->preallocated_connections; + } + else + { + thread = 1; + preallocated_connections_per_thread = + tm->preallocated_connections / (num_threads - 1); + } + for (; thread < num_threads; thread++) + { + for (i = 0; i < preallocated_connections_per_thread; i++) pool_get (tm->connections[thread], tc); - for (i = 0; i < tm->preallocated_connections; i++) + for (i = 0; i < preallocated_connections_per_thread; i++) pool_put_index (tm->connections[thread], i); } @@ -1257,13 +1269,21 @@ tcp_main_enable (vlib_main_t * vm) / TCP_TSTAMP_RESOLUTION; clib_bihash_init_24_8 (&tm->local_endpoints_table, "local endpoint table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); + 1000000 /* $$$$ config parameter nbuckets */ , + (512 << 20) /*$$$ config parameter table size */ ); + + /* Initialize [port-allocator] random number seed */ + tm->port_allocator_seed = (u32) clib_cpu_time_now (); + if (num_threads > 1) { clib_spinlock_init (&tm->half_open_lock); clib_spinlock_init (&tm->local_endpoints_lock); } + + vec_validate (tm->tx_frames[0], num_threads - 1); + vec_validate (tm->tx_frames[1], num_threads - 1); + return error; } @@ -1289,16 +1309,12 @@ clib_error_t * tcp_init (vlib_main_t * vm) { tcp_main_t *tm = vnet_get_tcp_main (); - - tm->vnet_main = vnet_get_main (); tm->is_enabled = 0; - return 0; } VLIB_INIT_FUNCTION (tcp_init); - static clib_error_t * tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) { diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 4fa681f8..997df76f 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -369,6 +369,8 @@ typedef struct _tcp_main /** per-worker tx buffer free lists */ u32 **tx_buffers; + /** per-worker tx 
frames to 4/6 output nodes */ + vlib_frame_t **tx_frames[2]; /* Per worker-thread timer wheel for connections timers */ tw_timer_wheel_16t_2w_512sl_t *timer_wheels; @@ -400,11 +402,8 @@ typedef struct _tcp_main u32 last_v6_address_rotor; ip6_address_t *ip6_src_addresses; - /* convenience */ - vlib_main_t *vlib_main; - vnet_main_t *vnet_main; - ip4_main_t *ip4_main; - ip6_main_t *ip6_main; + /** Port allocator random number generator seed */ + u32 port_allocator_seed; } tcp_main_t; extern tcp_main_t tcp_main; @@ -493,6 +492,8 @@ void tcp_send_fin (tcp_connection_t * tc); void tcp_init_mss (tcp_connection_t * tc); void tcp_update_snd_mss (tcp_connection_t * tc); void tcp_update_rto (tcp_connection_t * tc); +void tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4); +void tcp_flush_frames_to_output (u8 thread_index); always_inline u32 tcp_end_seq (tcp_header_t * th, u32 len) @@ -614,6 +615,7 @@ tcp_update_time (f64 now, u32 thread_index) { tw_timer_expire_timers_16t_2w_512sl (&tcp_main.timer_wheels[thread_index], now); + tcp_flush_frames_to_output (thread_index); } u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 6c59d70f..29f4f08d 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1751,6 +1751,8 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, errors = session_manager_flush_enqueue_events (my_thread_index); tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors); + tcp_flush_frame_to_output (vm, my_thread_index, is_ip4); + return from_frame->n_vectors; } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index ad13493a..f8fbb8a9 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -436,34 +436,41 @@ tcp_init_mss (tcp_connection_t * tc) tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; } -#define tcp_get_free_buffer_index(tm, bidx) \ -do { \ - u32 
*my_tx_buffers, n_free_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ - if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ - { \ - n_free_buffers = 32; /* TODO config or macro */ \ - vec_validate (my_tx_buffers, n_free_buffers - 1); \ - _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ - vlib_get_main(), my_tx_buffers, n_free_buffers, \ - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[thread_index] = my_tx_buffers; \ - } \ - /* buffer shortage */ \ - if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ - return; \ - *bidx = my_tx_buffers[_vec_len (my_tx_buffers)-1]; \ - _vec_len (my_tx_buffers) -= 1; \ -} while (0) - -#define tcp_return_buffer(tm) \ -do { \ - u32 *my_tx_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ - _vec_len (my_tx_buffers) +=1; \ -} while (0) +always_inline int +tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx) +{ + u32 *my_tx_buffers, n_free_buffers; + u32 thread_index = vlib_get_thread_index (); + my_tx_buffers = tm->tx_buffers[thread_index]; + if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) + { + n_free_buffers = VLIB_FRAME_SIZE; + vec_validate (my_tx_buffers, n_free_buffers - 1); + _vec_len (my_tx_buffers) = + vlib_buffer_alloc_from_free_list (vlib_get_main (), my_tx_buffers, + n_free_buffers, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + /* buffer shortage, report failure */ + if (vec_len (my_tx_buffers) == 0) + { + clib_warning ("out of buffers"); + return -1; + } + tm->tx_buffers[thread_index] = my_tx_buffers; + } + *bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1]; + _vec_len (my_tx_buffers) -= 1; + return 0; +} + +always_inline void +tcp_return_buffer (tcp_main_t * tm) +{ + u32 *my_tx_buffers; + u32 thread_index = vlib_get_thread_index (); + my_tx_buffers = tm->tx_buffers[thread_index]; + _vec_len (my_tx_buffers) += 1; +} always_inline void tcp_reuse_buffer (vlib_main_t * vm, 
vlib_buffer_t * b) @@ -706,7 +713,9 @@ tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) ip4_header_t *ih4, *pkt_ih4; ip6_header_t *ih6, *pkt_ih6; - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); /* Leave enough space for headers */ @@ -811,7 +820,9 @@ tcp_send_syn (tcp_connection_t * tc) u16 initial_wnd; tcp_options_t snd_opts; - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); /* Leave enough space for headers */ @@ -854,8 +865,11 @@ tcp_send_syn (tcp_connection_t * tc) } always_inline void -tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) +tcp_enqueue_to_output_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4, u8 flush) { + tcp_main_t *tm = vnet_get_tcp_main (); + u32 thread_index = vlib_get_thread_index (); u32 *to_next, next_index; vlib_frame_t *f; @@ -872,12 +886,62 @@ tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) b->pre_data[1] = next_index; } - /* Enqueue the packet */ - f = vlib_get_frame_to_node (vm, next_index); + /* Get frame to v4/6 output node */ + f = tm->tx_frames[!is_ip4][thread_index]; + if (!f) + { + f = vlib_get_frame_to_node (vm, next_index); + ASSERT (f); + tm->tx_frames[!is_ip4][thread_index] = f; + } to_next = vlib_frame_vector_args (f); - to_next[0] = bi; - f->n_vectors = 1; - vlib_put_frame_to_node (vm, next_index, f); + to_next[f->n_vectors] = bi; + f->n_vectors += 1; + if (flush || f->n_vectors == VLIB_FRAME_SIZE) + { + vlib_put_frame_to_node (vm, next_index, f); + tm->tx_frames[!is_ip4][thread_index] = 0; + } +} + +always_inline void +tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) +{ + tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 0); +} + +always_inline void +tcp_enqueue_to_output_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 
is_ip4) +{ + tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 1); +} + +/** + * Flush tx frame populated by retransmits and timer pops + */ +void +tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4) +{ + if (tcp_main.tx_frames[!is_ip4][thread_index]) + { + u32 next_index; + next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; + vlib_put_frame_to_node (vm, next_index, + tcp_main.tx_frames[!is_ip4][thread_index]); + tcp_main.tx_frames[!is_ip4][thread_index] = 0; + } +} + +/** + * Flush both v4 and v6 tx frames for thread index + */ +void +tcp_flush_frames_to_output (u8 thread_index) +{ + vlib_main_t *vm = vlib_get_main (); + tcp_flush_frame_to_output (vm, thread_index, 1); + tcp_flush_frame_to_output (vm, thread_index, 0); } /** @@ -891,14 +955,15 @@ tcp_send_fin (tcp_connection_t * tc) tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; b = vlib_get_buffer (vm, bi); /* Leave enough space for headers */ vlib_buffer_make_headroom (b, MAX_HDRS_LEN); tcp_make_fin (tc, b); - tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); @@ -981,7 +1046,8 @@ tcp_send_ack (tcp_connection_t * tc) u32 bi; /* Get buffer */ - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; b = vlib_get_buffer (vm, bi); /* Fill in the ACK */ @@ -1108,7 +1174,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Go back to first un-acked byte */ tc->snd_nxt = tc->snd_una; - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); if (tc->state >= TCP_STATE_ESTABLISHED) @@ -1116,6 +1184,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) 
/* Lost FIN, retransmit and return */ if (tc->flags & TCP_CONN_FINSNT) { + tcp_return_buffer (tm); tcp_send_fin (tc); return; } @@ -1143,6 +1212,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tcp_retransmit_timer_set (tc); ASSERT (0 || (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion)); + tcp_return_buffer (tm); return; } @@ -1164,6 +1234,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) clib_warning ("could not remove half-open connection"); ASSERT (0); } + tcp_return_buffer (tm); return; } @@ -1185,6 +1256,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { ASSERT (tc->state == TCP_STATE_CLOSED); clib_warning ("connection closed ..."); + tcp_return_buffer (tm); return; } @@ -1254,7 +1326,9 @@ tcp_timer_persist_handler (u32 index) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); /* Try to force the first unsent segment */ - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); @@ -1300,7 +1374,9 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) tc->snd_nxt = tc->snd_una; /* Get buffer */ - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); @@ -1344,9 +1420,10 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); while (hole && snd_space > 0) { - tcp_get_free_buffer_index (tm, &bi); - b = vlib_get_buffer (vm, bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); hole = scoreboard_next_rxt_hole (sb, hole, tcp_fastrecovery_sent_1_smss (tc), &can_rescue, &snd_limited); @@ -1414,9 +1491,9 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc) while (snd_space > 0) { - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + 
return; b = vlib_get_buffer (vm, bi); - offset += n_written; n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); @@ -1506,32 +1583,21 @@ tcp46_output_inline (vlib_main_t * vm, if (is_ip4) { - ip4_header_t *ih0; - ih0 = vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, - &tc0->c_rmt_ip4, IP_PROTOCOL_TCP, - 1); - b0->flags |= - VNET_BUFFER_F_IS_IP4 | VNET_BUFFER_F_OFFLOAD_IP_CKSUM | - VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; - vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data; + vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, &tc0->c_rmt_ip4, + IP_PROTOCOL_TCP, 1); + b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data; th0->checksum = 0; } else { ip6_header_t *ih0; - int bogus = ~0; - ih0 = vlib_buffer_push_ip6 (vm, b0, &tc0->c_lcl_ip6, &tc0->c_rmt_ip6, IP_PROTOCOL_TCP); - - b0->flags |= VNET_BUFFER_F_IS_IP6 | - VNET_BUFFER_F_OFFLOAD_IP_CKSUM | - VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data; vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data; th0->checksum = 0; - ASSERT (!bogus); } /* Filter out DUPACKs if there are no OOO segments left */ diff --git a/src/vnet/unix/gdb_funcs.c b/src/vnet/unix/gdb_funcs.c index cca2e420..32e22d92 100644 --- a/src/vnet/unix/gdb_funcs.c +++ b/src/vnet/unix/gdb_funcs.c @@ -21,7 +21,7 @@ #include #include - +#include /** * @brief GDB callable function: vl - Return vector length of vector @@ -135,6 +135,47 @@ void vlib_runtime_index_to_node_name (u32 index) fformat(stderr, "node runtime index %d name %s\n", index, nm->nodes[index]->name); } +void gdb_show_errors (int verbose) +{ + extern vlib_cli_command_t vlib_cli_show_errors; + unformat_input_t input; + vlib_main_t * vm = vlib_get_main(); + + if (verbose == 0) + unformat_init_string (&input, "verbose 0", 9); + else if (verbose == 1) + unformat_init_string (&input, "verbose 1", 9); + else + { + fformat(stderr, "verbose not 0 or 
1\n"); + return; + } + + vlib_cli_show_errors.function (vm, &input, 0 /* cmd */); + unformat_free (&input); +} + +void gdb_show_session (int verbose) +{ + extern vlib_cli_command_t vlib_cli_show_session_command; + unformat_input_t input; + vlib_main_t * vm = vlib_get_main(); + + if (verbose == 0) + unformat_init_string (&input, "verbose 0", 9); + else if (verbose == 1) + unformat_init_string (&input, "verbose 1", 9); + else if (verbose == 2) + unformat_init_string (&input, "verbose 2", 9); + else + { + fformat(stderr, "verbose not 0 - 2\n"); + return; + } + + vlib_cli_show_session_command.function (vm, &input, 0 /* cmd */); + unformat_free (&input); +} /** * @brief GDB callable function: show_gdb_command_fn - show gdb @@ -151,6 +192,8 @@ show_gdb_command_fn (vlib_main_t * vm, vlib_cli_output (vm, "vl(p) returns vec_len(p)"); vlib_cli_output (vm, "pe(p) returns pool_elts(p)"); vlib_cli_output (vm, "pifi(p, i) returns pool_is_free_index(p, i)"); + vlib_cli_output (vm, "gdb_show_errors(0|1) dumps error counters"); + vlib_cli_output (vm, "gdb_show_session dumps session counters"); vlib_cli_output (vm, "debug_hex_bytes (ptr, n_bytes) dumps n_bytes in hex"); vlib_cli_output (vm, "vlib_dump_frame_ownership() does what it says"); vlib_cli_output (vm, "vlib_runtime_index_to_node_name (index) prints NN"); diff --git a/src/vppinfra/pool.h b/src/vppinfra/pool.h index 57838e1c..56536b77 100644 --- a/src/vppinfra/pool.h +++ b/src/vppinfra/pool.h @@ -200,7 +200,7 @@ do { \ #define pool_get(P,E) pool_get_aligned(P,E,0) /** See if pool_get will expand the pool or not */ -#define pool_get_aligned_will_expand (P,YESNO,A) \ +#define pool_get_aligned_will_expand(P,YESNO,A) \ do { \ pool_header_t * _pool_var (p) = pool_header (P); \ uword _pool_var (l); \ -- cgit 1.2.3-korg From adcf45217d08d66d9c85dbada54c6f1f401c7ef2 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 22 Aug 2017 13:51:31 +0200 Subject: dpdk: fix Cavium ThunderX support Change-Id: 
Id5812448917ce52984d5a778d5b304c448a752e7 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/format.c | 4 ++++ src/plugins/dpdk/device/init.c | 1 + 2 files changed, 5 insertions(+) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c index 403d7204..cfe8851f 100644 --- a/src/plugins/dpdk/device/format.c +++ b/src/plugins/dpdk/device/format.c @@ -314,6 +314,10 @@ format_dpdk_device_type (u8 * s, va_list * args) dev_type = "Virtio User"; break; + case VNET_DPDK_PMD_THUNDERX: + dev_type = "Cavium ThunderX"; + break; + default: case VNET_DPDK_PMD_UNKNOWN: dev_type = "### UNKNOWN ###"; diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 8a708035..c6c9ee34 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -372,6 +372,7 @@ dpdk_lib_init (dpdk_main_t * dm) case VNET_DPDK_PMD_THUNDERX: xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF; + xd->port_conf.rxmode.hw_strip_crc = 1; break; case VNET_DPDK_PMD_DPAA2: -- cgit 1.2.3-korg From 57d963f88b2c99e698e2b29f72e190f47f41b1ad Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 20 Jul 2017 19:17:06 +0200 Subject: Make VPP runtime directory configurable New startup config command: unix { runtime-dir /run/vpp } Also, adds recursive mkdir function for use in different places like cli-config socket path and dpdk hugepage directory path.
Change-Id: I1446ceab9c220c25804e73a743a3ebb383450124 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/init.c | 29 +++++++++++++---------------- src/plugins/memif/memif.c | 14 +++++++++----- src/plugins/memif/private.h | 1 - src/vlib/unix/cli.c | 20 ++++++++++++-------- src/vlib/unix/main.c | 13 ++++++++++++- src/vlib/unix/unix.h | 14 ++++++++++---- src/vlib/unix/util.c | 36 +++++++++++++++++++++++++++++------- src/vpp/vnet/main.c | 5 +++++ 8 files changed, 90 insertions(+), 42 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index c6c9ee34..6f7e168b 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -37,8 +37,6 @@ dpdk_main_t dpdk_main; #define LINK_STATE_ELOGS 0 -#define DEFAULT_HUGE_DIR (VPP_RUN_DIR "/hugepages") - /* Port configuration, mildly modified Intel app values */ static struct rte_eth_conf port_conf_template = { @@ -835,6 +833,10 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) u8 huge_dir = 0; u8 file_prefix = 0; u8 *socket_mem = 0; + u8 *huge_dir_path = 0; + + huge_dir_path = + format (0, "%s/hugepages%c", vlib_unix_get_runtime_dir (), 0); conf->device_config_index_by_pci_addr = hash_create (0, sizeof (uword)); log_level = RTE_LOG_NOTICE; @@ -980,7 +982,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) u8 less_than_1g = 1; int rv; - umount (DEFAULT_HUGE_DIR); + umount ((char *) huge_dir_path); /* Process "socket-mem" parameter value */ if (vec_len (socket_mem)) @@ -1057,27 +1059,20 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) vec_free (mem_by_socket); - /* Make sure VPP_RUN_DIR exists */ - error = unix_make_vpp_run_dir (); + error = vlib_unix_recursive_mkdir ((char *) huge_dir_path); if (error) - goto done; - - rv = mkdir (DEFAULT_HUGE_DIR, 0755); - if (rv && errno != EEXIST) { - error = clib_error_return (0, "mkdir '%s' failed errno %d", - DEFAULT_HUGE_DIR, errno); goto done; } if (use_1g && 
!(less_than_1g && use_2m)) { - rv = - mount ("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, "pagesize=1G"); + rv = mount ("none", (char *) huge_dir_path, "hugetlbfs", 0, + "pagesize=1G"); } else if (use_2m) { - rv = mount ("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, NULL); + rv = mount ("none", (char *) huge_dir_path, "hugetlbfs", 0, NULL); } else { @@ -1092,7 +1087,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) tmp = format (0, "--huge-dir%c", 0); vec_add1 (conf->eal_init_args, tmp); - tmp = format (0, "%s%c", DEFAULT_HUGE_DIR, 0); + tmp = format (0, "%s%c", huge_dir_path, 0); vec_add1 (conf->eal_init_args, tmp); if (!file_prefix) { @@ -1209,7 +1204,9 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) (char **) conf->eal_init_args); /* lazy umount hugepages */ - umount2 (DEFAULT_HUGE_DIR, MNT_DETACH); + umount2 ((char *) huge_dir_path, MNT_DETACH); + rmdir ((char *) huge_dir_path); + vec_free (huge_dir_path); if (ret < 0) return clib_error_return (0, "rte_eal_init returned %d", ret); diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c index ba123149..af81faf2 100644 --- a/src/plugins/memif/memif.c +++ b/src/plugins/memif/memif.c @@ -556,15 +556,19 @@ memif_create_if (vlib_main_t * vm, memif_create_if_args_t * args) if (args->socket_filename == 0 || args->socket_filename[0] != '/') { - rv = mkdir (MEMIF_DEFAULT_SOCKET_DIR, 0755); - if (rv && errno != EEXIST) - return VNET_API_ERROR_SYSCALL_ERROR_1; + clib_error_t *error; + error = vlib_unix_recursive_mkdir (vlib_unix_get_runtime_dir ()); + if (error) + { + clib_error_free (error); + return VNET_API_ERROR_SYSCALL_ERROR_1; + } if (args->socket_filename == 0) - socket_filename = format (0, "%s/%s%c", MEMIF_DEFAULT_SOCKET_DIR, + socket_filename = format (0, "%s/%s%c", vlib_unix_get_runtime_dir (), MEMIF_DEFAULT_SOCKET_FILENAME, 0); else - socket_filename = format (0, "%s/%s%c", MEMIF_DEFAULT_SOCKET_DIR, + socket_filename = format (0, "%s/%s%c", vlib_unix_get_runtime_dir (), 
args->socket_filename, 0); } diff --git a/src/plugins/memif/private.h b/src/plugins/memif/private.h index 0f82f1e9..985ac5ec 100644 --- a/src/plugins/memif/private.h +++ b/src/plugins/memif/private.h @@ -17,7 +17,6 @@ #include -#define MEMIF_DEFAULT_SOCKET_DIR "/run/vpp" #define MEMIF_DEFAULT_SOCKET_FILENAME "memif.sock" #define MEMIF_DEFAULT_RING_SIZE 1024 #define MEMIF_DEFAULT_RX_QUEUES 1 diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index 1befa25d..068a4e16 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -2642,15 +2642,19 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) /* CLI listen. */ unix_file_t template = { 0 }; - /* If our listen address looks like a path and it starts with - * VPP_RUN_DIR, go make sure VPP_RUN_DIR exists before trying to open - * a socket in it. - */ - if (strncmp (s->config, VPP_RUN_DIR "/", strlen (VPP_RUN_DIR) + 1) == 0) + /* mkdir of file socketu, only under /run */ + if (strncmp (s->config, "/run", 4) == 0) { - error = unix_make_vpp_run_dir (); - if (error) - return error; + u8 *tmp = format (0, "%s", s->config); + int i = vec_len (tmp); + while (i && tmp[--i] != '/') + ; + + tmp[i] = 0; + + if (i) + vlib_unix_recursive_mkdir ((char *) tmp); + vec_free (tmp); } s->flags = SOCKET_IS_SERVER | /* listen, don't connect */ diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index ad1a7c3c..cb34a89f 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -56,6 +56,8 @@ /** Default CLI history depth if not configured in startup.conf */ #define UNIX_CLI_DEFAULT_HISTORY 50 +char *vlib_default_runtime_dir __attribute__ ((weak)); +char *vlib_default_runtime_dir = "/run/vlib"; unix_main_t unix_main; @@ -332,6 +334,8 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) else if (unformat (input, "cli-listen %s", &um->cli_listen_socket.config)) ; + else if (unformat (input, "runtime-dir %s", &um->runtime_dir)) + ; else if (unformat (input, "cli-line-mode")) um->cli_line_mode = 1; else if 
(unformat (input, "cli-no-banner")) @@ -432,6 +436,9 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) } um->unix_config_complete = 1; + if (um->runtime_dir == 0) + um->runtime_dir = format (0, "%s%c", vlib_default_runtime_dir, 0); + return 0; } @@ -463,6 +470,10 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) * Ask the Linux kernel to dump all memory-mapped address regions, instead * of just text+data+bss. * + * @cfgcmd{runtime-dir} + * Define directory where VPP is going to store all runtime files. + * Default is /run/vpp. + * * @cfgcmd{cli-listen, <address:port>} * Bind the CLI to listen at the address and port given. @clocalhost * on TCP port @c 5002, given as cli-listen localhost:5002, @@ -489,7 +500,7 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) * Limit pager buffer to @c nn lines of output. * A value of @c 0 disables the pager. Default value: @c 100000 ?*/ -VLIB_CONFIG_FUNCTION (unix_config, "unix"); +VLIB_EARLY_CONFIG_FUNCTION (unix_config, "unix"); static clib_error_t * unix_exit (vlib_main_t * vm) diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index ffa92bba..ee1312e3 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -44,9 +44,6 @@ #include -/** VPP runtime ephemeral directory. Typically stored in a tmpfs. 
*/ -#define VPP_RUN_DIR "/run/vpp" - struct unix_file; typedef clib_error_t *(unix_file_function_t) (struct unix_file * f); @@ -106,6 +103,9 @@ typedef struct /* startup-config filename */ u8 *startup_config_filename; + /* runtime directory path */ + u8 *runtime_dir; + /* unix config complete */ volatile int unix_config_complete; @@ -214,6 +214,12 @@ vlib_unix_get_main (void) return &unix_main; } +static inline char * +vlib_unix_get_runtime_dir (void) +{ + return (char *) unix_main.runtime_dir; +} + /* thread stack array; vec_len = max number of threads */ extern u8 **vlib_thread_stacks; @@ -233,7 +239,7 @@ clib_error_t *foreach_directory_file (char *dir_name, u8 * file_name), void *arg, int scan_dirs); -clib_error_t *unix_make_vpp_run_dir (void); +clib_error_t *vlib_unix_recursive_mkdir (char *path); #endif /* included_unix_unix_h */ diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c index 51b4a4ed..93aeb99c 100644 --- a/src/vlib/unix/util.c +++ b/src/vlib/unix/util.c @@ -223,16 +223,38 @@ done: } clib_error_t * -unix_make_vpp_run_dir (void) +vlib_unix_recursive_mkdir (char *path) { - int rv; + clib_error_t *error = 0; + char *c = 0; + int i = 0; - rv = mkdir (VPP_RUN_DIR, 0755); - if (rv && errno != EEXIST) - return clib_error_return (0, "mkdir '%s' failed errno %d", - VPP_RUN_DIR, errno); + while (path[i] != 0) + { + if (c && path[i] == '/') + { + vec_add1 (c, 0); + if ((mkdir (c, 0755)) && (errno != EEXIST)) + { + error = clib_error_return_unix (0, "mkdir '%s'", c); + goto done; + } + _vec_len (c)--; + } + vec_add1 (c, path[i]); + i++; + } - return 0; + if ((mkdir (path, 0755)) && (errno != EEXIST)) + { + error = clib_error_return_unix (0, "mkdir '%s'", path); + goto done; + } + +done: + vec_free (c); + + return error; } /* diff --git a/src/vpp/vnet/main.c b/src/vpp/vnet/main.c index ade32aa1..9fe65fe2 100644 --- a/src/vpp/vnet/main.c +++ b/src/vpp/vnet/main.c @@ -41,6 +41,11 @@ vpe_main_init (vlib_main_t * vm) vat_plugin_hash_create (); } +/* + * Default 
path for runtime data + */ +char *vlib_default_runtime_dir = "/run/vpp"; + /* * Load plugins from /usr/lib/vpp_plugins by default */ -- cgit 1.2.3-korg From acdc306093aaea2633cf765307d6cb7c1b80081c Mon Sep 17 00:00:00 2001 From: Sergio Gonzalez Monroy Date: Thu, 24 Aug 2017 14:09:17 +0100 Subject: dpdk: required changes for 17.08 DPDK 17.08 breaks ethdev and cryptodev APIs. Address those changes while keeping backwards compatibility for DPDK 17.02 and 17.05. Change-Id: Idd6ac264d0d047fe586c41d4c4ca74e8fc778a54 Signed-off-by: Sergio Gonzalez Monroy --- Makefile | 6 +- dpdk/Makefile | 47 +++++--- src/configure.ac | 70 ++++++++++-- src/plugins/dpdk.am | 16 ++- src/plugins/dpdk/device/common.c | 28 ++++- src/plugins/dpdk/device/dpdk.h | 7 ++ src/plugins/dpdk/ipsec/cli.c | 15 +++ src/plugins/dpdk/ipsec/esp.h | 212 ++++++++++++++++++++++++++++++----- src/plugins/dpdk/ipsec/esp_decrypt.c | 140 ++++++++++------------- src/plugins/dpdk/ipsec/esp_encrypt.c | 106 +++++++----------- src/plugins/dpdk/ipsec/ipsec.c | 130 +++++++++++++++++---- src/plugins/dpdk/ipsec/ipsec.h | 33 ++++-- 12 files changed, 567 insertions(+), 243 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/Makefile b/Makefile index c1a7cbb5..6ac6f6e7 100644 --- a/Makefile +++ b/Makefile @@ -60,7 +60,7 @@ endif DEB_DEPENDS = curl build-essential autoconf automake bison libssl-dev ccache DEB_DEPENDS += debhelper dkms git libtool libapr1-dev dh-systemd DEB_DEPENDS += libconfuse-dev git-review exuberant-ctags cscope pkg-config -DEB_DEPENDS += lcov chrpath autoconf nasm indent +DEB_DEPENDS += lcov chrpath autoconf nasm indent libnuma-dev DEB_DEPENDS += python-all python-dev python-virtualenv python-pip libffi6 ifeq ($(OS_VERSION_ID),14.04) DEB_DEPENDS += openjdk-8-jdk-headless @@ -73,7 +73,7 @@ endif RPM_DEPENDS = redhat-lsb glibc-static java-1.8.0-openjdk-devel yum-utils RPM_DEPENDS += openssl-devel https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm apr-devel -RPM_DEPENDS += 
python-devel +RPM_DEPENDS += python-devel numactl-devel ifeq ($(OS_ID)-$(OS_VERSION_ID),fedora-25) RPM_DEPENDS += python2-virtualenv RPM_DEPENDS_GROUPS = 'C Development Tools and Libraries' @@ -99,7 +99,7 @@ endif RPM_SUSE_DEPENDS = autoconf automake bison ccache chrpath distribution-release gcc6 glibc-devel-static RPM_SUSE_DEPENDS += java-1_8_0-openjdk-devel libopenssl-devel libtool lsb-release make openssl-devel -RPM_SUSE_DEPENDS += python-devel python-pip python-rpm-macros shadow nasm +RPM_SUSE_DEPENDS += python-devel python-pip python-rpm-macros shadow nasm numactl-devel ifneq ($(wildcard $(STARTUP_DIR)/startup.conf),) STARTUP_CONF ?= $(STARTUP_DIR)/startup.conf diff --git a/dpdk/Makefile b/dpdk/Makefile index 2e4b0e96..8d5b42ef 100644 --- a/dpdk/Makefile +++ b/dpdk/Makefile @@ -35,17 +35,27 @@ DPDK_17.08_TARBALL_MD5_CKSUM := 0641f59ea8ea98afefa7cfa2699f6241 DPDK_SOURCE := $(B)/dpdk-$(DPDK_VERSION) MACHINE=$(shell uname -m) +AESNI ?= n +ISA_L_CRYPTO_LIB := n + +IPSEC_MB_VER ?= 0.46 +ISA_L_CRYPTO_VER := 2.18.0 + ifeq ($(MACHINE),$(filter $(MACHINE),x86_64)) -AESNI := y -else -AESNI := n +AESNI = y +# DPDK pre 17.08 depends on ISA-L Crypto library for GCM PMD + ifneq ($(firstword $(sort $(DPDK_VERSION), 17.08)), 17.08) + ISA_L_CRYPTO_LIB = y + IPSEC_MB_VER = 0.45 + $(info Building ISA-L Crypto $(ISA_L_CRYPTO_VER) library) + endif +$(info Building IPSec-MB $(IPSEC_MB_VER) library) endif -IPSEC_MB_VER := 0.45 AESNIMB_LIB_TARBALL := v$(IPSEC_MB_VER).tar.gz AESNIMB_LIB_TARBALL_URL := http://github.com/01org/intel-ipsec-mb/archive/$(AESNIMB_LIB_TARBALL) AESNIMB_LIB_SOURCE := $(B)/intel-ipsec-mb-$(IPSEC_MB_VER) -ISA_L_CRYPTO_VER := 2.18.0 + ISA_L_CRYPTO_LIB_TARBALL := v$(ISA_L_CRYPTO_VER).tar.gz ISA_L_CRYPTO_LIB_TARBALL_URL := http://github.com/01org/isa-l_crypto/archive/$(ISA_L_CRYPTO_LIB_TARBALL) ISA_L_CRYPTO_LIB_SOURCE := $(B)/isa-l_crypto-$(ISA_L_CRYPTO_VER) @@ -100,8 +110,10 @@ else DPDK_EXTRA_CFLAGS := -g -O0 endif +ifeq ($(ISA_L_CRYPTO_LIB),y) DPDK_EXTRA_CFLAGS 
+= -I$(ISA_L_CRYPTO_INSTALL_DIR)/include -Wl,-z,muldefs DPDK_EXTRA_LDFLAGS += -L$(I)/lib +endif DPDK_MAKE_EXTRA_ARGS += AESNI_MULTI_BUFFER_LIB_PATH=$(AESNIMB_LIB_SOURCE) # assemble DPDK make arguments @@ -185,6 +197,8 @@ $(CURDIR)/$(ISA_L_CRYPTO_LIB_TARBALL): DPDK_DOWNLOADS = $(CURDIR)/$(DPDK_TARBALL) ifeq ($(AESNI),y) DPDK_DOWNLOADS += $(CURDIR)/$(AESNIMB_LIB_TARBALL) +endif +ifeq ($(ISA_L_CRYPTO_LIB),y) DPDK_DOWNLOADS += $(CURDIR)/$(ISA_L_CRYPTO_LIB_TARBALL) endif @@ -205,10 +219,12 @@ $(B)/.extract.ok: $(B)/.download.ok ifeq ($(AESNI),y) @echo --- extracting $(AESNIMB_LIB_TARBALL) --- @tar --directory $(B) --extract --file $(CURDIR)/$(AESNIMB_LIB_TARBALL) +endif +ifeq ($(ISA_L_CRYPTO_LIB),y) @echo --- extracting $(ISA_L_CRYPTO_LIB_TARBALL) --- @tar --directory $(B) --extract --file $(CURDIR)/$(ISA_L_CRYPTO_LIB_TARBALL) - @touch $@ endif + @touch $@ .PHONY: extract extract: $(B)/.extract.ok @@ -233,17 +249,11 @@ $(B)/.config.ok: $(B)/.patch.ok $(B)/custom-config .PHONY: config config: $(B)/.config.ok -# Order matters -ifeq ($(AESNI),y) -BUILD_TARGETS += build-ipsec-mb build-isal-crypto build-dpdk -else -BUILD_TARGETS += build-dpdk -endif - .PHONY: build-ipsec-mb build-ipsec-mb: mkdir -p $(I)/lib/ - make -C $(AESNIMB_LIB_SOURCE) -j NO_GCM=y + # Do not build GCM stuff if we are building ISA_L + make -C $(AESNIMB_LIB_SOURCE) -j NO_GCM=$(ISA_L_CRYPTO_LIB) cp $(AESNIMB_LIB_SOURCE)/libIPSec_MB.a $(I)/lib/ .PHONY: build-isal-crypto @@ -260,6 +270,15 @@ build-dpdk: @if [ ! 
-e $(B)/.config.ok ] ; then echo 'Please run "make config" first' && false ; fi @make $(DPDK_MAKE_ARGS) install +# Order matters +ifeq ($(AESNI),y) +BUILD_TARGETS += build-ipsec-mb +endif +ifeq ($(ISA_L_CRYPTO_LIB),y) +BUILD_TARGETS += build-isal-crypto +endif +BUILD_TARGETS += build-dpdk + $(B)/.build.ok: $(BUILD_TARGETS) @touch $@ diff --git a/src/configure.ac b/src/configure.ac index 4c2d3b47..6b6d9636 100644 --- a/src/configure.ac +++ b/src/configure.ac @@ -97,6 +97,26 @@ AC_DEFUN([DPDK_IS_PMD_ENABLED], m4_append_uniq([list_of_with], [$2], [, ]) ]) +AC_DEFUN([DETECT_DPDK_IS_1702_OR_1705], +[ + AC_MSG_CHECKING([for RTE_VERSION 17.02/17.05 in rte_version.h]) + AC_TRY_RUN( + [ + #include + int main() + { + return ((RTE_VER_YEAR != 17) || + (RTE_VER_MONTH != 2 && RTE_VER_MONTH != 5)); + } + ], + [dpdk_is_1702_or_1705=yes] + [AC_MSG_RESULT([yes])], + [dpdk_is_1702_or_1705=no] + [AC_MSG_RESULT([no])] + ) + AM_CONDITIONAL(DPDK_IS_1702_OR_1705, test "$dpdk_is_1702_or_1705" = "yes") +]) + ############################################################################### # configure arguments ############################################################################### @@ -185,34 +205,64 @@ AM_COND_IF([ENABLE_DPDK_SHARED], [AC_MSG_ERROR([DPDK shared library not found])],) ]) +with_aesni_mb_lib=no +with_isa_l_crypto_lib=no + DPDK_IS_PMD_ENABLED(LIBRTE_PMD_AESNI_MB, dpdk_aesni_mb_pmd) +DPDK_IS_PMD_ENABLED(LIBRTE_PMD_AESNI_GCM, dpdk_aesni_gcm_pmd) + +DETECT_DPDK_IS_1702_OR_1705() + AM_COND_IF([WITH_DPDK_AESNI_MB_PMD], [ - AC_CHECK_LIB([IPSec_MB], [submit_job_sse], [], + AC_CHECK_LIB([IPSec_MB], [submit_job_sse], + [with_aesni_mb_lib=yes], [AC_MSG_ERROR([IPSec_MB library not found])]) ]) -DPDK_IS_PMD_ENABLED(LIBRTE_PMD_AESNI_GCM, dpdk_aesni_gcm_pmd) AM_COND_IF([WITH_DPDK_AESNI_GCM_PMD], [ - AC_CHECK_LIB([isal_crypto], [aesni_gcm128_init], [], - [AC_MSG_ERROR([isal_crypto library not found])]) + AM_COND_IF([DPDK_IS_1702_OR_1705], + [ + AC_CHECK_LIB([isal_crypto], 
[aesni_gcm128_init], + [with_isa_l_crypto_lib=yes], + [AC_MSG_ERROR([isal_crypto library not found])]) + ], + [ + AC_CHECK_LIB([IPSec_MB], [submit_job_sse], + [with_aesni_mb_lib=yes], + [AC_MSG_ERROR([IPSec_MB library not found])]) + ]) ]) -DPDK_IS_PMD_ENABLED(LIBRTE_MLX5_PMD, dpdk_mlx5_pmd) -AM_COND_IF([WITH_DPDK_MLX5_PMD], +m4_append([list_of_with], [aesni_mb_lib], [, ]) +AM_CONDITIONAL(WITH_AESNI_MB_LIB, test "$with_aesni_mb_lib" = "yes") + +m4_append([list_of_with], [isa_l_crypto_lib], [, ]) +AM_CONDITIONAL(WITH_ISA_L_CRYPTO_LIB, test "$with_isa_l_crypto_lib" = "yes") + + +with_ibverbs_lib=no +DPDK_IS_PMD_ENABLED(LIBRTE_MLX4_PMD, dpdk_mlx4_pmd) +AM_COND_IF([WITH_DPDK_MLX4_PMD], [ - AC_CHECK_LIB([ibverbs], [ibv_fork_init], [], + AC_CHECK_LIB([ibverbs], [ibv_fork_init], + [with_ibverbs_lib=yes], [AC_MSG_ERROR([ibverbs library not found])]) ]) -DPDK_IS_PMD_ENABLED(LIBRTE_MLX4_PMD, dpdk_mlx4_pmd) -AM_COND_IF([WITH_DPDK_MLX4_PMD], +DPDK_IS_PMD_ENABLED(LIBRTE_MLX5_PMD, dpdk_mlx5_pmd) +AM_COND_IF([WITH_DPDK_MLX5_PMD], [ - AC_CHECK_LIB([ibverbs], [ibv_fork_init], [], + AC_CHECK_LIB([ibverbs], [ibv_fork_init], + [with_ibverbs_lib=yes], [AC_MSG_ERROR([ibverbs library not found])]) ]) +m4_append([list_of_with], [ibverbs_lib], [, ]) +AM_CONDITIONAL(WITH_IBVERBS_LIB, test "$with_ibverbs_lib" = "yes") + + AM_COND_IF([ENABLE_G2], [ PKG_CHECK_MODULES(g2, gtk+-2.0) diff --git a/src/plugins/dpdk.am b/src/plugins/dpdk.am index 3a1ffeeb..15195a21 100644 --- a/src/plugins/dpdk.am +++ b/src/plugins/dpdk.am @@ -19,20 +19,24 @@ dpdk_plugin_la_LDFLAGS = $(AM_LDFLAGS) -ldpdk else dpdk_plugin_la_LDFLAGS = $(AM_LDFLAGS) -Wl,--whole-archive,-l:libdpdk.a,--no-whole-archive endif -if WITH_DPDK_AESNI_MB_PMD +if WITH_AESNI_MB_LIB dpdk_plugin_la_LDFLAGS += -Wl,--exclude-libs,libIPSec_MB.a,-l:libIPSec_MB.a endif -if WITH_DPDK_AESNI_GCM_PMD +if WITH_ISA_L_CRYPTO_LIB dpdk_plugin_la_LDFLAGS += -Wl,--exclude-libs,libisal_crypto.a,-l:libisal_crypto.a endif -dpdk_plugin_la_LDFLAGS += -Wl,-lm,-ldl -if 
WITH_DPDK_MLX5_PMD +if WITH_IBVERBS_LIB dpdk_plugin_la_LDFLAGS += -Wl,-libverbs endif -if WITH_DPDK_MLX4_PMD -dpdk_plugin_la_LDFLAGS += -Wl,-libverbs +if DPDK_IS_1702_OR_1705 +dpdk_plugin_la_CFLAGS = $(AM_CFLAGS) -DDPDK_VOID_CALLBACK=1 -DDPDK_NO_AEAD=1 +else +dpdk_plugin_la_CFLAGS = $(AM_CFLAGS) -DDPDK_VOID_CALLBACK=0 -DDPDK_NO_AEAD=0 +dpdk_plugin_la_LDFLAGS += -Wl,-lnuma endif +dpdk_plugin_la_LDFLAGS += -Wl,-lm,-ldl + dpdk_plugin_la_SOURCES = \ dpdk/main.c \ dpdk/buffer.c \ diff --git a/src/plugins/dpdk/device/common.c b/src/plugins/dpdk/device/common.c index df52c58f..2707b4d8 100644 --- a/src/plugins/dpdk/device/common.c +++ b/src/plugins/dpdk/device/common.c @@ -181,9 +181,9 @@ dpdk_device_stop (dpdk_device_t * xd) } } -void -dpdk_port_state_callback (uint8_t port_id, - enum rte_eth_event_type type, void *param) +always_inline int +dpdk_port_state_callback_inline (uint8_t port_id, + enum rte_eth_event_type type, void *param) { struct rte_eth_link link; vlib_main_t *vm = vlib_get_main (); @@ -193,7 +193,7 @@ dpdk_port_state_callback (uint8_t port_id, if (type != RTE_ETH_EVENT_INTR_LSC) { clib_warning ("Unknown event %d received for port %d", type, port_id); - return; + return -1; } rte_eth_link_get_nowait (port_id, &link); @@ -238,8 +238,28 @@ dpdk_port_state_callback (uint8_t port_id, else clib_warning ("Port %d Link Down\n\n", port_id); } + + return 0; +} + +#if DPDK_VOID_CALLBACK +void +dpdk_port_state_callback (uint8_t port_id, + enum rte_eth_event_type type, void *param) +{ + dpdk_port_state_callback_inline (port_id, type, param); } +#else +int +dpdk_port_state_callback (uint8_t port_id, + enum rte_eth_event_type type, + void *param, + void *ret_param __attribute__ ((unused))) +{ + return dpdk_port_state_callback_inline (port_id, type, param); +} +#endif /* * fd.io coding-style-patch-verification: ON * diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 29a2c760..1e34e3fb 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ 
b/src/plugins/dpdk/device/dpdk.h @@ -418,8 +418,15 @@ typedef struct void dpdk_device_setup (dpdk_device_t * xd); void dpdk_device_start (dpdk_device_t * xd); void dpdk_device_stop (dpdk_device_t * xd); + +#if DPDK_VOID_CALLBACK void dpdk_port_state_callback (uint8_t port_id, enum rte_eth_event_type type, void *param); +#else +int dpdk_port_state_callback (uint8_t port_id, + enum rte_eth_event_type type, + void *param, void *ret_param); +#endif #define foreach_dpdk_error \ _(NONE, "no error") \ diff --git a/src/plugins/dpdk/ipsec/cli.c b/src/plugins/dpdk/ipsec/cli.c index a9314065..a9cf2502 100644 --- a/src/plugins/dpdk/ipsec/cli.c +++ b/src/plugins/dpdk/ipsec/cli.c @@ -86,13 +86,28 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display) hash_foreach (key, data, cwm->algo_qp_map, ({ cap.op = RTE_CRYPTO_OP_TYPE_SYMMETRIC; +#if DPDK_NO_AEAD cap.sym.xform_type = RTE_CRYPTO_SYM_XFORM_CIPHER; cap.sym.cipher.algo = p_key->cipher_algo; +#else + if (p_key->is_aead) + { + cap.sym.xform_type = RTE_CRYPTO_SYM_XFORM_AEAD; + cap.sym.aead.algo = p_key->cipher_algo; + } + else + { + cap.sym.xform_type = RTE_CRYPTO_SYM_XFORM_CIPHER; + cap.sym.cipher.algo = p_key->cipher_algo; + } +#endif check_algo_is_supported (&cap, cipher_str); + cap.op = RTE_CRYPTO_OP_TYPE_SYMMETRIC; cap.sym.xform_type = RTE_CRYPTO_SYM_XFORM_AUTH; cap.sym.auth.algo = p_key->auth_algo; check_algo_is_supported (&cap, auth_str); + vlib_cli_output (vm, "%u\t%10s\t%15s\t%3s\t%u\t%u\n", vlib_mains[i]->thread_index, cipher_str, auth_str, p_key->is_outbound ? "out" : "in", diff --git a/src/plugins/dpdk/ipsec/esp.h b/src/plugins/dpdk/ipsec/esp.h index 56f0c756..308a66af 100644 --- a/src/plugins/dpdk/ipsec/esp.h +++ b/src/plugins/dpdk/ipsec/esp.h @@ -22,6 +22,9 @@ typedef struct { enum rte_crypto_cipher_algorithm algo; +#if ! 
DPDK_NO_AEAD + enum rte_crypto_aead_algorithm aead_algo; +#endif u8 key_len; u8 iv_len; } dpdk_esp_crypto_alg_t; @@ -65,7 +68,11 @@ dpdk_esp_init () c->iv_len = 16; c = &em->esp_crypto_algs[IPSEC_CRYPTO_ALG_AES_GCM_128]; +#if DPDK_NO_AEAD c->algo = RTE_CRYPTO_CIPHER_AES_GCM; +#else + c->aead_algo = RTE_CRYPTO_AEAD_AES_GCM; +#endif c->key_len = 16; c->iv_len = 8; @@ -90,42 +97,68 @@ dpdk_esp_init () i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA_512_256]; i->algo = RTE_CRYPTO_AUTH_SHA512_HMAC; i->trunc_size = 32; - +#if DPDK_NO_AEAD i = &em->esp_integ_algs[IPSEC_INTEG_ALG_AES_GCM_128]; i->algo = RTE_CRYPTO_AUTH_AES_GCM; i->trunc_size = 16; +#endif } static_always_inline int translate_crypto_algo (ipsec_crypto_alg_t crypto_algo, - struct rte_crypto_sym_xform *cipher_xform) + struct rte_crypto_sym_xform *xform, u8 use_esn) { +#if ! DPDK_NO_AEAD + const u16 iv_off = + sizeof (struct rte_crypto_op) + sizeof (struct rte_crypto_sym_op) + + offsetof (dpdk_cop_priv_t, cb); +#endif + + xform->type = RTE_CRYPTO_SYM_XFORM_CIPHER; + switch (crypto_algo) { case IPSEC_CRYPTO_ALG_NONE: - cipher_xform->cipher.algo = RTE_CRYPTO_CIPHER_NULL; +#if ! DPDK_NO_AEAD + xform->cipher.iv.offset = iv_off; + xform->cipher.iv.length = 0; +#endif + xform->cipher.algo = RTE_CRYPTO_CIPHER_NULL; break; case IPSEC_CRYPTO_ALG_AES_CBC_128: case IPSEC_CRYPTO_ALG_AES_CBC_192: case IPSEC_CRYPTO_ALG_AES_CBC_256: - cipher_xform->cipher.algo = RTE_CRYPTO_CIPHER_AES_CBC; +#if ! 
DPDK_NO_AEAD + xform->cipher.iv.offset = iv_off; + xform->cipher.iv.length = 16; +#endif + xform->cipher.algo = RTE_CRYPTO_CIPHER_AES_CBC; break; case IPSEC_CRYPTO_ALG_AES_GCM_128: - cipher_xform->cipher.algo = RTE_CRYPTO_CIPHER_AES_GCM; +#if DPDK_NO_AEAD + xform->cipher.algo = RTE_CRYPTO_CIPHER_AES_GCM; +#else + xform->type = RTE_CRYPTO_SYM_XFORM_AEAD; + xform->aead.algo = RTE_CRYPTO_AEAD_AES_GCM; + xform->aead.iv.offset = iv_off; + xform->aead.iv.length = 12; /* GCM IV, not ESP IV */ + xform->aead.digest_length = 16; + xform->aead.aad_length = use_esn ? 12 : 8; +#endif break; default: return -1; } - cipher_xform->type = RTE_CRYPTO_SYM_XFORM_CIPHER; - return 0; } static_always_inline int translate_integ_algo (ipsec_integ_alg_t integ_alg, - struct rte_crypto_sym_xform *auth_xform, int use_esn) + struct rte_crypto_sym_xform *auth_xform, u8 use_esn) { + auth_xform->type = RTE_CRYPTO_SYM_XFORM_AUTH; + switch (integ_alg) { case IPSEC_INTEG_ALG_NONE: @@ -152,21 +185,21 @@ translate_integ_algo (ipsec_integ_alg_t integ_alg, auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA512_HMAC; auth_xform->auth.digest_length = 32; break; +#if DPDK_NO_AEAD case IPSEC_INTEG_ALG_AES_GCM_128: auth_xform->auth.algo = RTE_CRYPTO_AUTH_AES_GCM; auth_xform->auth.digest_length = 16; auth_xform->auth.add_auth_data_length = use_esn ? 12 : 8; break; +#endif default: return -1; } - auth_xform->type = RTE_CRYPTO_SYM_XFORM_AUTH; - return 0; } -static_always_inline int +static_always_inline i32 create_sym_sess (ipsec_sa_t * sa, crypto_sa_session_t * sa_sess, u8 is_outbound) { @@ -178,6 +211,10 @@ create_sym_sess (ipsec_sa_t * sa, crypto_sa_session_t * sa_sess, struct rte_crypto_sym_xform *xfs; uword key = 0, *data; crypto_worker_qp_key_t *p_key = (crypto_worker_qp_key_t *) & key; +#if ! 
DPDK_NO_AEAD + i32 socket_id = rte_socket_id (); + i32 ret; +#endif if (sa->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128) { @@ -190,15 +227,7 @@ create_sym_sess (ipsec_sa_t * sa, crypto_sa_session_t * sa_sess, sa->salt = random_u32 (&seed); } - cipher_xform.type = RTE_CRYPTO_SYM_XFORM_CIPHER; - cipher_xform.cipher.key.data = sa->crypto_key; - cipher_xform.cipher.key.length = sa->crypto_key_len; - - auth_xform.type = RTE_CRYPTO_SYM_XFORM_AUTH; - auth_xform.auth.key.data = sa->integ_key; - auth_xform.auth.key.length = sa->integ_key_len; - - if (translate_crypto_algo (sa->crypto_alg, &cipher_xform) < 0) + if (translate_crypto_algo (sa->crypto_alg, &cipher_xform, sa->use_esn) < 0) return -1; p_key->cipher_algo = cipher_xform.cipher.algo; @@ -206,19 +235,44 @@ create_sym_sess (ipsec_sa_t * sa, crypto_sa_session_t * sa_sess, return -1; p_key->auth_algo = auth_xform.auth.algo; - if (is_outbound) +#if ! DPDK_NO_AEAD + if (sa->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128) { - cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_ENCRYPT; - auth_xform.auth.op = RTE_CRYPTO_AUTH_OP_GENERATE; - cipher_xform.next = &auth_xform; + cipher_xform.aead.key.data = sa->crypto_key; + cipher_xform.aead.key.length = sa->crypto_key_len; + + if (is_outbound) + cipher_xform.cipher.op = RTE_CRYPTO_AEAD_OP_ENCRYPT; + else + cipher_xform.cipher.op = RTE_CRYPTO_AEAD_OP_DECRYPT; + cipher_xform.next = NULL; xfs = &cipher_xform; + p_key->is_aead = 1; } - else + else /* Cipher + Auth */ +#endif { - cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_DECRYPT; - auth_xform.auth.op = RTE_CRYPTO_AUTH_OP_VERIFY; - auth_xform.next = &cipher_xform; - xfs = &auth_xform; + cipher_xform.cipher.key.data = sa->crypto_key; + cipher_xform.cipher.key.length = sa->crypto_key_len; + + auth_xform.auth.key.data = sa->integ_key; + auth_xform.auth.key.length = sa->integ_key_len; + + if (is_outbound) + { + cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_ENCRYPT; + auth_xform.auth.op = RTE_CRYPTO_AUTH_OP_GENERATE; + cipher_xform.next = 
&auth_xform; + xfs = &cipher_xform; + } + else + { + cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_DECRYPT; + auth_xform.auth.op = RTE_CRYPTO_AUTH_OP_VERIFY; + auth_xform.next = &cipher_xform; + xfs = &auth_xform; + } + p_key->is_aead = 0; } p_key->is_outbound = is_outbound; @@ -227,17 +281,115 @@ create_sym_sess (ipsec_sa_t * sa, crypto_sa_session_t * sa_sess, if (!data) return -1; +#if DPDK_NO_AEAD sa_sess->sess = rte_cryptodev_sym_session_create (cwm->qp_data[*data].dev_id, xfs); - if (!sa_sess->sess) return -1; +#else + sa_sess->sess = + rte_cryptodev_sym_session_create (dcm->sess_h_pools[socket_id]); + if (!sa_sess->sess) + return -1; + + ret = + rte_cryptodev_sym_session_init (cwm->qp_data[*data].dev_id, sa_sess->sess, + xfs, dcm->sess_pools[socket_id]); + if (ret) + return -1; +#endif sa_sess->qp_index = (u8) * data; return 0; } +static_always_inline void +crypto_set_icb (dpdk_gcm_cnt_blk * icb, u32 salt, u32 seq, u32 seq_hi) +{ + icb->salt = salt; + icb->iv[0] = seq; + icb->iv[1] = seq_hi; +#if DPDK_NO_AEAD + icb->cnt = clib_host_to_net_u32 (1); +#endif +} + +#define __unused __attribute__((unused)) +static_always_inline void +crypto_op_setup (u8 is_aead, struct rte_mbuf *mb0, + struct rte_crypto_op *cop, void *session, + u32 cipher_off, u32 cipher_len, + u8 * icb __unused, u32 iv_size __unused, + u32 auth_off, u32 auth_len, + u8 * aad __unused, u32 aad_size __unused, + u8 * digest, u64 digest_paddr, u32 digest_size __unused) +{ + struct rte_crypto_sym_op *sym_cop; + + sym_cop = (struct rte_crypto_sym_op *) (cop + 1); + + sym_cop->m_src = mb0; + rte_crypto_op_attach_sym_session (cop, session); + + if (!digest_paddr) + digest_paddr = + rte_pktmbuf_mtophys_offset (mb0, (uintptr_t) digest - (uintptr_t) mb0); + +#if DPDK_NO_AEAD + sym_cop->cipher.data.offset = cipher_off; + sym_cop->cipher.data.length = cipher_len; + + sym_cop->cipher.iv.data = icb; + sym_cop->cipher.iv.phys_addr = + cop->phys_addr + (uintptr_t) icb - (uintptr_t) cop; + 
sym_cop->cipher.iv.length = iv_size; + + if (is_aead) + { + sym_cop->auth.aad.data = aad; + sym_cop->auth.aad.phys_addr = + cop->phys_addr + (uintptr_t) aad - (uintptr_t) cop; + sym_cop->auth.aad.length = aad_size; + } + else + { + sym_cop->auth.data.offset = auth_off; + sym_cop->auth.data.length = auth_len; + } + + sym_cop->auth.digest.data = digest; + sym_cop->auth.digest.phys_addr = digest_paddr; + sym_cop->auth.digest.length = digest_size; +#else /* ! DPDK_NO_AEAD */ + if (is_aead) + { + sym_cop->aead.data.offset = cipher_off; + sym_cop->aead.data.length = cipher_len; + + sym_cop->aead.aad.data = aad; + sym_cop->aead.aad.phys_addr = + cop->phys_addr + (uintptr_t) aad - (uintptr_t) cop; + + sym_cop->aead.digest.data = digest; + sym_cop->aead.digest.phys_addr = digest_paddr; + } + else + { + sym_cop->cipher.data.offset = cipher_off; + sym_cop->cipher.data.length = cipher_len; + + sym_cop->auth.data.offset = auth_off; + sym_cop->auth.data.length = auth_len; + + sym_cop->auth.digest.data = digest; + sym_cop->auth.digest.phys_addr = digest_paddr; + } +#endif /* DPDK_NO_AEAD */ +} + +#undef __unused + #endif /* __DPDK_ESP_H__ */ /* diff --git a/src/plugins/dpdk/ipsec/esp_decrypt.c b/src/plugins/dpdk/ipsec/esp_decrypt.c index 9377970a..c4f295d3 100644 --- a/src/plugins/dpdk/ipsec/esp_decrypt.c +++ b/src/plugins/dpdk/ipsec/esp_decrypt.c @@ -44,8 +44,7 @@ typedef enum { _(NOT_IP, "Not IP packet (dropped)") \ _(ENQ_FAIL, "Enqueue failed (buffer full)") \ _(NO_CRYPTODEV, "Cryptodev not configured") \ - _(BAD_LEN, "Invalid ciphertext length") \ - _(UNSUPPORTED, "Cipher/Auth not supported") + _(BAD_LEN, "Invalid ciphertext length") typedef enum { @@ -122,7 +121,7 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm, while (n_left_from > 0 && n_left_to_next > 0) { - u32 bi0, sa_index0 = ~0, seq, icv_size, iv_size; + u32 bi0, sa_index0 = ~0, seq, trunc_size, iv_size; vlib_buffer_t * b0; esp_header_t * esp0; ipsec_sa_t * sa0; @@ -169,18 +168,6 @@ dpdk_esp_decrypt_node_fn (vlib_main_t 
* vm, sa0->total_data_size += b0->current_length; - if (PREDICT_FALSE(sa0->integ_alg == IPSEC_INTEG_ALG_NONE) || - PREDICT_FALSE(sa0->crypto_alg == IPSEC_CRYPTO_ALG_NONE)) - { - clib_warning ("SPI %u : only cipher + auth supported", sa0->spi); - vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index, - ESP_DECRYPT_ERROR_UNSUPPORTED, 1); - to_next[0] = bi0; - to_next += 1; - n_left_to_next -= 1; - goto trace; - } - sa_sess = pool_elt_at_index(cwm->sa_sess_d[0], sa_index0); if (PREDICT_FALSE(!sa_sess->sess)) @@ -211,7 +198,10 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm, rte_crypto_op_attach_sym_session(cop, sess); - icv_size = em->esp_integ_algs[sa0->integ_alg].trunc_size; + if (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128) + trunc_size = 16; + else + trunc_size = em->esp_integ_algs[sa0->integ_alg].trunc_size; iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len; /* Convert vlib buffer to mbuf */ @@ -222,7 +212,7 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm, /* Outer IP header has already been stripped */ u16 payload_len = rte_pktmbuf_pkt_len(mb0) - sizeof (esp_header_t) - - iv_size - icv_size; + iv_size - trunc_size; if ((payload_len & (BLOCK_SIZE - 1)) || (payload_len <= 0)) { @@ -242,84 +232,64 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm, struct rte_crypto_sym_op *sym_cop = (struct rte_crypto_sym_op *)(cop + 1); - sym_cop->m_src = mb0; - sym_cop->cipher.data.offset = sizeof (esp_header_t) + iv_size; - sym_cop->cipher.data.length = payload_len; + u8 is_aead = sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128; + u32 cipher_off, cipher_len; + u32 auth_off = 0, auth_len = 0, aad_size = 0; + u8 *aad = NULL, *digest = NULL; + u64 digest_paddr = 0; u8 *iv = rte_pktmbuf_mtod_offset(mb0, void*, sizeof (esp_header_t)); - dpdk_cop_priv_t * priv = (dpdk_cop_priv_t *)(sym_cop + 1); + dpdk_cop_priv_t *priv = (dpdk_cop_priv_t *)(sym_cop + 1); + dpdk_gcm_cnt_blk *icb = &priv->cb; + + cipher_off = sizeof (esp_header_t) + iv_size; + cipher_len = payload_len; - if 
(sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128) + digest = + vlib_buffer_get_current (b0) + sizeof(esp_header_t) + + iv_size + payload_len; + + if (is_aead) { - dpdk_gcm_cnt_blk *icb = &priv->cb; - icb->salt = sa0->salt; - clib_memcpy(icb->iv, iv, 8); - icb->cnt = clib_host_to_net_u32(1); - sym_cop->cipher.iv.data = (u8 *)icb; - sym_cop->cipher.iv.phys_addr = cop->phys_addr + - (uintptr_t)icb - (uintptr_t)cop; - sym_cop->cipher.iv.length = 16; - - u8 *aad = priv->aad; - clib_memcpy(aad, iv - sizeof(esp_header_t), 8); - sym_cop->auth.aad.data = aad; - sym_cop->auth.aad.phys_addr = cop->phys_addr + - (uintptr_t)aad - (uintptr_t)cop; - if (sa0->use_esn) - { - *((u32*)&aad[8]) = sa0->seq_hi; - sym_cop->auth.aad.length = 12; - } - else - { - sym_cop->auth.aad.length = 8; - } + u32 *_iv = (u32 *) iv; - sym_cop->auth.digest.data = rte_pktmbuf_mtod_offset(mb0, void*, - rte_pktmbuf_pkt_len(mb0) - icv_size); - sym_cop->auth.digest.phys_addr = rte_pktmbuf_mtophys_offset(mb0, - rte_pktmbuf_pkt_len(mb0) - icv_size); - sym_cop->auth.digest.length = icv_size; + crypto_set_icb (icb, sa0->salt, _iv[0], _iv[1]); + iv_size = 16; + aad = priv->aad; + clib_memcpy(aad, esp0, 8); + aad_size = 8; + if (sa0->use_esn) + { + *((u32*)&aad[8]) = sa0->seq_hi; + aad_size = 12; + } } else { - sym_cop->cipher.iv.data = rte_pktmbuf_mtod_offset(mb0, void*, - sizeof (esp_header_t)); - sym_cop->cipher.iv.phys_addr = rte_pktmbuf_mtophys_offset(mb0, - sizeof (esp_header_t)); - sym_cop->cipher.iv.length = iv_size; + clib_memcpy(icb, iv, 16); + + auth_off = 0; + auth_len = sizeof(esp_header_t) + iv_size + payload_len; if (sa0->use_esn) { dpdk_cop_priv_t* priv = (dpdk_cop_priv_t*) (sym_cop + 1); - u8* payload_end = rte_pktmbuf_mtod_offset( - mb0, u8*, sizeof(esp_header_t) + iv_size + payload_len); - - clib_memcpy (priv->icv, payload_end, icv_size); - *((u32*) payload_end) = sa0->seq_hi; - sym_cop->auth.data.offset = 0; - sym_cop->auth.data.length = sizeof(esp_header_t) + iv_size - + payload_len + 
sizeof(sa0->seq_hi); - sym_cop->auth.digest.data = priv->icv; - sym_cop->auth.digest.phys_addr = cop->phys_addr - + (uintptr_t) priv->icv - (uintptr_t) cop; - sym_cop->auth.digest.length = icv_size; - } - else - { - sym_cop->auth.data.offset = 0; - sym_cop->auth.data.length = sizeof(esp_header_t) + - iv_size + payload_len; - - sym_cop->auth.digest.data = rte_pktmbuf_mtod_offset(mb0, void*, - rte_pktmbuf_pkt_len(mb0) - icv_size); - sym_cop->auth.digest.phys_addr = rte_pktmbuf_mtophys_offset(mb0, - rte_pktmbuf_pkt_len(mb0) - icv_size); - sym_cop->auth.digest.length = icv_size; + + clib_memcpy (priv->icv, digest, trunc_size); + *((u32*) digest) = sa0->seq_hi; + auth_len += sizeof(sa0->seq_hi); + + digest = priv->icv; + digest_paddr = + cop->phys_addr + (uintptr_t) priv->icv - (uintptr_t) cop; } } + crypto_op_setup (is_aead, mb0, cop, sess, + cipher_off, cipher_len, (u8 *) icb, iv_size, + auth_off, auth_len, aad, aad_size, + digest, digest_paddr, trunc_size); trace: if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) { @@ -339,6 +309,9 @@ trace: { u32 enq; + if (!n_cop_qp[i]) + continue; + qpd = vec_elt_at_index(cwm->qp_data, i); enq = rte_cryptodev_enqueue_burst(qpd->dev_id, qpd->qp_id, qpd->cops, n_cop_qp[i]); @@ -433,7 +406,7 @@ dpdk_esp_decrypt_post_node_fn (vlib_main_t * vm, while (n_left_from > 0 && n_left_to_next > 0) { esp_footer_t * f0; - u32 bi0, next0, icv_size, iv_size; + u32 bi0, next0, trunc_size, iv_size; vlib_buffer_t * b0 = 0; ip4_header_t *ih4 = 0, *oh4 = 0; ip6_header_t *ih6 = 0, *oh6 = 0; @@ -455,7 +428,10 @@ dpdk_esp_decrypt_post_node_fn (vlib_main_t * vm, to_next[0] = bi0; to_next += 1; - icv_size = em->esp_integ_algs[sa0->integ_alg].trunc_size; + if (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128) + trunc_size = 16; + else + trunc_size = em->esp_integ_algs[sa0->integ_alg].trunc_size; iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len; if (sa0->use_anti_replay) @@ -472,7 +448,7 @@ dpdk_esp_decrypt_post_node_fn (vlib_main_t * vm, ih4 = 
(ip4_header_t *) (b0->data + sizeof(ethernet_header_t)); vlib_buffer_advance (b0, sizeof (esp_header_t) + iv_size); - b0->current_length -= (icv_size + 2); + b0->current_length -= (trunc_size + 2); b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; f0 = (esp_footer_t *) ((u8 *) vlib_buffer_get_current (b0) + b0->current_length); diff --git a/src/plugins/dpdk/ipsec/esp_encrypt.c b/src/plugins/dpdk/ipsec/esp_encrypt.c index ac552f6c..6de444fd 100644 --- a/src/plugins/dpdk/ipsec/esp_encrypt.c +++ b/src/plugins/dpdk/ipsec/esp_encrypt.c @@ -43,8 +43,7 @@ typedef enum _(RX_PKTS, "ESP pkts received") \ _(SEQ_CYCLED, "sequence number cycled") \ _(ENQ_FAIL, "Enqueue failed (buffer full)") \ - _(NO_CRYPTODEV, "Cryptodev not configured") \ - _(UNSUPPORTED, "Cipher/Auth not supported") + _(NO_CRYPTODEV, "Cryptodev not configured") typedef enum @@ -142,6 +141,7 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, const int BLOCK_SIZE = 16; u32 iv_size; u16 orig_sz; + u8 trunc_size; crypto_sa_session_t *sa_sess; void *sess; struct rte_crypto_op *cop = 0; @@ -199,6 +199,11 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, ssize_t adv; iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len; + if (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128) + trunc_size = 16; + else + trunc_size = em->esp_integ_algs[sa0->integ_alg].trunc_size; + ih0 = vlib_buffer_get_current (b0); orig_sz = b0->current_length; is_ipv6 = (ih0->ip4.ip_version_and_header_length & 0xF0) == 0x60; @@ -314,9 +319,6 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, transport_mode = 1; } - ASSERT (sa0->crypto_alg < IPSEC_CRYPTO_N_ALG); - ASSERT (sa0->crypto_alg != IPSEC_CRYPTO_ALG_NONE); - int blocks = 1 + (orig_sz + 1) / BLOCK_SIZE; /* pad packet in input buffer */ @@ -330,8 +332,7 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, f0 = vlib_buffer_get_current (b0) + b0->current_length + pad_bytes; f0->pad_length = pad_bytes; f0->next_header = next_hdr_type; - b0->current_length += pad_bytes + 2 + - 
em->esp_integ_algs[sa0->integ_alg].trunc_size; + b0->current_length += pad_bytes + 2 + trunc_size; vnet_buffer (b0)->sw_if_index[VLIB_RX] = vnet_buffer (b0)->sw_if_index[VLIB_RX]; @@ -349,88 +350,64 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, mb0->pkt_len = b0->current_length; mb0->data_off = RTE_PKTMBUF_HEADROOM + b0->current_data; - rte_crypto_op_attach_sym_session (cop, sess); + dpdk_gcm_cnt_blk *icb = &priv->cb; - sym_cop->m_src = mb0; + crypto_set_icb (icb, sa0->salt, sa0->seq, sa0->seq_hi); - dpdk_gcm_cnt_blk *icb = &priv->cb; - icb->salt = sa0->salt; - icb->iv[0] = sa0->seq; - icb->iv[1] = sa0->seq_hi; - icb->cnt = clib_host_to_net_u32 (1); + u8 is_aead = sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128; + u32 cipher_off, cipher_len; + u32 auth_off = 0, auth_len = 0, aad_size = 0; + u8 *aad = NULL, *digest = NULL; - if (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128) + if (is_aead) { u32 *esp_iv = (u32 *) (b0->data + b0->current_data + ip_hdr_size + sizeof (esp_header_t)); esp_iv[0] = sa0->seq; esp_iv[1] = sa0->seq_hi; - sym_cop->cipher.data.offset = - ip_hdr_size + sizeof (esp_header_t) + iv_size; - sym_cop->cipher.data.length = BLOCK_SIZE * blocks; - sym_cop->cipher.iv.length = 16; - } - else - { - sym_cop->cipher.data.offset = - ip_hdr_size + sizeof (esp_header_t); - sym_cop->cipher.data.length = BLOCK_SIZE * blocks + iv_size; - sym_cop->cipher.iv.length = iv_size; - } - sym_cop->cipher.iv.data = (u8 *) icb; - sym_cop->cipher.iv.phys_addr = cop->phys_addr + (uintptr_t) icb - - (uintptr_t) cop; + cipher_off = ip_hdr_size + sizeof (esp_header_t) + iv_size; + cipher_len = BLOCK_SIZE * blocks; + iv_size = 16; /* GCM IV size, not ESP IV size */ - - ASSERT (sa0->integ_alg < IPSEC_INTEG_N_ALG); - ASSERT (sa0->integ_alg != IPSEC_INTEG_ALG_NONE); - - if (PREDICT_FALSE (sa0->integ_alg == IPSEC_INTEG_ALG_AES_GCM_128)) - { - u8 *aad = priv->aad; + aad = priv->aad; clib_memcpy (aad, vlib_buffer_get_current (b0) + ip_hdr_size, 8); - sym_cop->auth.aad.data = aad; - 
sym_cop->auth.aad.phys_addr = cop->phys_addr + - (uintptr_t) aad - (uintptr_t) cop; - + aad_size = 8; if (PREDICT_FALSE (sa0->use_esn)) { *((u32 *) & aad[8]) = sa0->seq_hi; - sym_cop->auth.aad.length = 12; - } - else - { - sym_cop->auth.aad.length = 8; + aad_size = 12; } + + digest = + vlib_buffer_get_current (b0) + b0->current_length - + trunc_size; } else { - sym_cop->auth.data.offset = ip_hdr_size; - sym_cop->auth.data.length = b0->current_length - ip_hdr_size - - em->esp_integ_algs[sa0->integ_alg].trunc_size; + cipher_off = ip_hdr_size + sizeof (esp_header_t); + cipher_len = BLOCK_SIZE * blocks + iv_size; + + auth_off = ip_hdr_size; + auth_len = b0->current_length - ip_hdr_size - trunc_size; + + digest = + vlib_buffer_get_current (b0) + b0->current_length - + trunc_size; if (PREDICT_FALSE (sa0->use_esn)) { - u8 *payload_end = - vlib_buffer_get_current (b0) + b0->current_length; - *((u32 *) payload_end) = sa0->seq_hi; - sym_cop->auth.data.length += sizeof (sa0->seq_hi); + *((u32 *) digest) = sa0->seq_hi; + auth_len += sizeof (sa0->seq_hi); } } - sym_cop->auth.digest.data = vlib_buffer_get_current (b0) + - b0->current_length - - em->esp_integ_algs[sa0->integ_alg].trunc_size; - sym_cop->auth.digest.phys_addr = rte_pktmbuf_mtophys_offset (mb0, - b0->current_length - - - em->esp_integ_algs - [sa0->integ_alg].trunc_size); - sym_cop->auth.digest.length = - em->esp_integ_algs[sa0->integ_alg].trunc_size; + crypto_op_setup (is_aead, mb0, cop, sess, + cipher_off, cipher_len, (u8 *) icb, iv_size, + auth_off, auth_len, aad, aad_size, + digest, 0, trunc_size); if (PREDICT_FALSE (is_ipv6)) { @@ -470,6 +447,9 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, { u32 enq; + if (!n_cop_qp[i]) + continue; + qpd = vec_elt_at_index(cwm->qp_data, i); enq = rte_cryptodev_enqueue_burst(qpd->dev_id, qpd->qp_id, qpd->cops, n_cop_qp[i]); diff --git a/src/plugins/dpdk/ipsec/ipsec.c b/src/plugins/dpdk/ipsec/ipsec.c index 7066564d..c922940c 100644 --- a/src/plugins/dpdk/ipsec/ipsec.c +++ 
b/src/plugins/dpdk/ipsec/ipsec.c @@ -56,18 +56,23 @@ add_del_sa_sess (u32 sa_index, u8 is_add) else { u8 dev_id; + i32 ret; sa_sess = pool_elt_at_index (cwm->sa_sess_d[is_outbound], sa_index); dev_id = cwm->qp_data[sa_sess->qp_index].dev_id; if (!sa_sess->sess) continue; - - if (rte_cryptodev_sym_session_free(dev_id, sa_sess->sess)) - { - clib_warning("failed to free session"); - return -1; - } +#if DPDK_NO_AEAD + ret = (rte_cryptodev_sym_session_free(dev_id, sa_sess->sess) == NULL); + ASSERT (ret); +#else + ret = rte_cryptodev_sym_session_clear(dev_id, sa_sess->sess); + ASSERT (!ret); + + ret = rte_cryptodev_sym_session_free(sa_sess->sess); + ASSERT (!ret); +#endif memset(sa_sess, 0, sizeof(sa_sess[0])); } } @@ -94,7 +99,7 @@ update_qp_data (crypto_worker_main_t * cwm, } /* *INDENT-ON* */ - vec_add2 (cwm->qp_data, qpd, 1); + vec_add2_aligned (cwm->qp_data, qpd, 1, CLIB_CACHE_LINE_BYTES); qpd->dev_id = cdev_id; qpd->qp_id = qp_id; @@ -119,6 +124,9 @@ add_mapping (crypto_worker_main_t * cwm, p_key->cipher_algo = (u8) cipher_cap->sym.cipher.algo; p_key->auth_algo = (u8) auth_cap->sym.auth.algo; p_key->is_outbound = is_outbound; +#if ! DPDK_NO_AEAD + p_key->is_aead = cipher_cap->sym.xform_type == RTE_CRYPTO_SYM_XFORM_AEAD; +#endif ret = hash_get (cwm->algo_qp_map, key); if (ret) @@ -147,6 +155,20 @@ add_cdev_mapping (crypto_worker_main_t * cwm, for (i = dev_info->capabilities; i->op != RTE_CRYPTO_OP_TYPE_UNDEFINED; i++) { +#if ! 
DPDK_NO_AEAD + if (i->sym.xform_type == RTE_CRYPTO_SYM_XFORM_AEAD) + { + struct rte_cryptodev_capabilities none = { 0 }; + + if (check_algo_is_supported (i, NULL) != 0) + continue; + + none.sym.auth.algo = RTE_CRYPTO_AUTH_NULL; + + mapped |= add_mapping (cwm, cdev_id, qp, is_outbound, i, &none); + continue; + } +#endif if (i->sym.xform_type != RTE_CRYPTO_SYM_XFORM_CIPHER) continue; @@ -205,17 +227,23 @@ dpdk_ipsec_check_support (ipsec_sa_t * sa) { if (sa->integ_alg != IPSEC_INTEG_ALG_NONE) return clib_error_return (0, "unsupported integ-alg %U with " - "crypto-algo aes-gcm-128", + "crypto-alg aes-gcm-128", format_ipsec_integ_alg, sa->integ_alg); +#if DPDK_NO_AEAD sa->integ_alg = IPSEC_INTEG_ALG_AES_GCM_128; +#endif } - else - { - if (sa->integ_alg == IPSEC_INTEG_ALG_NONE || - sa->integ_alg == IPSEC_INTEG_ALG_AES_GCM_128) - return clib_error_return (0, "unsupported integ-alg %U", - format_ipsec_integ_alg, sa->integ_alg); - } +#if DPDK_NO_AEAD + else if (sa->crypto_alg == IPSEC_CRYPTO_ALG_NONE || + sa->integ_alg == IPSEC_INTEG_ALG_NONE || + sa->integ_alg == IPSEC_INTEG_ALG_AES_GCM_128) +#else + else if (sa->integ_alg == IPSEC_INTEG_ALG_NONE) +#endif + return clib_error_return (0, + "unsupported integ-alg %U with crypto-alg %U", + format_ipsec_integ_alg, sa->integ_alg, + format_ipsec_crypto_alg, sa->crypto_alg); return 0; } @@ -233,6 +261,10 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, struct rte_mempool *rmp; i32 dev_id, ret; u32 i, skip_master; +#if ! 
DPDK_NO_AEAD + u32 max_sess_size = 0, sess_size; + i8 socket_id; +#endif if (check_cryptodev_queues () < 0) { @@ -297,9 +329,10 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, dev_conf.socket_id = rte_cryptodev_socket_id (dev_id); dev_conf.nb_queue_pairs = cdev_info.max_nb_queue_pairs; +#if DPDK_NO_AEAD dev_conf.session_mp.nb_objs = DPDK_CRYPTO_NB_SESS_OBJS; dev_conf.session_mp.cache_size = DPDK_CRYPTO_CACHE_SIZE; - +#endif ret = rte_cryptodev_configure (dev_id, &dev_conf); if (ret < 0) { @@ -310,16 +343,26 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, qp_conf.nb_descriptors = DPDK_CRYPTO_N_QUEUE_DESC; for (qp = 0; qp < dev_conf.nb_queue_pairs; qp++) { +#if DPDK_NO_AEAD ret = rte_cryptodev_queue_pair_setup (dev_id, qp, &qp_conf, dev_conf.socket_id); +#else + ret = rte_cryptodev_queue_pair_setup (dev_id, qp, &qp_conf, + dev_conf.socket_id, NULL); +#endif if (ret < 0) { clib_warning ("cryptodev %u qp %u setup error", dev_id, qp); goto error; } } - vec_validate_aligned (dcm->cop_pools, dev_conf.socket_id, - CLIB_CACHE_LINE_BYTES); + vec_validate (dcm->cop_pools, dev_conf.socket_id); + +#if ! DPDK_NO_AEAD + sess_size = rte_cryptodev_get_private_session_size (dev_id); + if (sess_size > max_sess_size) + max_sess_size = sess_size; +#endif if (!vec_elt (dcm->cop_pools, dev_conf.socket_id)) { @@ -333,14 +376,14 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, DPDK_CRYPTO_CACHE_SIZE, DPDK_CRYPTO_PRIV_SIZE, dev_conf.socket_id); - vec_free (pool_name); if (!rmp) { - clib_warning ("failed to allocate mempool on socket %u", - dev_conf.socket_id); + clib_warning ("failed to allocate %s", pool_name); + vec_free (pool_name); goto error; } + vec_free (pool_name); vec_elt (dcm->cop_pools, dev_conf.socket_id) = rmp; } @@ -348,6 +391,51 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, DPDK_CRYPTO_NB_SESS_OBJS, DPDK_CRYPTO_CACHE_SIZE); } +#if ! 
DPDK_NO_AEAD + /* *INDENT-OFF* */ + vec_foreach_index (socket_id, dcm->cop_pools) + { + u8 *pool_name; + + if (!vec_elt (dcm->cop_pools, socket_id)) + continue; + + vec_validate (dcm->sess_h_pools, socket_id); + pool_name = format (0, "crypto_sess_h_socket%u%c", + socket_id, 0); + rmp = + rte_mempool_create((i8 *)pool_name, DPDK_CRYPTO_NB_SESS_OBJS, + rte_cryptodev_get_header_session_size (), + 512, 0, NULL, NULL, NULL, NULL, + socket_id, 0); + if (!rmp) + { + clib_warning ("failed to allocate %s", pool_name); + vec_free (pool_name); + goto error; + } + vec_free (pool_name); + vec_elt (dcm->sess_h_pools, socket_id) = rmp; + + vec_validate (dcm->sess_pools, socket_id); + pool_name = format (0, "crypto_sess_socket%u%c", + socket_id, 0); + rmp = + rte_mempool_create((i8 *)pool_name, DPDK_CRYPTO_NB_SESS_OBJS, + max_sess_size, 512, 0, NULL, NULL, NULL, NULL, + socket_id, 0); + if (!rmp) + { + clib_warning ("failed to allocate %s", pool_name); + vec_free (pool_name); + goto error; + } + vec_free (pool_name); + vec_elt (dcm->sess_pools, socket_id) = rmp; + } + /* *INDENT-ON* */ +#endif + dpdk_esp_init (); /* Add new next node and set as default */ diff --git a/src/plugins/dpdk/ipsec/ipsec.h b/src/plugins/dpdk/ipsec/ipsec.h index d7940345..a94dd682 100644 --- a/src/plugins/dpdk/ipsec/ipsec.h +++ b/src/plugins/dpdk/ipsec/ipsec.h @@ -53,6 +53,7 @@ typedef struct u8 cipher_algo; u8 auth_algo; u8 is_outbound; + u8 is_aead; } crypto_worker_qp_key_t; typedef struct @@ -81,6 +82,8 @@ typedef struct typedef struct { + struct rte_mempool **sess_h_pools; + struct rte_mempool **sess_pools; struct rte_mempool **cop_pools; crypto_worker_main_t *workers_main; u8 enabled; @@ -146,12 +149,14 @@ check_algo_is_supported (const struct rte_cryptodev_capabilities *cap, { struct { - uint8_t cipher_algo; enum rte_crypto_sym_xform_type type; union { enum rte_crypto_auth_algorithm auth; enum rte_crypto_cipher_algorithm cipher; +#if ! 
DPDK_NO_AEAD + enum rte_crypto_aead_algorithm aead; +#endif }; char *name; } supported_algo[] = @@ -162,15 +167,18 @@ check_algo_is_supported (const struct rte_cryptodev_capabilities *cap, { .type = RTE_CRYPTO_SYM_XFORM_CIPHER,.cipher = RTE_CRYPTO_CIPHER_AES_CBC,.name = "AES_CBC"}, +#if DPDK_NO_AEAD { .type = RTE_CRYPTO_SYM_XFORM_CIPHER,.cipher = - RTE_CRYPTO_CIPHER_AES_CTR,.name = "AES_CTR"}, + RTE_CRYPTO_CIPHER_AES_GCM,.name = "AES-GCM"}, +#else { - .type = RTE_CRYPTO_SYM_XFORM_CIPHER,.cipher = - RTE_CRYPTO_CIPHER_3DES_CBC,.name = "3DES-CBC"}, + .type = RTE_CRYPTO_SYM_XFORM_AEAD,.aead = + RTE_CRYPTO_AEAD_AES_GCM,.name = "AES-GCM"}, +#endif { - .type = RTE_CRYPTO_SYM_XFORM_CIPHER,.cipher = - RTE_CRYPTO_CIPHER_AES_GCM,.name = "AES-GCM"}, + .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth = + RTE_CRYPTO_AUTH_NULL,.name = "NULL"}, { .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth = RTE_CRYPTO_AUTH_SHA1_HMAC,.name = "HMAC-SHA1"}, @@ -183,15 +191,16 @@ check_algo_is_supported (const struct rte_cryptodev_capabilities *cap, { .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth = RTE_CRYPTO_AUTH_SHA512_HMAC,.name = "HMAC-SHA512"}, - { - .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth = - RTE_CRYPTO_AUTH_AES_XCBC_MAC,.name = "AES-XCBC-MAC"}, +#if DPDK_NO_AEAD { .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth = RTE_CRYPTO_AUTH_AES_GCM,.name = "AES-GCM"}, +#endif { /* tail */ - .type = RTE_CRYPTO_SYM_XFORM_NOT_SPECIFIED},}; + .type = RTE_CRYPTO_SYM_XFORM_NOT_SPECIFIED} + }; + uint32_t i = 0; if (cap->op != RTE_CRYPTO_OP_TYPE_SYMMETRIC) @@ -203,6 +212,10 @@ check_algo_is_supported (const struct rte_cryptodev_capabilities *cap, { if ((cap->sym.xform_type == RTE_CRYPTO_SYM_XFORM_CIPHER && cap->sym.cipher.algo == supported_algo[i].cipher) || +#if ! 
DPDK_NO_AEAD + (cap->sym.xform_type == RTE_CRYPTO_SYM_XFORM_AEAD && + cap->sym.aead.algo == supported_algo[i].aead) || +#endif (cap->sym.xform_type == RTE_CRYPTO_SYM_XFORM_AUTH && cap->sym.auth.algo == supported_algo[i].auth)) { -- cgit 1.2.3-korg From 206b2d4342b1c1b7715c7d442f582da02e5ec9b9 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 25 Aug 2017 19:10:57 +0200 Subject: dpdk: bump to dpdk 17.08, remove support for dpdk 17.02 Change-Id: I674fb1212e48693939045523df085326a4dd1809 Signed-off-by: Damjan Marion --- dpdk/Makefile | 5 ++--- src/plugins/dpdk/buffer.c | 17 ----------------- src/plugins/dpdk/device/device.c | 4 ---- src/plugins/dpdk/device/init.c | 4 ---- src/plugins/dpdk/hqos/hqos.c | 8 -------- src/plugins/dpdk/ipsec/esp.h | 6 ++++-- 6 files changed, 6 insertions(+), 38 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/dpdk/Makefile b/dpdk/Makefile index 8d5b42ef..06e38f76 100644 --- a/dpdk/Makefile +++ b/dpdk/Makefile @@ -24,12 +24,11 @@ DPDK_MLX5_PMD ?= n B := $(DPDK_BUILD_DIR) I := $(DPDK_INSTALL_DIR) -DPDK_VERSION ?= 17.05 -PKG_SUFFIX ?= vpp6 +DPDK_VERSION ?= 17.08 +PKG_SUFFIX ?= vpp1 DPDK_BASE_URL ?= http://fast.dpdk.org/rel DPDK_TARBALL := dpdk-$(DPDK_VERSION).tar.xz DPDK_TAR_URL := $(DPDK_BASE_URL)/$(DPDK_TARBALL) -DPDK_17.02_TARBALL_MD5_CKSUM := 6b9f7387c35641f4e8dbba3e528f2376 DPDK_17.05_TARBALL_MD5_CKSUM := 0a68c31cd6a6cabeed0a4331073e4c05 DPDK_17.08_TARBALL_MD5_CKSUM := 0641f59ea8ea98afefa7cfa2699f6241 DPDK_SOURCE := $(B)/dpdk-$(DPDK_VERSION) diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index b0f247e1..28af100a 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -213,18 +213,6 @@ fill_free_list (vlib_main_t * vm, mb2 = vm->mbuf_alloc_list[i + 2]; mb3 = vm->mbuf_alloc_list[i + 3]; -#if RTE_VERSION < RTE_VERSION_NUM(17, 5, 0, 0) - ASSERT (rte_mbuf_refcnt_read (mb0) == 0); - ASSERT (rte_mbuf_refcnt_read (mb1) == 0); - ASSERT (rte_mbuf_refcnt_read (mb2) == 0); - ASSERT 
(rte_mbuf_refcnt_read (mb3) == 0); - - rte_mbuf_refcnt_set (mb0, 1); - rte_mbuf_refcnt_set (mb1, 1); - rte_mbuf_refcnt_set (mb2, 1); - rte_mbuf_refcnt_set (mb3, 1); -#endif - b0 = vlib_buffer_from_rte_mbuf (mb0); b1 = vlib_buffer_from_rte_mbuf (mb1); b2 = vlib_buffer_from_rte_mbuf (mb2); @@ -259,11 +247,6 @@ fill_free_list (vlib_main_t * vm, { mb0 = vm->mbuf_alloc_list[i]; -#if RTE_VERSION < RTE_VERSION_NUM(17, 5, 0, 0) - ASSERT (rte_mbuf_refcnt_read (mb0) == 0); - rte_mbuf_refcnt_set (mb0, 1); -#endif - b0 = vlib_buffer_from_rte_mbuf (mb0); bi0 = vlib_get_buffer_index (vm, b0); diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index c755060d..a247c7c9 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -254,11 +254,7 @@ static_always_inline &tx_vector[tx_tail], tx_head - tx_tail); rv = rte_ring_sp_enqueue_burst (hqos->swq, (void **) &tx_vector[tx_tail], -#if RTE_VERSION >= RTE_VERSION_NUM(17, 5, 0, 0) (uint16_t) (tx_head - tx_tail), 0); -#else - (uint16_t) (tx_head - tx_tail)); -#endif } else if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_PMD)) { diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 6f7e168b..a795ba0e 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -1186,11 +1186,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) /* Set up DPDK eal and packet mbuf pool early. 
*/ -#if RTE_VERSION >= RTE_VERSION_NUM(17, 5, 0, 0) rte_log_set_global_level (log_level); -#else - rte_set_log_level (log_level); -#endif vm = vlib_get_main (); diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c index 2f2504d6..813eb91c 100644 --- a/src/plugins/dpdk/hqos/hqos.c +++ b/src/plugins/dpdk/hqos/hqos.c @@ -430,11 +430,7 @@ dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm) pkts_enq_len += rte_ring_sc_dequeue_burst (swq, (void **) &pkts_enq[pkts_enq_len], -#if RTE_VERSION >= RTE_VERSION_NUM(17, 5, 0, 0) hqos->hqos_burst_enq, 0); -#else - hqos->hqos_burst_enq); -#endif /* Get next SWQ for this device */ swq_pos++; @@ -525,11 +521,7 @@ dpdk_hqos_thread_internal (vlib_main_t * vm) pkts_enq_len += rte_ring_sc_dequeue_burst (swq, (void **) &pkts_enq[pkts_enq_len], -#if RTE_VERSION >= RTE_VERSION_NUM(17, 5, 0, 0) hqos->hqos_burst_enq, 0); -#else - hqos->hqos_burst_enq); -#endif /* Get next SWQ for this device */ swq_pos++; diff --git a/src/plugins/dpdk/ipsec/esp.h b/src/plugins/dpdk/ipsec/esp.h index 308a66af..5b5c81ae 100644 --- a/src/plugins/dpdk/ipsec/esp.h +++ b/src/plugins/dpdk/ipsec/esp.h @@ -242,9 +242,11 @@ create_sym_sess (ipsec_sa_t * sa, crypto_sa_session_t * sa_sess, cipher_xform.aead.key.length = sa->crypto_key_len; if (is_outbound) - cipher_xform.cipher.op = RTE_CRYPTO_AEAD_OP_ENCRYPT; + cipher_xform.cipher.op = + (enum rte_crypto_cipher_operation) RTE_CRYPTO_AEAD_OP_ENCRYPT; else - cipher_xform.cipher.op = RTE_CRYPTO_AEAD_OP_DECRYPT; + cipher_xform.cipher.op = + (enum rte_crypto_cipher_operation) RTE_CRYPTO_AEAD_OP_DECRYPT; cipher_xform.next = NULL; xfs = &cipher_xform; p_key->is_aead = 1; -- cgit 1.2.3-korg From e3434397f2533b52aeccb594eb9610e99b556f43 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Wed, 30 Aug 2017 08:14:03 -0400 Subject: Offload prep calls needed for IP and UDP checksum offload pkts Change-Id: I9b29bcff348bddfb49b1c38b1d409249e37bb075 Signed-off-by: Dave Barach --- 
src/plugins/dpdk/device/device.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index a247c7c9..97c13630 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -483,7 +483,10 @@ dpdk_interface_tx (vlib_main_t * vm, mb3 = rte_mbuf_from_vlib_buffer (b3); if (PREDICT_FALSE ((xd->flags & DPDK_DEVICE_FLAG_TX_OFFLOAD) && - (or_flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM))) + (or_flags & + (VNET_BUFFER_F_OFFLOAD_TCP_CKSUM + | VNET_BUFFER_F_OFFLOAD_IP_CKSUM + | VNET_BUFFER_F_OFFLOAD_UDP_CKSUM)))) { dpdk_buffer_tx_offload (xd, b0, mb0); dpdk_buffer_tx_offload (xd, b1, mb1); -- cgit 1.2.3-korg From 49d66f1f42cbc310e4fa0dc526b9fdb91d0ca220 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 20 Jul 2017 18:10:35 +0200 Subject: vlib physmem rework This patch adds support for multiple numa-aware physmem regions. Change-Id: I5c69a6f4da33c8ee21bdb8604d52fd2886f2327e Signed-off-by: Damjan Marion --- src/examples/vlib/main_stub.c | 3 +- src/plugins/dpdk/api/dpdk_api.c | 1 - src/plugins/dpdk/device/init.c | 17 +- src/plugins/dpdk/hqos/hqos.c | 1 - src/plugins/ixge/ixge.c | 32 ++- src/plugins/ixge/ixge.h | 2 + src/vlib.am | 4 +- src/vlib/buffer.c | 72 ++++- src/vlib/buffer.h | 3 +- src/vlib/buffer_funcs.h | 39 +-- src/vlib/main.c | 20 +- src/vlib/main.h | 16 +- src/vlib/physmem.h | 69 ++--- src/vlib/physmem_funcs.h | 161 +++++++++++ src/vlib/unix/physmem.c | 572 +++++++++++++++++++--------------------- src/vlib/unix/physmem.h | 65 ----- src/vlib/unix/unix.h | 24 +- src/vlib/unix/util.c | 113 +++++++- src/vlib/vlib.h | 3 +- 19 files changed, 700 insertions(+), 517 deletions(-) create mode 100644 src/vlib/physmem_funcs.h delete mode 100644 src/vlib/unix/physmem.h (limited to 'src/plugins/dpdk/device') diff --git a/src/examples/vlib/main_stub.c b/src/examples/vlib/main_stub.c index 4d74bd77..3b19c53f 100644 ---
a/src/examples/vlib/main_stub.c +++ b/src/examples/vlib/main_stub.c @@ -27,8 +27,7 @@ main_stub_init (vlib_main_t * vm) { clib_error_t *error; - if ((error = - unix_physmem_init (vm, /* fail_if_physical_memory_not_present */ 0))) + if ((error = unix_physmem_init (vm))) return error; if ((error = vlib_call_init_function (vm, unix_cli_init))) diff --git a/src/plugins/dpdk/api/dpdk_api.c b/src/plugins/dpdk/api/dpdk_api.c index 08afdd70..97c4bc75 100755 --- a/src/plugins/dpdk/api/dpdk_api.c +++ b/src/plugins/dpdk/api/dpdk_api.c @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index a795ba0e..e23542f7 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -17,10 +17,10 @@ #include #include #include +#include #include #include -#include #include #include @@ -1026,21 +1026,28 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) clib_bitmap_foreach (c, tm->cpu_socket_bitmap, ( { int pages_avail, page_size, mem; + clib_error_t *e = 0; vec_validate(mem_by_socket, c); mem = mem_by_socket[c]; page_size = 1024; - pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024); + e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); - if (pages_avail < 0 || page_size * pages_avail < mem) + if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem) use_1g = 0; + if (e) + clib_error_free (e); + page_size = 2; - pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024); + e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); - if (pages_avail < 0 || page_size * pages_avail < mem) + if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem) use_2m = 0; + + if (e) + clib_error_free (e); })); /* *INDENT-ON* */ diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c index 813eb91c..c9b85652 100644 --- a/src/plugins/dpdk/hqos/hqos.c +++ b/src/plugins/dpdk/hqos/hqos.c @@ -29,7 +29,6 @@ #include 
#include -#include #include #include #include /* enumerate all vlib messages */ diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c index e0150f41..222c148c 100644 --- a/src/plugins/ixge/ixge.c +++ b/src/plugins/ixge/ixge.c @@ -2493,10 +2493,11 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) round_pow2 (xm->n_descriptors[rt], xm->n_descriptors_per_cache_line); dq->head_index = dq->tail_index = 0; - dq->descriptors = vlib_physmem_alloc_aligned (vm, &error, - dq->n_descriptors * - sizeof (dq->descriptors[0]), - 128 /* per chip spec */ ); + dq->descriptors = + vlib_physmem_alloc_aligned (vm, xm->physmem_region, &error, + dq->n_descriptors * + sizeof (dq->descriptors[0]), + 128 /* per chip spec */ ); if (error) return error; @@ -2518,7 +2519,8 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) vlib_buffer_t *b = vlib_get_buffer (vm, dq->descriptor_buffer_indices[i]); dq->descriptors[i].rx_to_hw.tail_address = - vlib_physmem_virtual_to_physical (vm, b->data); + vlib_physmem_virtual_to_physical (vm, xm->physmem_region, + b->data); } } else @@ -2526,7 +2528,8 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) u32 i; dq->tx.head_index_write_back = - vlib_physmem_alloc (vm, &error, CLIB_CACHE_LINE_BYTES); + vlib_physmem_alloc (vm, vm->buffer_main->physmem_region, &error, + CLIB_CACHE_LINE_BYTES); for (i = 0; i < dq->n_descriptors; i++) dq->descriptors[i].tx = xm->tx_descriptor_template; @@ -2538,7 +2541,9 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) ixge_dma_regs_t *dr = get_dma_regs (xd, rt, queue_index); u64 a; - a = vlib_physmem_virtual_to_physical (vm, dq->descriptors); + a = + vlib_physmem_virtual_to_physical (vm, vm->buffer_main->physmem_region, + dq->descriptors); dr->descriptor_address[0] = a & 0xFFFFFFFF; dr->descriptor_address[1] = a >> (u64) 32; dr->n_descriptor_bytes = dq->n_descriptors * sizeof (dq->descriptors[0]); @@ -2564,7 +2569,9 @@ 
ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) dq->tx.head_index_write_back[0] = dq->head_index; a = - vlib_physmem_virtual_to_physical (vm, dq->tx.head_index_write_back); + vlib_physmem_virtual_to_physical (vm, + vm->buffer_main->physmem_region, + dq->tx.head_index_write_back); dr->tx.head_index_write_back_address[0] = /* enable bit */ 1 | a; dr->tx.head_index_write_back_address[1] = (u64) a >> (u64) 32; } @@ -2850,9 +2857,12 @@ ixge_pci_init (vlib_main_t * vm, vlib_pci_device_t * dev) void *r; ixge_device_t *xd; - /* Device found: make sure we have dma memory. */ - if (unix_physmem_is_fake (vm)) - return clib_error_return (0, "no physical memory available"); + /* Allocate physmem region for DMA buffers */ + error = vlib_physmem_region_alloc (vm, "ixge decriptors", 2 << 20, 0, + VLIB_PHYSMEM_F_INIT_MHEAP, + &xm->physmem_region); + if (error) + return error; error = vlib_pci_map_resource (dev, 0, &r); if (error) diff --git a/src/plugins/ixge/ixge.h b/src/plugins/ixge/ixge.h index 779603b3..42c1bfa5 100644 --- a/src/plugins/ixge/ixge.h +++ b/src/plugins/ixge/ixge.h @@ -1266,6 +1266,8 @@ typedef struct u32 *rx_buffers_to_add; f64 time_last_stats_update; + + vlib_physmem_region_index_t physmem_region; } ixge_main_t; ixge_main_t ixge_main; diff --git a/src/vlib.am b/src/vlib.am index 111dcfa3..cab90e2d 100644 --- a/src/vlib.am +++ b/src/vlib.am @@ -13,7 +13,7 @@ lib_LTLIBRARIES += libvlib.la -libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread +libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread -lnuma libvlib_la_DEPENDENCIES = libvppinfra.la BUILT_SOURCES += vlib/config.h @@ -65,6 +65,7 @@ nobase_include_HEADERS += \ vlib/physmem.h \ vlib/pci/pci.h \ vlib/pci/pci_config.h \ + vlib/physmem_funcs.h \ vlib/threads.h \ vlib/trace_funcs.h \ vlib/trace.h \ @@ -84,7 +85,6 @@ libvlib_la_SOURCES += \ nobase_include_HEADERS += \ vlib/unix/cj.h \ vlib/unix/mc_socket.h \ - vlib/unix/physmem.h \ vlib/unix/plugin.h \ vlib/unix/unix.h diff --git 
a/src/vlib/buffer.c b/src/vlib/buffer.c index 908368c0..a5ec0e0a 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -47,6 +47,7 @@ #include vlib_buffer_callbacks_t *vlib_buffer_callbacks = 0; +static u32 vlib_buffer_physmem_sz = 32 << 20; uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, @@ -461,7 +462,8 @@ del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) u32 i; for (i = 0; i < vec_len (f->buffer_memory_allocated); i++) - vm->os_physmem_free (f->buffer_memory_allocated[i]); + vm->os_physmem_free (vm, vm->buffer_main->physmem_region, + f->buffer_memory_allocated[i]); vec_free (f->name); vec_free (f->buffer_memory_allocated); vec_free (f->buffers); @@ -552,9 +554,9 @@ fill_free_list (vlib_main_t * vm, n_bytes = n_this_chunk * (sizeof (b[0]) + fl->n_data_bytes); /* drb: removed power-of-2 ASSERT */ - buffers = vm->os_physmem_alloc_aligned (&vm->physmem_main, - n_bytes, - sizeof (vlib_buffer_t)); + buffers = + vm->os_physmem_alloc_aligned (vm, vm->buffer_main->physmem_region, + n_bytes, sizeof (vlib_buffer_t)); if (!buffers) return n_alloc; @@ -1051,10 +1053,25 @@ VLIB_CLI_COMMAND (show_buffers_command, static) = { }; /* *INDENT-ON* */ -void -vlib_buffer_cb_init (struct vlib_main_t *vm) +clib_error_t * +vlib_buffer_main_init (struct vlib_main_t * vm) { - vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_main_t *bm; + clib_error_t *error; + + vec_validate (vm->buffer_main, 0); + bm = vm->buffer_main; + + if (vlib_buffer_callbacks) + { + /* external plugin has registered own buffer callbacks + so we just copy them and quit */ + vlib_buffer_main_t *bm = vm->buffer_main; + clib_memcpy (&bm->cb, vlib_buffer_callbacks, + sizeof (vlib_buffer_callbacks_t)); + bm->callbacks_registered = 1; + return 0; + } bm->cb.vlib_buffer_alloc_cb = &vlib_buffer_alloc_internal; bm->cb.vlib_buffer_alloc_from_free_list_cb = @@ -1064,8 +1081,49 @@ vlib_buffer_cb_init (struct vlib_main_t *vm) bm->cb.vlib_buffer_delete_free_list_cb = 
&vlib_buffer_delete_free_list_internal; clib_spinlock_init (&bm->buffer_known_hash_lockp); + + /* allocate default region */ + error = vlib_physmem_region_alloc (vm, "buffers", + vlib_buffer_physmem_sz, 0, + VLIB_PHYSMEM_F_INIT_MHEAP | + VLIB_PHYSMEM_F_HAVE_BUFFERS, + &bm->physmem_region); + + if (error == 0) + return 0; + + clib_error_free (error); + + /* we my be running unpriviledged, so try to allocate fake physmem */ + error = vlib_physmem_region_alloc (vm, "buffers (fake)", + vlib_buffer_physmem_sz, 0, + VLIB_PHYSMEM_F_FAKE | + VLIB_PHYSMEM_F_INIT_MHEAP | + VLIB_PHYSMEM_F_HAVE_BUFFERS, + &bm->physmem_region); + return error; } +static clib_error_t * +vlib_buffers_configure (vlib_main_t * vm, unformat_input_t * input) +{ + u32 size_in_mb; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "memory-size-in-mb %d", &size_in_mb)) + vlib_buffer_physmem_sz = size_in_mb << 20; + else + return unformat_parse_error (input); + } + + unformat_free (input); + return 0; +} + +VLIB_EARLY_CONFIG_FUNCTION (vlib_buffers_configure, "buffers"); + + /** @endcond */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 5504bf7c..e47dbc6d 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -408,6 +408,7 @@ typedef struct buffer index */ uword buffer_mem_start; uword buffer_mem_size; + vlib_physmem_region_index_t physmem_region; /* Buffer free callback, for subversive activities */ u32 (*buffer_free_callback) (struct vlib_main_t * vm, @@ -442,7 +443,7 @@ typedef struct void vlib_buffer_add_mem_range (struct vlib_main_t *vm, uword start, uword size); -void vlib_buffer_cb_init (struct vlib_main_t *vm); +clib_error_t *vlib_buffer_main_init (struct vlib_main_t *vm); typedef struct { diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 78bf9317..d51de6be 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -162,7 +162,7 @@ vlib_buffer_contents (vlib_main_t * 
vm, u32 buffer_index, u8 * contents) always_inline u64 vlib_get_buffer_data_physical_address (vlib_main_t * vm, u32 buffer_index) { - return vlib_physmem_offset_to_physical (&vm->physmem_main, + return vlib_physmem_offset_to_physical (vm, vm->buffer_main->physmem_region, (((uword) buffer_index) << CLIB_LOG2_CACHE_LINE_BYTES) + STRUCT_OFFSET_OF (vlib_buffer_t, @@ -455,43 +455,6 @@ vlib_copy_buffers (u32 * dst, u32 * src, u32 n) } } -always_inline void * -vlib_physmem_alloc_aligned (vlib_main_t * vm, clib_error_t ** error, - uword n_bytes, uword alignment) -{ - void *r = - vm->os_physmem_alloc_aligned (&vm->physmem_main, n_bytes, alignment); - if (!r) - *error = - clib_error_return (0, "failed to allocate %wd bytes of I/O memory", - n_bytes); - else - *error = 0; - return r; -} - -/* By default allocate I/O memory with cache line alignment. */ -always_inline void * -vlib_physmem_alloc (vlib_main_t * vm, clib_error_t ** error, uword n_bytes) -{ - return vlib_physmem_alloc_aligned (vm, error, n_bytes, - CLIB_CACHE_LINE_BYTES); -} - -always_inline void -vlib_physmem_free (vlib_main_t * vm, void *mem) -{ - return vm->os_physmem_free (mem); -} - -always_inline u64 -vlib_physmem_virtual_to_physical (vlib_main_t * vm, void *mem) -{ - vlib_physmem_main_t *pm = &vm->physmem_main; - uword o = pointer_to_uword (mem) - pm->virtual.start; - return vlib_physmem_offset_to_physical (pm, o); -} - /* Append given data to end of buffer, possibly allocating new buffers. 
*/ u32 vlib_buffer_add_data (vlib_main_t * vm, u32 free_list_index, diff --git a/src/vlib/main.c b/src/vlib/main.c index 5d99e899..7875f62a 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -1705,22 +1705,16 @@ vlib_main (vlib_main_t * volatile vm, unformat_input_t * input) if (!vm->name) vm->name = "VLIB"; - vec_validate (vm->buffer_main, 0); - if (vlib_buffer_callbacks) + if ((error = unix_physmem_init (vm))) { - /* external plugin has registered own buffer callbacks - so we just copy them */ - vlib_buffer_main_t *bm = vm->buffer_main; - clib_memcpy (&bm->cb, vlib_buffer_callbacks, - sizeof (vlib_buffer_callbacks_t)); - bm->callbacks_registered = 1; + clib_error_report (error); + goto done; } - else + + if ((error = vlib_buffer_main_init (vm))) { - vlib_physmem_main_t *vpm = &vm->physmem_main; - vlib_buffer_cb_init (vm); - unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ ); - vlib_buffer_add_mem_range (vm, vpm->virtual.start, vpm->virtual.size); + clib_error_report (error); + goto done; } if ((error = vlib_thread_init (vm))) diff --git a/src/vlib/main.h b/src/vlib/main.h index b63c63fa..4c0cde3f 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -107,9 +107,21 @@ typedef struct vlib_main_t /* Allocate/free buffer memory for DMA transfers, descriptor rings, etc. buffer memory is guaranteed to be cache-aligned. */ - void *(*os_physmem_alloc_aligned) (vlib_physmem_main_t * pm, + + clib_error_t *(*os_physmem_region_alloc) (struct vlib_main_t * vm, + char *name, u32 size, + u8 numa_node, u32 flags, + vlib_physmem_region_index_t * + idx); + + void (*os_physmem_region_free) (struct vlib_main_t * vm, + vlib_physmem_region_index_t idx); + + void *(*os_physmem_alloc_aligned) (struct vlib_main_t * vm, + vlib_physmem_region_index_t idx, uword n_bytes, uword alignment); - void (*os_physmem_free) (void *x); + void (*os_physmem_free) (struct vlib_main_t * vm, + vlib_physmem_region_index_t idx, void *x); /* Node graph main structure. 
*/ vlib_node_main_t node_main; diff --git a/src/vlib/physmem.h b/src/vlib/physmem.h index 9e7d52a6..a7fed124 100644 --- a/src/vlib/physmem.h +++ b/src/vlib/physmem.h @@ -40,62 +40,35 @@ #ifndef included_vlib_physmem_h #define included_vlib_physmem_h -typedef struct -{ - uword start, end, size; -} vlib_physmem_region_t; +typedef u8 vlib_physmem_region_index_t; typedef struct { - vlib_physmem_region_t virtual; - - uword log2_n_bytes_per_page; - - /* 1 << log2_n_bytes_per_page - 1. */ - uword page_mask; - + vlib_physmem_region_index_t index; + void *mem; + uword size; + int fd; + u8 log2_page_size; + u16 n_pages; + u32 page_mask; + + void *heap; + u32 flags; +#define VLIB_PHYSMEM_F_INIT_MHEAP (1<<0) +#define VLIB_PHYSMEM_F_HAVE_BUFFERS (1<<1) +#define VLIB_PHYSMEM_F_FAKE (1<<2) + + u8 numa_node; u64 *page_table; + u8 *name; +} vlib_physmem_region_t; - /* is fake physmem */ - u8 is_fake; -} vlib_physmem_main_t; - -always_inline u64 -vlib_physmem_offset_to_physical (vlib_physmem_main_t * pm, uword o) -{ - uword page_index = o >> pm->log2_n_bytes_per_page; - ASSERT (o < pm->virtual.size); - ASSERT (pm->page_table[page_index] != 0); - return (vec_elt (pm->page_table, page_index) + (o & pm->page_mask)); -} - -always_inline int -vlib_physmem_is_virtual (vlib_physmem_main_t * pm, uword p) -{ - return p >= pm->virtual.start && p < pm->virtual.end; -} - -always_inline uword -vlib_physmem_offset_of (vlib_physmem_main_t * pm, void *p) -{ - uword a = pointer_to_uword (p); - uword o; - - ASSERT (vlib_physmem_is_virtual (pm, a)); - o = a - pm->virtual.start; - - /* Offset must fit in 32 bits. 
*/ - ASSERT ((uword) o == a - pm->virtual.start); - return o; -} -always_inline void * -vlib_physmem_at_offset (vlib_physmem_main_t * pm, uword offset) +typedef struct { - ASSERT (offset < pm->virtual.size); - return uword_to_pointer (pm->virtual.start + offset, void *); -} + vlib_physmem_region_t *regions; +} vlib_physmem_main_t; #endif /* included_vlib_physmem_h */ diff --git a/src/vlib/physmem_funcs.h b/src/vlib/physmem_funcs.h new file mode 100644 index 00000000..dbb8d9de --- /dev/null +++ b/src/vlib/physmem_funcs.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * physmem.h: virtual <-> physical memory mapping for VLIB buffers + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_physmem_funcs_h +#define included_vlib_physmem_funcs_h + +always_inline vlib_physmem_region_t * +vlib_physmem_get_region (vlib_main_t * vm, u8 index) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + return pool_elt_at_index (vpm->regions, index); +} + +always_inline u64 +vlib_physmem_offset_to_physical (vlib_main_t * vm, + vlib_physmem_region_index_t idx, uword o) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + uword page_index = o >> pr->log2_page_size; + ASSERT (o < pr->size); + ASSERT (pr->page_table[page_index] != 0); + return (vec_elt (pr->page_table, page_index) + (o & pr->page_mask)); +} + +always_inline int +vlib_physmem_is_virtual (vlib_main_t * vm, vlib_physmem_region_index_t idx, + uword p) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + return p >= pointer_to_uword (pr->mem) + && p < (pointer_to_uword (pr->mem) + pr->size); +} + +always_inline uword +vlib_physmem_offset_of (vlib_main_t * vm, vlib_physmem_region_index_t idx, + void *p) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + uword a = pointer_to_uword (p); + uword o; + + ASSERT (vlib_physmem_is_virtual (vm, idx, a)); + o = a - pointer_to_uword (pr->mem); + + /* Offset must fit in 32 bits. 
*/ + ASSERT ((uword) o == a - pointer_to_uword (pr->mem)); + + return o; +} + +always_inline void * +vlib_physmem_at_offset (vlib_main_t * vm, vlib_physmem_region_index_t idx, + uword offset) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + ASSERT (offset < pr->size); + return uword_to_pointer (pointer_to_uword (pr->mem) + offset, void *); +} + +always_inline void * +vlib_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, + clib_error_t ** error, + uword n_bytes, uword alignment) +{ + void *r = vm->os_physmem_alloc_aligned (vm, idx, n_bytes, alignment); + if (!r) + *error = + clib_error_return (0, "failed to allocate %wd bytes of I/O memory", + n_bytes); + else + *error = 0; + return r; +} + +/* By default allocate I/O memory with cache line alignment. */ +always_inline void * +vlib_physmem_alloc (vlib_main_t * vm, vlib_physmem_region_index_t idx, + clib_error_t ** error, uword n_bytes) +{ + return vlib_physmem_alloc_aligned (vm, idx, error, n_bytes, + CLIB_CACHE_LINE_BYTES); +} + +always_inline void +vlib_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, + void *mem) +{ + return vm->os_physmem_free (vm, idx, mem); +} + +always_inline u64 +vlib_physmem_virtual_to_physical (vlib_main_t * vm, + vlib_physmem_region_index_t idx, void *mem) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr = pool_elt_at_index (vpm->regions, idx); + uword o = mem - pr->mem; + return vlib_physmem_offset_to_physical (vm, idx, o); +} + + +always_inline clib_error_t * +vlib_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, + u8 numa_node, u32 flags, + vlib_physmem_region_index_t * idx) +{ + return vm->os_physmem_region_alloc (vm, name, size, numa_node, flags, idx); +} + +always_inline void +vlib_physmem_region_free (struct vlib_main_t *vm, + vlib_physmem_region_index_t idx) +{ + vm->os_physmem_region_free (vm, idx); +} + +#endif /* included_vlib_physmem_funcs_h */ + +/* + * fd.io 
coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c index 27a5bacf..d5d5d6c8 100644 --- a/src/vlib/unix/physmem.c +++ b/src/vlib/unix/physmem.c @@ -37,24 +37,66 @@ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifndef __NR_memfd_create +#if defined __x86_64__ +#define __NR_memfd_create 319 +#elif defined __arm__ +#define __NR_memfd_create 385 +#elif defined __aarch64__ +#define __NR_memfd_create 279 +#else +#error "__NR_memfd_create unknown for this architecture" +#endif +#endif + +static inline int +memfd_create (const char *name, unsigned int flags) +{ + return syscall (__NR_memfd_create, name, flags); +} + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif +#define MFD_ALLOW_SEALING 0x0002U +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) -static physmem_main_t physmem_main; +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ static void * -unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, - uword alignment) +unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, + uword n_bytes, uword alignment) { - physmem_main_t *pm = &physmem_main; + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); uword lo_offset, hi_offset; uword *to_free = 0; + if (pr->heap == 0) + return 0; + /* IO memory is always at least cache aligned. 
*/ alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); while (1) { - mheap_get_aligned (pm->heap, n_bytes, + mheap_get_aligned (pr->heap, n_bytes, /* align */ alignment, /* align offset */ 0, &lo_offset); @@ -63,11 +105,14 @@ unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, if (lo_offset == ~0) break; + if (pr->flags & VLIB_PHYSMEM_F_FAKE) + break; + /* Make sure allocation does not span DMA physical chunk boundary. */ hi_offset = lo_offset + n_bytes - 1; - if ((lo_offset >> vpm->log2_n_bytes_per_page) == - (hi_offset >> vpm->log2_n_bytes_per_page)) + if ((lo_offset >> pr->log2_page_size) == + (hi_offset >> pr->log2_page_size)) break; /* Allocation would span chunk boundary, queue it to be freed as soon as @@ -79,380 +124,311 @@ unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, { uword i; for (i = 0; i < vec_len (to_free); i++) - mheap_put (pm->heap, to_free[i]); + mheap_put (pr->heap, to_free[i]); vec_free (to_free); } - return lo_offset != ~0 ? pm->heap + lo_offset : 0; + return lo_offset != ~0 ? pr->heap + lo_offset : 0; } static void -unix_physmem_free (void *x) +unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x) { - physmem_main_t *pm = &physmem_main; - + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); /* Return object to region's heap. 
*/ - mheap_put (pm->heap, x - pm->heap); + mheap_put (pr->heap, x - pr->heap); } -static void -htlb_shutdown (void) +static u64 +get_page_paddr (int fd, uword addr) { - physmem_main_t *pm = &physmem_main; - - if (!pm->shmid) - return; - shmctl (pm->shmid, IPC_RMID, 0); - pm->shmid = 0; -} + int pagesize = sysconf (_SC_PAGESIZE); + u64 seek, pagemap = 0; -/* try to use huge TLB pgs if possible */ -static int -htlb_init (vlib_main_t * vm) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - physmem_main_t *pm = &physmem_main; - u64 hugepagesize, pagesize; - u64 pfn, seek_loc; - u64 cur, physaddr, ptbits; - int fd, i; - - pm->shmid = shmget (11 /* key, my amp goes to 11 */ , pm->mem_size, - IPC_CREAT | SHM_HUGETLB | SHM_R | SHM_W); - if (pm->shmid < 0) + seek = ((u64) addr / pagesize) * sizeof (u64); + if (lseek (fd, seek, SEEK_SET) != seek) { - clib_unix_warning ("shmget"); + clib_unix_warning ("lseek to 0x%llx", seek); return 0; } - - pm->mem = shmat (pm->shmid, NULL, 0 /* flags */ ); - if (pm->mem == 0) + if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap))) { - shmctl (pm->shmid, IPC_RMID, 0); + clib_unix_warning ("read ptbits"); return 0; } + if ((pagemap & (1ULL << 63)) == 0) + return 0; - memset (pm->mem, 0, pm->mem_size); + pagemap &= pow2_mask (55); - /* $$$ get page size info from /proc/meminfo */ - hugepagesize = 2 << 20; - pagesize = 4 << 10; - vpm->log2_n_bytes_per_page = min_log2 (hugepagesize); - vec_resize (vpm->page_table, pm->mem_size / hugepagesize); + return pagemap * pagesize; +} - vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page); - vpm->virtual.start = pointer_to_uword (pm->mem); - vpm->virtual.size = pm->mem_size; - vpm->virtual.end = vpm->virtual.start + vpm->virtual.size; +static clib_error_t * +unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, + u8 numa_node, u32 flags, + vlib_physmem_region_index_t * idx) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr; + clib_error_t *error = 
0; + int pagemap_fd = -1; + u8 *mount_dir = 0; + u8 *filename = 0; + struct stat st; + int old_mpol; + int mmap_flags; + struct bitmask *old_mask = numa_allocate_nodemask (); - fd = open ("/proc/self/pagemap", O_RDONLY); + if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0) + return clib_error_return (0, "not allowed"); - if (fd < 0) + pool_get (vpm->regions, pr); + + if ((pr - vpm->regions) >= 256) { - (void) shmdt (pm->mem); - return 0; + error = clib_error_return (0, "maximum number of regions reached"); + goto error; } - pm->heap = mheap_alloc_with_flags (pm->mem, pm->mem_size, - /* Don't want mheap mmap/munmap with IO memory. */ - MHEAP_FLAG_DISABLE_VM | - MHEAP_FLAG_THREAD_SAFE); + pr->index = pr - vpm->regions; + pr->fd = -1; + pr->flags = flags; - cur = pointer_to_uword (pm->mem); - i = 0; + if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0) + == -1) + { + error = clib_error_return_unix (0, "get_mempolicy"); + goto error; + } - while (cur < pointer_to_uword (pm->mem) + pm->mem_size) + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - pfn = (u64) cur / pagesize; - seek_loc = pfn * sizeof (u64); - if (lseek (fd, seek_loc, SEEK_SET) != seek_loc) - { - clib_unix_warning ("lseek to 0x%llx", seek_loc); - shmctl (pm->shmid, IPC_RMID, 0); - close (fd); - return 0; - } - if (read (fd, &ptbits, sizeof (ptbits)) != (sizeof (ptbits))) + if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) { - clib_unix_warning ("read ptbits"); - shmctl (pm->shmid, IPC_RMID, 0); - close (fd); - return 0; + error = clib_error_return_unix (0, "open '/proc/self/pagemap'"); + goto error; } - /* bits 0-54 are the physical page number */ - physaddr = (ptbits & 0x7fffffffffffffULL) * pagesize; - if (CLIB_DEBUG > 1) - fformat (stderr, "pm: virtual 0x%llx physical 0x%llx\n", - cur, physaddr); - vpm->page_table[i++] = physaddr; + mount_dir = format (0, "%s/physmem_region%d%c", + vlib_unix_get_runtime_dir (), pr->index, 0); + filename = format (0, 
"%s/mem%c", mount_dir, 0); - cur += hugepagesize; - } - close (fd); - atexit (htlb_shutdown); - return 1; -} - -int vlib_app_physmem_init (vlib_main_t * vm, - physmem_main_t * pm, int) __attribute__ ((weak)); -int -vlib_app_physmem_init (vlib_main_t * vm, physmem_main_t * pm, int x) -{ - return 0; -} - -clib_error_t * -unix_physmem_init (vlib_main_t * vm, int physical_memory_required) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - physmem_main_t *pm = &physmem_main; - clib_error_t *error = 0; - - /* Avoid multiple calls. */ - if (vm->os_physmem_alloc_aligned) - return error; + unlink ((char *) mount_dir); - vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; - vm->os_physmem_free = unix_physmem_free; - pm->mem = MAP_FAILED; + error = vlib_unix_recursive_mkdir ((char *) mount_dir); + if (error) + goto error; - if (pm->mem_size == 0) - pm->mem_size = 16 << 20; + if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL)) + { + error = clib_error_return_unix (0, "mount hugetlb directory '%s'", + mount_dir); + goto error; + } - /* OK, Mr. 
App, you tell us */ - if (vlib_app_physmem_init (vm, pm, physical_memory_required)) - return 0; + if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1) + { + error = clib_error_return_unix (0, "open"); + goto error; + } - if (!pm->no_hugepages && htlb_init (vm)) + mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED; + } + else { - fformat (stderr, "%s: use huge pages\n", __FUNCTION__); - return 0; + if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1) + return clib_error_return_unix (0, "memfd_create"); + + if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) + { + error = + clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)"); + goto error; + } + mmap_flags = MAP_SHARED; } - pm->mem = - mmap (0, pm->mem_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (pm->mem == MAP_FAILED) + if (fstat (pr->fd, &st)) { - error = clib_error_return_unix (0, "mmap"); - goto done; + error = clib_error_return_unix (0, "fstat"); + goto error; } - pm->heap = mheap_alloc (pm->mem, pm->mem_size); - - /* Identity map with a single page. 
*/ - vpm->log2_n_bytes_per_page = min_log2 (pm->mem_size); - vec_add1 (vpm->page_table, pointer_to_uword (pm->mem)); - - vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page); - vpm->virtual.start = pointer_to_uword (pm->mem); - vpm->virtual.size = pm->mem_size; - vpm->virtual.end = vpm->virtual.start + vpm->virtual.size; - vpm->is_fake = 1; + pr->log2_page_size = min_log2 (st.st_blksize); + pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1; + size = pr->n_pages * (1 << pr->log2_page_size); - fformat (stderr, "%s: use fake dma pages\n", __FUNCTION__); + if ((ftruncate (pr->fd, size)) == -1) + { + error = clib_error_return_unix (0, "ftruncate length: %d", size); + goto error; + } -done: - if (error) + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - if (pm->mem != MAP_FAILED) - munmap (pm->mem, pm->mem_size); + error = vlib_sysfs_prealloc_hugepages (numa_node, + 1 << (pr->log2_page_size - 10), + pr->n_pages); + if (error) + goto error; } - return error; -} -static clib_error_t * -show_physmem (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - physmem_main_t *pm = &physmem_main; + numa_set_preferred (numa_node); - if (pm->heap) - vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1); - else - vlib_cli_output (vm, "No physmem allocated."); - return 0; -} + pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0); -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_physmem_command, static) = { - .path = "show physmem", - .short_help = "Show physical memory allocation", - .function = show_physmem, -}; -/* *INDENT-ON* */ + if (pr->mem == MAP_FAILED) + { + pr->mem = 0; + error = clib_error_return_unix (0, "mmap"); + goto error; + } -static clib_error_t * -show_affinity (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - cpu_set_t set; - cpu_set_t *setp = &set; - int i, rv; - u8 *s = 0; - int first_set_bit_in_run = -1; - int last_set_bit_in_run = -1; - int output_done = 0; - - rv = 
sched_getaffinity (0 /* pid, 0 = this proc */ , - sizeof (*setp), setp); - if (rv < 0) + if (set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1) { - vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", - strerror (errno)); - return 0; + error = clib_error_return_unix (0, "set_mempolicy"); + goto error; } - for (i = 0; i < 64; i++) + pr->size = pr->n_pages << pr->log2_page_size; + pr->page_mask = (1 << pr->log2_page_size) - 1; + pr->numa_node = numa_node; + pr->name = format (0, "%s", name); + + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - if (CPU_ISSET (i, setp)) - { - if (first_set_bit_in_run == -1) - { - first_set_bit_in_run = i; - last_set_bit_in_run = i; - if (output_done) - s = format (s, ","); - s = format (s, "%d-", i); - output_done = 1; - } - else - { - if (i == (last_set_bit_in_run + 1)) - last_set_bit_in_run = i; - } - } - else + int i; + for (i = 0; i < pr->n_pages; i++) { - if (first_set_bit_in_run != -1) + void *ptr = pr->mem + (i << pr->log2_page_size); + int node; + move_pages (0, 1, &ptr, 0, &node, 0); + if (numa_node != node) { - if (first_set_bit_in_run == (i - 1)) - { - _vec_len (s) -= 2 + ((first_set_bit_in_run / 10)); - } - s = format (s, "%d", last_set_bit_in_run); - first_set_bit_in_run = -1; - last_set_bit_in_run = -1; + clib_warning + ("physmem page for region \'%s\' allocated on the wrong" + " numa node (requested %u actual %u)", pr->name, + pr->numa_node, node, i); + break; } } } - if (first_set_bit_in_run != -1) - s = format (s, "%d", first_set_bit_in_run); - - vlib_cli_output (vm, "Process runs on: %v", s); - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_affinity_command, static) = { - .path = "show affinity", - .short_help = "Show process cpu affinity", - .function = show_affinity, -}; -/* *INDENT-ON* */ + if (flags & VLIB_PHYSMEM_F_INIT_MHEAP) + { + pr->heap = mheap_alloc_with_flags (pr->mem, pr->size, + /* Don't want mheap mmap/munmap with IO memory. 
*/ + MHEAP_FLAG_DISABLE_VM | + MHEAP_FLAG_THREAD_SAFE); + fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1); + } -static clib_error_t * -set_affinity (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - cpu_set_t set; - cpu_set_t *setp = &set; - int i, rv; - int another_round; - u32 first, last; + if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS) + { + vlib_buffer_add_mem_range (vm, pointer_to_uword (pr->mem), pr->size); + } - memset (setp, 0, sizeof (*setp)); + *idx = pr->index; - do + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - another_round = 0; - if (unformat (input, "%d-%d,", &first, &last)) + int i; + for (i = 0; i < pr->n_pages; i++) { - if (first > 64 || last > 64) - { - barf1: - vlib_cli_output (vm, "range %d-%d invalid", first, last); - return 0; - } - - for (i = first; i <= last; i++) - CPU_SET (i, setp); - another_round = 1; + uword vaddr = + pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size); + u64 page_paddr = get_page_paddr (pagemap_fd, vaddr); + vec_add1 (pr->page_table, page_paddr); } - else if (unformat (input, "%d-%d", &first, &last)) - { - if (first > 64 || last > 64) - goto barf1; + } - for (i = first; i <= last; i++) - CPU_SET (i, setp); - } - else if (unformat (input, "%d,", &first)) - { - if (first > 64) - { - barf2: - vlib_cli_output (vm, "cpu %d invalid", first); - return 0; - } - CPU_SET (first, setp); - another_round = 1; - } - else if (unformat (input, "%d", &first)) - { - if (first > 64) - goto barf2; + goto done; - CPU_SET (first, setp); - } - } - while (another_round); +error: + if (pr->fd > -1) + close (pr->fd); - rv = sched_setaffinity (0 /* pid, 0 = this proc */ , - sizeof (*setp), setp); + if (pr->mem) + munmap (pr->mem, size); - if (rv < 0) + memset (pr, 0, sizeof (*pr)); + pool_put (vpm->regions, pr); + +done: + if (mount_dir) { - vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", - strerror (errno)); - return 0; + umount2 ((char *) mount_dir, MNT_DETACH); + rmdir ((char *) 
mount_dir); + vec_free (mount_dir); } - return show_affinity (vm, input, cmd); + numa_free_cpumask (old_mask); + vec_free (filename); + if (pagemap_fd > -1) + close (pagemap_fd); + return error; } -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (set_affinity_command, static) = { - .path = "set affinity", - .short_help = "Set process cpu affinity", - .function = set_affinity, -}; -/* *INDENT-ON* */ +static void +unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + + if (pr->fd > 0) + close (pr->fd); + munmap (pr->mem, pr->size); + vec_free (pr->name); + pool_put (vpm->regions, pr); +} + +clib_error_t * +unix_physmem_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + + /* Avoid multiple calls. */ + if (vm->os_physmem_alloc_aligned) + return error; + + vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; + vm->os_physmem_free = unix_physmem_free; + vm->os_physmem_region_alloc = unix_physmem_region_alloc; + vm->os_physmem_region_free = unix_physmem_region_free; + + return error; +} static clib_error_t * -vlib_physmem_configure (vlib_main_t * vm, unformat_input_t * input) +show_physmem (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) { - physmem_main_t *pm = &physmem_main; - u32 size_in_mb; + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr; - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + /* *INDENT-OFF* */ + pool_foreach (pr, vpm->regions, ( { - if (unformat (input, "no-huge") || unformat (input, "no-huge-pages")) - pm->no_hugepages = 1; - - else if (unformat (input, "size-in-mb %d", &size_in_mb) || - unformat (input, "size %d", &size_in_mb)) - pm->mem_size = size_in_mb << 20; + vlib_cli_output (vm, "index %u name '%s' page-size %uKB num-pages %d " + "numa-node %u fd %d\n", + pr->index, pr->name, (1 << (pr->log2_page_size -10)), + pr->n_pages, pr->numa_node, 
pr->fd); + if (pr->heap) + vlib_cli_output (vm, " %U", format_mheap, pr->heap, /* verbose */ 1); else - return unformat_parse_error (input); - } - - unformat_free (input); + vlib_cli_output (vm, " no heap\n"); + })); + /* *INDENT-ON* */ return 0; } -VLIB_EARLY_CONFIG_FUNCTION (vlib_physmem_configure, "physmem"); +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_physmem_command, static) = { + .path = "show physmem", + .short_help = "Show physical memory allocation", + .function = show_physmem, +}; +/* *INDENT-ON* */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vlib/unix/physmem.h b/src/vlib/unix/physmem.h deleted file mode 100644 index 5519a7d6..00000000 --- a/src/vlib/unix/physmem.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __included_physmem_h__ -#define __included_physmem_h__ - -/* Manage I/O physical memory. */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include - -#include -#include - -#include /* for open */ -#include /* for flock */ -#include -#include -#include -#include -#include -#include - -typedef struct -{ - /* Virtual memory via mmaped. */ - void *mem; - - /* Size in bytes. */ - uword mem_size; - - /* Heap allocated out of virtual memory. */ - void *heap; - - /* huge TLB segment id */ - int shmid; - - /* should we try to use htlb ? 
*/ - int no_hugepages; - -} physmem_main_t; - -#endif /* __included_physmem_h__ */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index 97f58944..b5a33427 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -195,18 +195,7 @@ unix_save_error (unix_main_t * um, clib_error_t * error) /* Main function for Unix VLIB. */ int vlib_unix_main (int argc, char *argv[]); -/* Call to allocate/initialize physical DMA memory subsystem. - This is not an init function so that users can explicitly enable/disable - physmem when its not needed. */ -clib_error_t *unix_physmem_init (vlib_main_t * vm, - int fail_if_physical_memory_not_present); - -static inline int -unix_physmem_is_fake (vlib_main_t * vm) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - return vpm->is_fake; -} +clib_error_t *unix_physmem_init (vlib_main_t * vm); /* Set prompt for CLI. */ void vlib_unix_cli_set_prompt (char *prompt); @@ -234,7 +223,16 @@ clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...); u8 *vlib_sysfs_link_to_name (char *link); -int vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size); +clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node, + int page_size, int nr); +clib_error_t *vlib_sysfs_get_nr_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node, + int page_size, int nr); clib_error_t *foreach_directory_file (char *dir_name, clib_error_t * (*f) (void *arg, diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c index 312cc9b5..0e252aca 100644 --- a/src/vlib/unix/util.c +++ b/src/vlib/unix/util.c @@ -189,37 +189,132 @@ vlib_sysfs_link_to_name 
(char *link) return s; } -int -vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size) +clib_error_t * +vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr) { + clib_error_t *error = 0; struct stat sb; u8 *p = 0; - int r = -1; p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); if (stat ((char *) p, &sb) == 0) { if (S_ISDIR (sb.st_mode) == 0) - goto done; + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } } else if (numa_node == 0) { vec_reset_length (p); p = format (p, "/sys/kernel/mm%c", 0); if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) - goto done; + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } } else - goto done; + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } _vec_len (p) -= 1; - p = format (p, "/hugepages/hugepages-%ukB/free_hugepages%c", page_size, 0); - vlib_sysfs_read ((char *) p, "%d", &r); + p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0); + vlib_sysfs_write ((char *) p, "%d", nr); done: vec_free (p); - return r; + return error; +} + + +static clib_error_t * +vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node, + int page_size, int *val) +{ + clib_error_t *error = 0; + struct stat sb; + u8 *p = 0; + + p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); + + if (stat ((char *) p, &sb) == 0) + { + if (S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } + } + else if (numa_node == 0) + { + vec_reset_length (p); + p = format (p, "/sys/kernel/mm%c", 0); + if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } + } + else + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } + + _vec_len (p) -= 1; + p = format (p, 
"/hugepages/hugepages-%ukB/%s_hugepages%c", page_size, + type, 0); + error = vlib_sysfs_read ((char *) p, "%d", val); + +done: + vec_free (p); + return error; +} + +clib_error_t * +vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, int page_size, + int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr) +{ + clib_error_t *error = 0; + int n, needed; + error = vlib_sysfs_get_free_hugepages (numa_node, page_size, &n); + if (error) + return error; + needed = nr - n; + if (needed <= 0) + return 0; + + error = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &n); + if (error) + return error; + clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u", + needed, page_size, numa_node); + return vlib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed); } clib_error_t * diff --git a/src/vlib/vlib.h b/src/vlib/vlib.h index b146a49b..eed5c5bc 100644 --- a/src/vlib/vlib.h +++ b/src/vlib/vlib.h @@ -50,6 +50,7 @@ struct vlib_main_t; /* All includes in alphabetical order. */ +#include #include #include #include @@ -57,7 +58,6 @@ struct vlib_main_t; #include #include #include -#include #include /* Main include depends on other vlib/ includes so we put it last. */ @@ -65,6 +65,7 @@ struct vlib_main_t; /* Inline/extern function declarations. 
*/ #include +#include #include #include #include -- cgit 1.2.3-korg From 3b64d6334b4e8d0759cff043a55042f88d1ccb0e Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 8 Sep 2017 12:26:12 +0200 Subject: vlib: move linux-specific code to vlib/linux Change-Id: Id79d2c2be7a98e15416a537c890a8f2dd6d4464d Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/init.c | 1 + src/plugins/memif/memif.c | 1 + src/plugins/memif/private.h | 30 -- src/vlib.am | 7 +- src/vlib/linux/pci.c | 666 +++++++++++++++++++++++++++++++++ src/vlib/linux/physmem.c | 411 ++++++++++++++++++++ src/vlib/linux/syscall.h | 58 +++ src/vlib/linux/sysfs.c | 250 +++++++++++++ src/vlib/linux/sysfs.h | 44 +++ src/vlib/pci/linux_pci.c | 665 -------------------------------- src/vlib/threads_cli.c | 1 + src/vlib/unix/physmem.c | 439 ---------------------- src/vlib/unix/unix.h | 17 - src/vlib/unix/util.c | 219 ----------- src/vnet/devices/af_packet/af_packet.c | 1 + 15 files changed, 1438 insertions(+), 1372 deletions(-) create mode 100644 src/vlib/linux/pci.c create mode 100644 src/vlib/linux/physmem.c create mode 100644 src/vlib/linux/syscall.h create mode 100644 src/vlib/linux/sysfs.c create mode 100644 src/vlib/linux/sysfs.h delete mode 100644 src/vlib/pci/linux_pci.c delete mode 100644 src/vlib/unix/physmem.c (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index e23542f7..4ef3b676 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c index 7e2d947f..4c387b92 100644 --- a/src/plugins/memif/memif.c +++ b/src/plugins/memif/memif.c @@ -33,6 +33,7 @@ #include #include +#include #include #include #include diff --git a/src/plugins/memif/private.h b/src/plugins/memif/private.h index 985ac5ec..b5f2f8ff 100644 --- a/src/plugins/memif/private.h +++ b/src/plugins/memif/private.h @@ 
-228,24 +228,6 @@ int memif_create_if (vlib_main_t * vm, memif_create_if_args_t * args); int memif_delete_if (vlib_main_t * vm, memif_if_t * mif); clib_error_t *memif_plugin_api_hookup (vlib_main_t * vm); -#ifndef __NR_memfd_create -#if defined __x86_64__ -#define __NR_memfd_create 319 -#elif defined __arm__ -#define __NR_memfd_create 385 -#elif defined __aarch64__ -#define __NR_memfd_create 279 -#else -#error "__NR_memfd_create unknown for this architecture" -#endif -#endif - -static inline int -memfd_create (const char *name, unsigned int flags) -{ - return syscall (__NR_memfd_create, name, flags); -} - static_always_inline void * memif_get_buffer (memif_if_t * mif, memif_ring_t * ring, u16 slot) { @@ -253,18 +235,6 @@ memif_get_buffer (memif_if_t * mif, memif_ring_t * ring, u16 slot) return mif->regions[region].shm + ring->desc[slot].offset; } -#ifndef F_LINUX_SPECIFIC_BASE -#define F_LINUX_SPECIFIC_BASE 1024 -#endif -#define MFD_ALLOW_SEALING 0x0002U -#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) - -#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -#define F_SEAL_GROW 0x0004 /* prevent file from growing */ -#define F_SEAL_WRITE 0x0008 /* prevent writes */ - /* memif.c */ clib_error_t *memif_init_regions_and_queues (memif_if_t * mif); clib_error_t *memif_connect (memif_if_t * mif); diff --git a/src/vlib.am b/src/vlib.am index cab90e2d..41d68690 100644 --- a/src/vlib.am +++ b/src/vlib.am @@ -32,13 +32,15 @@ libvlib_la_SOURCES = \ vlib/format.c \ vlib/i2c.c \ vlib/init.c \ + vlib/linux/pci.c \ + vlib/linux/physmem.c \ + vlib/linux/sysfs.c \ vlib/main.c \ vlib/mc.c \ vlib/node.c \ vlib/node_cli.c \ vlib/node_format.c \ vlib/pci/pci.c \ - vlib/pci/linux_pci.c \ vlib/threads.c \ vlib/threads_cli.c \ vlib/trace.c @@ -58,6 +60,8 @@ nobase_include_HEADERS += \ vlib/global_funcs.h \ vlib/i2c.h \ vlib/init.h \ + vlib/linux/sysfs.h \ + 
vlib/linux/syscall.h \ vlib/main.h \ vlib/mc.h \ vlib/node_funcs.h \ @@ -79,7 +83,6 @@ libvlib_la_SOURCES += \ vlib/unix/mc_socket.c \ vlib/unix/plugin.c \ vlib/unix/plugin.h \ - vlib/unix/physmem.c \ vlib/unix/util.c nobase_include_HEADERS += \ diff --git a/src/vlib/linux/pci.c b/src/vlib/linux/pci.c new file mode 100644 index 00000000..cd2affdc --- /dev/null +++ b/src/vlib/linux/pci.c @@ -0,0 +1,666 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.c: Linux user space PCI bus management. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct +{ + /* /sys/bus/pci/devices/... directory name for this device. */ + u8 *dev_dir_name; + + /* Resource file descriptors. */ + int *resource_fds; + + /* File descriptor for config space read/write. */ + int config_fd; + + /* File descriptor for /dev/uio%d */ + int uio_fd; + + /* Minor device for uio device. */ + u32 uio_minor; + + /* Index given by unix_file_add. */ + u32 unix_file_index; + +} linux_pci_device_t; + +/* Pool of PCI devices. */ +typedef struct +{ + vlib_main_t *vlib_main; + linux_pci_device_t *linux_pci_devices; +} linux_pci_main_t; + +extern linux_pci_main_t linux_pci_main; + +/* Call to allocate/initialize the pci subsystem. + This is not an init function so that users can explicitly enable + pci only when it's needed. 
*/ +clib_error_t *pci_bus_init (vlib_main_t * vm); + +clib_error_t *vlib_pci_bind_to_uio (vlib_pci_device_t * d, + char *uio_driver_name); + +linux_pci_main_t linux_pci_main; + +clib_error_t * +vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) +{ + clib_error_t *error = 0; + u8 *s = 0, *driver_name = 0; + DIR *dir = 0; + struct dirent *e; + int fd, clear_driver_override = 0; + u8 *dev_dir_name = format (0, "/sys/bus/pci/devices/%U", + format_vlib_pci_addr, &d->bus_address); + + s = format (s, "%v/driver%c", dev_dir_name, 0); + driver_name = vlib_sysfs_link_to_name ((char *) s); + vec_reset_length (s); + + if (driver_name && + ((strcmp ("vfio-pci", (char *) driver_name) == 0) || + (strcmp ("uio_pci_generic", (char *) driver_name) == 0) || + (strcmp ("igb_uio", (char *) driver_name) == 0))) + goto done; + + /* walk through all linux interfaces and if an interface belonging to + this device is found, check if that interface is admin up */ + dir = opendir ("/sys/class/net"); + s = format (s, "%U%c", format_vlib_pci_addr, &d->bus_address, 0); + + if (!dir) + { + error = clib_error_return (0, "Skipping PCI device %U: failed to " + "read /sys/class/net", + format_vlib_pci_addr, &d->bus_address); + goto done; + } + + fd = socket (PF_INET, SOCK_DGRAM, 0); + if (fd < 0) + { + error = clib_error_return_unix (0, "socket"); + goto done; + } + + while ((e = readdir (dir))) + { + struct ifreq ifr; + struct ethtool_drvinfo drvinfo; + + if (e->d_name[0] == '.') /* skip . and ..
*/ + continue; + + memset (&ifr, 0, sizeof ifr); + memset (&drvinfo, 0, sizeof drvinfo); + ifr.ifr_data = (char *) &drvinfo; + strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); + drvinfo.cmd = ETHTOOL_GDRVINFO; + if (ioctl (fd, SIOCETHTOOL, &ifr) < 0) + { + /* Some interfaces (eg "lo") don't support this ioctl */ + if ((errno != ENOTSUP) && (errno != ENODEV)) + clib_unix_warning ("ioctl fetch intf %s bus info error", + e->d_name); + continue; + } + + if (strcmp ((char *) s, drvinfo.bus_info)) + continue; + + memset (&ifr, 0, sizeof (ifr)); + strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); + if (ioctl (fd, SIOCGIFFLAGS, &ifr) < 0) + { + error = clib_error_return_unix (0, "ioctl fetch intf %s flags", + e->d_name); + close (fd); + goto done; + } + + if (ifr.ifr_flags & IFF_UP) + { + error = clib_error_return (0, "Skipping PCI device %U as host " + "interface %s is up", + format_vlib_pci_addr, &d->bus_address, + e->d_name); + close (fd); + goto done; + } + } + + close (fd); + vec_reset_length (s); + + s = format (s, "%v/driver/unbind%c", dev_dir_name, 0); + vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + vec_reset_length (s); + + s = format (s, "%v/driver_override%c", dev_dir_name, 0); + if (access ((char *) s, F_OK) == 0) + { + vlib_sysfs_write ((char *) s, "%s", uio_driver_name); + clear_driver_override = 1; + } + else + { + vec_reset_length (s); + s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0); + vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, + d->device_id); + } + vec_reset_length (s); + + s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0); + vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + vec_reset_length (s); + + if (clear_driver_override) + { + s = format (s, "%v/driver_override%c", dev_dir_name, 0); + vlib_sysfs_write ((char *) s, "%c", 0); + vec_reset_length (s); + } + +done: + closedir (dir); + vec_free (s); + vec_free (dev_dir_name); + vec_free 
(driver_name);
+  return error;
+}
+
+
+/* Directory-scan callback used on a device's uio/ directory: parses the
+   minor number out of a "uioN" entry name and stores it in the
+   linux_pci_device_t passed as ARG.  Aborts the process if the entry name
+   does not have that form. */
+static clib_error_t *
+scan_uio_dir (void *arg, u8 * path_name, u8 * file_name)
+{
+  linux_pci_device_t *l = arg;
+  unformat_input_t input;
+
+  unformat_init_string (&input, (char *) file_name, vec_len (file_name));
+
+  if (!unformat (&input, "uio%d", &l->uio_minor))
+    abort ();
+
+  unformat_free (&input);
+  return 0;
+}
+
+/* Read-ready callback for a /dev/uioN descriptor.  Consumes the 4-byte
+   count from the descriptor, calls the device's interrupt handler when one
+   is registered, then calls vlib_pci_intr_enable() on the device.
+   uf->private_data is the index of the device in pci_main.pci_devs. */
+static clib_error_t *
+linux_pci_uio_read_ready (unix_file_t * uf)
+{
+  vlib_pci_main_t *pm = &pci_main;
+  vlib_pci_device_t *d;
+  int __attribute__ ((unused)) rv;
+
+  u32 icount;
+  rv = read (uf->file_descriptor, &icount, 4);
+
+  d = pool_elt_at_index (pm->pci_devs, uf->private_data);
+
+  if (d->interrupt_handler)
+    d->interrupt_handler (d);
+
+  vlib_pci_intr_enable (d);
+
+  return /* no error */ 0;
+}
+
+/* Error callback for a /dev/uioN descriptor: returns an error that names
+   the device index stored in uf->private_data. */
+static clib_error_t *
+linux_pci_uio_error_ready (unix_file_t * uf)
+{
+  u32 error_index = (u32) uf->private_data;
+
+  return clib_error_return (0, "pci device %d: error", error_index);
+}
+
+/* Allocate the Linux-side state for DEV as a copy of PDEV, locate and open
+   its /dev/uioN node and register that descriptor with the unix file
+   poller.  dev->os_handle is set to the index of the new
+   linux_pci_device_t. */
+static void
+add_device (vlib_pci_device_t * dev, linux_pci_device_t * pdev)
+{
+  vlib_pci_main_t *pm = &pci_main;
+  linux_pci_main_t *lpm = &linux_pci_main;
+  linux_pci_device_t *l;
+
+  pool_get (lpm->linux_pci_devices, l);
+  l[0] = pdev[0];
+
+  /* Keep a private copy; the caller's vector is not owned by us. */
+  l->dev_dir_name = vec_dup (l->dev_dir_name);
+
+  dev->os_handle = l - lpm->linux_pci_devices;
+
+  {
+    /* dev_dir_name is a vector without a terminating NUL, so format it
+       with %v and terminate the result before using it as a C string. */
+    u8 *uio_dir = format (0, "%v/uio%c", l->dev_dir_name, 0);
+    foreach_directory_file ((char *) uio_dir, scan_uio_dir, l,
+                            /* scan_dirs */ 1);
+    vec_free (uio_dir);
+  }
+
+  {
+    char *uio_name = (char *) format (0, "/dev/uio%d%c", l->uio_minor, 0);
+    l->uio_fd = open (uio_name, O_RDWR);
+    if (l->uio_fd < 0)
+      clib_unix_error ("open `%s'", uio_name);
+    vec_free (uio_name);
+  }
+
+  {
+    unix_file_t template = { 0 };
+    unix_main_t *um = &unix_main;
+
+    template.read_function = linux_pci_uio_read_ready;
+    template.file_descriptor = l->uio_fd;
+    template.error_function = linux_pci_uio_error_ready;
+    template.private_data = dev - pm->pci_devs;
+
+    l->unix_file_index = unix_file_add (um, &template);
+  }
+}
+
+/* Close every descriptor held by L (resource, config and uio descriptors
+   with a value greater than zero) and free its vectors.  The pool element
+   itself is not released here. */
+static void
+linux_pci_device_free (linux_pci_device_t * l)
+{
+  int i;
+  for (i = 0; i < vec_len (l->resource_fds); i++)
+    if (l->resource_fds[i] > 0)
+      close (l->resource_fds[i]);
+  if (l->config_fd > 0)
+    close (l->config_fd);
+  if (l->uio_fd > 0)
+    close (l->uio_fd);
+  vec_free (l->resource_fds);
+  vec_free (l->dev_dir_name);
+}
+
+/* Configuration space read/write.
+   Transfers N_BYTES at offset ADDRESS between DATA and the device's config
+   descriptor.  Returns an error unless exactly N_BYTES were transferred. */
+clib_error_t *
+vlib_pci_read_write_config (vlib_pci_device_t * dev,
+                            vlib_read_or_write_t read_or_write,
+                            uword address, void *data, u32 n_bytes)
+{
+  linux_pci_main_t *lpm = &linux_pci_main;
+  linux_pci_device_t *p;
+  int n;
+
+  p = pool_elt_at_index (lpm->linux_pci_devices, dev->os_handle);
+
+  if (read_or_write == VLIB_READ)
+    n = pread (p->config_fd, data, n_bytes, address);
+  else
+    n = pwrite (p->config_fd, data, n_bytes, address);
+
+  if (n != n_bytes)
+    return clib_error_return_unix (0, "%s",
+                                   read_or_write == VLIB_READ
+                                   ? "read" : "write");
+
+  return 0;
+}
+
+/* Map the sysfs "resourceN" file of the device identified by OS_HANDLE.
+   The whole file (size taken from fstat) is mapped read/write and shared;
+   a non-zero ADDR requests a MAP_FIXED mapping at that address.  On success
+   the mapping is stored in *RESULT and the descriptor is remembered in the
+   device's resource_fds vector; on failure the descriptor is closed and an
+   error is returned. */
+static clib_error_t *
+os_map_pci_resource_internal (uword os_handle,
+                              u32 resource, u8 * addr, void **result)
+{
+  linux_pci_main_t *pm = &linux_pci_main;
+  linux_pci_device_t *p;
+  struct stat stat_buf;
+  u8 *file_name;
+  int fd;
+  clib_error_t *error;
+  int flags = MAP_SHARED;
+
+  error = 0;
+  p = pool_elt_at_index (pm->linux_pci_devices, os_handle);
+
+  file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0);
+  fd = open ((char *) file_name, O_RDWR);
+  if (fd < 0)
+    {
+      error = clib_error_return_unix (0, "open `%s'", file_name);
+      goto done;
+    }
+
+  if (fstat (fd, &stat_buf) < 0)
+    {
+      error = clib_error_return_unix (0, "fstat `%s'", file_name);
+      goto done;
+    }
+
+  if (addr != 0)
+    flags |= MAP_FIXED;
+
+  *result = mmap (addr,
+                  /* size */ stat_buf.st_size,
+                  PROT_READ | PROT_WRITE, flags,
+                  /* file */ fd,
+                  /* offset */ 0);
+  if (*result == MAP_FAILED)
+    {
+      error = clib_error_return_unix (0, "mmap `%s'", file_name);
+      goto done;
+    }
+
+  /* Record the descriptor only once the mapping exists, so that a failed
+     attempt never leaves an already-closed fd behind for
+     linux_pci_device_free() to close a second time. */
+  vec_validate (p->resource_fds, resource);
+  p->resource_fds[resource] = fd;
+
+done:
+  if (error)
+    {
+      if (fd >= 0)
+        close (fd);
+    }
+  vec_free (file_name);
+  return error;
+}
+
+/* Map RESOURCE of DEV at an address chosen by the kernel. */
+clib_error_t *
+vlib_pci_map_resource (vlib_pci_device_t * dev, u32 resource, void **result)
+{
+  return (os_map_pci_resource_internal
+          (dev->os_handle, resource, 0 /* addr */ ,
+           result));
+}
+
+/* Map RESOURCE of DEV at the fixed address ADDR. */
+clib_error_t *
+vlib_pci_map_resource_fixed (vlib_pci_device_t * dev,
+                             u32 resource, u8 * addr, void **result)
+{
+  return (os_map_pci_resource_internal
+          (dev->os_handle, resource, addr, result));
+}
+
+/* Release the Linux-side state of DEV: close its descriptors, free its
+   vectors and return the element to the linux_pci_devices pool. */
+void
+vlib_pci_free_device (vlib_pci_device_t * dev)
+{
+  linux_pci_main_t *pm = &linux_pci_main;
+  linux_pci_device_t *l;
+
+  l = pool_elt_at_index (pm->linux_pci_devices, dev->os_handle);
+  linux_pci_device_free (l);
+  pool_put (pm->linux_pci_devices, l);
+}
+
+/* Step from registration R to the one that follows it in the ELF section,
+   skipping over R's variable-length supported_devices list. */
+pci_device_registration_t * __attribute__ ((unused))
+pci_device_next_registered (pci_device_registration_t * r)
+{
+  uword i;
+
+  /* Null vendor id marks end of initialized list. */
+  for (i = 0; r->supported_devices[i].vendor_id != 0; i++)
+    ;
+
+  return clib_elf_section_data_next (r, i * sizeof (r->supported_devices[0]));
+}
+
+/* Look for a registered driver whose supported_devices list contains DEV's
+   vendor/device id.  On the first match that can be bound to
+   uio_pci_generic the device is added, the registration's interrupt handler
+   is installed and its init function is called; the init function's result
+   is returned.  When no registration takes the device, the config-space
+   descriptor in PDEV is closed and 0 is returned. */
+static clib_error_t *
+init_device_from_registered (vlib_main_t * vm,
+                             vlib_pci_device_t * dev,
+                             linux_pci_device_t * pdev)
+{
+  vlib_pci_main_t *pm = &pci_main;
+  pci_device_registration_t *r;
+  pci_device_id_t *i;
+  clib_error_t *error;
+
+  r = pm->pci_device_registrations;
+
+  while (r)
+    {
+      for (i = r->supported_devices; i->vendor_id != 0; i++)
+        if (i->vendor_id == dev->vendor_id && i->device_id == dev->device_id)
+          {
+            error = vlib_pci_bind_to_uio (dev, "uio_pci_generic");
+            if (error)
+              {
+                /* Report the failure and keep looking at further ids. */
+                clib_error_report (error);
+                continue;
+              }
+
+            add_device (dev, pdev);
+            dev->interrupt_handler = r->interrupt_handler;
+            return r->init_function (vm, dev);
+          }
+      r = r->next_registration;
+    }
+  /* No driver, close the PCI config-space FD */
+  close (pdev->config_fd);
+  return 0;
+}
+
+/* Initialise DEV through the registered-driver lookup. */
+static clib_error_t *
+init_device (vlib_main_t * vm,
+             vlib_pci_device_t * dev, linux_pci_device_t * pdev)
+{
+  return init_device_from_registered (vm, dev, pdev);
+}
+
+/* Directory-scan callback for /sys/bus/pci/devices.  ARG is the
+   vlib_main_t; DEV_DIR_NAME is the device's sysfs directory.  Reads the
+   device's config space, VPD, NUMA node, class, vendor and device ids into
+   a new pci_devs pool element, indexes it by PCI address and hands it to
+   init_device().  Returns the error from opening/reading config space or
+   from init_device(). */
+static clib_error_t *
+scan_device (void *arg, u8 * dev_dir_name, u8 * ignored)
+{
+  vlib_main_t *vm = arg;
+  vlib_pci_main_t *pm = &pci_main;
+  int fd;
+  u8 *f;
+  clib_error_t *error = 0;
+  vlib_pci_device_t *dev;
+  linux_pci_device_t pdev = { 0 };
+  u32 tmp;
+
+  f = format (0, "%v/config%c", dev_dir_name, 0);
+  fd = open ((char *) f, O_RDWR);
+
+  /* Try read-only access if write fails. */
+  if (fd < 0)
+    fd = open ((char *) f, O_RDONLY);
+
+  if (fd < 0)
+    {
+      error = clib_error_return_unix (0, "open `%s'", f);
+      goto done;
+    }
+
+  pool_get (pm->pci_devs, dev);
+
+  /* You can only read more than 64 bytes of config space as root; so we try
+     to read the full space but fall back to just the first 64 bytes. */
+  if (read (fd, &dev->config_data, sizeof (dev->config_data)) !=
+      sizeof (dev->config_data)
+      && read (fd, &dev->config0,
+               sizeof (dev->config0)) != sizeof (dev->config0))
+    {
+      pool_put (pm->pci_devs, dev);
+      error = clib_error_return_unix (0, "read `%s'", f);
+      close (fd);
+      goto done;
+    }
+
+  {
+    /* A header of all-ones bytes is rejected as an invalid device. */
+    static pci_config_header_t all_ones;
+    if (all_ones.vendor_id == 0)
+      memset (&all_ones, ~0, sizeof (all_ones));
+
+    if (!memcmp (&dev->config0.header, &all_ones, sizeof (all_ones)))
+      {
+        pool_put (pm->pci_devs, dev);
+        error = clib_error_return (0, "invalid PCI config for `%s'", f);
+        close (fd);
+        goto done;
+      }
+  }
+
+  if (dev->config0.header.header_type == 0)
+    pci_config_type0_little_to_host (&dev->config0);
+  else
+    pci_config_type1_little_to_host (&dev->config1);
+
+  /* Parse bus, dev, function from directory name. */
+  {
+    unformat_input_t input;
+
+    unformat_init_string (&input, (char *) dev_dir_name,
+                          vec_len (dev_dir_name));
+
+    if (!unformat (&input, "/sys/bus/pci/devices/%U",
+                   unformat_vlib_pci_addr, &dev->bus_address))
+      abort ();
+
+    unformat_free (&input);
+
+  }
+
+
+  pdev.config_fd = fd;
+  pdev.dev_dir_name = dev_dir_name;
+
+  hash_set (pm->pci_dev_index_by_pci_addr, dev->bus_address.as_u32,
+            dev - pm->pci_devs);
+
+  /* Optional VPD: a sequence of records, each a 3-byte header (tag, 16-bit
+     little-endian length) followed by the payload.  Stop at the first tag
+     that is not 0x82, 0x90 or 0x91, or at the first short read. */
+  vec_reset_length (f);
+  f = format (f, "%v/vpd%c", dev_dir_name, 0);
+  fd = open ((char *) f, O_RDONLY);
+  if (fd >= 0)
+    {
+      while (1)
+        {
+          u8 tag[3];
+          u8 *data = 0;
+          int len;
+
+          if (read (fd, &tag, 3) != 3)
+            break;
+
+          if (tag[0] != 0x82 && tag[0] != 0x90 && tag[0] != 0x91)
+            break;
+
+          len = (tag[2] << 8) | tag[1];
+          vec_validate (data, len);
+
+          if (read (fd, data, len) != len)
+            {
+              vec_free (data);
+              break;
+            }
+          if (tag[0] == 0x82)
+            dev->product_name = data;
+          else if (tag[0] == 0x90)
+            dev->vpd_r = data;
+          else if (tag[0] == 0x91)
+            dev->vpd_w = data;
+
+          data = 0;
+        }
+      close (fd);
+    }
+
+  dev->numa_node = -1;
+  vec_reset_length (f);
+  f = format (f, "%v/numa_node%c", dev_dir_name, 0);
+  vlib_sysfs_read ((char *) f, "%u", &dev->numa_node);
+
+  /* The results of the sysfs reads below are not checked; clear tmp before
+     each one so a failed read yields 0 rather than an indeterminate or
+     stale value. */
+  tmp = 0;
+  vec_reset_length (f);
+  f = format (f, "%v/class%c", dev_dir_name, 0);
+  vlib_sysfs_read ((char *) f, "0x%x", &tmp);
+  dev->device_class = tmp >> 8;
+
+  tmp = 0;
+  vec_reset_length (f);
+  f = format (f, "%v/vendor%c", dev_dir_name, 0);
+  vlib_sysfs_read ((char *) f, "0x%x", &tmp);
+  dev->vendor_id = tmp;
+
+  tmp = 0;
+  vec_reset_length (f);
+  f = format (f, "%v/device%c", dev_dir_name, 0);
+  vlib_sysfs_read ((char *) f, "0x%x", &tmp);
+  dev->device_id = tmp;
+
+  error = init_device (vm, dev, &pdev);
+
+  vec_reset_length (f);
+  f = format (f, "%v/driver%c", dev_dir_name, 0);
+  dev->driver_name = vlib_sysfs_link_to_name ((char *) f);
+
+done:
+  vec_free (f);
+  return error;
+}
+
+/* Init function of the Linux PCI layer: after unix_input_init has run,
+   creates the PCI-address index and scans /sys/bus/pci/devices, calling
+   scan_device() for every entry.  A scan error is reported through
+   clib_error_report(). */
+clib_error_t *
+linux_pci_init (vlib_main_t * vm)
+{
+  vlib_pci_main_t *pm = &pci_main;
+  clib_error_t *error;
+
+  pm->vlib_main = vm;
+
+  if ((error = vlib_call_init_function (vm, unix_input_init)))
+    return error;
+
+  ASSERT (sizeof (vlib_pci_addr_t) == sizeof (u32));
+  pm->pci_dev_index_by_pci_addr = hash_create (0, sizeof (uword));
+
+  error = foreach_directory_file ("/sys/bus/pci/devices", scan_device, vm,
+                                  /* scan_dirs */ 0);
+
+  /* Complain and continue. might not be root, etc. */
+  if (error)
+    clib_error_report (error);
+
+  return error;
+}
+
+VLIB_INIT_FUNCTION (linux_pci_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/linux/physmem.c b/src/vlib/linux/physmem.c
new file mode 100644
index 00000000..6731295c
--- /dev/null
+++ b/src/vlib/linux/physmem.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * physmem.c: Unix physical memory
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+/* Allocate N_BYTES from the mheap of region IDX, aligned to at least a
+   cache line.  For non-fake regions the allocation is retried until it
+   lies entirely within one page of the region; rejected candidates are
+   held until the loop ends and then returned to the heap.  Returns 0 when
+   the region has no heap or the heap is exhausted. */
+static void *
+unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+                            uword n_bytes, uword alignment)
+{
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+  uword lo_offset, hi_offset;
+  uword *to_free = 0;
+
+  if (pr->heap == 0)
+    return 0;
+
+  /* IO memory is always at least cache aligned. */
+  alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES);
+
+  while (1)
+    {
+      mheap_get_aligned (pr->heap, n_bytes,
+                         /* align */ alignment,
+                         /* align offset */ 0,
+                         &lo_offset);
+
+      /* Allocation failed? */
+      if (lo_offset == ~0)
+        break;
+
+      if (pr->flags & VLIB_PHYSMEM_F_FAKE)
+        break;
+
+      /* Make sure allocation does not span DMA physical chunk boundary. */
+      hi_offset = lo_offset + n_bytes - 1;
+
+      if ((lo_offset >> pr->log2_page_size) ==
+          (hi_offset >> pr->log2_page_size))
+        break;
+
+      /* Allocation would span chunk boundary, queue it to be freed as soon as
+         we find suitable chunk. */
+      vec_add1 (to_free, lo_offset);
+    }
+
+  if (to_free != 0)
+    {
+      uword i;
+      for (i = 0; i < vec_len (to_free); i++)
+        mheap_put (pr->heap, to_free[i]);
+      vec_free (to_free);
+    }
+
+  return lo_offset != ~0 ? pr->heap + lo_offset : 0;
+}
+
+/* Return object X to the heap of region IDX. */
+static void
+unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x)
+{
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+  /* Return object to region's heap. */
+  mheap_put (pr->heap, x - pr->heap);
+}
+
+/* Translate virtual address ADDR to a physical address using the pagemap
+   descriptor FD (one u64 entry per page).  Returns 0 when the entry cannot
+   be read or its bit 63 is clear; otherwise the low 55 bits of the entry
+   multiplied by the system page size. */
+static u64
+get_page_paddr (int fd, uword addr)
+{
+  int pagesize = sysconf (_SC_PAGESIZE);
+  u64 seek, pagemap = 0;
+
+  seek = ((u64) addr / pagesize) * sizeof (u64);
+  if (lseek (fd, seek, SEEK_SET) != seek)
+    {
+      clib_unix_warning ("lseek to 0x%llx", seek);
+      return 0;
+    }
+  if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap)))
+    {
+      clib_unix_warning ("read ptbits");
+      return 0;
+    }
+  if ((pagemap & (1ULL << 63)) == 0)
+    return 0;
+
+  pagemap &= pow2_mask (55);
+
+  return pagemap * pagesize;
+}
+
+/* Create a physmem region called NAME of at least SIZE bytes (rounded up to
+   a whole number of pages) preferring NUMA node NUMA_NODE, and store its
+   index in *IDX.
+
+   Without VLIB_PHYSMEM_F_FAKE the caller must be root: a hugetlbfs instance
+   is mounted under the runtime directory, the backing file is created
+   there, hugepages are pre-allocated and the physical address of every
+   page is recorded in the region's page_table.  With VLIB_PHYSMEM_F_FAKE
+   the region is backed by a memfd.  VLIB_PHYSMEM_F_INIT_MHEAP creates an
+   mheap inside the region; VLIB_PHYSMEM_F_HAVE_BUFFERS registers the range
+   with the buffer subsystem.
+
+   Returns 0 on success.  On failure every resource acquired so far is
+   released and an error is returned. */
+static clib_error_t *
+unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size,
+                           u8 numa_node, u32 flags,
+                           vlib_physmem_region_index_t * idx)
+{
+  vlib_physmem_main_t *vpm = &vm->physmem_main;
+  vlib_physmem_region_t *pr;
+  clib_error_t *error = 0;
+  int pagemap_fd = -1;
+  u8 *mount_dir = 0;
+  u8 *filename = 0;
+  struct stat st;
+  int old_mpol;
+  int mmap_flags;
+  struct bitmask *old_mask;
+
+  if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0)
+    return clib_error_return (0, "not allowed");
+
+  /* Allocated only after the permission check so that the early return
+     above has nothing to release. */
+  old_mask = numa_allocate_nodemask ();
+
+  pool_get (vpm->regions, pr);
+
+  /* Start from a known state: the error path below inspects fd and mem,
+     and a recycled pool element may still carry stale values. */
+  memset (pr, 0, sizeof (*pr));
+  pr->fd = -1;
+
+  if ((pr - vpm->regions) >= 256)
+    {
+      error = clib_error_return (0, "maximum number of regions reached");
+      goto error;
+    }
+
+  pr->index = pr - vpm->regions;
+  pr->flags = flags;
+
+  if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0)
+      == -1)
+    {
+      error = clib_error_return_unix (0, "get_mempolicy");
+      goto error;
+    }
+
+  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
+    {
+      if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1)
+        {
+          error = clib_error_return_unix (0, "open '/proc/self/pagemap'");
+          goto error;
+        }
+
+      mount_dir = format (0, "%s/physmem_region%d%c",
+                          vlib_unix_get_runtime_dir (), pr->index, 0);
+      filename = format (0, "%s/mem%c", mount_dir, 0);
+
+      unlink ((char *) mount_dir);
+
+      error = vlib_unix_recursive_mkdir ((char *) mount_dir);
+      if (error)
+        goto error;
+
+      if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL))
+        {
+          error = clib_error_return_unix (0, "mount hugetlb directory '%s'",
+                                          mount_dir);
+          goto error;
+        }
+
+      if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1)
+        {
+          error = clib_error_return_unix (0, "open");
+          goto error;
+        }
+
+      mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED;
+    }
+  else
+    {
+      if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1)
+        {
+          /* Go through the common error path so the pool element and the
+             node mask are released. */
+          error = clib_error_return_unix (0, "memfd_create");
+          goto error;
+        }
+
+      if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1)
+        {
+          error =
+            clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)");
+          goto error;
+        }
+      mmap_flags = MAP_SHARED;
+    }
+
+  if (fstat (pr->fd, &st))
+    {
+      error = clib_error_return_unix (0, "fstat");
+      goto error;
+    }
+
+  /* The block size of the backing file is used as the region page size. */
+  pr->log2_page_size = min_log2 (st.st_blksize);
+  pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1;
+  size = pr->n_pages * (1 << pr->log2_page_size);
+
+  if ((ftruncate (pr->fd, size)) == -1)
+    {
+      error = clib_error_return_unix (0, "ftruncate length: %d", size);
+      goto error;
+    }
+
+  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
+    {
+      error = vlib_sysfs_prealloc_hugepages (numa_node,
+                                             1 << (pr->log2_page_size - 10),
+                                             pr->n_pages);
+      if (error)
+        goto error;
+    }
+
+  numa_set_preferred (numa_node);
+
+  pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0);
+
+  if (pr->mem == MAP_FAILED)
+    {
+      pr->mem = 0;
+      error = clib_error_return_unix (0, "mmap");
+      goto error;
+    }
+
+  if (set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1)
+    {
+      error = clib_error_return_unix (0, "set_mempolicy");
+      goto error;
+    }
+
+  pr->size = pr->n_pages << pr->log2_page_size;
+  pr->page_mask = (1 << pr->log2_page_size) - 1;
+  pr->numa_node = numa_node;
+  pr->name = format (0, "%s", name);
+
+  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
+    {
+      int i;
+      for (i = 0; i < pr->n_pages; i++)
+        {
+          /* Widen before shifting: an int shift overflows once the byte
+             offset reaches 2 GB. */
+          void *ptr = pr->mem + ((uword) i << pr->log2_page_size);
+          int node;
+          move_pages (0, 1, &ptr, 0, &node, 0);
+          if (numa_node != node)
+            {
+              /* pr->name is a vector without a terminating NUL: use %v. */
+              clib_warning
+                ("physmem page for region \'%v\' allocated on the wrong"
+                 " numa node (requested %u actual %u)", pr->name,
+                 pr->numa_node, node);
+              break;
+            }
+        }
+    }
+
+  if (flags & VLIB_PHYSMEM_F_INIT_MHEAP)
+    {
+      pr->heap = mheap_alloc_with_flags (pr->mem, pr->size,
+                                         /* Don't want mheap mmap/munmap with IO memory. */
+                                         MHEAP_FLAG_DISABLE_VM |
+                                         MHEAP_FLAG_THREAD_SAFE);
+      fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1);
+    }
+
+  if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS)
+    {
+      vlib_buffer_add_mem_range (vm, pointer_to_uword (pr->mem), pr->size);
+    }
+
+  *idx = pr->index;
+
+  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
+    {
+      int i;
+      for (i = 0; i < pr->n_pages; i++)
+        {
+          uword vaddr =
+            pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size);
+          u64 page_paddr = get_page_paddr (pagemap_fd, vaddr);
+          vec_add1 (pr->page_table, page_paddr);
+        }
+    }
+
+  goto done;
+
+error:
+  if (pr->fd > -1)
+    close (pr->fd);
+
+  if (pr->mem)
+    munmap (pr->mem, size);
+
+  memset (pr, 0, sizeof (*pr));
+  pool_put (vpm->regions, pr);
+
+done:
+  /* The hugetlbfs mount is only needed while the backing file is created;
+     detach and remove it on both the success and the error path. */
+  if (mount_dir)
+    {
+      umount2 ((char *) mount_dir, MNT_DETACH);
+      rmdir ((char *) mount_dir);
+      vec_free (mount_dir);
+    }
+  numa_free_nodemask (old_mask);
+  vec_free (filename);
+  if (pagemap_fd > -1)
+    close (pagemap_fd);
+  return error;
+}
+
+/* Destroy region IDX: close its backing descriptor, unmap its memory, free
+   its name and page table and return the element to the region pool. */
+static void
+unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx)
+{
+  vlib_physmem_main_t *vpm = &vm->physmem_main;
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+
+  /* -1 is the "no descriptor" value used by the allocator; 0 is valid. */
+  if (pr->fd > -1)
+    close (pr->fd);
+  munmap (pr->mem, pr->size);
+  vec_free (pr->name);
+  vec_free (pr->page_table);
+  pool_put (vpm->regions, pr);
+}
+
+/* Install the unix physmem callbacks in VM.  Calling it again after the
+   callbacks are installed is a no-op. */
+clib_error_t *
+unix_physmem_init (vlib_main_t * vm)
+{
+  clib_error_t *error = 0;
+
+  /* Avoid multiple calls. */
+  if (vm->os_physmem_alloc_aligned)
+    return error;
+
+  vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned;
+  vm->os_physmem_free = unix_physmem_free;
+  vm->os_physmem_region_alloc = unix_physmem_region_alloc;
+  vm->os_physmem_region_free = unix_physmem_region_free;
+
+  return error;
+}
+
+/* CLI handler for "show physmem": prints one summary line per region,
+   followed by the region's mheap statistics when it has a heap. */
+static clib_error_t *
+show_physmem (vlib_main_t * vm,
+              unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  vlib_physmem_main_t *vpm = &vm->physmem_main;
+  vlib_physmem_region_t *pr;
+
+  /* pr->name is a vector without a terminating NUL, hence %v below. */
+  /* *INDENT-OFF* */
+  pool_foreach (pr, vpm->regions, (
+    {
+      vlib_cli_output (vm, "index %u name '%v' page-size %uKB num-pages %d "
+                       "numa-node %u fd %d\n",
+                       pr->index, pr->name, (1 << (pr->log2_page_size -10)),
+                       pr->n_pages, pr->numa_node, pr->fd);
+      if (pr->heap)
+        vlib_cli_output (vm, " %U", format_mheap, pr->heap, /* verbose */ 1);
+      else
+        vlib_cli_output (vm, " no heap\n");
+    }));
+  /* *INDENT-ON* */
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_physmem_command, static) = {
+  .path = "show physmem",
+  .short_help = "Show physical memory allocation",
+  .function = show_physmem,
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/linux/syscall.h b/src/vlib/linux/syscall.h
new file mode 100644
index 00000000..9e37997e
--- /dev/null
+++ b/src/vlib/linux/syscall.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_linux_syscall_h
+#define included_linux_syscall_h
+
+/* syscall() and, where the installed headers know it, __NR_memfd_create
+   come from these headers; including them here keeps this file usable
+   regardless of what the includer has already pulled in. */
+#include <unistd.h>
+#include <sys/syscall.h>
+
+/* Fallback syscall numbers for headers that predate memfd_create. */
+#ifndef __NR_memfd_create
+#if defined __x86_64__
+#define __NR_memfd_create 319
+#elif defined __arm__
+#define __NR_memfd_create 385
+#elif defined __aarch64__
+#define __NR_memfd_create 279
+#else
+#error "__NR_memfd_create unknown for this architecture"
+#endif
+#endif
+
+/* Thin wrapper that issues the memfd_create system call directly with NAME
+   and FLAGS and returns the raw syscall() result.
+   NOTE(review): a C library that already declares memfd_create() would
+   conflict with this static definition - confirm against the supported
+   toolchains. */
+static inline int
+memfd_create (const char *name, unsigned int flags)
+{
+  return syscall (__NR_memfd_create, name, flags);
+}
+
+/* File-sealing constants, defined only when the system headers have not
+   already provided them, so that existing definitions are never
+   redefined. */
+#ifndef F_LINUX_SPECIFIC_BASE
+#define F_LINUX_SPECIFIC_BASE 1024
+#endif
+#ifndef MFD_ALLOW_SEALING
+#define MFD_ALLOW_SEALING 0x0002U
+#endif
+#ifndef F_ADD_SEALS
+#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
+
+#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
+#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
+#define F_SEAL_GROW 0x0004 /* prevent file from growing */
+#define F_SEAL_WRITE 0x0008 /* prevent writes */
+#endif
+
+
+#endif /* included_linux_syscall_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/linux/sysfs.c b/src/vlib/linux/sysfs.c
new file mode 100644
index 00000000..f92f9ef5
--- /dev/null
+++ b/src/vlib/linux/sysfs.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+/* Format FMT with the trailing arguments and write the result to
+   FILE_NAME, which is opened write-only.  Returns 0 on success or an error
+   when the open or the write fails. */
+clib_error_t *
+vlib_sysfs_write (char *file_name, char *fmt, ...)
+{
+  u8 *s;
+  int fd;
+  clib_error_t *error = 0;
+
+  fd = open (file_name, O_WRONLY);
+  if (fd < 0)
+    return clib_error_return_unix (0, "open `%s'", file_name);
+
+  va_list va;
+  va_start (va, fmt);
+  s = va_format (0, fmt, &va);
+  va_end (va);
+
+  if (write (fd, s, vec_len (s)) < 0)
+    error = clib_error_return_unix (0, "write `%s'", file_name);
+
+  vec_free (s);
+  close (fd);
+  return error;
+}
+
+/* Read FILE_NAME with a single read of at most 4096 bytes and parse the
+   data with the unformat string FMT into the trailing pointer arguments.
+   Returns 0 on success, or an error when the open or read fails or when
+   unformat reports no match. */
+clib_error_t *
+vlib_sysfs_read (char *file_name, char *fmt, ...)
+{
+  unformat_input_t input;
+  u8 *s = 0;
+  int fd;
+  ssize_t sz;
+  uword result;
+
+  fd = open (file_name, O_RDONLY);
+  if (fd < 0)
+    return clib_error_return_unix (0, "open `%s'", file_name);
+
+  vec_validate (s, 4095);
+
+  sz = read (fd, s, vec_len (s));
+  if (sz < 0)
+    {
+      close (fd);
+      vec_free (s);
+      return clib_error_return_unix (0, "read `%s'", file_name);
+    }
+
+  /* Trim the vector to what was actually read before parsing it. */
+  _vec_len (s) = sz;
+  unformat_init_vector (&input, s);
+
+  va_list va;
+  va_start (va, fmt);
+  result = va_unformat (&input, fmt, &va);
+  va_end (va);
+
+  vec_free (s);
+  close (fd);
+
+  if (result == 0)
+    return clib_error_return (0, "unformat error");
+
+  return 0;
+}
+
+/* Resolve the symbolic link LINK and return the text after the last '/' of
+   its target as a newly allocated vector.  Returns 0 when readlink fails
+   or the target contains no '/'.  At most 63 bytes of the target are
+   read. */
+u8 *
+vlib_sysfs_link_to_name (char *link)
+{
+  char *p, buffer[64];
+  unformat_input_t in;
+  u8 *s = 0;
+  int r;
+
+  r = readlink (link, buffer, sizeof (buffer) - 1);
+
+  if (r < 0)
+    return 0;
+
+  /* readlink does not terminate the buffer. */
+  buffer[r] = 0;
+  p = strrchr (buffer, '/');
+
+  if (!p)
+    return 0;
+
+  unformat_init_string (&in, p + 1, strlen (p + 1));
+  if (unformat (&in, "%s", &s) != 1)
+    clib_unix_warning ("no string?");
+  unformat_free (&in);
+
+  return s;
+}
+
+/* Write NR to the nr_hugepages file for pages of PAGE_SIZE kB on NUMA_NODE.
+   The per-node directory /sys/devices/system/node/nodeN is used when it
+   exists; for node 0 only, /sys/kernel/mm is used as a fallback.  Returns
+   an error when no suitable directory exists or when the write fails. */
+clib_error_t *
+vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr)
+{
+  clib_error_t *error = 0;
+  struct stat sb;
+  u8 *p = 0;
+
+  p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0);
+
+  if (stat ((char *) p, &sb) == 0)
+    {
+      if (S_ISDIR (sb.st_mode) == 0)
+        {
+          error = clib_error_return (0, "'%s' is not directory", p);
+          goto done;
+        }
+    }
+  else if (numa_node == 0)
+    {
+      vec_reset_length (p);
+      p = format (p, "/sys/kernel/mm%c", 0);
+      if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0)
+        {
+          error = clib_error_return (0, "'%s' does not exist or it is not "
+                                     "directory", p);
+          goto done;
+        }
+    }
+  else
+    {
+      error = clib_error_return (0, "'%s' does not exist", p);
+      goto done;
+    }
+
+  /* Drop the terminating NUL so the rest of the path can be appended. */
+  _vec_len (p) -= 1;
+  p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0);
+  /* Propagate a failed write instead of silently reporting success. */
+  error = vlib_sysfs_write ((char *) p, "%d", nr);
+
+done:
+  vec_free (p);
+  return error;
+}
+
+
+/* Read the "<TYPE>_hugepages" counter for pages of PAGE_SIZE kB on
+   NUMA_NODE into *VAL.  The directory is chosen as in
+   vlib_sysfs_set_nr_hugepages().  Returns an error when no suitable
+   directory exists or when the read fails. */
+static clib_error_t *
+vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node,
+                              int page_size, int *val)
+{
+  clib_error_t *error = 0;
+  struct stat sb;
+  u8 *p = 0;
+
+  p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0);
+
+  if (stat ((char *) p, &sb) == 0)
+    {
+      if (S_ISDIR (sb.st_mode) == 0)
+        {
+          error = clib_error_return (0, "'%s' is not directory", p);
+          goto done;
+        }
+    }
+  else if (numa_node == 0)
+    {
+      vec_reset_length (p);
+      p = format (p, "/sys/kernel/mm%c", 0);
+      if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0)
+        {
+          error = clib_error_return (0, "'%s' does not exist or it is not "
+                                     "directory", p);
+          goto done;
+        }
+    }
+  else
+    {
+      error = clib_error_return (0, "'%s' does not exist", p);
+      goto done;
+    }
+
+  /* Drop the terminating NUL so the rest of the path can be appended. */
+  _vec_len (p) -= 1;
+  p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size,
+              type, 0);
+  error = vlib_sysfs_read ((char *) p, "%d", val);
+
+done:
+  vec_free (p);
+  return error;
+}
+
+/* Read free_hugepages for PAGE_SIZE kB pages on NUMA_NODE into *V. */
+clib_error_t *
+vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v)
+{
+  return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v);
+}
+
+/* Read nr_hugepages for PAGE_SIZE kB pages on NUMA_NODE into *V. */
+clib_error_t *
+vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v)
+{
+  return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v);
+}
+
+/* Read surplus_hugepages for PAGE_SIZE kB pages on NUMA_NODE into *V. */
+clib_error_t *
+vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, int page_size,
+                                  int *v)
+{
+  return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v);
+}
+
+/* Make sure at least NR hugepages of PAGE_SIZE kB are free on NUMA_NODE.
+   When fewer are free, nr_hugepages is raised by the shortfall.  Returns 0
+   when enough pages are already free, otherwise the result of reading the
+   counters and updating nr_hugepages. */
+clib_error_t *
+vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr)
+{
+  clib_error_t *error = 0;
+  int n, needed;
+  error = vlib_sysfs_get_free_hugepages (numa_node, page_size, &n);
+  if (error)
+    return error;
+  needed = nr - n;
+  if (needed <= 0)
+    return 0;
+
+  error = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &n);
+  if (error)
+    return error;
+  clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u",
+                needed, page_size, numa_node);
+  return vlib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed);
+}
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/linux/sysfs.h b/src/vlib/linux/sysfs.h
new file mode 100644
index 00000000..14b71317
--- /dev/null
+++ b/src/vlib/linux/sysfs.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_linux_sysfs_h
+#define included_linux_sysfs_h
+/*
+ * Format FMT with the trailing arguments and write the result to FILE_NAME,
+ * which is opened write-only. Returns 0 on success or a clib error when the
+ * open or the write fails.
+ */
+clib_error_t *vlib_sysfs_write (char *file_name, char *fmt, ...);
+/*
+ * Read FILE_NAME (one read of at most 4096 bytes) and parse the data with
+ * the unformat string FMT into the trailing pointer arguments. Returns 0 on
+ * success, or a clib error when the open or read fails or nothing matches.
+ */
+clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...);
+/*
+ * Resolve symbolic link LINK and return the text after the last '/' of its
+ * target as a vector. Returns 0 when the link cannot be read or its target
+ * contains no '/'.
+ */
+u8 *vlib_sysfs_link_to_name (char *link);
+/*
+ * Hugepage counters. PAGE_SIZE is in kB, as used in the sysfs directory
+ * name "hugepages-<PAGE_SIZE>kB". The counters are looked up under
+ * /sys/devices/system/node/node<NUMA_NODE>, or under /sys/kernel/mm for
+ * node 0 when that per-node directory does not exist; an error is returned
+ * when neither applies.
+ *
+ * set_nr writes NR to nr_hugepages. get_nr, get_free and get_surplus read
+ * nr_hugepages, free_hugepages and surplus_hugepages into *V. prealloc
+ * raises nr_hugepages by the shortfall when fewer than NR pages are free.
+ */
+clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node,
+                                           int page_size, int nr);
+clib_error_t *vlib_sysfs_get_nr_hugepages (unsigned int numa_node,
+                                           int page_size, int *v);
+clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node,
+                                             int page_size, int *v);
+clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node,
+                                                int page_size, int *v);
+clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node,
+                                             int page_size, int nr);
+
+#endif /* included_linux_sysfs_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/pci/linux_pci.c b/src/vlib/pci/linux_pci.c
deleted file mode 100644
index 2d3c0a88..00000000
--- a/src/vlib/pci/linux_pci.c
+++ /dev/null
@@ -1,665 +0,0 @@
-/*
- * Copyright (c) 2016 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
- * pci.c: Linux user space PCI bus management.
- * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -typedef struct -{ - /* /sys/bus/pci/devices/... directory name for this device. */ - u8 *dev_dir_name; - - /* Resource file descriptors. */ - int *resource_fds; - - /* File descriptor for config space read/write. */ - int config_fd; - - /* File descriptor for /dev/uio%d */ - int uio_fd; - - /* Minor device for uio device. */ - u32 uio_minor; - - /* Index given by unix_file_add. */ - u32 unix_file_index; - -} linux_pci_device_t; - -/* Pool of PCI devices. */ -typedef struct -{ - vlib_main_t *vlib_main; - linux_pci_device_t *linux_pci_devices; -} linux_pci_main_t; - -extern linux_pci_main_t linux_pci_main; - -/* Call to allocate/initialize the pci subsystem. 
- This is not an init function so that users can explicitly enable - pci only when it's needed. */ -clib_error_t *pci_bus_init (vlib_main_t * vm); - -clib_error_t *vlib_pci_bind_to_uio (vlib_pci_device_t * d, - char *uio_driver_name); - -linux_pci_main_t linux_pci_main; - -clib_error_t * -vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) -{ - clib_error_t *error = 0; - u8 *s = 0, *driver_name = 0; - DIR *dir = 0; - struct dirent *e; - int fd, clear_driver_override = 0; - u8 *dev_dir_name = format (0, "/sys/bus/pci/devices/%U", - format_vlib_pci_addr, &d->bus_address); - - s = format (s, "%v/driver%c", dev_dir_name, 0); - driver_name = vlib_sysfs_link_to_name ((char *) s); - vec_reset_length (s); - - if (driver_name && - ((strcmp ("vfio-pci", (char *) driver_name) == 0) || - (strcmp ("uio_pci_generic", (char *) driver_name) == 0) || - (strcmp ("igb_uio", (char *) driver_name) == 0))) - goto done; - - /* walk trough all linux interfaces and if interface belonging to - this device is founf check if interface is admin up */ - dir = opendir ("/sys/class/net"); - s = format (s, "%U%c", format_vlib_pci_addr, &d->bus_address, 0); - - if (!dir) - { - error = clib_error_return (0, "Skipping PCI device %U: failed to " - "read /sys/class/net", - format_vlib_pci_addr, &d->bus_address); - goto done; - } - - fd = socket (PF_INET, SOCK_DGRAM, 0); - if (fd < 0) - { - error = clib_error_return_unix (0, "socket"); - goto done; - } - - while ((e = readdir (dir))) - { - struct ifreq ifr; - struct ethtool_drvinfo drvinfo; - - if (e->d_name[0] == '.') /* skip . and .. 
*/ - continue; - - memset (&ifr, 0, sizeof ifr); - memset (&drvinfo, 0, sizeof drvinfo); - ifr.ifr_data = (char *) &drvinfo; - strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); - drvinfo.cmd = ETHTOOL_GDRVINFO; - if (ioctl (fd, SIOCETHTOOL, &ifr) < 0) - { - /* Some interfaces (eg "lo") don't support this ioctl */ - if ((errno != ENOTSUP) && (errno != ENODEV)) - clib_unix_warning ("ioctl fetch intf %s bus info error", - e->d_name); - continue; - } - - if (strcmp ((char *) s, drvinfo.bus_info)) - continue; - - memset (&ifr, 0, sizeof (ifr)); - strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); - if (ioctl (fd, SIOCGIFFLAGS, &ifr) < 0) - { - error = clib_error_return_unix (0, "ioctl fetch intf %s flags", - e->d_name); - close (fd); - goto done; - } - - if (ifr.ifr_flags & IFF_UP) - { - error = clib_error_return (0, "Skipping PCI device %U as host " - "interface %s is up", - format_vlib_pci_addr, &d->bus_address, - e->d_name); - close (fd); - goto done; - } - } - - close (fd); - vec_reset_length (s); - - s = format (s, "%v/driver/unbind%c", dev_dir_name, 0); - vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); - vec_reset_length (s); - - s = format (s, "%v/driver_override%c", dev_dir_name, 0); - if (access ((char *) s, F_OK) == 0) - { - vlib_sysfs_write ((char *) s, "%s", uio_driver_name); - clear_driver_override = 1; - } - else - { - vec_reset_length (s); - s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0); - vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, - d->device_id); - } - vec_reset_length (s); - - s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0); - vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); - vec_reset_length (s); - - if (clear_driver_override) - { - s = format (s, "%v/driver_override%c", dev_dir_name, 0); - vlib_sysfs_write ((char *) s, "%c", 0); - vec_reset_length (s); - } - -done: - closedir (dir); - vec_free (s); - vec_free (dev_dir_name); - vec_free 
(driver_name); - return error; -} - - -static clib_error_t * -scan_uio_dir (void *arg, u8 * path_name, u8 * file_name) -{ - linux_pci_device_t *l = arg; - unformat_input_t input; - - unformat_init_string (&input, (char *) file_name, vec_len (file_name)); - - if (!unformat (&input, "uio%d", &l->uio_minor)) - abort (); - - unformat_free (&input); - return 0; -} - -static clib_error_t * -linux_pci_uio_read_ready (unix_file_t * uf) -{ - vlib_pci_main_t *pm = &pci_main; - vlib_pci_device_t *d; - int __attribute__ ((unused)) rv; - - u32 icount; - rv = read (uf->file_descriptor, &icount, 4); - - d = pool_elt_at_index (pm->pci_devs, uf->private_data); - - if (d->interrupt_handler) - d->interrupt_handler (d); - - vlib_pci_intr_enable (d); - - return /* no error */ 0; -} - -static clib_error_t * -linux_pci_uio_error_ready (unix_file_t * uf) -{ - u32 error_index = (u32) uf->private_data; - - return clib_error_return (0, "pci device %d: error", error_index); -} - -static void -add_device (vlib_pci_device_t * dev, linux_pci_device_t * pdev) -{ - vlib_pci_main_t *pm = &pci_main; - linux_pci_main_t *lpm = &linux_pci_main; - linux_pci_device_t *l; - - pool_get (lpm->linux_pci_devices, l); - l[0] = pdev[0]; - - l->dev_dir_name = vec_dup (l->dev_dir_name); - - dev->os_handle = l - lpm->linux_pci_devices; - - { - u8 *uio_dir = format (0, "%s/uio", l->dev_dir_name); - foreach_directory_file ((char *) uio_dir, scan_uio_dir, l, /* scan_dirs */ - 1); - vec_free (uio_dir); - } - - { - char *uio_name = (char *) format (0, "/dev/uio%d%c", l->uio_minor, 0); - l->uio_fd = open (uio_name, O_RDWR); - if (l->uio_fd < 0) - clib_unix_error ("open `%s'", uio_name); - vec_free (uio_name); - } - - { - unix_file_t template = { 0 }; - unix_main_t *um = &unix_main; - - template.read_function = linux_pci_uio_read_ready; - template.file_descriptor = l->uio_fd; - template.error_function = linux_pci_uio_error_ready; - template.private_data = dev - pm->pci_devs; - - l->unix_file_index = unix_file_add (um, 
&template); - } -} - -static void -linux_pci_device_free (linux_pci_device_t * l) -{ - int i; - for (i = 0; i < vec_len (l->resource_fds); i++) - if (l->resource_fds[i] > 0) - close (l->resource_fds[i]); - if (l->config_fd > 0) - close (l->config_fd); - if (l->uio_fd > 0) - close (l->uio_fd); - vec_free (l->resource_fds); - vec_free (l->dev_dir_name); -} - -/* Configuration space read/write. */ -clib_error_t * -vlib_pci_read_write_config (vlib_pci_device_t * dev, - vlib_read_or_write_t read_or_write, - uword address, void *data, u32 n_bytes) -{ - linux_pci_main_t *lpm = &linux_pci_main; - linux_pci_device_t *p; - int n; - - p = pool_elt_at_index (lpm->linux_pci_devices, dev->os_handle); - - if (read_or_write == VLIB_READ) - n = pread (p->config_fd, data, n_bytes, address); - else - n = pwrite (p->config_fd, data, n_bytes, address); - - if (n != n_bytes) - return clib_error_return_unix (0, "%s", - read_or_write == VLIB_READ - ? "read" : "write"); - - return 0; -} - -static clib_error_t * -os_map_pci_resource_internal (uword os_handle, - u32 resource, u8 * addr, void **result) -{ - linux_pci_main_t *pm = &linux_pci_main; - linux_pci_device_t *p; - struct stat stat_buf; - u8 *file_name; - int fd; - clib_error_t *error; - int flags = MAP_SHARED; - - error = 0; - p = pool_elt_at_index (pm->linux_pci_devices, os_handle); - - file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0); - fd = open ((char *) file_name, O_RDWR); - if (fd < 0) - { - error = clib_error_return_unix (0, "open `%s'", file_name); - goto done; - } - - if (fstat (fd, &stat_buf) < 0) - { - error = clib_error_return_unix (0, "fstat `%s'", file_name); - goto done; - } - - vec_validate (p->resource_fds, resource); - p->resource_fds[resource] = fd; - if (addr != 0) - flags |= MAP_FIXED; - - *result = mmap (addr, - /* size */ stat_buf.st_size, - PROT_READ | PROT_WRITE, flags, - /* file */ fd, - /* offset */ 0); - if (*result == (void *) -1) - { - error = clib_error_return_unix (0, "mmap 
`%s'", file_name); - goto done; - } - -done: - if (error) - { - if (fd >= 0) - close (fd); - } - vec_free (file_name); - return error; -} - -clib_error_t * -vlib_pci_map_resource (vlib_pci_device_t * dev, u32 resource, void **result) -{ - return (os_map_pci_resource_internal - (dev->os_handle, resource, 0 /* addr */ , - result)); -} - -clib_error_t * -vlib_pci_map_resource_fixed (vlib_pci_device_t * dev, - u32 resource, u8 * addr, void **result) -{ - return (os_map_pci_resource_internal - (dev->os_handle, resource, addr, result)); -} - -void -vlib_pci_free_device (vlib_pci_device_t * dev) -{ - linux_pci_main_t *pm = &linux_pci_main; - linux_pci_device_t *l; - - l = pool_elt_at_index (pm->linux_pci_devices, dev->os_handle); - linux_pci_device_free (l); - pool_put (pm->linux_pci_devices, l); -} - -pci_device_registration_t * __attribute__ ((unused)) -pci_device_next_registered (pci_device_registration_t * r) -{ - uword i; - - /* Null vendor id marks end of initialized list. */ - for (i = 0; r->supported_devices[i].vendor_id != 0; i++) - ; - - return clib_elf_section_data_next (r, i * sizeof (r->supported_devices[0])); -} - -static clib_error_t * -init_device_from_registered (vlib_main_t * vm, - vlib_pci_device_t * dev, - linux_pci_device_t * pdev) -{ - vlib_pci_main_t *pm = &pci_main; - pci_device_registration_t *r; - pci_device_id_t *i; - clib_error_t *error; - - r = pm->pci_device_registrations; - - while (r) - { - for (i = r->supported_devices; i->vendor_id != 0; i++) - if (i->vendor_id == dev->vendor_id && i->device_id == dev->device_id) - { - error = vlib_pci_bind_to_uio (dev, "uio_pci_generic"); - if (error) - { - clib_error_report (error); - continue; - } - - add_device (dev, pdev); - dev->interrupt_handler = r->interrupt_handler; - return r->init_function (vm, dev); - } - r = r->next_registration; - } - /* No driver, close the PCI config-space FD */ - close (pdev->config_fd); - return 0; -} - -static clib_error_t * -init_device (vlib_main_t * vm, - 
vlib_pci_device_t * dev, linux_pci_device_t * pdev) -{ - return init_device_from_registered (vm, dev, pdev); -} - -static clib_error_t * -scan_device (void *arg, u8 * dev_dir_name, u8 * ignored) -{ - vlib_main_t *vm = arg; - vlib_pci_main_t *pm = &pci_main; - int fd; - u8 *f; - clib_error_t *error = 0; - vlib_pci_device_t *dev; - linux_pci_device_t pdev = { 0 }; - u32 tmp; - - f = format (0, "%v/config%c", dev_dir_name, 0); - fd = open ((char *) f, O_RDWR); - - /* Try read-only access if write fails. */ - if (fd < 0) - fd = open ((char *) f, O_RDONLY); - - if (fd < 0) - { - error = clib_error_return_unix (0, "open `%s'", f); - goto done; - } - - pool_get (pm->pci_devs, dev); - - /* You can only read more that 64 bytes of config space as root; so we try to - read the full space but fall back to just the first 64 bytes. */ - if (read (fd, &dev->config_data, sizeof (dev->config_data)) != - sizeof (dev->config_data) - && read (fd, &dev->config0, - sizeof (dev->config0)) != sizeof (dev->config0)) - { - pool_put (pm->pci_devs, dev); - error = clib_error_return_unix (0, "read `%s'", f); - close (fd); - goto done; - } - - { - static pci_config_header_t all_ones; - if (all_ones.vendor_id == 0) - memset (&all_ones, ~0, sizeof (all_ones)); - - if (!memcmp (&dev->config0.header, &all_ones, sizeof (all_ones))) - { - pool_put (pm->pci_devs, dev); - error = clib_error_return (0, "invalid PCI config for `%s'", f); - close (fd); - goto done; - } - } - - if (dev->config0.header.header_type == 0) - pci_config_type0_little_to_host (&dev->config0); - else - pci_config_type1_little_to_host (&dev->config1); - - /* Parse bus, dev, function from directory name. 
*/ - { - unformat_input_t input; - - unformat_init_string (&input, (char *) dev_dir_name, - vec_len (dev_dir_name)); - - if (!unformat (&input, "/sys/bus/pci/devices/%U", - unformat_vlib_pci_addr, &dev->bus_address)) - abort (); - - unformat_free (&input); - - } - - - pdev.config_fd = fd; - pdev.dev_dir_name = dev_dir_name; - - hash_set (pm->pci_dev_index_by_pci_addr, dev->bus_address.as_u32, - dev - pm->pci_devs); - - vec_reset_length (f); - f = format (f, "%v/vpd%c", dev_dir_name, 0); - fd = open ((char *) f, O_RDONLY); - if (fd >= 0) - { - while (1) - { - u8 tag[3]; - u8 *data = 0; - int len; - - if (read (fd, &tag, 3) != 3) - break; - - if (tag[0] != 0x82 && tag[0] != 0x90 && tag[0] != 0x91) - break; - - len = (tag[2] << 8) | tag[1]; - vec_validate (data, len); - - if (read (fd, data, len) != len) - { - vec_free (data); - break; - } - if (tag[0] == 0x82) - dev->product_name = data; - else if (tag[0] == 0x90) - dev->vpd_r = data; - else if (tag[0] == 0x91) - dev->vpd_w = data; - - data = 0; - } - close (fd); - } - - dev->numa_node = -1; - vec_reset_length (f); - f = format (f, "%v/numa_node%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "%u", &dev->numa_node); - - vec_reset_length (f); - f = format (f, "%v/class%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); - dev->device_class = tmp >> 8; - - vec_reset_length (f); - f = format (f, "%v/vendor%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); - dev->vendor_id = tmp; - - vec_reset_length (f); - f = format (f, "%v/device%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); - dev->device_id = tmp; - - error = init_device (vm, dev, &pdev); - - vec_reset_length (f); - f = format (f, "%v/driver%c", dev_dir_name, 0); - dev->driver_name = vlib_sysfs_link_to_name ((char *) f); - -done: - vec_free (f); - return error; -} - -clib_error_t * -linux_pci_init (vlib_main_t * vm) -{ - vlib_pci_main_t *pm = &pci_main; - clib_error_t *error; - - pm->vlib_main = vm; - - if 
((error = vlib_call_init_function (vm, unix_input_init))) - return error; - - ASSERT (sizeof (vlib_pci_addr_t) == sizeof (u32)); - pm->pci_dev_index_by_pci_addr = hash_create (0, sizeof (uword)); - - error = foreach_directory_file ("/sys/bus/pci/devices", scan_device, vm, - /* scan_dirs */ 0); - - /* Complain and continue. might not be root, etc. */ - if (error) - clib_error_report (error); - - return error; -} - -VLIB_INIT_FUNCTION (linux_pci_init); - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c index 36f8109e..f8d5d8f9 100644 --- a/src/vlib/threads_cli.c +++ b/src/vlib/threads_cli.c @@ -18,6 +18,7 @@ #include #include +#include #include static u8 * diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c deleted file mode 100644 index d5d5d6c8..00000000 --- a/src/vlib/unix/physmem.c +++ /dev/null @@ -1,439 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/* - * physmem.c: Unix physical memory - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#ifndef __NR_memfd_create -#if defined __x86_64__ -#define __NR_memfd_create 319 -#elif defined __arm__ -#define __NR_memfd_create 385 -#elif defined __aarch64__ -#define __NR_memfd_create 279 -#else -#error "__NR_memfd_create unknown for this architecture" -#endif -#endif - -static inline int -memfd_create (const char *name, unsigned int flags) -{ - return syscall (__NR_memfd_create, name, flags); -} - -#ifndef F_LINUX_SPECIFIC_BASE -#define F_LINUX_SPECIFIC_BASE 1024 -#endif -#define MFD_ALLOW_SEALING 0x0002U -#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) - -#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -#define F_SEAL_GROW 0x0004 /* prevent file from growing */ -#define F_SEAL_WRITE 0x0008 /* prevent writes */ - -static void * -unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, - uword n_bytes, uword alignment) -{ - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - uword lo_offset, hi_offset; - uword *to_free = 0; - - if (pr->heap == 0) - return 0; - - /* IO memory is always at least cache aligned. */ - alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); - - while (1) - { - mheap_get_aligned (pr->heap, n_bytes, - /* align */ alignment, - /* align offset */ 0, - &lo_offset); - - /* Allocation failed? */ - if (lo_offset == ~0) - break; - - if (pr->flags & VLIB_PHYSMEM_F_FAKE) - break; - - /* Make sure allocation does not span DMA physical chunk boundary. */ - hi_offset = lo_offset + n_bytes - 1; - - if ((lo_offset >> pr->log2_page_size) == - (hi_offset >> pr->log2_page_size)) - break; - - /* Allocation would span chunk boundary, queue it to be freed as soon as - we find suitable chunk. 
*/ - vec_add1 (to_free, lo_offset); - } - - if (to_free != 0) - { - uword i; - for (i = 0; i < vec_len (to_free); i++) - mheap_put (pr->heap, to_free[i]); - vec_free (to_free); - } - - return lo_offset != ~0 ? pr->heap + lo_offset : 0; -} - -static void -unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x) -{ - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - /* Return object to region's heap. */ - mheap_put (pr->heap, x - pr->heap); -} - -static u64 -get_page_paddr (int fd, uword addr) -{ - int pagesize = sysconf (_SC_PAGESIZE); - u64 seek, pagemap = 0; - - seek = ((u64) addr / pagesize) * sizeof (u64); - if (lseek (fd, seek, SEEK_SET) != seek) - { - clib_unix_warning ("lseek to 0x%llx", seek); - return 0; - } - if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap))) - { - clib_unix_warning ("read ptbits"); - return 0; - } - if ((pagemap & (1ULL << 63)) == 0) - return 0; - - pagemap &= pow2_mask (55); - - return pagemap * pagesize; -} - -static clib_error_t * -unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, - u8 numa_node, u32 flags, - vlib_physmem_region_index_t * idx) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - vlib_physmem_region_t *pr; - clib_error_t *error = 0; - int pagemap_fd = -1; - u8 *mount_dir = 0; - u8 *filename = 0; - struct stat st; - int old_mpol; - int mmap_flags; - struct bitmask *old_mask = numa_allocate_nodemask (); - - if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0) - return clib_error_return (0, "not allowed"); - - pool_get (vpm->regions, pr); - - if ((pr - vpm->regions) >= 256) - { - error = clib_error_return (0, "maximum number of regions reached"); - goto error; - } - - pr->index = pr - vpm->regions; - pr->fd = -1; - pr->flags = flags; - - if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0) - == -1) - { - error = clib_error_return_unix (0, "get_mempolicy"); - goto error; - } - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - if 
((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) - { - error = clib_error_return_unix (0, "open '/proc/self/pagemap'"); - goto error; - } - - mount_dir = format (0, "%s/physmem_region%d%c", - vlib_unix_get_runtime_dir (), pr->index, 0); - filename = format (0, "%s/mem%c", mount_dir, 0); - - unlink ((char *) mount_dir); - - error = vlib_unix_recursive_mkdir ((char *) mount_dir); - if (error) - goto error; - - if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL)) - { - error = clib_error_return_unix (0, "mount hugetlb directory '%s'", - mount_dir); - goto error; - } - - if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1) - { - error = clib_error_return_unix (0, "open"); - goto error; - } - - mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED; - } - else - { - if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1) - return clib_error_return_unix (0, "memfd_create"); - - if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) - { - error = - clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)"); - goto error; - } - mmap_flags = MAP_SHARED; - } - - if (fstat (pr->fd, &st)) - { - error = clib_error_return_unix (0, "fstat"); - goto error; - } - - pr->log2_page_size = min_log2 (st.st_blksize); - pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1; - size = pr->n_pages * (1 << pr->log2_page_size); - - if ((ftruncate (pr->fd, size)) == -1) - { - error = clib_error_return_unix (0, "ftruncate length: %d", size); - goto error; - } - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - error = vlib_sysfs_prealloc_hugepages (numa_node, - 1 << (pr->log2_page_size - 10), - pr->n_pages); - if (error) - goto error; - } - - numa_set_preferred (numa_node); - - pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0); - - if (pr->mem == MAP_FAILED) - { - pr->mem = 0; - error = clib_error_return_unix (0, "mmap"); - goto error; - } - - if (set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1) - { 
- error = clib_error_return_unix (0, "set_mempolicy"); - goto error; - } - - pr->size = pr->n_pages << pr->log2_page_size; - pr->page_mask = (1 << pr->log2_page_size) - 1; - pr->numa_node = numa_node; - pr->name = format (0, "%s", name); - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - int i; - for (i = 0; i < pr->n_pages; i++) - { - void *ptr = pr->mem + (i << pr->log2_page_size); - int node; - move_pages (0, 1, &ptr, 0, &node, 0); - if (numa_node != node) - { - clib_warning - ("physmem page for region \'%s\' allocated on the wrong" - " numa node (requested %u actual %u)", pr->name, - pr->numa_node, node, i); - break; - } - } - } - - if (flags & VLIB_PHYSMEM_F_INIT_MHEAP) - { - pr->heap = mheap_alloc_with_flags (pr->mem, pr->size, - /* Don't want mheap mmap/munmap with IO memory. */ - MHEAP_FLAG_DISABLE_VM | - MHEAP_FLAG_THREAD_SAFE); - fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1); - } - - if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS) - { - vlib_buffer_add_mem_range (vm, pointer_to_uword (pr->mem), pr->size); - } - - *idx = pr->index; - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - int i; - for (i = 0; i < pr->n_pages; i++) - { - uword vaddr = - pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size); - u64 page_paddr = get_page_paddr (pagemap_fd, vaddr); - vec_add1 (pr->page_table, page_paddr); - } - } - - goto done; - -error: - if (pr->fd > -1) - close (pr->fd); - - if (pr->mem) - munmap (pr->mem, size); - - memset (pr, 0, sizeof (*pr)); - pool_put (vpm->regions, pr); - -done: - if (mount_dir) - { - umount2 ((char *) mount_dir, MNT_DETACH); - rmdir ((char *) mount_dir); - vec_free (mount_dir); - } - numa_free_cpumask (old_mask); - vec_free (filename); - if (pagemap_fd > -1) - close (pagemap_fd); - return error; -} - -static void -unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - - if (pr->fd > 0) 
- close (pr->fd); - munmap (pr->mem, pr->size); - vec_free (pr->name); - pool_put (vpm->regions, pr); -} - -clib_error_t * -unix_physmem_init (vlib_main_t * vm) -{ - clib_error_t *error = 0; - - /* Avoid multiple calls. */ - if (vm->os_physmem_alloc_aligned) - return error; - - vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; - vm->os_physmem_free = unix_physmem_free; - vm->os_physmem_region_alloc = unix_physmem_region_alloc; - vm->os_physmem_region_free = unix_physmem_region_free; - - return error; -} - -static clib_error_t * -show_physmem (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - vlib_physmem_region_t *pr; - - /* *INDENT-OFF* */ - pool_foreach (pr, vpm->regions, ( - { - vlib_cli_output (vm, "index %u name '%s' page-size %uKB num-pages %d " - "numa-node %u fd %d\n", - pr->index, pr->name, (1 << (pr->log2_page_size -10)), - pr->n_pages, pr->numa_node, pr->fd); - if (pr->heap) - vlib_cli_output (vm, " %U", format_mheap, pr->heap, /* verbose */ 1); - else - vlib_cli_output (vm, " no heap\n"); - })); - /* *INDENT-ON* */ - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_physmem_command, static) = { - .path = "show physmem", - .short_help = "Show physical memory allocation", - .function = show_physmem, -}; -/* *INDENT-ON* */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index b5a33427..1b0d8b9d 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -217,23 +217,6 @@ extern u8 **vlib_thread_stacks; /* utils */ -clib_error_t *vlib_sysfs_write (char *file_name, char *fmt, ...); - -clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...); - -u8 *vlib_sysfs_link_to_name (char *link); - -clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node, - int page_size, int nr); -clib_error_t *vlib_sysfs_get_nr_hugepages 
(unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node, - int page_size, int nr); - clib_error_t *foreach_directory_file (char *dir_name, clib_error_t * (*f) (void *arg, u8 * path_name, diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c index 0e252aca..5472751e 100644 --- a/src/vlib/unix/util.c +++ b/src/vlib/unix/util.c @@ -98,225 +98,6 @@ foreach_directory_file (char *dir_name, return error; } -clib_error_t * -vlib_sysfs_write (char *file_name, char *fmt, ...) -{ - u8 *s; - int fd; - clib_error_t *error = 0; - - fd = open (file_name, O_WRONLY); - if (fd < 0) - return clib_error_return_unix (0, "open `%s'", file_name); - - va_list va; - va_start (va, fmt); - s = va_format (0, fmt, &va); - va_end (va); - - if (write (fd, s, vec_len (s)) < 0) - error = clib_error_return_unix (0, "write `%s'", file_name); - - vec_free (s); - close (fd); - return error; -} - -clib_error_t * -vlib_sysfs_read (char *file_name, char *fmt, ...) 
-{ - unformat_input_t input; - u8 *s = 0; - int fd; - ssize_t sz; - uword result; - - fd = open (file_name, O_RDONLY); - if (fd < 0) - return clib_error_return_unix (0, "open `%s'", file_name); - - vec_validate (s, 4095); - - sz = read (fd, s, vec_len (s)); - if (sz < 0) - { - close (fd); - vec_free (s); - return clib_error_return_unix (0, "read `%s'", file_name); - } - - _vec_len (s) = sz; - unformat_init_vector (&input, s); - - va_list va; - va_start (va, fmt); - result = va_unformat (&input, fmt, &va); - va_end (va); - - vec_free (s); - close (fd); - - if (result == 0) - return clib_error_return (0, "unformat error"); - - return 0; -} - -u8 * -vlib_sysfs_link_to_name (char *link) -{ - char *p, buffer[64]; - unformat_input_t in; - u8 *s = 0; - int r; - - r = readlink (link, buffer, sizeof (buffer) - 1); - - if (r < 0) - return 0; - - buffer[r] = 0; - p = strrchr (buffer, '/'); - - if (!p) - return 0; - - unformat_init_string (&in, p + 1, strlen (p + 1)); - if (unformat (&in, "%s", &s) != 1) - clib_unix_warning ("no string?"); - unformat_free (&in); - - return s; -} - -clib_error_t * -vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr) -{ - clib_error_t *error = 0; - struct stat sb; - u8 *p = 0; - - p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); - - if (stat ((char *) p, &sb) == 0) - { - if (S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' is not directory", p); - goto done; - } - } - else if (numa_node == 0) - { - vec_reset_length (p); - p = format (p, "/sys/kernel/mm%c", 0); - if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' does not exist or it is not " - "directory", p); - goto done; - } - } - else - { - error = clib_error_return (0, "'%s' does not exist", p); - goto done; - } - - _vec_len (p) -= 1; - p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0); - vlib_sysfs_write ((char *) p, "%d", nr); - -done: - vec_free (p); - 
return error; -} - - -static clib_error_t * -vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node, - int page_size, int *val) -{ - clib_error_t *error = 0; - struct stat sb; - u8 *p = 0; - - p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); - - if (stat ((char *) p, &sb) == 0) - { - if (S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' is not directory", p); - goto done; - } - } - else if (numa_node == 0) - { - vec_reset_length (p); - p = format (p, "/sys/kernel/mm%c", 0); - if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' does not exist or it is not " - "directory", p); - goto done; - } - } - else - { - error = clib_error_return (0, "'%s' does not exist", p); - goto done; - } - - _vec_len (p) -= 1; - p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size, - type, 0); - error = vlib_sysfs_read ((char *) p, "%d", val); - -done: - vec_free (p); - return error; -} - -clib_error_t * -vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, int page_size, - int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr) -{ - clib_error_t *error = 0; - int n, needed; - error = vlib_sysfs_get_free_hugepages (numa_node, page_size, &n); - if (error) - return error; - needed = nr - n; - if (needed <= 0) - return 0; - - error = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &n); - if (error) - return error; - clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u", - needed, 
page_size, numa_node); - return vlib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed); -} - clib_error_t * vlib_unix_recursive_mkdir (char *path) { diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c index ea52878d..e7e69214 100644 --- a/src/vnet/devices/af_packet/af_packet.c +++ b/src/vnet/devices/af_packet/af_packet.c @@ -26,6 +26,7 @@ #include #include +#include #include #include -- cgit 1.2.3-korg From 6c333cecd6ff95ea442828cb3ed002d516453875 Mon Sep 17 00:00:00 2001 From: Shachar Beiser Date: Mon, 11 Sep 2017 08:06:18 +0000 Subject: net/mlx5: fix warning message Add support for Mellanox provider id and vendor id Change-Id: Ib2451e69fbe949f01bcb1ee556aa3020da225610 Signed-off-by: Shachar Beiser --- src/plugins/dpdk/device/init.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 4ef3b676..59df808d 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -686,6 +686,9 @@ dpdk_bind_devices_to_uio (dpdk_config_main_t * conf) /* Chelsio T4/T5 */ else if (d->vendor_id == 0x1425 && (d->device_id & 0xe000) == 0x4000) ; + /* Mellanox */ + else if (d->vendor_id == 0x15b3 && d->device_id >= 0x1013 && d->device_id <= 0x101a) + ; else { clib_warning ("Unsupported PCI device 0x%04x:0x%04x found " -- cgit 1.2.3-korg From 2504ac699e423f1ca840a63247ce55cb27735e0a Mon Sep 17 00:00:00 2001 From: Steven Date: Thu, 27 Jul 2017 14:54:44 -0700 Subject: devices: Display VhostEthernet instead of UnknownEthernet for dpdk net_vhost driver Added a new macro VNET_DPDK_PORT_TYPE_VHOST_ETHER for dpdk net_vhost driver to display VhostEthernet instead of UnknownEthernet. 
Change-Id: Ie407d83ebbbd60e4d11be79f0ebc44e2867a5f3b Signed-off-by: Steven --- src/plugins/dpdk/device/dpdk.h | 5 +++-- src/plugins/dpdk/device/format.c | 7 +++++++ src/plugins/dpdk/device/init.c | 4 ++++ 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 1e34e3fb..849e687b 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -75,8 +75,8 @@ extern vlib_node_registration_t dpdk_input_node; _ ("net_mlx4", MLX4) \ _ ("net_mlx5", MLX5) \ _ ("net_dpaa2", DPAA2) \ - _ ("net_virtio_user", VIRTIO_USER) - + _ ("net_virtio_user", VIRTIO_USER) \ + _ ("net_vhost", VHOST_ETHER) typedef enum { @@ -100,6 +100,7 @@ typedef enum VNET_DPDK_PORT_TYPE_AF_PACKET, VNET_DPDK_PORT_TYPE_ETH_VF, VNET_DPDK_PORT_TYPE_VIRTIO_USER, + VNET_DPDK_PORT_TYPE_VHOST_ETHER, VNET_DPDK_PORT_TYPE_UNKNOWN, } dpdk_port_type_t; diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c index cfe8851f..21475ac9 100644 --- a/src/plugins/dpdk/device/format.c +++ b/src/plugins/dpdk/device/format.c @@ -213,6 +213,10 @@ format_dpdk_device_name (u8 * s, va_list * args) device_name = "VirtioUser"; break; + case VNET_DPDK_PORT_TYPE_VHOST_ETHER: + device_name = "VhostEthernet"; + break; + default: case VNET_DPDK_PORT_TYPE_UNKNOWN: device_name = "UnknownEthernet"; @@ -316,6 +320,9 @@ format_dpdk_device_type (u8 * s, va_list * args) case VNET_DPDK_PMD_THUNDERX: dev_type = "Cavium ThunderX"; + + case VNET_DPDK_PMD_VHOST_ETHER: + dev_type = "VhostEthernet"; break; default: diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 59df808d..95176fb8 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -420,6 +420,10 @@ dpdk_lib_init (dpdk_main_t * dm) xd->port_type = VNET_DPDK_PORT_TYPE_VIRTIO_USER; break; + case VNET_DPDK_PMD_VHOST_ETHER: + xd->port_type = VNET_DPDK_PORT_TYPE_VHOST_ETHER; + 
break; + default: xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; } -- cgit 1.2.3-korg From d9226b25f145c64e5bc4a38c3fee7e9b2eaac2de Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 12 Sep 2017 15:34:17 +0200 Subject: physmem: remove debug leftovers Change-Id: I5a5dc0794d3398e749b64b07dfd1e2fc2230089b Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/cli.c | 1 - src/vlib/linux/physmem.c | 1 - 2 files changed, 2 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c index fe1c41c2..aeeb772d 100644 --- a/src/plugins/dpdk/device/cli.c +++ b/src/plugins/dpdk/device/cli.c @@ -357,7 +357,6 @@ show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input, "name=\"%s\" available = %7d allocated = %7d total = %7d\n", rmp->name, (u32) count, (u32) free_count, (u32) (count + free_count)); - rte_mempool_dump (stderr, rmp); } else { diff --git a/src/vlib/linux/physmem.c b/src/vlib/linux/physmem.c index fddff2ea..d8c5dc9b 100644 --- a/src/vlib/linux/physmem.c +++ b/src/vlib/linux/physmem.c @@ -300,7 +300,6 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, /* Don't want mheap mmap/munmap with IO memory. 
*/ MHEAP_FLAG_DISABLE_VM | MHEAP_FLAG_THREAD_SAFE); - fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1); } if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS) -- cgit 1.2.3-korg From 9641f011da2f65cf47705d634981b0eaac0b9007 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 13 Sep 2017 06:18:46 -0700 Subject: devices: Fix coverity discovered 9/13/2017 Missing a break statement in the case Change-Id: I1b1e198748343bc116ee0eaf9012abc3be15c40a Signed-off-by: Steven --- src/plugins/dpdk/device/format.c | 1 + 1 file changed, 1 insertion(+) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c index 21475ac9..697bdbe5 100644 --- a/src/plugins/dpdk/device/format.c +++ b/src/plugins/dpdk/device/format.c @@ -320,6 +320,7 @@ format_dpdk_device_type (u8 * s, va_list * args) case VNET_DPDK_PMD_THUNDERX: dev_type = "Cavium ThunderX"; + break; case VNET_DPDK_PMD_VHOST_ETHER: dev_type = "VhostEthernet"; -- cgit 1.2.3-korg From 01914ce45729833cec88c65689de9a0336cd40cc Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 14 Sep 2017 19:04:50 +0200 Subject: vppinfra: add clib_mem_vm_ext_alloc function Change-Id: Iff33694fc42cc3bcc73cf1372339053a6365039c Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/init.c | 6 +- src/plugins/memif/memif.c | 21 ++- src/vlib.am | 5 +- src/vlib/linux/pci.c | 25 ++-- src/vlib/linux/physmem.c | 192 ++++-------------------- src/vlib/linux/syscall.h | 58 -------- src/vlib/linux/sysfs.c | 250 ------------------------------- src/vlib/linux/sysfs.h | 44 ------ src/vlib/threads.c | 6 +- src/vlib/threads_cli.c | 6 +- src/vnet/devices/af_packet/af_packet.c | 4 +- src/vppinfra.am | 5 +- src/vppinfra/linux/mem.c | 260 +++++++++++++++++++++++++++++++++ src/vppinfra/linux/syscall.h | 56 +++++++ src/vppinfra/linux/sysfs.c | 250 +++++++++++++++++++++++++++++++ src/vppinfra/linux/sysfs.h | 46 ++++++ src/vppinfra/mem.h | 94 ++++++++++-- src/vppinfra/vm_linux_kernel.h | 78 ---------- 
src/vppinfra/vm_standalone.h | 74 ---------- src/vppinfra/vm_unix.h | 106 -------------- 20 files changed, 761 insertions(+), 825 deletions(-) delete mode 100644 src/vlib/linux/syscall.h delete mode 100644 src/vlib/linux/sysfs.c delete mode 100644 src/vlib/linux/sysfs.h create mode 100644 src/vppinfra/linux/mem.c create mode 100644 src/vppinfra/linux/syscall.h create mode 100644 src/vppinfra/linux/sysfs.c create mode 100644 src/vppinfra/linux/sysfs.h delete mode 100644 src/vppinfra/vm_linux_kernel.h delete mode 100644 src/vppinfra/vm_standalone.h delete mode 100644 src/vppinfra/vm_unix.h (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 95176fb8..ee61f94e 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include @@ -1040,7 +1040,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) mem = mem_by_socket[c]; page_size = 1024; - e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); + e = clib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem) use_1g = 0; @@ -1049,7 +1049,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) clib_error_free (e); page_size = 2; - e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); + e = clib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem) use_2m = 0; diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c index 8fec409a..6a609a57 100644 --- a/src/plugins/memif/memif.c +++ b/src/plugins/memif/memif.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include #include @@ -267,6 +267,8 @@ memif_init_regions_and_queues (memif_if_t * mif) int i, j; u64 buffer_offset; memif_region_t *r; + clib_mem_vm_alloc_t alloc = { 0 }; + clib_error_t *err; 
vec_validate_aligned (mif->regions, 0, CLIB_CACHE_LINE_BYTES); r = vec_elt_at_index (mif->regions, 0); @@ -279,18 +281,15 @@ memif_init_regions_and_queues (memif_if_t * mif) mif->run.buffer_size * (1 << mif->run.log2_ring_size) * (mif->run.num_s2m_rings + mif->run.num_m2s_rings); - if ((r->fd = memfd_create ("memif region 0", MFD_ALLOW_SEALING)) == -1) - return clib_error_return_unix (0, "memfd_create"); - - if ((fcntl (r->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) - return clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)"); + alloc.name = "memif region"; + alloc.size = r->region_size; + alloc.flags = CLIB_MEM_VM_F_SHARED; - if ((ftruncate (r->fd, r->region_size)) == -1) - return clib_error_return_unix (0, "ftruncate"); + err = clib_mem_vm_ext_alloc (&alloc); + if (err) + return err; - if ((r->shm = mmap (NULL, r->region_size, PROT_READ | PROT_WRITE, - MAP_SHARED, r->fd, 0)) == MAP_FAILED) - return clib_error_return_unix (0, "mmap"); + r->fd = alloc.fd; for (i = 0; i < mif->run.num_s2m_rings; i++) { diff --git a/src/vlib.am b/src/vlib.am index 41d68690..067e4afc 100644 --- a/src/vlib.am +++ b/src/vlib.am @@ -13,7 +13,7 @@ lib_LTLIBRARIES += libvlib.la -libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread -lnuma +libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread libvlib_la_DEPENDENCIES = libvppinfra.la BUILT_SOURCES += vlib/config.h @@ -34,7 +34,6 @@ libvlib_la_SOURCES = \ vlib/init.c \ vlib/linux/pci.c \ vlib/linux/physmem.c \ - vlib/linux/sysfs.c \ vlib/main.c \ vlib/mc.c \ vlib/node.c \ @@ -60,8 +59,6 @@ nobase_include_HEADERS += \ vlib/global_funcs.h \ vlib/i2c.h \ vlib/init.h \ - vlib/linux/sysfs.h \ - vlib/linux/syscall.h \ vlib/main.h \ vlib/mc.h \ vlib/node_funcs.h \ diff --git a/src/vlib/linux/pci.c b/src/vlib/linux/pci.c index 4ce19190..790f168a 100644 --- a/src/vlib/linux/pci.c +++ b/src/vlib/linux/pci.c @@ -37,10 +37,11 @@ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include + #include #include #include -#include #include #include @@ -104,7 +105,7 @@ vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) format_vlib_pci_addr, &d->bus_address); s = format (s, "%v/driver%c", dev_dir_name, 0); - driver_name = vlib_sysfs_link_to_name ((char *) s); + driver_name = clib_sysfs_link_to_name ((char *) s); vec_reset_length (s); if (driver_name && @@ -183,32 +184,32 @@ vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) vec_reset_length (s); s = format (s, "%v/driver/unbind%c", dev_dir_name, 0); - vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + clib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); vec_reset_length (s); s = format (s, "%v/driver_override%c", dev_dir_name, 0); if (access ((char *) s, F_OK) == 0) { - vlib_sysfs_write ((char *) s, "%s", uio_driver_name); + clib_sysfs_write ((char *) s, "%s", uio_driver_name); clear_driver_override = 1; } else { vec_reset_length (s); s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0); - vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, + clib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, d->device_id); } vec_reset_length (s); s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0); - vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + clib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); vec_reset_length (s); if (clear_driver_override) { s = format (s, "%v/driver_override%c", dev_dir_name, 0); - vlib_sysfs_write ((char *) s, "%c", 0); + clib_sysfs_write ((char *) s, "%c", 0); vec_reset_length (s); } @@ -602,28 +603,28 @@ scan_device (void *arg, u8 * dev_dir_name, u8 * ignored) dev->numa_node = -1; vec_reset_length (f); f = format (f, "%v/numa_node%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "%u", &dev->numa_node); + clib_sysfs_read ((char *) f, "%u", &dev->numa_node); vec_reset_length (f); f = format 
(f, "%v/class%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); + clib_sysfs_read ((char *) f, "0x%x", &tmp); dev->device_class = tmp >> 8; vec_reset_length (f); f = format (f, "%v/vendor%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); + clib_sysfs_read ((char *) f, "0x%x", &tmp); dev->vendor_id = tmp; vec_reset_length (f); f = format (f, "%v/device%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); + clib_sysfs_read ((char *) f, "0x%x", &tmp); dev->device_id = tmp; error = init_device (vm, dev, &pdev); vec_reset_length (f); f = format (f, "%v/driver%c", dev_dir_name, 0); - dev->driver_name = vlib_sysfs_link_to_name ((char *) f); + dev->driver_name = clib_sysfs_link_to_name ((char *) f); done: vec_free (f); diff --git a/src/vlib/linux/physmem.c b/src/vlib/linux/physmem.c index d8c5dc9b..3cc42a06 100644 --- a/src/vlib/linux/physmem.c +++ b/src/vlib/linux/physmem.c @@ -43,14 +43,12 @@ #include #include #include -#include -#include +#include +#include #include #include #include -#include -#include static void * unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, @@ -111,31 +109,6 @@ unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x) mheap_put (pr->heap, x - pr->heap); } -static u64 -get_page_paddr (int fd, uword addr) -{ - int pagesize = sysconf (_SC_PAGESIZE); - u64 seek, pagemap = 0; - - seek = ((u64) addr / pagesize) * sizeof (u64); - if (lseek (fd, seek, SEEK_SET) != seek) - { - clib_unix_warning ("lseek to 0x%llx", seek); - return 0; - } - if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap))) - { - clib_unix_warning ("read ptbits"); - return 0; - } - if ((pagemap & (1ULL << 63)) == 0) - return 0; - - pagemap &= pow2_mask (55); - - return pagemap * pagesize; -} - static clib_error_t * unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, u8 numa_node, u32 flags, @@ -144,13 +117,8 @@ unix_physmem_region_alloc (vlib_main_t * vm, char 
*name, u32 size, vlib_physmem_main_t *vpm = &vm->physmem_main; vlib_physmem_region_t *pr; clib_error_t *error = 0; - int pagemap_fd = -1; - u8 *mount_dir = 0; - u8 *filename = 0; - struct stat st; - int old_mpol; - int mmap_flags; - struct bitmask *old_mask = numa_allocate_nodemask (); + clib_mem_vm_alloc_t alloc = { 0 }; + if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0) return clib_error_return (0, "not allowed"); @@ -163,113 +131,32 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, goto error; } - pr->index = pr - vpm->regions; - pr->fd = -1; - pr->flags = flags; - - if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0) - == -1) - { - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - error = clib_error_return_unix (0, "get_mempolicy"); - goto error; - } - else - old_mpol = -1; - } + alloc.name = name; + alloc.size = size; + alloc.numa_node = numa_node; + alloc.flags = CLIB_MEM_VM_F_SHARED; if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) - { - error = clib_error_return_unix (0, "open '/proc/self/pagemap'"); - goto error; - } - - mount_dir = format (0, "%s/physmem_region%d%c", - vlib_unix_get_runtime_dir (), pr->index, 0); - filename = format (0, "%s/mem%c", mount_dir, 0); - - unlink ((char *) mount_dir); - - error = vlib_unix_recursive_mkdir ((char *) mount_dir); - if (error) - goto error; - - if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL)) - { - error = clib_error_return_unix (0, "mount hugetlb directory '%s'", - mount_dir); - goto error; - } - - if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1) - { - error = clib_error_return_unix (0, "open"); - goto error; - } - - mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED; + alloc.flags |= CLIB_MEM_VM_F_HUGETLB; + alloc.flags |= CLIB_MEM_VM_F_HUGETLB_PREALLOC; + alloc.flags |= CLIB_MEM_VM_F_NUMA_FORCE; } else { - if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == 
-1) - return clib_error_return_unix (0, "memfd_create"); - - if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) - { - error = - clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)"); - goto error; - } - mmap_flags = MAP_SHARED; - } - - if (fstat (pr->fd, &st)) - { - error = clib_error_return_unix (0, "fstat"); - goto error; - } - - pr->log2_page_size = min_log2 (st.st_blksize); - pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1; - size = pr->n_pages * (1 << pr->log2_page_size); - - if ((ftruncate (pr->fd, size)) == -1) - { - error = clib_error_return_unix (0, "ftruncate length: %d", size); - goto error; - } - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - error = vlib_sysfs_prealloc_hugepages (numa_node, - 1 << (pr->log2_page_size - 10), - pr->n_pages); - if (error) - goto error; - } - - if (old_mpol != -1) - numa_set_preferred (numa_node); - - pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0); - - if (pr->mem == MAP_FAILED) - { - pr->mem = 0; - error = clib_error_return_unix (0, "mmap"); - goto error; + alloc.flags |= CLIB_MEM_VM_F_NUMA_PREFER; } - if (old_mpol != -1 && - set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1) - { - error = clib_error_return_unix (0, "set_mempolicy"); - goto error; - } + error = clib_mem_vm_ext_alloc (&alloc); + if (error) + goto error; + pr->index = pr - vpm->regions; + pr->flags = flags; + pr->fd = alloc.fd; + pr->mem = alloc.addr; + pr->log2_page_size = alloc.log2_page_size; + pr->n_pages = alloc.n_pages; pr->size = pr->n_pages << pr->log2_page_size; pr->page_mask = (1 << pr->log2_page_size) - 1; pr->numa_node = numa_node; @@ -285,13 +172,14 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, move_pages (0, 1, &ptr, 0, &node, 0); if (numa_node != node) { - clib_warning - ("physmem page for region \'%s\' allocated on the wrong" - " numa node (requested %u actual %u)", pr->name, - pr->numa_node, node, i); + clib_warning ("physmem page for region \'%s\' 
allocated on the" + " wrong numa node (requested %u actual %u)", + pr->name, pr->numa_node, node, i); break; } } + pr->page_table = clib_mem_vm_get_paddr (pr->mem, pr->log2_page_size, + pr->n_pages); } if (flags & VLIB_PHYSMEM_F_INIT_MHEAP) @@ -309,41 +197,13 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, *idx = pr->index; - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - int i; - for (i = 0; i < pr->n_pages; i++) - { - uword vaddr = - pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size); - u64 page_paddr = get_page_paddr (pagemap_fd, vaddr); - vec_add1 (pr->page_table, page_paddr); - } - } - goto done; error: - if (pr->fd > -1) - close (pr->fd); - - if (pr->mem) - munmap (pr->mem, size); - memset (pr, 0, sizeof (*pr)); pool_put (vpm->regions, pr); done: - if (mount_dir) - { - umount2 ((char *) mount_dir, MNT_DETACH); - rmdir ((char *) mount_dir); - vec_free (mount_dir); - } - numa_free_cpumask (old_mask); - vec_free (filename); - if (pagemap_fd > -1) - close (pagemap_fd); return error; } diff --git a/src/vlib/linux/syscall.h b/src/vlib/linux/syscall.h deleted file mode 100644 index 9e37997e..00000000 --- a/src/vlib/linux/syscall.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2017 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef included_linux_syscall_h -#define included_linux_syscall_h - -#ifndef __NR_memfd_create -#if defined __x86_64__ -#define __NR_memfd_create 319 -#elif defined __arm__ -#define __NR_memfd_create 385 -#elif defined __aarch64__ -#define __NR_memfd_create 279 -#else -#error "__NR_memfd_create unknown for this architecture" -#endif -#endif - -static inline int -memfd_create (const char *name, unsigned int flags) -{ - return syscall (__NR_memfd_create, name, flags); -} - -#ifndef F_LINUX_SPECIFIC_BASE -#define F_LINUX_SPECIFIC_BASE 1024 -#endif -#define MFD_ALLOW_SEALING 0x0002U -#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) - -#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -#define F_SEAL_GROW 0x0004 /* prevent file from growing */ -#define F_SEAL_WRITE 0x0008 /* prevent writes */ - - -#endif /* included_linux_syscall_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/linux/sysfs.c b/src/vlib/linux/sysfs.c deleted file mode 100644 index f92f9ef5..00000000 --- a/src/vlib/linux/sysfs.c +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Copyright (c) 2017 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include -#include -#include -#include - -clib_error_t * -vlib_sysfs_write (char *file_name, char *fmt, ...) -{ - u8 *s; - int fd; - clib_error_t *error = 0; - - fd = open (file_name, O_WRONLY); - if (fd < 0) - return clib_error_return_unix (0, "open `%s'", file_name); - - va_list va; - va_start (va, fmt); - s = va_format (0, fmt, &va); - va_end (va); - - if (write (fd, s, vec_len (s)) < 0) - error = clib_error_return_unix (0, "write `%s'", file_name); - - vec_free (s); - close (fd); - return error; -} - -clib_error_t * -vlib_sysfs_read (char *file_name, char *fmt, ...) -{ - unformat_input_t input; - u8 *s = 0; - int fd; - ssize_t sz; - uword result; - - fd = open (file_name, O_RDONLY); - if (fd < 0) - return clib_error_return_unix (0, "open `%s'", file_name); - - vec_validate (s, 4095); - - sz = read (fd, s, vec_len (s)); - if (sz < 0) - { - close (fd); - vec_free (s); - return clib_error_return_unix (0, "read `%s'", file_name); - } - - _vec_len (s) = sz; - unformat_init_vector (&input, s); - - va_list va; - va_start (va, fmt); - result = va_unformat (&input, fmt, &va); - va_end (va); - - vec_free (s); - close (fd); - - if (result == 0) - return clib_error_return (0, "unformat error"); - - return 0; -} - -u8 * -vlib_sysfs_link_to_name (char *link) -{ - char *p, buffer[64]; - unformat_input_t in; - u8 *s = 0; - int r; - - r = readlink (link, buffer, sizeof (buffer) - 1); - - if (r < 0) - return 0; - - buffer[r] = 0; - p = strrchr (buffer, '/'); - - if (!p) - return 0; - - unformat_init_string (&in, p + 1, strlen (p + 1)); - if (unformat (&in, "%s", &s) != 1) - clib_unix_warning ("no string?"); - unformat_free (&in); - - return s; -} - -clib_error_t * -vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr) -{ - clib_error_t *error = 0; - struct stat sb; - u8 *p = 0; - - p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); - - if (stat ((char *) p, &sb) == 0) - { - if (S_ISDIR (sb.st_mode) == 0) - { - 
error = clib_error_return (0, "'%s' is not directory", p); - goto done; - } - } - else if (numa_node == 0) - { - vec_reset_length (p); - p = format (p, "/sys/kernel/mm%c", 0); - if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' does not exist or it is not " - "directory", p); - goto done; - } - } - else - { - error = clib_error_return (0, "'%s' does not exist", p); - goto done; - } - - _vec_len (p) -= 1; - p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0); - vlib_sysfs_write ((char *) p, "%d", nr); - -done: - vec_free (p); - return error; -} - - -static clib_error_t * -vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node, - int page_size, int *val) -{ - clib_error_t *error = 0; - struct stat sb; - u8 *p = 0; - - p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); - - if (stat ((char *) p, &sb) == 0) - { - if (S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' is not directory", p); - goto done; - } - } - else if (numa_node == 0) - { - vec_reset_length (p); - p = format (p, "/sys/kernel/mm%c", 0); - if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' does not exist or it is not " - "directory", p); - goto done; - } - } - else - { - error = clib_error_return (0, "'%s' does not exist", p); - goto done; - } - - _vec_len (p) -= 1; - p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size, - type, 0); - error = vlib_sysfs_read ((char *) p, "%d", val); - -done: - vec_free (p); - return error; -} - -clib_error_t * -vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_get_surplus_hugepages 
(unsigned int numa_node, int page_size, - int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr) -{ - clib_error_t *error = 0; - int n, needed; - error = vlib_sysfs_get_free_hugepages (numa_node, page_size, &n); - if (error) - return error; - needed = nr - n; - if (needed <= 0) - return 0; - - error = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &n); - if (error) - return error; - clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u", - needed, page_size, numa_node); - return vlib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed); -} - - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/linux/sysfs.h b/src/vlib/linux/sysfs.h deleted file mode 100644 index 14b71317..00000000 --- a/src/vlib/linux/sysfs.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2017 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef included_linux_sysfs_h -#define included_linux_sysfs_h - -clib_error_t *vlib_sysfs_write (char *file_name, char *fmt, ...); - -clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...); - -u8 *vlib_sysfs_link_to_name (char *link); - -clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node, - int page_size, int nr); -clib_error_t *vlib_sysfs_get_nr_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node, - int page_size, int nr); - -#endif /* included_linux_sysfs_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/threads.c b/src/vlib/threads.c index 2d9ce84a..f9c7043c 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -289,7 +289,7 @@ sort_registrations_by_no_clone (void *a0, void *a1) } static uword * -vlib_sysfs_list_to_bitmap (char *filename) +clib_sysfs_list_to_bitmap (char *filename) { FILE *fp; uword *r = 0; @@ -331,9 +331,9 @@ vlib_thread_init (vlib_main_t * vm) /* get bitmaps of active cpu cores and sockets */ tm->cpu_core_bitmap = - vlib_sysfs_list_to_bitmap ("/sys/devices/system/cpu/online"); + clib_sysfs_list_to_bitmap ("/sys/devices/system/cpu/online"); tm->cpu_socket_bitmap = - vlib_sysfs_list_to_bitmap ("/sys/devices/system/node/online"); + clib_sysfs_list_to_bitmap ("/sys/devices/system/node/online"); avail_cpu = clib_bitmap_dup (tm->cpu_core_bitmap); diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c index f8d5d8f9..02bdea5c 100644 --- a/src/vlib/threads_cli.c +++ b/src/vlib/threads_cli.c @@ -15,10 +15,10 @@ #define _GNU_SOURCE #include +#include #include #include -#include #include static u8 * @@ -98,14 +98,14 @@ show_threads_fn 
(vlib_main_t * vm, u8 *p = 0; p = format (p, "%s%u/topology/core_id%c", sys_cpu_path, lcore, 0); - vlib_sysfs_read ((char *) p, "%d", &core_id); + clib_sysfs_read ((char *) p, "%d", &core_id); vec_reset_length (p); p = format (p, "%s%u/topology/physical_package_id%c", sys_cpu_path, lcore, 0); - vlib_sysfs_read ((char *) p, "%d", &socket_id); + clib_sysfs_read ((char *) p, "%d", &socket_id); vec_free (p); line = format (line, "%-7u%-7u%-7u%", lcore, core_id, socket_id); diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c index 62bb228f..32696014 100644 --- a/src/vnet/devices/af_packet/af_packet.c +++ b/src/vnet/devices/af_packet/af_packet.c @@ -24,9 +24,9 @@ #include #include +#include #include #include -#include #include #include @@ -75,7 +75,7 @@ af_packet_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, { s = format (0, "/sys/class/net/%s/mtu%c", apif->host_if_name, 0); - error = vlib_sysfs_write ((char *) s, "%d", hi->max_packet_bytes); + error = clib_sysfs_write ((char *) s, "%d", hi->max_packet_bytes); vec_free (s); if (error) diff --git a/src/vppinfra.am b/src/vppinfra.am index a5769a0d..daca9954 100644 --- a/src/vppinfra.am +++ b/src/vppinfra.am @@ -188,6 +188,8 @@ nobase_include_HEADERS = \ vppinfra/graph.h \ vppinfra/hash.h \ vppinfra/heap.h \ + vppinfra/linux/sysfs.h \ + vppinfra/linux/syscall.h \ vppinfra/lock.h \ vppinfra/longjmp.h \ vppinfra/macros.h \ @@ -233,7 +235,6 @@ nobase_include_HEADERS = \ vppinfra/vector_neon.h \ vppinfra/vector_sse2.h \ vppinfra/valgrind.h \ - vppinfra/vm_unix.h \ vppinfra/xxhash.h \ vppinfra/xy.h \ vppinfra/zvec.h @@ -291,6 +292,8 @@ CLIB_CORE = \ libvppinfra_la_SOURCES = \ $(CLIB_CORE) \ vppinfra/elf_clib.c \ + vppinfra/linux/mem.c \ + vppinfra/linux/sysfs.c \ vppinfra/socket.c \ vppinfra/timer.c \ vppinfra/unix-formats.c \ diff --git a/src/vppinfra/linux/mem.c b/src/vppinfra/linux/mem.c new file mode 100644 index 00000000..665ddf61 --- /dev/null +++ 
b/src/vppinfra/linux/mem.c @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif + +#ifndef F_ADD_SEALS +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +#endif + +int +clib_mem_vm_get_log2_page_size (int fd) +{ + struct stat st = { 0 }; + if (fstat (fd, &st)) + return 0; + return min_log2 (st.st_blksize); +} + +clib_error_t * +clib_mem_vm_ext_alloc (clib_mem_vm_alloc_t * a) +{ + int fd = -1; + clib_error_t *err = 0; + void *addr = 0; + u8 *filename = 0; + int mmap_flags = MAP_SHARED; + int log2_page_size; + int n_pages; + int old_mpol = -1; + u64 old_mask[16] = { 0 }; + + /* save old numa mem policy if needed */ + if (a->flags & (CLIB_MEM_VM_F_NUMA_PREFER | CLIB_MEM_VM_F_NUMA_FORCE)) + { + int rv; + rv = + get_mempolicy (&old_mpol, old_mask, sizeof (old_mask) * 8 + 1, 0, 0); + + if (rv == -1) + { + if ((a->flags & CLIB_MEM_VM_F_NUMA_FORCE) != 0) + { 
+ err = clib_error_return_unix (0, "get_mempolicy"); + goto error; + } + else + old_mpol = -1; + } + } + + /* if we are creating shared segment, we need file descriptor */ + if (a->flags & CLIB_MEM_VM_F_SHARED) + { + /* if hugepages are needed we need to create mount point */ + if (a->flags & CLIB_MEM_VM_F_HUGETLB) + { + char *mount_dir; + char template[] = "/tmp/hugepage_mount.XXXXXX"; + + mount_dir = mkdtemp (template); + if (mount_dir == 0) + return clib_error_return_unix (0, "mkdtemp \'%s\'", template); + + if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL)) + { + err = clib_error_return_unix (0, "mount hugetlb directory '%s'", + mount_dir); + goto error; + } + + filename = format (0, "%s/%s%c", mount_dir, a->name, 0); + + if ((fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1) + { + err = clib_error_return_unix (0, "open"); + goto error; + } + umount2 ((char *) mount_dir, MNT_DETACH); + rmdir ((char *) mount_dir); + mmap_flags |= MAP_LOCKED; + } + else + { + if ((fd = memfd_create (a->name, MFD_ALLOW_SEALING)) == -1) + { + err = clib_error_return_unix (0, "memfd_create"); + goto error; + } + + if ((fcntl (fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) + { + err = clib_error_return_unix (0, "fcntl (F_ADD_SEALS)"); + goto error; + } + } + log2_page_size = clib_mem_vm_get_log2_page_size (fd); + } + else /* not CLIB_MEM_VM_F_SHARED */ + { + if (a->flags & CLIB_MEM_VM_F_HUGETLB) + { + mmap_flags |= MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS; + log2_page_size = 21; + } + else + { + mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS; + log2_page_size = min_log2 (sysconf (_SC_PAGESIZE)); + } + } + + n_pages = ((a->size - 1) >> log2_page_size) + 1; + + + if (a->flags & CLIB_MEM_VM_F_HUGETLB_PREALLOC) + { + err = clib_sysfs_prealloc_hugepages (a->numa_node, + 1 << (log2_page_size - 10), + n_pages); + if (err) + goto error; + + } + + if (fd != -1) + if ((ftruncate (fd, a->size)) == -1) + { + err = clib_error_return_unix (0, "ftruncate"); + goto error; + } + + if 
(old_mpol != -1) + { + int rv; + u64 mask[16] = { 0 }; + mask[0] = 1 << a->numa_node; + rv = set_mempolicy (MPOL_BIND, mask, sizeof (mask) * 8 + 1); + if (rv) + { + err = clib_error_return_unix (0, "set_mempolicy"); + goto error; + } + } + + addr = mmap (0, a->size, (PROT_READ | PROT_WRITE), mmap_flags, fd, 0); + if (addr == MAP_FAILED) + { + err = clib_error_return_unix (0, "mmap"); + goto error; + } + + /* re-apply ole numa memory policy */ + if (old_mpol != -1 && + set_mempolicy (old_mpol, old_mask, sizeof (old_mask) * 8 + 1) == -1) + { + err = clib_error_return_unix (0, "set_mempolicy"); + goto error; + } + + a->log2_page_size = log2_page_size; + a->n_pages = n_pages; + a->addr = addr; + a->fd = fd; + goto done; + +error: + if (fd != -1) + close (fd); + +done: + vec_free (filename); + return err; +} + +u64 * +clib_mem_vm_get_paddr (void *mem, int log2_page_size, int n_pages) +{ + int pagesize = sysconf (_SC_PAGESIZE); + int fd; + int i; + u64 *r = 0; + + if ((fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) + return 0; + + for (i = 0; i < n_pages; i++) + { + u64 seek, pagemap = 0; + uword vaddr = pointer_to_uword (mem) + (((u64) i) << log2_page_size); + seek = ((u64) vaddr / pagesize) * sizeof (u64); + if (lseek (fd, seek, SEEK_SET) != seek) + goto done; + + if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap))) + goto done; + + if ((pagemap & (1ULL << 63)) == 0) + goto done; + + pagemap &= pow2_mask (55); + vec_add1 (r, pagemap * pagesize); + } + +done: + close (fd); + if (vec_len (r) != n_pages) + { + vec_free (r); + return 0; + } + return r; +} + + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/linux/syscall.h b/src/vppinfra/linux/syscall.h new file mode 100644 index 00000000..f8ec5919 --- /dev/null +++ b/src/vppinfra/linux/syscall.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_linux_syscall_h +#define included_linux_syscall_h + +#include +#include + +static inline long +set_mempolicy (int mode, const unsigned long *nodemask, unsigned long maxnode) +{ + return syscall (__NR_set_mempolicy, mode, nodemask, maxnode); +} + +static inline int +get_mempolicy (int *mode, unsigned long *nodemask, unsigned long maxnode, + void *addr, unsigned long flags) +{ + return syscall (__NR_get_mempolicy, mode, nodemask, maxnode, addr, flags); +} + +static inline long +move_pages (int pid, unsigned long count, void **pages, const int *nodes, + int *status, int flags) +{ + return syscall (__NR_move_pages, pid, count, pages, nodes, status, flags); +} + +static inline int +memfd_create (const char *name, unsigned int flags) +{ + return syscall (__NR_memfd_create, name, flags); +} + +#endif /* included_linux_syscall_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/linux/sysfs.c b/src/vppinfra/linux/sysfs.c new file mode 100644 index 00000000..5f611e6a --- /dev/null +++ b/src/vppinfra/linux/sysfs.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include + +clib_error_t * +clib_sysfs_write (char *file_name, char *fmt, ...) +{ + u8 *s; + int fd; + clib_error_t *error = 0; + + fd = open (file_name, O_WRONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + va_list va; + va_start (va, fmt); + s = va_format (0, fmt, &va); + va_end (va); + + if (write (fd, s, vec_len (s)) < 0) + error = clib_error_return_unix (0, "write `%s'", file_name); + + vec_free (s); + close (fd); + return error; +} + +clib_error_t * +clib_sysfs_read (char *file_name, char *fmt, ...) 
+{ + unformat_input_t input; + u8 *s = 0; + int fd; + ssize_t sz; + uword result; + + fd = open (file_name, O_RDONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + vec_validate (s, 4095); + + sz = read (fd, s, vec_len (s)); + if (sz < 0) + { + close (fd); + vec_free (s); + return clib_error_return_unix (0, "read `%s'", file_name); + } + + _vec_len (s) = sz; + unformat_init_vector (&input, s); + + va_list va; + va_start (va, fmt); + result = va_unformat (&input, fmt, &va); + va_end (va); + + vec_free (s); + close (fd); + + if (result == 0) + return clib_error_return (0, "unformat error"); + + return 0; +} + +u8 * +clib_sysfs_link_to_name (char *link) +{ + char *p, buffer[64]; + unformat_input_t in; + u8 *s = 0; + int r; + + r = readlink (link, buffer, sizeof (buffer) - 1); + + if (r < 0) + return 0; + + buffer[r] = 0; + p = strrchr (buffer, '/'); + + if (!p) + return 0; + + unformat_init_string (&in, p + 1, strlen (p + 1)); + if (unformat (&in, "%s", &s) != 1) + clib_unix_warning ("no string?"); + unformat_free (&in); + + return s; +} + +clib_error_t * +clib_sysfs_set_nr_hugepages (int numa_node, int page_size, int nr) +{ + clib_error_t *error = 0; + struct stat sb; + u8 *p = 0; + + p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); + + if (stat ((char *) p, &sb) == 0) + { + if (S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } + } + else if (numa_node == 0) + { + vec_reset_length (p); + p = format (p, "/sys/kernel/mm%c", 0); + if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } + } + else + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } + + _vec_len (p) -= 1; + p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0); + clib_sysfs_write ((char *) p, "%d", nr); + +done: + vec_free (p); + return 
error; +} + + +static clib_error_t * +clib_sysfs_get_xxx_hugepages (char *type, int numa_node, + int page_size, int *val) +{ + clib_error_t *error = 0; + struct stat sb; + u8 *p = 0; + + p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); + + if (stat ((char *) p, &sb) == 0) + { + if (S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } + } + else if (numa_node == 0) + { + vec_reset_length (p); + p = format (p, "/sys/kernel/mm%c", 0); + if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } + } + else + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } + + _vec_len (p) -= 1; + p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size, + type, 0); + error = clib_sysfs_read ((char *) p, "%d", val); + +done: + vec_free (p); + return error; +} + +clib_error_t * +clib_sysfs_get_free_hugepages (int numa_node, int page_size, int *v) +{ + return clib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v); +} + +clib_error_t * +clib_sysfs_get_nr_hugepages (int numa_node, int page_size, int *v) +{ + return clib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v); +} + +clib_error_t * +clib_sysfs_get_surplus_hugepages (int numa_node, int page_size, int *v) +{ + return clib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v); +} + +clib_error_t * +clib_sysfs_prealloc_hugepages (int numa_node, int page_size, int nr) +{ + clib_error_t *error = 0; + int n, needed; + error = clib_sysfs_get_free_hugepages (numa_node, page_size, &n); + if (error) + return error; + needed = nr - n; + if (needed <= 0) + return 0; + + error = clib_sysfs_get_nr_hugepages (numa_node, page_size, &n); + if (error) + return error; + clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u", + needed, page_size, numa_node); + return 
clib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed); +} + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/linux/sysfs.h b/src/vppinfra/linux/sysfs.h new file mode 100644 index 00000000..6c80cf95 --- /dev/null +++ b/src/vppinfra/linux/sysfs.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_linux_sysfs_h +#define included_linux_sysfs_h + +#include + +clib_error_t *clib_sysfs_write (char *file_name, char *fmt, ...); + +clib_error_t *clib_sysfs_read (char *file_name, char *fmt, ...); + +u8 *clib_sysfs_link_to_name (char *link); + +clib_error_t *clib_sysfs_set_nr_hugepages (int numa_node, + int page_size, int nr); +clib_error_t *clib_sysfs_get_nr_hugepages (int numa_node, + int page_size, int *v); +clib_error_t *clib_sysfs_get_free_hugepages (int numa_node, + int page_size, int *v); +clib_error_t *clib_sysfs_get_surplus_hugepages (int numa_node, + int page_size, int *v); +clib_error_t *clib_sysfs_prealloc_hugepages (int numa_node, + int page_size, int nr); + +#endif /* included_linux_sysfs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/mem.h b/src/vppinfra/mem.h index 63c5ac16..69ab8803 100644 --- a/src/vppinfra/mem.h +++ b/src/vppinfra/mem.h @@ -39,8 +39,11 
@@ #define _included_clib_mem_h #include +#include +#include #include /* uword, etc */ +#include #include #include #include /* memcpy, memset */ @@ -264,19 +267,90 @@ void clib_mem_usage (clib_mem_usage_t * usage); u8 *format_clib_mem_usage (u8 * s, va_list * args); -/* Include appropriate VM functions depending on whether - we are compiling for linux kernel, for Unix or standalone. */ -#ifdef CLIB_LINUX_KERNEL -#include -#endif +/* Allocate virtual address space. */ +always_inline void * +clib_mem_vm_alloc (uword size) +{ + void *mmap_addr; + uword flags = MAP_PRIVATE; -#ifdef CLIB_UNIX -#include +#ifdef MAP_ANONYMOUS + flags |= MAP_ANONYMOUS; #endif -#ifdef CLIB_STANDALONE -#include -#endif + mmap_addr = mmap (0, size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (mmap_addr == (void *) -1) + mmap_addr = 0; + + return mmap_addr; +} + +always_inline void +clib_mem_vm_free (void *addr, uword size) +{ + munmap (addr, size); +} + +always_inline void * +clib_mem_vm_unmap (void *addr, uword size) +{ + void *mmap_addr; + uword flags = MAP_PRIVATE | MAP_FIXED; + + /* To unmap we "map" with no protection. If we actually called + munmap then other callers could steal the address space. By + changing to PROT_NONE the kernel can free up the pages which is + really what we want "unmap" to mean. */ + mmap_addr = mmap (addr, size, PROT_NONE, flags, -1, 0); + if (mmap_addr == (void *) -1) + mmap_addr = 0; + + return mmap_addr; +} + +always_inline void * +clib_mem_vm_map (void *addr, uword size) +{ + void *mmap_addr; + uword flags = MAP_PRIVATE | MAP_FIXED; + + mmap_addr = mmap (addr, size, (PROT_READ | PROT_WRITE), flags, -1, 0); + if (mmap_addr == (void *) -1) + mmap_addr = 0; + + return mmap_addr; +} + +typedef struct +{ +#define CLIB_MEM_VM_F_SHARED (1 << 0) +#define CLIB_MEM_VM_F_HUGETLB (1 << 1) +#define CLIB_MEM_VM_F_NUMA_PREFER (1 << 2) +#define CLIB_MEM_VM_F_NUMA_FORCE (1 << 3) +#define CLIB_MEM_VM_F_HUGETLB_PREALLOC (1 << 4) + u32 flags; /**< vm allocation flags: +
CLIB_MEM_VM_F_SHARED: request shared memory, file + descriptor will be provided on successful allocation. +
CLIB_MEM_VM_F_HUGETLB: request hugepages. +
CLIB_MEM_VM_F_NUMA_PREFER: numa_node field contains valid + numa node preference. +
CLIB_MEM_VM_F_NUMA_FORCE: fail if setting numa policy fails. +
CLIB_MEM_VM_F_HUGETLB_PREALLOC: pre-allocate hugepages if + number of available pages is not sufficient. + */ + char *name; /**< Name for memory allocation, set by caller. */ + uword size; /**< Allocation size, set by caller. */ + int numa_node; /**< numa node preference. Valid if CLIB_MEM_VM_F_NUMA_PREFER set. */ + void *addr; /**< Pointer to allocated memory, set on successful allocation. */ + int fd; /**< File descriptor, set on successful allocation if CLIB_MEM_VM_F_SHARED is set. */ + int log2_page_size; /**< Page size in log2 format, set on successful allocation. */ + int n_pages; /**< Number of pages, set on successful allocation. */ +} clib_mem_vm_alloc_t; + +clib_error_t *clib_mem_vm_ext_alloc (clib_mem_vm_alloc_t * a); +int clib_mem_vm_get_log2_page_size (int fd); +u64 *clib_mem_vm_get_paddr (void *mem, int log2_page_size, int n_pages); + #include /* clib_panic */ diff --git a/src/vppinfra/vm_linux_kernel.h b/src/vppinfra/vm_linux_kernel.h deleted file mode 100644 index fd9e6148..00000000 --- a/src/vppinfra/vm_linux_kernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/* - Copyright (c) 2001, 2002, 2003 Eliot Dresselhaus - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#ifndef included_vm_linux_kernel_h -#define included_vm_linux_kernel_h - -#include -#include /* for GFP_* */ -#include /* for PAGE_KERNEL */ - -/* Allocate virtual address space. 
*/ -always_inline void * -clib_mem_vm_alloc (uword size) -{ - return vmalloc (size); -} - -always_inline void -clib_mem_vm_free (void *addr, uword size) -{ - vfree (addr); -} - -always_inline void * -clib_mem_vm_unmap (void *addr, uword size) -{ - return 0; -} - -always_inline void * -clib_mem_vm_map (void *addr, uword size) -{ - return addr; -} - -#endif /* included_vm_linux_kernel_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vppinfra/vm_standalone.h b/src/vppinfra/vm_standalone.h deleted file mode 100644 index 2cd431bc..00000000 --- a/src/vppinfra/vm_standalone.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - Copyright (c) 2001, 2002, 2003 Eliot Dresselhaus - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#ifndef included_vm_standalone_h -#define included_vm_standalone_h - -/* Stubs for standalone "system" which has no VM support. */ - -always_inline void * -clib_mem_vm_alloc (uword size) -{ - return 0; -} - -always_inline void -clib_mem_vm_free (void *addr, uword size) -{ -} - -always_inline void * -clib_mem_vm_unmap (void *addr, uword size) -{ - return 0; -} - -always_inline void * -clib_mem_vm_map (void *addr, uword size) -{ - return addr; -} - -#endif /* included_vm_standalone_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vppinfra/vm_unix.h b/src/vppinfra/vm_unix.h deleted file mode 100644 index 07e86516..00000000 --- a/src/vppinfra/vm_unix.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/* - Copyright (c) 2001, 2002, 2003 Eliot Dresselhaus - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#ifndef included_vm_unix_h -#define included_vm_unix_h - -#include -#include - -/* Allocate virtual address space. */ -always_inline void * -clib_mem_vm_alloc (uword size) -{ - void *mmap_addr; - uword flags = MAP_PRIVATE; - -#ifdef MAP_ANONYMOUS - flags |= MAP_ANONYMOUS; -#endif - - mmap_addr = mmap (0, size, PROT_READ | PROT_WRITE, flags, -1, 0); - if (mmap_addr == (void *) -1) - mmap_addr = 0; - - return mmap_addr; -} - -always_inline void -clib_mem_vm_free (void *addr, uword size) -{ - munmap (addr, size); -} - -always_inline void * -clib_mem_vm_unmap (void *addr, uword size) -{ - void *mmap_addr; - uword flags = MAP_PRIVATE | MAP_FIXED; - - /* To unmap we "map" with no protection. If we actually called - munmap then other callers could steal the address space. 
By - changing to PROT_NONE the kernel can free up the pages which is - really what we want "unmap" to mean. */ - mmap_addr = mmap (addr, size, PROT_NONE, flags, -1, 0); - if (mmap_addr == (void *) -1) - mmap_addr = 0; - - return mmap_addr; -} - -always_inline void * -clib_mem_vm_map (void *addr, uword size) -{ - void *mmap_addr; - uword flags = MAP_PRIVATE | MAP_FIXED; - - mmap_addr = mmap (addr, size, (PROT_READ | PROT_WRITE), flags, -1, 0); - if (mmap_addr == (void *) -1) - mmap_addr = 0; - - return mmap_addr; -} - -#endif /* included_vm_unix_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ -- cgit 1.2.3-korg From 2f9b0c05fca7ca829ea438da1d87e2bf93969500 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Mon, 11 Sep 2017 20:54:15 -0400 Subject: dpdk: cli to check for buffer leakage Use buffer pre_data and existing buffer trace trajectory code to find out dpdk buffer leakages. Change-Id: I26a5d8bd2f23d01cb6070ffc3ddcc6d3d863b575 Signed-off-by: Florin Coras --- src/plugins/dpdk/buffer.c | 63 +++++++++++++++++++++++++++++++++++++++- src/plugins/dpdk/device/cli.c | 53 +++++++++++++++++++++++++++++++++ src/plugins/dpdk/device/device.c | 6 ++++ src/plugins/dpdk/device/dpdk.h | 5 ++++ src/vlib/buffer.c | 2 +- src/vnet/ip/ip4_forward.c | 1 + src/vnet/tcp/tcp_output.c | 10 ++++--- 7 files changed, 134 insertions(+), 6 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index 28af100a..e09d8019 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -340,7 +340,7 @@ vlib_buffer_free_inline (vlib_main_t * vm, vlib_buffer_t *b; b = vlib_get_buffer (vm, buffers[i]); - + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b); fl = vlib_buffer_get_buffer_free_list (vm, b, &fi); /* The only current use of this callback: multicast recycle */ @@ -493,6 +493,67 @@ buffer_state_validation_init (vlib_main_t * vm) VLIB_INIT_FUNCTION 
(buffer_state_validation_init); #endif +#if CLI_DEBUG +struct dpdk_validate_buf_result +{ + u32 invalid; + u32 uninitialized; +}; + +#define DPDK_TRAJECTORY_POISON 31 + +static void +dpdk_buffer_validate_trajectory (struct rte_mempool *mp, void *opaque, + void *obj, unsigned obj_idx) +{ + vlib_buffer_t *b; + struct dpdk_validate_buf_result *counter = opaque; + b = vlib_buffer_from_rte_mbuf ((struct rte_mbuf *) obj); + if (b->pre_data[0] != 0) + { + if (b->pre_data[0] == DPDK_TRAJECTORY_POISON) + counter->uninitialized++; + else + counter->invalid++; + } +} + +int +dpdk_buffer_validate_trajectory_all (u32 * uninitialized) +{ + dpdk_main_t *dm = &dpdk_main; + struct dpdk_validate_buf_result counter = { 0 }; + int i; + + for (i = 0; i < vec_len (dm->pktmbuf_pools); i++) + rte_mempool_obj_iter (dm->pktmbuf_pools[i], + dpdk_buffer_validate_trajectory, &counter); + if (uninitialized) + *uninitialized = counter.uninitialized; + return counter.invalid; +} + +static void +dpdk_buffer_poison_trajectory (struct rte_mempool *mp, void *opaque, + void *obj, unsigned obj_idx) +{ + vlib_buffer_t *b; + b = vlib_buffer_from_rte_mbuf ((struct rte_mbuf *) obj); + b->pre_data[0] = DPDK_TRAJECTORY_POISON; +} + +void +dpdk_buffer_poison_trajectory_all (void) +{ + dpdk_main_t *dm = &dpdk_main; + int i; + + for (i = 0; i < vec_len (dm->pktmbuf_pools); i++) + rte_mempool_obj_iter (dm->pktmbuf_pools[i], dpdk_buffer_poison_trajectory, + 0); +} +#endif + /* *INDENT-OFF* */ VLIB_BUFFER_REGISTER_CALLBACKS (dpdk, static) = { .vlib_buffer_alloc_cb = &dpdk_buffer_alloc, diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c index aeeb772d..c9fcea5c 100644 --- a/src/plugins/dpdk/device/cli.c +++ b/src/plugins/dpdk/device/cli.c @@ -1885,6 +1885,59 @@ VLIB_CLI_COMMAND (show_vpe_version_command, static) = { }; /* *INDENT-ON* */ +#if CLI_DEBUG + +static clib_error_t * +dpdk_validate_buffers_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd_arg) +{ + u32 
n_invalid_bufs = 0, uninitialized = 0; + u32 is_poison = 0, is_test = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "poison")) + is_poison = 1; + else if (unformat (input, "trajectory")) + is_test = 1; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (VLIB_BUFFER_TRACE_TRAJECTORY == 0) + { + vlib_cli_output (vm, "Trajectory not enabled. Recompile with " + "VLIB_BUFFER_TRACE_TRAJECTORY 1"); + return 0; + } + if (is_poison) + { + dpdk_buffer_poison_trajectory_all (); + } + if (is_test) + { + n_invalid_bufs = dpdk_buffer_validate_trajectory_all (&uninitialized); + if (!n_invalid_bufs) + vlib_cli_output (vm, "All buffers are valid %d uninitialized", + uninitialized); + else + vlib_cli_output (vm, "Found %d invalid buffers and %d uninitialized", + n_invalid_bufs, uninitialized); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (test_dpdk_buffers_command, static) = +{ + .path = "test dpdk buffers", + .short_help = "test dpdk buffers [poison] [trajectory]", + .function = dpdk_validate_buffers_fn, +}; +/* *INDENT-ON* */ + +#endif + clib_error_t * dpdk_cli_init (vlib_main_t * vm) { diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 97c13630..aa134327 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -462,6 +462,11 @@ dpdk_interface_tx (vlib_main_t * vm, or_flags = b0->flags | b1->flags | b2->flags | b3->flags; + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b2); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b3); + if (or_flags & VLIB_BUFFER_NEXT_PRESENT) { dpdk_validate_rte_mbuf (vm, b0, 1); @@ -556,6 +561,7 @@ dpdk_interface_tx (vlib_main_t * vm, from++; b0 = vlib_get_buffer (vm, bi0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); dpdk_validate_rte_mbuf (vm, b0, 1); diff --git a/src/plugins/dpdk/device/dpdk.h 
b/src/plugins/dpdk/device/dpdk.h index 849e687b..9762c713 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -467,6 +467,11 @@ admin_up_down_process (vlib_main_t * vm, clib_error_t *dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, unsigned socket_id); +#if CLI_DEBUG +int dpdk_buffer_validate_trajectory_all (u32 * uninitialized); +void dpdk_buffer_poison_trajectory_all (void); +#endif + #endif /* __included_dpdk_h__ */ /* diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index a5ec0e0a..7399b618 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -686,7 +686,7 @@ vlib_buffer_free_inline (vlib_main_t * vm, u32 bi = buffers[i]; b = vlib_get_buffer (vm, bi); - + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b); fl = vlib_buffer_get_buffer_free_list (vm, b, &fi); /* The only current use of this callback: multicast recycle */ diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index b3de1201..c526003c 100755 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -2131,6 +2131,7 @@ ip4_arp_inline (vlib_main_t * vm, vlib_buffer_copy_trace_flag (vm, p0, bi0); b0 = vlib_get_buffer (vm, bi0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0; vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes); diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 9cb3e779..b843c926 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -439,14 +439,16 @@ tcp_init_mss (tcp_connection_t * tc) always_inline int tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers) { + vlib_main_t *vm = vlib_get_main (); u32 current_length = vec_len (tm->tx_buffers[thread_index]); + u32 n_allocated; vec_validate (tm->tx_buffers[thread_index], current_length + n_free_buffers - 1); - _vec_len (tm->tx_buffers[thread_index]) = current_length - + vlib_buffer_alloc (vlib_get_main (), - &tm->tx_buffers[thread_index][current_length], - 
n_free_buffers); + n_allocated = + vlib_buffer_alloc (vm, &tm->tx_buffers[thread_index][current_length], + n_free_buffers); + _vec_len (tm->tx_buffers[thread_index]) = current_length + n_allocated; /* buffer shortage, report failure */ if (vec_len (tm->tx_buffers[thread_index]) == 0) { -- cgit 1.2.3-korg From a0a10830fc9974c21fa260b22323abc798a98049 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 15 Sep 2017 18:22:01 +0200 Subject: dpdk: display EAL init args on startup Change-Id: I83ea1eae73cb41353cacd1c1b910339d10900502 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/init.c | 1 + 1 file changed, 1 insertion(+) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index ee61f94e..4cd8841a 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -1210,6 +1210,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) conf->eal_init_args_str = format (conf->eal_init_args_str, "%s ", conf->eal_init_args[i]); + clib_warning ("EAL init args: %s", conf->eal_init_args_str); ret = rte_eal_init (vec_len (conf->eal_init_args), (char **) conf->eal_init_args); -- cgit 1.2.3-korg From 7e9743aef924093c9c25bdf445637434c190d31a Mon Sep 17 00:00:00 2001 From: John Lo Date: Sat, 23 Sep 2017 08:59:58 -0400 Subject: Fix sending GARP/NA on Bonded Interface Active/Backup Link Up/Down For bonded interface in Active/Backup mode (mode 1), we need to send a GARP/NA packet, if IP address is present, on slave link state change to up or down to help with route convergence. The callback from DPDK happens in a separate thread so we need to make sure RPC call is used to signal the send_garp_na process in the main thread. Also need to fix DPDK polling so the slave links are not polled. 
Change-Id: If5fd8ea2d28c54dd28726ac403ad366386ce9651 Signed-off-by: John Lo --- src/plugins/dpdk/device/common.c | 94 ++++++++++++++++++++++++++++++---------- src/plugins/dpdk/device/node.c | 2 + src/vlibmemory/memory_vlib.c | 30 +++++++++++-- src/vnet/ethernet/arp.c | 38 +--------------- src/vnet/ethernet/arp_packet.h | 9 +--- 5 files changed, 101 insertions(+), 72 deletions(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/common.c b/src/plugins/dpdk/device/common.c index 2707b4d8..aedc3f52 100644 --- a/src/plugins/dpdk/device/common.c +++ b/src/plugins/dpdk/device/common.c @@ -181,12 +181,69 @@ dpdk_device_stop (dpdk_device_t * xd) } } +/* Even type for send_garp_na_process */ +enum +{ + SEND_GARP_NA = 1, +} dpdk_send_garp_na_process_event_t; + +static vlib_node_registration_t send_garp_na_proc_node; + +static uword +send_garp_na_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + vnet_main_t *vnm = vnet_get_main (); + uword event_type, *event_data = 0; + + while (1) + { + u32 i; + uword dpdk_port; + vlib_process_wait_for_event (vm); + event_type = vlib_process_get_events (vm, &event_data); + ASSERT (event_type == SEND_GARP_NA); + for (i = 0; i < vec_len (event_data); i++) + { + dpdk_port = event_data[i]; + if (i < 5) /* wait 0.2 sec for link to settle, max total 1 sec */ + vlib_process_suspend (vm, 0.2); + dpdk_device_t *xd = &dpdk_main.devices[dpdk_port]; + u32 hw_if_index = xd->hw_if_index; + vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index); + dpdk_update_link_state (xd, vlib_time_now (vm)); + send_ip4_garp (vm, hi); + send_ip6_na (vm, hi); + } + vec_reset_length (event_data); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (send_garp_na_proc_node, static) = { + .function = send_garp_na_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "send-garp-na-process", +}; +/* *INDENT-ON* */ + +void vl_api_force_rpc_call_main_thread (void *fp, u8 * data, u32 data_length); + +static 
void +garp_na_proc_callback (uword * dpdk_port) +{ + vlib_main_t *vm = vlib_get_main (); + ASSERT (vlib_get_thread_index () == 0); + vlib_process_signal_event + (vm, send_garp_na_proc_node.index, SEND_GARP_NA, *dpdk_port); +} + always_inline int dpdk_port_state_callback_inline (uint8_t port_id, enum rte_eth_event_type type, void *param) { struct rte_eth_link link; - vlib_main_t *vm = vlib_get_main (); dpdk_device_t *xd = &dpdk_main.devices[port_id]; RTE_SET_USED (param); @@ -201,32 +258,21 @@ dpdk_port_state_callback_inline (uint8_t port_id, if (xd->flags & DPDK_DEVICE_FLAG_BOND_SLAVE) { - u8 bd_port = xd->bond_port; + uword bd_port = xd->bond_port; int bd_mode = rte_eth_bond_mode_get (bd_port); - - if ((link_up && !(xd->flags & DPDK_DEVICE_FLAG_BOND_SLAVE_UP)) || - (!link_up && (xd->flags & DPDK_DEVICE_FLAG_BOND_SLAVE_UP))) +#if 0 + clib_warning ("Port %d state to %s, " + "slave of port %d BondEthernet%d in mode %d", + port_id, (link_up) ? "UP" : "DOWN", + bd_port, xd->port_id, bd_mode); +#endif + if (bd_mode == BONDING_MODE_ACTIVE_BACKUP) { - clib_warning ("Port %d state to %s, " - "slave of port %d BondEthernet%d in mode %d", - port_id, (link_up) ? "UP" : "DOWN", - bd_port, xd->port_id, bd_mode); - if (bd_mode == BONDING_MODE_ACTIVE_BACKUP) - { - rte_eth_link_get_nowait (bd_port, &link); - if (link.link_status) /* bonded interface up */ - { - u32 hw_if_index = dpdk_main.devices[bd_port].hw_if_index; - vlib_process_signal_event - (vm, send_garp_na_process_node_index, SEND_GARP_NA, - hw_if_index); - } - } + vl_api_force_rpc_call_main_thread + (garp_na_proc_callback, (u8 *) & bd_port, sizeof (uword)); } - if (link_up) /* Update slave link status */ - xd->flags |= DPDK_DEVICE_FLAG_BOND_SLAVE_UP; - else - xd->flags &= ~DPDK_DEVICE_FLAG_BOND_SLAVE_UP; + xd->flags |= link_up ? 
+ DPDK_DEVICE_FLAG_BOND_SLAVE_UP : ~DPDK_DEVICE_FLAG_BOND_SLAVE_UP; } else /* Should not happen as callback not setup for "normal" links */ { diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index 74fb8da1..cf8b9699 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -661,6 +661,8 @@ dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) foreach_device_and_queue (dq, rt->devices_and_queues) { xd = vec_elt_at_index(dm->devices, dq->dev_instance); + if (PREDICT_FALSE (xd->flags & DPDK_DEVICE_FLAG_BOND_SLAVE)) + continue; /* Do not poll slave to a bonded interface */ if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG) n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 1); else diff --git a/src/vlibmemory/memory_vlib.c b/src/vlibmemory/memory_vlib.c index b6b87529..77959e6d 100644 --- a/src/vlibmemory/memory_vlib.c +++ b/src/vlibmemory/memory_vlib.c @@ -1452,8 +1452,9 @@ vl_api_rpc_call_reply_t_handler (vl_api_rpc_call_reply_t * mp) clib_warning ("unimplemented"); } -void -vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length) +always_inline void +vl_api_rpc_call_main_thread_inline (void *fp, u8 * data, u32 data_length, + u8 force_rpc) { vl_api_rpc_call_t *mp; api_main_t *am = &api_main; @@ -1461,7 +1462,7 @@ vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length) unix_shared_memory_queue_t *q; /* Main thread: call the function directly */ - if (vlib_get_thread_index () == 0) + if ((force_rpc == 0) && (vlib_get_thread_index () == 0)) { vlib_main_t *vm = vlib_get_main (); void (*call_fp) (void *); @@ -1507,6 +1508,29 @@ vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length) pthread_mutex_unlock (&q->mutex); } +/* + * Check if called from worker threads. + * If so, make rpc call of fp through shmem. 
+ * Otherwise, call fp directly + */ +void +vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length) +{ + vl_api_rpc_call_main_thread_inline (fp, data, data_length, /*force_rpc */ + 0); +} + +/* + * Always make rpc call of fp through shmem, useful for calling from threads + * not setup as worker threads, such as DPDK callback thread + */ +void +vl_api_force_rpc_call_main_thread (void *fp, u8 * data, u32 data_length) +{ + vl_api_rpc_call_main_thread_inline (fp, data, data_length, /*force_rpc */ + 1); +} + static void vl_api_trace_plugin_msg_ids_t_handler (vl_api_trace_plugin_msg_ids_t * mp) { diff --git a/src/vnet/ethernet/arp.c b/src/vnet/ethernet/arp.c index e974d255..120a276c 100644 --- a/src/vnet/ethernet/arp.c +++ b/src/vnet/ethernet/arp.c @@ -2482,7 +2482,7 @@ ethernet_arp_change_mac (u32 sw_if_index) /* *INDENT-ON* */ } -void static +void send_ip4_garp (vlib_main_t * vm, vnet_hw_interface_t * hi) { ip4_main_t *i4m = &ip4_main; @@ -2526,42 +2526,6 @@ send_ip4_garp (vlib_main_t * vm, vnet_hw_interface_t * hi) } } -static vlib_node_registration_t send_garp_na_proc_node; - -static uword -send_garp_na_process (vlib_main_t * vm, - vlib_node_runtime_t * rt, vlib_frame_t * f) -{ - vnet_main_t *vnm = vnet_get_main (); - uword event_type, *event_data = 0; - - send_garp_na_process_node_index = send_garp_na_proc_node.index; - - while (1) - { - vlib_process_wait_for_event (vm); - event_type = vlib_process_get_events (vm, &event_data); - if ((event_type == SEND_GARP_NA) && (vec_len (event_data) >= 1)) - { - u32 hw_if_index = event_data[0]; - vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index); - send_ip4_garp (vm, hi); - send_ip6_na (vm, hi); - } - vec_reset_length (event_data); - } - return 0; -} - - -/* *INDENT-OFF* */ -VLIB_REGISTER_NODE (send_garp_na_proc_node, static) = { - .function = send_garp_na_process, - .type = VLIB_NODE_TYPE_PROCESS, - .name = "send-garp-na-process", -}; -/* *INDENT-ON* */ - /* * fd.io coding-style-patch-verification: 
ON * diff --git a/src/vnet/ethernet/arp_packet.h b/src/vnet/ethernet/arp_packet.h index d740b844..661f33f9 100644 --- a/src/vnet/ethernet/arp_packet.h +++ b/src/vnet/ethernet/arp_packet.h @@ -167,14 +167,7 @@ typedef struct ethernet_arp_ip4_entry_t *ip4_neighbor_entries (u32 sw_if_index); u8 *format_ethernet_arp_ip4_entry (u8 * s, va_list * va); -/* Node index for send_garp_na_process */ -extern u32 send_garp_na_process_node_index; - -/* Even type for send_garp_na_process */ -enum -{ - SEND_GARP_NA = 1, -} dpdk_send_garp_na_process_event_t; +void send_ip4_garp (vlib_main_t * vm, vnet_hw_interface_t * hi); #endif /* included_ethernet_arp_packet_h */ -- cgit 1.2.3-korg From 45c40dbb2c1fcafce17d5c901c9b317cde4b5cde Mon Sep 17 00:00:00 2001 From: Steve Shin Date: Tue, 26 Sep 2017 10:07:58 -0700 Subject: Fix: unnecessary uio binding for Mellanox NIC UIO binding is not required for Mellanox NIC and calling vlib_pci_bind_to_uio() should be skipped. Change-Id: I10ea457bc3c8d4be8117dec51d5bd940ee416a44 Signed-off-by: Steve Shin --- src/plugins/dpdk/device/init.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'src/plugins/dpdk/device') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 4cd8841a..acf712ff 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -692,7 +692,9 @@ dpdk_bind_devices_to_uio (dpdk_config_main_t * conf) ; /* Mellanox */ else if (d->vendor_id == 0x15b3 && d->device_id >= 0x1013 && d->device_id <= 0x101a) - ; + { + continue; + } else { clib_warning ("Unsupported PCI device 0x%04x:0x%04x found " -- cgit 1.2.3-korg