diff options
Diffstat (limited to 'vnet/vnet/devices/dpdk')
-rw-r--r-- | vnet/vnet/devices/dpdk/cli.c | 974 | ||||
-rw-r--r-- | vnet/vnet/devices/dpdk/device.c | 1483 | ||||
-rw-r--r-- | vnet/vnet/devices/dpdk/dpdk.h | 515 | ||||
-rw-r--r-- | vnet/vnet/devices/dpdk/dpdk_priv.h | 437 | ||||
-rw-r--r-- | vnet/vnet/devices/dpdk/init.c | 1728 | ||||
-rw-r--r-- | vnet/vnet/devices/dpdk/node.c | 2010 | ||||
-rw-r--r-- | vnet/vnet/devices/dpdk/threads.c | 378 | ||||
-rw-r--r-- | vnet/vnet/devices/dpdk/threads.h | 30 | ||||
-rw-r--r-- | vnet/vnet/devices/dpdk/vhost_user.c | 1550 |
9 files changed, 9105 insertions, 0 deletions
diff --git a/vnet/vnet/devices/dpdk/cli.c b/vnet/vnet/devices/dpdk/cli.c new file mode 100644 index 00000000000..c27dbfabfc0 --- /dev/null +++ b/vnet/vnet/devices/dpdk/cli.c @@ -0,0 +1,974 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vnet/vnet.h> +#include <vppinfra/vec.h> +#include <vppinfra/error.h> +#include <vppinfra/format.h> +#include <vppinfra/xxhash.h> + +#include <vnet/ethernet/ethernet.h> +#include <vnet/devices/dpdk/dpdk.h> +#include <vnet/classify/vnet_classify.h> +#include <vnet/mpls-gre/packet.h> + +#include "dpdk_priv.h" + +frame_queue_trace_t *frame_queue_traces; + +static clib_error_t * +pcap_trace_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + dpdk_main_t * dm = &dpdk_main; + u8 * filename; + u32 max; + int matched = 0; + clib_error_t * error = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "on")) + { + if (dm->tx_pcap_enable == 0) + { + if (dm->pcap_filename == 0) + dm->pcap_filename = format (0, "/tmp/vpe.pcap%c", 0); + + memset (&dm->pcap_main, 0, sizeof (dm->pcap_main)); + dm->pcap_main.file_name = (char *) dm->pcap_filename; + dm->pcap_main.n_packets_to_capture = 100; + if (dm->pcap_pkts_to_capture) + dm->pcap_main.n_packets_to_capture = dm->pcap_pkts_to_capture; + + dm->pcap_main.packet_type = PCAP_PACKET_TYPE_ethernet; + dm->tx_pcap_enable = 1; + matched = 1; + 
vlib_cli_output (vm, "pcap tx capture on..."); + } + else + { + vlib_cli_output (vm, "pcap tx capture already on..."); + } + matched = 1; + } + else if (unformat (input, "off")) + { + if (dm->tx_pcap_enable) + { + vlib_cli_output (vm, "captured %d pkts...", + dm->pcap_main.n_packets_captured+1); + if (dm->pcap_main.n_packets_captured) + { + dm->pcap_main.n_packets_to_capture = + dm->pcap_main.n_packets_captured; + error = pcap_write (&dm->pcap_main); + if (error) + clib_error_report (error); + else + vlib_cli_output (vm, "saved to %s...", dm->pcap_filename); + } + } + else + { + vlib_cli_output (vm, "pcap tx capture already off..."); + } + + dm->tx_pcap_enable = 0; + matched = 1; + } + else if (unformat (input, "max %d", &max)) + { + dm->pcap_pkts_to_capture = max; + matched = 1; + } + + else if (unformat (input, "intfc %U", + unformat_vnet_sw_interface, dm->vnet_main, + &dm->pcap_sw_if_index)) + matched = 1; + else if (unformat (input, "intfc any")) + { + dm->pcap_sw_if_index = 0; + matched = 1; + } + else if (unformat (input, "file %s", &filename)) + { + u8 * chroot_filename; + /* Brain-police user path input */ + if (strstr((char *)filename, "..") || index((char *)filename, '/')) + { + vlib_cli_output (vm, "illegal characters in filename '%s'", + filename); + continue; + } + + chroot_filename = format (0, "/tmp/%s%c", filename, 0); + vec_free (filename); + + if (dm->pcap_filename) + vec_free (dm->pcap_filename); + vec_add1 (filename, 0); + dm->pcap_filename = chroot_filename; + matched = 1; + } + else if (unformat (input, "status")) + { + if (dm->tx_pcap_enable == 0) + { + vlib_cli_output (vm, "pcap tx capture is off..."); + continue; + } + + vlib_cli_output (vm, "pcap tx capture: %d of %d pkts...", + dm->pcap_main.n_packets_captured, + dm->pcap_main.n_packets_to_capture); + matched = 1; + } + + else + break; + } + + if (matched == 0) + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + + return 0; +} + +VLIB_CLI_COMMAND 
(pcap_trace_command, static) = { + .path = "pcap tx trace", + .short_help = + "pcap tx trace on off max <nn> intfc <intfc> file <name> status", + .function = pcap_trace_command_fn, +}; + + +static clib_error_t * +show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + struct rte_mempool * rmp; + int i; + + for(i = 0; i < vec_len(vm->buffer_main->pktmbuf_pools); i++) + { + rmp = vm->buffer_main->pktmbuf_pools[i]; + if (rmp) + { + unsigned count = rte_mempool_count(rmp); + unsigned free_count = rte_mempool_free_count(rmp); + + vlib_cli_output(vm, "name=\"%s\" available = %7d allocated = %7d total = %7d\n", + rmp->name, (u32)count, (u32)free_count, + (u32)(count+free_count)); + } + else + { + vlib_cli_output(vm, "rte_mempool is NULL (!)\n"); + } + } + return 0; +} + +VLIB_CLI_COMMAND (cmd_show_dpdk_bufferr,static) = { + .path = "show dpdk buffer", + .short_help = "show dpdk buffer state", + .function = show_dpdk_buffer, +}; + +static clib_error_t * +test_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + static u32 * allocated_buffers; + u32 n_alloc = 0; + u32 n_free = 0; + u32 first, actual_alloc; + + while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "allocate %d", &n_alloc)) + ; + else if (unformat (input, "free %d", &n_free)) + ; + else + break; + } + + if (n_free) + { + if (vec_len (allocated_buffers) < n_free) + return clib_error_return (0, "Can't free %d, only %d allocated", + n_free, vec_len (allocated_buffers)); + + first = vec_len(allocated_buffers) - n_free; + vlib_buffer_free (vm, allocated_buffers + first, n_free); + _vec_len (allocated_buffers) = first; + } + if (n_alloc) + { + first = vec_len (allocated_buffers); + vec_validate (allocated_buffers, + vec_len (allocated_buffers) + n_alloc - 1); + + actual_alloc = vlib_buffer_alloc (vm, allocated_buffers + first, + n_alloc); + _vec_len (allocated_buffers) = first + actual_alloc; + + if 
(actual_alloc < n_alloc) + vlib_cli_output (vm, "WARNING: only allocated %d buffers", + actual_alloc); + } + + vlib_cli_output (vm, "Currently %d buffers allocated", + vec_len (allocated_buffers)); + + if (allocated_buffers && vec_len(allocated_buffers) == 0) + vec_free(allocated_buffers); + + return 0; +} + +VLIB_CLI_COMMAND (cmd_test_dpdk_bufferr,static) = { + .path = "test dpdk buffer", + .short_help = "test dpdk buffer [allocate <nn>][free <nn>]", + .function = test_dpdk_buffer, +}; + +static void +show_dpdk_device_stats (vlib_main_t * vm, dpdk_device_t * xd) +{ + vlib_cli_output(vm, + "device_index %d\n" + " last_burst_sz %d\n" + " max_burst_sz %d\n" + " full_frames_cnt %u\n" + " consec_full_frames_cnt %u\n" + " congestion_cnt %d\n" + " last_poll_time %llu\n" + " max_poll_delay %llu\n" + " discard_cnt %u\n" + " total_packet_cnt %u\n", + xd->device_index, + xd->efd_agent.last_burst_sz, + xd->efd_agent.max_burst_sz, + xd->efd_agent.full_frames_cnt, + xd->efd_agent.consec_full_frames_cnt, + xd->efd_agent.congestion_cnt, + xd->efd_agent.last_poll_time, + xd->efd_agent.max_poll_delay, + xd->efd_agent.discard_cnt, + xd->efd_agent.total_packet_cnt); + + u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index, + 0 /* queue_id */); + vlib_cli_output(vm, + " device_queue_sz %u\n", + device_queue_sz); +} + + +/* + * Trigger threads to grab frame queue trace data + */ +static clib_error_t * +trace_frame_queue (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + clib_error_t * error = NULL; + frame_queue_trace_t *fqt; + u32 num_fq; + u32 fqix; + u32 enable = 0; + + if (unformat(input, "on")) { + enable = 1; + } else if (unformat(input, "off")) { + enable = 0; + } else { + return clib_error_return(0, "expecting on or off"); + } + + num_fq = vec_len(vlib_frame_queues); + if (num_fq == 0) + { + vlib_cli_output(vm, "No frame queues exist\n"); + return error; + } + + // Allocate storage for trace if necessary + 
vec_validate_aligned(frame_queue_traces, num_fq-1, CLIB_CACHE_LINE_BYTES); + + for (fqix=0; fqix<num_fq; fqix++) { + fqt = &frame_queue_traces[fqix]; + + memset(fqt->n_vectors, 0xff, sizeof(fqt->n_vectors)); + fqt->written = 0; + vlib_frame_queues[fqix]->trace = enable; + } + return error; +} + +VLIB_CLI_COMMAND (cmd_trace_frame_queue,static) = { + .path = "trace frame-queue", + .short_help = "trace frame-queue (on|off)", + .function = trace_frame_queue, +}; + + +/* + * Display frame queue trace data gathered by threads. + */ +static clib_error_t * +show_frame_queue (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + clib_error_t * error = NULL; + frame_queue_trace_t *fqt; + u32 num_fq; + u32 fqix; + + num_fq = vec_len(frame_queue_traces); + if (num_fq == 0) + { + vlib_cli_output(vm, "No trace data for frame queues\n"); + return error; + } + + for (fqix=0; fqix<num_fq; fqix++) { + fqt = &frame_queue_traces[fqix]; + + vlib_cli_output(vm, "Thread %d %v\n", fqix, vlib_worker_threads[fqix].name); + + if (fqt->written == 0) + { + vlib_cli_output(vm, " no trace data\n"); + continue; + } + + vlib_cli_output(vm, " vector-threshold %d ring size %d in use %d\n", + fqt->threshold, fqt->nelts, fqt->n_in_use); + vlib_cli_output(vm, " head %12d head_hint %12d tail %12d\n", + fqt->head, fqt->head_hint, fqt->tail); + vlib_cli_output(vm, " %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d\n", + fqt->n_vectors[0], fqt->n_vectors[1], fqt->n_vectors[2], fqt->n_vectors[3], + fqt->n_vectors[4], fqt->n_vectors[5], fqt->n_vectors[6], fqt->n_vectors[7], + fqt->n_vectors[8], fqt->n_vectors[9], fqt->n_vectors[10], fqt->n_vectors[11], + fqt->n_vectors[12], fqt->n_vectors[13], fqt->n_vectors[14], fqt->n_vectors[15]); + + if (fqt->nelts > 16) + { + vlib_cli_output(vm, " %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d\n", + fqt->n_vectors[16], fqt->n_vectors[17], fqt->n_vectors[18], fqt->n_vectors[19], + fqt->n_vectors[20], fqt->n_vectors[21], 
fqt->n_vectors[22], fqt->n_vectors[23], + fqt->n_vectors[24], fqt->n_vectors[25], fqt->n_vectors[26], fqt->n_vectors[27], + fqt->n_vectors[28], fqt->n_vectors[29], fqt->n_vectors[30], fqt->n_vectors[31]); + } + } + return error; +} + +VLIB_CLI_COMMAND (cmd_show_frame_queue,static) = { + .path = "show frame-queue", + .short_help = "show frame-queue trace", + .function = show_frame_queue, +}; + + +/* + * Modify the number of elements on the frame_queues + */ +static clib_error_t * +test_frame_queue_nelts (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + clib_error_t * error = NULL; + u32 num_fq; + u32 fqix; + u32 nelts = 0; + + unformat(input, "%d", &nelts); + if ((nelts != 4) && (nelts != 8) && (nelts != 16) && (nelts != 32)) { + return clib_error_return(0, "expecting 4,8,16,32"); + } + + num_fq = vec_len(vlib_frame_queues); + if (num_fq == 0) + { + vlib_cli_output(vm, "No frame queues exist\n"); + return error; + } + + for (fqix=0; fqix<num_fq; fqix++) { + vlib_frame_queues[fqix]->nelts = nelts; + } + + return error; +} + +VLIB_CLI_COMMAND (cmd_test_frame_queue_nelts,static) = { + .path = "test frame-queue nelts", + .short_help = "test frame-queue nelts (4,8,16,32)", + .function = test_frame_queue_nelts, +}; + + +/* + * Modify the max number of packets pulled off the frame queues + */ +static clib_error_t * +test_frame_queue_threshold (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + clib_error_t * error = NULL; + u32 num_fq; + u32 fqix; + u32 threshold = 0; + + if (unformat(input, "%d", &threshold)) { + } else { + vlib_cli_output(vm, "expecting threshold value\n"); + return error; + } + + if (threshold == 0) + threshold = ~0; + + num_fq = vec_len(vlib_frame_queues); + if (num_fq == 0) + { + vlib_cli_output(vm, "No frame queues exist\n"); + return error; + } + + for (fqix=0; fqix<num_fq; fqix++) { + vlib_frame_queues[fqix]->vector_threshold = threshold; + } + + return error; +} + +VLIB_CLI_COMMAND 
(cmd_test_frame_queue_threshold,static) = { + .path = "test frame-queue threshold", + .short_help = "test frame-queue threshold N (0=no limit)", + .function = test_frame_queue_threshold, +}; + +static void +show_efd_config (vlib_main_t * vm) +{ + vlib_thread_main_t * tm = vlib_get_thread_main(); + dpdk_main_t * dm = &dpdk_main; + + vlib_cli_output(vm, + "dpdk: (0x%04x) enabled:%d monitor:%d drop_all:%d\n" + " dpdk_queue_hi_thresh %d\n" + " consec_full_frames_hi_thresh %d\n" + "---------\n" + "worker: (0x%04x) enabled:%d monitor:%d\n" + " worker_queue_hi_thresh %d\n", + dm->efd.enabled, + ((dm->efd.enabled & DPDK_EFD_DISCARD_ENABLED) ? 1:0), + ((dm->efd.enabled & DPDK_EFD_MONITOR_ENABLED) ? 1:0), + ((dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED) ? 1:0), + dm->efd.queue_hi_thresh, + dm->efd.consec_full_frames_hi_thresh, + tm->efd.enabled, + ((tm->efd.enabled & VLIB_EFD_DISCARD_ENABLED) ? 1:0), + ((dm->efd.enabled & VLIB_EFD_MONITOR_ENABLED) ? 1:0), + tm->efd.queue_hi_thresh); + vlib_cli_output(vm, + "---------\n" + "ip_prec_bitmap 0x%02x\n" + "mpls_exp_bitmap 0x%02x\n" + "vlan_cos_bitmap 0x%02x\n", + tm->efd.ip_prec_bitmap, + tm->efd.mpls_exp_bitmap, + tm->efd.vlan_cos_bitmap); +} + +static clib_error_t * +show_efd (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + + if (unformat(input, "config")) { + show_efd_config(vm); + } else if (unformat(input, "dpdk")) { + dpdk_main_t * dm = &dpdk_main; + dpdk_device_t * xd; + u32 device_id = ~0; + + unformat(input, "device %d", &device_id); + vec_foreach (xd, dm->devices) { + if ((xd->device_index == device_id) || (device_id == ~0)) { + show_dpdk_device_stats(vm, xd); + } + } + } else if (unformat(input, "worker")) { + vlib_thread_main_t * tm = vlib_get_thread_main(); + vlib_frame_queue_t *fq; + vlib_thread_registration_t * tr; + int thread_id; + u32 num_workers = 0; + u32 first_worker_index = 0; + uword * p; + + p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + ASSERT (p); + tr = 
(vlib_thread_registration_t *) p[0]; + if (tr) + { + num_workers = tr->count; + first_worker_index = tr->first_index; + } + + vlib_cli_output(vm, + "num_workers %d\n" + "first_worker_index %d\n" + "vlib_frame_queues[%d]:\n", + num_workers, + first_worker_index, + tm->n_vlib_mains); + + for (thread_id = 0; thread_id < tm->n_vlib_mains; thread_id++) { + fq = vlib_frame_queues[thread_id]; + if (fq) { + vlib_cli_output(vm, + "%2d: frames_queued %u\n" + " frames_queued_hint %u\n" + " enqueue_full_events %u\n" + " enqueue_efd_discards %u\n", + thread_id, + (fq->tail - fq->head), + (fq->tail - fq->head_hint), + fq->enqueue_full_events, + fq->enqueue_efd_discards); + } + } + } else if (unformat(input, "help")) { + vlib_cli_output(vm, "Usage: show efd config | " + "dpdk [device <id>] | worker\n"); + } else { + show_efd_config(vm); + } + + return 0; +} + +VLIB_CLI_COMMAND (show_efd_command, static) = { + .path = "show efd", + .short_help = "Show efd [device <id>] | [config]", + .function = show_efd, +}; + +static clib_error_t * +clear_efd (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + dpdk_main_t * dm = &dpdk_main; + dpdk_device_t * xd; + vlib_thread_main_t * tm = vlib_get_thread_main(); + vlib_frame_queue_t *fq; + int thread_id; + + vec_foreach (xd, dm->devices) { + xd->efd_agent.last_burst_sz = 0; + xd->efd_agent.max_burst_sz = 0; + xd->efd_agent.full_frames_cnt = 0; + xd->efd_agent.consec_full_frames_cnt = 0; + xd->efd_agent.congestion_cnt = 0; + xd->efd_agent.last_poll_time = 0; + xd->efd_agent.max_poll_delay = 0; + xd->efd_agent.discard_cnt = 0; + xd->efd_agent.total_packet_cnt = 0; + } + + for (thread_id = 0; thread_id < tm->n_vlib_mains; thread_id++) { + fq = vlib_frame_queues[thread_id]; + if (fq) { + fq->enqueue_full_events = 0; + fq->enqueue_efd_discards = 0; + } + } + + return 0; +} + +VLIB_CLI_COMMAND (clear_efd_command,static) = { + .path = "clear efd", + .short_help = "Clear early-fast-discard counters", + .function = clear_efd, 
+}; + +static clib_error_t * +parse_op_and_prec (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd, + char *prec_type, u8 *prec_bitmap) +{ + clib_error_t * error = NULL; + u8 op = 0; + u8 prec = 0; + + if (unformat(input, "ge")) { + op = EFD_OPERATION_GREATER_OR_EQUAL; + } else if (unformat(input, "lt")) { + op = EFD_OPERATION_LESS_THAN; + } else if (unformat(input, "help")) { + vlib_cli_output(vm, + "enter operation [ge | lt] and precedence <0-7>)"); + return (error); + } else { + return clib_error_return(0, "unknown input `%U'", + format_unformat_error, input); + } + + if (unformat (input, "%u", &prec)) { + if (prec > 7) { + return clib_error_return(0, "precedence %d is out of range <0-7>", + prec); + } + } else { + return clib_error_return(0, "unknown input `%U'", + format_unformat_error, input); + } + + set_efd_bitmap(prec_bitmap, prec, op); + + vlib_cli_output(vm, + "EFD will be set for %s precedence %s%u%s.", + prec_type, + (op == EFD_OPERATION_LESS_THAN) ? "less than " : "", + prec, + (op == EFD_OPERATION_GREATER_OR_EQUAL) ? 
" and greater" : ""); + + return (error); +} + + +static clib_error_t * +set_efd (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + dpdk_main_t * dm = &dpdk_main; + vlib_thread_main_t * tm = vlib_get_thread_main(); + clib_error_t * error = NULL; + + if (unformat(input, "enable")) { + if (unformat(input, "dpdk")) { + dm->efd.enabled |= DPDK_EFD_DISCARD_ENABLED; + } else if (unformat(input, "worker")) { + tm->efd.enabled |= VLIB_EFD_DISCARD_ENABLED; + } else if (unformat(input, "monitor")) { + dm->efd.enabled |= DPDK_EFD_MONITOR_ENABLED; + tm->efd.enabled |= VLIB_EFD_MONITOR_ENABLED; + } else if (unformat(input, "drop_all")) { + dm->efd.enabled |= DPDK_EFD_DROPALL_ENABLED; + } else if (unformat(input, "default")) { + dm->efd.enabled = (DPDK_EFD_DISCARD_ENABLED | + DPDK_EFD_MONITOR_ENABLED); + tm->efd.enabled = (VLIB_EFD_DISCARD_ENABLED | + VLIB_EFD_MONITOR_ENABLED); + } else { + return clib_error_return(0, "Usage: set efd enable [dpdk | " + "worker | monitor | drop_all | default]"); + } + } else if (unformat(input, "disable")) { + if (unformat(input, "dpdk")) { + dm->efd.enabled &= ~DPDK_EFD_DISCARD_ENABLED; + } else if (unformat(input, "worker")) { + tm->efd.enabled &= ~VLIB_EFD_DISCARD_ENABLED; + } else if (unformat(input, "monitor")) { + dm->efd.enabled &= ~DPDK_EFD_MONITOR_ENABLED; + tm->efd.enabled &= ~VLIB_EFD_MONITOR_ENABLED; + } else if (unformat(input, "drop_all")) { + dm->efd.enabled &= ~DPDK_EFD_DROPALL_ENABLED; + } else if (unformat(input, "all")) { + dm->efd.enabled = 0; + tm->efd.enabled = 0; + } else { + return clib_error_return(0, "Usage: set efd disable [dpdk | " + "worker | monitor | drop_all | all]"); + } + } else if (unformat(input, "worker_queue_hi_thresh")) { + u32 mark; + if (unformat (input, "%u", &mark)) { + tm->efd.queue_hi_thresh = mark; + } else { + return clib_error_return(0, "unknown input `%U'", + format_unformat_error, input); + } + } else if (unformat(input, "dpdk_device_hi_thresh")) { + u32 thresh; + if 
(unformat (input, "%u", &thresh)) { + dm->efd.queue_hi_thresh = thresh; + } else { + return clib_error_return(0, "unknown input `%U'", + format_unformat_error, input); + } + } else if (unformat(input, "consec_full_frames_hi_thresh")) { + u32 thresh; + if (unformat (input, "%u", &thresh)) { + dm->efd.consec_full_frames_hi_thresh = thresh; + } else { + return clib_error_return(0, "unknown input `%U'", + format_unformat_error, input); + } + } else if (unformat(input, "ip-prec")) { + return (parse_op_and_prec(vm, input, cmd, + "ip", &tm->efd.ip_prec_bitmap)); + } else if (unformat(input, "mpls-exp")) { + return (parse_op_and_prec(vm, input, cmd, + "mpls", &tm->efd.mpls_exp_bitmap)); + } else if (unformat(input, "vlan-cos")) { + return (parse_op_and_prec(vm, input, cmd, + "vlan", &tm->efd.vlan_cos_bitmap)); + } else if (unformat(input, "help")) { + vlib_cli_output(vm, + "Usage:\n" + " set efd enable <dpdk | worker | monitor | drop_all | default> |\n" + " set efd disable <dpdk | worker | monitor | drop_all | all> |\n" + " set efd <ip-prec | mpls-exp | vlan-cos> <ge | lt> <0-7>\n" + " set efd worker_queue_hi_thresh <0-32> |\n" + " set efd dpdk_device_hi_thresh <0-%d> |\n" + " set efd consec_full_frames_hi_thresh <count> |\n", + DPDK_NB_RX_DESC_10GE); + } else { + return clib_error_return(0, "unknown input `%U'", + format_unformat_error, input); + } + + return error; +} + +VLIB_CLI_COMMAND (cmd_set_efd,static) = { + .path = "set efd", + .short_help = "set early-fast-discard commands", + .function = set_efd, +}; + +static clib_error_t * +set_dpdk_if_desc (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + unformat_input_t _line_input, * line_input = &_line_input; + dpdk_main_t * dm = &dpdk_main; + vnet_hw_interface_t * hw; + dpdk_device_t * xd; + u32 hw_if_index = (u32) ~0; + u32 nb_rx_desc = (u32) ~0; + u32 nb_tx_desc = (u32) ~0; + clib_error_t * rv; + + if (! 
unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { + if (unformat (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "tx %d", &nb_tx_desc)) + ; + else if (unformat (line_input, "rx %d", &nb_rx_desc)) + ; + else + return clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + } + + unformat_free (line_input); + + if (hw_if_index == (u32) ~0) + return clib_error_return (0, "please specify valid interface name"); + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + if (xd->dev_type != VNET_DPDK_DEV_ETH) + return clib_error_return (0, "number of descriptors can be set only for " + "physical devices"); + + if ((nb_rx_desc == (u32) ~0 || nb_rx_desc == xd->nb_rx_desc) && + (nb_tx_desc == (u32) ~0 || nb_tx_desc == xd->nb_tx_desc)) + return clib_error_return (0, "nothing changed"); + + if (nb_rx_desc != (u32) ~0) + xd->nb_rx_desc = nb_rx_desc; + + if (nb_tx_desc != (u32) ~0) + xd->nb_rx_desc = nb_rx_desc; + + rv = dpdk_port_setup(dm, xd); + + return rv < 0 ? 
rv : 0; +} + +VLIB_CLI_COMMAND (cmd_set_dpdk_if_desc,static) = { + .path = "set dpdk interface descriptors", + .short_help = "set dpdk interface descriptors <if-name> [rx <n>] [tx <n>]", + .function = set_dpdk_if_desc, +}; + +static clib_error_t * +show_dpdk_if_placement (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + vlib_thread_main_t * tm = vlib_get_thread_main(); + dpdk_main_t * dm = &dpdk_main; + dpdk_device_and_queue_t * dq; + int cpu; + + if (tm->n_vlib_mains == 1) + vlib_cli_output(vm, "All interfaces are handled by main thread"); + + for(cpu = 0; cpu < vec_len(dm->devices_by_cpu); cpu++) + { + if (vec_len(dm->devices_by_cpu[cpu])) + vlib_cli_output(vm, "Thread %u (%s at lcore %u):", cpu, + vlib_worker_threads[cpu].name, + vlib_worker_threads[cpu].dpdk_lcore_id); + + vec_foreach(dq, dm->devices_by_cpu[cpu]) + { + u32 hw_if_index = dm->devices[dq->device].vlib_hw_if_index; + vnet_hw_interface_t * hi = vnet_get_hw_interface(dm->vnet_main, hw_if_index); + vlib_cli_output(vm, " %v queue %u", hi->name, dq->queue_id); + } + } + return 0; +} + +VLIB_CLI_COMMAND (cmd_show_dpdk_if_placement,static) = { + .path = "show dpdk interface placement", + .short_help = "show dpdk interface placement", + .function = show_dpdk_if_placement, +}; + +static int +dpdk_device_queue_sort(void * a1, void * a2) +{ + dpdk_device_and_queue_t * dq1 = a1; + dpdk_device_and_queue_t * dq2 = a2; + + if (dq1->device > dq2->device) + return 1; + else if (dq1->device < dq2->device) + return -1; + else if (dq1->queue_id > dq2->queue_id) + return 1; + else if (dq1->queue_id < dq2->queue_id) + return -1; + else + return 0; +} + +static clib_error_t * +set_dpdk_if_placement (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + unformat_input_t _line_input, * line_input = &_line_input; + dpdk_main_t * dm = &dpdk_main; + dpdk_device_and_queue_t * dq; + vnet_hw_interface_t * hw; + dpdk_device_t * xd; + u32 hw_if_index = (u32) ~0; + u32 queue = (u32) 0; + 
u32 cpu = (u32) ~0; + int i; + + if (! unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { + if (unformat (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "queue %d", &queue)) + ; + else if (unformat (line_input, "thread %d", &cpu)) + ; + else + return clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + } + + unformat_free (line_input); + + if (hw_if_index == (u32) ~0) + return clib_error_return (0, "please specify valid interface name"); + + if (cpu < dm->input_cpu_first_index || + cpu >= (dm->input_cpu_first_index + dm->input_cpu_count)) + return clib_error_return (0, "please specify valid thread id"); + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + for(i = 0; i < vec_len(dm->devices_by_cpu); i++) + { + vec_foreach(dq, dm->devices_by_cpu[i]) + { + if (hw_if_index == dm->devices[dq->device].vlib_hw_if_index && + queue == dq->queue_id) + { + if (cpu == i) /* nothing to do */ + return 0; + + vec_del1(dm->devices_by_cpu[i], dq - dm->devices_by_cpu[i]); + vec_add2(dm->devices_by_cpu[cpu], dq, 1); + dq->queue_id = queue; + dq->device = xd->device_index; + xd->cpu_socket_id_by_queue[queue] = + rte_lcore_to_socket_id(vlib_worker_threads[cpu].dpdk_lcore_id); + + vec_sort_with_function(dm->devices_by_cpu[i], + dpdk_device_queue_sort); + + vec_sort_with_function(dm->devices_by_cpu[cpu], + dpdk_device_queue_sort); + + if (vec_len(dm->devices_by_cpu[i]) == 0) + vlib_node_set_state (vlib_mains[i], dpdk_input_node.index, + VLIB_NODE_STATE_DISABLED); + + if (vec_len(dm->devices_by_cpu[cpu]) == 1) + vlib_node_set_state (vlib_mains[cpu], dpdk_input_node.index, + VLIB_NODE_STATE_POLLING); + + return 0; + } + } + } + + return clib_error_return (0, "not found"); +} + +VLIB_CLI_COMMAND (cmd_set_dpdk_if_placement,static) = { + 
.path = "set dpdk interface placement", + .short_help = "set dpdk interface placement <if-name> [queue <n>] thread <n>", + .function = set_dpdk_if_placement, +}; + +clib_error_t * +dpdk_cli_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (dpdk_cli_init); diff --git a/vnet/vnet/devices/dpdk/device.c b/vnet/vnet/devices/dpdk/device.c new file mode 100644 index 00000000000..a19c3131ef9 --- /dev/null +++ b/vnet/vnet/devices/dpdk/device.c @@ -0,0 +1,1483 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vnet/vnet.h> +#include <vppinfra/vec.h> +#include <vppinfra/format.h> +#include <vlib/unix/cj.h> + +#include <vnet/ethernet/ethernet.h> +#include <vnet/devices/dpdk/dpdk.h> + +#include "dpdk_priv.h" +#include <vppinfra/error.h> + +#define foreach_dpdk_tx_func_error \ + _(BAD_RETVAL, "DPDK tx function returned an error") \ + _(RING_FULL, "Tx packet drops (ring full)") \ + _(PKT_DROP, "Tx packet drops (dpdk tx failure)") \ + _(REPL_FAIL, "Tx packet drops (replication failure)") + +typedef enum { +#define _(f,s) DPDK_TX_FUNC_ERROR_##f, + foreach_dpdk_tx_func_error +#undef _ + DPDK_TX_FUNC_N_ERROR, +} dpdk_tx_func_error_t; + +static char * dpdk_tx_func_error_strings[] = { +#define _(n,s) s, + foreach_dpdk_tx_func_error +#undef _ +}; + +static struct rte_mbuf * dpdk_replicate_packet_mb (vlib_buffer_t * b) +{ + vlib_main_t * vm = vlib_get_main(); + vlib_buffer_main_t * bm = vm->buffer_main; + struct rte_mbuf * first_mb = 0, * new_mb, * pkt_mb, ** prev_mb_next = 0; + u8 nb_segs, nb_segs_left; + u32 copy_bytes; + unsigned socket_id = rte_socket_id(); + + ASSERT (bm->pktmbuf_pools[socket_id]); + pkt_mb = ((struct rte_mbuf *)b)-1; + nb_segs = pkt_mb->nb_segs; + for (nb_segs_left = nb_segs; nb_segs_left; nb_segs_left--) + { + if (PREDICT_FALSE(pkt_mb == 0)) + { + clib_warning ("Missing %d mbuf chain segment(s): " + "(nb_segs = %d, nb_segs_left = %d)!", + nb_segs - nb_segs_left, nb_segs, nb_segs_left); + if (first_mb) + rte_pktmbuf_free(first_mb); + return NULL; + } + new_mb = rte_pktmbuf_alloc (bm->pktmbuf_pools[socket_id]); + if (PREDICT_FALSE(new_mb == 0)) + { + if (first_mb) + rte_pktmbuf_free(first_mb); + return NULL; + } + + /* + * Copy packet info into 1st segment. 
+ */ + if (first_mb == 0) + { + first_mb = new_mb; + rte_pktmbuf_pkt_len (first_mb) = pkt_mb->pkt_len; + first_mb->nb_segs = pkt_mb->nb_segs; + first_mb->port = pkt_mb->port; +#ifdef DAW_FIXME // TX Offload support TBD + first_mb->vlan_macip = pkt_mb->vlan_macip; + first_mb->hash = pkt_mb->hash; + first_mb->ol_flags = pkt_mb->ol_flags +#endif + } + else + { + ASSERT(prev_mb_next != 0); + *prev_mb_next = new_mb; + } + + /* + * Copy packet segment data into new mbuf segment. + */ + rte_pktmbuf_data_len (new_mb) = pkt_mb->data_len; + copy_bytes = pkt_mb->data_len + RTE_PKTMBUF_HEADROOM; + ASSERT(copy_bytes <= pkt_mb->buf_len); + memcpy(new_mb->buf_addr, pkt_mb->buf_addr, copy_bytes); + + prev_mb_next = &new_mb->next; + pkt_mb = pkt_mb->next; + } + + ASSERT(pkt_mb == 0); + __rte_mbuf_sanity_check(first_mb, 1); + + return first_mb; +} + +typedef struct { + u32 buffer_index; + u16 device_index; + u8 queue_index; + struct rte_mbuf mb; + /* Copy of VLIB buffer; packet data stored in pre_data. */ + vlib_buffer_t buffer; +} dpdk_tx_dma_trace_t; + +static void +dpdk_tx_trace_buffer (dpdk_main_t * dm, + vlib_node_runtime_t * node, + dpdk_device_t * xd, + u16 queue_id, + u32 buffer_index, + vlib_buffer_t * buffer) +{ + vlib_main_t * vm = vlib_get_main(); + dpdk_tx_dma_trace_t * t0; + struct rte_mbuf * mb; + + mb = ((struct rte_mbuf *)buffer)-1; + + t0 = vlib_add_trace (vm, node, buffer, sizeof (t0[0])); + t0->queue_index = queue_id; + t0->device_index = xd->device_index; + t0->buffer_index = buffer_index; + memcpy (&t0->mb, mb, sizeof (t0->mb)); + memcpy (&t0->buffer, buffer, sizeof (buffer[0]) - sizeof (buffer->pre_data)); + memcpy (t0->buffer.pre_data, buffer->data + buffer->current_data, + sizeof (t0->buffer.pre_data)); +} + +/* + * This function calls the dpdk's tx_burst function to transmit the packets + * on the tx_vector. It manages a lock per-device if the device does not + * support multiple queues. It returns the number of packets untransmitted + * on the tx_vector. 
If all packets are transmitted (the normal case), the
 * function returns 0.
 *
 * The tx_burst function may not be able to transmit all packets because the
 * dpdk ring is full. If a flowcontrol callback function has been configured
 * then the function simply returns. If no callback has been configured, the
 * function will retry calling tx_burst with the remaining packets. This will
 * continue until all packets are transmitted or tx_burst indicates no packets
 * could be transmitted. (The caller can drop the remaining packets.)
 *
 * The function assumes there is at least one packet on the tx_vector.
 */
static_always_inline
u32 tx_burst_vector_internal (vlib_main_t * vm,
                              dpdk_device_t * xd,
                              struct rte_mbuf ** tx_vector)
{
  dpdk_main_t * dm = &dpdk_main;
  u32 n_packets;
  u32 tx_head;
  u32 tx_tail;
  u32 n_retry;
  int rv;
  int queue_id;
  tx_ring_hdr_t *ring;

  /* head/tail live in the vector header in front of the mbuf pointers */
  ring = vec_header(tx_vector, sizeof(*ring));

  /* head and tail are free-running counters; difference = ring occupancy */
  n_packets = ring->tx_head - ring->tx_tail;

  tx_head = ring->tx_head % DPDK_TX_RING_SIZE;

  /*
   * Ensure rte_eth_tx_burst is not called with 0 packets, which can lead to
   * unpredictable results.
   */
  ASSERT(n_packets > 0);

  /*
   * Check for tx_vector overflow. If this fails it is a system configuration
   * error. The ring should be sized big enough to handle the largest un-flowed
   * off burst from a traffic manager. A larger size also helps performance
   * a bit because it decreases the probability of having to issue two tx_burst
   * calls due to a ring wrap.
   */
  ASSERT(n_packets < DPDK_TX_RING_SIZE);

  /*
   * If there is no flowcontrol callback, there is only temporary buffering
   * on the tx_vector and so the tail should always be 0.
   */
  ASSERT(dm->flowcontrol_callback || ring->tx_tail == 0);

  /*
   * If there is a flowcontrol callback, don't retry any incomplete tx_bursts.
   * Apply backpressure instead. If there is no callback, keep retrying until
   * a tx_burst sends no packets. n_retry of 255 essentially means no retry
   * limit.
   */
  n_retry = dm->flowcontrol_callback ? 0 : 255;

  /* one TX queue per worker thread; cpu index selects the queue */
  queue_id = vm->cpu_index;

  do {
      /* start the burst at the tail */
      tx_tail = ring->tx_tail % DPDK_TX_RING_SIZE;

      /*
       * This device only supports one TX queue,
       * and we're running multi-threaded...
       */
      if (PREDICT_FALSE(xd->lockp != 0))
        {
          queue_id = 0;
          /* spin until we own the single queue */
          while (__sync_lock_test_and_set (xd->lockp, 1))
            /* zzzz */;
        }

      if (PREDICT_TRUE(xd->dev_type == VNET_DPDK_DEV_ETH))
        {
          if (PREDICT_TRUE(tx_head > tx_tail))
            {
              /* no wrap, transmit in one burst */
              rv = rte_eth_tx_burst(xd->device_index,
                                    (uint16_t) queue_id,
                                    &tx_vector[tx_tail],
                                    (uint16_t) (tx_head-tx_tail));
            }
          else
            {
              /*
               * This can only happen if there is a flowcontrol callback.
               * We need to split the transmit into two calls: one for
               * the packets up to the wrap point, and one to continue
               * at the start of the ring.
               * Transmit pkts up to the wrap point.
               */
              rv = rte_eth_tx_burst(xd->device_index,
                                    (uint16_t) queue_id,
                                    &tx_vector[tx_tail],
                                    (uint16_t) (DPDK_TX_RING_SIZE - tx_tail));

              /*
               * If we transmitted everything we wanted, then allow 1 retry
               * so we can try to transmit the rest. If we didn't transmit
               * everything, stop now.
               */
              n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
            }
        }
      else if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
        {
          if (PREDICT_TRUE(tx_head > tx_tail))
            {
              /* no wrap, transmit in one burst */
              rv = rte_vhost_enqueue_burst(&xd->vu_vhost_dev, VIRTIO_RXQ,
                                           &tx_vector[tx_tail],
                                           (uint16_t) (tx_head-tx_tail));
              if (PREDICT_TRUE(rv > 0))
                {
                  /* coalesce guest interrupts: only kick after enough pkts */
                  if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_RXQ)) {
                    dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_RXQ]);
                    vring->n_since_last_int += rv;

                    if (vring->n_since_last_int > dm->vhost_coalesce_frames)
                      dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_RXQ);
                  }

                  /* free the sent mbufs here — presumably the vhost enqueue
                   * copies the data into guest memory (TODO confirm) */
                  int c = rv;
                  while(c--)
                    rte_pktmbuf_free (tx_vector[tx_tail+c]);
                }
            }
          else
            {
              /*
               * If we transmitted everything we wanted, then allow 1 retry
               * so we can try to transmit the rest. If we didn't transmit
               * everything, stop now.
               */
              rv = rte_vhost_enqueue_burst(&xd->vu_vhost_dev, VIRTIO_RXQ,
                                           &tx_vector[tx_tail],
                                           (uint16_t) (DPDK_TX_RING_SIZE - tx_tail));

              if (PREDICT_TRUE(rv > 0))
                {
                  if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_RXQ)) {
                    dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_RXQ]);
                    vring->n_since_last_int += rv;

                    if (vring->n_since_last_int > dm->vhost_coalesce_frames)
                      dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_RXQ);
                  }

                  int c = rv;
                  while(c--)
                    rte_pktmbuf_free (tx_vector[tx_tail+c]);
                }

              n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
            }
        }
      else if (xd->dev_type == VNET_DPDK_DEV_KNI)
        {
          if (PREDICT_TRUE(tx_head > tx_tail))
            {
              /* no wrap, transmit in one burst */
              rv = rte_kni_tx_burst(xd->kni,
                                    &tx_vector[tx_tail],
                                    (uint16_t) (tx_head-tx_tail));
            }
          else
            {
              /*
               * This can only happen if there is a flowcontrol callback.
               * We need to split the transmit into two calls: one for
               * the packets up to the wrap point, and one to continue
               * at the start of the ring.
               * Transmit pkts up to the wrap point.
               */
              rv = rte_kni_tx_burst(xd->kni,
                                    &tx_vector[tx_tail],
                                    (uint16_t) (DPDK_TX_RING_SIZE - tx_tail));

              /*
               * If we transmitted everything we wanted, then allow 1 retry
               * so we can try to transmit the rest. If we didn't transmit
               * everything, stop now.
               */
              n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
            }
        }
      else
        {
          /* unknown device type — should be unreachable */
          ASSERT(0);
          rv = 0;
        }

      if (PREDICT_FALSE(xd->lockp != 0))
        *xd->lockp = 0;

      /* NOTE(review): rte_eth_tx_burst returns uint16_t, so a negative rv
       * can presumably only come from the vhost/kni paths — confirm. */
      if (PREDICT_FALSE(rv < 0))
        {
          // emit non-fatal message, bump counter
          vnet_main_t * vnm = dm->vnet_main;
          vnet_interface_main_t * im = &vnm->interface_main;
          u32 node_index;

          node_index = vec_elt_at_index(im->hw_interfaces,
                                        xd->vlib_hw_if_index)->tx_node_index;

          vlib_error_count (vm, node_index, DPDK_TX_FUNC_ERROR_BAD_RETVAL, 1);
          clib_warning ("rte_eth_tx_burst[%d]: error %d", xd->device_index, rv);
          return n_packets; // untransmitted packets
        }
      /* consume what the driver accepted */
      ring->tx_tail += (u16)rv;
      n_packets -= (uint16_t) rv;
    } while (rv && n_packets && (n_retry>0));

  return n_packets;
}


/*
 * This function transmits any packets on the interface's tx_vector and returns
 * the number of packets untransmitted on the tx_vector. If the tx_vector is
 * empty the function simply returns 0.
 *
 * It is intended to be called by a traffic manager which has flowed-off an
 * interface to see if the interface can be flowed-on again.
+ */ +u32 dpdk_interface_tx_vector (vlib_main_t * vm, u32 dev_instance) +{ + dpdk_main_t * dm = &dpdk_main; + dpdk_device_t * xd; + int queue_id; + struct rte_mbuf ** tx_vector; + tx_ring_hdr_t *ring; + + /* param is dev_instance and not hw_if_index to save another lookup */ + xd = vec_elt_at_index (dm->devices, dev_instance); + + queue_id = vm->cpu_index; + tx_vector = xd->tx_vectors[queue_id]; + + /* If no packets on the ring, don't bother calling tx function */ + ring = vec_header(tx_vector, sizeof(*ring)); + if (ring->tx_head == ring->tx_tail) + { + return 0; + } + + return tx_burst_vector_internal (vm, xd, tx_vector); +} + +/* + * Transmits the packets on the frame to the interface associated with the + * node. It first copies packets on the frame to a tx_vector containing the + * rte_mbuf pointers. It then passes this vector to tx_burst_vector_internal + * which calls the dpdk tx_burst function. + * + * The tx_vector is treated slightly differently depending on whether or + * not a flowcontrol callback function has been configured. If there is no + * callback, the tx_vector is a temporary array of rte_mbuf packet pointers. + * Its entries are written and consumed before the function exits. + * + * If there is a callback then the transmit is being invoked in the presence + * of a traffic manager. Here the tx_vector is treated like a ring of rte_mbuf + * pointers. If not all packets can be transmitted, the untransmitted packets + * stay on the tx_vector until the next call. The callback allows the traffic + * manager to flow-off dequeues to the interface. The companion function + * dpdk_interface_tx_vector() allows the traffic manager to detect when + * it should flow-on the interface again. 
 */
static uword
dpdk_interface_tx (vlib_main_t * vm,
                   vlib_node_runtime_t * node,
                   vlib_frame_t * f)
{
  dpdk_main_t * dm = &dpdk_main;
  vnet_interface_output_runtime_t * rd = (void *) node->runtime_data;
  dpdk_device_t * xd = vec_elt_at_index (dm->devices, rd->dev_instance);
  u32 n_packets = f->n_vectors;
  u32 n_left;
  u32 * from;
  struct rte_mbuf ** tx_vector;
  int i;
  int queue_id;
  u32 my_cpu;
  u32 tx_pkts = 0;
  tx_ring_hdr_t *ring;
  u32 n_on_ring;

  /* one tx_vector (and TX queue) per worker cpu */
  my_cpu = vm->cpu_index;

  queue_id = my_cpu;

  tx_vector = xd->tx_vectors[queue_id];
  ring = vec_header(tx_vector, sizeof(*ring));

  n_on_ring = ring->tx_head - ring->tx_tail;
  from = vlib_frame_vector_args (f);

  ASSERT(n_packets <= VLIB_FRAME_SIZE);

  if (PREDICT_FALSE(n_on_ring + n_packets > DPDK_TX_RING_SIZE))
    {
      /*
       * Overflowing the ring should never happen.
       * If it does then drop the whole frame.
       */
      vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_RING_FULL,
                        n_packets);

      while (n_packets--)
        {
          u32 bi0 = from[n_packets];
          vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
          /* the rte_mbuf header immediately precedes the vlib buffer */
          struct rte_mbuf *mb0 = ((struct rte_mbuf *)b0) - 1;
          rte_pktmbuf_free (mb0);
        }
      return n_on_ring;
    }

  /* Optional pcap capture of outgoing packets (first 512 bytes each). */
  if (PREDICT_FALSE(dm->tx_pcap_enable))
    {
      n_left = n_packets;
      while (n_left > 0)
        {
          u32 bi0 = from[0];
          vlib_buffer_t * b0 = vlib_get_buffer (vm, bi0);
          /* capture everything, or only the configured sw_if_index */
          if (dm->pcap_sw_if_index == 0 ||
              dm->pcap_sw_if_index == vnet_buffer(b0)->sw_if_index [VLIB_TX])
            pcap_add_buffer (&dm->pcap_main, vm, bi0, 512);
          from++;
          n_left--;
        }
    }

  from = vlib_frame_vector_args (f);
  n_left = n_packets;
  /* i indexes the ring slot where the next mbuf pointer is written */
  i = ring->tx_head % DPDK_TX_RING_SIZE;

  /* dual-loop: process two packets per iteration, prefetching two ahead */
  while (n_left >= 4)
    {
      u32 bi0, bi1;
      u32 pi0, pi1;
      struct rte_mbuf * mb0, * mb1;
      struct rte_mbuf * prefmb0, * prefmb1;
      vlib_buffer_t * b0, * b1;
      vlib_buffer_t * pref0, * pref1;
      i16 delta0, delta1;
      u16 new_data_len0, new_data_len1;
      u16 new_pkt_len0, new_pkt_len1;
      u32 any_clone;

      pi0 = from[2];
      pi1 = from[3];

      pref0 = vlib_get_buffer (vm, pi0);
      pref1 = vlib_get_buffer (vm, pi1);

      prefmb0 = ((struct rte_mbuf *)pref0) - 1;
      prefmb1 = ((struct rte_mbuf *)pref1) - 1;

      CLIB_PREFETCH(prefmb0, CLIB_CACHE_LINE_BYTES, LOAD);
      CLIB_PREFETCH(pref0, CLIB_CACHE_LINE_BYTES, LOAD);
      CLIB_PREFETCH(prefmb1, CLIB_CACHE_LINE_BYTES, LOAD);
      CLIB_PREFETCH(pref1, CLIB_CACHE_LINE_BYTES, LOAD);

      bi0 = from[0];
      bi1 = from[1];
      from += 2;

      b0 = vlib_get_buffer (vm, bi0);
      b1 = vlib_get_buffer (vm, bi1);

      mb0 = ((struct rte_mbuf *)b0) - 1;
      mb1 = ((struct rte_mbuf *)b1) - 1;

      /* Cloned buffers must be deep-copied before handing to the driver. */
      any_clone = b0->clone_count | b1->clone_count;
      if (PREDICT_FALSE(any_clone != 0))
        {
          if (PREDICT_FALSE(b0->clone_count != 0))
            {
              struct rte_mbuf * mb0_new = dpdk_replicate_packet_mb (b0);
              if (PREDICT_FALSE(mb0_new == 0))
                {
                  vlib_error_count (vm, node->node_index,
                                    DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
                  b0->flags |= VLIB_BUFFER_REPL_FAIL;
                }
              else
                mb0 = mb0_new;
              /* original buffer is recycled at the end of this function */
              vec_add1 (dm->recycle[my_cpu], bi0);
            }
          if (PREDICT_FALSE(b1->clone_count != 0))
            {
              struct rte_mbuf * mb1_new = dpdk_replicate_packet_mb (b1);
              if (PREDICT_FALSE(mb1_new == 0))
                {
                  vlib_error_count (vm, node->node_index,
                                    DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
                  b1->flags |= VLIB_BUFFER_REPL_FAIL;
                }
              else
                mb1 = mb1_new;
              vec_add1 (dm->recycle[my_cpu], bi1);
            }
        }

      /* Sync mbuf lengths with the vlib view of the buffer chain. */
      delta0 = PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
        vlib_buffer_length_in_chain (vm, b0) - (i16) mb0->pkt_len;
      delta1 = PREDICT_FALSE(b1->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
        vlib_buffer_length_in_chain (vm, b1) - (i16) mb1->pkt_len;

      new_data_len0 = (u16)((i16) mb0->data_len + delta0);
      new_data_len1 = (u16)((i16) mb1->data_len + delta1);
      new_pkt_len0 = (u16)((i16) mb0->pkt_len + delta0);
      new_pkt_len1 = (u16)((i16) mb1->pkt_len + delta1);

      b0->current_length = new_data_len0;
      b1->current_length = new_data_len1;
      mb0->data_len = new_data_len0;
      mb1->data_len = new_data_len1;
      mb0->pkt_len = new_pkt_len0;
      mb1->pkt_len = new_pkt_len1;

      mb0->data_off = (PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL)) ?
        mb0->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b0->current_data);
      mb1->data_off = (PREDICT_FALSE(b1->flags & VLIB_BUFFER_REPL_FAIL)) ?
        mb1->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b1->current_data);

      if (PREDICT_FALSE(node->flags & VLIB_NODE_FLAG_TRACE))
        {
          if (b0->flags & VLIB_BUFFER_IS_TRACED)
            dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
          if (b1->flags & VLIB_BUFFER_IS_TRACED)
            dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi1, b1);
        }

      if (PREDICT_TRUE(any_clone == 0))
        {
          tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
          i++;
          tx_vector[i % DPDK_TX_RING_SIZE] = mb1;
          i++;
        }
      else
        {
          /* cloning was done, need to check for failure */
          if (PREDICT_TRUE((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
            {
              tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
              i++;
            }
          if (PREDICT_TRUE((b1->flags & VLIB_BUFFER_REPL_FAIL) == 0))
            {
              tx_vector[i % DPDK_TX_RING_SIZE] = mb1;
              i++;
            }
        }

      n_left -= 2;
    }
  /* single-loop: remaining packets, same logic as above for one buffer */
  while (n_left > 0)
    {
      u32 bi0;
      struct rte_mbuf * mb0;
      vlib_buffer_t * b0;
      i16 delta0;
      u16 new_data_len0;
      u16 new_pkt_len0;

      bi0 = from[0];
      from++;

      b0 = vlib_get_buffer (vm, bi0);

      mb0 = ((struct rte_mbuf *)b0) - 1;
      if (PREDICT_FALSE(b0->clone_count != 0))
        {
          struct rte_mbuf * mb0_new = dpdk_replicate_packet_mb (b0);
          if (PREDICT_FALSE(mb0_new == 0))
            {
              vlib_error_count (vm, node->node_index,
                                DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
              b0->flags |= VLIB_BUFFER_REPL_FAIL;
            }
          else
            mb0 = mb0_new;
          vec_add1 (dm->recycle[my_cpu], bi0);
        }

      delta0 = PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
        vlib_buffer_length_in_chain (vm, b0) - (i16) mb0->pkt_len;

      new_data_len0 = (u16)((i16) mb0->data_len + delta0);
      new_pkt_len0 = (u16)((i16) mb0->pkt_len + delta0);

      b0->current_length = new_data_len0;
      mb0->data_len = new_data_len0;
      mb0->pkt_len = new_pkt_len0;
      mb0->data_off = (PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL)) ?
        mb0->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b0->current_data);

      if (PREDICT_FALSE(node->flags & VLIB_NODE_FLAG_TRACE))
        if (b0->flags & VLIB_BUFFER_IS_TRACED)
          dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);

      if (PREDICT_TRUE((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
        {
          tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
          i++;
        }
      n_left--;
    }

  /* account for additional packets in the ring */
  ring->tx_head += n_packets;
  n_on_ring = ring->tx_head - ring->tx_tail;

  /* transmit as many packets as possible */
  n_packets = tx_burst_vector_internal (vm, xd, tx_vector);

  /*
   * tx_pkts is the number of packets successfully transmitted
   * This is the number originally on ring minus the number remaining on ring
   */
  tx_pkts = n_on_ring - n_packets;

  if (PREDICT_FALSE(dm->flowcontrol_callback != 0))
    {
      if (PREDICT_FALSE(n_packets))
        {
          /* Callback may want to enable flowcontrol */
          dm->flowcontrol_callback(vm, xd->vlib_hw_if_index, ring->tx_head - ring->tx_tail);
        }
      else
        {
          /* Reset head/tail to avoid unnecessary wrap */
          ring->tx_head = 0;
          ring->tx_tail = 0;
        }
    }
  else
    {
      /* If there is no callback then drop any non-transmitted packets */
      if (PREDICT_FALSE(n_packets))
        {
          vlib_simple_counter_main_t * cm;
          vnet_main_t * vnm = vnet_get_main();

          cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
                                 VNET_INTERFACE_COUNTER_TX_ERROR);

          vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
                                         n_packets);

          vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_PKT_DROP,
                            n_packets);

          while (n_packets--)
            rte_pktmbuf_free (tx_vector[ring->tx_tail + n_packets]);
        }

      /* Reset head/tail to avoid unnecessary wrap */
      ring->tx_head = 0;
      ring->tx_tail = 0;
    }

  /* Recycle replicated buffers */
  if (PREDICT_FALSE(vec_len(dm->recycle[my_cpu])))
    {
      vlib_buffer_free (vm, dm->recycle[my_cpu], vec_len(dm->recycle[my_cpu]));
      _vec_len(dm->recycle[my_cpu]) = 0;
    }

  ASSERT(ring->tx_head >= ring->tx_tail);

  return tx_pkts;
}

/*
 * Renumber a vhost-user interface (sets its vu_if_id).
 * Renumbering any other device type is rejected with a warning.
 */
static int dpdk_device_renumber (vnet_hw_interface_t * hi,
                                 u32 new_dev_instance)
{
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance);

  if (!xd || xd->dev_type != VNET_DPDK_DEV_VHOST_USER) {
    clib_warning("cannot renumber non-vhost-user interface (sw_if_index: %d)",
                 hi->sw_if_index);
    return 0;
  }

  xd->vu_if_id = new_dev_instance;
  return 0;
}

/* Format the display name of a dpdk device (e.g. "TenGigabitEthernet0/1/0"). */
static u8 * format_dpdk_device_name (u8 * s, va_list * args)
{
  dpdk_main_t * dm = &dpdk_main;
  char *devname_format;
  char *device_name;
  u32 i = va_arg (*args, u32);
  struct rte_eth_dev_info dev_info;

  /* PCI bus/devid/function rendered in decimal or hex per configuration */
  if (dm->interface_name_format_decimal)
    devname_format = "%s%d/%d/%d";
  else
    devname_format = "%s%x/%x/%x";

  if (dm->devices[i].dev_type == VNET_DPDK_DEV_KNI) {
    return format(s, "kni%d", dm->devices[i].kni_port_id);
  } else if (dm->devices[i].dev_type == VNET_DPDK_DEV_VHOST_USER) {
    return format(s, "VirtualEthernet0/0/%d", dm->devices[i].vu_if_id);
  }
  switch (dm->devices[i].port_type)
    {
    case VNET_DPDK_PORT_TYPE_ETH_1G:
      device_name = "GigabitEthernet";
      break;

    case VNET_DPDK_PORT_TYPE_ETH_10G:
      device_name = "TenGigabitEthernet";
      break;

    case VNET_DPDK_PORT_TYPE_ETH_40G:
      device_name = "FortyGigabitEthernet";
      break;

    case VNET_DPDK_PORT_TYPE_ETH_SWITCH:
      device_name = "EthernetSwitch";
      break;

  #ifdef NETMAP
    case VNET_DPDK_PORT_TYPE_NETMAP:
rte_eth_dev_info_get(i, &dev_info);
      return format(s, "netmap:%s", dev_info.driver_name);
  #endif

    case VNET_DPDK_PORT_TYPE_AF_PACKET:
      rte_eth_dev_info_get(i, &dev_info);
      return format(s, "af_packet%d", dm->devices[i].af_packet_port_id);

    default:
    case VNET_DPDK_PORT_TYPE_UNKNOWN:
      device_name = "UnknownEthernet";
      break;
    }

  rte_eth_dev_info_get(i, &dev_info);
  return format (s, devname_format, device_name, dev_info.pci_dev->addr.bus,
                 dev_info.pci_dev->addr.devid,
                 dev_info.pci_dev->addr.function);
}

/* Format a human-readable description of the device's PMD/hardware type. */
static u8 * format_dpdk_device_type (u8 * s, va_list * args)
{
  dpdk_main_t * dm = &dpdk_main;
  char *dev_type;
  u32 i = va_arg (*args, u32);

  if (dm->devices[i].dev_type == VNET_DPDK_DEV_KNI) {
    return format(s, "Kernel NIC Interface");
  } else if (dm->devices[i].dev_type == VNET_DPDK_DEV_VHOST_USER) {
    return format(s, "vhost-user interface");
  }

  switch (dm->devices[i].pmd)
    {
    case VNET_DPDK_PMD_E1000EM:
      dev_type = "Intel 82540EM (e1000)";
      break;

    case VNET_DPDK_PMD_IGB:
      dev_type = "Intel e1000";
      break;

    case VNET_DPDK_PMD_I40E:
      dev_type = "Intel X710/XL710 Family";
      break;

    case VNET_DPDK_PMD_I40EVF:
      dev_type = "Intel X710/XL710 Family VF";
      break;

    case VNET_DPDK_PMD_FM10K:
      dev_type = "Intel FM10000 Family Ethernet Switch";
      break;

    case VNET_DPDK_PMD_IGBVF:
      dev_type = "Intel e1000 VF";
      break;

    case VNET_DPDK_PMD_VIRTIO:
      dev_type = "Red Hat Virtio";
      break;

    case VNET_DPDK_PMD_IXGBEVF:
      dev_type = "Intel 82599 VF";
      break;

    case VNET_DPDK_PMD_IXGBE:
      dev_type = "Intel 82599";
      break;

    case VNET_DPDK_PMD_VICE:
    case VNET_DPDK_PMD_ENIC:
      dev_type = "Cisco VIC";
      break;

    case VNET_DPDK_PMD_VMXNET3:
      dev_type = "VMware VMXNET3";
      break;

#ifdef NETMAP
    case VNET_DPDK_PMD_NETMAP:
      dev_type = "Netmap/Vale";
      break;
#endif

    case VNET_DPDK_PMD_AF_PACKET:
      dev_type = "af_packet";
      break;

    default:
    case VNET_DPDK_PMD_UNKNOWN:
      dev_type = "### UNKNOWN ###";
      break;
    }

  return format (s, dev_type);
}

/* Format link state, duplex, speed and promiscuous flag for one device. */
static u8 * format_dpdk_link_status (u8 * s, va_list * args)
{
  dpdk_device_t * xd = va_arg (*args, dpdk_device_t *);
  struct rte_eth_link * l = &xd->link;
  vnet_main_t * vnm = vnet_get_main();
  vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, xd->vlib_hw_if_index);

  s = format (s, "%s ", l->link_status ? "up" : "down");
  if (l->link_status)
    {
      u32 promisc = rte_eth_promiscuous_get (xd->device_index);

      s = format (s, "%s duplex ", (l->link_duplex == ETH_LINK_FULL_DUPLEX) ?
                  "full" : "half");
      s = format (s, "speed %u mtu %d %s\n", l->link_speed,
                  hi->max_packet_bytes, promisc ? " promisc" : "");
    }
  else
    s = format (s, "\n");

  return s;
}

/*
 * Helper macro used by the three bitmap formatters below: prints the name
 * of each set bit, wrapping the line at roughly _line_len columns.
 */
#define _line_len 72
#define _(v, str) \
if (bitmap & v) { \
  if (format_get_indent (s) > next_split ) { \
    next_split += _line_len; \
    s = format(s,"\n%U", format_white_space, indent); \
  } \
  s = format(s, "%s ", str); \
}

/* Format the names of the RSS hash functions set in 'bitmap'. */
static u8 * format_dpdk_rss_hf_name(u8 * s, va_list * args)
{
  u64 bitmap = va_arg (*args, u64);
  int next_split = _line_len;
  int indent = format_get_indent (s);

  if (!bitmap)
    return format(s, "none");

  foreach_dpdk_rss_hf

  return s;
}

/* Format the names of the RX offload capabilities set in 'bitmap'. */
static u8 * format_dpdk_rx_offload_caps(u8 * s, va_list * args)
{
  u32 bitmap = va_arg (*args, u32);
  int next_split = _line_len;
  int indent = format_get_indent (s);

  if (!bitmap)
    return format(s, "none");

  foreach_dpdk_rx_offload_caps

  return s;
}

/* Format the names of the TX offload capabilities set in 'bitmap'. */
static u8 * format_dpdk_tx_offload_caps(u8 * s, va_list * args)
{
  u32 bitmap = va_arg (*args, u32);
  int next_split = _line_len;
  int indent = format_get_indent (s);
  if (!bitmap)
    return format(s, "none");

  foreach_dpdk_tx_offload_caps

  return s;
}

#undef _line_len
#undef _

/* Format a full status report for one dpdk device ("show hardware" detail). */
static u8 * format_dpdk_device (u8 * s, va_list * args)
{
  u32 dev_instance = va_arg (*args, u32);
  int verbose = va_arg (*args, int);
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd =
vec_elt_at_index (dm->devices, dev_instance); + uword indent = format_get_indent (s); + f64 now = vlib_time_now (dm->vlib_main); + + dpdk_update_counters (xd, now); + dpdk_update_link_state (xd, now); + + s = format (s, "%U\n%Ucarrier %U", + format_dpdk_device_type, xd->device_index, + format_white_space, indent + 2, + format_dpdk_link_status, xd); + + if (verbose > 1 && xd->dev_type == VNET_DPDK_DEV_ETH) + { + struct rte_eth_dev_info di; + struct rte_pci_device * pci; + struct rte_eth_rss_conf rss_conf; + int vlan_off; + + rss_conf.rss_key = 0; + rte_eth_dev_info_get(xd->device_index, &di); + rte_eth_dev_rss_hash_conf_get(xd->device_index, &rss_conf); + pci = di.pci_dev; + + s = format(s, "%Upci id: device %04x:%04x subsystem %04x:%04x\n" + "%Upci address: %04x:%02x:%02x.%02x\n", + format_white_space, indent + 2, + pci->id.vendor_id, pci->id.device_id, + pci->id.subsystem_vendor_id, + pci->id.subsystem_device_id, + format_white_space, indent + 2, + pci->addr.domain, pci->addr.bus, + pci->addr.devid, pci->addr.function); + s = format(s, "%Umax rx packet len: %d\n", + format_white_space, indent + 2, di.max_rx_pktlen); + s = format(s, "%Upromiscuous: unicast %s all-multicast %s\n", + format_white_space, indent + 2, + rte_eth_promiscuous_get(xd->device_index) ? "on" : "off", + rte_eth_promiscuous_get(xd->device_index) ? "on" : "off"); + vlan_off = rte_eth_dev_get_vlan_offload(xd->device_index); + s = format(s, "%Uvlan offload: strip %s filter %s qinq %s\n", + format_white_space, indent + 2, + vlan_off & ETH_VLAN_STRIP_OFFLOAD ? "on" : "off", + vlan_off & ETH_VLAN_FILTER_OFFLOAD ? "on" : "off", + vlan_off & ETH_VLAN_EXTEND_OFFLOAD ? 
"on" : "off"); + s = format(s, "%Uqueue size (max): rx %d (%d) tx %d (%d)\n", + format_white_space, indent + 2, + xd->rx_q_used, di.max_rx_queues, + xd->tx_q_used, di.max_tx_queues); + s = format(s, "%Urx offload caps: %U\n", + format_white_space, indent + 2, + format_dpdk_rx_offload_caps, di.rx_offload_capa); + s = format(s, "%Utx offload caps: %U\n", + format_white_space, indent + 2, + format_dpdk_tx_offload_caps, di.tx_offload_capa); + s = format(s, "%Urss active: %U\n" + "%Urss supported: %U\n", + format_white_space, indent + 2, + format_dpdk_rss_hf_name, rss_conf.rss_hf, + format_white_space, indent + 2, + format_dpdk_rss_hf_name, di.flow_type_rss_offloads); + } + + if (xd->cpu_socket > -1) + s = format (s, "%Ucpu socket %d", + format_white_space, indent + 2, + xd->cpu_socket); + + /* $$$ MIB counters */ + + { +#define _(N, V) \ + if (xd->stats.V != 0) \ + s = format (s, "\n%U%-40U%16Ld", \ + format_white_space, indent + 2, \ + format_c_identifier, #N, xd->stats.V); + + foreach_dpdk_counter +#undef _ + } + + u8 * xs = 0; + struct rte_eth_xstats * xstat; + + vec_foreach(xstat, xd->xstats) + { + if (xstat->value) + { + /* format_c_identifier don't like c strings inside vector */ + u8 * name = format(0,"%s", xstat->name); + xs = format(xs, "\n%U%-38U%16Ld", + format_white_space, indent + 4, + format_c_identifier, name, xstat->value); + vec_free(name); + } + } + + if (xs) + { + s = format(s, "\n%Uextended stats:%v", + format_white_space, indent + 2, xs); + vec_free(xs); + } + + return s; +} + +static u8 * format_dpdk_tx_dma_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main(); + dpdk_tx_dma_trace_t * t = va_arg (*va, dpdk_tx_dma_trace_t *); + dpdk_main_t * dm = &dpdk_main; + dpdk_device_t * xd = vec_elt_at_index (dm->devices, t->device_index); + uword indent = format_get_indent (s); + vnet_sw_interface_t * 
sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);

  s = format (s, "%U tx queue %d",
              format_vnet_sw_interface_name, vnm, sw,
              t->queue_index);

  s = format (s, "\n%Ubuffer 0x%x: %U",
              format_white_space, indent,
              t->buffer_index,
              format_vlib_buffer, &t->buffer);

  /* pre_data holds a copy of the start of the packet (ethernet header) */
  s = format (s, "\n%U%U", format_white_space, indent,
              format_ethernet_header_with_length, t->buffer.pre_data,
              sizeof (t->buffer.pre_data));

  return s;
}

/* Clear hardware and software counters for the device ("clear interfaces"). */
static void dpdk_clear_hw_interface_counters (u32 instance)
{
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd = vec_elt_at_index (dm->devices, instance);

  /*
   * DAW-FIXME: VMXNET3 device stop/start doesn't work,
   * therefore fake the stop in the dpdk driver by
   * silently dropping all of the incoming pkts instead of
   * stopping the driver / hardware.
   */
  if (xd->admin_up != 0xff)
    {
      /* normal device: reset hw stats and the last-seen snapshot,
       * then re-read so xd->stats restarts from zero */
      rte_eth_stats_reset (xd->device_index);
      memset (&xd->last_stats, 0, sizeof (xd->last_stats));
      dpdk_update_counters (xd, vlib_time_now (dm->vlib_main));
    }
  else
    {
      /* "faked down" vmxnet3 device (admin_up == 0xff): also zero the
       * accumulated stats directly since counters are not being updated */
      rte_eth_stats_reset (xd->device_index);
      memset(&xd->stats, 0, sizeof(xd->stats));
      memset (&xd->last_stats, 0, sizeof (xd->last_stats));
    }
  rte_eth_xstats_reset(xd->device_index);
}

/*
 * KNI callback: the kernel side brought the interface up or down;
 * reflect that in the vnet link state.
 */
static int
kni_config_network_if(u8 port_id, u8 if_up)
{
  vnet_main_t * vnm = vnet_get_main();
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd;
  uword *p;

  p = hash_get (dm->dpdk_device_by_kni_port_id, port_id);
  if (p == 0) {
    clib_warning("unknown interface");
    return 0;
  } else {
    xd = vec_elt_at_index (dm->devices, p[0]);
  }

  vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index,
                               if_up ?
VNET_HW_INTERFACE_FLAG_LINK_UP |
                               ETH_LINK_FULL_DUPLEX : 0);
  return 0;
}

/*
 * KNI callback: the kernel side changed the MTU; mirror it onto the
 * vnet hardware interface.
 */
static int
kni_change_mtu(u8 port_id, unsigned new_mtu)
{
  vnet_main_t * vnm = vnet_get_main();
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd;
  uword *p;
  vnet_hw_interface_t * hif;

  p = hash_get (dm->dpdk_device_by_kni_port_id, port_id);
  if (p == 0) {
    clib_warning("unknown interface");
    return 0;
  } else {
    xd = vec_elt_at_index (dm->devices, p[0]);
  }
  hif = vnet_get_hw_interface (vnm, xd->vlib_hw_if_index);

  hif->max_packet_bytes = new_mtu;

  return 0;
}

/*
 * Admin up/down handler for all dpdk device types (KNI, vhost-user, ethdev).
 * For KNI, up allocates the kernel interface; down releases it.
 * For vhost-user, the link is only raised once the backend is running.
 * For ethdev, up starts the port and applies promisc/allmulti settings.
 */
static clib_error_t *
dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
{
  vnet_hw_interface_t * hif = vnet_get_hw_interface (vnm, hw_if_index);
  uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd = vec_elt_at_index (dm->devices, hif->dev_instance);
  int rv = 0;

  if (xd->dev_type == VNET_DPDK_DEV_KNI)
    {
      if (is_up)
        {
          struct rte_kni_conf conf;
          struct rte_kni_ops ops;
          vlib_main_t * vm = vlib_get_main();
          vlib_buffer_main_t * bm = vm->buffer_main;
          memset(&conf, 0, sizeof(conf));
          snprintf(conf.name, RTE_KNI_NAMESIZE, "vpp%u", xd->kni_port_id);
          conf.mbuf_size = MBUF_SIZE;
          memset(&ops, 0, sizeof(ops));
          ops.port_id = xd->kni_port_id;
          /* kernel-side callbacks defined above */
          ops.change_mtu = kni_change_mtu;
          ops.config_network_if = kni_config_network_if;

          xd->kni = rte_kni_alloc(bm->pktmbuf_pools[rte_socket_id()], &conf, &ops);
          if (!xd->kni)
            {
              clib_warning("failed to allocate kni interface");
            }
          else
            {
              hif->max_packet_bytes = 1500; /* kni interface default value */
              xd->admin_up = 1;
            }
        }
      else
        {
          xd->admin_up = 0;
          rte_kni_release(xd->kni);
        }
      return 0;
    }
  if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
    {
      if (is_up)
        {
          /* only raise the link if the vhost backend is already running */
          if (xd->vu_is_running)
            vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index,
                                         VNET_HW_INTERFACE_FLAG_LINK_UP |
                                         ETH_LINK_FULL_DUPLEX );
          xd->admin_up = 1;
        }
      else
        {
          vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
          xd->admin_up = 0;
        }

      return 0;
    }


  if (is_up)
    {
      f64 now = vlib_time_now (dm->vlib_main);

      /*
       * DAW-FIXME: VMXNET3 device stop/start doesn't work,
       * therefore fake the stop in the dpdk driver by
       * silently dropping all of the incoming pkts instead of
       * stopping the driver / hardware.
       */
      if (xd->admin_up == 0)
        rv = rte_eth_dev_start (xd->device_index);

      if (xd->promisc)
        rte_eth_promiscuous_enable(xd->device_index);
      else
        rte_eth_promiscuous_disable(xd->device_index);

      rte_eth_allmulticast_enable (xd->device_index);
      xd->admin_up = 1;
      dpdk_update_counters (xd, now);
      dpdk_update_link_state (xd, now);
    }
  else
    {
      rte_eth_allmulticast_disable (xd->device_index);
      vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);

      /*
       * DAW-FIXME: VMXNET3 device stop/start doesn't work,
       * therefore fake the stop in the dpdk driver by
       * silently dropping all of the incoming pkts instead of
       * stopping the driver / hardware.
       */
      if (xd->pmd != VNET_DPDK_PMD_VMXNET3)
        {
          rte_eth_dev_stop (xd->device_index);
          xd->admin_up = 0;
        }
      else
        /* admin_up == ~0 marks the "faked down" vmxnet3 state */
        xd->admin_up = ~0;
    }

  if (rv < 0)
    clib_warning ("rte_eth_dev_%s error: %d", is_up ?
"start" : "stop",
                  rv);

  return /* no error */ 0;
}

/*
 * Dynamically redirect all pkts from a specific interface
 * to the specified node
 */
static void dpdk_set_interface_next_node (vnet_main_t *vnm, u32 hw_if_index,
                                          u32 node_index)
{
  dpdk_main_t * xm = &dpdk_main;
  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
  dpdk_device_t * xd = vec_elt_at_index (xm->devices, hw->dev_instance);

  /* Shut off redirection */
  if (node_index == ~0)
    {
      xd->per_interface_next_index = node_index;
      return;
    }

  /* store the next-arc index from dpdk-input to the redirect target */
  xd->per_interface_next_index =
    vlib_node_add_next (xm->vlib_main, dpdk_input_node.index, node_index);
}


/*
 * Sub-interface add/del hook. Only programs hardware VLAN filters for
 * IXGBE VF ports; all other device types are a no-op. Only single-tag
 * exact-match sub-interfaces are supported there.
 */
static clib_error_t *
dpdk_subif_add_del_function (vnet_main_t * vnm,
                             u32 hw_if_index,
                             struct vnet_sw_interface_t * st,
                             int is_add)
{
  dpdk_main_t * xm = &dpdk_main;
  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
  dpdk_device_t * xd = vec_elt_at_index (xm->devices, hw->dev_instance);
  vnet_sw_interface_t * t = (vnet_sw_interface_t *) st;
  int r, vlan_offload;


  if (xd->dev_type != VNET_DPDK_DEV_ETH)
    return 0;
  /* currently we program VLANS only for IXGBE VF */
  if (xd->pmd != VNET_DPDK_PMD_IXGBEVF)
    return 0;

  if (t->sub.eth.flags.no_tags == 1)
    return 0;

  if ((t->sub.eth.flags.one_tag != 1) || (t->sub.eth.flags.exact_match != 1 ))
    return clib_error_return (0, "unsupported VLAN setup");


  /* make sure VLAN filtering is enabled before programming the filter */
  vlan_offload = rte_eth_dev_get_vlan_offload(xd->device_index);
  vlan_offload |= ETH_VLAN_FILTER_OFFLOAD;

  if ((r = rte_eth_dev_set_vlan_offload(xd->device_index, vlan_offload)))
    return clib_error_return (0, "rte_eth_dev_set_vlan_offload[%d]: err %d",
                              xd->device_index, r);


  if ((r = rte_eth_dev_vlan_filter(xd->device_index, t->sub.eth.outer_vlan_id, is_add)))
    return clib_error_return (0, "rte_eth_dev_vlan_filter[%d]: err %d",
                              xd->device_index, r);

  return 0;
}

/* Device class registration: hooks the dpdk driver into vnet. */
VNET_DEVICE_CLASS (dpdk_device_class) = {
  .name = "dpdk",
  .tx_function =
dpdk_interface_tx,
  .tx_function_n_errors = DPDK_TX_FUNC_N_ERROR,
  .tx_function_error_strings = dpdk_tx_func_error_strings,
  .format_device_name = format_dpdk_device_name,
  .format_device = format_dpdk_device,
  .format_tx_trace = format_dpdk_tx_dma_trace,
  .clear_counters = dpdk_clear_hw_interface_counters,
  .admin_up_down_function = dpdk_interface_admin_up_down,
  .subif_add_del_function = dpdk_subif_add_del_function,
  .rx_redirect_to_node = dpdk_set_interface_next_node,
  .no_flatten_output_chains = 1,
  .name_renumber = dpdk_device_renumber,
};

/* Register the traffic manager's flow-control callback (see tx path above). */
void dpdk_set_flowcontrol_callback (vlib_main_t *vm,
                                    dpdk_flowcontrol_callback_t callback)
{
  dpdk_main.flowcontrol_callback = callback;
}

#define UP_DOWN_FLAG_EVENT 1


/* Returns non-zero while admin_up_down_process is handling an event. */
u32 dpdk_get_admin_up_down_in_progress (void)
{
  return dpdk_main.admin_up_down_in_progress;
}

/*
 * Process node that applies interface flag changes posted via
 * post_sw_interface_set_flags(). Each event encodes the sw_if_index in
 * the upper 32 bits and the flags in the lower 32 bits.
 */
static uword
admin_up_down_process (vlib_main_t * vm,
                       vlib_node_runtime_t * rt,
                       vlib_frame_t * f)
{
  clib_error_t * error = 0;
  uword event_type;
  uword *event_data = 0;
  u32 index;
  u32 sw_if_index;
  u32 flags;

  while (1)
    {
      vlib_process_wait_for_event (vm);

      event_type = vlib_process_get_events (vm, &event_data);

      /* advertise that interface data structures are being manipulated */
      dpdk_main.admin_up_down_in_progress = 1;

      for (index=0; index<vec_len(event_data); index++)
        {
          /* unpack sw_if_index / flags from the event word */
          sw_if_index = event_data[index] >> 32;
          flags = (u32) event_data[index];

          switch (event_type) {
          case UP_DOWN_FLAG_EVENT:
            error = vnet_sw_interface_set_flags (vnet_get_main(), sw_if_index, flags);
            clib_error_report(error);
            break;
          }
        }

      vec_reset_length (event_data);

      dpdk_main.admin_up_down_in_progress = 0;

    }
  return 0; /* or not */
}

VLIB_REGISTER_NODE (admin_up_down_process_node,static) = {
  .function = admin_up_down_process,
  .type = VLIB_NODE_TYPE_PROCESS,
  .name = "admin-up-down-process",
  .process_log2_n_stack_bytes = 17, // 256KB
};

/*
 * Asynchronously invoke vnet_sw_interface_set_flags via the admin_up_down
 * process.
Useful for avoiding long blocking delays (>150ms) in the dpdk
 * drivers.
 * WARNING: when posting this event, no other interface-related calls should
 * be made (e.g. vnet_create_sw_interface()) while the event is being
 * processed (admin_up_down_in_progress). This is required in order to avoid
 * race conditions in manipulating interface data structures.
 */
void post_sw_interface_set_flags (vlib_main_t *vm, u32 sw_if_index, u32 flags)
{
  /* pack (sw_if_index << 32) | flags into a single event datum */
  vlib_process_signal_event
      (vm, admin_up_down_process_node.index,
       UP_DOWN_FLAG_EVENT,
       (((uword)sw_if_index << 32) | flags));
}

/*
 * Called by the dpdk driver's rte_delay_us() function.
 * Return 0 to have the dpdk do a regular delay loop.
 * Return 1 if to skip the delay loop because we are suspending
 * the calling vlib process instead.
 */
int rte_delay_us_override (unsigned us) {
  vlib_main_t * vm;

  /* Don't bother intercepting for short delays */
  if (us < 10) return 0;

  /*
   * Only intercept if we are in a vlib process.
   * If we are called from a vlib worker thread or the vlib main
   * thread then do not intercept. (Must not be called from an
   * independent pthread).
   */
  if (os_get_cpu_number() == 0)
    {
      /*
       * We're in the vlib main thread or a vlib process. Make sure
       * the process is running and we're not still initializing.
       */
      vm = vlib_get_main();
      if (vlib_in_process_context(vm))
        {
          /* Only suspend for the admin_up_down_process */
          vlib_process_t * proc = vlib_get_current_process(vm);
          if (!(proc->flags & VLIB_PROCESS_IS_RUNNING) ||
              (proc->node_runtime.function != admin_up_down_process))
            return 0;

          /* yield to the vlib scheduler instead of busy-waiting */
          f64 delay = 1e-6 * us;
          vlib_process_suspend(vm, delay);
          return 1;
        }
    }
  return 0; // no override
}
diff --git a/vnet/vnet/devices/dpdk/dpdk.h b/vnet/vnet/devices/dpdk/dpdk.h
new file mode 100644
index 00000000000..fd984e4d4df
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/dpdk.h
@@ -0,0 +1,515 @@
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_dpdk_h__ +#define __included_dpdk_h__ + +/* $$$$ We should rename always_inline -> clib_always_inline */ +#undef always_inline + +#include <rte_config.h> + +#include <rte_common.h> +#include <rte_dev.h> +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_memcpy.h> +#include <rte_memzone.h> +#include <rte_tailq.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_launch.h> +#include <rte_atomic.h> +#include <rte_cycles.h> +#include <rte_prefetch.h> +#include <rte_lcore.h> +#include <rte_per_lcore.h> +#include <rte_branch_prediction.h> +#include <rte_interrupts.h> +#include <rte_pci.h> +#include <rte_random.h> +#include <rte_debug.h> +#include <rte_ether.h> +#include <rte_ethdev.h> +#include <rte_ring.h> +#include <rte_mempool.h> +#include <rte_mbuf.h> +#include <rte_kni.h> +#include <rte_virtio_net.h> +#include <rte_pci_dev_ids.h> +#include <rte_version.h> + +#include <vnet/unix/pcap.h> +#include <vnet/devices/virtio/vhost-user.h> + +#if CLIB_DEBUG > 0 +#define always_inline static inline +#else +#define always_inline static inline __attribute__ ((__always_inline__)) +#endif + +#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) +#define NB_MBUF (32<<10) + +vnet_device_class_t dpdk_device_class; +vlib_node_registration_t dpdk_input_node; +vlib_node_registration_t dpdk_io_input_node; +vlib_node_registration_t handoff_dispatch_node; + +typedef 
enum { + VNET_DPDK_DEV_ETH = 1, /* Standard DPDK PMD driver */ + VNET_DPDK_DEV_KNI, /* Kernel NIC Interface */ + VNET_DPDK_DEV_VHOST_USER, + VNET_DPDK_DEV_UNKNOWN, /* must be last */ +} dpdk_device_type_t; + +#define foreach_dpdk_pmd \ + _ ("rte_em_pmd", E1000EM) \ + _ ("rte_igb_pmd", IGB) \ + _ ("rte_igbvf_pmd", IGBVF) \ + _ ("rte_ixgbe_pmd", IXGBE) \ + _ ("rte_ixgbevf_pmd", IXGBEVF) \ + _ ("rte_i40e_pmd", I40E) \ + _ ("rte_i40evf_pmd", I40EVF) \ + _ ("rte_virtio_pmd", VIRTIO) \ + _ ("rte_vice_pmd", VICE) \ + _ ("rte_enic_pmd", ENIC) \ + _ ("rte_vmxnet3_pmd", VMXNET3) \ + _ ("AF_PACKET PMD", AF_PACKET) \ + _ ("rte_pmd_fm10k", FM10K) + +typedef enum { + VNET_DPDK_PMD_NONE, +#define _(s,f) VNET_DPDK_PMD_##f, + foreach_dpdk_pmd +#undef _ +#ifdef NETMAP + VNET_DPDK_PMD_NETMAP, +#endif + VNET_DPDK_PMD_UNKNOWN, /* must be last */ +} dpdk_pmd_t; + +typedef enum { + VNET_DPDK_PORT_TYPE_ETH_1G, + VNET_DPDK_PORT_TYPE_ETH_10G, + VNET_DPDK_PORT_TYPE_ETH_40G, + VNET_DPDK_PORT_TYPE_ETH_SWITCH, +#ifdef NETMAP + VNET_DPDK_PORT_TYPE_NETMAP, +#endif + VNET_DPDK_PORT_TYPE_AF_PACKET, + VNET_DPDK_PORT_TYPE_UNKNOWN, +} dpdk_port_type_t; + +typedef struct { + f64 deadline; + vlib_frame_t * frame; +} dpdk_frame_t; + +#define DPDK_EFD_MAX_DISCARD_RATE 10 + +typedef struct { + u16 last_burst_sz; + u16 max_burst_sz; + u32 full_frames_cnt; + u32 consec_full_frames_cnt; + u32 congestion_cnt; + u64 last_poll_time; + u64 max_poll_delay; + u32 discard_cnt; + u32 total_packet_cnt; +} dpdk_efd_agent_t; + +typedef struct { + int callfd; + int kickfd; + int errfd; + u32 callfd_idx; + u32 n_since_last_int; + f64 int_deadline; +} dpdk_vu_vring; + +typedef struct { + u32 is_up; + u32 unix_fd; + u32 unix_file_index; + u32 client_fd; + char sock_filename[256]; + int sock_errno; + u8 sock_is_server; + u8 active; + + u64 feature_mask; + u32 num_vrings; + dpdk_vu_vring vrings[2]; + u64 region_addr[VHOST_MEMORY_MAX_NREGIONS]; + u32 region_fd[VHOST_MEMORY_MAX_NREGIONS]; +} dpdk_vu_intf_t; + +typedef void 
(*dpdk_flowcontrol_callback_t) (vlib_main_t *vm, + u32 hw_if_index, + u32 n_packets); + +/* + * The header for the tx_vector in dpdk_device_t. + * Head and tail are indexes into the tx_vector and are of type + * u64 so they never overflow. + */ +typedef struct { + u64 tx_head; + u64 tx_tail; +} tx_ring_hdr_t; + +typedef struct { + CLIB_CACHE_LINE_ALIGN_MARK(cacheline0); + volatile u32 *lockp; + + /* Instance ID */ + u32 device_index; + + u32 vlib_hw_if_index; + u32 vlib_sw_if_index; + + /* next node index if we decide to steal the rx graph arc */ + u32 per_interface_next_index; + + /* dpdk rte_mbuf rx and tx vectors, VLIB_FRAME_SIZE */ + struct rte_mbuf *** tx_vectors; /* one per worker thread */ + struct rte_mbuf *** rx_vectors; + + /* vector of traced contexts, per device */ + u32 * d_trace_buffers; + + /* per-worker destination frame queue */ + dpdk_frame_t * frames; + + dpdk_device_type_t dev_type:8; + dpdk_pmd_t pmd:8; + i8 cpu_socket; + + u8 admin_up; + u8 promisc; + + CLIB_CACHE_LINE_ALIGN_MARK(cacheline1); + + /* PMD related */ + u16 tx_q_used; + u16 rx_q_used; + u16 nb_rx_desc; + u16 nb_tx_desc; + u16 * cpu_socket_id_by_queue; + struct rte_eth_conf port_conf; + struct rte_eth_txconf tx_conf; + + /* KNI related */ + struct rte_kni *kni; + u8 kni_port_id; + + /* vhost-user related */ + u32 vu_if_id; + struct virtio_net vu_vhost_dev; + u32 vu_is_running; + dpdk_vu_intf_t *vu_intf; + + /* af_packet */ + u8 af_packet_port_id; + + struct rte_eth_link link; + f64 time_last_link_update; + + struct rte_eth_stats stats; + struct rte_eth_stats last_stats; + struct rte_eth_xstats * xstats; + f64 time_last_stats_update; + dpdk_port_type_t port_type; + + dpdk_efd_agent_t efd_agent; +} dpdk_device_t; + +#define MAX_NELTS 32 +typedef struct { + CLIB_CACHE_LINE_ALIGN_MARK(cacheline0); + u64 head; + u64 head_hint; + u64 tail; + u32 n_in_use; + u32 nelts; + u32 written; + u32 threshold; + i32 n_vectors[MAX_NELTS]; +} frame_queue_trace_t; + +#define DPDK_TX_RING_SIZE (4 * 
1024) + +#define DPDK_STATS_POLL_INTERVAL 10.0 +#define DPDK_LINK_POLL_INTERVAL 3.0 + +typedef struct { + CLIB_CACHE_LINE_ALIGN_MARK(cacheline0); + + /* total input packet counter */ + u64 aggregate_rx_packets; +} dpdk_worker_t; + +typedef struct { + u32 device; + u16 queue_id; +} dpdk_device_and_queue_t; + +/* Early-Fast-Discard (EFD) */ +#define DPDK_EFD_DISABLED 0 +#define DPDK_EFD_DISCARD_ENABLED (1 << 0) +#define DPDK_EFD_MONITOR_ENABLED (1 << 1) +#define DPDK_EFD_DROPALL_ENABLED (1 << 2) + +#define DPDK_EFD_DEFAULT_DEVICE_QUEUE_HI_THRESH_PCT 90 +#define DPDK_EFD_DEFAULT_CONSEC_FULL_FRAMES_HI_THRESH 6 + +typedef struct dpdk_efd_t { + u16 enabled; + u16 queue_hi_thresh; + u16 consec_full_frames_hi_thresh; + u16 pad; +} dpdk_efd_t; + +typedef struct { + + /* Devices */ + dpdk_device_t * devices; + dpdk_device_and_queue_t ** devices_by_cpu; + + /* per-thread recycle lists */ + u32 ** recycle; + + /* flow control callback. If 0 then flow control is disabled */ + dpdk_flowcontrol_callback_t flowcontrol_callback; + + /* vlib buffer free list, must be same size as an rte_mbuf */ + u32 vlib_buffer_free_list_index; + + /* + * format interface names ala xxxEthernet%d/%d/%d instead of + * xxxEthernet%x/%x/%x. For VIRL. 
+ */ + u8 interface_name_format_decimal; + + + /* dpdk worker "threads" */ + dpdk_worker_t * workers; + + /* Config stuff */ + u8 ** eal_init_args; + u8 * eth_if_blacklist; + u8 * eth_if_whitelist; + u8 * uio_driver_name; + u8 no_multi_seg; + + /* Required config parameters */ + u8 coremask_set_manually; + u8 nchannels_set_manually; + u32 coremask; + u32 nchannels; + u32 num_mbufs; + u32 use_rss; + u8 num_kni; /* while kni_init allows u32, port_id in callback fn is only u8 */ + + /* Ethernet input node index */ + u32 ethernet_input_node_index; + + /* dpdk i/o thread initialization barrier */ + volatile u32 io_thread_release; + + /* pcap tracing [only works if (CLIB_DEBUG > 0)] */ + int tx_pcap_enable; + pcap_main_t pcap_main; + u8 * pcap_filename; + u32 pcap_sw_if_index; + u32 pcap_pkts_to_capture; + + /* virtio vhost-user switch */ + u8 use_virtio_vhost; + + /* vhost-user coalescence frames config */ + u32 vhost_coalesce_frames; + f64 vhost_coalesce_time; + + /* hashes */ + uword * dpdk_device_by_kni_port_id; + uword * vu_sw_if_index_by_listener_fd; + uword * vu_sw_if_index_by_sock_fd; + u32 * vu_inactive_interfaces_device_index; + + u32 next_vu_if_id; + + /* efd (early-fast-discard) settings */ + dpdk_efd_t efd; + + /* + * flag indicating that a posted admin up/down + * (via post_sw_interface_set_flags) is in progress + */ + u8 admin_up_down_in_progress; + + u8 have_io_threads; + + /* which cpus are running dpdk-input */ + int input_cpu_first_index; + int input_cpu_count; + + /* convenience */ + vlib_main_t * vlib_main; + vnet_main_t * vnet_main; +} dpdk_main_t; + +dpdk_main_t dpdk_main; + +typedef enum { + DPDK_RX_NEXT_IP4_INPUT, + DPDK_RX_NEXT_IP6_INPUT, + DPDK_RX_NEXT_MPLS_INPUT, + DPDK_RX_NEXT_ETHERNET_INPUT, + DPDK_RX_NEXT_DROP, + DPDK_RX_N_NEXT, +} dpdk_rx_next_t; + +void vnet_buffer_needs_dpdk_mb (vlib_buffer_t * b); + +void dpdk_set_next_node (dpdk_rx_next_t, char *); + +typedef void (*dpdk_io_thread_callback_t) (vlib_main_t *vm); + +void dpdk_io_thread 
(vlib_worker_thread_t * w, + u32 instances, + u32 instance_id, + char *worker_name, + dpdk_io_thread_callback_t callback); +void dpdk_thread_input (dpdk_main_t * dm, dpdk_device_t * xd); + +clib_error_t * dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd); + +void dpdk_set_flowcontrol_callback (vlib_main_t *vm, + dpdk_flowcontrol_callback_t callback); + +u32 dpdk_interface_tx_vector (vlib_main_t * vm, u32 dev_instance); + +vlib_frame_queue_elt_t * vlib_get_handoff_queue_elt (u32 vlib_worker_index); + +u32 dpdk_get_handoff_node_index (void); + +void set_efd_bitmap (u8 *bitmap, u32 value, u32 op); + +#define foreach_dpdk_error \ + _(NONE, "no error") \ + _(RX_PACKET_ERROR, "Rx packet errors") \ + _(RX_BAD_FCS, "Rx bad fcs") \ + _(L4_CHECKSUM_ERROR, "Rx L4 checksum errors") \ + _(IP_CHECKSUM_ERROR, "Rx ip checksum errors") \ + _(RX_ALLOC_FAIL, "rx buf alloc from free list failed") \ + _(RX_ALLOC_NO_PHYSMEM, "rx buf alloc failed no physmem") \ + _(RX_ALLOC_DROP_PKTS, "rx packets dropped due to alloc error") \ + _(IPV4_EFD_DROP_PKTS, "IPV4 Early Fast Discard rx drops") \ + _(IPV6_EFD_DROP_PKTS, "IPV6 Early Fast Discard rx drops") \ + _(MPLS_EFD_DROP_PKTS, "MPLS Early Fast Discard rx drops") \ + _(VLAN_EFD_DROP_PKTS, "VLAN Early Fast Discard rx drops") + +typedef enum { +#define _(f,s) DPDK_ERROR_##f, + foreach_dpdk_error +#undef _ + DPDK_N_ERROR, +} dpdk_error_t; + +/* + * Increment EFD drop counter + */ +static_always_inline +void increment_efd_drop_counter (vlib_main_t * vm, u32 counter_index, u32 count) +{ + vlib_node_t *my_n; + + my_n = vlib_get_node (vm, dpdk_input_node.index); + vm->error_main.counters[my_n->error_heap_index+counter_index] += count; +} + +void dpdk_update_link_state (dpdk_device_t * xd, f64 now); +void dpdk_efd_update_counters(dpdk_device_t *xd, u32 n_buffers, u16 enabled); +u32 is_efd_discardable(vlib_thread_main_t *tm, + vlib_buffer_t * b0, + struct rte_mbuf *mb); + +/* dpdk vhost-user interrupt management */ +u8 
dpdk_vhost_user_want_interrupt (dpdk_device_t *xd, int idx); +void dpdk_vhost_user_send_interrupt (vlib_main_t * vm, dpdk_device_t * xd, + int idx); + + +static inline u64 vnet_get_aggregate_rx_packets (void) +{ + dpdk_main_t * dm = &dpdk_main; + u64 sum = 0; + dpdk_worker_t * dw; + + vec_foreach(dw, dm->workers) + sum += dw->aggregate_rx_packets; + + return sum; +} + +void dpdk_rx_trace (dpdk_main_t * dm, + vlib_node_runtime_t * node, + dpdk_device_t * xd, + u16 queue_id, + u32 * buffers, + uword n_buffers); + +#define EFD_OPERATION_LESS_THAN 0 +#define EFD_OPERATION_GREATER_OR_EQUAL 1 + +void efd_config(u32 enabled, + u32 ip_prec, u32 ip_op, + u32 mpls_exp, u32 mpls_op, + u32 vlan_cos, u32 vlan_op); + +void post_sw_interface_set_flags (vlib_main_t *vm, u32 sw_if_index, u32 flags); + +typedef struct vhost_user_memory vhost_user_memory_t; + +void dpdk_vhost_user_process_init (void **ctx); +void dpdk_vhost_user_process_cleanup (void *ctx); +uword dpdk_vhost_user_process_if (vlib_main_t *vm, dpdk_device_t *xd, void *ctx); + +// vhost-user calls +int dpdk_vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, + const char * sock_filename, + u8 is_server, + u32 * sw_if_index, + u64 feature_mask, + u8 renumber, u32 custom_dev_instance); +int dpdk_vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, + const char * sock_filename, + u8 is_server, + u32 sw_if_index, + u64 feature_mask, + u8 renumber, u32 custom_dev_instance); +int dpdk_vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, + u32 sw_if_index); +int dpdk_vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm, + vhost_user_intf_details_t **out_vuids); + +u32 dpdk_get_admin_up_down_in_progress (void); + +uword +dpdk_input_rss (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f); + +#endif /* __included_dpdk_h__ */ diff --git a/vnet/vnet/devices/dpdk/dpdk_priv.h b/vnet/vnet/devices/dpdk/dpdk_priv.h new file mode 100644 index 00000000000..e452e02d90d --- /dev/null +++ 
b/vnet/vnet/devices/dpdk/dpdk_priv.h @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define DPDK_NB_RX_DESC_DEFAULT 512 +#define DPDK_NB_TX_DESC_DEFAULT 512 +#define DPDK_NB_RX_DESC_VIRTIO 256 +#define DPDK_NB_TX_DESC_VIRTIO 256 +#define DPDK_NB_RX_DESC_10GE 2048 +#define DPDK_NB_TX_DESC_10GE 2048 +#define DPDK_NB_RX_DESC_40GE (4096-128) +#define DPDK_NB_TX_DESC_40GE 2048 + +#define foreach_dpdk_counter \ + _ (tx_frames_ok, opackets) \ + _ (tx_bytes_ok, obytes) \ + _ (tx_errors, oerrors) \ + _ (tx_loopback_frames_ok, olbpackets) \ + _ (tx_loopback_bytes_ok, olbbytes) \ + _ (rx_frames_ok, ipackets) \ + _ (rx_bytes_ok, ibytes) \ + _ (rx_errors, ierrors) \ + _ (rx_missed, imissed) \ + _ (rx_bad_crc, ibadcrc) \ + _ (rx_bad_length, ibadlen) \ + _ (rx_multicast_frames_ok, imcasts) \ + _ (rx_no_bufs, rx_nombuf) \ + _ (rx_filter_match, fdirmatch) \ + _ (rx_filter_miss, fdirmiss) \ + _ (tx_pause_xon, tx_pause_xon) \ + _ (rx_pause_xon, rx_pause_xon) \ + _ (tx_pause_xoff, tx_pause_xoff) \ + _ (rx_pause_xoff, rx_pause_xoff) \ + _ (rx_loopback_frames_ok, ilbpackets) \ + _ (rx_loopback_bytes_ok, ilbbytes) + +#define foreach_dpdk_q_counter \ + _ (rx_frames_ok, q_ipackets) \ + _ (tx_frames_ok, q_opackets) \ + _ (rx_bytes_ok, q_ibytes) \ + _ (tx_bytes_ok, q_obytes) \ + _ (rx_errors, q_errors) + +#define foreach_dpdk_rss_hf \ + _(ETH_RSS_IPV4, "ipv4") \ + _(ETH_RSS_FRAG_IPV4, "ipv4-frag") \ + 
_(ETH_RSS_NONFRAG_IPV4_TCP, "ipv4-tcp") \ + _(ETH_RSS_NONFRAG_IPV4_UDP, "ipv4-udp") \ + _(ETH_RSS_NONFRAG_IPV4_SCTP, "ipv4-sctp") \ + _(ETH_RSS_NONFRAG_IPV4_OTHER, "ipv4-other") \ + _(ETH_RSS_IPV6, "ipv6") \ + _(ETH_RSS_FRAG_IPV6, "ipv6-frag") \ + _(ETH_RSS_NONFRAG_IPV6_TCP, "ipv6-tcp") \ + _(ETH_RSS_NONFRAG_IPV6_UDP, "ipv6-udp") \ + _(ETH_RSS_NONFRAG_IPV6_SCTP, "ipv6-sctp") \ + _(ETH_RSS_NONFRAG_IPV6_OTHER, "ipv6-other") \ + _(ETH_RSS_L2_PAYLOAD, "l2-payload") \ + _(ETH_RSS_IPV6_EX, "ipv6-ex") \ + _(ETH_RSS_IPV6_TCP_EX, "ipv6-tcp-ex") \ + _(ETH_RSS_IPV6_UDP_EX, "ipv6-udp-ex") + +#define foreach_dpdk_rx_offload_caps \ + _(DEV_RX_OFFLOAD_VLAN_STRIP, "vlan-strip") \ + _(DEV_RX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \ + _(DEV_RX_OFFLOAD_UDP_CKSUM , "udp-cksum") \ + _(DEV_RX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \ + _(DEV_RX_OFFLOAD_TCP_LRO , "rcp-lro") \ + _(DEV_RX_OFFLOAD_QINQ_STRIP, "qinq-strip") + +#define foreach_dpdk_tx_offload_caps \ + _(DEV_TX_OFFLOAD_VLAN_INSERT, "vlan-insert") \ + _(DEV_TX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \ + _(DEV_TX_OFFLOAD_UDP_CKSUM , "udp-cksum") \ + _(DEV_TX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \ + _(DEV_TX_OFFLOAD_SCTP_CKSUM , "sctp-cksum") \ + _(DEV_TX_OFFLOAD_TCP_TSO , "tcp-tso") \ + _(DEV_TX_OFFLOAD_UDP_TSO , "udp-tso") \ + _(DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM, "outer-ipv4-cksum") \ + _(DEV_TX_OFFLOAD_QINQ_INSERT, "qinq-insert") + +#if RTE_VERSION >= RTE_VERSION_NUM(2, 1, 0, 0) + +#define foreach_dpdk_pkt_rx_offload_flag \ + _ (PKT_RX_VLAN_PKT, "RX packet is a 802.1q VLAN packet") \ + _ (PKT_RX_RSS_HASH, "RX packet with RSS hash result") \ + _ (PKT_RX_FDIR, "RX packet with FDIR infos") \ + _ (PKT_RX_L4_CKSUM_BAD, "L4 cksum of RX pkt. is not OK") \ + _ (PKT_RX_IP_CKSUM_BAD, "IP cksum of RX pkt. 
is not OK") \ + _ (PKT_RX_IEEE1588_PTP, "RX IEEE1588 L2 Ethernet PT Packet") \ + _ (PKT_RX_IEEE1588_TMST, "RX IEEE1588 L2/L4 timestamped packet") + +#define foreach_dpdk_pkt_type \ + _ (RTE_PTYPE_L3_IPV4, "Packet with IPv4 header") \ + _ (RTE_PTYPE_L3_IPV4_EXT, "Packet with extended IPv4 header") \ + _ (RTE_PTYPE_L3_IPV6, "Packet with IPv6 header") \ + _ (RTE_PTYPE_L3_IPV6_EXT, "Packet with extended IPv6 header") +#else +#define foreach_dpdk_pkt_rx_offload_flag \ + _ (PKT_RX_VLAN_PKT, "RX packet is a 802.1q VLAN packet") \ + _ (PKT_RX_RSS_HASH, "RX packet with RSS hash result") \ + _ (PKT_RX_FDIR, "RX packet with FDIR infos") \ + _ (PKT_RX_L4_CKSUM_BAD, "L4 cksum of RX pkt. is not OK") \ + _ (PKT_RX_IP_CKSUM_BAD, "IP cksum of RX pkt. is not OK") \ + _ (PKT_RX_IPV4_HDR, "RX packet with IPv4 header") \ + _ (PKT_RX_IPV4_HDR_EXT, "RX packet with extended IPv4 header") \ + _ (PKT_RX_IPV6_HDR, "RX packet with IPv6 header") \ + _ (PKT_RX_IPV6_HDR_EXT, "RX packet with extended IPv6 header") \ + _ (PKT_RX_IEEE1588_PTP, "RX IEEE1588 L2 Ethernet PT Packet") \ + _ (PKT_RX_IEEE1588_TMST, "RX IEEE1588 L2/L4 timestamped packet") + +#define foreach_dpdk_pkt_type /* Dummy */ +#endif /* RTE_VERSION */ + +#define foreach_dpdk_pkt_tx_offload_flag \ + _ (PKT_TX_VLAN_PKT, "TX packet is a 802.1q VLAN packet") \ + _ (PKT_TX_IP_CKSUM, "IP cksum of TX pkt. computed by NIC") \ + _ (PKT_TX_TCP_CKSUM, "TCP cksum of TX pkt. computed by NIC") \ + _ (PKT_TX_SCTP_CKSUM, "SCTP cksum of TX pkt. 
computed by NIC") \ + _ (PKT_TX_IEEE1588_TMST, "TX IEEE1588 packet to timestamp") + +#define foreach_dpdk_pkt_offload_flag \ + foreach_dpdk_pkt_rx_offload_flag \ + foreach_dpdk_pkt_tx_offload_flag + +static inline u8 * format_dpdk_pkt_types (u8 * s, va_list * va) +{ + u32 *pkt_types = va_arg (*va, u32 *); + uword indent __attribute__((unused)) = format_get_indent (s) + 2; + + if (!*pkt_types) + return s; + + s = format (s, "Packet Types"); + +#define _(F, S) \ + if (*pkt_types & F) \ + { \ + s = format (s, "\n%U%s (0x%04x) %s", \ + format_white_space, indent, #F, F, S); \ + } + + foreach_dpdk_pkt_type + +#undef _ + + return s; +} + +static inline u8 * format_dpdk_pkt_offload_flags (u8 * s, va_list * va) +{ + u16 *ol_flags = va_arg (*va, u16 *); + uword indent = format_get_indent (s) + 2; + + if (!*ol_flags) + return s; + + s = format (s, "Packet Offload Flags"); + +#define _(F, S) \ + if (*ol_flags & F) \ + { \ + s = format (s, "\n%U%s (0x%04x) %s", \ + format_white_space, indent, #F, F, S); \ + } + + foreach_dpdk_pkt_offload_flag + +#undef _ + + return s; +} + +static inline u8 * format_dpdk_rte_mbuf (u8 * s, va_list * va) +{ + struct rte_mbuf * mb = va_arg (*va, struct rte_mbuf *); + uword indent = format_get_indent (s) + 2; + + s = format (s, "PKT MBUF: port %d, nb_segs %d, pkt_len %d" + "\n%Ubuf_len %d, data_len %d, ol_flags 0x%x," + "\n%Upacket_type 0x%x", + mb->port, mb->nb_segs, mb->pkt_len, + format_white_space, indent, + mb->buf_len, mb->data_len, mb->ol_flags, + format_white_space, indent, + mb->packet_type); + + if (mb->ol_flags) + s = format (s, "\n%U%U", format_white_space, indent, + format_dpdk_pkt_offload_flags, &mb->ol_flags); + + if (mb->packet_type) + s = format (s, "\n%U%U", format_white_space, indent, + format_dpdk_pkt_types, &mb->packet_type); + return s; +} + +#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS +#define foreach_dpdk_pkt_ext_rx_offload_flag \ + _ (PKT_EXT_RX_PKT_ERROR, "RX Packet Error") \ + _ (PKT_EXT_RX_BAD_FCS, "RX Bad FCS checksum") \ + 
_ (PKT_EXT_RX_UDP, "RX packet with UDP L4 header") \ + _ (PKT_EXT_RX_TCP, "RX packet with TCP L4 header") \ + _ (PKT_EXT_RX_IPV4_FRAGMENT, "RX packet IPv4 Fragment") + +#define foreach_dpdk_pkt_ext_offload_flag \ + foreach_dpdk_pkt_rx_offload_flag \ + foreach_dpdk_pkt_ext_rx_offload_flag + +static inline u8 * format_dpdk_pkt_rx_offload_flags (u8 * s, va_list * va) +{ + u16 *ol_flags = va_arg (*va, u16 *); + uword indent = format_get_indent (s) + 2; + + if (!*ol_flags) + return s; + + s = format (s, "Packet RX Offload Flags"); + +#define _(F, S) \ + if (*ol_flags & F) \ + { \ + s = format (s, "\n%U%s (0x%04x) %s", \ + format_white_space, indent, #F, F, S); \ + } + + foreach_dpdk_pkt_ext_offload_flag + +#undef _ + + return s; +} + +static inline u8 * format_dpdk_rx_rte_mbuf (u8 * s, va_list * va) +{ + struct rte_mbuf * mb = va_arg (*va, struct rte_mbuf *); + uword indent = format_get_indent (s) + 2; + + /* + * Note: Assumes mb is head of pkt chain -- port, nb_segs, & pkt_len + * are only valid for the 1st mbuf segment. 
 */
  s = format (s, "PKT MBUF: port %d, nb_segs %d, pkt_len %d"
              "\n%Ubuf_len %d, data_len %d, ol_flags 0x%x"
              "\n%Upacket_type 0x%x",
              mb->port, mb->nb_segs, mb->pkt_len,
              format_white_space, indent,
              mb->buf_len, mb->data_len, mb->ol_flags,
              format_white_space, indent,
              mb->packet_type);

  if (mb->ol_flags)
    s = format (s, "\n%U%U", format_white_space, indent,
                format_dpdk_pkt_rx_offload_flags, &mb->ol_flags);

  if (mb->packet_type)
    s = format (s, "\n%U%U", format_white_space, indent,
                format_dpdk_pkt_types, &mb->packet_type);
  return s;
}
#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */

/* These args appear by themselves */
#define foreach_eal_double_hyphen_predicate_arg \
_(no-shconf) \
_(no-hpet) \
_(no-pci) \
_(no-huge) \
_(vmware-tsc-map) \
_(virtio-vhost)

#define foreach_eal_single_hyphen_mandatory_arg \
_(coremask, c) \
_(nchannels, n) \

#define foreach_eal_single_hyphen_arg \
_(blacklist, b) \
_(mem-alloc-request, m) \
_(force-ranks, r)

/* These args are preceded by "--" and followed by a single string */
#define foreach_eal_double_hyphen_arg \
_(huge-dir) \
_(proc-type) \
_(file-prefix) \
_(socket-mem) \
_(vdev)

/*
 * Poll up to VLIB_FRAME_SIZE mbufs from the given queue of device xd into
 * xd->rx_vectors[queue_id], dispatching on the device type:
 *   - ETH:        loop over rte_eth_rx_burst until the frame is full or the
 *                 PMD returns a short chunk;
 *   - VHOST_USER: rte_vhost_dequeue_burst plus coalesced guest interrupts;
 *   - KNI:        rte_kni_rx_burst plus kernel request servicing.
 * Returns the number of mbufs placed in the rx vector.
 */
static inline u32
dpdk_rx_burst ( dpdk_main_t * dm, dpdk_device_t * xd, u16 queue_id)
{
  u32 n_buffers;
  u32 n_left;
  u32 n_this_chunk;

  n_left = VLIB_FRAME_SIZE;
  n_buffers = 0;

  if (PREDICT_TRUE(xd->dev_type == VNET_DPDK_DEV_ETH))
    {
      while (n_left)
        {
          n_this_chunk = rte_eth_rx_burst (xd->device_index, queue_id,
                                           xd->rx_vectors[queue_id] + n_buffers, n_left);
          n_buffers += n_this_chunk;
          n_left -= n_this_chunk;

          /* Empirically, DPDK r1.8 produces vectors w/ 32 or fewer elts */
          if (n_this_chunk < 32)
            break;
        }
    }
  else if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
    {
      vlib_main_t * vm = vlib_get_main();
      vlib_buffer_main_t * bm = vm->buffer_main;
      unsigned socket_id = rte_socket_id();

      /* nothing to poll until the vhost session is fully established */
      if (PREDICT_FALSE(!xd->vu_is_running))
        return 0;

      n_buffers = rte_vhost_dequeue_burst(&xd->vu_vhost_dev, VIRTIO_TXQ,
                                          bm->pktmbuf_pools[socket_id],
                                          xd->rx_vectors[queue_id], VLIB_FRAME_SIZE);

      f64 now = vlib_time_now (dm->vlib_main);

      /* send pending interrupts if needed */
      if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_TXQ)) {
          dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_TXQ]);
          vring->n_since_last_int += n_buffers;

          /* interrupt when the coalesce deadline passes or enough frames
           * have accumulated, whichever comes first */
          if ((vring->n_since_last_int && (vring->int_deadline < now))
              || (vring->n_since_last_int > dm->vhost_coalesce_frames))
            dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_TXQ);
      }

      if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_RXQ)) {
          dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_RXQ]);
          if (vring->n_since_last_int && (vring->int_deadline < now))
            dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_RXQ);
      }

    }
  else if (xd->dev_type == VNET_DPDK_DEV_KNI)
    {
      n_buffers = rte_kni_rx_burst(xd->kni, xd->rx_vectors[queue_id], VLIB_FRAME_SIZE);
      rte_kni_handle_request(xd->kni);
    }
  else
    {
      ASSERT(0);
    }

  return n_buffers;
}


/*
 * Snapshot xd->stats from the PMD and bump the vnet no-buf / miss / error
 * simple counters by the deltas since the previous snapshot; also refreshes
 * the extended (xstats) vector.  Only applies to PMD (ETH) devices.
 */
static inline void
dpdk_update_counters (dpdk_device_t * xd, f64 now)
{
  vlib_simple_counter_main_t * cm;
  vnet_main_t * vnm = vnet_get_main();
  u32 my_cpu = os_get_cpu_number();
  u64 rxerrors, last_rxerrors;
  int len;

  /* only update counters for PMD interfaces */
  if (xd->dev_type != VNET_DPDK_DEV_ETH)
    return;

  /*
   * DAW-FIXME: VMXNET3 device stop/start doesn't work,
   *            therefore fake the stop in the dpdk driver by
   *            silently dropping all of the incoming pkts instead of
   *            stopping the driver / hardware.
   */
  if (xd->admin_up != 0xff)
    {
      /* now == 0 means "refresh stats but keep the old timestamp" */
      xd->time_last_stats_update = now ? now : xd->time_last_stats_update;
      memcpy (&xd->last_stats, &xd->stats, sizeof (xd->last_stats));
      rte_eth_stats_get (xd->device_index, &xd->stats);

      /* maybe bump interface rx no buffer counter */
      if (PREDICT_FALSE (xd->stats.rx_nombuf != xd->last_stats.rx_nombuf))
        {
          cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
                                 VNET_INTERFACE_COUNTER_RX_NO_BUF);

          vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
                                         xd->stats.rx_nombuf -
                                         xd->last_stats.rx_nombuf);
        }

      /* missed pkt counter */
      if (PREDICT_FALSE (xd->stats.imissed != xd->last_stats.imissed))
        {
          cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
                                 VNET_INTERFACE_COUNTER_RX_MISS);

          vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
                                         xd->stats.imissed -
                                         xd->last_stats.imissed);
        }
      rxerrors = xd->stats.ibadcrc
        + xd->stats.ibadlen + xd->stats.ierrors;
      last_rxerrors = xd->last_stats.ibadcrc
        + xd->last_stats.ibadlen + xd->last_stats.ierrors;

      if (PREDICT_FALSE (rxerrors != last_rxerrors))
        {
          cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
                                 VNET_INTERFACE_COUNTER_RX_ERROR);

          vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
                                         rxerrors - last_rxerrors);
        }
    }

  /* first call sizes the xstats vector, second fills it */
  if ((len = rte_eth_xstats_get(xd->device_index, NULL, 0)) > 0)
    {
      vec_validate(xd->xstats, len - 1);
      len = rte_eth_xstats_get(xd->device_index, xd->xstats, vec_len(xd->xstats));
      ASSERT(vec_len(xd->xstats) == len);
      _vec_len(xd->xstats) = len;
    }
}
diff --git a/vnet/vnet/devices/dpdk/init.c b/vnet/vnet/devices/dpdk/init.c
new file mode 100644
index 00000000000..a4b0f01475f
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/init.c
@@ -0,0 +1,1728 @@
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <vnet/vnet.h>
#include <vppinfra/vec.h>
#include <vppinfra/error.h>
#include <vppinfra/format.h>

#include <vnet/ethernet/ethernet.h>
#include <vnet/devices/dpdk/dpdk.h>
#include <vlib/unix/physmem.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <string.h>
#include <fcntl.h>

#include "dpdk_priv.h"

dpdk_main_t dpdk_main;

/* force linker to link functions used by vlib and declared weak */
void *vlib_weakly_linked_functions[] = {
  &rte_pktmbuf_init,
  &rte_pktmbuf_pool_init,
};

#define LINK_STATE_ELOGS  0

#define DEFAULT_HUGE_DIR "/run/vpp/hugepages"
#define VPP_RUN_DIR "/run/vpp"

/* Port configuration, mildly modified Intel app values */

static struct rte_eth_conf port_conf_template = {
  .rxmode = {
    .split_hdr_size = 0,
    .header_split   = 0, /**< Header Split disabled */
    .hw_ip_checksum = 0, /**< IP checksum offload disabled */
    .hw_vlan_filter = 0, /**< VLAN filtering disabled */
    .hw_strip_crc   = 1, /**< CRC stripped by hardware */
  },
  .txmode = {
    .mq_mode = ETH_MQ_TX_NONE,
  },
};

/*
 * (Re)configure a DPDK ethernet port: stop it if admin-up, apply the port
 * config, set up one TX queue per worker thread and all RX queues (each RX
 * queue drawing mbufs from the pool on its own NUMA socket), then restart
 * the port if it was admin-up.  Must run on the main thread (cpu 0).
 * Returns 0 on success or a clib error.
 */
clib_error_t *
dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd)
{
  vlib_main_t * vm = vlib_get_main();
  vlib_buffer_main_t * bm = vm->buffer_main;
  int rv;
  int j;

  ASSERT(os_get_cpu_number() == 0);

  /* quiesce the port before reconfiguring it */
  if (xd->admin_up) {
    vnet_hw_interface_set_flags (dm->vnet_main, xd->vlib_hw_if_index, 0);
    rte_eth_dev_stop (xd->device_index);
  }

  rv = rte_eth_dev_configure (xd->device_index, xd->rx_q_used,
                              xd->tx_q_used, &xd->port_conf);

  if (rv < 0)
    return clib_error_return (0, "rte_eth_dev_configure[%d]: err %d",
                              xd->device_index, rv);

  /* Set up one TX-queue per worker thread */
  for (j = 0; j < xd->tx_q_used; j++)
    {
      rv = rte_eth_tx_queue_setup(xd->device_index, j, xd->nb_tx_desc,
                                  xd->cpu_socket, &xd->tx_conf);
      if (rv < 0)
        break;
    }

  if (rv < 0)
    return clib_error_return (0, "rte_eth_tx_queue_setup[%d]: err %d",
                              xd->device_index, rv);

  for (j = 0; j < xd->rx_q_used; j++)
    {

      rv = rte_eth_rx_queue_setup(xd->device_index, j, xd->nb_rx_desc,
                                  xd->cpu_socket, 0,
                                  bm->pktmbuf_pools[xd->cpu_socket_id_by_queue[j]]);
      if (rv < 0)
        return clib_error_return (0, "rte_eth_rx_queue_setup[%d]: err %d",
                                  xd->device_index, rv);
    }

  if (xd->admin_up) {
    rte_eth_dev_start (xd->device_index);
  }
  return 0;
}

/*
 * Ethernet-layer config-change callback: applies promiscuous-mode and MTU
 * changes to the underlying DPDK port.  An MTU change requires a full
 * stop/reconfigure/start cycle.  Returns the previous promisc setting
 * (0 for MTU changes).
 */
static u32 dpdk_flag_change (vnet_main_t * vnm,
                             vnet_hw_interface_t * hi,
                             u32 flags)
{
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance);
  u32 old = 0;

  if (ETHERNET_INTERFACE_FLAG_CONFIG_PROMISC(flags))
    {
      old = xd->promisc;
      xd->promisc = flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL;

      /* only touch the hardware while the port is up; the cached value
       * is applied on the next admin-up otherwise */
      if (xd->admin_up)
        {
          if (xd->promisc)
            rte_eth_promiscuous_enable(xd->device_index);
          else
            rte_eth_promiscuous_disable(xd->device_index);
        }
    }
  else if (ETHERNET_INTERFACE_FLAG_CONFIG_MTU(flags))
    {
      /*
       * DAW-FIXME: The Cisco VIC firmware does not provide an api for a
       *            driver to dynamically change the mtu.  If/when the
       *            VIC firmware gets fixed, then this should be removed.
       */
      if (xd->pmd == VNET_DPDK_PMD_VICE ||
          xd->pmd == VNET_DPDK_PMD_ENIC)
        {
          struct rte_eth_dev_info dev_info;

          /*
           * Restore mtu to what has been set by CIMC in the firmware cfg.
           */
          rte_eth_dev_info_get(xd->device_index, &dev_info);
          hi->max_packet_bytes = dev_info.max_rx_pktlen;

          vlib_cli_output (vlib_get_main(),
                           "Cisco VIC mtu can only be changed "
                           "using CIMC then rebooting the server!");
        }
      else
        {
          int rv;

          /*
           * DAW-FIXME: The DPDK VMXNET3 driver does not currently support
           *            multi-buffer packets.  Max out at 1518 bytes for now.
           *
           *            If/when the driver gets fixed, then this should be
           *            removed.
           */
          if ((xd->pmd == VNET_DPDK_PMD_VMXNET3) &&
              (hi->max_packet_bytes > 1518))
            {
              hi->max_packet_bytes = 1518;

              vlib_cli_output (vlib_get_main(),
                               "VMXNET3 driver does not support jumbo frames "
                               "yet -- setting mtu to 1518!");
            }

          xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes;

          if (xd->admin_up)
            rte_eth_dev_stop (xd->device_index);

          rv = rte_eth_dev_configure
            (xd->device_index,
             xd->rx_q_used,
             xd->tx_q_used,
             &xd->port_conf);

          if (rv < 0)
            vlib_cli_output (vlib_get_main(),
                             "rte_eth_dev_configure[%d]: err %d",
                             xd->device_index, rv);

          rte_eth_dev_set_mtu(xd->device_index, hi->max_packet_bytes);

          if (xd->admin_up)
            rte_eth_dev_start (xd->device_index);
        }
    }
  return old;
}

#ifdef NETMAP
extern int rte_netmap_probe(void);
#endif

/*
 * Discover DPDK ports and build per-device state.  The portion visible
 * here works out which cpus will run input (io threads if configured,
 * otherwise worker threads, otherwise the main thread) before scanning
 * the ports reported by rte_eth_dev_count().
 */
static clib_error_t *
dpdk_lib_init (dpdk_main_t * dm)
{
  u32 nports;
  u32 nb_desc = 0;
  int i;
  clib_error_t * error;
  vlib_main_t * vm = vlib_get_main();
  vlib_thread_main_t * tm = vlib_get_thread_main();
  vnet_sw_interface_t * sw;
  vnet_hw_interface_t * hi;
  dpdk_device_t * xd;
  vlib_thread_registration_t * tr;
  uword * p;

  u32 next_cpu = 0;
  u8 af_packet_port_id = 0;

  dm->input_cpu_first_index = 0;
  dm->input_cpu_count = 1;

  /* find out which cpus will be used for input */
  p = hash_get_mem (tm->thread_registrations_by_name, "io");
  tr = p ?
(vlib_thread_registration_t *) p[0] : 0; + + if (!tr || tr->count == 0) + { + /* no io threads, workers doing input */ + p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + tr = p ? (vlib_thread_registration_t *) p[0] : 0; + } + else + { + dm->have_io_threads = 1; + } + + if (tr && tr->count > 0) + { + dm->input_cpu_first_index = tr->first_index; + dm->input_cpu_count = tr->count; + } + + vec_validate_aligned (dm->devices_by_cpu, tm->n_vlib_mains - 1, + CLIB_CACHE_LINE_BYTES); + + vec_validate_aligned (dm->workers, tm->n_vlib_mains - 1, + CLIB_CACHE_LINE_BYTES); + +#ifdef NETMAP + if(rte_netmap_probe() < 0) + return clib_error_return (0, "rte netmap probe failed"); +#endif + + nports = rte_eth_dev_count(); + if (nports < 1) + { + clib_warning ("DPDK drivers found no ports..."); + } + + if (CLIB_DEBUG > 0) + clib_warning ("DPDK drivers found %d ports...", nports); + + /* + * All buffers are all allocated from the same rte_mempool. + * Thus they all have the same number of data bytes. 
+ */ + dm->vlib_buffer_free_list_index = + vlib_buffer_get_or_create_free_list ( + vm, VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, "dpdk rx"); + + for (i = 0; i < nports; i++) + { + u8 addr[6]; + int j; + struct rte_eth_dev_info dev_info; + clib_error_t * rv; + struct rte_eth_link l; + + /* Create vnet interface */ + vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES); + xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT; + xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT; + xd->cpu_socket = (i8) rte_eth_dev_socket_id(i); + rte_eth_dev_info_get(i, &dev_info); + + memcpy(&xd->tx_conf, &dev_info.default_txconf, + sizeof(struct rte_eth_txconf)); + if (dm->no_multi_seg) + { + xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS; + port_conf_template.rxmode.jumbo_frame = 0; + } + else + { + xd->tx_conf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS; + port_conf_template.rxmode.jumbo_frame = 1; + } + + memcpy(&xd->port_conf, &port_conf_template, sizeof(struct rte_eth_conf)); + + xd->tx_q_used = dev_info.max_tx_queues < tm->n_vlib_mains ? 
+ 1 : tm->n_vlib_mains; + + if (dm->use_rss > 1 && dev_info.max_rx_queues >= dm->use_rss) + { + xd->rx_q_used = dm->use_rss; + xd->port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; + xd->port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP; + } + else + xd->rx_q_used = 1; + + xd->dev_type = VNET_DPDK_DEV_ETH; + if (!xd->pmd) { + + +#define _(s,f) else if (!strcmp(dev_info.driver_name, s)) \ + xd->pmd = VNET_DPDK_PMD_##f; + if (0) + ; + foreach_dpdk_pmd +#undef _ + else + xd->pmd = VNET_DPDK_PMD_UNKNOWN; + + + switch (xd->pmd) { + /* 1G adapters */ + case VNET_DPDK_PMD_E1000EM: + case VNET_DPDK_PMD_IGB: + case VNET_DPDK_PMD_IGBVF: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G; + break; + + /* 10G adapters */ + case VNET_DPDK_PMD_IXGBE: + case VNET_DPDK_PMD_IXGBEVF: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; + xd->nb_rx_desc = DPDK_NB_RX_DESC_10GE; + xd->nb_tx_desc = DPDK_NB_TX_DESC_10GE; + break; + + /* Cisco VIC */ + case VNET_DPDK_PMD_VICE: + case VNET_DPDK_PMD_ENIC: + rte_eth_link_get_nowait(xd->device_index, &l); + if (l.link_speed == 40000) + { + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; + xd->nb_rx_desc = DPDK_NB_RX_DESC_40GE; + xd->nb_tx_desc = DPDK_NB_TX_DESC_40GE; + } + else + { + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; + xd->nb_rx_desc = DPDK_NB_RX_DESC_10GE; + xd->nb_tx_desc = DPDK_NB_TX_DESC_10GE; + } + break; + + /* Intel Fortville */ + case VNET_DPDK_PMD_I40E: + case VNET_DPDK_PMD_I40EVF: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; + xd->nb_rx_desc = DPDK_NB_RX_DESC_40GE; + xd->nb_tx_desc = DPDK_NB_TX_DESC_40GE; + + switch (dev_info.pci_dev->id.device_id) { + case I40E_DEV_ID_10G_BASE_T: + case I40E_DEV_ID_SFP_XL710: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; + break; + case I40E_DEV_ID_QSFP_A: + case I40E_DEV_ID_QSFP_B: + case I40E_DEV_ID_QSFP_C: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; + break; + case I40E_DEV_ID_VF: + rte_eth_link_get_nowait(xd->device_index, &l); + xd->port_type = l.link_speed == 10000 ? 
+ VNET_DPDK_PORT_TYPE_ETH_10G : VNET_DPDK_PORT_TYPE_ETH_40G; + break; + default: + xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; + } + break; + + /* Intel Red Rock Canyon */ + case VNET_DPDK_PMD_FM10K: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_SWITCH; + xd->nb_rx_desc = DPDK_NB_RX_DESC_40GE; + xd->nb_tx_desc = DPDK_NB_TX_DESC_40GE; + break; + + /* virtio */ + case VNET_DPDK_PMD_VIRTIO: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G; + xd->nb_rx_desc = DPDK_NB_RX_DESC_VIRTIO; + xd->nb_tx_desc = DPDK_NB_TX_DESC_VIRTIO; + break; + + /* vmxnet3 */ + case VNET_DPDK_PMD_VMXNET3: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G; + xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS; + break; + + case VNET_DPDK_PMD_AF_PACKET: + xd->port_type = VNET_DPDK_PORT_TYPE_AF_PACKET; + xd->af_packet_port_id = af_packet_port_id++; + break; + + default: + xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; + } + + #ifdef NETMAP + if(strncmp(dev_info.driver_name, "vale", 4) == 0 + || strncmp(dev_info.driver_name, "netmap", 6) == 0) + { + xd->pmd = VNET_DPDK_PMD_NETMAP; + xd->port_type = VNET_DPDK_PORT_TYPE_NETMAP; + } + #endif + + } + + /* + * Ensure default mtu is not > the mtu read from the hardware. + * Otherwise rte_eth_dev_configure() will fail and the port will + * not be available. + */ + xd->port_conf.rxmode.max_rx_pkt_len = + (ETHERNET_MAX_PACKET_BYTES > dev_info.max_rx_pktlen) ? 
+ dev_info.max_rx_pktlen : ETHERNET_MAX_PACKET_BYTES; + + /* + * DAW-FIXME: VMXNET3 driver doesn't support jumbo / multi-buffer pkts + */ + if (xd->pmd == VNET_DPDK_PMD_VMXNET3) + { + xd->port_conf.rxmode.max_rx_pkt_len = 1518; + xd->port_conf.rxmode.jumbo_frame = 0; + } + + if (xd->pmd == VNET_DPDK_PMD_AF_PACKET) + { + f64 now = vlib_time_now(vm); + u32 rnd; + rnd = (u32) (now * 1e6); + rnd = random_u32 (&rnd); + memcpy (addr+2, &rnd, sizeof(rnd)); + addr[0] = 2; + addr[1] = 0xfe; + } + else + rte_eth_macaddr_get(i,(struct ether_addr *)addr); + + if (xd->tx_q_used < tm->n_vlib_mains) + { + xd->lockp = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, + CLIB_CACHE_LINE_BYTES); + memset ((void *) xd->lockp, 0, CLIB_CACHE_LINE_BYTES); + } + + xd->device_index = xd - dm->devices; + ASSERT(i == xd->device_index); + xd->per_interface_next_index = ~0; + + /* assign interface to input thread */ + dpdk_device_and_queue_t * dq; + int q; + + for (q = 0; q < xd->rx_q_used; q++) + { + int cpu = dm->input_cpu_first_index + next_cpu; + unsigned lcore = vlib_worker_threads[cpu].dpdk_lcore_id; + + /* + * numa node for worker thread handling this queue + * needed for taking buffers from the right mempool + */ + vec_validate(xd->cpu_socket_id_by_queue, q); + xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id(lcore); + + /* + * construct vector of (device,queue) pairs for each worker thread + */ + vec_add2(dm->devices_by_cpu[cpu], dq, 1); + dq->device = xd->device_index; + dq->queue_id = q; + + next_cpu++; + if (next_cpu == dm->input_cpu_count) + next_cpu = 0; + } + + vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains, + CLIB_CACHE_LINE_BYTES); + for (j = 0; j < tm->n_vlib_mains; j++) + { + vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE, + sizeof(tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES); + vec_reset_length (xd->tx_vectors[j]); + } + + vec_validate_aligned (xd->rx_vectors, xd->rx_q_used, + CLIB_CACHE_LINE_BYTES); + for (j = 0; j< xd->rx_q_used; j++) + { + 
vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE-1, + CLIB_CACHE_LINE_BYTES); + vec_reset_length (xd->rx_vectors[j]); + } + + vec_validate_aligned (xd->frames, tm->n_vlib_mains, + CLIB_CACHE_LINE_BYTES); + + rv = dpdk_port_setup(dm, xd); + + if (rv < 0) + return rv; + + /* count the number of descriptors used for this device */ + nb_desc += xd->nb_rx_desc + xd->nb_tx_desc * xd->tx_q_used; + + error = ethernet_register_interface + (dm->vnet_main, + dpdk_device_class.index, + xd->device_index, + /* ethernet address */ addr, + &xd->vlib_hw_if_index, + dpdk_flag_change); + if (error) + return error; + + sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index); + xd->vlib_sw_if_index = sw->sw_if_index; + hi = vnet_get_hw_interface (dm->vnet_main, xd->vlib_hw_if_index); + + /* + * DAW-FIXME: The Cisco VIC firmware does not provide an api for a + * driver to dynamically change the mtu. If/when the + * VIC firmware gets fixed, then this should be removed. + */ + if (xd->pmd == VNET_DPDK_PMD_VICE || + xd->pmd == VNET_DPDK_PMD_ENIC) + { + /* + * Initialize mtu to what has been set by CIMC in the firmware cfg. + */ + hi->max_packet_bytes = dev_info.max_rx_pktlen; + /* + * remove vlan tag from VIC port to fix VLAN0 issue. 
+ * TODO Handle VLAN tagged traffic + */ + int vlan_off; + vlan_off = rte_eth_dev_get_vlan_offload(xd->device_index); + vlan_off |= ETH_VLAN_STRIP_OFFLOAD; + rte_eth_dev_set_vlan_offload(xd->device_index, vlan_off); + } + /* + * DAW-FIXME: VMXNET3 driver doesn't support jumbo / multi-buffer pkts + */ + else if (xd->pmd == VNET_DPDK_PMD_VMXNET3) + hi->max_packet_bytes = 1518; + + hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] = + xd->port_conf.rxmode.max_rx_pkt_len - sizeof(ethernet_header_t); + + rte_eth_dev_set_mtu(xd->device_index, hi->max_packet_bytes); + } + + if (dm->num_kni) { + clib_warning("Initializing KNI interfaces..."); + rte_kni_init(dm->num_kni); + for (i = 0; i < dm->num_kni; i++) + { + u8 addr[6]; + int j; + + /* Create vnet interface */ + vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES); + xd->dev_type = VNET_DPDK_DEV_KNI; + + xd->device_index = xd - dm->devices; + ASSERT(nports + i == xd->device_index); + xd->per_interface_next_index = ~0; + xd->kni_port_id = i; + xd->cpu_socket = -1; + hash_set (dm->dpdk_device_by_kni_port_id, i, xd - dm->devices); + xd->rx_q_used = 1; + + /* assign interface to input thread */ + dpdk_device_and_queue_t * dq; + vec_add2(dm->devices_by_cpu[dm->input_cpu_first_index], dq, 1); + dq->device = xd->device_index; + dq->queue_id = 0; + + vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains, + CLIB_CACHE_LINE_BYTES); + for (j = 0; j < tm->n_vlib_mains; j++) + { + vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE, + sizeof(tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES); + vec_reset_length (xd->tx_vectors[j]); + } + + vec_validate_aligned (xd->rx_vectors, xd->rx_q_used, + CLIB_CACHE_LINE_BYTES); + for (j = 0; j< xd->rx_q_used; j++) + { + vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE-1, + CLIB_CACHE_LINE_BYTES); + vec_reset_length (xd->rx_vectors[j]); + } + + vec_validate_aligned (xd->frames, tm->n_vlib_mains, + CLIB_CACHE_LINE_BYTES); + + /* FIXME Set up one TX-queue per worker 
thread */ + + { + f64 now = vlib_time_now(vm); + u32 rnd; + rnd = (u32) (now * 1e6); + rnd = random_u32 (&rnd); + + memcpy (addr+2, &rnd, sizeof(rnd)); + addr[0] = 2; + addr[1] = 0xfe; + } + + error = ethernet_register_interface + (dm->vnet_main, + dpdk_device_class.index, + xd->device_index, + /* ethernet address */ addr, + &xd->vlib_hw_if_index, + dpdk_flag_change); + + if (error) + return error; + + sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index); + xd->vlib_sw_if_index = sw->sw_if_index; + hi = vnet_get_hw_interface (dm->vnet_main, xd->vlib_hw_if_index); + } + } + + if (nb_desc > dm->num_mbufs) + clib_warning ("%d mbufs allocated but total rx/tx ring size is %d\n", + dm->num_mbufs, nb_desc); + + /* init next vhost-user if index */ + dm->next_vu_if_id = 0; + + return 0; +} + +/* + * Tell the vlib physical memory allocator that we've handled + * the initialization. We don't actually do so until + * vlib_main(...) callls the dpdk config function. + */ +int vlib_app_physmem_init (vlib_main_t * vm, physmem_main_t * pm, + int physmem_required) +{ + return 1; +} + +static clib_error_t * +write_sys_fs (char * file_name, char * fmt, ...) 
+{ + u8 * s; + int fd; + + fd = open (file_name, O_WRONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + va_list va; + va_start (va, fmt); + s = va_format (0, fmt, &va); + va_end (va); + vec_add1 (s, 0); // terminate c string + + if (write (fd, s, vec_len (s)) < 0) + return clib_error_return_unix (0, "write '%s' to '%s'", s, file_name); + + vec_free (s); + close (fd); + return 0; +} + +#define VIRTIO_PCI_NAME "virtio-pci" + +static clib_error_t * dpdk_bind_eth_kernel_drivers (vlib_main_t * vm, + char * pci_dev_id, + char * kernel_driver) +{ + dpdk_main_t * dm = &dpdk_main; + unformat_input_t _in; + unformat_input_t * in = &_in; + clib_error_t * error = 0; + u8 * line = 0, * modcmd = 0, * path = 0; + u8 * pci_vid = 0, *pci_did = 0, * devname = 0; + char *driver_name = kernel_driver; + FILE * fp; + + /* + * Bail out now if we're not running as root. + * This allows non-privileged use of the packet generator, etc. + */ + if (geteuid() != 0) + return 0; + + /* + * Get all ethernet pci device numbers for the device type specified. + */ + modcmd = format (0, "lspci -nDd %s | grep 0200 | " + "awk '{ print $1, $3 }'%c", pci_dev_id, 0); + if ((fp = popen ((const char *)modcmd, "r")) == NULL) + { + error = clib_error_return_unix (0, + "Unable to get %s ethernet pci devices.", + pci_dev_id); + goto done; + } + + vec_validate (line, BUFSIZ); + vec_validate (path, BUFSIZ); + while (fgets ((char *)line, BUFSIZ, fp) != NULL) + { + struct stat st; + u8 bind_uio = 1; + line[strlen ((char *)line) - 1] = 0; // chomp trailing newline. 
+ + unformat_init_string (in, (char *)line, strlen((char *)line) + 1); + unformat(in, "%s %s:%s", &devname, &pci_vid, &pci_did); + unformat_free (in); + + /* + * Blacklist all ethernet interfaces in the + * linux IP routing tables (route --inet --inet6) + */ + if (strstr ((char *)dm->eth_if_blacklist, (char *)devname)) + continue; + + /* + * If there are any devices whitelisted, then blacklist all devices + * which are not explicitly whitelisted. + */ + if (dm->eth_if_whitelist && + !strstr ((char *)dm->eth_if_whitelist, (char *)devname)) + continue; + +#ifdef NETMAP + /* + * Optimistically open the device as a netmap device. + */ + if (eth_nm_open((char *)devname)) + continue; +#endif + + _vec_len (path) = 0; + path = format (path, "/sys/bus/pci/devices/%s/driver/unbind%c", + devname, 0); + + /* + * If the device is bound to a driver... + */ + if (stat ((const char *)path, &st) == 0) + { + u8 * device_path; + + /* + * If the interface is not a virtio... + */ + if (!driver_name || strcmp(driver_name, VIRTIO_PCI_NAME)) + { + /* + * If it is already bound to driver, don't unbind/bind it. + */ + device_path = format (0, "/sys/bus/pci/drivers/%s/%s/device%c", + driver_name, devname, 0); + if (stat ((const char *)device_path, &st) == 0) + bind_uio = 0; + + vec_free (device_path); + } + + /* + * unbind it from the current driver + */ + if (bind_uio) + { + _vec_len (path) -= 1; + path = format (path, "%c", 0); + error = write_sys_fs ((char *)path, "%s", devname); + if (error) + goto done; + } + } + + /* + * DAW-FIXME: The following bind/unbind dance is necessary for the dpdk + * virtio poll-mode driver to work. 
+ */ + + if (driver_name && !strcmp(driver_name, VIRTIO_PCI_NAME)) + { + /* + * bind interface to the native kernel module + */ + _vec_len (path) = 0; + path = format (path, "/sys/bus/pci/drivers/%s/bind%c", + driver_name, 0); + error = write_sys_fs ((char *)path, "%s", devname); + if (error) + goto done; + + /* + * unbind interface from the native kernel module + */ + _vec_len (path) -= 5; + path = format (path, "unbind%c", 0); + error = write_sys_fs ((char *)path, "%s", devname); + if (error) + goto done; + } + + /* + * bind the interface to igb_uio + */ + if (bind_uio) + { + int pci_vendor_id = strtol((char *) pci_vid, NULL, 16); + int pci_device_id = strtol((char *) pci_did, NULL, 16); + + /* + * Set PCI ID to ".../virtio-pci/new_id" for Intel fortvile adapaters + */ + if (pci_vendor_id == 0x8086 && + (pci_device_id == I40E_DEV_ID_10G_BASE_T || + pci_device_id == I40E_DEV_ID_SFP_XL710 || + pci_device_id == I40E_DEV_ID_QSFP_A || + pci_device_id == I40E_DEV_ID_QSFP_B || + pci_device_id == I40E_DEV_ID_QSFP_C)) + { + _vec_len (path) = 0; + path = format (path, "/sys/bus/pci/drivers/%s/new_id%c", driver_name, 0); + error = write_sys_fs ((char *) path, "%s %s", pci_vid, pci_did); + if (error) + continue; + } + + _vec_len (path) = 0; + path = format (path, "/sys/bus/pci/drivers/%s/bind%c", driver_name, 0); + error = write_sys_fs ((char *) path, "%s", devname); + if (error) + { + error = 0; + continue; + } + } + } + + done: + vec_free (line); + vec_free (path); + vec_free (devname); + vec_free (pci_vid); + vec_free (pci_did); + vec_free (modcmd); + pclose (fp); + return error; +} + +static uword +unformat_socket_mem (unformat_input_t * input, va_list * va) +{ + uword ** r = va_arg (* va, uword **); + int i = 0; + u32 mem; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, ",")) + hash_set (*r, i, 1024); + else if (unformat (input, "%u,", &mem)) + hash_set (*r, i, mem); + else if (unformat (input, "%u", &mem)) + hash_set (*r, i, 
mem); + else + { + unformat_put_input (input); + goto done; + } + i++; + } + +done: + return 1; +} + +static u32 +get_node_free_hugepages_num (u32 node, u32 page_size) +{ + FILE * fp; + u8 * tmp; + + tmp = format (0, "/sys/devices/system/node/node%u/hugepages/hugepages-%ukB/" + "free_hugepages%c", node, page_size, 0); + fp = fopen ((char *) tmp, "r"); + vec_free(tmp); + + if (fp != NULL) + { + u8 * buffer = 0; + u32 pages_avail = 0; + + vec_validate (buffer, 256-1); + if (fgets ((char *)buffer, 256, fp)) + { + unformat_input_t in; + unformat_init_string (&in, (char *) buffer, strlen ((char *) buffer)); + unformat(&in, "%u", &pages_avail); + unformat_free (&in); + } + vec_free(buffer); + fclose(fp); + return pages_avail; + } + + return 0; +} + +static clib_error_t * +dpdk_config (vlib_main_t * vm, unformat_input_t * input) +{ + clib_error_t * error = 0; + dpdk_main_t * dm = &dpdk_main; + vlib_thread_main_t * tm = vlib_get_thread_main(); + u8 * s, * tmp = 0; + u8 * pci_dev_id = 0; + u8 * rte_cmd = 0, * ethname = 0; + FILE * rte_fp; + u32 log_level; + int ret, i; + char * fmt; +#ifdef NETMAP + int rxrings, txrings, rxslots, txslots, txburst; + char * nmnam; +#endif + unformat_input_t _in; + unformat_input_t * in = &_in; + u8 no_pci = 0; + u8 no_huge = 0; + u8 huge_dir = 0; + u8 file_prefix = 0; + u8 * socket_mem = 0; + + // MATT-FIXME: inverted virtio-vhost logic to use virtio by default + dm->use_virtio_vhost = 1; + + while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT) + { + /* Prime the pump */ + if (unformat (input, "no-hugetlb")) + { + vec_add1 (dm->eal_init_args, (u8 *) "no-huge"); + no_huge = 1; + } + + else if (unformat (input, "decimal-interface-names")) + dm->interface_name_format_decimal = 1; + + else if (unformat (input, "no-multi-seg")) + dm->no_multi_seg = 1; + + else if (unformat (input, "dev %s", &pci_dev_id)) + { + if (dm->eth_if_whitelist) + { + /* + * Don't add duplicate device id's. 
+ */ + if (strstr ((char *)dm->eth_if_whitelist, (char *)pci_dev_id)) + continue; + + _vec_len (dm->eth_if_whitelist) -= 1; // chomp trailing NULL. + dm->eth_if_whitelist = format (dm->eth_if_whitelist, " %s%c", + pci_dev_id, 0); + } + else + dm->eth_if_whitelist = format (0, "%s%c", pci_dev_id, 0); + } + +#ifdef NETMAP + else if (unformat(input, "netmap %s/%d:%d/%d:%d/%d", + &nmname, &rxrings, &rxslots, &txrings, &txslots, &txburst)) { + char * rv; + rv = (char *) + eth_nm_args(nmname, rxrings, rxslots, txrings, txslots, txburst); + if (rv) { + error = clib_error_return (0, "%s", rv); + goto done; + } + }else if (unformat(input, "netmap %s", &nmname)) { + char * rv; + rv = (char *) + eth_nm_args(nmname, 0, 0, 0, 0, 0); + if (rv) { + error = clib_error_return (0, "%s", rv); + goto done; + } + } +#endif + + else if (unformat (input, "num-mbufs %d", &dm->num_mbufs)) + ; + else if (unformat (input, "kni %d", &dm->num_kni)) + ; + else if (unformat (input, "uio-driver %s", &dm->uio_driver_name)) + ; + else if (unformat (input, "vhost-user-coalesce-frames %d", &dm->vhost_coalesce_frames)) + ; + else if (unformat (input, "vhost-user-coalesce-time %f", &dm->vhost_coalesce_time)) + ; + else if (unformat (input, "enable-vhost-user")) + dm->use_virtio_vhost = 0; + else if (unformat (input, "rss %d", &dm->use_rss)) + ; + +#define _(a) \ + else if (unformat(input, #a)) \ + { \ + if (!strncmp(#a, "no-pci", 6)) \ + no_pci = 1; \ + tmp = format (0, "--%s%c", #a, 0); \ + vec_add1 (dm->eal_init_args, tmp); \ + } + foreach_eal_double_hyphen_predicate_arg +#undef _ + +#define _(a) \ + else if (unformat(input, #a " %s", &s)) \ + { \ + if (!strncmp(#a, "huge-dir", 8)) \ + huge_dir = 1; \ + else if (!strncmp(#a, "file-prefix", 11)) \ + file_prefix = 1; \ + else if (!strncmp(#a, "socket-mem", 10)) \ + socket_mem = vec_dup (s); \ + tmp = format (0, "--%s%c", #a, 0); \ + vec_add1 (dm->eal_init_args, tmp); \ + vec_add1 (s, 0); \ + vec_add1 (dm->eal_init_args, s); \ + } + 
foreach_eal_double_hyphen_arg +#undef _ + +#define _(a,b) \ + else if (unformat(input, #a " %s", &s)) \ + { \ + tmp = format (0, "-%s%c", #b, 0); \ + vec_add1 (dm->eal_init_args, tmp); \ + vec_add1 (s, 0); \ + vec_add1 (dm->eal_init_args, s); \ + } + foreach_eal_single_hyphen_arg +#undef _ + +#define _(a,b) \ + else if (unformat(input, #a " %s", &s)) \ + { \ + tmp = format (0, "-%s%c", #b, 0); \ + vec_add1 (dm->eal_init_args, tmp); \ + vec_add1 (s, 0); \ + vec_add1 (dm->eal_init_args, s); \ + dm->a##_set_manually = 1; \ + } + foreach_eal_single_hyphen_mandatory_arg +#undef _ + + else if (unformat(input, "default")) + ; + + else + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + goto done; + } + } + + if (!dm->uio_driver_name) + dm->uio_driver_name = format (0, "igb_uio"); + + /* + * Use 1G huge pages if available. + */ + if (!no_huge && !huge_dir) + { + uword * mem_by_socket = hash_create (0, sizeof (uword)); + uword c; + u8 use_1g = 1; + u8 use_2m = 1; + int rv; + + umount(DEFAULT_HUGE_DIR); + + /* Process "socket-mem" parameter value */ + if (vec_len (socket_mem)) + { + unformat_input_t in; + unformat_init_vector(&in, socket_mem); + unformat(&in, "%U", unformat_socket_mem, &mem_by_socket); + unformat_free(&in); + } + else + use_1g = 0; + + /* check if available enough 1GB pages for each socket */ + clib_bitmap_foreach (c, tm->cpu_socket_bitmap, ({ + uword * p = hash_get (mem_by_socket, c); + if (p) + { + u32 mem = p[0]; + if (mem) + { + u32 pages_num_1g = mem / 1024; + u32 pages_num_2m = mem / 2; + u32 pages_avail; + + pages_avail = get_node_free_hugepages_num(c, 1048576); + if (!(pages_avail >= pages_num_1g)) + use_1g = 0; + + pages_avail = get_node_free_hugepages_num(c, 2048); + if (!(pages_avail >= pages_num_2m)) + use_2m = 0; + } + } + })); + + hash_free (mem_by_socket); + + rv = mkdir(VPP_RUN_DIR, 0755); + if (rv && errno != EEXIST) + { + error = clib_error_return (0, "mkdir '%s' failed errno %d", + VPP_RUN_DIR, 
errno); + goto done; + } + + rv = mkdir(DEFAULT_HUGE_DIR, 0755); + if (rv && errno != EEXIST) + { + error = clib_error_return (0, "mkdir '%s' failed errno %d", + DEFAULT_HUGE_DIR, errno); + goto done; + } + + if (use_1g) + { + rv = mount("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, "pagesize=1G"); + } + else if (use_2m) + { + rv = mount("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, NULL); + } + else + { + return clib_error_return (0, "not enough free huge pages"); + } + + if (rv) + { + error = clib_error_return (0, "mount failed %d", errno); + goto done; + } + + tmp = format (0, "--huge-dir%c", 0); + vec_add1 (dm->eal_init_args, tmp); + tmp = format (0, "%s%c", DEFAULT_HUGE_DIR, 0); + vec_add1 (dm->eal_init_args, tmp); + if (!file_prefix) + { + tmp = format (0, "--file-prefix%c", 0); + vec_add1 (dm->eal_init_args, tmp); + tmp = format (0, "vpp%c", 0); + vec_add1 (dm->eal_init_args, tmp); + } + } + + /* + * Blacklist all ethernet interfaces in the linux IP routing tables. + */ + dm->eth_if_blacklist = format (0, "%c", 0); + rte_cmd = format (0, "route --inet --inet6 -n|awk '{print $7}'|sort -u|" + "egrep $(echo $(ls -1d /sys/class/net/*/device|" + "cut -d/ -f5)|sed -s 's/ /|/g')%c", 0); + if ((rte_fp = popen ((const char *)rte_cmd, "r")) == NULL) + { + error = clib_error_return_unix (0, "Unable to find blacklist ethernet" + " interface(s) in linux routing tables."); + goto rte_cmd_err; + + } + + vec_validate (ethname, BUFSIZ); + while (fgets ((char *)ethname, BUFSIZ, rte_fp) != NULL) + { + FILE *rlnk_fp; + u8 * rlnk_cmd = 0, * devname = 0; + + ethname[strlen ((char *)ethname) - 1] = 0; // chomp trailing newline. 
+ + rlnk_cmd = format (0, "readlink /sys/class/net/%s%c", + ethname, 0); + + if ((rlnk_fp = popen ((const char *)rlnk_cmd, "r")) == NULL) + { + error = clib_error_return_unix (0, "Unable to read %s link.", + ethname); + goto rlnk_cmd_err; + } + + vec_validate (devname, BUFSIZ); + while (fgets ((char *)devname, BUFSIZ, rlnk_fp) != NULL) + { + char * pci_id = 0; + + /* + * Extract the device PCI ID name from the link. It is the first + * PCI ID searching backwards from the end of the link pathname. + * For example: + * readlink /sys/class/net/eth0 + * ../../devices/pci0000:00/0000:00:0a.0/virtio4/net/eth0 + */ + for (pci_id = (char *)((devname + strlen((char *)devname))); + ((u8 *)pci_id > devname) && *pci_id != '.'; pci_id--) + ; + + /* + * Verify that the field found is a valid PCI ID. + */ + if ((*(pci_id - 1) == '.') || ((u8 *)(pci_id - 11) < devname) || + (*(pci_id - 11) != '/') || (*(pci_id - 3) != ':') || + (*(pci_id - 6) != ':')) + { + devname[strlen ((char *)devname) - 1] = 0; // chomp trailing newline. + clib_warning ("Unable to extract %s PCI ID (0x%llx \"%s\") " + "from 0x%llx \"%s\"", ethname, pci_id, pci_id, + devname, devname); + continue; + } + + pci_id[2] = 0; + pci_id -= 10; + + /* Don't blacklist any interfaces which have been whitelisted. + */ + if (dm->eth_if_whitelist && + strstr ((char *)dm->eth_if_whitelist, (char *)pci_id)) + continue; + + _vec_len (dm->eth_if_blacklist) -= 1; // chomp trailing NULL. + dm->eth_if_blacklist = format (dm->eth_if_blacklist, " %s%c", + pci_id, 0); + } + + rlnk_cmd_err: + pclose (rlnk_fp); + vec_free (rlnk_cmd); + vec_free (devname); + } + + rte_cmd_err: + pclose (rte_fp); + vec_free (rte_cmd); + vec_free (ethname); + + if (error) + return error; + + /* I'll bet that -c and -n must be the first and second args... 
*/ + if (!dm->coremask_set_manually) + { + vlib_thread_registration_t * tr; + uword coremask; + int i; + + /* main thread core */ + coremask = 1 << tm->main_lcore; + + for (i = 0; i < vec_len (tm->registrations); i++) + { + tr = tm->registrations[i]; + if (clib_bitmap_is_zero(tr->coremask)) + continue; + coremask |= tr->coremask[0]; + } + + vec_insert (dm->eal_init_args, 2, 1); + dm->eal_init_args[1] = (u8 *) "-c"; + tmp = format (0, "%x%c", coremask, 0); + dm->eal_init_args[2] = tmp; + } + + if (!dm->nchannels_set_manually) + { + vec_insert (dm->eal_init_args, 2, 3); + dm->eal_init_args[3] = (u8 *) "-n"; + tmp = format (0, "%d", dm->nchannels); + dm->eal_init_args[4] = tmp; + } + + /* + * If there are whitelisted devices, + * add the whitelist option & device list to the dpdk arg list... + */ + if (dm->eth_if_whitelist) + { + unformat_init_string (in, (char *)dm->eth_if_whitelist, + vec_len(dm->eth_if_whitelist) - 1); + fmt = "-w%c"; + } + + /* + * Otherwise add the blacklisted devices to the dpdk arg list. + */ + else + { + unformat_init_string (in, (char *)dm->eth_if_blacklist, + vec_len(dm->eth_if_blacklist) - 1); + fmt = "-b%c"; + } + + while (unformat_check_input (in) != UNFORMAT_END_OF_INPUT) + { + tmp = format (0, fmt, 0); + vec_add1 (dm->eal_init_args, tmp); + unformat (in, "%s", &pci_dev_id); + vec_add1 (dm->eal_init_args, pci_dev_id); + } + + if (no_pci == 0) + { + /* + * Bind Virtio pci devices to the igb_uio kernel driver. + */ + error = dpdk_bind_eth_kernel_drivers (vm, "1af4:1000", VIRTIO_PCI_NAME); + if (error) + return error; + + /* + * Bind vmxnet3 pci devices to the igb_uio kernel driver. + */ + error = dpdk_bind_eth_kernel_drivers (vm, "15ad:07b0", + (char *) dm->uio_driver_name); + if (error) + return error; + + /* + * Bind Intel ethernet pci devices to igb_uio kernel driver. + */ + error = dpdk_bind_eth_kernel_drivers (vm, "8086:", + (char *) dm->uio_driver_name); + /* + * Bind Cisco VIC ethernet pci devices to igb_uio kernel driver. 
+ */ + error = dpdk_bind_eth_kernel_drivers (vm, "1137:0043", + (char *) dm->uio_driver_name); + } + + /* set master-lcore */ + tmp = format (0, "--master-lcore%c", 0); + vec_add1 (dm->eal_init_args, tmp); + tmp = format (0, "%u%c", tm->main_lcore, 0); + vec_add1 (dm->eal_init_args, tmp); + + /* NULL terminate the "argv" vector, in case of stupidity */ + vec_add1 (dm->eal_init_args, 0); + _vec_len(dm->eal_init_args) -= 1; + + /* Set up DPDK eal and packet mbuf pool early. */ + + log_level = (CLIB_DEBUG > 0) ? RTE_LOG_DEBUG : RTE_LOG_NOTICE; + + rte_set_log_level (log_level); + + vm = dm->vlib_main; + + ret = rte_eal_init(vec_len(dm->eal_init_args), (char **) dm->eal_init_args); + + /* lazy umount hugepages */ + umount2(DEFAULT_HUGE_DIR, MNT_DETACH); + + if (ret < 0) + return clib_error_return (0, "rte_eal_init returned %d", ret); + + /* main thread 1st */ + error = vlib_buffer_pool_create(vm, dm->num_mbufs, MBUF_SIZE, rte_socket_id()); + if (error) + return error; + + for (i = 0; i < RTE_MAX_LCORE; i++) + { + error = vlib_buffer_pool_create(vm, dm->num_mbufs, MBUF_SIZE, + rte_lcore_to_socket_id(i)); + if (error) + return error; + } + + if (dm->use_rss) + { + vlib_node_runtime_t * rt = vlib_node_get_runtime (vm, dpdk_input_node.index); + rt->function = dpdk_input_rss; + } + done: + return error; +} + +VLIB_CONFIG_FUNCTION (dpdk_config, "dpdk"); + +void dpdk_update_link_state (dpdk_device_t * xd, f64 now) +{ + vnet_main_t * vnm = vnet_get_main(); + struct rte_eth_link prev_link = xd->link; + u32 hw_flags = 0; + u8 hw_flags_chg = 0; + + /* only update link state for PMD interfaces */ + if (xd->dev_type != VNET_DPDK_DEV_ETH) + return; + + xd->time_last_link_update = now ? 
now : xd->time_last_link_update; + memset(&xd->link, 0, sizeof(xd->link)); + rte_eth_link_get_nowait (xd->device_index, &xd->link); + + if (LINK_STATE_ELOGS) + { + vlib_main_t * vm = vlib_get_main(); + ELOG_TYPE_DECLARE(e) = { + .format = + "update-link-state: sw_if_index %d, admin_up %d," + "old link_state %d new link_state %d", + .format_args = "i4i1i1i1", + }; + + struct { u32 sw_if_index; u8 admin_up; + u8 old_link_state; u8 new_link_state;} *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->sw_if_index = xd->vlib_sw_if_index; + ed->admin_up = xd->admin_up; + ed->old_link_state = (u8) + vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index); + ed->new_link_state = (u8) xd->link.link_status; + } + + if ((xd->admin_up == 1) && + ((xd->link.link_status != 0) ^ + vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index))) + { + hw_flags_chg = 1; + hw_flags |= (xd->link.link_status ? + VNET_HW_INTERFACE_FLAG_LINK_UP: 0); + } + + if (hw_flags_chg || (xd->link.link_duplex != prev_link.link_duplex)) + { + hw_flags_chg = 1; + switch (xd->link.link_duplex) + { + case ETH_LINK_HALF_DUPLEX: + hw_flags |= VNET_HW_INTERFACE_FLAG_HALF_DUPLEX; + break; + case ETH_LINK_FULL_DUPLEX: + hw_flags |= VNET_HW_INTERFACE_FLAG_FULL_DUPLEX; + break; + default: + break; + } + } + if (hw_flags_chg || (xd->link.link_speed != prev_link.link_speed)) + { + hw_flags_chg = 1; + switch (xd->link.link_speed) + { + case ETH_LINK_SPEED_10: + hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10M; + break; + case ETH_LINK_SPEED_100: + hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_100M; + break; + case ETH_LINK_SPEED_1000: + hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_1G; + break; + case ETH_LINK_SPEED_10000: + hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10G; + break; + case ETH_LINK_SPEED_40G: + hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_40G; + break; + case 0: + break; + default: + clib_warning("unknown link speed %d", xd->link.link_speed); + break; + } + } + if (hw_flags_chg) + { + if (LINK_STATE_ELOGS) + { + vlib_main_t 
* vm = vlib_get_main();

          ELOG_TYPE_DECLARE(e) = {
            .format = "update-link-state: sw_if_index %d, new flags %d",
            .format_args = "i4i4",
          };

          struct { u32 sw_if_index; u32 flags; } *ed;
          ed = ELOG_DATA (&vm->elog_main, e);
          ed->sw_if_index = xd->vlib_sw_if_index;
          ed->flags = hw_flags;
        }
      vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, hw_flags);
    }
}

/*
 * Main dpdk process node: runs dpdk_lib_init() once, enables the dpdk
 * input node(s) on the appropriate threads, releases the io threads,
 * then polls forever updating per-device counters and link state and
 * servicing vhost-user interfaces.
 */
static uword
dpdk_process (vlib_main_t * vm,
              vlib_node_runtime_t * rt,
              vlib_frame_t * f)
{
  clib_error_t * error;
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd;
  vlib_thread_main_t * tm = vlib_get_thread_main();
  void *vu_state;
  int i;

  error = dpdk_lib_init (dm);

  /*
   * Turn on the input node if we found some devices to drive
   * and we're not running worker threads or i/o threads
   */

  if (error == 0 && vec_len(dm->devices) > 0)
    {
      if (tm->n_vlib_mains == 1)
        vlib_node_set_state (vm, dpdk_input_node.index,
                             VLIB_NODE_STATE_POLLING);
      else if (tm->main_thread_is_io_node)
        vlib_node_set_state (vm, dpdk_io_input_node.index,
                             VLIB_NODE_STATE_POLLING);
      else if (!dm->have_io_threads)
        for (i=0; i < tm->n_vlib_mains; i++)
          if (vec_len(dm->devices_by_cpu[i]) > 0)
            vlib_node_set_state (vlib_mains[i], dpdk_input_node.index,
                                 VLIB_NODE_STATE_POLLING);
    }

  if (error)
    clib_error_report (error);

  dpdk_vhost_user_process_init(&vu_state);

  /* release any io threads spinning on io_thread_release in dpdk_io_thread() */
  dm->io_thread_release = 1;

  f64 now = vlib_time_now (vm);
  vec_foreach (xd, dm->devices)
    {
      dpdk_update_link_state (xd, now);
    }

  while (1)
    {
      vlib_process_wait_for_event_or_clock (vm, 5.0);

      if (dpdk_get_admin_up_down_in_progress())
          /* skip the poll if an admin up down is in progress (on any interface) */
          continue;

      vec_foreach (xd, dm->devices)
        {
          f64 now = vlib_time_now (vm);
          if ((now - xd->time_last_stats_update) >= DPDK_STATS_POLL_INTERVAL)
            dpdk_update_counters (xd, now);
          if ((now - xd->time_last_link_update) >= DPDK_LINK_POLL_INTERVAL)
            dpdk_update_link_state (xd, now);

          if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
            if (dpdk_vhost_user_process_if(vm, xd, vu_state) != 0)
              continue;
        }
    }

  /* NOTE(review): unreachable — the poll loop above never exits, so this
   * cleanup and return are dead code. */
  dpdk_vhost_user_process_cleanup(vu_state);

  return 0;
}

VLIB_REGISTER_NODE (dpdk_process_node,static) = {
    .function = dpdk_process,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "dpdk-process",
    .process_log2_n_stack_bytes = 17,
};

/*
 * One-time dpdk module init: sanity-checks struct layout, forces the
 * linker to pull in the DPDK PMD constructors, and seeds dpdk_main
 * defaults (mbufs, hashes, EFD thresholds, vhost coalescing).
 */
clib_error_t *
dpdk_init (vlib_main_t * vm)
{
  dpdk_main_t * dm = &dpdk_main;
  vlib_node_t * ei;
  clib_error_t * error = 0;
  vlib_thread_main_t * tm = vlib_get_thread_main();

  /* verify that structs are cacheline aligned */
  ASSERT(offsetof(dpdk_device_t, cacheline0) == 0);
  ASSERT(offsetof(dpdk_device_t, cacheline1) == CLIB_CACHE_LINE_BYTES);
  ASSERT(offsetof(dpdk_worker_t, cacheline0) == 0);
  ASSERT(offsetof(frame_queue_trace_t, cacheline0) == 0);

  /* Add references to DPDK Driver Constructor functions to get the dynamic
   * loader to pull in the driver library & run the constructors.
   */
#define _(d)                                          \
do {                                                  \
  void devinitfn_ ##d(void);                          \
  __attribute__((unused)) void (* volatile pf)(void); \
  pf = devinitfn_ ##d;                                \
} while(0);

#ifdef RTE_LIBRTE_EM_PMD
  _(em_pmd_drv)
#endif

#ifdef RTE_LIBRTE_IGB_PMD
  _(pmd_igb_drv)
#endif

#ifdef RTE_LIBRTE_IXGBE_PMD
  _(rte_ixgbe_driver)
#endif

#ifdef RTE_LIBRTE_I40E_PMD
  _(rte_i40e_driver)
  _(rte_i40evf_driver)
#endif

#ifdef RTE_LIBRTE_FM10K_PMD
  _(rte_fm10k_driver)
#endif

#ifdef RTE_LIBRTE_VIRTIO_PMD
  _(rte_virtio_driver)
#endif

#ifdef RTE_LIBRTE_VMXNET3_PMD
  _(rte_vmxnet3_driver)
#endif

#ifdef RTE_LIBRTE_VICE_PMD
  _(rte_vice_driver)
#endif

#ifdef RTE_LIBRTE_ENIC_PMD
  _(rte_enic_driver)
#endif

#ifdef RTE_LIBRTE_PMD_AF_PACKET
  _(pmd_af_packet_drv)
#endif

#undef _

  dm->vlib_main = vm;
  dm->vnet_main = vnet_get_main();

  ei = vlib_get_node_by_name (vm, (u8 *) "ethernet-input");
  if (ei == 0)
      return clib_error_return (0, "ethernet-input node AWOL");

  dm->ethernet_input_node_index = ei->index;

  dm->nchannels = 4;
  dm->num_mbufs = dm->num_mbufs ? dm->num_mbufs : NB_MBUF;
  vec_add1 (dm->eal_init_args, (u8 *) "vnet");

  dm->dpdk_device_by_kni_port_id = hash_create (0, sizeof (uword));
  dm->vu_sw_if_index_by_listener_fd = hash_create (0, sizeof (uword));
  dm->vu_sw_if_index_by_sock_fd = hash_create (0, sizeof (uword));

  /* $$$ use n_thread_stacks since it's known-good at this point */
  vec_validate (dm->recycle, tm->n_thread_stacks - 1);

  /* initialize EFD (early fast discard) default settings */
  dm->efd.enabled = DPDK_EFD_DISABLED;
  dm->efd.queue_hi_thresh = ((DPDK_EFD_DEFAULT_DEVICE_QUEUE_HI_THRESH_PCT *
                              DPDK_NB_RX_DESC_10GE)/100);
  dm->efd.consec_full_frames_hi_thresh =
      DPDK_EFD_DEFAULT_CONSEC_FULL_FRAMES_HI_THRESH;

  /* vhost-user coalescence frames defaults */
  dm->vhost_coalesce_frames = 32;
  dm->vhost_coalesce_time = 1e-3;

  /* init CLI */
  if ((error = vlib_call_init_function (vm, dpdk_cli_init)))
    return error;

  return error;
}

VLIB_INIT_FUNCTION (dpdk_init);

diff --git a/vnet/vnet/devices/dpdk/node.c b/vnet/vnet/devices/dpdk/node.c
new file mode 100644
index 00000000000..fde0eb23e14
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/node.c
@@ -0,0 +1,2010 @@
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <vnet/vnet.h>
#include <vppinfra/vec.h>
#include <vppinfra/error.h>
#include <vppinfra/format.h>
#include <vppinfra/xxhash.h>

#include <vnet/ethernet/ethernet.h>
#include <vnet/devices/dpdk/dpdk.h>
#include <vnet/classify/vnet_classify.h>
#include <vnet/mpls-gre/packet.h>

#include "dpdk_priv.h"

/* NOTE(review): classic double-evaluation macros — do not pass arguments
 * with side effects. */
#ifndef MAX
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#endif

#ifndef MIN
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif

/*
 * At least in certain versions of ESXi, vmware e1000's don't honor the
 * "strip rx CRC" bit. Set this flag to work around that bug FOR UNIT TEST ONLY.
 *
 * If wireshark complains like so:
 *
 * "Frame check sequence: 0x00000000 [incorrect, should be <hex-num>]"
 * and you're using ESXi emulated e1000's, set this flag FOR UNIT TEST ONLY.
 *
 * Note: do NOT check in this file with this workaround enabled! You'll lose
 * actual data from e.g. 10xGE interfaces. The extra 4 bytes annoy
 * wireshark, but they're harmless...
 */
#define VMWARE_LENGTH_BUG_WORKAROUND 0

/* Per-module state for the handoff-dispatch node. */
typedef struct {
  u32 cached_next_index;

  /* convenience variables */
  vlib_main_t * vlib_main;
  vnet_main_t * vnet_main;
} handoff_dispatch_main_t;

/* Trace record: which buffer went to which next node, from which sw_if. */
typedef struct {
  u32 buffer_index;
  u32 next_index;
  u32 sw_if_index;
} handoff_dispatch_trace_t;

/* packet trace format function */
static u8 * format_handoff_dispatch_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  handoff_dispatch_trace_t * t = va_arg (*args, handoff_dispatch_trace_t *);

  s = format (s, "HANDOFF_DISPATCH: sw_if_index %d next_index %d buffer 0x%x",
      t->sw_if_index,
      t->next_index,
      t->buffer_index);
  return s;
}

handoff_dispatch_main_t handoff_dispatch_main;

vlib_node_registration_t handoff_dispatch_node;

#define foreach_handoff_dispatch_error \
_(EXAMPLE, "example packets")

typedef enum {
#define _(sym,str) HANDOFF_DISPATCH_ERROR_##sym,
  foreach_handoff_dispatch_error
#undef _
  HANDOFF_DISPATCH_N_ERROR,
} handoff_dispatch_error_t;

static char * handoff_dispatch_error_strings[] = {
#define _(sym,string) string,
  foreach_handoff_dispatch_error
#undef _
};

/* Publish a filled handoff-queue element: the memory barrier ensures the
 * element contents are visible to the consumer before valid is set. */
static inline
void vlib_put_handoff_queue_elt (vlib_frame_queue_elt_t * hf)
{
  CLIB_MEMORY_BARRIER();
  hf->valid = 1;
}

/*
 * Worker-side dispatch of handed-off buffers: each buffer carries its
 * precomputed next index in vnet_buffer()->io_handoff.next_index, set by
 * the io thread; this node simply enqueues to that next node (dual-loop
 * with prefetch, then single-loop cleanup).
 */
static uword
handoff_dispatch_node_fn (vlib_main_t * vm,
                          vlib_node_runtime_t * node,
                          vlib_frame_t * frame)
{
  u32 n_left_from, * from, * to_next;
  dpdk_rx_next_t next_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index,
                           to_next, n_left_to_next);

      while (n_left_from >= 4 && n_left_to_next >= 2)
        {
          u32 bi0, bi1;
          vlib_buffer_t * b0, * b1;
          u32 next0, next1;
          u32 sw_if_index0, sw_if_index1;

          /* Prefetch next iteration. */
          {
            vlib_buffer_t * p2, * p3;

            p2 = vlib_get_buffer (vm, from[2]);
            p3 = vlib_get_buffer (vm, from[3]);

            vlib_prefetch_buffer_header (p2, LOAD);
            vlib_prefetch_buffer_header (p3, LOAD);
          }

          /* speculatively enqueue b0 and b1 to the current next frame */
          to_next[0] = bi0 = from[0];
          to_next[1] = bi1 = from[1];
          from += 2;
          to_next += 2;
          n_left_from -= 2;
          n_left_to_next -= 2;

          b0 = vlib_get_buffer (vm, bi0);
          b1 = vlib_get_buffer (vm, bi1);

          next0 = vnet_buffer(b0)->io_handoff.next_index;
          next1 = vnet_buffer(b1)->io_handoff.next_index;

          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
              handoff_dispatch_trace_t *t =
                vlib_add_trace (vm, node, b0, sizeof (*t));
              sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
              t->sw_if_index = sw_if_index0;
              t->next_index = next0;
              t->buffer_index = bi0;
            }
          if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
            {
              vlib_trace_buffer (vm, node, next1, b1, /* follow_chain */ 0);
              handoff_dispatch_trace_t *t =
                vlib_add_trace (vm, node, b1, sizeof (*t));
              sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
              t->sw_if_index = sw_if_index1;
              t->next_index = next1;
              t->buffer_index = bi1;
            }

          /* verify speculative enqueues, maybe switch current next frame */
          vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           bi0, bi1, next0, next1);
        }

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t * b0;
          u32 next0;
          u32 sw_if_index0;

          /* speculatively enqueue b0 to the current next frame */
          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          next0 = vnet_buffer(b0)->io_handoff.next_index;

          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
              handoff_dispatch_trace_t *t =
                vlib_add_trace (vm, node, b0, sizeof (*t));
              sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
              t->sw_if_index = sw_if_index0;
              t->next_index = next0;
              t->buffer_index = bi0;
            }

          /* verify speculative enqueue, maybe switch current next frame */
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

VLIB_REGISTER_NODE (handoff_dispatch_node) = {
  .function = handoff_dispatch_node_fn,
  .name = "handoff-dispatch",
  .vector_size = sizeof (u32),
  .format_trace = format_handoff_dispatch_trace,
  .type = VLIB_NODE_TYPE_INTERNAL,
  .flags = VLIB_NODE_FLAG_IS_HANDOFF,

  .n_errors = ARRAY_LEN(handoff_dispatch_error_strings),
  .error_strings = handoff_dispatch_error_strings,

  .n_next_nodes = DPDK_RX_N_NEXT,

  .next_nodes = {
        [DPDK_RX_NEXT_DROP] = "error-drop",
        [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
        [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input",
        [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
        [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
  },
};

clib_error_t *handoff_dispatch_init (vlib_main_t *vm)
{
  handoff_dispatch_main_t * mp = &handoff_dispatch_main;

  mp->vlib_main = vm;
  mp->vnet_main = &vnet_main;

  return 0;
}

VLIB_INIT_FUNCTION (handoff_dispatch_init);

u32 dpdk_get_handoff_node_index (void)
{
  return handoff_dispatch_node.index;
}

static char * dpdk_error_strings[] = {
#define _(n,s) s,
    foreach_dpdk_error
#undef _
};

/* RX DMA trace record: snapshot of the mbuf header and vlib buffer,
 * with packet bytes copied into buffer.pre_data. */
typedef struct {
  u32 buffer_index;
  u16 device_index;
  u16 queue_index;
  struct rte_mbuf mb;
  vlib_buffer_t buffer; /* Copy of VLIB buffer; pkt data stored in pre_data.
 */
} dpdk_rx_dma_trace_t;

static u8 * format_dpdk_rx_dma_trace (u8 * s, va_list * va)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
  CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main();
  dpdk_rx_dma_trace_t * t = va_arg (*va, dpdk_rx_dma_trace_t *);
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd = vec_elt_at_index (dm->devices, t->device_index);
  format_function_t * f;
  uword indent = format_get_indent (s);
  vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);

  s = format (s, "%U rx queue %d",
              format_vnet_sw_interface_name, vnm, sw,
              t->queue_index);

  s = format (s, "\n%Ubuffer 0x%x: %U",
              format_white_space, indent,
              t->buffer_index,
              format_vlib_buffer, &t->buffer);

#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
  s = format (s, "\n%U%U",
              format_white_space, indent,
              format_dpdk_rx_rte_mbuf, &t->mb);
#else
  s = format (s, "\n%U%U",
              format_white_space, indent,
              format_dpdk_rte_mbuf, &t->mb);
#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
  f = node->format_buffer;
  if (!f)
    f = format_hex_bytes;
  s = format (s, "\n%U%U", format_white_space, indent,
              f, t->buffer.pre_data, sizeof (t->buffer.pre_data));

  return s;
}

/*
 * Map a single mbuf's ol_flags (and, on DPDK >= 2.1, packet_type) to a
 * next-node index and a dpdk error code: flagged rx/checksum errors go
 * to DPDK_RX_NEXT_DROP; otherwise classify as IP4 / IP6 / MPLS /
 * ethernet, honoring xd->per_interface_next_index if set.
 */
always_inline void
dpdk_rx_next_and_error_from_mb_flags_x1 (dpdk_device_t *xd, struct rte_mbuf *mb,
                                         vlib_buffer_t *b0,
                                         u8 * next0, u8 * error0)
{
  u8 is0_ip4, is0_ip6, is0_mpls, n0;
  uint16_t mb_flags = mb->ol_flags;

  if (PREDICT_FALSE(mb_flags & (
#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
       PKT_EXT_RX_PKT_ERROR | PKT_EXT_RX_BAD_FCS |
#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
        PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
    )))
    {
      /* some error was flagged. determine the drop reason */
      n0 = DPDK_RX_NEXT_DROP;
      *error0 =
#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
        (mb_flags & PKT_EXT_RX_PKT_ERROR) ? DPDK_ERROR_RX_PACKET_ERROR :
        (mb_flags & PKT_EXT_RX_BAD_FCS) ? DPDK_ERROR_RX_BAD_FCS :
#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
        (mb_flags & PKT_RX_IP_CKSUM_BAD) ? DPDK_ERROR_IP_CHECKSUM_ERROR :
        (mb_flags & PKT_RX_L4_CKSUM_BAD) ? DPDK_ERROR_L4_CHECKSUM_ERROR :
        DPDK_ERROR_NONE;
    }
  else
    {
      *error0 = DPDK_ERROR_NONE;
      if (xd->per_interface_next_index != ~0)
        n0 = xd->per_interface_next_index;
      else if (mb_flags & PKT_RX_VLAN_PKT)
        n0 = DPDK_RX_NEXT_ETHERNET_INPUT;
      else
        {
          n0 = DPDK_RX_NEXT_ETHERNET_INPUT;
#if RTE_VERSION >= RTE_VERSION_NUM(2, 1, 0, 0)
          is0_ip4 = (mb->packet_type & (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L3_IPV4_EXT)) != 0;
#else
          is0_ip4 = (mb_flags & (PKT_RX_IPV4_HDR | PKT_RX_IPV4_HDR_EXT)) != 0;
#endif

          if (PREDICT_TRUE(is0_ip4))
            n0 = DPDK_RX_NEXT_IP4_INPUT;
          else
            {
#if RTE_VERSION >= RTE_VERSION_NUM(2, 1, 0, 0)
              is0_ip6 =
                      (mb->packet_type & (RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L3_IPV6_EXT)) != 0;
#else
              is0_ip6 =
                      (mb_flags & (PKT_RX_IPV6_HDR | PKT_RX_IPV6_HDR_EXT)) != 0;
#endif
              if (PREDICT_TRUE(is0_ip6))
                n0 = DPDK_RX_NEXT_IP6_INPUT;
              else
                {
                  /* fall back to peeking at the ethertype for MPLS */
                  ethernet_header_t *h0 = (ethernet_header_t *) b0->data;
                  is0_mpls = (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST));
                  n0 = is0_mpls ? DPDK_RX_NEXT_MPLS_INPUT : n0;
                }
            }
        }
    }
  *next0 = n0;
}

/* Record rx traces for the given buffer indices (already received from
 * queue_id on device xd). */
void dpdk_rx_trace (dpdk_main_t * dm,
                    vlib_node_runtime_t * node,
                    dpdk_device_t * xd,
                    u16 queue_id,
                    u32 * buffers,
                    uword n_buffers)
{
  vlib_main_t * vm = vlib_get_main();
  u32 * b, n_left;
  u8 next0;

  n_left = n_buffers;
  b = buffers;

  while (n_left >= 1)
    {
      u32 bi0;
      vlib_buffer_t * b0;
      dpdk_rx_dma_trace_t * t0;
      struct rte_mbuf *mb;
      u8 error0;

      bi0 = b[0];
      n_left -= 1;

      b0 = vlib_get_buffer (vm, bi0);
      /* the rte_mbuf header immediately precedes the vlib_buffer_t */
      mb = ((struct rte_mbuf *)b0) - 1;
      dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
                                               &next0, &error0);
      vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
      t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
      t0->queue_index = queue_id;
      t0->device_index = xd->device_index;
      t0->buffer_index = bi0;

      memcpy (&t0->mb, mb, sizeof (t0->mb));
      memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
      memcpy (t0->buffer.pre_data, b0->data, sizeof (t0->buffer.pre_data));

#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
      /*
       * Clear overloaded TX offload flags when a DPDK driver
       * is using them for RX flags (e.g.
Cisco VIC Ethernet driver)
       */
      mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */

      b += 1;
    }
}

/*
 * dpdk_efd_update_counters()
 * Update EFD (early-fast-discard) counters
 */
void dpdk_efd_update_counters (dpdk_device_t *xd,
                               u32 n_buffers,
                               u16 enabled)
{
  if (enabled & DPDK_EFD_MONITOR_ENABLED)
    {
      u64 now = clib_cpu_time_now();
      if (xd->efd_agent.last_poll_time > 0)
        {
          u64 elapsed_time = (now - xd->efd_agent.last_poll_time);
          if (elapsed_time > xd->efd_agent.max_poll_delay)
            xd->efd_agent.max_poll_delay = elapsed_time;
        }
      xd->efd_agent.last_poll_time = now;
    }

  xd->efd_agent.total_packet_cnt += n_buffers;
  xd->efd_agent.last_burst_sz = n_buffers;

  if (n_buffers > xd->efd_agent.max_burst_sz)
    xd->efd_agent.max_burst_sz = n_buffers;

  /* a full frame is treated as a congestion signal; consecutive full
   * frames drive the consec_full_frames_hi_thresh trigger */
  if (PREDICT_FALSE(n_buffers == VLIB_FRAME_SIZE))
    {
      xd->efd_agent.full_frames_cnt++;
      xd->efd_agent.consec_full_frames_cnt++;
    }
  else
    {
      xd->efd_agent.consec_full_frames_cnt = 0;
    }
}

/* is_efd_discardable()
 * returns non zero DPDK error if packet meets early-fast-discard criteria,
 * zero otherwise
 */
u32 is_efd_discardable (vlib_thread_main_t *tm,
                        vlib_buffer_t * b0,
                        struct rte_mbuf *mb)
{
  ethernet_header_t *eh = (ethernet_header_t *) b0->data;

  if (eh->type == clib_host_to_net_u16(ETHERNET_TYPE_IP4))
    {
      ip4_header_t *ipv4 =
          (ip4_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
      u8 pkt_prec = (ipv4->tos >> 5);

      return (tm->efd.ip_prec_bitmap & (1 << pkt_prec) ?
                  DPDK_ERROR_IPV4_EFD_DROP_PKTS : DPDK_ERROR_NONE);
    }
  else if (eh->type == clib_net_to_host_u16(ETHERNET_TYPE_IP6))
    {
      ip6_header_t *ipv6 =
          (ip6_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
      /* NOTE(review): ip_version_traffic_class_and_flow_label is shifted
       * without a byte-order conversion here, unlike the u16/u32 compares
       * elsewhere in this function — confirm this extracts the traffic
       * class correctly on little-endian hosts. */
      u8 pkt_tclass =
          ((ipv6->ip_version_traffic_class_and_flow_label >> 20) & 0xff);

      return (tm->efd.ip_prec_bitmap & (1 << pkt_tclass) ?
                  DPDK_ERROR_IPV6_EFD_DROP_PKTS : DPDK_ERROR_NONE);
    }
  else if (eh->type == clib_net_to_host_u16(ETHERNET_TYPE_MPLS_UNICAST))
    {
      mpls_unicast_header_t *mpls =
          (mpls_unicast_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
      u8 pkt_exp = ((mpls->label_exp_s_ttl >> 9) & 0x07);

      return (tm->efd.mpls_exp_bitmap & (1 << pkt_exp) ?
                  DPDK_ERROR_MPLS_EFD_DROP_PKTS : DPDK_ERROR_NONE);
    }
  else if ((eh->type == clib_net_to_host_u16(ETHERNET_TYPE_VLAN)) ||
           (eh->type == clib_net_to_host_u16(ETHERNET_TYPE_DOT1AD)))
    {
      ethernet_vlan_header_t *vlan =
          (ethernet_vlan_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
      u8 pkt_cos = ((vlan->priority_cfi_and_id >> 13) & 0x07);

      return (tm->efd.vlan_cos_bitmap & (1 << pkt_cos) ?
                  DPDK_ERROR_VLAN_EFD_DROP_PKTS : DPDK_ERROR_NONE);
    }

  return DPDK_ERROR_NONE;
}

/*
 * This function is used when there are no worker threads.
 * The main thread performs IO and forwards the packets.
 */
static inline u32 dpdk_device_input ( dpdk_main_t * dm,
                                      dpdk_device_t * xd,
                                      vlib_node_runtime_t * node,
                                      u32 cpu_index,
                                      u16 queue_id)
{
  u32 n_buffers;
  u32 next_index = DPDK_RX_NEXT_ETHERNET_INPUT;
  u32 n_left_to_next, * to_next;
  u32 mb_index;
  vlib_main_t * vm = vlib_get_main();
  uword n_rx_bytes = 0;
  u32 n_trace, trace_cnt __attribute__((unused));
  vlib_buffer_free_list_t * fl;
  u8 efd_discard_burst = 0;

  if (xd->admin_up == 0)
    return 0;

  n_buffers = dpdk_rx_burst(dm, xd, queue_id);

  if (n_buffers == 0)
    {
      /* check if EFD (dpdk) is enabled */
      if (PREDICT_FALSE(dm->efd.enabled))
        {
          /* reset a few stats */
          xd->efd_agent.last_poll_time = 0;
          xd->efd_agent.last_burst_sz = 0;
        }
      return 0;
    }

  vec_reset_length (xd->d_trace_buffers);
  trace_cnt = n_trace = vlib_get_trace_count (vm, node);

  fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);

  /*
   * DAW-FIXME: VMXNET3 device stop/start doesn't work,
   *            therefore fake the stop in the dpdk driver by
   *            silently dropping all of the incoming pkts instead of
   *            stopping the driver / hardware.
   */
  /* NOTE(review): admin_up was already checked on entry; this re-check can
   * only fire if admin_up changed between the two reads — see the
   * DAW-FIXME above for why the drop-all path exists. */
  if (PREDICT_FALSE(xd->admin_up != 1))
    {
      for (mb_index = 0; mb_index < n_buffers; mb_index++)
        rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);

      return 0;
    }

  /* Check for congestion if EFD (Early-Fast-Discard) is enabled
   * in any mode (e.g. dpdk, monitor, or drop_all)
   */
  if (PREDICT_FALSE(dm->efd.enabled))
    {
      /* update EFD counters */
      dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);

      if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED))
        {
          /* discard all received packets */
          for (mb_index = 0; mb_index < n_buffers; mb_index++)
            rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);

          xd->efd_agent.discard_cnt += n_buffers;
          increment_efd_drop_counter(vm,
                                     DPDK_ERROR_VLAN_EFD_DROP_PKTS,
                                     n_buffers);

          return 0;
        }

      if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
                        dm->efd.consec_full_frames_hi_thresh))
        {
          u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
                                                       queue_id);
          if (device_queue_sz >= dm->efd.queue_hi_thresh)
            {
              /* dpdk device queue has reached the critical threshold */
              xd->efd_agent.congestion_cnt++;

              /* apply EFD to packets from the burst */
              efd_discard_burst = 1;
            }
        }
    }

  mb_index = 0;

  while (n_buffers > 0)
    {
      u32 bi0;
      u8 next0, error0;
      u32 l3_offset0;
      vlib_buffer_t * b0, * b_seg, * b_chain = 0;
      u32 cntr_type;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_buffers > 0 && n_left_to_next > 0)
        {
          u8 nb_seg = 1;
          struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
          struct rte_mbuf *mb_seg = mb->next;

          if (PREDICT_TRUE(n_buffers > 2))
          {
              struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
              vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
              CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, STORE);
              CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
          }

          ASSERT(mb);

          b0 = (vlib_buffer_t *)(mb+1);

          /* check whether EFD is looking for packets to discard */
          if (PREDICT_FALSE(efd_discard_burst))
            {
              vlib_thread_main_t * tm = vlib_get_thread_main();

              /* deliberate assignment-in-condition: cntr_type doubles as
               * the drop-counter id when non-zero */
              if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
                {
                  rte_pktmbuf_free(mb);
                  xd->efd_agent.discard_cnt++;
                  increment_efd_drop_counter(vm,
                                             cntr_type,
                                             1);
                  n_buffers--;
                  mb_index++;
                  continue;
                }
            }

          /* Prefetch one next segment if it exists. */
          if (PREDICT_FALSE(mb->nb_segs > 1))
            {
              struct rte_mbuf *pfmb = mb->next;
              vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
              CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
              CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
              b_chain = b0;
            }

          vlib_buffer_init_for_free_list (b0, fl);
          b0->clone_count = 0;

          bi0 = vlib_get_buffer_index (vm, b0);

          to_next[0] = bi0;
          to_next++;
          n_left_to_next--;

          dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
                                                   &next0, &error0);
#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
          /*
           * Clear overloaded TX offload flags when a DPDK driver
           * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
           */

          if (PREDICT_TRUE(trace_cnt == 0))
            mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
          else
            trace_cnt--;
#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */

          b0->error = node->errors[error0];

          l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
                         next0 == DPDK_RX_NEXT_IP6_INPUT ||
                         next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
                        sizeof (ethernet_header_t) : 0);

          b0->current_data = l3_offset0;
          b0->current_length = mb->data_len - l3_offset0;
          /* assignment (not |=): clears any flags set before this point */
          b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;

          if (VMWARE_LENGTH_BUG_WORKAROUND)
              b0->current_length -= 4;

          vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
          vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
          n_rx_bytes += mb->pkt_len;

          /* Process subsequent segments of multi-segment packets */
          while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
            {
              ASSERT(mb_seg != 0);

              b_seg = (vlib_buffer_t *)(mb_seg+1);
              vlib_buffer_init_for_free_list (b_seg, fl);
              b_seg->clone_count = 0;

              ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
              ASSERT(b_seg->current_data == 0);

              /*
               * The driver (e.g. virtio) may not put the packet data at the start
               * of the segment, so don't assume b_seg->current_data == 0 is correct.
               */
              b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;

              b_seg->current_length = mb_seg->data_len;
              b0->total_length_not_including_first_buffer +=
                mb_seg->data_len;

              b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
              b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);

              b_chain = b_seg;
              mb_seg = mb_seg->next;
              nb_seg++;
            }

          /*
           * Turn this on if you run into
           * "bad monkey" contexts, and you want to know exactly
           * which nodes they've visited... See main.c...
           */
          VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);

          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           bi0, next0);
          if (PREDICT_FALSE (n_trace > mb_index))
            vec_add1 (xd->d_trace_buffers, bi0);
          n_buffers--;
          mb_index++;
        }
      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
    {
      dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers,
                     vec_len (xd->d_trace_buffers));
      vlib_set_trace_count (vm, node, n_trace - vec_len (xd->d_trace_buffers));
    }

  vlib_increment_combined_counter
    (vnet_get_main()->interface_main.combined_sw_if_counters
     + VNET_INTERFACE_COUNTER_RX,
     cpu_index,
     xd->vlib_sw_if_index,
     mb_index, n_rx_bytes);

  dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
  dw->aggregate_rx_packets += mb_index;

  return mb_index;
}

#if VIRL > 0
#define VIRL_SPEED_LIMIT()                         \
  /* Limit the input rate to 1000 vectors / sec */ \
  {                                                \
    struct timespec ts, tsrem;                     \
                                                   \
    ts.tv_sec = 0;                                 \
    ts.tv_nsec = 1000*1000; /* 1ms */              \
                                                   \
    while (nanosleep(&ts, &tsrem) < 0)             \
      {                                            \
        ts = tsrem;                                \
      }                                            \
  }
#else
#define VIRL_SPEED_LIMIT()
#endif


/* Input node function for the single-queue (no RSS) case: polls queue 0
 * of every device assigned to this cpu. */
static uword
dpdk_input (vlib_main_t * vm,
            vlib_node_runtime_t * node,
            vlib_frame_t * f)
{
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd;
  uword n_rx_packets = 0;
  dpdk_device_and_queue_t * dq;
  u32 cpu_index = os_get_cpu_number();

  /*
   * Poll all devices on this cpu for input/interrupts.
   */
  vec_foreach (dq, dm->devices_by_cpu[cpu_index])
    {
      xd = vec_elt_at_index(dm->devices, dq->device);
      ASSERT(dq->queue_id == 0);
      n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, 0);
    }

  VIRL_SPEED_LIMIT()

  return n_rx_packets;
}

/* RSS variant: same as dpdk_input but polls the per-assignment queue id
 * instead of asserting queue 0. */
uword
dpdk_input_rss (vlib_main_t * vm,
      vlib_node_runtime_t * node,
      vlib_frame_t * f)
{
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd;
  uword n_rx_packets = 0;
  dpdk_device_and_queue_t * dq;
  u32 cpu_index = os_get_cpu_number();

  /*
   * Poll all devices on this cpu for input/interrupts.
   */
  vec_foreach (dq, dm->devices_by_cpu[cpu_index])
    {
      xd = vec_elt_at_index(dm->devices, dq->device);
      n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id);
    }

  VIRL_SPEED_LIMIT()

  return n_rx_packets;
}

VLIB_REGISTER_NODE (dpdk_input_node) = {
  .function = dpdk_input,
  .type = VLIB_NODE_TYPE_INPUT,
  .name = "dpdk-input",

  /* Will be enabled if/when hardware is detected. */
  .state = VLIB_NODE_STATE_DISABLED,

  .format_buffer = format_ethernet_header_with_length,
  .format_trace = format_dpdk_rx_dma_trace,

  .n_errors = DPDK_N_ERROR,
  .error_strings = dpdk_error_strings,

  .n_next_nodes = DPDK_RX_N_NEXT,
  .next_nodes = {
    [DPDK_RX_NEXT_DROP] = "error-drop",
    [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
    [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
    [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
    [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
  },
};

/*
 * Override the next nodes for the dpdk input nodes.
 * Must be invoked prior to VLIB_INIT_FUNCTION calls.
 */
void dpdk_set_next_node (dpdk_rx_next_t next, char *name)
{
  vlib_node_registration_t *r = &dpdk_input_node;
  vlib_node_registration_t *r_io = &dpdk_io_input_node;
  vlib_node_registration_t *r_handoff = &handoff_dispatch_node;

  switch (next)
    {
    case DPDK_RX_NEXT_IP4_INPUT:
    case DPDK_RX_NEXT_IP6_INPUT:
    case DPDK_RX_NEXT_MPLS_INPUT:
    case DPDK_RX_NEXT_ETHERNET_INPUT:
      r->next_nodes[next] = name;
      r_io->next_nodes[next] = name;
      r_handoff->next_nodes[next] = name;
      break;

    default:
      clib_warning ("%s: illegal next %d\n", __FUNCTION__, next);
      break;
    }
}

/* Reserve the next element in a worker's handoff frame queue, waiting for
 * ring space (and for the consumer to mark the slot invalid) if needed. */
inline vlib_frame_queue_elt_t *
vlib_get_handoff_queue_elt (u32 vlib_worker_index)
{
  vlib_frame_queue_t *fq;
  vlib_frame_queue_elt_t *elt;
  u64 new_tail;

  fq = vlib_frame_queues[vlib_worker_index];
  ASSERT (fq);

  new_tail = __sync_add_and_fetch (&fq->tail, 1);

  /* Wait until a ring slot is available */
  while (new_tail >= fq->head_hint + fq->nelts)
    vlib_worker_thread_barrier_check ();

  elt = fq->elts + (new_tail & (fq->nelts-1));

  /* this would be very bad...
   */
  /* NOTE(review): unbounded busy-wait with no pause/barrier — relies on
   * the consumer eventually clearing elt->valid. */
  while (elt->valid)
    ;

  elt->msg_type = VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME;
  elt->last_n_vectors = elt->n_vectors = 0;

  return elt;
}

/* Return the cached in-progress element for a worker, or reserve a new
 * one and cache it in handoff_queue_elt_by_worker_index. */
inline vlib_frame_queue_elt_t *
dpdk_get_handoff_queue_elt (
    u32 vlib_worker_index,
    vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index)
{
  vlib_frame_queue_elt_t *elt;

  if (handoff_queue_elt_by_worker_index [vlib_worker_index])
      return handoff_queue_elt_by_worker_index [vlib_worker_index];

  elt = vlib_get_handoff_queue_elt (vlib_worker_index);

  handoff_queue_elt_by_worker_index [vlib_worker_index] = elt;

  return elt;
}

static inline vlib_frame_queue_t *
is_vlib_handoff_queue_congested (
    u32 vlib_worker_index,
    u32 queue_hi_thresh,
    vlib_frame_queue_t ** handoff_queue_by_worker_index)
{
  vlib_frame_queue_t *fq;

  fq = handoff_queue_by_worker_index [vlib_worker_index];
  if (fq != (vlib_frame_queue_t *)(~0))
      return fq;

  fq = vlib_frame_queues[vlib_worker_index];
  ASSERT (fq);

  if (PREDICT_FALSE(fq->tail >= (fq->head_hint + queue_hi_thresh))) {
    /* a valid entry in the array will indicate the queue has reached
     * the specified threshold and is congested
     */
    handoff_queue_by_worker_index [vlib_worker_index] = fq;
    fq->enqueue_full_events++;
    return fq;
  }

  return NULL;
}

/* Flow hash over the IPv4 address pair and protocol. */
static inline u64 ipv4_get_key (ip4_header_t *ip)
{
   u64 hash_key;

   /* NOTE(review): u64 read through a cast of the address pair — relies on
    * alignment/aliasing behavior of the surrounding buffer layout. */
   hash_key = *((u64*)(&ip->address_pair)) ^ ip->protocol;

   return hash_key;
}

/* Flow hash over the IPv6 address pair and protocol. */
static inline u64 ipv6_get_key (ip6_header_t *ip)
{
   u64 hash_key;

   hash_key = ip->src_address.as_u64[0] ^
              ip->src_address.as_u64[1] ^
              ip->dst_address.as_u64[0] ^
              ip->dst_address.as_u64[1] ^
              ip->protocol;

   return hash_key;
}


#define MPLS_BOTTOM_OF_STACK_BIT_MASK   0x00000100U
#define MPLS_LABEL_MASK                 0xFFFFF000U

/* Flow hash for MPLS: walk up to five labels looking for bottom-of-stack,
 * then hash the encapsulated IP header (or fall back to a label). */
static inline u64 mpls_get_key (mpls_unicast_header_t *m)
{
   u64 hash_key;
   u8 ip_ver;


   /* find the bottom of the MPLS label stack.
*/ + if (PREDICT_TRUE(m->label_exp_s_ttl & + clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK))) { + goto bottom_lbl_found; + } + m++; + + if (PREDICT_TRUE(m->label_exp_s_ttl & + clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK))) { + goto bottom_lbl_found; + } + m++; + + if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) { + goto bottom_lbl_found; + } + m++; + + if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) { + goto bottom_lbl_found; + } + m++; + + if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) { + goto bottom_lbl_found; + } + + /* the bottom label was not found - use the last label */ + hash_key = m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_LABEL_MASK); + + return hash_key; + + +bottom_lbl_found: + m++; + ip_ver = (*((u8 *)m) >> 4); + + /* find out if it is IPV4 or IPV6 header */ + if (PREDICT_TRUE(ip_ver == 4)) { + hash_key = ipv4_get_key((ip4_header_t *)m); + } else if (PREDICT_TRUE(ip_ver == 6)) { + hash_key = ipv6_get_key((ip6_header_t *)m); + } else { + /* use the bottom label */ + hash_key = (m-1)->label_exp_s_ttl & clib_net_to_host_u32(MPLS_LABEL_MASK); + } + + return hash_key; + +} + +static inline u64 eth_get_key (ethernet_header_t *h0) +{ + u64 hash_key; + + + if (PREDICT_TRUE(h0->type) == clib_host_to_net_u16(ETHERNET_TYPE_IP4)) { + hash_key = ipv4_get_key((ip4_header_t *)(h0+1)); + } else if (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_IP6)) { + hash_key = ipv6_get_key((ip6_header_t *)(h0+1)); + } else if (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST)) { + hash_key = mpls_get_key((mpls_unicast_header_t *)(h0+1)); + } else if ((h0->type == clib_host_to_net_u16(ETHERNET_TYPE_VLAN)) || + (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_DOT1AD))) { + ethernet_vlan_header_t * outer = (ethernet_vlan_header_t *)(h0 + 1); + + outer = (outer->type == clib_host_to_net_u16(ETHERNET_TYPE_VLAN)) ? 
+ outer+1 : outer; + if (PREDICT_TRUE(outer->type) == clib_host_to_net_u16(ETHERNET_TYPE_IP4)) { + hash_key = ipv4_get_key((ip4_header_t *)(outer+1)); + } else if (outer->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP6)) { + hash_key = ipv6_get_key((ip6_header_t *)(outer+1)); + } else if (outer->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST)) { + hash_key = mpls_get_key((mpls_unicast_header_t *)(outer+1)); + } else { + hash_key = outer->type; + } + } else { + hash_key = 0; + } + + return hash_key; +} + +/* + * This function is used when dedicated IO threads feed the worker threads. + * + * Devices are allocated to this thread based on instances and instance_id. + * If instances==0 then the function automatically determines the number + * of instances of this thread, and allocates devices between them. + * If instances != 0, then instance_id must be in the range 0..instances-1. + * The function allocates devices among the specified number of instances, + * with this thread having the given instance id. This option is used for + * splitting devices among differently named "io"-type threads. 
 */
void dpdk_io_thread (vlib_worker_thread_t * w,
                     u32 instances,
                     u32 instance_id,
                     char *worker_name,
                     dpdk_io_thread_callback_t callback)
{
  vlib_main_t * vm = vlib_get_main();
  vlib_thread_main_t * tm = vlib_get_thread_main();
  vlib_thread_registration_t * tr;
  dpdk_main_t * dm = &dpdk_main;
  char *io_name = w->registration->name;
  dpdk_device_t * xd;
  dpdk_device_t ** my_devices = 0;
  vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index = 0;
  vlib_frame_queue_t ** congested_handoff_queue_by_worker_index = 0;
  vlib_frame_queue_elt_t * hf = 0;
  int i;
  u32 n_left_to_next_worker = 0, * to_next_worker = 0;
  u32 next_worker_index = 0;
  u32 current_worker_index = ~0;
  u32 cpu_index = os_get_cpu_number();
  u32 num_workers = 0;
  u32 num_devices = 0;
  uword * p;
  u16 queue_id = 0;
  vlib_node_runtime_t * node_trace;
  u32 first_worker_index = 0;

  /* Wait until the dpdk init sequence is complete */
  while (dm->io_thread_release == 0)
    vlib_worker_thread_barrier_check();

  clib_time_init (&vm->clib_time);

  /* Look up the worker-thread registration to learn how many workers
   * exist and where their frame queues start. */
  p = hash_get_mem (tm->thread_registrations_by_name, worker_name);
  ASSERT (p);
  tr = (vlib_thread_registration_t *) p[0];
  if (tr)
    {
      num_workers = tr->count;
      first_worker_index = tr->first_index;
    }

  /* Allocate devices to this thread */
  if (instances == 0)
    {
      /* auto-assign */
      instance_id = w->instance_id;

      p = hash_get_mem (tm->thread_registrations_by_name, io_name);
      tr = (vlib_thread_registration_t *) p[0];
      /* Otherwise, how did we get here */
      ASSERT (tr && tr->count);
      instances = tr->count;
    }
  else
    {
      /* manually assign */
      /* NOTE(review): in this path tr remains the *worker* registration
       * looked up above, and the device-assignment loop below divides by
       * tr->count — confirm that is the intended divisor rather than
       * `instances`. */
      ASSERT (instance_id < instances);
    }

  vec_validate (handoff_queue_elt_by_worker_index,
                first_worker_index + num_workers - 1);

  vec_validate_init_empty (congested_handoff_queue_by_worker_index,
                           first_worker_index + num_workers - 1,
                           (vlib_frame_queue_t *)(~0));

  /* packet tracing is triggered on the dpdk-input node for ease-of-use */
  node_trace = vlib_node_get_runtime (vm, dpdk_input_node.index);

  /* And handle them... */
  while (1)
    {
      u32 n_buffers;
      u32 mb_index;
      uword n_rx_bytes = 0;
      u32 n_trace, trace_cnt __attribute__((unused));
      vlib_buffer_free_list_t * fl;
      u32 hash;
      u64 hash_key;
      u8 efd_discard_burst;

      vlib_worker_thread_barrier_check ();

      /* Invoke callback if supplied */
      if (PREDICT_FALSE(callback != NULL))
          callback(vm);

      /* Re-partition devices whenever the device vector grows/shrinks. */
      if (PREDICT_FALSE(vec_len(dm->devices) != num_devices))
        {
          vec_reset_length(my_devices);
          vec_foreach (xd, dm->devices)
            {
              if (((xd - dm->devices) % tr->count) == instance_id)
                {
                  fprintf(stderr, "i/o thread %d (cpu %d) takes port %d\n",
                          instance_id, (int) os_get_cpu_number(), (int) (xd - dm->devices));
                  vec_add1 (my_devices, xd);
                }
            }
          num_devices = vec_len(dm->devices);
        }

      for (i = 0; i < vec_len (my_devices); i++)
        {
          xd = my_devices[i];

          if (!xd->admin_up)
            continue;

          n_buffers = dpdk_rx_burst(dm, xd, 0 /* queue_id */);

          if (n_buffers == 0)
            {
              /* check if EFD (dpdk) is enabled */
              if (PREDICT_FALSE(dm->efd.enabled))
                {
                  /* reset a few stats */
                  xd->efd_agent.last_poll_time = 0;
                  xd->efd_agent.last_burst_sz = 0;
                }
              continue;
            }

          vec_reset_length (xd->d_trace_buffers);
          trace_cnt = n_trace = vlib_get_trace_count (vm, node_trace);

          /*
           * DAW-FIXME: VMXNET3 device stop/start doesn't work,
           * therefore fake the stop in the dpdk driver by
           * silently dropping all of the incoming pkts instead of
           * stopping the driver / hardware.
           */
          if (PREDICT_FALSE(xd->admin_up != 1))
            {
              for (mb_index = 0; mb_index < n_buffers; mb_index++)
                rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
              continue;
            }

          /* reset EFD action for the burst */
          efd_discard_burst = 0;

          /* Check for congestion if EFD (Early-Fast-Discard) is enabled
           * in any mode (e.g. dpdk, monitor, or drop_all)
           */
          if (PREDICT_FALSE(dm->efd.enabled))
            {
              /* update EFD counters */
              dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);

              if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED))
                {
                  /* drop all received packets */
                  for (mb_index = 0; mb_index < n_buffers; mb_index++)
                    rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);

                  xd->efd_agent.discard_cnt += n_buffers;
                  increment_efd_drop_counter(vm,
                                             DPDK_ERROR_VLAN_EFD_DROP_PKTS,
                                             n_buffers);

                  continue;
                }

              if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
                                dm->efd.consec_full_frames_hi_thresh))
                {
                  u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
                                                               queue_id);
                  if (device_queue_sz >= dm->efd.queue_hi_thresh)
                    {
                      /* dpdk device queue has reached the critical threshold */
                      xd->efd_agent.congestion_cnt++;

                      /* apply EFD to packets from the burst */
                      efd_discard_burst = 1;
                    }
                }
            }

          fl = vlib_buffer_get_free_list
            (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);

          mb_index = 0;

          while (n_buffers > 0)
            {
              u32 bi0;
              u8 next0, error0;
              u32 l3_offset0;
              vlib_buffer_t * b0, * b_seg, * b_chain = 0;
              ethernet_header_t * h0;
              u8 nb_seg = 1;
              struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
              struct rte_mbuf *mb_seg = mb->next;

              if (PREDICT_TRUE(n_buffers > 1))
                {
                  /* NOTE(review): mb_index+2 can index past the received
                   * burst for the last two packets; presumably rx_vectors
                   * has enough slack that the load is harmless — confirm. */
                  struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
                  vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
                  CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
                  CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
                  CLIB_PREFETCH (bp->data, CLIB_CACHE_LINE_BYTES, LOAD);
                }

              /* the vlib_buffer_t metadata lives immediately after the mbuf */
              b0 = (vlib_buffer_t *)(mb+1);

              /* check whether EFD is looking for packets to discard */
              if (PREDICT_FALSE(efd_discard_burst))
                {
                  u32 cntr_type;
                  if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
                    {
                      rte_pktmbuf_free(mb);
                      xd->efd_agent.discard_cnt++;
                      increment_efd_drop_counter(vm,
                                                 cntr_type,
                                                 1);

                      n_buffers--;
                      mb_index++;
                      continue;
                    }
                }

              /* Prefetch one next segment if it exists */
              if (PREDICT_FALSE(mb->nb_segs > 1))
                {
                  struct rte_mbuf *pfmb = mb->next;
                  vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
                  CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
                  CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
                  b_chain = b0;
                }

              bi0 = vlib_get_buffer_index (vm, b0);
              vlib_buffer_init_for_free_list (b0, fl);
              b0->clone_count = 0;

              dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
                                                       &next0, &error0);
#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
              /*
               * Clear overloaded TX offload flags when a DPDK driver
               * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
               */
              if (PREDICT_TRUE(trace_cnt == 0))
                mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
              else
                trace_cnt--;
#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */

              if (error0)
                  clib_warning ("bi %d error %d", bi0, error0);

              b0->error = 0;

              /* skip the ethernet header for next nodes that expect an
               * L3 packet; ethernet-input gets offset 0 */
              l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
                             next0 == DPDK_RX_NEXT_IP6_INPUT ||
                             next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
                            sizeof (ethernet_header_t) : 0);

              b0->current_data = l3_offset0;
              b0->current_length = mb->data_len - l3_offset0;

              b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;

              if (VMWARE_LENGTH_BUG_WORKAROUND)
                  b0->current_length -= 4;

              vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
              vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
              vnet_buffer(b0)->io_handoff.next_index = next0;
              n_rx_bytes += mb->pkt_len;

              /* Process subsequent segments of multi-segment packets */
              while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
                {
                  ASSERT(mb_seg != 0);

                  b_seg = (vlib_buffer_t *)(mb_seg+1);
                  vlib_buffer_init_for_free_list (b_seg, fl);
                  b_seg->clone_count = 0;

                  ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
                  ASSERT(b_seg->current_data == 0);

                  /*
                   * The driver (e.g. virtio) may not put the packet data at the start
                   * of the segment, so don't assume b_seg->current_data == 0 is correct.
                   */
                  b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;

                  b_seg->current_length = mb_seg->data_len;
                  b0->total_length_not_including_first_buffer +=
                    mb_seg->data_len;

                  b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);

                  b_chain = b_seg;
                  mb_seg = mb_seg->next;
                  nb_seg++;
                }

              /*
               * Turn this on if you run into
               * "bad monkey" contexts, and you want to know exactly
               * which nodes they've visited... See main.c...
               */
              VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);

              if (PREDICT_FALSE (n_trace > mb_index))
                vec_add1 (xd->d_trace_buffers, bi0);

              next_worker_index = first_worker_index;

              /*
               * Force unknown traffic onto worker 0,
               * and into ethernet-input. $$$$ add more hashes.
               */
              h0 = (ethernet_header_t *) b0->data;

              /* Compute ingress LB hash */
              hash_key = eth_get_key(h0);
              hash = (u32)clib_xxhash(hash_key);

              if (PREDICT_TRUE (is_pow2(num_workers)))
                next_worker_index += hash & (num_workers - 1);
              else
                next_worker_index += hash % num_workers;

              /* if EFD is enabled and not already discarding from dpdk,
               * check the worker ring/queue for congestion
               */
              if (PREDICT_FALSE(tm->efd.enabled && !efd_discard_burst))
                {
                  vlib_frame_queue_t *fq;

                  /* fq will be valid if the ring is congested */
                  fq = is_vlib_handoff_queue_congested(
                      next_worker_index, tm->efd.queue_hi_thresh,
                      congested_handoff_queue_by_worker_index);

                  if (PREDICT_FALSE(fq != NULL))
                    {
                      u32 cntr_type;
                      if (PREDICT_TRUE(cntr_type =
                                       is_efd_discardable(tm, b0, mb)))
                        {
                          /* discard the packet */
                          fq->enqueue_efd_discards++;
                          increment_efd_drop_counter(vm, cntr_type, 1);
                          rte_pktmbuf_free(mb);
                          n_buffers--;
                          mb_index++;
                          continue;
                        }
                    }
                }

              /* Switch output frame when the hash picks a different worker. */
              if (next_worker_index != current_worker_index)
                {
                  if (hf)
                    hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;

                  hf = dpdk_get_handoff_queue_elt(
                           next_worker_index,
                           handoff_queue_elt_by_worker_index);

                  n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors;
                  to_next_worker = &hf->buffer_index[hf->n_vectors];
                  current_worker_index = next_worker_index;
                }

              /* enqueue to correct worker thread */
              to_next_worker[0] = bi0;
              to_next_worker++;
              n_left_to_next_worker--;

              if (n_left_to_next_worker == 0)
                {
                  hf->n_vectors = VLIB_FRAME_SIZE;
                  vlib_put_handoff_queue_elt(hf);
                  current_worker_index = ~0;
                  handoff_queue_elt_by_worker_index[next_worker_index] = 0;
                  hf = 0;
                }

              n_buffers--;
              mb_index++;
            }

          if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
            {
              /* credit the trace to the trace node */
              dpdk_rx_trace (dm, node_trace, xd, queue_id, xd->d_trace_buffers,
                             vec_len (xd->d_trace_buffers));
              vlib_set_trace_count (vm, node_trace, n_trace - vec_len (xd->d_trace_buffers));
            }

          vlib_increment_combined_counter
            (vnet_get_main()->interface_main.combined_sw_if_counters
             + VNET_INTERFACE_COUNTER_RX,
             cpu_index,
             xd->vlib_sw_if_index,
             mb_index, n_rx_bytes);

          dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
          dw->aggregate_rx_packets += mb_index;
        }

      if (hf)
        hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;

      /* Ship frames to the worker nodes */
      for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++)
        {
          if (handoff_queue_elt_by_worker_index[i])
            {
              hf = handoff_queue_elt_by_worker_index[i];
              /*
               * It works better to let the handoff node
               * rate-adapt, always ship the handoff queue element.
               */
              if (1 || hf->n_vectors == hf->last_n_vectors)
                {
                  vlib_put_handoff_queue_elt(hf);
                  handoff_queue_elt_by_worker_index[i] = 0;
                }
              else
                hf->last_n_vectors = hf->n_vectors;
            }
          congested_handoff_queue_by_worker_index[i] = (vlib_frame_queue_t *)(~0);
        }
      hf = 0;
      current_worker_index = ~0;

      vlib_increment_main_loop_counter (vm);
    }
}

/*
 * This function is used when the main thread performs IO and feeds the
 * worker threads.
 */
static uword
dpdk_io_input (vlib_main_t * vm,
               vlib_node_runtime_t * node,
               vlib_frame_t * f)
{
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd;
  vlib_thread_main_t * tm = vlib_get_thread_main();
  uword n_rx_packets = 0;
  /* function-static caches: safe because this node runs only on the
   * main thread (see comment above) */
  static vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index;
  static vlib_frame_queue_t ** congested_handoff_queue_by_worker_index = 0;
  vlib_frame_queue_elt_t * hf = 0;
  int i;
  u32 n_left_to_next_worker = 0, * to_next_worker = 0;
  u32 next_worker_index = 0;
  u32 current_worker_index = ~0;
  u32 cpu_index = os_get_cpu_number();
  static int num_workers_set;
  static u32 num_workers;
  u16 queue_id = 0;
  vlib_node_runtime_t * node_trace;
  static u32 first_worker_index;

  /* One-time discovery of the worker-thread pool. */
  if (PREDICT_FALSE(num_workers_set == 0))
    {
      uword * p;
      vlib_thread_registration_t * tr;
      /* Only the standard vnet worker threads are supported */
      p = hash_get_mem (tm->thread_registrations_by_name, "workers");
      tr = (vlib_thread_registration_t *) p[0];
      if (tr)
        {
          num_workers = tr->count;
          first_worker_index = tr->first_index;
        }
      num_workers_set = 1;
    }

  /* Lazily size the per-worker handoff/congestion caches. */
  if (PREDICT_FALSE(handoff_queue_elt_by_worker_index == 0))
    {
      vec_validate (handoff_queue_elt_by_worker_index, tm->n_vlib_mains - 1);

      vec_validate_init_empty (congested_handoff_queue_by_worker_index,
                               first_worker_index + num_workers - 1,
                               (vlib_frame_queue_t *)(~0));
    }

  /* packet tracing is triggered on the dpdk-input node for ease-of-use */
  node_trace = vlib_node_get_runtime (vm, dpdk_input_node.index);

  vec_foreach (xd, dm->devices)
    {
      u32 n_buffers;
      u32 mb_index;
      uword n_rx_bytes = 0;
      u32 n_trace, trace_cnt __attribute__((unused));
      vlib_buffer_free_list_t * fl;
      u32 hash;
      u64 hash_key;
      u8 efd_discard_burst = 0;

      if (!xd->admin_up)
        continue;

      n_buffers = dpdk_rx_burst(dm, xd, queue_id );

      if (n_buffers == 0)
        {
          /* check if EFD (dpdk) is enabled */
          if (PREDICT_FALSE(dm->efd.enabled))
            {
              /* reset a few stats */
              xd->efd_agent.last_poll_time = 0;
              xd->efd_agent.last_burst_sz = 0;
            }
          continue;
        }

      vec_reset_length (xd->d_trace_buffers);
      trace_cnt = n_trace = vlib_get_trace_count (vm, node_trace);

      /*
       * DAW-FIXME: VMXNET3 device stop/start doesn't work,
       * therefore fake the stop in the dpdk driver by
       * silently dropping all of the incoming pkts instead of
       * stopping the driver / hardware.
       */
      if (PREDICT_FALSE(xd->admin_up != 1))
        {
          for (mb_index = 0; mb_index < n_buffers; mb_index++)
            rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
          continue;
        }

      /* Check for congestion if EFD (Early-Fast-Discard) is enabled
       * in any mode (e.g. dpdk, monitor, or drop_all)
       */
      if (PREDICT_FALSE(dm->efd.enabled))
        {
          /* update EFD counters */
          dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);

          if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED))
            {
              /* discard all received packets */
              for (mb_index = 0; mb_index < n_buffers; mb_index++)
                rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);

              xd->efd_agent.discard_cnt += n_buffers;
              increment_efd_drop_counter(vm,
                                         DPDK_ERROR_VLAN_EFD_DROP_PKTS,
                                         n_buffers);

              continue;
            }

          if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
                            dm->efd.consec_full_frames_hi_thresh))
            {
              u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
                                                           queue_id);
              if (device_queue_sz >= dm->efd.queue_hi_thresh)
                {
                  /* dpdk device queue has reached the critical threshold */
                  xd->efd_agent.congestion_cnt++;

                  /* apply EFD to packets from the burst */
                  efd_discard_burst = 1;
                }
            }
        }

      fl = vlib_buffer_get_free_list
        (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);

      mb_index = 0;

      while (n_buffers > 0)
        {
          u32 bi0;
          u8 next0, error0;
          u32 l3_offset0;
          vlib_buffer_t * b0, * b_seg, * b_chain = 0;
          ethernet_header_t * h0;
          u8 nb_seg = 1;
          struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
          struct rte_mbuf *mb_seg = mb->next;

          if (PREDICT_TRUE(n_buffers > 1))
            {
              /* NOTE(review): mb_index+2 can index past the received burst
               * for the last two packets; presumably rx_vectors has enough
               * slack that the load is harmless — confirm. */
              struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
              vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
              CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
              CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
              CLIB_PREFETCH (bp->data, CLIB_CACHE_LINE_BYTES, LOAD);
            }

          /* the vlib_buffer_t metadata lives immediately after the mbuf */
          b0 = (vlib_buffer_t *)(mb+1);

          /* check whether EFD is looking for packets to discard */
          if (PREDICT_FALSE(efd_discard_burst))
            {
              u32 cntr_type;
              if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
                {
                  rte_pktmbuf_free(mb);
                  xd->efd_agent.discard_cnt++;
                  increment_efd_drop_counter(vm,
                                             cntr_type,
                                             1);

                  n_buffers--;
                  mb_index++;
                  continue;
                }
            }

          /* Prefetch one next segment if it exists */
          if (PREDICT_FALSE(mb->nb_segs > 1))
            {
              struct rte_mbuf *pfmb = mb->next;
              vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
              CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
              CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
              b_chain = b0;
            }

          bi0 = vlib_get_buffer_index (vm, b0);
          vlib_buffer_init_for_free_list (b0, fl);
          b0->clone_count = 0;

          dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
                                                   &next0, &error0);
#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
          /*
           * Clear overloaded TX offload flags when a DPDK driver
           * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
           */
          if (PREDICT_TRUE(trace_cnt == 0))
            mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
          else
            trace_cnt--;
#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */

          if (error0)
              clib_warning ("bi %d error %d", bi0, error0);

          b0->error = 0;

          /* skip the ethernet header for next nodes that expect an
           * L3 packet; ethernet-input gets offset 0 */
          l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
                         next0 == DPDK_RX_NEXT_IP6_INPUT ||
                         next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
                        sizeof (ethernet_header_t) : 0);

          b0->current_data = l3_offset0;
          b0->current_length = mb->data_len - l3_offset0;

          b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;

          if (VMWARE_LENGTH_BUG_WORKAROUND)
              b0->current_length -= 4;

          vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
          vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
          vnet_buffer(b0)->io_handoff.next_index = next0;
          n_rx_bytes += mb->pkt_len;

          /* Process subsequent segments of multi-segment packets */
          while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
            {
              ASSERT(mb_seg != 0);

              b_seg = (vlib_buffer_t *)(mb_seg+1);
              vlib_buffer_init_for_free_list (b_seg, fl);
              b_seg->clone_count = 0;

              ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
              ASSERT(b_seg->current_data == 0);

              /*
               * The driver (e.g. virtio) may not put the packet data at the start
               * of the segment, so don't assume b_seg->current_data == 0 is correct.
               */
              b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;

              b_seg->current_length = mb_seg->data_len;
              b0->total_length_not_including_first_buffer +=
                mb_seg->data_len;

              b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
              b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);

              b_chain = b_seg;
              mb_seg = mb_seg->next;
              nb_seg++;
            }

          /*
           * Turn this on if you run into
           * "bad monkey" contexts, and you want to know exactly
           * which nodes they've visited... See main.c...
           */
          VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);

          if (PREDICT_FALSE (n_trace > mb_index))
            vec_add1 (xd->d_trace_buffers, bi0);

          next_worker_index = first_worker_index;

          /*
           * Force unknown traffic onto worker 0,
           * and into ethernet-input. $$$$ add more hashes.
           */
          h0 = (ethernet_header_t *) b0->data;

          /* Compute ingress LB hash */
          hash_key = eth_get_key(h0);
          hash = (u32)clib_xxhash(hash_key);

          if (PREDICT_TRUE (is_pow2(num_workers)))
            next_worker_index += hash & (num_workers - 1);
          else
            next_worker_index += hash % num_workers;

          /* if EFD is enabled and not already discarding from dpdk,
           * check the worker ring/queue for congestion
           */
          if (PREDICT_FALSE(tm->efd.enabled && !efd_discard_burst))
            {
              vlib_frame_queue_t *fq;

              /* fq will be valid if the ring is congested */
              fq = is_vlib_handoff_queue_congested(
                  next_worker_index, tm->efd.queue_hi_thresh,
                  congested_handoff_queue_by_worker_index);

              if (PREDICT_FALSE(fq != NULL))
                {
                  u32 cntr_type;
                  if (PREDICT_TRUE(cntr_type =
                                   is_efd_discardable(tm, b0, mb)))
                    {
                      /* discard the packet */
                      fq->enqueue_efd_discards++;
                      increment_efd_drop_counter(vm, cntr_type, 1);
                      rte_pktmbuf_free(mb);
                      n_buffers--;
                      mb_index++;
                      continue;
                    }
                }
            }

          /* Switch output frame when the hash picks a different worker. */
          if (next_worker_index != current_worker_index)
            {
              if (hf)
                hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;

              hf = dpdk_get_handoff_queue_elt(
                       next_worker_index,
                       handoff_queue_elt_by_worker_index);

              n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors;
              to_next_worker = &hf->buffer_index[hf->n_vectors];
              current_worker_index = next_worker_index;
            }

          /* enqueue to correct worker thread */
          to_next_worker[0] = bi0;
          to_next_worker++;
          n_left_to_next_worker--;

          if (n_left_to_next_worker == 0)
            {
              hf->n_vectors = VLIB_FRAME_SIZE;
              vlib_put_handoff_queue_elt(hf);
              current_worker_index = ~0;
              handoff_queue_elt_by_worker_index[next_worker_index] = 0;
              hf = 0;
            }

          n_buffers--;
          mb_index++;
        }

      if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
        {
          /* credit the trace to the trace node */
          dpdk_rx_trace (dm, node_trace, xd, queue_id, xd->d_trace_buffers,
                         vec_len (xd->d_trace_buffers));
          vlib_set_trace_count (vm, node_trace, n_trace - vec_len (xd->d_trace_buffers));
        }

      vlib_increment_combined_counter
        (vnet_get_main()->interface_main.combined_sw_if_counters
         + VNET_INTERFACE_COUNTER_RX,
         cpu_index,
         xd->vlib_sw_if_index,
         mb_index, n_rx_bytes);

      dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
      dw->aggregate_rx_packets += mb_index;
      n_rx_packets += mb_index;
    }

  if (hf)
    hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;

  /* Ship frames to the worker nodes */
  for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++)
    {
      if (handoff_queue_elt_by_worker_index[i])
        {
          hf = handoff_queue_elt_by_worker_index[i];
          /*
           * It works better to let the handoff node
           * rate-adapt, always ship the handoff queue element.
           */
          if (1 || hf->n_vectors == hf->last_n_vectors)
            {
              vlib_put_handoff_queue_elt(hf);
              handoff_queue_elt_by_worker_index[i] = 0;
            }
          else
            hf->last_n_vectors = hf->n_vectors;
        }
      congested_handoff_queue_by_worker_index[i] = (vlib_frame_queue_t *)(~0);
    }
  hf = 0;
  current_worker_index = ~0;
  return n_rx_packets;
}

VLIB_REGISTER_NODE (dpdk_io_input_node) = {
  .function = dpdk_io_input,
  .type = VLIB_NODE_TYPE_INPUT,
  .name = "dpdk-io-input",

  /* Will be enabled if/when hardware is detected.
*/ + .state = VLIB_NODE_STATE_DISABLED, + + .format_buffer = format_ethernet_header_with_length, + .format_trace = format_dpdk_rx_dma_trace, + + .n_errors = DPDK_N_ERROR, + .error_strings = dpdk_error_strings, + + .n_next_nodes = DPDK_RX_N_NEXT, + .next_nodes = { + [DPDK_RX_NEXT_DROP] = "error-drop", + [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input", + [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum", + [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input", + [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input", + }, +}; + +/* + * set_efd_bitmap() + * Based on the operation type, set lower/upper bits for the given index value + */ +void +set_efd_bitmap (u8 *bitmap, u32 value, u32 op) +{ + int ix; + + *bitmap = 0; + for (ix = 0; ix < 8; ix++) { + if (((op == EFD_OPERATION_LESS_THAN) && (ix < value)) || + ((op == EFD_OPERATION_GREATER_OR_EQUAL) && (ix >= value))){ + (*bitmap) |= (1 << ix); + } + } +} + +void +efd_config (u32 enabled, + u32 ip_prec, u32 ip_op, + u32 mpls_exp, u32 mpls_op, + u32 vlan_cos, u32 vlan_op) +{ + vlib_thread_main_t * tm = vlib_get_thread_main(); + dpdk_main_t * dm = &dpdk_main; + + if (enabled) { + tm->efd.enabled |= VLIB_EFD_DISCARD_ENABLED; + dm->efd.enabled |= DPDK_EFD_DISCARD_ENABLED; + } else { + tm->efd.enabled &= ~VLIB_EFD_DISCARD_ENABLED; + dm->efd.enabled &= ~DPDK_EFD_DISCARD_ENABLED; + } + + set_efd_bitmap(&tm->efd.ip_prec_bitmap, ip_prec, ip_op); + set_efd_bitmap(&tm->efd.mpls_exp_bitmap, mpls_exp, mpls_op); + set_efd_bitmap(&tm->efd.vlan_cos_bitmap, vlan_cos, vlan_op); + +} diff --git a/vnet/vnet/devices/dpdk/threads.c b/vnet/vnet/devices/dpdk/threads.c new file mode 100644 index 00000000000..aa32f1007c3 --- /dev/null +++ b/vnet/vnet/devices/dpdk/threads.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <vnet/vnet.h>
#include <vppinfra/vec.h>
#include <vppinfra/error.h>
#include <vppinfra/format.h>
#include <signal.h>

#include <vnet/ethernet/ethernet.h>
#include <vnet/devices/dpdk/dpdk.h>

#include <vlibmemory/api.h>
#include <vlibmemory/vl_memory_msg_enum.h> /* enumerate all vlib messages */

#define vl_typedefs             /* define message structures */
#include <vlibmemory/vl_memory_api_h.h>
#undef vl_typedefs

/* instantiate all the print functions we know about */
#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__)
#define vl_printfun
#include <vlibmemory/vl_memory_api_h.h>
#undef vl_printfun

vlib_thread_main_t vlib_thread_main;

frame_queue_trace_t *frame_queue_traces;

/*
 * Check the frame queue to see if any frames are available.
 * If so, pull the packets off the frames and put them to
 * the handoff node.
 */
static inline int vlib_frame_queue_dequeue_internal (vlib_main_t *vm)
{
  u32 thread_id = vm->cpu_index;
  vlib_frame_queue_t *fq = vlib_frame_queues[thread_id];
  vlib_frame_queue_elt_t *elt;
  u32 * from, * to;
  vlib_frame_t * f;
  int msg_type;
  int processed = 0;
  u32 n_left_to_node;
  u32 vectors = 0;

  ASSERT (fq);
  ASSERT(vm == vlib_mains[thread_id]);

  /*
   * Gather trace data for frame queues
   */
  if (PREDICT_FALSE(fq->trace))
    {
      frame_queue_trace_t *fqt;
      u32 elix;

      fqt = &frame_queue_traces[thread_id];
      fqt->nelts = fq->nelts;
      fqt->head = fq->head;
      fqt->head_hint = fq->head_hint;
      fqt->tail = fq->tail;
      fqt->threshold = fq->vector_threshold;
      fqt->n_in_use = fqt->tail - fqt->head;
      if (fqt->n_in_use > fqt->nelts){
        /* head/tail racing with the producer can make this transiently
         * inconsistent; report 0 rather than a bogus count */
        fqt->n_in_use = 0;
      }

      for (elix=0; elix<fqt->nelts; elix++) {
        elt = fq->elts + ((fq->head+1 + elix) & (fq->nelts-1));
        if (1 || elt->valid)
          {
            fqt->n_vectors[elix] = elt->n_vectors;
          }
      }
      fqt->written = 1;
    }

  /* Drain valid elements until the queue is empty or the vector
   * threshold is reached. */
  while (1)
    {
      if (fq->head == fq->tail)
        {
          fq->head_hint = fq->head;
          return processed;
        }

      elt = fq->elts + ((fq->head+1) & (fq->nelts-1));

      if (!elt->valid)
        {
          fq->head_hint = fq->head;
          return processed;
        }

      from = elt->buffer_index;
      msg_type = elt->msg_type;

      ASSERT (msg_type == VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME);
      ASSERT (elt->n_vectors <= VLIB_FRAME_SIZE);

      f = vlib_get_frame_to_node
        (vm, 1 ? handoff_dispatch_node.index : ethernet_input_node.index);

      to = vlib_frame_vector_args (f);

      n_left_to_node = elt->n_vectors;

      /* copy buffer indices, 4 at a time then the remainder */
      while (n_left_to_node >= 4)
        {
          to[0] = from[0];
          to[1] = from[1];
          to[2] = from[2];
          to[3] = from[3];
          to += 4;
          from += 4;
          n_left_to_node -= 4;
        }

      while (n_left_to_node > 0)
        {
          to[0] = from[0];
          to++;
          from++;
          n_left_to_node--;
        }

      vectors += elt->n_vectors;
      f->n_vectors = elt->n_vectors;
      vlib_put_frame_to_node
        (vm, 1 ? handoff_dispatch_node.index : ethernet_input_node.index, f);

      /* release the element back to the producer; the barrier orders the
       * field resets before the head advance */
      elt->valid = 0;
      elt->n_vectors = 0;
      elt->msg_type = 0xfefefefe;
      CLIB_MEMORY_BARRIER();
      fq->head++;
      processed++;

      /*
       * Limit the number of packets pushed into the graph
       */
      if (vectors >= fq->vector_threshold)
        {
          fq->head_hint = fq->head;
          return processed;
        }
    }
  ASSERT(0);
  return processed;
}

/* Public wrapper around the inline dequeue loop above. */
int dpdk_frame_queue_dequeue (vlib_main_t *vm)
{
  return vlib_frame_queue_dequeue_internal (vm);
}

/*
 * dpdk_worker_thread - Contains the main loop of a worker thread.
 *
 * w
 *     Information for the current thread
 * io_name
 *     The name of thread performing dpdk device IO (if any). If there are no
 *     instances of that thread, then the current thread will do dpdk device
 *     polling. Ports will be divided among instances of the current thread.
 * callback
 *     If not null, this function will be called once during each main loop.
 */
static_always_inline void
dpdk_worker_thread_internal (vlib_main_t *vm,
                             dpdk_worker_thread_callback_t callback,
                             int have_io_threads)
{
  vlib_node_main_t * nm = &vm->node_main;
  u64 cpu_time_now = clib_cpu_time_now ();

  while (1)
    {
      vlib_worker_thread_barrier_check ();

      vlib_frame_queue_dequeue_internal (vm);

      /* Invoke callback if supplied */
      if (PREDICT_FALSE(callback != NULL))
        callback(vm);

      /* No dedicated IO threads: this worker also polls the input nodes. */
      if (!have_io_threads)
        {
          vlib_node_runtime_t * n;
          vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_INPUT])
            {
              cpu_time_now = dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT,
                                            VLIB_NODE_STATE_POLLING, /* frame */ 0,
                                            cpu_time_now);
            }

        }

      if (_vec_len (nm->pending_frames))
        {
          int i;
          cpu_time_now = clib_cpu_time_now ();
          for (i = 0; i < _vec_len (nm->pending_frames); i++) {
            vlib_pending_frame_t *p;

            p = nm->pending_frames + i;

            cpu_time_now = dispatch_pending_node (vm, p, cpu_time_now);
          }
          _vec_len (nm->pending_frames) = 0;
        }
      vlib_increment_main_loop_counter (vm);

      /* Record time stamp in
case there are no enabled nodes and above + calls do not update time stamp. */ + cpu_time_now = clib_cpu_time_now (); + } +} + +void dpdk_worker_thread (vlib_worker_thread_t * w, + char *io_name, + dpdk_worker_thread_callback_t callback) +{ + vlib_main_t *vm; + uword * p; + vlib_thread_main_t * tm = vlib_get_thread_main(); + vlib_thread_registration_t * tr; + dpdk_main_t * dm = &dpdk_main; + + vm = vlib_get_main(); + + ASSERT(vm->cpu_index == os_get_cpu_number()); + + clib_time_init (&vm->clib_time); + clib_mem_set_heap (w->thread_mheap); + + /* Wait until the dpdk init sequence is complete */ + while (dm->io_thread_release == 0) + vlib_worker_thread_barrier_check (); + + /* any I/O threads? */ + p = hash_get_mem (tm->thread_registrations_by_name, io_name); + tr = (vlib_thread_registration_t *)p[0]; + + if (tr && tr->count > 0) + dpdk_worker_thread_internal(vm, callback, /* have_io_threads */ 1); + else + dpdk_worker_thread_internal(vm, callback, /* have_io_threads */ 0); +} + +void dpdk_worker_thread_fn (void * arg) +{ + vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg; + vlib_worker_thread_init (w); + dpdk_worker_thread (w, "io", 0); +} + +#if VIRL == 0 +VLIB_REGISTER_THREAD (worker_thread_reg, static) = { + .name = "workers", + .short_name = "wk", + .function = dpdk_worker_thread_fn, + .mheap_size = 256<<20, +}; +#endif + +void dpdk_io_thread_fn (void * arg) +{ + vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg; + vlib_worker_thread_init (w); + dpdk_io_thread (w, 0, 0, "workers", 0); +} + +#if VIRL == 0 +VLIB_REGISTER_THREAD (io_thread_reg, static) = { + .name = "io", + .short_name = "io", + .function = dpdk_io_thread_fn, + .mheap_size = 256<<20, +}; +#endif + +static void vl_api_rpc_call_t_handler (vl_api_rpc_call_t * mp) +{ + vl_api_rpc_reply_t * rmp; + int (*fp)(void *); + i32 rv = 0; + vlib_main_t * vm = vlib_get_main(); + + if (mp->function == 0) + { + rv = -1; + clib_warning ("rpc NULL function pointer"); + } + + else + { + if 
(mp->need_barrier_sync) + vlib_worker_thread_barrier_sync (vm); + + fp = (void *)(mp->function); + rv = (*fp)(mp->data); + + if (mp->need_barrier_sync) + vlib_worker_thread_barrier_release (vm); + } + + if (mp->send_reply) + { + unix_shared_memory_queue_t * q = + vl_api_client_index_to_input_queue (mp->client_index); + if (q) + { + rmp = vl_msg_api_alloc_as_if_client (sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_RPC_REPLY); + rmp->context = mp->context; + rmp->retval = rv; + vl_msg_api_send_shmem (q, (u8 *)&rmp); + } + } + if (mp->multicast) + { + clib_warning ("multicast not yet implemented..."); + } +} + +static void vl_api_rpc_reply_t_handler (vl_api_rpc_reply_t * mp) +{ clib_warning ("unimplemented"); } + +void vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length) +{ + vl_api_rpc_call_t * mp; + api_main_t *am = &api_main; + vl_shmem_hdr_t *shmem_hdr = am->shmem_hdr; + + mp = vl_msg_api_alloc_as_if_client (sizeof (*mp) + data_length); + memset (mp, 0, sizeof (*mp)); + memcpy (mp->data, data, data_length); + mp->_vl_msg_id = ntohs (VL_API_RPC_CALL); + mp->function = (u64)fp; + mp->need_barrier_sync = 1; + + /* Use the "normal" control-plane mechanism for the main thread */ + vl_msg_api_send_shmem (shmem_hdr->vl_input_queue, (u8 *)&mp); +} + + +#define foreach_rpc_api_msg \ +_(RPC_CALL,rpc_call) \ +_(RPC_REPLY,rpc_reply) + +static clib_error_t * +rpc_api_hookup (vlib_main_t *vm) +{ +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 0 /* do not trace */); + foreach_rpc_api_msg; +#undef _ + return 0; +} + +VLIB_API_INIT_FUNCTION(rpc_api_hookup); diff --git a/vnet/vnet/devices/dpdk/threads.h b/vnet/vnet/devices/dpdk/threads.h new file mode 100644 index 00000000000..8f0fcbdb465 --- /dev/null +++ b/vnet/vnet/devices/dpdk/threads.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_dpdk_threads_h__ +#define __included_dpdk_threads_h__ + +#include <vnet/vnet.h> + +void vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length); + +typedef void (*dpdk_worker_thread_callback_t) (vlib_main_t *vm); + +void dpdk_worker_thread (vlib_worker_thread_t * w, + char *io_name, + dpdk_worker_thread_callback_t callback); + +int dpdk_frame_queue_dequeue (vlib_main_t *vm); + +#endif /* __included_dpdk_threads_h__ */ diff --git a/vnet/vnet/devices/dpdk/vhost_user.c b/vnet/vnet/devices/dpdk/vhost_user.c new file mode 100644 index 00000000000..5ab4c22ed3e --- /dev/null +++ b/vnet/vnet/devices/dpdk/vhost_user.c @@ -0,0 +1,1550 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/stat.h> +#include <sys/vfs.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> + +#include <vnet/vnet.h> +#include <vppinfra/vec.h> +#include <vppinfra/error.h> +#include <vppinfra/format.h> + +#include <vnet/ethernet/ethernet.h> +#include <vnet/devices/dpdk/dpdk.h> + +#include <vnet/devices/virtio/vhost-user.h> + +#define VHOST_USER_DEBUG_SOCKET 0 + +#if VHOST_USER_DEBUG_SOCKET == 1 +#define DBG_SOCK(args...) clib_warning(args); +#else +#define DBG_SOCK(args...) +#endif + +/* + * DPDK vhost-user functions + */ + +/* portions taken from dpdk + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +static uint64_t +qva_to_vva(struct virtio_net *dev, uint64_t qemu_va) +{ + struct virtio_memory_regions *region; + uint64_t vhost_va = 0; + uint32_t regionidx = 0; + + /* Find the region where the address lives. */ + for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { + region = &dev->mem->regions[regionidx]; + if ((qemu_va >= region->userspace_address) && + (qemu_va <= region->userspace_address + + region->memory_size)) { + vhost_va = qemu_va + region->guest_phys_address + + region->address_offset - + region->userspace_address; + break; + } + } + return vhost_va; +} + +static dpdk_device_t * +dpdk_vhost_user_device_from_hw_if_index(u32 hw_if_index) +{ + vnet_main_t *vnm = vnet_get_main(); + dpdk_main_t * dm = &dpdk_main; + vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, hw_if_index); + dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance); + + if (xd->dev_type != VNET_DPDK_DEV_VHOST_USER) + return 0; + + return xd; +} + +static dpdk_device_t * +dpdk_vhost_user_device_from_sw_if_index(u32 sw_if_index) +{ + vnet_main_t *vnm = vnet_get_main(); + vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, sw_if_index); + ASSERT (sw->type == VNET_SW_INTERFACE_TYPE_HARDWARE); + + return dpdk_vhost_user_device_from_hw_if_index(sw->hw_if_index); +} + +static inline void * map_guest_mem(dpdk_device_t * xd, u64 addr) +{ + dpdk_vu_intf_t * vui = xd->vu_intf; + struct virtio_memory * mem = 
xd->vu_vhost_dev.mem; + int i; + for (i=0; i<mem->nregions; i++) { + if ((mem->regions[i].guest_phys_address <= addr) && + ((mem->regions[i].guest_phys_address + mem->regions[i].memory_size) > addr)) { + return (void *) (vui->region_addr[i] + addr - mem->regions[i].guest_phys_address); + } + } + DBG_SOCK("failed to map guest mem addr %llx", addr); + return 0; +} + +static clib_error_t * +dpdk_create_vhost_user_if_internal (u32 * hw_if_index, u32 if_id) +{ + dpdk_main_t * dm = &dpdk_main; + vlib_main_t * vm = vlib_get_main(); + vlib_thread_main_t * tm = vlib_get_thread_main(); + vnet_sw_interface_t * sw; + clib_error_t * error; + dpdk_device_and_queue_t * dq; + + dpdk_device_t * xd = NULL; + u8 addr[6]; + int j; + + vlib_worker_thread_barrier_sync (vm); + + int inactive_cnt = vec_len(dm->vu_inactive_interfaces_device_index); + // if there are any inactive ifaces + if (inactive_cnt > 0) { + // take last + u32 vui_idx = dm->vu_inactive_interfaces_device_index[inactive_cnt - 1]; + if (vec_len(dm->devices) > vui_idx) { + xd = vec_elt_at_index (dm->devices, vui_idx); + if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER) { + DBG_SOCK("reusing inactive vhost-user interface sw_if_index %d", xd->vlib_sw_if_index); + } else { + clib_warning("error: inactive vhost-user interface sw_if_index %d not VHOST_USER type!", + xd->vlib_sw_if_index); + // reset so new interface is created + xd = NULL; + } + } + // "remove" from inactive list + _vec_len(dm->vu_inactive_interfaces_device_index) -= 1; + } + + if (xd) { + // existing interface used - do not overwrite if_id if not needed + if (if_id != (u32)~0) + xd->vu_if_id = if_id; + + // reset virtqueues + for (j = 0; j < VIRTIO_QNUM; j++) + { + memset(xd->vu_vhost_dev.virtqueue[j], 0, sizeof(struct vhost_virtqueue)); + } + // reset lockp + memset ((void *) xd->lockp, 0, CLIB_CACHE_LINE_BYTES); + + // reset tx vectors + for (j = 0; j < tm->n_vlib_mains; j++) + { + vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE, + sizeof(tx_ring_hdr_t), 
CLIB_CACHE_LINE_BYTES); + vec_reset_length (xd->tx_vectors[j]); + } + + // reset rx vector + for (j = 0; j < xd->rx_q_used; j++) + { + vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE-1, + CLIB_CACHE_LINE_BYTES); + vec_reset_length (xd->rx_vectors[j]); + } + } else { + // vui was not retrieved from inactive ifaces - create new + vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES); + xd->dev_type = VNET_DPDK_DEV_VHOST_USER; + xd->rx_q_used = 1; + vec_validate_aligned (xd->rx_vectors, xd->rx_q_used, CLIB_CACHE_LINE_BYTES); + + if (if_id == (u32)~0) + xd->vu_if_id = dm->next_vu_if_id++; + else + xd->vu_if_id = if_id; + + xd->device_index = xd - dm->devices; + xd->per_interface_next_index = ~0; + xd->vu_intf = NULL; + + xd->vu_vhost_dev.mem = clib_mem_alloc (sizeof(struct virtio_memory) + + VHOST_MEMORY_MAX_NREGIONS * + sizeof(struct virtio_memory_regions)); + + for (j = 0; j < VIRTIO_QNUM; j++) + { + xd->vu_vhost_dev.virtqueue[j] = clib_mem_alloc (sizeof(struct vhost_virtqueue)); + memset(xd->vu_vhost_dev.virtqueue[j], 0, sizeof(struct vhost_virtqueue)); + } + + xd->lockp = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, + CLIB_CACHE_LINE_BYTES); + memset ((void *) xd->lockp, 0, CLIB_CACHE_LINE_BYTES); + + vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains, + CLIB_CACHE_LINE_BYTES); + + for (j = 0; j < tm->n_vlib_mains; j++) + { + vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE, + sizeof(tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES); + vec_reset_length (xd->tx_vectors[j]); + } + + // reset rx vector + for (j = 0; j < xd->rx_q_used; j++) + { + vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE-1, + CLIB_CACHE_LINE_BYTES); + vec_reset_length (xd->rx_vectors[j]); + } + + vec_validate_aligned (xd->frames, tm->n_vlib_mains, + CLIB_CACHE_LINE_BYTES); + + } + { + f64 now = vlib_time_now(vm); + u32 rnd; + rnd = (u32) (now * 1e6); + rnd = random_u32 (&rnd); + + memcpy (addr+2, &rnd, sizeof(rnd)); + addr[0] = 2; + addr[1] = 0xfe; + } + + error = 
ethernet_register_interface + (dm->vnet_main, + dpdk_device_class.index, + xd->device_index, + /* ethernet address */ addr, + &xd->vlib_hw_if_index, + 0); + + if (error) + return error; + + sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index); + xd->vlib_sw_if_index = sw->sw_if_index; + + if (!xd->vu_intf) + xd->vu_intf = clib_mem_alloc (sizeof(*(xd->vu_intf))); + + *hw_if_index = xd->vlib_hw_if_index; + + int cpu = (xd->device_index % dm->input_cpu_count) + + dm->input_cpu_first_index; + + vec_add2(dm->devices_by_cpu[cpu], dq, 1); + dq->device = xd->device_index; + dq->queue_id = 0; + + // start polling if it was not started yet (because of no phys ifaces) + if (tm->n_vlib_mains == 1 && dpdk_input_node.state != VLIB_NODE_STATE_POLLING) + vlib_node_set_state (vm, dpdk_input_node.index, VLIB_NODE_STATE_POLLING); + + if (tm->n_vlib_mains > 1 && tm->main_thread_is_io_node) + vlib_node_set_state (vm, dpdk_io_input_node.index, VLIB_NODE_STATE_POLLING); + + if (tm->n_vlib_mains > 1 && !tm->main_thread_is_io_node) + vlib_node_set_state (vlib_mains[cpu], dpdk_input_node.index, + VLIB_NODE_STATE_POLLING); + + vlib_worker_thread_barrier_release (vm); + return 0; +} + +static clib_error_t * +dpdk_vhost_user_get_features(u32 hw_if_index, u64 * features) +{ + *features = rte_vhost_feature_get(); + + DBG_SOCK("supported features: 0x%x", *features); + return 0; +} + +static clib_error_t * +dpdk_vhost_user_set_features(u32 hw_if_index, u64 features) +{ + dpdk_device_t * xd; + u16 hdr_len = sizeof(struct virtio_net_hdr); + + + if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) { + clib_warning("not a vhost-user interface"); + return 0; + } + + xd->vu_vhost_dev.features = features; + + if (xd->vu_vhost_dev.features & (1 << VIRTIO_NET_F_MRG_RXBUF)) + hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf); + + xd->vu_vhost_dev.virtqueue[VIRTIO_RXQ]->vhost_hlen = hdr_len; + xd->vu_vhost_dev.virtqueue[VIRTIO_TXQ]->vhost_hlen = hdr_len; + + xd->vu_is_running = 0; 
+ + return 0; +} + +static clib_error_t * +dpdk_vhost_user_set_mem_table(u32 hw_if_index, vhost_user_memory_t * vum, int fd[]) +{ + struct virtio_memory * mem; + int i; + dpdk_device_t * xd; + dpdk_vu_intf_t * vui; + + if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) { + clib_warning("not a vhost-user interface"); + return 0; + } + + vui = xd->vu_intf; + mem = xd->vu_vhost_dev.mem; + + mem->nregions = vum->nregions; + + for (i=0; i < mem->nregions; i++) { + u64 mapped_size, mapped_address; + + mem->regions[i].guest_phys_address = vum->regions[i].guest_phys_addr; + mem->regions[i].guest_phys_address_end = vum->regions[i].guest_phys_addr + + vum->regions[i].memory_size; + mem->regions[i].memory_size = vum->regions[i].memory_size; + mem->regions[i].userspace_address = vum->regions[i].userspace_addr; + + mapped_size = mem->regions[i].memory_size + vum->regions[i].mmap_offset; + mapped_address = (uint64_t)(uintptr_t)mmap(NULL, mapped_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd[i], 0); + + if ((void *)mapped_address == MAP_FAILED) + { + clib_warning("mmap error"); + return 0; + } + + mapped_address += vum->regions[i].mmap_offset; + vui->region_addr[i] = mapped_address; + vui->region_fd[i] = fd[i]; + mem->regions[i].address_offset = mapped_address - mem->regions[i].guest_phys_address; + + if (vum->regions[i].guest_phys_addr == 0) { + mem->base_address = vum->regions[i].userspace_addr; + mem->mapped_address = mem->regions[i].address_offset; + } + } + + xd->vu_is_running = 0; + + DBG_SOCK("done"); + return 0; +} + +static clib_error_t * +dpdk_vhost_user_set_vring_num(u32 hw_if_index, u8 idx, u32 num) +{ + dpdk_device_t * xd; + struct vhost_virtqueue *vq; + + DBG_SOCK("idx %u num %u", idx, num); + + if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) { + clib_warning("not a vhost-user interface"); + return 0; + } + vq = xd->vu_vhost_dev.virtqueue[idx]; + vq->size = num; + + xd->vu_is_running = 0; + + return 0; +} + +static clib_error_t * 
+dpdk_vhost_user_set_vring_addr(u32 hw_if_index, u8 idx, u64 desc, u64 used, u64 avail) +{ + dpdk_device_t * xd; + struct vhost_virtqueue *vq; + + DBG_SOCK("idx %u desc 0x%x used 0x%x avail 0x%x", idx, desc, used, avail); + + if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) { + clib_warning("not a vhost-user interface"); + return 0; + } + vq = xd->vu_vhost_dev.virtqueue[idx]; + + vq->desc = (struct vring_desc *) qva_to_vva(&xd->vu_vhost_dev, desc); + vq->used = (struct vring_used *) qva_to_vva(&xd->vu_vhost_dev, used); + vq->avail = (struct vring_avail *) qva_to_vva(&xd->vu_vhost_dev, avail); + + if (!(vq->desc && vq->used && vq->avail)) { + clib_warning("falied to set vring addr"); + } + + xd->vu_is_running = 0; + + return 0; +} + +static clib_error_t * +dpdk_vhost_user_get_vring_base(u32 hw_if_index, u8 idx, u32 * num) +{ + dpdk_device_t * xd; + struct vhost_virtqueue *vq; + + if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) { + clib_warning("not a vhost-user interface"); + return 0; + } + + vq = xd->vu_vhost_dev.virtqueue[idx]; + *num = vq->last_used_idx; + + DBG_SOCK("idx %u num %u", idx, *num); + return 0; +} + +static clib_error_t * +dpdk_vhost_user_set_vring_base(u32 hw_if_index, u8 idx, u32 num) +{ + dpdk_device_t * xd; + struct vhost_virtqueue *vq; + + DBG_SOCK("idx %u num %u", idx, num); + + if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) { + clib_warning("not a vhost-user interface"); + return 0; + } + + vq = xd->vu_vhost_dev.virtqueue[idx]; + vq->last_used_idx = num; + vq->last_used_idx_res = num; + + xd->vu_is_running = 0; + + return 0; +} + +static clib_error_t * +dpdk_vhost_user_set_vring_kick(u32 hw_if_index, u8 idx, int fd) +{ + dpdk_main_t * dm = &dpdk_main; + dpdk_device_t * xd; + struct vhost_virtqueue *vq, *vq0, *vq1; + + DBG_SOCK("idx %u fd %d", idx, fd); + + if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) { + clib_warning("not a vhost-user interface"); + return 0; + } + + 
vq = xd->vu_vhost_dev.virtqueue[idx]; + vq->kickfd = fd; + + vq0 = xd->vu_vhost_dev.virtqueue[0]; + vq1 = xd->vu_vhost_dev.virtqueue[1]; + + if (vq0->desc && vq0->avail && vq0->used && + vq1->desc && vq1->avail && vq1->used) { + xd->vu_is_running = 1; + if (xd->admin_up) + vnet_hw_interface_set_flags (dm->vnet_main, xd->vlib_hw_if_index, + VNET_HW_INTERFACE_FLAG_LINK_UP | + ETH_LINK_FULL_DUPLEX ); + } + + return 0; +} + + +static clib_error_t * +dpdk_vhost_user_set_vring_call(u32 hw_if_index, u8 idx, int fd) +{ + dpdk_device_t * xd; + struct vhost_virtqueue *vq; + + DBG_SOCK("idx %u fd %d", idx, fd); + + if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) { + clib_warning("not a vhost-user interface"); + return 0; + } + + vq = xd->vu_vhost_dev.virtqueue[idx]; + /* reset callfd to force no interrupts */ + vq->callfd = -1; + + return 0; +} + +u8 +dpdk_vhost_user_want_interrupt(dpdk_device_t *xd, int idx) +{ + dpdk_vu_intf_t *vui = xd->vu_intf; + ASSERT(vui != NULL); + + if (PREDICT_FALSE(vui->num_vrings <= 0)) + return 0; + + dpdk_vu_vring *vring = &(vui->vrings[idx]); + struct vhost_virtqueue *vq = xd->vu_vhost_dev.virtqueue[idx]; + + /* return if vm is interested in interrupts */ + return (vring->callfd > 0) && !(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT); +} + +void +dpdk_vhost_user_send_interrupt(vlib_main_t * vm, dpdk_device_t * xd, int idx) +{ + dpdk_main_t * dm = &dpdk_main; + dpdk_vu_intf_t *vui = xd->vu_intf; + ASSERT(vui != NULL); + + if (PREDICT_FALSE(vui->num_vrings <= 0)) + return; + + dpdk_vu_vring *vring = &(vui->vrings[idx]); + struct vhost_virtqueue *vq = xd->vu_vhost_dev.virtqueue[idx]; + + /* if vm is interested in interrupts */ + if((vring->callfd > 0) && !(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { + u64 x = 1; + int rv __attribute__((unused)); + /* $$$$ pay attention to rv */ + rv = write(vring->callfd, &x, sizeof(x)); + vring->n_since_last_int = 0; + vring->int_deadline = vlib_time_now(vm) + dm->vhost_coalesce_time; + } 
+} + +/* + * vhost-user interface management functions + */ + +// initialize vui with specified attributes +static void +dpdk_vhost_user_vui_init(vnet_main_t * vnm, + dpdk_device_t *xd, int sockfd, + const char * sock_filename, + u8 is_server, u64 feature_mask, + u32 * sw_if_index) +{ + dpdk_vu_intf_t *vui = xd->vu_intf; + memset(vui, 0, sizeof(*vui)); + + vui->unix_fd = sockfd; + vui->num_vrings = 2; + vui->sock_is_server = is_server; + strncpy(vui->sock_filename, sock_filename, ARRAY_LEN(vui->sock_filename)-1); + vui->sock_errno = 0; + vui->is_up = 0; + vui->feature_mask = feature_mask; + vui->active = 1; + vui->unix_file_index = ~0; + + vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0); + + if (sw_if_index) + *sw_if_index = xd->vlib_sw_if_index; +} + +// register vui and start polling on it +static void +dpdk_vhost_user_vui_register(vlib_main_t * vm, dpdk_device_t *xd) +{ + dpdk_main_t * dm = &dpdk_main; + dpdk_vu_intf_t *vui = xd->vu_intf; + + hash_set (dm->vu_sw_if_index_by_listener_fd, vui->unix_fd, + xd->vlib_sw_if_index); +} + +static inline void +dpdk_vhost_user_if_disconnect(dpdk_device_t * xd) +{ + dpdk_vu_intf_t *vui = xd->vu_intf; + vnet_main_t * vnm = vnet_get_main(); + dpdk_main_t * dm = &dpdk_main; + + xd->admin_up = 0; + vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0); + + if (vui->unix_file_index != ~0) { + unix_file_del (&unix_main, unix_main.file_pool + vui->unix_file_index); + vui->unix_file_index = ~0; + } + + hash_unset(dm->vu_sw_if_index_by_sock_fd, vui->unix_fd); + hash_unset(dm->vu_sw_if_index_by_listener_fd, vui->unix_fd); + close(vui->unix_fd); + vui->unix_fd = -1; + vui->is_up = 0; + + DBG_SOCK("interface ifindex %d disconnected", xd->vlib_sw_if_index); +} + +static clib_error_t * dpdk_vhost_user_callfd_read_ready (unix_file_t * uf) +{ + __attribute__((unused)) int n; + u8 buff[8]; + n = read(uf->file_descriptor, ((char*)&buff), 8); + return 0; +} + +static clib_error_t * dpdk_vhost_user_socket_read (unix_file_t * 
uf) +{ + int n; + int fd, number_of_fds = 0; + int fds[VHOST_MEMORY_MAX_NREGIONS]; + vhost_user_msg_t msg; + struct msghdr mh; + struct iovec iov[1]; + dpdk_main_t * dm = &dpdk_main; + dpdk_device_t *xd; + dpdk_vu_intf_t *vui; + struct cmsghdr *cmsg; + uword * p; + u8 q; + unix_file_t template = {0}; + vnet_main_t * vnm = vnet_get_main(); + + p = hash_get (dm->vu_sw_if_index_by_sock_fd, uf->file_descriptor); + if (p == 0) { + DBG_SOCK ("FD %d doesn't belong to any interface", + uf->file_descriptor); + return 0; + } + else + xd = dpdk_vhost_user_device_from_sw_if_index(p[0]); + + ASSERT(xd != NULL); + vui = xd->vu_intf; + + char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))]; + + memset(&mh, 0, sizeof(mh)); + memset(control, 0, sizeof(control)); + + /* set the payload */ + iov[0].iov_base = (void *) &msg; + iov[0].iov_len = VHOST_USER_MSG_HDR_SZ; + + mh.msg_iov = iov; + mh.msg_iovlen = 1; + mh.msg_control = control; + mh.msg_controllen = sizeof(control); + + n = recvmsg(uf->file_descriptor, &mh, 0); + + if (n != VHOST_USER_MSG_HDR_SZ) + goto close_socket; + + if (mh.msg_flags & MSG_CTRUNC) { + goto close_socket; + } + + cmsg = CMSG_FIRSTHDR(&mh); + + if (cmsg && (cmsg->cmsg_len > 0) && (cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS) && + (cmsg->cmsg_len - CMSG_LEN(0) <= VHOST_MEMORY_MAX_NREGIONS * sizeof(int))) { + number_of_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + memcpy(fds, CMSG_DATA(cmsg), number_of_fds * sizeof(int)); + } + + /* version 1, no reply bit set*/ + if ((msg.flags & 7) != 1) { + DBG_SOCK("malformed message received. 
closing socket"); + goto close_socket; + } + + { + int rv __attribute__((unused)); + /* $$$$ pay attention to rv */ + rv = read(uf->file_descriptor, ((char*)&msg) + n, msg.size); + } + + switch (msg.request) { + case VHOST_USER_GET_FEATURES: + DBG_SOCK("if %d msg VHOST_USER_GET_FEATURES", + xd->vlib_hw_if_index); + + msg.flags |= 4; + + dpdk_vhost_user_get_features(xd->vlib_hw_if_index, &msg.u64); + msg.u64 &= vui->feature_mask; + msg.size = sizeof(msg.u64); + break; + + case VHOST_USER_SET_FEATURES: + DBG_SOCK("if %d msg VHOST_USER_SET_FEATURES features 0x%016llx", + xd->vlib_hw_if_index, msg.u64); + + dpdk_vhost_user_set_features(xd->vlib_hw_if_index, msg.u64); + break; + + case VHOST_USER_SET_MEM_TABLE: + DBG_SOCK("if %d msg VHOST_USER_SET_MEM_TABLE nregions %d", + xd->vlib_hw_if_index, msg.memory.nregions); + + if ((msg.memory.nregions < 1) || + (msg.memory.nregions > VHOST_MEMORY_MAX_NREGIONS)) { + + DBG_SOCK("number of mem regions must be between 1 and %i", + VHOST_MEMORY_MAX_NREGIONS); + + goto close_socket; + } + + if (msg.memory.nregions != number_of_fds) { + DBG_SOCK("each memory region must have FD"); + goto close_socket; + } + + dpdk_vhost_user_set_mem_table(xd->vlib_hw_if_index, &msg.memory, fds); + break; + + case VHOST_USER_SET_VRING_NUM: + DBG_SOCK("if %d msg VHOST_USER_SET_VRING_NUM idx %d num %d", + xd->vlib_hw_if_index, msg.state.index, msg.state.num); + + if ((msg.state.num > 32768) || /* maximum ring size is 32768 */ + (msg.state.num == 0) || /* it cannot be zero */ + (msg.state.num % 2)) /* must be power of 2 */ + goto close_socket; + + dpdk_vhost_user_set_vring_num(xd->vlib_hw_if_index, msg.state.index, msg.state.num); + break; + + case VHOST_USER_SET_VRING_ADDR: + DBG_SOCK("if %d msg VHOST_USER_SET_VRING_ADDR idx %d", + xd->vlib_hw_if_index, msg.state.index); + + dpdk_vhost_user_set_vring_addr(xd->vlib_hw_if_index, msg.state.index, + msg.addr.desc_user_addr, + msg.addr.used_user_addr, + msg.addr.avail_user_addr); + break; + + case 
VHOST_USER_SET_OWNER: + DBG_SOCK("if %d msg VHOST_USER_SET_OWNER", + xd->vlib_hw_if_index); + break; + + case VHOST_USER_RESET_OWNER: + DBG_SOCK("if %d msg VHOST_USER_RESET_OWNER", + xd->vlib_hw_if_index); + break; + + case VHOST_USER_SET_VRING_CALL: + DBG_SOCK("if %d msg VHOST_USER_SET_VRING_CALL u64 %d", + xd->vlib_hw_if_index, msg.u64); + + q = (u8) (msg.u64 & 0xFF); + + if (!(msg.u64 & 0x100)) + { + if (number_of_fds != 1) + goto close_socket; + + /* if there is old fd, delete it */ + if (vui->vrings[q].callfd) { + unix_file_t * uf = pool_elt_at_index (unix_main.file_pool, + vui->vrings[q].callfd_idx); + unix_file_del (&unix_main, uf); + } + vui->vrings[q].callfd = fds[0]; + template.read_function = dpdk_vhost_user_callfd_read_ready; + template.file_descriptor = fds[0]; + vui->vrings[q].callfd_idx = unix_file_add (&unix_main, &template); + } + else + vui->vrings[q].callfd = -1; + + dpdk_vhost_user_set_vring_call(xd->vlib_hw_if_index, q, vui->vrings[q].callfd); + break; + + case VHOST_USER_SET_VRING_KICK: + DBG_SOCK("if %d msg VHOST_USER_SET_VRING_KICK u64 %d", + xd->vlib_hw_if_index, msg.u64); + + q = (u8) (msg.u64 & 0xFF); + + if (!(msg.u64 & 0x100)) + { + if (number_of_fds != 1) + goto close_socket; + + vui->vrings[q].kickfd = fds[0]; + } + else + vui->vrings[q].kickfd = -1; + + dpdk_vhost_user_set_vring_kick(xd->vlib_hw_if_index, q, vui->vrings[q].kickfd); + break; + + case VHOST_USER_SET_VRING_ERR: + DBG_SOCK("if %d msg VHOST_USER_SET_VRING_ERR u64 %d", + xd->vlib_hw_if_index, msg.u64); + + q = (u8) (msg.u64 & 0xFF); + + if (!(msg.u64 & 0x100)) + { + if (number_of_fds != 1) + goto close_socket; + + fd = fds[0]; + } + else + fd = -1; + + vui->vrings[q].errfd = fd; + break; + + case VHOST_USER_SET_VRING_BASE: + DBG_SOCK("if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d", + xd->vlib_hw_if_index, msg.state.index, msg.state.num); + + dpdk_vhost_user_set_vring_base(xd->vlib_hw_if_index, msg.state.index, msg.state.num); + break; + + case 
VHOST_USER_GET_VRING_BASE: + DBG_SOCK("if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d", + xd->vlib_hw_if_index, msg.state.index, msg.state.num); + + msg.flags |= 4; + msg.size = sizeof(msg.state); + + dpdk_vhost_user_get_vring_base(xd->vlib_hw_if_index, msg.state.index, &msg.state.num); + break; + + case VHOST_USER_NONE: + DBG_SOCK("if %d msg VHOST_USER_NONE", + xd->vlib_hw_if_index); + break; + + case VHOST_USER_SET_LOG_BASE: + DBG_SOCK("if %d msg VHOST_USER_SET_LOG_BASE", + xd->vlib_hw_if_index); + break; + + case VHOST_USER_SET_LOG_FD: + DBG_SOCK("if %d msg VHOST_USER_SET_LOG_FD", + xd->vlib_hw_if_index); + break; + + default: + DBG_SOCK("unknown vhost-user message %d received. closing socket", + msg.request); + goto close_socket; + } + + /* if we have pointers to descriptor table, go up*/ + if (!vui->is_up && + xd->vu_vhost_dev.virtqueue[VHOST_NET_VRING_IDX_TX]->desc && + xd->vu_vhost_dev.virtqueue[VHOST_NET_VRING_IDX_RX]->desc) { + + DBG_SOCK("interface %d connected", xd->vlib_sw_if_index); + + vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, VNET_HW_INTERFACE_FLAG_LINK_UP); + vui->is_up = 1; + } + + /* if we need to reply */ + if (msg.flags & 4) + { + n = send(uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0); + if (n != (msg.size + VHOST_USER_MSG_HDR_SZ)) + goto close_socket; + } + + return 0; + +close_socket: + DBG_SOCK("error: close_socket"); + dpdk_vhost_user_if_disconnect(xd); + return 0; +} + +static clib_error_t * dpdk_vhost_user_socket_error (unix_file_t * uf) +{ + dpdk_main_t * dm = &dpdk_main; + dpdk_device_t *xd; + uword * p; + + p = hash_get (dm->vu_sw_if_index_by_sock_fd, uf->file_descriptor); + if (p == 0) { + DBG_SOCK ("FD %d doesn't belong to any interface", + uf->file_descriptor); + return 0; + } + else + xd = dpdk_vhost_user_device_from_sw_if_index(p[0]); + + dpdk_vhost_user_if_disconnect(xd); + return 0; +} + +static clib_error_t * dpdk_vhost_user_socksvr_accept_ready (unix_file_t * uf) +{ + int client_fd, 
client_len; + struct sockaddr_un client; + unix_file_t template = {0}; + dpdk_main_t * dm = &dpdk_main; + dpdk_device_t * xd = NULL; + dpdk_vu_intf_t * vui; + uword * p; + + p = hash_get (dm->vu_sw_if_index_by_listener_fd, + uf->file_descriptor); + if (p == 0) { + DBG_SOCK ("fd %d doesn't belong to any interface", + uf->file_descriptor); + return 0; + } + + xd = dpdk_vhost_user_device_from_sw_if_index(p[0]); + ASSERT(xd != NULL); + vui = xd->vu_intf; + + client_len = sizeof(client); + client_fd = accept (uf->file_descriptor, + (struct sockaddr *)&client, + (socklen_t *)&client_len); + + if (client_fd < 0) + return clib_error_return_unix (0, "accept"); + + template.read_function = dpdk_vhost_user_socket_read; + template.error_function = dpdk_vhost_user_socket_error; + template.file_descriptor = client_fd; + vui->unix_file_index = unix_file_add (&unix_main, &template); + + vui->client_fd = client_fd; + hash_set (dm->vu_sw_if_index_by_sock_fd, vui->client_fd, + xd->vlib_sw_if_index); + + return 0; +} + +// init server socket on specified sock_filename +static int dpdk_vhost_user_init_server_sock(const char * sock_filename, int *sockfd) +{ + int rv = 0, len; + struct sockaddr_un un; + int fd; + /* create listening socket */ + fd = socket(AF_UNIX, SOCK_STREAM, 0); + + if (fd < 0) { + return VNET_API_ERROR_SYSCALL_ERROR_1; + } + + un.sun_family = AF_UNIX; + strcpy((char *) un.sun_path, (char *) sock_filename); + + /* remove if exists */ + unlink( (char *) sock_filename); + + len = strlen((char *) un.sun_path) + strlen((char *) sock_filename); + + if (bind(fd, (struct sockaddr *) &un, len) == -1) { + rv = VNET_API_ERROR_SYSCALL_ERROR_2; + goto error; + } + + if (listen(fd, 1) == -1) { + rv = VNET_API_ERROR_SYSCALL_ERROR_3; + goto error; + } + + unix_file_t template = {0}; + template.read_function = dpdk_vhost_user_socksvr_accept_ready; + template.file_descriptor = fd; + unix_file_add (&unix_main, &template); + *sockfd = fd; + return rv; + +error: + close(fd); + return rv; 
+} + +/* + * vhost-user interface control functions used from vpe api + */ + +int dpdk_vhost_user_create_if(vnet_main_t * vnm, vlib_main_t * vm, + const char * sock_filename, + u8 is_server, + u32 * sw_if_index, + u64 feature_mask, + u8 renumber, u32 custom_dev_instance) +{ + dpdk_main_t * dm = &dpdk_main; + dpdk_device_t *xd; + u32 hw_if_idx = ~0; + int sockfd = -1; + int rv = 0; + + // using virtio vhost user? + if (dm->use_virtio_vhost) { + return vhost_user_create_if(vnm, vm, sock_filename, is_server, + sw_if_index, feature_mask, renumber, custom_dev_instance); + } + + if (is_server) { + if ((rv = dpdk_vhost_user_init_server_sock (sock_filename, &sockfd)) != 0) { + return rv; + } + } + + if (renumber) { + // set next vhost-user if id if custom one is higher or equal + if (custom_dev_instance >= dm->next_vu_if_id) + dm->next_vu_if_id = custom_dev_instance + 1; + + dpdk_create_vhost_user_if_internal(&hw_if_idx, custom_dev_instance); + } else + dpdk_create_vhost_user_if_internal(&hw_if_idx, (u32)~0); + DBG_SOCK("dpdk vhost-user interface created hw_if_index %d", hw_if_idx); + + xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_idx); + ASSERT(xd != NULL); + + dpdk_vhost_user_vui_init (vnm, xd, sockfd, sock_filename, is_server, + feature_mask, sw_if_index); + + dpdk_vhost_user_vui_register (vm, xd); + return rv; +} + +int dpdk_vhost_user_modify_if(vnet_main_t * vnm, vlib_main_t * vm, + const char * sock_filename, + u8 is_server, + u32 sw_if_index, + u64 feature_mask, + u8 renumber, u32 custom_dev_instance) +{ + dpdk_main_t * dm = &dpdk_main; + dpdk_device_t * xd; + dpdk_vu_intf_t * vui = NULL; + u32 sw_if_idx = ~0; + int sockfd = -1; + int rv = 0; + + // using virtio vhost user? 
+ if (dm->use_virtio_vhost) { + return vhost_user_modify_if(vnm, vm, sock_filename, is_server, + sw_if_index, feature_mask, renumber, custom_dev_instance); + } + + xd = dpdk_vhost_user_device_from_sw_if_index(sw_if_index); + + if (xd == NULL) + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + + vui = xd->vu_intf; + + // interface is inactive + vui->active = 0; + // disconnect interface sockets + dpdk_vhost_user_if_disconnect(xd); + + if (is_server) { + if ((rv = dpdk_vhost_user_init_server_sock (sock_filename, &sockfd)) != 0) { + return rv; + } + } + + dpdk_vhost_user_vui_init (vnm, xd, sockfd, sock_filename, is_server, + feature_mask, &sw_if_idx); + + if (renumber) { + vnet_interface_name_renumber (sw_if_idx, custom_dev_instance); + } + + dpdk_vhost_user_vui_register (vm, xd); + + return rv; +} + +int dpdk_vhost_user_delete_if(vnet_main_t * vnm, vlib_main_t * vm, + u32 sw_if_index) +{ + dpdk_main_t * dm = &dpdk_main; + dpdk_device_t * xd = NULL; + dpdk_vu_intf_t * vui; + int rv = 0; + + // using virtio vhost user? 
+ if (dm->use_virtio_vhost) { + return vhost_user_delete_if(vnm, vm, sw_if_index); + } + + xd = dpdk_vhost_user_device_from_sw_if_index(sw_if_index); + + if (xd == NULL) + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + + vui = xd->vu_intf; + + // interface is inactive + vui->active = 0; + // disconnect interface sockets + dpdk_vhost_user_if_disconnect(xd); + // add to inactive interface list + vec_add1 (dm->vu_inactive_interfaces_device_index, xd->device_index); + + ethernet_delete_interface (vnm, xd->vlib_hw_if_index); + DBG_SOCK ("deleted (deactivated) vhost-user interface sw_if_index %d", sw_if_index); + + return rv; +} + +int dpdk_vhost_user_dump_ifs(vnet_main_t * vnm, vlib_main_t * vm, vhost_user_intf_details_t **out_vuids) +{ + int rv = 0; + dpdk_main_t * dm = &dpdk_main; + dpdk_device_t * xd; + dpdk_vu_intf_t * vui; + struct virtio_net * vhost_dev; + vhost_user_intf_details_t * r_vuids = NULL; + vhost_user_intf_details_t * vuid = NULL; + u32 * hw_if_indices = 0; + vnet_hw_interface_t * hi; + u8 *s = NULL; + int i; + + if (!out_vuids) + return -1; + + // using virtio vhost user? + if (dm->use_virtio_vhost) { + return vhost_user_dump_ifs(vnm, vm, out_vuids); + } + + vec_foreach (xd, dm->devices) { + if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER && + xd->vu_intf->active) + vec_add1(hw_if_indices, xd->vlib_hw_if_index); + } + + for (i = 0; i < vec_len (hw_if_indices); i++) { + hi = vnet_get_hw_interface (vnm, hw_if_indices[i]); + xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_indices[i]); + if (!xd) { + clib_warning("invalid vhost-user interface hw_if_index %d", hw_if_indices[i]); + continue; + } + + vui = xd->vu_intf; + ASSERT(vui != NULL); + vhost_dev = &xd->vu_vhost_dev; + u32 virtio_net_hdr_sz = (vui->num_vrings > 0 ? 
+ vhost_dev->virtqueue[0]->vhost_hlen : 0); + + vec_add2(r_vuids, vuid, 1); + vuid->sw_if_index = xd->vlib_sw_if_index; + vuid->virtio_net_hdr_sz = virtio_net_hdr_sz; + vuid->features = vhost_dev->features; + vuid->is_server = vui->sock_is_server; + vuid->num_regions = (vhost_dev->mem != NULL ? vhost_dev->mem->nregions : 0); + vuid->sock_errno = vui->sock_errno; + strncpy((char *)vuid->sock_filename, (char *)vui->sock_filename, + ARRAY_LEN(vuid->sock_filename)-1); + + s = format (s, "%v%c", hi->name, 0); + + strncpy((char *)vuid->if_name, (char *)s, + ARRAY_LEN(vuid->if_name)-1); + _vec_len(s) = 0; + } + + vec_free (s); + vec_free (hw_if_indices); + + *out_vuids = r_vuids; + + return rv; +} + +/* + * Processing functions called from dpdk process fn + */ + +typedef struct { + struct sockaddr_un sun; + int sockfd; + unix_file_t template; + uword *event_data; +} dpdk_vu_process_state; + +void dpdk_vhost_user_process_init (void **ctx) +{ + dpdk_vu_process_state *state = clib_mem_alloc (sizeof(dpdk_vu_process_state)); + memset(state, 0, sizeof(*state)); + state->sockfd = socket(AF_UNIX, SOCK_STREAM, 0); + state->sun.sun_family = AF_UNIX; + state->template.read_function = dpdk_vhost_user_socket_read; + state->template.error_function = dpdk_vhost_user_socket_error; + state->event_data = 0; + *ctx = state; +} + +void dpdk_vhost_user_process_cleanup (void *ctx) +{ + clib_mem_free(ctx); +} + +uword dpdk_vhost_user_process_if (vlib_main_t *vm, dpdk_device_t *xd, void *ctx) +{ + dpdk_main_t * dm = &dpdk_main; + dpdk_vu_process_state *state = (dpdk_vu_process_state *)ctx; + dpdk_vu_intf_t *vui = xd->vu_intf; + + if (vui->sock_is_server || !vui->active) + return 0; + + if (vui->unix_fd == -1) { + /* try to connect */ + strncpy(state->sun.sun_path, (char *) vui->sock_filename, sizeof(state->sun.sun_path) - 1); + + if (connect(state->sockfd, (struct sockaddr *) &(state->sun), sizeof(struct sockaddr_un)) == 0) { + vui->sock_errno = 0; + vui->unix_fd = state->sockfd; + 
state->template.file_descriptor = state->sockfd; + vui->unix_file_index = unix_file_add (&unix_main, &(state->template)); + hash_set (dm->vu_sw_if_index_by_sock_fd, state->sockfd, xd->vlib_sw_if_index); + + state->sockfd = socket(AF_UNIX, SOCK_STREAM, 0); + if (state->sockfd < 0) + return -1; + } else { + vui->sock_errno = errno; + } + } else { + /* check if socket is alive */ + int error = 0; + socklen_t len = sizeof (error); + int retval = getsockopt(vui->unix_fd, SOL_SOCKET, SO_ERROR, &error, &len); + + if (retval) + dpdk_vhost_user_if_disconnect(xd); + } + return 0; +} + +/* + * CLI functions + */ + +static clib_error_t * +dpdk_vhost_user_connect_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + dpdk_main_t * dm = &dpdk_main; + unformat_input_t _line_input, * line_input = &_line_input; + u8 * sock_filename = NULL; + u32 sw_if_index; + u8 is_server = 0; + u64 feature_mask = (u64)~0; + u8 renumber = 0; + u32 custom_dev_instance = ~0; + + if (dm->use_virtio_vhost) { + return vhost_user_connect_command_fn(vm, input, cmd); + } + + /* Get a line of input. */ + if (! 
unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { + if (unformat (line_input, "socket %s", &sock_filename)) + ; + else if (unformat (line_input, "server")) + is_server = 1; + else if (unformat (line_input, "feature-mask 0x%llx", &feature_mask)) + ; + else if (unformat (line_input, "renumber %d", &custom_dev_instance)) { + renumber = 1; + } + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + unformat_free (line_input); + + vnet_main_t *vnm = vnet_get_main(); + if (sock_filename == NULL) + return clib_error_return (0, "missing socket file"); + + dpdk_vhost_user_create_if(vnm, vm, (char *)sock_filename, + is_server, &sw_if_index, feature_mask, + renumber, custom_dev_instance); + + vec_free(sock_filename); + return 0; +} + +VLIB_CLI_COMMAND (dpdk_vhost_user_connect_command, static) = { + .path = "create vhost-user", + .short_help = "create vhost-user socket <socket-filename> [server] [feature-mask <hex>] [renumber <dev_instance>]", + .function = dpdk_vhost_user_connect_command_fn, +}; + +static clib_error_t * +dpdk_vhost_user_delete_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + dpdk_main_t * dm = &dpdk_main; + clib_error_t * error = 0; + unformat_input_t _line_input, * line_input = &_line_input; + u32 sw_if_index = ~0; + + if (dm->use_virtio_vhost) { + return vhost_user_delete_command_fn(vm, input, cmd); + } + + /* Get a line of input. */ + if (! 
unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { + if (unformat (line_input, "sw_if_index %d", &sw_if_index)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + unformat_free (line_input); + + if (sw_if_index == ~0) { + error = clib_error_return (0, "invalid sw_if_index", + format_unformat_error, input); + return error; + } + + vnet_main_t *vnm = vnet_get_main(); + + dpdk_vhost_user_delete_if(vnm, vm, sw_if_index); + + return 0; +} + +VLIB_CLI_COMMAND (dpdk_vhost_user_delete_command, static) = { + .path = "delete vhost-user", + .short_help = "delete vhost-user sw_if_index <nn>", + .function = dpdk_vhost_user_delete_command_fn, +}; + +#define foreach_dpdk_vhost_feature \ + _ (VIRTIO_NET_F_MRG_RXBUF) \ + _ (VIRTIO_NET_F_CTRL_VQ) \ + _ (VIRTIO_NET_F_CTRL_RX) + +static clib_error_t * +show_dpdk_vhost_user_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + clib_error_t * error = 0; + dpdk_main_t * dm = &dpdk_main; + vnet_main_t * vnm = vnet_get_main(); + dpdk_device_t * xd; + dpdk_vu_intf_t * vui; + struct virtio_net * vhost_dev; + u32 hw_if_index, * hw_if_indices = 0; + vnet_hw_interface_t * hi; + int i, j, q; + int show_descr = 0; + struct virtio_memory * mem; + struct feat_struct { u8 bit; char *str;}; + struct feat_struct *feat_entry; + + static struct feat_struct feat_array[] = { +#define _(f) { .str = #f, .bit = f, }, + foreach_dpdk_vhost_feature +#undef _ + { .str = NULL } + }; + + if (dm->use_virtio_vhost) { + return show_vhost_user_command_fn(vm, input, cmd); + } + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { + if (unformat (input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index)) { + vec_add1 (hw_if_indices, hw_if_index); + vlib_cli_output(vm, "add %d", hw_if_index); + } + else if (unformat (input, "descriptors") || unformat (input, "desc") ) + show_descr 
= 1; + else { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + goto done; + } + } + if (vec_len (hw_if_indices) == 0) { + vec_foreach (xd, dm->devices) { + if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER && xd->vu_intf->active) + vec_add1(hw_if_indices, xd->vlib_hw_if_index); + } + } + + vlib_cli_output (vm, "DPDK vhost-user interfaces"); + vlib_cli_output (vm, "Global:\n coalesce frames %d time %e\n\n", + dm->vhost_coalesce_frames, dm->vhost_coalesce_time); + + for (i = 0; i < vec_len (hw_if_indices); i++) { + hi = vnet_get_hw_interface (vnm, hw_if_indices[i]); + + if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_indices[i]))) { + error = clib_error_return (0, "not dpdk vhost-user interface: '%s'", + hi->name); + goto done; + } + vui = xd->vu_intf; + vhost_dev = &xd->vu_vhost_dev; + mem = vhost_dev->mem; + u32 virtio_net_hdr_sz = (vui->num_vrings > 0 ? + vhost_dev->virtqueue[0]->vhost_hlen : 0); + + vlib_cli_output (vm, "Interface: %s (ifindex %d)", + hi->name, hw_if_indices[i]); + + vlib_cli_output (vm, "virtio_net_hdr_sz %d\n features (0x%llx): \n", + virtio_net_hdr_sz, xd->vu_vhost_dev.features); + + feat_entry = (struct feat_struct *) &feat_array; + while(feat_entry->str) { + if (xd->vu_vhost_dev.features & (1 << feat_entry->bit)) + vlib_cli_output (vm, " %s (%d)", feat_entry->str, feat_entry->bit); + feat_entry++; + } + + vlib_cli_output (vm, "\n"); + + vlib_cli_output (vm, " socket filename %s type %s errno \"%s\"\n\n", + vui->sock_filename, vui->sock_is_server ? 
"server" : "client", + strerror(vui->sock_errno)); + + vlib_cli_output (vm, " Memory regions (total %d)\n", mem->nregions); + + if (mem->nregions){ + vlib_cli_output(vm, " region fd guest_phys_addr memory_size userspace_addr mmap_offset mmap_addr\n"); + vlib_cli_output(vm, " ====== ===== ================== ================== ================== ================== ==================\n"); + } + for (j = 0; j < mem->nregions; j++) { + vlib_cli_output(vm, " %d %-5d 0x%016lx 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", j, + vui->region_fd[j], + mem->regions[j].guest_phys_address, + mem->regions[j].memory_size, + mem->regions[j].userspace_address, + mem->regions[j].address_offset, + vui->region_addr[j]); + } + for (q = 0; q < vui->num_vrings; q++) { + struct vhost_virtqueue *vq = vhost_dev->virtqueue[q]; + + vlib_cli_output(vm, "\n Virtqueue %d\n", q); + + vlib_cli_output(vm, " qsz %d last_used_idx %d last_used_idx_res %d\n", + vq->size, vq->last_used_idx, vq->last_used_idx_res); + + if (vq->avail && vq->used) + vlib_cli_output(vm, " avail.flags %x avail.idx %d used.flags %x used.idx %d\n", + vq->avail->flags, vq->avail->idx, vq->used->flags, vq->used->idx); + + vlib_cli_output(vm, " kickfd %d callfd %d errfd %d\n", + vui->vrings[q].kickfd, + vui->vrings[q].callfd, + vui->vrings[q].errfd); + + if (show_descr) { + vlib_cli_output(vm, "\n descriptor table:\n"); + vlib_cli_output(vm, " id addr len flags next user_addr\n"); + vlib_cli_output(vm, " ===== ================== ===== ====== ===== ==================\n"); + for(j = 0; j < vq->size; j++) { + vlib_cli_output(vm, " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n", + j, + vq->desc[j].addr, + vq->desc[j].len, + vq->desc[j].flags, + vq->desc[j].next, + (u64) map_guest_mem(xd, vq->desc[j].addr));} + } + } + vlib_cli_output (vm, "\n"); + } +done: + vec_free (hw_if_indices); + return error; +} + +VLIB_CLI_COMMAND (show_vhost_user_command, static) = { + .path = "show vhost-user", + .short_help = "show vhost-user interface", + .function = 
show_dpdk_vhost_user_command_fn, +}; + |