Diffstat (limited to 'vnet/vnet/devices')
-rw-r--r--  vnet/vnet/devices/dpdk/cli.c            974
-rw-r--r--  vnet/vnet/devices/dpdk/device.c        1483
-rw-r--r--  vnet/vnet/devices/dpdk/dpdk.h           515
-rw-r--r--  vnet/vnet/devices/dpdk/dpdk_priv.h      437
-rw-r--r--  vnet/vnet/devices/dpdk/init.c          1728
-rw-r--r--  vnet/vnet/devices/dpdk/node.c          2010
-rw-r--r--  vnet/vnet/devices/dpdk/threads.c        378
-rw-r--r--  vnet/vnet/devices/dpdk/threads.h         30
-rw-r--r--  vnet/vnet/devices/dpdk/vhost_user.c    1550
-rw-r--r--  vnet/vnet/devices/ssvm/node.c           323
-rw-r--r--  vnet/vnet/devices/ssvm/ssvm_eth.c       475
-rw-r--r--  vnet/vnet/devices/ssvm/ssvm_eth.h       128
-rw-r--r--  vnet/vnet/devices/virtio/vhost-user.c  1957
-rw-r--r--  vnet/vnet/devices/virtio/vhost-user.h   222
14 files changed, 12210 insertions, 0 deletions
diff --git a/vnet/vnet/devices/dpdk/cli.c b/vnet/vnet/devices/dpdk/cli.c
new file mode 100644
index 00000000000..c27dbfabfc0
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/cli.c
@@ -0,0 +1,974 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/xxhash.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/mpls-gre/packet.h>
+
+#include "dpdk_priv.h"
+
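+/* Per-frame-queue trace records; allocated on demand by the "trace frame-queue" command below. */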
+frame_queue_trace_t *frame_queue_traces;
+
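+/* CLI handler for "pcap tx trace": turns TX packet capture on/off and sets the
+ * capture limit, interface filter and output file. */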
+static clib_error_t *
+pcap_trace_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ u8 * filename;
+ u32 max;
+ int matched = 0;
+ clib_error_t * error = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "on"))
+ {
+ if (dm->tx_pcap_enable == 0)
+ {
+ if (dm->pcap_filename == 0)
+ dm->pcap_filename = format (0, "/tmp/vpe.pcap%c", 0);
+
+ memset (&dm->pcap_main, 0, sizeof (dm->pcap_main));
+ dm->pcap_main.file_name = (char *) dm->pcap_filename;
+ dm->pcap_main.n_packets_to_capture = 100;
+ if (dm->pcap_pkts_to_capture)
+ dm->pcap_main.n_packets_to_capture = dm->pcap_pkts_to_capture;
+
+ dm->pcap_main.packet_type = PCAP_PACKET_TYPE_ethernet;
+ dm->tx_pcap_enable = 1;
+ matched = 1;
+ vlib_cli_output (vm, "pcap tx capture on...");
+ }
+ else
+ {
+ vlib_cli_output (vm, "pcap tx capture already on...");
+ }
+ matched = 1;
+ }
+ else if (unformat (input, "off"))
+ {
+ if (dm->tx_pcap_enable)
+ {
+ vlib_cli_output (vm, "captured %d pkts...",
+ dm->pcap_main.n_packets_captured+1);
+ if (dm->pcap_main.n_packets_captured)
+ {
+ dm->pcap_main.n_packets_to_capture =
+ dm->pcap_main.n_packets_captured;
+ error = pcap_write (&dm->pcap_main);
+ if (error)
+ clib_error_report (error);
+ else
+ vlib_cli_output (vm, "saved to %s...", dm->pcap_filename);
+ }
+ }
+ else
+ {
+ vlib_cli_output (vm, "pcap tx capture already off...");
+ }
+
+ dm->tx_pcap_enable = 0;
+ matched = 1;
+ }
+ else if (unformat (input, "max %d", &max))
+ {
+ dm->pcap_pkts_to_capture = max;
+ matched = 1;
+ }
+
+ else if (unformat (input, "intfc %U",
+ unformat_vnet_sw_interface, dm->vnet_main,
+ &dm->pcap_sw_if_index))
+ matched = 1;
+ else if (unformat (input, "intfc any"))
+ {
+ dm->pcap_sw_if_index = 0;
+ matched = 1;
+ }
+ else if (unformat (input, "file %s", &filename))
+ {
+ u8 * chroot_filename;
+ /* Brain-police user path input */
+ if (strstr((char *)filename, "..") || index((char *)filename, '/'))
+ {
+ vlib_cli_output (vm, "illegal characters in filename '%s'",
+ filename);
+ continue;
+ }
+
+ chroot_filename = format (0, "/tmp/%s%c", filename, 0);
+ vec_free (filename);
+
+ if (dm->pcap_filename)
+ vec_free (dm->pcap_filename);
+ dm->pcap_filename = chroot_filename;
+ matched = 1;
+ }
+ else if (unformat (input, "status"))
+ {
+ if (dm->tx_pcap_enable == 0)
+ {
+ vlib_cli_output (vm, "pcap tx capture is off...");
+ continue;
+ }
+
+ vlib_cli_output (vm, "pcap tx capture: %d of %d pkts...",
+ dm->pcap_main.n_packets_captured,
+ dm->pcap_main.n_packets_to_capture);
+ matched = 1;
+ }
+
+ else
+ break;
+ }
+
+ if (matched == 0)
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (pcap_trace_command, static) = {
+ .path = "pcap tx trace",
+ .short_help =
+ "pcap tx trace [on|off] [max <nn>] [intfc <interface>|any] [file <name>] [status]",
+ .function = pcap_trace_command_fn,
+};
+
+
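+/* Report usage counters for each DPDK pktmbuf mempool. */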
+static clib_error_t *
+show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ struct rte_mempool * rmp;
+ int i;
+
+ for(i = 0; i < vec_len(vm->buffer_main->pktmbuf_pools); i++)
+ {
+ rmp = vm->buffer_main->pktmbuf_pools[i];
+ if (rmp)
+ {
+ unsigned count = rte_mempool_count(rmp);
+ unsigned free_count = rte_mempool_free_count(rmp);
+
+ vlib_cli_output(vm, "name=\"%s\" available = %7d allocated = %7d total = %7d\n",
+ rmp->name, (u32)count, (u32)free_count,
+ (u32)(count+free_count));
+ }
+ else
+ {
+ vlib_cli_output(vm, "rte_mempool is NULL (!)\n");
+ }
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (cmd_show_dpdk_buffer,static) = {
+ .path = "show dpdk buffer",
+ .short_help = "show dpdk buffer state",
+ .function = show_dpdk_buffer,
+};
+
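+/* Debug CLI: allocate and/or free vlib buffers to exercise the DPDK buffer pools. */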
+static clib_error_t *
+test_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ static u32 * allocated_buffers;
+ u32 n_alloc = 0;
+ u32 n_free = 0;
+ u32 first, actual_alloc;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "allocate %d", &n_alloc))
+ ;
+ else if (unformat (input, "free %d", &n_free))
+ ;
+ else
+ break;
+ }
+
+ if (n_free)
+ {
+ if (vec_len (allocated_buffers) < n_free)
+ return clib_error_return (0, "Can't free %d, only %d allocated",
+ n_free, vec_len (allocated_buffers));
+
+ first = vec_len(allocated_buffers) - n_free;
+ vlib_buffer_free (vm, allocated_buffers + first, n_free);
+ _vec_len (allocated_buffers) = first;
+ }
+ if (n_alloc)
+ {
+ first = vec_len (allocated_buffers);
+ vec_validate (allocated_buffers,
+ vec_len (allocated_buffers) + n_alloc - 1);
+
+ actual_alloc = vlib_buffer_alloc (vm, allocated_buffers + first,
+ n_alloc);
+ _vec_len (allocated_buffers) = first + actual_alloc;
+
+ if (actual_alloc < n_alloc)
+ vlib_cli_output (vm, "WARNING: only allocated %d buffers",
+ actual_alloc);
+ }
+
+ vlib_cli_output (vm, "Currently %d buffers allocated",
+ vec_len (allocated_buffers));
+
+ if (allocated_buffers && vec_len(allocated_buffers) == 0)
+ vec_free(allocated_buffers);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (cmd_test_dpdk_buffer,static) = {
+ .path = "test dpdk buffer",
+ .short_help = "test dpdk buffer [allocate <nn>][free <nn>]",
+ .function = test_dpdk_buffer,
+};
+
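+/* Dump the early-fast-discard (EFD) agent counters for a single device. */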
+static void
+show_dpdk_device_stats (vlib_main_t * vm, dpdk_device_t * xd)
+{
+ vlib_cli_output(vm,
+ "device_index %d\n"
+ " last_burst_sz %d\n"
+ " max_burst_sz %d\n"
+ " full_frames_cnt %u\n"
+ " consec_full_frames_cnt %u\n"
+ " congestion_cnt %d\n"
+ " last_poll_time %llu\n"
+ " max_poll_delay %llu\n"
+ " discard_cnt %u\n"
+ " total_packet_cnt %u\n",
+ xd->device_index,
+ xd->efd_agent.last_burst_sz,
+ xd->efd_agent.max_burst_sz,
+ xd->efd_agent.full_frames_cnt,
+ xd->efd_agent.consec_full_frames_cnt,
+ xd->efd_agent.congestion_cnt,
+ xd->efd_agent.last_poll_time,
+ xd->efd_agent.max_poll_delay,
+ xd->efd_agent.discard_cnt,
+ xd->efd_agent.total_packet_cnt);
+
+ u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
+ 0 /* queue_id */);
+ vlib_cli_output(vm,
+ " device_queue_sz %u\n",
+ device_queue_sz);
+}
+
+
+/*
+ * Trigger threads to grab frame queue trace data
+ */
+static clib_error_t *
+trace_frame_queue (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ clib_error_t * error = NULL;
+ frame_queue_trace_t *fqt;
+ u32 num_fq;
+ u32 fqix;
+ u32 enable = 0;
+
+ if (unformat(input, "on")) {
+ enable = 1;
+ } else if (unformat(input, "off")) {
+ enable = 0;
+ } else {
+ return clib_error_return(0, "expecting on or off");
+ }
+
+ num_fq = vec_len(vlib_frame_queues);
+ if (num_fq == 0)
+ {
+ vlib_cli_output(vm, "No frame queues exist\n");
+ return error;
+ }
+
+ // Allocate storage for trace if necessary
+ vec_validate_aligned(frame_queue_traces, num_fq-1, CLIB_CACHE_LINE_BYTES);
+
+ for (fqix=0; fqix<num_fq; fqix++) {
+ fqt = &frame_queue_traces[fqix];
+
+ memset(fqt->n_vectors, 0xff, sizeof(fqt->n_vectors));
+ fqt->written = 0;
+ vlib_frame_queues[fqix]->trace = enable;
+ }
+ return error;
+}
+
+VLIB_CLI_COMMAND (cmd_trace_frame_queue,static) = {
+ .path = "trace frame-queue",
+ .short_help = "trace frame-queue (on|off)",
+ .function = trace_frame_queue,
+};
+
+
+/*
+ * Display frame queue trace data gathered by threads.
+ */
+static clib_error_t *
+show_frame_queue (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ clib_error_t * error = NULL;
+ frame_queue_trace_t *fqt;
+ u32 num_fq;
+ u32 fqix;
+
+ num_fq = vec_len(frame_queue_traces);
+ if (num_fq == 0)
+ {
+ vlib_cli_output(vm, "No trace data for frame queues\n");
+ return error;
+ }
+
+ for (fqix=0; fqix<num_fq; fqix++) {
+ fqt = &frame_queue_traces[fqix];
+
+ vlib_cli_output(vm, "Thread %d %v\n", fqix, vlib_worker_threads[fqix].name);
+
+ if (fqt->written == 0)
+ {
+ vlib_cli_output(vm, " no trace data\n");
+ continue;
+ }
+
+ vlib_cli_output(vm, " vector-threshold %d ring size %d in use %d\n",
+ fqt->threshold, fqt->nelts, fqt->n_in_use);
+ vlib_cli_output(vm, " head %12d head_hint %12d tail %12d\n",
+ fqt->head, fqt->head_hint, fqt->tail);
+ vlib_cli_output(vm, " %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d\n",
+ fqt->n_vectors[0], fqt->n_vectors[1], fqt->n_vectors[2], fqt->n_vectors[3],
+ fqt->n_vectors[4], fqt->n_vectors[5], fqt->n_vectors[6], fqt->n_vectors[7],
+ fqt->n_vectors[8], fqt->n_vectors[9], fqt->n_vectors[10], fqt->n_vectors[11],
+ fqt->n_vectors[12], fqt->n_vectors[13], fqt->n_vectors[14], fqt->n_vectors[15]);
+
+ if (fqt->nelts > 16)
+ {
+ vlib_cli_output(vm, " %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d\n",
+ fqt->n_vectors[16], fqt->n_vectors[17], fqt->n_vectors[18], fqt->n_vectors[19],
+ fqt->n_vectors[20], fqt->n_vectors[21], fqt->n_vectors[22], fqt->n_vectors[23],
+ fqt->n_vectors[24], fqt->n_vectors[25], fqt->n_vectors[26], fqt->n_vectors[27],
+ fqt->n_vectors[28], fqt->n_vectors[29], fqt->n_vectors[30], fqt->n_vectors[31]);
+ }
+ }
+ return error;
+}
+
+VLIB_CLI_COMMAND (cmd_show_frame_queue,static) = {
+ .path = "show frame-queue",
+ .short_help = "show frame-queue trace",
+ .function = show_frame_queue,
+};
+
+
+/*
+ * Modify the number of elements on the frame_queues
+ */
+static clib_error_t *
+test_frame_queue_nelts (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ clib_error_t * error = NULL;
+ u32 num_fq;
+ u32 fqix;
+ u32 nelts = 0;
+
+ unformat(input, "%d", &nelts);
+ if ((nelts != 4) && (nelts != 8) && (nelts != 16) && (nelts != 32)) {
+ return clib_error_return(0, "expecting 4,8,16,32");
+ }
+
+ num_fq = vec_len(vlib_frame_queues);
+ if (num_fq == 0)
+ {
+ vlib_cli_output(vm, "No frame queues exist\n");
+ return error;
+ }
+
+ for (fqix=0; fqix<num_fq; fqix++) {
+ vlib_frame_queues[fqix]->nelts = nelts;
+ }
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (cmd_test_frame_queue_nelts,static) = {
+ .path = "test frame-queue nelts",
+ .short_help = "test frame-queue nelts (4,8,16,32)",
+ .function = test_frame_queue_nelts,
+};
+
+
+/*
+ * Modify the max number of packets pulled off the frame queues
+ */
+static clib_error_t *
+test_frame_queue_threshold (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ clib_error_t * error = NULL;
+ u32 num_fq;
+ u32 fqix;
+ u32 threshold = 0;
+
+ if (unformat(input, "%d", &threshold)) {
+ } else {
+ vlib_cli_output(vm, "expecting threshold value\n");
+ return error;
+ }
+
+ if (threshold == 0)
+ threshold = ~0;
+
+ num_fq = vec_len(vlib_frame_queues);
+ if (num_fq == 0)
+ {
+ vlib_cli_output(vm, "No frame queues exist\n");
+ return error;
+ }
+
+ for (fqix=0; fqix<num_fq; fqix++) {
+ vlib_frame_queues[fqix]->vector_threshold = threshold;
+ }
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (cmd_test_frame_queue_threshold,static) = {
+ .path = "test frame-queue threshold",
+ .short_help = "test frame-queue threshold N (0=no limit)",
+ .function = test_frame_queue_threshold,
+};
+
+static void
+show_efd_config (vlib_main_t * vm)
+{
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ dpdk_main_t * dm = &dpdk_main;
+
+ vlib_cli_output(vm,
+ "dpdk: (0x%04x) enabled:%d monitor:%d drop_all:%d\n"
+ " dpdk_queue_hi_thresh %d\n"
+ " consec_full_frames_hi_thresh %d\n"
+ "---------\n"
+ "worker: (0x%04x) enabled:%d monitor:%d\n"
+ " worker_queue_hi_thresh %d\n",
+ dm->efd.enabled,
+ ((dm->efd.enabled & DPDK_EFD_DISCARD_ENABLED) ? 1:0),
+ ((dm->efd.enabled & DPDK_EFD_MONITOR_ENABLED) ? 1:0),
+ ((dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED) ? 1:0),
+ dm->efd.queue_hi_thresh,
+ dm->efd.consec_full_frames_hi_thresh,
+ tm->efd.enabled,
+ ((tm->efd.enabled & VLIB_EFD_DISCARD_ENABLED) ? 1:0),
+ ((tm->efd.enabled & VLIB_EFD_MONITOR_ENABLED) ? 1:0),
+ tm->efd.queue_hi_thresh);
+ vlib_cli_output(vm,
+ "---------\n"
+ "ip_prec_bitmap 0x%02x\n"
+ "mpls_exp_bitmap 0x%02x\n"
+ "vlan_cos_bitmap 0x%02x\n",
+ tm->efd.ip_prec_bitmap,
+ tm->efd.mpls_exp_bitmap,
+ tm->efd.vlan_cos_bitmap);
+}
+
+static clib_error_t *
+show_efd (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+
+ if (unformat(input, "config")) {
+ show_efd_config(vm);
+ } else if (unformat(input, "dpdk")) {
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ u32 device_id = ~0;
+
+ unformat(input, "device %d", &device_id);
+ vec_foreach (xd, dm->devices) {
+ if ((xd->device_index == device_id) || (device_id == ~0)) {
+ show_dpdk_device_stats(vm, xd);
+ }
+ }
+ } else if (unformat(input, "worker")) {
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ vlib_frame_queue_t *fq;
+ vlib_thread_registration_t * tr;
+ int thread_id;
+ u32 num_workers = 0;
+ u32 first_worker_index = 0;
+ uword * p;
+
+ p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ ASSERT (p);
+ tr = (vlib_thread_registration_t *) p[0];
+ if (tr)
+ {
+ num_workers = tr->count;
+ first_worker_index = tr->first_index;
+ }
+
+ vlib_cli_output(vm,
+ "num_workers %d\n"
+ "first_worker_index %d\n"
+ "vlib_frame_queues[%d]:\n",
+ num_workers,
+ first_worker_index,
+ tm->n_vlib_mains);
+
+ for (thread_id = 0; thread_id < tm->n_vlib_mains; thread_id++) {
+ fq = vlib_frame_queues[thread_id];
+ if (fq) {
+ vlib_cli_output(vm,
+ "%2d: frames_queued %u\n"
+ " frames_queued_hint %u\n"
+ " enqueue_full_events %u\n"
+ " enqueue_efd_discards %u\n",
+ thread_id,
+ (fq->tail - fq->head),
+ (fq->tail - fq->head_hint),
+ fq->enqueue_full_events,
+ fq->enqueue_efd_discards);
+ }
+ }
+ } else if (unformat(input, "help")) {
+ vlib_cli_output(vm, "Usage: show efd config | "
+ "dpdk [device <id>] | worker\n");
+ } else {
+ show_efd_config(vm);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_efd_command, static) = {
+ .path = "show efd",
+ .short_help = "show efd [config | dpdk [device <id>] | worker]",
+ .function = show_efd,
+};
+
+static clib_error_t *
+clear_efd (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ vlib_frame_queue_t *fq;
+ int thread_id;
+
+ vec_foreach (xd, dm->devices) {
+ xd->efd_agent.last_burst_sz = 0;
+ xd->efd_agent.max_burst_sz = 0;
+ xd->efd_agent.full_frames_cnt = 0;
+ xd->efd_agent.consec_full_frames_cnt = 0;
+ xd->efd_agent.congestion_cnt = 0;
+ xd->efd_agent.last_poll_time = 0;
+ xd->efd_agent.max_poll_delay = 0;
+ xd->efd_agent.discard_cnt = 0;
+ xd->efd_agent.total_packet_cnt = 0;
+ }
+
+ for (thread_id = 0; thread_id < tm->n_vlib_mains; thread_id++) {
+ fq = vlib_frame_queues[thread_id];
+ if (fq) {
+ fq->enqueue_full_events = 0;
+ fq->enqueue_efd_discards = 0;
+ }
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (clear_efd_command,static) = {
+ .path = "clear efd",
+ .short_help = "Clear early-fast-discard counters",
+ .function = clear_efd,
+};
+
+static clib_error_t *
+parse_op_and_prec (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd,
+ char *prec_type, u8 *prec_bitmap)
+{
+ clib_error_t * error = NULL;
+ u8 op = 0;
+ u8 prec = 0;
+
+ if (unformat(input, "ge")) {
+ op = EFD_OPERATION_GREATER_OR_EQUAL;
+ } else if (unformat(input, "lt")) {
+ op = EFD_OPERATION_LESS_THAN;
+ } else if (unformat(input, "help")) {
+ vlib_cli_output(vm,
+ "enter operation [ge | lt] and precedence <0-7>");
+ return (error);
+ } else {
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ if (unformat (input, "%u", &prec)) {
+ if (prec > 7) {
+ return clib_error_return(0, "precedence %d is out of range <0-7>",
+ prec);
+ }
+ } else {
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ set_efd_bitmap(prec_bitmap, prec, op);
+
+ vlib_cli_output(vm,
+ "EFD will be set for %s precedence %s%u%s.",
+ prec_type,
+ (op == EFD_OPERATION_LESS_THAN) ? "less than " : "",
+ prec,
+ (op == EFD_OPERATION_GREATER_OR_EQUAL) ? " and greater" : "");
+
+ return (error);
+}
+
+
+static clib_error_t *
+set_efd (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ clib_error_t * error = NULL;
+
+ if (unformat(input, "enable")) {
+ if (unformat(input, "dpdk")) {
+ dm->efd.enabled |= DPDK_EFD_DISCARD_ENABLED;
+ } else if (unformat(input, "worker")) {
+ tm->efd.enabled |= VLIB_EFD_DISCARD_ENABLED;
+ } else if (unformat(input, "monitor")) {
+ dm->efd.enabled |= DPDK_EFD_MONITOR_ENABLED;
+ tm->efd.enabled |= VLIB_EFD_MONITOR_ENABLED;
+ } else if (unformat(input, "drop_all")) {
+ dm->efd.enabled |= DPDK_EFD_DROPALL_ENABLED;
+ } else if (unformat(input, "default")) {
+ dm->efd.enabled = (DPDK_EFD_DISCARD_ENABLED |
+ DPDK_EFD_MONITOR_ENABLED);
+ tm->efd.enabled = (VLIB_EFD_DISCARD_ENABLED |
+ VLIB_EFD_MONITOR_ENABLED);
+ } else {
+ return clib_error_return(0, "Usage: set efd enable [dpdk | "
+ "worker | monitor | drop_all | default]");
+ }
+ } else if (unformat(input, "disable")) {
+ if (unformat(input, "dpdk")) {
+ dm->efd.enabled &= ~DPDK_EFD_DISCARD_ENABLED;
+ } else if (unformat(input, "worker")) {
+ tm->efd.enabled &= ~VLIB_EFD_DISCARD_ENABLED;
+ } else if (unformat(input, "monitor")) {
+ dm->efd.enabled &= ~DPDK_EFD_MONITOR_ENABLED;
+ tm->efd.enabled &= ~VLIB_EFD_MONITOR_ENABLED;
+ } else if (unformat(input, "drop_all")) {
+ dm->efd.enabled &= ~DPDK_EFD_DROPALL_ENABLED;
+ } else if (unformat(input, "all")) {
+ dm->efd.enabled = 0;
+ tm->efd.enabled = 0;
+ } else {
+ return clib_error_return(0, "Usage: set efd disable [dpdk | "
+ "worker | monitor | drop_all | all]");
+ }
+ } else if (unformat(input, "worker_queue_hi_thresh")) {
+ u32 mark;
+ if (unformat (input, "%u", &mark)) {
+ tm->efd.queue_hi_thresh = mark;
+ } else {
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ } else if (unformat(input, "dpdk_device_hi_thresh")) {
+ u32 thresh;
+ if (unformat (input, "%u", &thresh)) {
+ dm->efd.queue_hi_thresh = thresh;
+ } else {
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ } else if (unformat(input, "consec_full_frames_hi_thresh")) {
+ u32 thresh;
+ if (unformat (input, "%u", &thresh)) {
+ dm->efd.consec_full_frames_hi_thresh = thresh;
+ } else {
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ } else if (unformat(input, "ip-prec")) {
+ return (parse_op_and_prec(vm, input, cmd,
+ "ip", &tm->efd.ip_prec_bitmap));
+ } else if (unformat(input, "mpls-exp")) {
+ return (parse_op_and_prec(vm, input, cmd,
+ "mpls", &tm->efd.mpls_exp_bitmap));
+ } else if (unformat(input, "vlan-cos")) {
+ return (parse_op_and_prec(vm, input, cmd,
+ "vlan", &tm->efd.vlan_cos_bitmap));
+ } else if (unformat(input, "help")) {
+ vlib_cli_output(vm,
+ "Usage:\n"
+ " set efd enable <dpdk | worker | monitor | drop_all | default> |\n"
+ " set efd disable <dpdk | worker | monitor | drop_all | all> |\n"
+ " set efd <ip-prec | mpls-exp | vlan-cos> <ge | lt> <0-7>\n"
+ " set efd worker_queue_hi_thresh <0-32> |\n"
+ " set efd dpdk_device_hi_thresh <0-%d> |\n"
+ " set efd consec_full_frames_hi_thresh <count> |\n",
+ DPDK_NB_RX_DESC_10GE);
+ } else {
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (cmd_set_efd,static) = {
+ .path = "set efd",
+ .short_help = "set early-fast-discard commands",
+ .function = set_efd,
+};
+
+static clib_error_t *
+set_dpdk_if_desc (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ dpdk_main_t * dm = &dpdk_main;
+ vnet_hw_interface_t * hw;
+ dpdk_device_t * xd;
+ u32 hw_if_index = (u32) ~0;
+ u32 nb_rx_desc = (u32) ~0;
+ u32 nb_tx_desc = (u32) ~0;
+ clib_error_t * rv;
+
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "tx %d", &nb_tx_desc))
+ ;
+ else if (unformat (line_input, "rx %d", &nb_rx_desc))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (hw_if_index == (u32) ~0)
+ return clib_error_return (0, "please specify valid interface name");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ if (xd->dev_type != VNET_DPDK_DEV_ETH)
+ return clib_error_return (0, "number of descriptors can be set only for "
+ "physical devices");
+
+ if ((nb_rx_desc == (u32) ~0 || nb_rx_desc == xd->nb_rx_desc) &&
+ (nb_tx_desc == (u32) ~0 || nb_tx_desc == xd->nb_tx_desc))
+ return clib_error_return (0, "nothing changed");
+
+ if (nb_rx_desc != (u32) ~0)
+ xd->nb_rx_desc = nb_rx_desc;
+
+ if (nb_tx_desc != (u32) ~0)
+ xd->nb_tx_desc = nb_tx_desc;
+
+ rv = dpdk_port_setup(dm, xd);
+
+ return rv;
+}
+
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_desc,static) = {
+ .path = "set dpdk interface descriptors",
+ .short_help = "set dpdk interface descriptors <if-name> [rx <n>] [tx <n>]",
+ .function = set_dpdk_if_desc,
+};
+
+static clib_error_t *
+show_dpdk_if_placement (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_and_queue_t * dq;
+ int cpu;
+
+ if (tm->n_vlib_mains == 1)
+ vlib_cli_output(vm, "All interfaces are handled by main thread");
+
+ for(cpu = 0; cpu < vec_len(dm->devices_by_cpu); cpu++)
+ {
+ if (vec_len(dm->devices_by_cpu[cpu]))
+ vlib_cli_output(vm, "Thread %u (%s at lcore %u):", cpu,
+ vlib_worker_threads[cpu].name,
+ vlib_worker_threads[cpu].dpdk_lcore_id);
+
+ vec_foreach(dq, dm->devices_by_cpu[cpu])
+ {
+ u32 hw_if_index = dm->devices[dq->device].vlib_hw_if_index;
+ vnet_hw_interface_t * hi = vnet_get_hw_interface(dm->vnet_main, hw_if_index);
+ vlib_cli_output(vm, " %v queue %u", hi->name, dq->queue_id);
+ }
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (cmd_show_dpdk_if_placement,static) = {
+ .path = "show dpdk interface placement",
+ .short_help = "show dpdk interface placement",
+ .function = show_dpdk_if_placement,
+};
+
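+/* Comparator used to keep each per-cpu device/queue vector sorted by device index, then queue id. */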
+static int
+dpdk_device_queue_sort(void * a1, void * a2)
+{
+ dpdk_device_and_queue_t * dq1 = a1;
+ dpdk_device_and_queue_t * dq2 = a2;
+
+ if (dq1->device > dq2->device)
+ return 1;
+ else if (dq1->device < dq2->device)
+ return -1;
+ else if (dq1->queue_id > dq2->queue_id)
+ return 1;
+ else if (dq1->queue_id < dq2->queue_id)
+ return -1;
+ else
+ return 0;
+}
+
+static clib_error_t *
+set_dpdk_if_placement (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_and_queue_t * dq;
+ vnet_hw_interface_t * hw;
+ dpdk_device_t * xd;
+ u32 hw_if_index = (u32) ~0;
+ u32 queue = (u32) 0;
+ u32 cpu = (u32) ~0;
+ int i;
+
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "queue %d", &queue))
+ ;
+ else if (unformat (line_input, "thread %d", &cpu))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (hw_if_index == (u32) ~0)
+ return clib_error_return (0, "please specify valid interface name");
+
+ if (cpu < dm->input_cpu_first_index ||
+ cpu >= (dm->input_cpu_first_index + dm->input_cpu_count))
+ return clib_error_return (0, "please specify valid thread id");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ for(i = 0; i < vec_len(dm->devices_by_cpu); i++)
+ {
+ vec_foreach(dq, dm->devices_by_cpu[i])
+ {
+ if (hw_if_index == dm->devices[dq->device].vlib_hw_if_index &&
+ queue == dq->queue_id)
+ {
+ if (cpu == i) /* nothing to do */
+ return 0;
+
+ vec_del1(dm->devices_by_cpu[i], dq - dm->devices_by_cpu[i]);
+ vec_add2(dm->devices_by_cpu[cpu], dq, 1);
+ dq->queue_id = queue;
+ dq->device = xd->device_index;
+ xd->cpu_socket_id_by_queue[queue] =
+ rte_lcore_to_socket_id(vlib_worker_threads[cpu].dpdk_lcore_id);
+
+ vec_sort_with_function(dm->devices_by_cpu[i],
+ dpdk_device_queue_sort);
+
+ vec_sort_with_function(dm->devices_by_cpu[cpu],
+ dpdk_device_queue_sort);
+
+ if (vec_len(dm->devices_by_cpu[i]) == 0)
+ vlib_node_set_state (vlib_mains[i], dpdk_input_node.index,
+ VLIB_NODE_STATE_DISABLED);
+
+ if (vec_len(dm->devices_by_cpu[cpu]) == 1)
+ vlib_node_set_state (vlib_mains[cpu], dpdk_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+
+ return 0;
+ }
+ }
+ }
+
+ return clib_error_return (0, "not found");
+}
+
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_placement,static) = {
+ .path = "set dpdk interface placement",
+ .short_help = "set dpdk interface placement <if-name> [queue <n>] thread <n>",
+ .function = set_dpdk_if_placement,
+};
+
+clib_error_t *
+dpdk_cli_init (vlib_main_t * vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (dpdk_cli_init);
diff --git a/vnet/vnet/devices/dpdk/device.c b/vnet/vnet/devices/dpdk/device.c
new file mode 100644
index 00000000000..a19c3131ef9
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/device.c
@@ -0,0 +1,1483 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/format.h>
+#include <vlib/unix/cj.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include "dpdk_priv.h"
+#include <vppinfra/error.h>
+
+#define foreach_dpdk_tx_func_error \
+ _(BAD_RETVAL, "DPDK tx function returned an error") \
+ _(RING_FULL, "Tx packet drops (ring full)") \
+ _(PKT_DROP, "Tx packet drops (dpdk tx failure)") \
+ _(REPL_FAIL, "Tx packet drops (replication failure)")
+
+typedef enum {
+#define _(f,s) DPDK_TX_FUNC_ERROR_##f,
+ foreach_dpdk_tx_func_error
+#undef _
+ DPDK_TX_FUNC_N_ERROR,
+} dpdk_tx_func_error_t;
+
+static char * dpdk_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_dpdk_tx_func_error
+#undef _
+};
+
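+/* Make a private copy of the mbuf chain underlying a vlib buffer, segment by segment.
+ * Returns the new chain, or NULL if any allocation fails (the partial chain is freed). */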
+static struct rte_mbuf * dpdk_replicate_packet_mb (vlib_buffer_t * b)
+{
+ vlib_main_t * vm = vlib_get_main();
+ vlib_buffer_main_t * bm = vm->buffer_main;
+ struct rte_mbuf * first_mb = 0, * new_mb, * pkt_mb, ** prev_mb_next = 0;
+ u8 nb_segs, nb_segs_left;
+ u32 copy_bytes;
+ unsigned socket_id = rte_socket_id();
+
+ ASSERT (bm->pktmbuf_pools[socket_id]);
+ pkt_mb = ((struct rte_mbuf *)b)-1;
+ nb_segs = pkt_mb->nb_segs;
+ for (nb_segs_left = nb_segs; nb_segs_left; nb_segs_left--)
+ {
+ if (PREDICT_FALSE(pkt_mb == 0))
+ {
+ clib_warning ("Missing %d mbuf chain segment(s): "
+ "(nb_segs = %d, nb_segs_left = %d)!",
+ nb_segs - nb_segs_left, nb_segs, nb_segs_left);
+ if (first_mb)
+ rte_pktmbuf_free(first_mb);
+ return NULL;
+ }
+ new_mb = rte_pktmbuf_alloc (bm->pktmbuf_pools[socket_id]);
+ if (PREDICT_FALSE(new_mb == 0))
+ {
+ if (first_mb)
+ rte_pktmbuf_free(first_mb);
+ return NULL;
+ }
+
+ /*
+ * Copy packet info into 1st segment.
+ */
+ if (first_mb == 0)
+ {
+ first_mb = new_mb;
+ rte_pktmbuf_pkt_len (first_mb) = pkt_mb->pkt_len;
+ first_mb->nb_segs = pkt_mb->nb_segs;
+ first_mb->port = pkt_mb->port;
+#ifdef DAW_FIXME // TX Offload support TBD
+ first_mb->vlan_macip = pkt_mb->vlan_macip;
+ first_mb->hash = pkt_mb->hash;
+ first_mb->ol_flags = pkt_mb->ol_flags;
+#endif
+ }
+ else
+ {
+ ASSERT(prev_mb_next != 0);
+ *prev_mb_next = new_mb;
+ }
+
+ /*
+ * Copy packet segment data into new mbuf segment.
+ */
+ rte_pktmbuf_data_len (new_mb) = pkt_mb->data_len;
+ copy_bytes = pkt_mb->data_len + RTE_PKTMBUF_HEADROOM;
+ ASSERT(copy_bytes <= pkt_mb->buf_len);
+ memcpy(new_mb->buf_addr, pkt_mb->buf_addr, copy_bytes);
+
+ prev_mb_next = &new_mb->next;
+ pkt_mb = pkt_mb->next;
+ }
+
+ ASSERT(pkt_mb == 0);
+ __rte_mbuf_sanity_check(first_mb, 1);
+
+ return first_mb;
+}
+
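+/* TX trace record: buffer index, device/queue, a copy of the mbuf header and
+ * the start of the vlib buffer (packet data stored in pre_data). */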
+typedef struct {
+ u32 buffer_index;
+ u16 device_index;
+ u8 queue_index;
+ struct rte_mbuf mb;
+ /* Copy of VLIB buffer; packet data stored in pre_data. */
+ vlib_buffer_t buffer;
+} dpdk_tx_dma_trace_t;
+
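+/* Capture a TX trace entry for one buffer. */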
+static void
+dpdk_tx_trace_buffer (dpdk_main_t * dm,
+ vlib_node_runtime_t * node,
+ dpdk_device_t * xd,
+ u16 queue_id,
+ u32 buffer_index,
+ vlib_buffer_t * buffer)
+{
+ vlib_main_t * vm = vlib_get_main();
+ dpdk_tx_dma_trace_t * t0;
+ struct rte_mbuf * mb;
+
+ mb = ((struct rte_mbuf *)buffer)-1;
+
+ t0 = vlib_add_trace (vm, node, buffer, sizeof (t0[0]));
+ t0->queue_index = queue_id;
+ t0->device_index = xd->device_index;
+ t0->buffer_index = buffer_index;
+ memcpy (&t0->mb, mb, sizeof (t0->mb));
+ memcpy (&t0->buffer, buffer, sizeof (buffer[0]) - sizeof (buffer->pre_data));
+ memcpy (t0->buffer.pre_data, buffer->data + buffer->current_data,
+ sizeof (t0->buffer.pre_data));
+}
+
+/*
+ * This function calls the dpdk's tx_burst function to transmit the packets
+ * on the tx_vector. It manages a lock per-device if the device does not
+ * support multiple queues. It returns the number of packets untransmitted
+ * on the tx_vector. If all packets are transmitted (the normal case), the
+ * function returns 0.
+ *
+ * The tx_burst function may not be able to transmit all packets because the
+ * dpdk ring is full. If a flowcontrol callback function has been configured
+ * then the function simply returns. If no callback has been configured, the
+ * function will retry calling tx_burst with the remaining packets. This will
+ * continue until all packets are transmitted or tx_burst indicates no packets
+ * could be transmitted. (The caller can drop the remaining packets.)
+ *
+ * The function assumes there is at least one packet on the tx_vector.
+ */
+static_always_inline
+u32 tx_burst_vector_internal (vlib_main_t * vm,
+ dpdk_device_t * xd,
+ struct rte_mbuf ** tx_vector)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ u32 n_packets;
+ u32 tx_head;
+ u32 tx_tail;
+ u32 n_retry;
+ int rv;
+ int queue_id;
+ tx_ring_hdr_t *ring;
+
+ ring = vec_header(tx_vector, sizeof(*ring));
+
+ n_packets = ring->tx_head - ring->tx_tail;
+
+ tx_head = ring->tx_head % DPDK_TX_RING_SIZE;
+
+ /*
+ * Ensure rte_eth_tx_burst is not called with 0 packets, which can lead to
+ * unpredictable results.
+ */
+ ASSERT(n_packets > 0);
+
+ /*
+ * Check for tx_vector overflow. If this fails it is a system configuration
+ * error. The ring should be sized big enough to handle the largest un-flowed
+ * off burst from a traffic manager. A larger size also helps performance
+ * a bit because it decreases the probability of having to issue two tx_burst
+ * calls due to a ring wrap.
+ */
+ ASSERT(n_packets < DPDK_TX_RING_SIZE);
+
+ /*
+ * If there is no flowcontrol callback, there is only temporary buffering
+ * on the tx_vector and so the tail should always be 0.
+ */
+ ASSERT(dm->flowcontrol_callback || ring->tx_tail == 0);
+
+ /*
+ * If there is a flowcontrol callback, don't retry any incomplete tx_bursts.
+ * Apply backpressure instead. If there is no callback, keep retrying until
+ * a tx_burst sends no packets. n_retry of 255 essentially means no retry
+ * limit.
+ */
+ n_retry = dm->flowcontrol_callback ? 0 : 255;
+
+ queue_id = vm->cpu_index;
+
+ do {
+ /* start the burst at the tail */
+ tx_tail = ring->tx_tail % DPDK_TX_RING_SIZE;
+
+ /*
+ * This device only supports one TX queue,
+ * and we're running multi-threaded...
+ */
+ if (PREDICT_FALSE(xd->lockp != 0))
+ {
+ queue_id = 0;
+ while (__sync_lock_test_and_set (xd->lockp, 1))
+ /* zzzz */;
+ }
+
+ if (PREDICT_TRUE(xd->dev_type == VNET_DPDK_DEV_ETH))
+ {
+ if (PREDICT_TRUE(tx_head > tx_tail))
+ {
+ /* no wrap, transmit in one burst */
+ rv = rte_eth_tx_burst(xd->device_index,
+ (uint16_t) queue_id,
+ &tx_vector[tx_tail],
+ (uint16_t) (tx_head-tx_tail));
+ }
+ else
+ {
+ /*
+ * This can only happen if there is a flowcontrol callback.
+ * We need to split the transmit into two calls: one for
+ * the packets up to the wrap point, and one to continue
+ * at the start of the ring.
+ * Transmit pkts up to the wrap point.
+ */
+ rv = rte_eth_tx_burst(xd->device_index,
+ (uint16_t) queue_id,
+ &tx_vector[tx_tail],
+ (uint16_t) (DPDK_TX_RING_SIZE - tx_tail));
+
+ /*
+ * If we transmitted everything we wanted, then allow 1 retry
+ * so we can try to transmit the rest. If we didn't transmit
+ * everything, stop now.
+ */
+ n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
+ }
+ }
+ else if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
+ {
+ if (PREDICT_TRUE(tx_head > tx_tail))
+ {
+ /* no wrap, transmit in one burst */
+ rv = rte_vhost_enqueue_burst(&xd->vu_vhost_dev, VIRTIO_RXQ,
+ &tx_vector[tx_tail],
+ (uint16_t) (tx_head-tx_tail));
+ if (PREDICT_TRUE(rv > 0))
+ {
+ if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_RXQ)) {
+ dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_RXQ]);
+ vring->n_since_last_int += rv;
+
+ if (vring->n_since_last_int > dm->vhost_coalesce_frames)
+ dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_RXQ);
+ }
+
+ int c = rv;
+ while(c--)
+ rte_pktmbuf_free (tx_vector[tx_tail+c]);
+ }
+ }
+ else
+ {
+ /*
+ * If we transmitted everything we wanted, then allow 1 retry
+ * so we can try to transmit the rest. If we didn't transmit
+ * everything, stop now.
+ */
+ rv = rte_vhost_enqueue_burst(&xd->vu_vhost_dev, VIRTIO_RXQ,
+ &tx_vector[tx_tail],
+ (uint16_t) (DPDK_TX_RING_SIZE - tx_tail));
+
+ if (PREDICT_TRUE(rv > 0))
+ {
+ if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_RXQ)) {
+ dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_RXQ]);
+ vring->n_since_last_int += rv;
+
+ if (vring->n_since_last_int > dm->vhost_coalesce_frames)
+ dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_RXQ);
+ }
+
+ int c = rv;
+ while(c--)
+ rte_pktmbuf_free (tx_vector[tx_tail+c]);
+ }
+
+ n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
+ }
+ }
+ else if (xd->dev_type == VNET_DPDK_DEV_KNI)
+ {
+ if (PREDICT_TRUE(tx_head > tx_tail))
+ {
+ /* no wrap, transmit in one burst */
+ rv = rte_kni_tx_burst(xd->kni,
+ &tx_vector[tx_tail],
+ (uint16_t) (tx_head-tx_tail));
+ }
+ else
+ {
+ /*
+ * This can only happen if there is a flowcontrol callback.
+ * We need to split the transmit into two calls: one for
+ * the packets up to the wrap point, and one to continue
+ * at the start of the ring.
+ * Transmit pkts up to the wrap point.
+ */
+ rv = rte_kni_tx_burst(xd->kni,
+ &tx_vector[tx_tail],
+ (uint16_t) (DPDK_TX_RING_SIZE - tx_tail));
+
+ /*
+ * If we transmitted everything we wanted, then allow 1 retry
+ * so we can try to transmit the rest. If we didn't transmit
+ * everything, stop now.
+ */
+ n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
+ }
+ }
+ else
+ {
+ ASSERT(0);
+ rv = 0;
+ }
+
+ if (PREDICT_FALSE(xd->lockp != 0))
+ *xd->lockp = 0;
+
+ if (PREDICT_FALSE(rv < 0))
+ {
+ // emit non-fatal message, bump counter
+ vnet_main_t * vnm = dm->vnet_main;
+ vnet_interface_main_t * im = &vnm->interface_main;
+ u32 node_index;
+
+ node_index = vec_elt_at_index(im->hw_interfaces,
+ xd->vlib_hw_if_index)->tx_node_index;
+
+ vlib_error_count (vm, node_index, DPDK_TX_FUNC_ERROR_BAD_RETVAL, 1);
+ clib_warning ("rte_eth_tx_burst[%d]: error %d", xd->device_index, rv);
+ return n_packets; // untransmitted packets
+ }
+ ring->tx_tail += (u16)rv;
+ n_packets -= (uint16_t) rv;
+ } while (rv && n_packets && (n_retry>0));
+
+ return n_packets;
+}
+
+
+/*
+ * This function transmits any packets on the interface's tx_vector and returns
+ * the number of packets untransmitted on the tx_vector. If the tx_vector is
+ * empty the function simply returns 0.
+ *
+ * It is intended to be called by a traffic manager which has flowed-off an
+ * interface to see if the interface can be flowed-on again.
+ */
+u32 dpdk_interface_tx_vector (vlib_main_t * vm, u32 dev_instance)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ int queue_id;
+ struct rte_mbuf ** tx_vector;
+ tx_ring_hdr_t *ring;
+
+ /* param is dev_instance and not hw_if_index to save another lookup */
+ xd = vec_elt_at_index (dm->devices, dev_instance);
+
+ queue_id = vm->cpu_index;
+ tx_vector = xd->tx_vectors[queue_id];
+
+ /* If no packets on the ring, don't bother calling tx function */
+ ring = vec_header(tx_vector, sizeof(*ring));
+ if (ring->tx_head == ring->tx_tail)
+ {
+ return 0;
+ }
+
+ return tx_burst_vector_internal (vm, xd, tx_vector);
+}
+
+/*
+ * Transmits the packets on the frame to the interface associated with the
+ * node. It first copies packets on the frame to a tx_vector containing the
+ * rte_mbuf pointers. It then passes this vector to tx_burst_vector_internal
+ * which calls the dpdk tx_burst function.
+ *
+ * The tx_vector is treated slightly differently depending on whether or
+ * not a flowcontrol callback function has been configured. If there is no
+ * callback, the tx_vector is a temporary array of rte_mbuf packet pointers.
+ * Its entries are written and consumed before the function exits.
+ *
+ * If there is a callback then the transmit is being invoked in the presence
+ * of a traffic manager. Here the tx_vector is treated like a ring of rte_mbuf
+ * pointers. If not all packets can be transmitted, the untransmitted packets
+ * stay on the tx_vector until the next call. The callback allows the traffic
+ * manager to flow-off dequeues to the interface. The companion function
+ * dpdk_interface_tx_vector() allows the traffic manager to detect when
+ * it should flow-on the interface again.
+ */
+static uword
+dpdk_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ vnet_interface_output_runtime_t * rd = (void *) node->runtime_data;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, rd->dev_instance);
+ u32 n_packets = f->n_vectors;
+ u32 n_left;
+ u32 * from;
+ struct rte_mbuf ** tx_vector;
+ int i;
+ int queue_id;
+ u32 my_cpu;
+ u32 tx_pkts = 0;
+ tx_ring_hdr_t *ring;
+ u32 n_on_ring;
+
+ my_cpu = vm->cpu_index;
+
+ queue_id = my_cpu;
+
+ tx_vector = xd->tx_vectors[queue_id];
+ ring = vec_header(tx_vector, sizeof(*ring));
+
+ n_on_ring = ring->tx_head - ring->tx_tail;
+ from = vlib_frame_vector_args (f);
+
+ ASSERT(n_packets <= VLIB_FRAME_SIZE);
+
+ if (PREDICT_FALSE(n_on_ring + n_packets > DPDK_TX_RING_SIZE))
+ {
+ /*
+ * Overflowing the ring should never happen.
+ * If it does then drop the whole frame.
+ */
+ vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_RING_FULL,
+ n_packets);
+
+ while (n_packets--)
+ {
+ u32 bi0 = from[n_packets];
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
+ struct rte_mbuf *mb0 = ((struct rte_mbuf *)b0) - 1;
+ rte_pktmbuf_free (mb0);
+ }
+ return n_on_ring;
+ }
+
+ if (PREDICT_FALSE(dm->tx_pcap_enable))
+ {
+ n_left = n_packets;
+ while (n_left > 0)
+ {
+ u32 bi0 = from[0];
+ vlib_buffer_t * b0 = vlib_get_buffer (vm, bi0);
+ if (dm->pcap_sw_if_index == 0 ||
+ dm->pcap_sw_if_index == vnet_buffer(b0)->sw_if_index [VLIB_TX])
+ pcap_add_buffer (&dm->pcap_main, vm, bi0, 512);
+ from++;
+ n_left--;
+ }
+ }
+
+ from = vlib_frame_vector_args (f);
+ n_left = n_packets;
+ i = ring->tx_head % DPDK_TX_RING_SIZE;
+
+ while (n_left >= 4)
+ {
+ u32 bi0, bi1;
+ u32 pi0, pi1;
+ struct rte_mbuf * mb0, * mb1;
+ struct rte_mbuf * prefmb0, * prefmb1;
+ vlib_buffer_t * b0, * b1;
+ vlib_buffer_t * pref0, * pref1;
+ i16 delta0, delta1;
+ u16 new_data_len0, new_data_len1;
+ u16 new_pkt_len0, new_pkt_len1;
+ u32 any_clone;
+
+ pi0 = from[2];
+ pi1 = from[3];
+ pref0 = vlib_get_buffer (vm, pi0);
+ pref1 = vlib_get_buffer (vm, pi1);
+
+ prefmb0 = ((struct rte_mbuf *)pref0) - 1;
+ prefmb1 = ((struct rte_mbuf *)pref1) - 1;
+
+ CLIB_PREFETCH(prefmb0, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH(pref0, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH(prefmb1, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH(pref1, CLIB_CACHE_LINE_BYTES, LOAD);
+
+ bi0 = from[0];
+ bi1 = from[1];
+ from += 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ mb0 = ((struct rte_mbuf *)b0) - 1;
+ mb1 = ((struct rte_mbuf *)b1) - 1;
+
+ any_clone = b0->clone_count | b1->clone_count;
+ if (PREDICT_FALSE(any_clone != 0))
+ {
+ if (PREDICT_FALSE(b0->clone_count != 0))
+ {
+ struct rte_mbuf * mb0_new = dpdk_replicate_packet_mb (b0);
+ if (PREDICT_FALSE(mb0_new == 0))
+ {
+ vlib_error_count (vm, node->node_index,
+ DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
+ b0->flags |= VLIB_BUFFER_REPL_FAIL;
+ }
+ else
+ mb0 = mb0_new;
+ vec_add1 (dm->recycle[my_cpu], bi0);
+ }
+ if (PREDICT_FALSE(b1->clone_count != 0))
+ {
+ struct rte_mbuf * mb1_new = dpdk_replicate_packet_mb (b1);
+ if (PREDICT_FALSE(mb1_new == 0))
+ {
+ vlib_error_count (vm, node->node_index,
+ DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
+ b1->flags |= VLIB_BUFFER_REPL_FAIL;
+ }
+ else
+ mb1 = mb1_new;
+ vec_add1 (dm->recycle[my_cpu], bi1);
+ }
+ }
+
+ delta0 = PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
+ vlib_buffer_length_in_chain (vm, b0) - (i16) mb0->pkt_len;
+ delta1 = PREDICT_FALSE(b1->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
+ vlib_buffer_length_in_chain (vm, b1) - (i16) mb1->pkt_len;
+
+ new_data_len0 = (u16)((i16) mb0->data_len + delta0);
+ new_data_len1 = (u16)((i16) mb1->data_len + delta1);
+ new_pkt_len0 = (u16)((i16) mb0->pkt_len + delta0);
+ new_pkt_len1 = (u16)((i16) mb1->pkt_len + delta1);
+
+ b0->current_length = new_data_len0;
+ b1->current_length = new_data_len1;
+ mb0->data_len = new_data_len0;
+ mb1->data_len = new_data_len1;
+ mb0->pkt_len = new_pkt_len0;
+ mb1->pkt_len = new_pkt_len1;
+
+ mb0->data_off = (PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL)) ?
+ mb0->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b0->current_data);
+ mb1->data_off = (PREDICT_FALSE(b1->flags & VLIB_BUFFER_REPL_FAIL)) ?
+ mb1->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b1->current_data);
+
+ if (PREDICT_FALSE(node->flags & VLIB_NODE_FLAG_TRACE))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi1, b1);
+ }
+
+ if (PREDICT_TRUE(any_clone == 0))
+ {
+ tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
+ i++;
+ tx_vector[i % DPDK_TX_RING_SIZE] = mb1;
+ i++;
+ }
+ else
+ {
+ /* cloning was done, need to check for failure */
+ if (PREDICT_TRUE((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ {
+ tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
+ i++;
+ }
+ if (PREDICT_TRUE((b1->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ {
+ tx_vector[i % DPDK_TX_RING_SIZE] = mb1;
+ i++;
+ }
+ }
+
+ n_left -= 2;
+ }
+ while (n_left > 0)
+ {
+ u32 bi0;
+ struct rte_mbuf * mb0;
+ vlib_buffer_t * b0;
+ i16 delta0;
+ u16 new_data_len0;
+ u16 new_pkt_len0;
+
+ bi0 = from[0];
+ from++;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ mb0 = ((struct rte_mbuf *)b0) - 1;
+ if (PREDICT_FALSE(b0->clone_count != 0))
+ {
+ struct rte_mbuf * mb0_new = dpdk_replicate_packet_mb (b0);
+ if (PREDICT_FALSE(mb0_new == 0))
+ {
+ vlib_error_count (vm, node->node_index,
+ DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
+ b0->flags |= VLIB_BUFFER_REPL_FAIL;
+ }
+ else
+ mb0 = mb0_new;
+ vec_add1 (dm->recycle[my_cpu], bi0);
+ }
+
+ delta0 = PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
+ vlib_buffer_length_in_chain (vm, b0) - (i16) mb0->pkt_len;
+
+ new_data_len0 = (u16)((i16) mb0->data_len + delta0);
+ new_pkt_len0 = (u16)((i16) mb0->pkt_len + delta0);
+
+ b0->current_length = new_data_len0;
+ mb0->data_len = new_data_len0;
+ mb0->pkt_len = new_pkt_len0;
+ mb0->data_off = (PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL)) ?
+ mb0->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b0->current_data);
+
+ if (PREDICT_FALSE(node->flags & VLIB_NODE_FLAG_TRACE))
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
+
+ if (PREDICT_TRUE((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ {
+ tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
+ i++;
+ }
+ n_left--;
+ }
+
+ /* account for additional packets in the ring */
+ ring->tx_head += n_packets;
+ n_on_ring = ring->tx_head - ring->tx_tail;
+
+ /* transmit as many packets as possible */
+ n_packets = tx_burst_vector_internal (vm, xd, tx_vector);
+
+ /*
+ * tx_pkts is the number of packets successfully transmitted
+ * This is the number originally on ring minus the number remaining on ring
+ */
+ tx_pkts = n_on_ring - n_packets;
+
+ if (PREDICT_FALSE(dm->flowcontrol_callback != 0))
+ {
+ if (PREDICT_FALSE(n_packets))
+ {
+ /* Callback may want to enable flowcontrol */
+ dm->flowcontrol_callback(vm, xd->vlib_hw_if_index, ring->tx_head - ring->tx_tail);
+ }
+ else
+ {
+ /* Reset head/tail to avoid unnecessary wrap */
+ ring->tx_head = 0;
+ ring->tx_tail = 0;
+ }
+ }
+ else
+ {
+ /* If there is no callback then drop any non-transmitted packets */
+ if (PREDICT_FALSE(n_packets))
+ {
+ vlib_simple_counter_main_t * cm;
+ vnet_main_t * vnm = vnet_get_main();
+
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_TX_ERROR);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, n_packets);
+
+ vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_PKT_DROP,
+ n_packets);
+
+ while (n_packets--)
+ rte_pktmbuf_free (tx_vector[ring->tx_tail + n_packets]);
+ }
+
+ /* Reset head/tail to avoid unnecessary wrap */
+ ring->tx_head = 0;
+ ring->tx_tail = 0;
+ }
+
+ /* Recycle replicated buffers */
+ if (PREDICT_FALSE(vec_len(dm->recycle[my_cpu])))
+ {
+ vlib_buffer_free (vm, dm->recycle[my_cpu], vec_len(dm->recycle[my_cpu]));
+ _vec_len(dm->recycle[my_cpu]) = 0;
+ }
+
+ ASSERT(ring->tx_head >= ring->tx_tail);
+
+ return tx_pkts;
+}
+
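+/* Renumbering is only supported for vhost-user interfaces; it simply updates the interface id. */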
+static int dpdk_device_renumber (vnet_hw_interface_t * hi,
+ u32 new_dev_instance)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+
+ if (!xd || xd->dev_type != VNET_DPDK_DEV_VHOST_USER) {
+ clib_warning("cannot renumber non-vhost-user interface (sw_if_index: %d)",
+ hi->sw_if_index);
+ return 0;
+ }
+
+ xd->vu_if_id = new_dev_instance;
+ return 0;
+}
+
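+/* Build the VPP interface name for a DPDK device from its type, port speed and PCI address. */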
+static u8 * format_dpdk_device_name (u8 * s, va_list * args)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ char *devname_format;
+ char *device_name;
+ u32 i = va_arg (*args, u32);
+ struct rte_eth_dev_info dev_info;
+
+ if (dm->interface_name_format_decimal)
+ devname_format = "%s%d/%d/%d";
+ else
+ devname_format = "%s%x/%x/%x";
+
+ if (dm->devices[i].dev_type == VNET_DPDK_DEV_KNI) {
+ return format(s, "kni%d", dm->devices[i].kni_port_id);
+ } else if (dm->devices[i].dev_type == VNET_DPDK_DEV_VHOST_USER) {
+ return format(s, "VirtualEthernet0/0/%d", dm->devices[i].vu_if_id);
+ }
+ switch (dm->devices[i].port_type)
+ {
+ case VNET_DPDK_PORT_TYPE_ETH_1G:
+ device_name = "GigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_10G:
+ device_name = "TenGigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_40G:
+ device_name = "FortyGigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_SWITCH:
+ device_name = "EthernetSwitch";
+ break;
+
+ #ifdef NETMAP
+ case VNET_DPDK_PORT_TYPE_NETMAP:
+ rte_eth_dev_info_get(i, &dev_info);
+ return format(s, "netmap:%s", dev_info.driver_name);
+ #endif
+
+ case VNET_DPDK_PORT_TYPE_AF_PACKET:
+ rte_eth_dev_info_get(i, &dev_info);
+ return format(s, "af_packet%d", dm->devices[i].af_packet_port_id);
+
+ default:
+ case VNET_DPDK_PORT_TYPE_UNKNOWN:
+ device_name = "UnknownEthernet";
+ break;
+ }
+
+ rte_eth_dev_info_get(i, &dev_info);
+ return format (s, devname_format, device_name, dev_info.pci_dev->addr.bus,
+ dev_info.pci_dev->addr.devid,
+ dev_info.pci_dev->addr.function);
+}
+
+static u8 * format_dpdk_device_type (u8 * s, va_list * args)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ char *dev_type;
+ u32 i = va_arg (*args, u32);
+
+ if (dm->devices[i].dev_type == VNET_DPDK_DEV_KNI) {
+ return format(s, "Kernel NIC Interface");
+ } else if (dm->devices[i].dev_type == VNET_DPDK_DEV_VHOST_USER) {
+ return format(s, "vhost-user interface");
+ }
+
+ switch (dm->devices[i].pmd)
+ {
+ case VNET_DPDK_PMD_E1000EM:
+ dev_type = "Intel 82540EM (e1000)";
+ break;
+
+ case VNET_DPDK_PMD_IGB:
+ dev_type = "Intel e1000";
+ break;
+
+ case VNET_DPDK_PMD_I40E:
+ dev_type = "Intel X710/XL710 Family";
+ break;
+
+ case VNET_DPDK_PMD_I40EVF:
+ dev_type = "Intel X710/XL710 Family VF";
+ break;
+
+ case VNET_DPDK_PMD_FM10K:
+ dev_type = "Intel FM10000 Family Ethernet Switch";
+ break;
+
+ case VNET_DPDK_PMD_IGBVF:
+ dev_type = "Intel e1000 VF";
+ break;
+
+ case VNET_DPDK_PMD_VIRTIO:
+ dev_type = "Red Hat Virtio";
+ break;
+
+ case VNET_DPDK_PMD_IXGBEVF:
+ dev_type = "Intel 82599 VF";
+ break;
+
+ case VNET_DPDK_PMD_IXGBE:
+ dev_type = "Intel 82599";
+ break;
+
+ case VNET_DPDK_PMD_VICE:
+ case VNET_DPDK_PMD_ENIC:
+ dev_type = "Cisco VIC";
+ break;
+
+ case VNET_DPDK_PMD_VMXNET3:
+ dev_type = "VMware VMXNET3";
+ break;
+
+#ifdef NETMAP
+ case VNET_DPDK_PMD_NETMAP:
+ dev_type = "Netmap/Vale";
+ break;
+#endif
+
+ case VNET_DPDK_PMD_AF_PACKET:
+ dev_type = "af_packet";
+ break;
+
+ default:
+ case VNET_DPDK_PMD_UNKNOWN:
+ dev_type = "### UNKNOWN ###";
+ break;
+ }
+
+ return format (s, dev_type);
+}
+
+static u8 * format_dpdk_link_status (u8 * s, va_list * args)
+{
+ dpdk_device_t * xd = va_arg (*args, dpdk_device_t *);
+ struct rte_eth_link * l = &xd->link;
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, xd->vlib_hw_if_index);
+
+ s = format (s, "%s ", l->link_status ? "up" : "down");
+ if (l->link_status)
+ {
+ u32 promisc = rte_eth_promiscuous_get (xd->device_index);
+
+ s = format (s, "%s duplex ", (l->link_duplex == ETH_LINK_FULL_DUPLEX) ?
+ "full" : "half");
+ s = format (s, "speed %u mtu %d %s\n", l->link_speed,
+ hi->max_packet_bytes, promisc ? " promisc" : "");
+ }
+ else
+ s = format (s, "\n");
+
+ return s;
+}
+
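+/* Helper used by the RSS/offload formatters below: print the name of each set
+ * bit, wrapping the output line at _line_len columns. */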
+#define _line_len 72
+#define _(v, str) \
+if (bitmap & v) { \
+ if (format_get_indent (s) > next_split ) { \
+ next_split += _line_len; \
+ s = format(s,"\n%U", format_white_space, indent); \
+ } \
+ s = format(s, "%s ", str); \
+}
+
+static u8 * format_dpdk_rss_hf_name(u8 * s, va_list * args)
+{
+ u64 bitmap = va_arg (*args, u64);
+ int next_split = _line_len;
+ int indent = format_get_indent (s);
+
+ if (!bitmap)
+ return format(s, "none");
+
+ foreach_dpdk_rss_hf
+
+ return s;
+}
+
+static u8 * format_dpdk_rx_offload_caps(u8 * s, va_list * args)
+{
+ u32 bitmap = va_arg (*args, u32);
+ int next_split = _line_len;
+ int indent = format_get_indent (s);
+
+ if (!bitmap)
+ return format(s, "none");
+
+ foreach_dpdk_rx_offload_caps
+
+ return s;
+}
+
+static u8 * format_dpdk_tx_offload_caps(u8 * s, va_list * args)
+{
+ u32 bitmap = va_arg (*args, u32);
+ int next_split = _line_len;
+ int indent = format_get_indent (s);
+ if (!bitmap)
+ return format(s, "none");
+
+ foreach_dpdk_tx_offload_caps
+
+ return s;
+}
+
+#undef _line_len
+#undef _
+
+static u8 * format_dpdk_device (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ int verbose = va_arg (*args, int);
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, dev_instance);
+ uword indent = format_get_indent (s);
+ f64 now = vlib_time_now (dm->vlib_main);
+
+ dpdk_update_counters (xd, now);
+ dpdk_update_link_state (xd, now);
+
+ s = format (s, "%U\n%Ucarrier %U",
+ format_dpdk_device_type, xd->device_index,
+ format_white_space, indent + 2,
+ format_dpdk_link_status, xd);
+
+ if (verbose > 1 && xd->dev_type == VNET_DPDK_DEV_ETH)
+ {
+ struct rte_eth_dev_info di;
+ struct rte_pci_device * pci;
+ struct rte_eth_rss_conf rss_conf;
+ int vlan_off;
+
+ rss_conf.rss_key = 0;
+ rte_eth_dev_info_get(xd->device_index, &di);
+ rte_eth_dev_rss_hash_conf_get(xd->device_index, &rss_conf);
+ pci = di.pci_dev;
+
+ s = format(s, "%Upci id: device %04x:%04x subsystem %04x:%04x\n"
+ "%Upci address: %04x:%02x:%02x.%02x\n",
+ format_white_space, indent + 2,
+ pci->id.vendor_id, pci->id.device_id,
+ pci->id.subsystem_vendor_id,
+ pci->id.subsystem_device_id,
+ format_white_space, indent + 2,
+ pci->addr.domain, pci->addr.bus,
+ pci->addr.devid, pci->addr.function);
+ s = format(s, "%Umax rx packet len: %d\n",
+ format_white_space, indent + 2, di.max_rx_pktlen);
+ s = format(s, "%Upromiscuous: unicast %s all-multicast %s\n",
+ format_white_space, indent + 2,
+ rte_eth_promiscuous_get(xd->device_index) ? "on" : "off",
+ rte_eth_allmulticast_get(xd->device_index) ? "on" : "off");
+ vlan_off = rte_eth_dev_get_vlan_offload(xd->device_index);
+ s = format(s, "%Uvlan offload: strip %s filter %s qinq %s\n",
+ format_white_space, indent + 2,
+ vlan_off & ETH_VLAN_STRIP_OFFLOAD ? "on" : "off",
+ vlan_off & ETH_VLAN_FILTER_OFFLOAD ? "on" : "off",
+ vlan_off & ETH_VLAN_EXTEND_OFFLOAD ? "on" : "off");
+ s = format(s, "%Uqueue size (max): rx %d (%d) tx %d (%d)\n",
+ format_white_space, indent + 2,
+ xd->rx_q_used, di.max_rx_queues,
+ xd->tx_q_used, di.max_tx_queues);
+ s = format(s, "%Urx offload caps: %U\n",
+ format_white_space, indent + 2,
+ format_dpdk_rx_offload_caps, di.rx_offload_capa);
+ s = format(s, "%Utx offload caps: %U\n",
+ format_white_space, indent + 2,
+ format_dpdk_tx_offload_caps, di.tx_offload_capa);
+ s = format(s, "%Urss active: %U\n"
+ "%Urss supported: %U\n",
+ format_white_space, indent + 2,
+ format_dpdk_rss_hf_name, rss_conf.rss_hf,
+ format_white_space, indent + 2,
+ format_dpdk_rss_hf_name, di.flow_type_rss_offloads);
+ }
+
+ if (xd->cpu_socket > -1)
+ s = format (s, "%Ucpu socket %d",
+ format_white_space, indent + 2,
+ xd->cpu_socket);
+
+ /* $$$ MIB counters */
+
+ {
+#define _(N, V) \
+ if (xd->stats.V != 0) \
+ s = format (s, "\n%U%-40U%16Ld", \
+ format_white_space, indent + 2, \
+ format_c_identifier, #N, xd->stats.V);
+
+ foreach_dpdk_counter
+#undef _
+ }
+
+ u8 * xs = 0;
+ struct rte_eth_xstats * xstat;
+
+ vec_foreach(xstat, xd->xstats)
+ {
+ if (xstat->value)
+ {
+ /* format_c_identifier doesn't like C strings inside a vector */
+ u8 * name = format(0,"%s", xstat->name);
+ xs = format(xs, "\n%U%-38U%16Ld",
+ format_white_space, indent + 4,
+ format_c_identifier, name, xstat->value);
+ vec_free(name);
+ }
+ }
+
+ if (xs)
+ {
+ s = format(s, "\n%Uextended stats:%v",
+ format_white_space, indent + 2, xs);
+ vec_free(xs);
+ }
+
+ return s;
+}
+
+static u8 * format_dpdk_tx_dma_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main();
+ dpdk_tx_dma_trace_t * t = va_arg (*va, dpdk_tx_dma_trace_t *);
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, t->device_index);
+ uword indent = format_get_indent (s);
+ vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);
+
+ s = format (s, "%U tx queue %d",
+ format_vnet_sw_interface_name, vnm, sw,
+ t->queue_index);
+
+ s = format (s, "\n%Ubuffer 0x%x: %U",
+ format_white_space, indent,
+ t->buffer_index,
+ format_vlib_buffer, &t->buffer);
+
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_ethernet_header_with_length, t->buffer.pre_data,
+ sizeof (t->buffer.pre_data));
+
+ return s;
+}
+
+static void dpdk_clear_hw_interface_counters (u32 instance)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, instance);
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (xd->admin_up != 0xff)
+ {
+ rte_eth_stats_reset (xd->device_index);
+ memset (&xd->last_stats, 0, sizeof (xd->last_stats));
+ dpdk_update_counters (xd, vlib_time_now (dm->vlib_main));
+ }
+ else
+ {
+ rte_eth_stats_reset (xd->device_index);
+ memset(&xd->stats, 0, sizeof(xd->stats));
+ memset (&xd->last_stats, 0, sizeof (xd->last_stats));
+ }
+ rte_eth_xstats_reset(xd->device_index);
+}
+
+static int
+kni_config_network_if(u8 port_id, u8 if_up)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ uword *p;
+
+ p = hash_get (dm->dpdk_device_by_kni_port_id, port_id);
+ if (p == 0) {
+ clib_warning("unknown interface");
+ return 0;
+ } else {
+ xd = vec_elt_at_index (dm->devices, p[0]);
+ }
+
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index,
+ if_up ? VNET_HW_INTERFACE_FLAG_LINK_UP |
+ ETH_LINK_FULL_DUPLEX : 0);
+ return 0;
+}
+
+static int
+kni_change_mtu(u8 port_id, unsigned new_mtu)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ uword *p;
+ vnet_hw_interface_t * hif;
+
+ p = hash_get (dm->dpdk_device_by_kni_port_id, port_id);
+ if (p == 0) {
+ clib_warning("unknown interface");
+ return 0;
+ } else {
+ xd = vec_elt_at_index (dm->devices, p[0]);
+ }
+ hif = vnet_get_hw_interface (vnm, xd->vlib_hw_if_index);
+
+ hif->max_packet_bytes = new_mtu;
+
+ return 0;
+}
+
+static clib_error_t *
+dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ vnet_hw_interface_t * hif = vnet_get_hw_interface (vnm, hw_if_index);
+ uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, hif->dev_instance);
+ int rv = 0;
+
+ if (xd->dev_type == VNET_DPDK_DEV_KNI)
+ {
+ if (is_up)
+ {
+ struct rte_kni_conf conf;
+ struct rte_kni_ops ops;
+ vlib_main_t * vm = vlib_get_main();
+ vlib_buffer_main_t * bm = vm->buffer_main;
+ memset(&conf, 0, sizeof(conf));
+ snprintf(conf.name, RTE_KNI_NAMESIZE, "vpp%u", xd->kni_port_id);
+ conf.mbuf_size = MBUF_SIZE;
+ memset(&ops, 0, sizeof(ops));
+ ops.port_id = xd->kni_port_id;
+ ops.change_mtu = kni_change_mtu;
+ ops.config_network_if = kni_config_network_if;
+
+ xd->kni = rte_kni_alloc(bm->pktmbuf_pools[rte_socket_id()], &conf, &ops);
+ if (!xd->kni)
+ {
+ clib_warning("failed to allocate kni interface");
+ }
+ else
+ {
+ hif->max_packet_bytes = 1500; /* kni interface default value */
+ xd->admin_up = 1;
+ }
+ }
+ else
+ {
+ xd->admin_up = 0;
+ rte_kni_release(xd->kni);
+ }
+ return 0;
+ }
+ if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
+ {
+ if (is_up)
+ {
+ if (xd->vu_is_running)
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP |
+ ETH_LINK_FULL_DUPLEX );
+ xd->admin_up = 1;
+ }
+ else
+ {
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
+ xd->admin_up = 0;
+ }
+
+ return 0;
+ }
+
+
+ if (is_up)
+ {
+ f64 now = vlib_time_now (dm->vlib_main);
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (xd->admin_up == 0)
+ rv = rte_eth_dev_start (xd->device_index);
+
+ if (xd->promisc)
+ rte_eth_promiscuous_enable(xd->device_index);
+ else
+ rte_eth_promiscuous_disable(xd->device_index);
+
+ rte_eth_allmulticast_enable (xd->device_index);
+ xd->admin_up = 1;
+ dpdk_update_counters (xd, now);
+ dpdk_update_link_state (xd, now);
+ }
+ else
+ {
+ rte_eth_allmulticast_disable (xd->device_index);
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (xd->pmd != VNET_DPDK_PMD_VMXNET3)
+ {
+ rte_eth_dev_stop (xd->device_index);
+ xd->admin_up = 0;
+ }
+ else
+ xd->admin_up = ~0;
+ }
+
+ if (rv < 0)
+ clib_warning ("rte_eth_dev_%s error: %d", is_up ? "start" : "stop",
+ rv);
+
+ return /* no error */ 0;
+}
+
+/*
+ * Dynamically redirect all pkts from a specific interface
+ * to the specified node
+ */
+static void dpdk_set_interface_next_node (vnet_main_t *vnm, u32 hw_if_index,
+ u32 node_index)
+{
+ dpdk_main_t * xm = &dpdk_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ dpdk_device_t * xd = vec_elt_at_index (xm->devices, hw->dev_instance);
+
+ /* Shut off redirection */
+ if (node_index == ~0)
+ {
+ xd->per_interface_next_index = node_index;
+ return;
+ }
+
+ xd->per_interface_next_index =
+ vlib_node_add_next (xm->vlib_main, dpdk_input_node.index, node_index);
+}
+
+
+static clib_error_t *
+dpdk_subif_add_del_function (vnet_main_t * vnm,
+ u32 hw_if_index,
+ struct vnet_sw_interface_t * st,
+ int is_add)
+{
+ dpdk_main_t * xm = &dpdk_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ dpdk_device_t * xd = vec_elt_at_index (xm->devices, hw->dev_instance);
+ vnet_sw_interface_t * t = (vnet_sw_interface_t *) st;
+ int r, vlan_offload;
+
+
+ if (xd->dev_type != VNET_DPDK_DEV_ETH)
+ return 0;
+ /* currently we program VLANS only for IXGBE VF */
+ if (xd->pmd != VNET_DPDK_PMD_IXGBEVF)
+ return 0;
+
+ if (t->sub.eth.flags.no_tags == 1)
+ return 0;
+
+ if ((t->sub.eth.flags.one_tag != 1) || (t->sub.eth.flags.exact_match != 1 ))
+ return clib_error_return (0, "unsupported VLAN setup");
+
+
+ vlan_offload = rte_eth_dev_get_vlan_offload(xd->device_index);
+ vlan_offload |= ETH_VLAN_FILTER_OFFLOAD;
+
+ if ((r = rte_eth_dev_set_vlan_offload(xd->device_index, vlan_offload)))
+ return clib_error_return (0, "rte_eth_dev_set_vlan_offload[%d]: err %d",
+ xd->device_index, r);
+
+
+ if ((r = rte_eth_dev_vlan_filter(xd->device_index, t->sub.eth.outer_vlan_id, is_add)))
+ return clib_error_return (0, "rte_eth_dev_vlan_filter[%d]: err %d",
+ xd->device_index, r);
+
+ return 0;
+}
+
+VNET_DEVICE_CLASS (dpdk_device_class) = {
+ .name = "dpdk",
+ .tx_function = dpdk_interface_tx,
+ .tx_function_n_errors = DPDK_TX_FUNC_N_ERROR,
+ .tx_function_error_strings = dpdk_tx_func_error_strings,
+ .format_device_name = format_dpdk_device_name,
+ .format_device = format_dpdk_device,
+ .format_tx_trace = format_dpdk_tx_dma_trace,
+ .clear_counters = dpdk_clear_hw_interface_counters,
+ .admin_up_down_function = dpdk_interface_admin_up_down,
+ .subif_add_del_function = dpdk_subif_add_del_function,
+ .rx_redirect_to_node = dpdk_set_interface_next_node,
+ .no_flatten_output_chains = 1,
+ .name_renumber = dpdk_device_renumber,
+};
+
+void dpdk_set_flowcontrol_callback (vlib_main_t *vm,
+ dpdk_flowcontrol_callback_t callback)
+{
+ dpdk_main.flowcontrol_callback = callback;
+}
+
+#define UP_DOWN_FLAG_EVENT 1
+
+
+u32 dpdk_get_admin_up_down_in_progress (void)
+{
+ return dpdk_main.admin_up_down_in_progress;
+}
+
+static uword
+admin_up_down_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt,
+ vlib_frame_t * f)
+{
+ clib_error_t * error = 0;
+ uword event_type;
+ uword *event_data = 0;
+ u32 index;
+ u32 sw_if_index;
+ u32 flags;
+
+ while (1)
+ {
+ vlib_process_wait_for_event (vm);
+
+ event_type = vlib_process_get_events (vm, &event_data);
+
+ dpdk_main.admin_up_down_in_progress = 1;
+
+ for (index=0; index<vec_len(event_data); index++)
+ {
+ sw_if_index = event_data[index] >> 32;
+ flags = (u32) event_data[index];
+
+ switch (event_type) {
+ case UP_DOWN_FLAG_EVENT:
+ error = vnet_sw_interface_set_flags (vnet_get_main(), sw_if_index, flags);
+ clib_error_report(error);
+ break;
+ }
+ }
+
+ vec_reset_length (event_data);
+
+ dpdk_main.admin_up_down_in_progress = 0;
+
+ }
+ return 0; /* or not */
+}
+
+VLIB_REGISTER_NODE (admin_up_down_process_node,static) = {
+ .function = admin_up_down_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "admin-up-down-process",
+    .process_log2_n_stack_bytes = 17, // 128KB
+};
+
+/*
+ * Asynchronously invoke vnet_sw_interface_set_flags via the admin_up_down
+ * process. Useful for avoiding long blocking delays (>150ms) in the dpdk
+ * drivers.
+ * WARNING: when posting this event, no other interface-related calls should
+ * be made (e.g. vnet_create_sw_interface()) while the event is being
+ * processed (admin_up_down_in_progress). This is required in order to avoid
+ * race conditions in manipulating interface data structures.
+ */
+void post_sw_interface_set_flags (vlib_main_t *vm, u32 sw_if_index, u32 flags)
+{
+ vlib_process_signal_event
+ (vm, admin_up_down_process_node.index,
+ UP_DOWN_FLAG_EVENT,
+ (((uword)sw_if_index << 32) | flags));
+}
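+
+/*
+ * Illustrative usage (a sketch, not part of the driver API): a caller that
+ * wants to bring an interface up without blocking on a slow driver start
+ * posts the flags instead of calling vnet_sw_interface_set_flags directly.
+ * "my_sw_if_index" below is a hypothetical variable.
+ *
+ *   post_sw_interface_set_flags (vm, my_sw_if_index,
+ *                                VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+ */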
+
+/*
+ * Called by the dpdk driver's rte_delay_us() function.
+ * Return 0 to have the dpdk do a regular delay loop.
+ * Return 1 to skip the delay loop because we are suspending
+ * the calling vlib process instead.
+ */
+int rte_delay_us_override (unsigned us) {
+ vlib_main_t * vm;
+
+ /* Don't bother intercepting for short delays */
+ if (us < 10) return 0;
+
+ /*
+ * Only intercept if we are in a vlib process.
+ * If we are called from a vlib worker thread or the vlib main
+ * thread then do not intercept. (Must not be called from an
+ * independent pthread).
+ */
+ if (os_get_cpu_number() == 0)
+ {
+ /*
+ * We're in the vlib main thread or a vlib process. Make sure
+ * the process is running and we're not still initializing.
+ */
+ vm = vlib_get_main();
+ if (vlib_in_process_context(vm))
+ {
+ /* Only suspend for the admin_down_process */
+ vlib_process_t * proc = vlib_get_current_process(vm);
+ if (!(proc->flags & VLIB_PROCESS_IS_RUNNING) ||
+ (proc->node_runtime.function != admin_up_down_process))
+ return 0;
+
+ f64 delay = 1e-6 * us;
+ vlib_process_suspend(vm, delay);
+ return 1;
+ }
+ }
+ return 0; // no override
+}
diff --git a/vnet/vnet/devices/dpdk/dpdk.h b/vnet/vnet/devices/dpdk/dpdk.h
new file mode 100644
index 00000000000..fd984e4d4df
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/dpdk.h
@@ -0,0 +1,515 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_dpdk_h__
+#define __included_dpdk_h__
+
+/* $$$$ We should rename always_inline -> clib_always_inline */
+#undef always_inline
+
+#include <rte_config.h>
+
+#include <rte_common.h>
+#include <rte_dev.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_memcpy.h>
+#include <rte_memzone.h>
+#include <rte_tailq.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_atomic.h>
+#include <rte_cycles.h>
+#include <rte_prefetch.h>
+#include <rte_lcore.h>
+#include <rte_per_lcore.h>
+#include <rte_branch_prediction.h>
+#include <rte_interrupts.h>
+#include <rte_pci.h>
+#include <rte_random.h>
+#include <rte_debug.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_kni.h>
+#include <rte_virtio_net.h>
+#include <rte_pci_dev_ids.h>
+#include <rte_version.h>
+
+#include <vnet/unix/pcap.h>
+#include <vnet/devices/virtio/vhost-user.h>
+
+#if CLIB_DEBUG > 0
+#define always_inline static inline
+#else
+#define always_inline static inline __attribute__ ((__always_inline__))
+#endif
+
+#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
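+/* Descriptive note: MBUF_SIZE is the per-element size handed to the mbuf
+   pools -- 2048 usable data bytes, plus the rte_mbuf header itself, plus
+   the RTE_PKTMBUF_HEADROOM reserved in front of the packet data. */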
+#define NB_MBUF (32<<10)
+
+vnet_device_class_t dpdk_device_class;
+vlib_node_registration_t dpdk_input_node;
+vlib_node_registration_t dpdk_io_input_node;
+vlib_node_registration_t handoff_dispatch_node;
+
+typedef enum {
+ VNET_DPDK_DEV_ETH = 1, /* Standard DPDK PMD driver */
+ VNET_DPDK_DEV_KNI, /* Kernel NIC Interface */
+ VNET_DPDK_DEV_VHOST_USER,
+ VNET_DPDK_DEV_UNKNOWN, /* must be last */
+} dpdk_device_type_t;
+
+#define foreach_dpdk_pmd \
+ _ ("rte_em_pmd", E1000EM) \
+ _ ("rte_igb_pmd", IGB) \
+ _ ("rte_igbvf_pmd", IGBVF) \
+ _ ("rte_ixgbe_pmd", IXGBE) \
+ _ ("rte_ixgbevf_pmd", IXGBEVF) \
+ _ ("rte_i40e_pmd", I40E) \
+ _ ("rte_i40evf_pmd", I40EVF) \
+ _ ("rte_virtio_pmd", VIRTIO) \
+ _ ("rte_vice_pmd", VICE) \
+ _ ("rte_enic_pmd", ENIC) \
+ _ ("rte_vmxnet3_pmd", VMXNET3) \
+ _ ("AF_PACKET PMD", AF_PACKET) \
+ _ ("rte_pmd_fm10k", FM10K)
+
+typedef enum {
+ VNET_DPDK_PMD_NONE,
+#define _(s,f) VNET_DPDK_PMD_##f,
+ foreach_dpdk_pmd
+#undef _
+#ifdef NETMAP
+ VNET_DPDK_PMD_NETMAP,
+#endif
+ VNET_DPDK_PMD_UNKNOWN, /* must be last */
+} dpdk_pmd_t;
+
+typedef enum {
+ VNET_DPDK_PORT_TYPE_ETH_1G,
+ VNET_DPDK_PORT_TYPE_ETH_10G,
+ VNET_DPDK_PORT_TYPE_ETH_40G,
+ VNET_DPDK_PORT_TYPE_ETH_SWITCH,
+#ifdef NETMAP
+ VNET_DPDK_PORT_TYPE_NETMAP,
+#endif
+ VNET_DPDK_PORT_TYPE_AF_PACKET,
+ VNET_DPDK_PORT_TYPE_UNKNOWN,
+} dpdk_port_type_t;
+
+typedef struct {
+ f64 deadline;
+ vlib_frame_t * frame;
+} dpdk_frame_t;
+
+#define DPDK_EFD_MAX_DISCARD_RATE 10
+
+typedef struct {
+ u16 last_burst_sz;
+ u16 max_burst_sz;
+ u32 full_frames_cnt;
+ u32 consec_full_frames_cnt;
+ u32 congestion_cnt;
+ u64 last_poll_time;
+ u64 max_poll_delay;
+ u32 discard_cnt;
+ u32 total_packet_cnt;
+} dpdk_efd_agent_t;
+
+typedef struct {
+ int callfd;
+ int kickfd;
+ int errfd;
+ u32 callfd_idx;
+ u32 n_since_last_int;
+ f64 int_deadline;
+} dpdk_vu_vring;
+
+typedef struct {
+ u32 is_up;
+ u32 unix_fd;
+ u32 unix_file_index;
+ u32 client_fd;
+ char sock_filename[256];
+ int sock_errno;
+ u8 sock_is_server;
+ u8 active;
+
+ u64 feature_mask;
+ u32 num_vrings;
+ dpdk_vu_vring vrings[2];
+ u64 region_addr[VHOST_MEMORY_MAX_NREGIONS];
+ u32 region_fd[VHOST_MEMORY_MAX_NREGIONS];
+} dpdk_vu_intf_t;
+
+typedef void (*dpdk_flowcontrol_callback_t) (vlib_main_t *vm,
+ u32 hw_if_index,
+ u32 n_packets);
+
+/*
+ * The header for the tx_vector in dpdk_device_t.
+ * Head and tail are indexes into the tx_vector and are of type
+ * u64 so they never overflow.
+ */
+typedef struct {
+ u64 tx_head;
+ u64 tx_tail;
+} tx_ring_hdr_t;
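+
+/*
+ * Descriptive note: because tx_head and tx_tail are free-running counters,
+ * the number of entries currently on the ring is simply
+ * (tx_head - tx_tail); a slot index would typically be obtained by taking
+ * the counter modulo the ring size (see DPDK_TX_RING_SIZE below).
+ */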
+
+typedef struct {
+ CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
+ volatile u32 *lockp;
+
+ /* Instance ID */
+ u32 device_index;
+
+ u32 vlib_hw_if_index;
+ u32 vlib_sw_if_index;
+
+ /* next node index if we decide to steal the rx graph arc */
+ u32 per_interface_next_index;
+
+ /* dpdk rte_mbuf rx and tx vectors, VLIB_FRAME_SIZE */
+ struct rte_mbuf *** tx_vectors; /* one per worker thread */
+ struct rte_mbuf *** rx_vectors;
+
+ /* vector of traced contexts, per device */
+ u32 * d_trace_buffers;
+
+ /* per-worker destination frame queue */
+ dpdk_frame_t * frames;
+
+ dpdk_device_type_t dev_type:8;
+ dpdk_pmd_t pmd:8;
+ i8 cpu_socket;
+
+ u8 admin_up;
+ u8 promisc;
+
+ CLIB_CACHE_LINE_ALIGN_MARK(cacheline1);
+
+ /* PMD related */
+ u16 tx_q_used;
+ u16 rx_q_used;
+ u16 nb_rx_desc;
+ u16 nb_tx_desc;
+ u16 * cpu_socket_id_by_queue;
+ struct rte_eth_conf port_conf;
+ struct rte_eth_txconf tx_conf;
+
+ /* KNI related */
+ struct rte_kni *kni;
+ u8 kni_port_id;
+
+ /* vhost-user related */
+ u32 vu_if_id;
+ struct virtio_net vu_vhost_dev;
+ u32 vu_is_running;
+ dpdk_vu_intf_t *vu_intf;
+
+ /* af_packet */
+ u8 af_packet_port_id;
+
+ struct rte_eth_link link;
+ f64 time_last_link_update;
+
+ struct rte_eth_stats stats;
+ struct rte_eth_stats last_stats;
+ struct rte_eth_xstats * xstats;
+ f64 time_last_stats_update;
+ dpdk_port_type_t port_type;
+
+ dpdk_efd_agent_t efd_agent;
+} dpdk_device_t;
+
+#define MAX_NELTS 32
+typedef struct {
+ CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
+ u64 head;
+ u64 head_hint;
+ u64 tail;
+ u32 n_in_use;
+ u32 nelts;
+ u32 written;
+ u32 threshold;
+ i32 n_vectors[MAX_NELTS];
+} frame_queue_trace_t;
+
+#define DPDK_TX_RING_SIZE (4 * 1024)
+
+#define DPDK_STATS_POLL_INTERVAL 10.0
+#define DPDK_LINK_POLL_INTERVAL 3.0
+
+typedef struct {
+ CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
+
+ /* total input packet counter */
+ u64 aggregate_rx_packets;
+} dpdk_worker_t;
+
+typedef struct {
+ u32 device;
+ u16 queue_id;
+} dpdk_device_and_queue_t;
+
+/* Early-Fast-Discard (EFD) */
+#define DPDK_EFD_DISABLED 0
+#define DPDK_EFD_DISCARD_ENABLED (1 << 0)
+#define DPDK_EFD_MONITOR_ENABLED (1 << 1)
+#define DPDK_EFD_DROPALL_ENABLED (1 << 2)
+
+#define DPDK_EFD_DEFAULT_DEVICE_QUEUE_HI_THRESH_PCT 90
+#define DPDK_EFD_DEFAULT_CONSEC_FULL_FRAMES_HI_THRESH 6
+
+typedef struct dpdk_efd_t {
+ u16 enabled;
+ u16 queue_hi_thresh;
+ u16 consec_full_frames_hi_thresh;
+ u16 pad;
+} dpdk_efd_t;
+
+typedef struct {
+
+ /* Devices */
+ dpdk_device_t * devices;
+ dpdk_device_and_queue_t ** devices_by_cpu;
+
+ /* per-thread recycle lists */
+ u32 ** recycle;
+
+ /* flow control callback. If 0 then flow control is disabled */
+ dpdk_flowcontrol_callback_t flowcontrol_callback;
+
+ /* vlib buffer free list, must be same size as an rte_mbuf */
+ u32 vlib_buffer_free_list_index;
+
+ /*
+   * format interface names a la xxxEthernet%d/%d/%d instead of
+ * xxxEthernet%x/%x/%x. For VIRL.
+ */
+ u8 interface_name_format_decimal;
+
+
+ /* dpdk worker "threads" */
+ dpdk_worker_t * workers;
+
+ /* Config stuff */
+ u8 ** eal_init_args;
+ u8 * eth_if_blacklist;
+ u8 * eth_if_whitelist;
+ u8 * uio_driver_name;
+ u8 no_multi_seg;
+
+ /* Required config parameters */
+ u8 coremask_set_manually;
+ u8 nchannels_set_manually;
+ u32 coremask;
+ u32 nchannels;
+ u32 num_mbufs;
+ u32 use_rss;
+ u8 num_kni; /* while kni_init allows u32, port_id in callback fn is only u8 */
+
+ /* Ethernet input node index */
+ u32 ethernet_input_node_index;
+
+ /* dpdk i/o thread initialization barrier */
+ volatile u32 io_thread_release;
+
+ /* pcap tracing [only works if (CLIB_DEBUG > 0)] */
+ int tx_pcap_enable;
+ pcap_main_t pcap_main;
+ u8 * pcap_filename;
+ u32 pcap_sw_if_index;
+ u32 pcap_pkts_to_capture;
+
+ /* virtio vhost-user switch */
+ u8 use_virtio_vhost;
+
+  /* vhost-user coalesce frames config */
+ u32 vhost_coalesce_frames;
+ f64 vhost_coalesce_time;
+
+ /* hashes */
+ uword * dpdk_device_by_kni_port_id;
+ uword * vu_sw_if_index_by_listener_fd;
+ uword * vu_sw_if_index_by_sock_fd;
+ u32 * vu_inactive_interfaces_device_index;
+
+ u32 next_vu_if_id;
+
+ /* efd (early-fast-discard) settings */
+ dpdk_efd_t efd;
+
+ /*
+ * flag indicating that a posted admin up/down
+ * (via post_sw_interface_set_flags) is in progress
+ */
+ u8 admin_up_down_in_progress;
+
+ u8 have_io_threads;
+
+ /* which cpus are running dpdk-input */
+ int input_cpu_first_index;
+ int input_cpu_count;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} dpdk_main_t;
+
+dpdk_main_t dpdk_main;
+
+typedef enum {
+ DPDK_RX_NEXT_IP4_INPUT,
+ DPDK_RX_NEXT_IP6_INPUT,
+ DPDK_RX_NEXT_MPLS_INPUT,
+ DPDK_RX_NEXT_ETHERNET_INPUT,
+ DPDK_RX_NEXT_DROP,
+ DPDK_RX_N_NEXT,
+} dpdk_rx_next_t;
+
+void vnet_buffer_needs_dpdk_mb (vlib_buffer_t * b);
+
+void dpdk_set_next_node (dpdk_rx_next_t, char *);
+
+typedef void (*dpdk_io_thread_callback_t) (vlib_main_t *vm);
+
+void dpdk_io_thread (vlib_worker_thread_t * w,
+ u32 instances,
+ u32 instance_id,
+ char *worker_name,
+ dpdk_io_thread_callback_t callback);
+void dpdk_thread_input (dpdk_main_t * dm, dpdk_device_t * xd);
+
+clib_error_t * dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd);
+
+void dpdk_set_flowcontrol_callback (vlib_main_t *vm,
+ dpdk_flowcontrol_callback_t callback);
+
+u32 dpdk_interface_tx_vector (vlib_main_t * vm, u32 dev_instance);
+
+vlib_frame_queue_elt_t * vlib_get_handoff_queue_elt (u32 vlib_worker_index);
+
+u32 dpdk_get_handoff_node_index (void);
+
+void set_efd_bitmap (u8 *bitmap, u32 value, u32 op);
+
+#define foreach_dpdk_error \
+ _(NONE, "no error") \
+ _(RX_PACKET_ERROR, "Rx packet errors") \
+ _(RX_BAD_FCS, "Rx bad fcs") \
+ _(L4_CHECKSUM_ERROR, "Rx L4 checksum errors") \
+ _(IP_CHECKSUM_ERROR, "Rx ip checksum errors") \
+ _(RX_ALLOC_FAIL, "rx buf alloc from free list failed") \
+ _(RX_ALLOC_NO_PHYSMEM, "rx buf alloc failed no physmem") \
+ _(RX_ALLOC_DROP_PKTS, "rx packets dropped due to alloc error") \
+ _(IPV4_EFD_DROP_PKTS, "IPV4 Early Fast Discard rx drops") \
+ _(IPV6_EFD_DROP_PKTS, "IPV6 Early Fast Discard rx drops") \
+ _(MPLS_EFD_DROP_PKTS, "MPLS Early Fast Discard rx drops") \
+ _(VLAN_EFD_DROP_PKTS, "VLAN Early Fast Discard rx drops")
+
+typedef enum {
+#define _(f,s) DPDK_ERROR_##f,
+ foreach_dpdk_error
+#undef _
+ DPDK_N_ERROR,
+} dpdk_error_t;
+
+/*
+ * Increment EFD drop counter
+ */
+static_always_inline
+void increment_efd_drop_counter (vlib_main_t * vm, u32 counter_index, u32 count)
+{
+ vlib_node_t *my_n;
+
+ my_n = vlib_get_node (vm, dpdk_input_node.index);
+ vm->error_main.counters[my_n->error_heap_index+counter_index] += count;
+}
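+
+/* Descriptive note: counter_index is expected to be one of the
+   dpdk_error_t values above; the counter being bumped lives in the
+   dpdk-input node's slice of the vlib error counter heap. */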
+
+void dpdk_update_link_state (dpdk_device_t * xd, f64 now);
+void dpdk_efd_update_counters(dpdk_device_t *xd, u32 n_buffers, u16 enabled);
+u32 is_efd_discardable(vlib_thread_main_t *tm,
+ vlib_buffer_t * b0,
+ struct rte_mbuf *mb);
+
+/* dpdk vhost-user interrupt management */
+u8 dpdk_vhost_user_want_interrupt (dpdk_device_t *xd, int idx);
+void dpdk_vhost_user_send_interrupt (vlib_main_t * vm, dpdk_device_t * xd,
+ int idx);
+
+
+static inline u64 vnet_get_aggregate_rx_packets (void)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ u64 sum = 0;
+ dpdk_worker_t * dw;
+
+ vec_foreach(dw, dm->workers)
+ sum += dw->aggregate_rx_packets;
+
+ return sum;
+}
+
+void dpdk_rx_trace (dpdk_main_t * dm,
+ vlib_node_runtime_t * node,
+ dpdk_device_t * xd,
+ u16 queue_id,
+ u32 * buffers,
+ uword n_buffers);
+
+#define EFD_OPERATION_LESS_THAN 0
+#define EFD_OPERATION_GREATER_OR_EQUAL 1
+
+void efd_config(u32 enabled,
+ u32 ip_prec, u32 ip_op,
+ u32 mpls_exp, u32 mpls_op,
+ u32 vlan_cos, u32 vlan_op);
+
+void post_sw_interface_set_flags (vlib_main_t *vm, u32 sw_if_index, u32 flags);
+
+typedef struct vhost_user_memory vhost_user_memory_t;
+
+void dpdk_vhost_user_process_init (void **ctx);
+void dpdk_vhost_user_process_cleanup (void *ctx);
+uword dpdk_vhost_user_process_if (vlib_main_t *vm, dpdk_device_t *xd, void *ctx);
+
+// vhost-user calls
+int dpdk_vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename,
+ u8 is_server,
+ u32 * sw_if_index,
+ u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance);
+int dpdk_vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename,
+ u8 is_server,
+ u32 sw_if_index,
+ u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance);
+int dpdk_vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm,
+ u32 sw_if_index);
+int dpdk_vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm,
+ vhost_user_intf_details_t **out_vuids);
+
+u32 dpdk_get_admin_up_down_in_progress (void);
+
+uword
+dpdk_input_rss (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f);
+
+#endif /* __included_dpdk_h__ */
diff --git a/vnet/vnet/devices/dpdk/dpdk_priv.h b/vnet/vnet/devices/dpdk/dpdk_priv.h
new file mode 100644
index 00000000000..e452e02d90d
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/dpdk_priv.h
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define DPDK_NB_RX_DESC_DEFAULT 512
+#define DPDK_NB_TX_DESC_DEFAULT 512
+#define DPDK_NB_RX_DESC_VIRTIO 256
+#define DPDK_NB_TX_DESC_VIRTIO 256
+#define DPDK_NB_RX_DESC_10GE 2048
+#define DPDK_NB_TX_DESC_10GE 2048
+#define DPDK_NB_RX_DESC_40GE (4096-128)
+#define DPDK_NB_TX_DESC_40GE 2048
+
+#define foreach_dpdk_counter \
+ _ (tx_frames_ok, opackets) \
+ _ (tx_bytes_ok, obytes) \
+ _ (tx_errors, oerrors) \
+ _ (tx_loopback_frames_ok, olbpackets) \
+ _ (tx_loopback_bytes_ok, olbbytes) \
+ _ (rx_frames_ok, ipackets) \
+ _ (rx_bytes_ok, ibytes) \
+ _ (rx_errors, ierrors) \
+ _ (rx_missed, imissed) \
+ _ (rx_bad_crc, ibadcrc) \
+ _ (rx_bad_length, ibadlen) \
+ _ (rx_multicast_frames_ok, imcasts) \
+ _ (rx_no_bufs, rx_nombuf) \
+ _ (rx_filter_match, fdirmatch) \
+ _ (rx_filter_miss, fdirmiss) \
+ _ (tx_pause_xon, tx_pause_xon) \
+ _ (rx_pause_xon, rx_pause_xon) \
+ _ (tx_pause_xoff, tx_pause_xoff) \
+ _ (rx_pause_xoff, rx_pause_xoff) \
+ _ (rx_loopback_frames_ok, ilbpackets) \
+ _ (rx_loopback_bytes_ok, ilbbytes)
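+
+/*
+ * Descriptive note: each entry pairs a display name (first column) with the
+ * matching struct rte_eth_stats field (second column); consumers of this
+ * x-macro, e.g. format_dpdk_device() in device.c, expand _(N, V) into
+ * xd->stats.V and print it under the name N.
+ */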
+
+#define foreach_dpdk_q_counter \
+ _ (rx_frames_ok, q_ipackets) \
+ _ (tx_frames_ok, q_opackets) \
+ _ (rx_bytes_ok, q_ibytes) \
+ _ (tx_bytes_ok, q_obytes) \
+ _ (rx_errors, q_errors)
+
+#define foreach_dpdk_rss_hf \
+ _(ETH_RSS_IPV4, "ipv4") \
+ _(ETH_RSS_FRAG_IPV4, "ipv4-frag") \
+ _(ETH_RSS_NONFRAG_IPV4_TCP, "ipv4-tcp") \
+ _(ETH_RSS_NONFRAG_IPV4_UDP, "ipv4-udp") \
+ _(ETH_RSS_NONFRAG_IPV4_SCTP, "ipv4-sctp") \
+ _(ETH_RSS_NONFRAG_IPV4_OTHER, "ipv4-other") \
+ _(ETH_RSS_IPV6, "ipv6") \
+ _(ETH_RSS_FRAG_IPV6, "ipv6-frag") \
+ _(ETH_RSS_NONFRAG_IPV6_TCP, "ipv6-tcp") \
+ _(ETH_RSS_NONFRAG_IPV6_UDP, "ipv6-udp") \
+ _(ETH_RSS_NONFRAG_IPV6_SCTP, "ipv6-sctp") \
+ _(ETH_RSS_NONFRAG_IPV6_OTHER, "ipv6-other") \
+ _(ETH_RSS_L2_PAYLOAD, "l2-payload") \
+ _(ETH_RSS_IPV6_EX, "ipv6-ex") \
+ _(ETH_RSS_IPV6_TCP_EX, "ipv6-tcp-ex") \
+ _(ETH_RSS_IPV6_UDP_EX, "ipv6-udp-ex")
+
+#define foreach_dpdk_rx_offload_caps \
+ _(DEV_RX_OFFLOAD_VLAN_STRIP, "vlan-strip") \
+ _(DEV_RX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \
+ _(DEV_RX_OFFLOAD_UDP_CKSUM , "udp-cksum") \
+ _(DEV_RX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \
+  _(DEV_RX_OFFLOAD_TCP_LRO , "tcp-lro") \
+ _(DEV_RX_OFFLOAD_QINQ_STRIP, "qinq-strip")
+
+#define foreach_dpdk_tx_offload_caps \
+ _(DEV_TX_OFFLOAD_VLAN_INSERT, "vlan-insert") \
+ _(DEV_TX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \
+ _(DEV_TX_OFFLOAD_UDP_CKSUM , "udp-cksum") \
+ _(DEV_TX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \
+ _(DEV_TX_OFFLOAD_SCTP_CKSUM , "sctp-cksum") \
+ _(DEV_TX_OFFLOAD_TCP_TSO , "tcp-tso") \
+ _(DEV_TX_OFFLOAD_UDP_TSO , "udp-tso") \
+ _(DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM, "outer-ipv4-cksum") \
+ _(DEV_TX_OFFLOAD_QINQ_INSERT, "qinq-insert")
+
+#if RTE_VERSION >= RTE_VERSION_NUM(2, 1, 0, 0)
+
+#define foreach_dpdk_pkt_rx_offload_flag \
+  _ (PKT_RX_VLAN_PKT, "RX packet is an 802.1q VLAN packet") \
+ _ (PKT_RX_RSS_HASH, "RX packet with RSS hash result") \
+ _ (PKT_RX_FDIR, "RX packet with FDIR infos") \
+ _ (PKT_RX_L4_CKSUM_BAD, "L4 cksum of RX pkt. is not OK") \
+ _ (PKT_RX_IP_CKSUM_BAD, "IP cksum of RX pkt. is not OK") \
+ _ (PKT_RX_IEEE1588_PTP, "RX IEEE1588 L2 Ethernet PT Packet") \
+ _ (PKT_RX_IEEE1588_TMST, "RX IEEE1588 L2/L4 timestamped packet")
+
+#define foreach_dpdk_pkt_type \
+ _ (RTE_PTYPE_L3_IPV4, "Packet with IPv4 header") \
+ _ (RTE_PTYPE_L3_IPV4_EXT, "Packet with extended IPv4 header") \
+ _ (RTE_PTYPE_L3_IPV6, "Packet with IPv6 header") \
+ _ (RTE_PTYPE_L3_IPV6_EXT, "Packet with extended IPv6 header")
+#else
+#define foreach_dpdk_pkt_rx_offload_flag \
+  _ (PKT_RX_VLAN_PKT, "RX packet is an 802.1q VLAN packet") \
+ _ (PKT_RX_RSS_HASH, "RX packet with RSS hash result") \
+ _ (PKT_RX_FDIR, "RX packet with FDIR infos") \
+ _ (PKT_RX_L4_CKSUM_BAD, "L4 cksum of RX pkt. is not OK") \
+ _ (PKT_RX_IP_CKSUM_BAD, "IP cksum of RX pkt. is not OK") \
+ _ (PKT_RX_IPV4_HDR, "RX packet with IPv4 header") \
+ _ (PKT_RX_IPV4_HDR_EXT, "RX packet with extended IPv4 header") \
+ _ (PKT_RX_IPV6_HDR, "RX packet with IPv6 header") \
+ _ (PKT_RX_IPV6_HDR_EXT, "RX packet with extended IPv6 header") \
+ _ (PKT_RX_IEEE1588_PTP, "RX IEEE1588 L2 Ethernet PT Packet") \
+ _ (PKT_RX_IEEE1588_TMST, "RX IEEE1588 L2/L4 timestamped packet")
+
+#define foreach_dpdk_pkt_type /* Dummy */
+#endif /* RTE_VERSION */
+
+#define foreach_dpdk_pkt_tx_offload_flag \
+  _ (PKT_TX_VLAN_PKT, "TX packet is an 802.1q VLAN packet") \
+ _ (PKT_TX_IP_CKSUM, "IP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_TCP_CKSUM, "TCP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_SCTP_CKSUM, "SCTP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_IEEE1588_TMST, "TX IEEE1588 packet to timestamp")
+
+#define foreach_dpdk_pkt_offload_flag \
+ foreach_dpdk_pkt_rx_offload_flag \
+ foreach_dpdk_pkt_tx_offload_flag
+
+static inline u8 * format_dpdk_pkt_types (u8 * s, va_list * va)
+{
+ u32 *pkt_types = va_arg (*va, u32 *);
+ uword indent __attribute__((unused)) = format_get_indent (s) + 2;
+
+ if (!*pkt_types)
+ return s;
+
+ s = format (s, "Packet Types");
+
+#define _(F, S) \
+ if (*pkt_types & F) \
+ { \
+ s = format (s, "\n%U%s (0x%04x) %s", \
+ format_white_space, indent, #F, F, S); \
+ }
+
+ foreach_dpdk_pkt_type
+
+#undef _
+
+ return s;
+}
+
+static inline u8 * format_dpdk_pkt_offload_flags (u8 * s, va_list * va)
+{
+ u16 *ol_flags = va_arg (*va, u16 *);
+ uword indent = format_get_indent (s) + 2;
+
+ if (!*ol_flags)
+ return s;
+
+ s = format (s, "Packet Offload Flags");
+
+#define _(F, S) \
+ if (*ol_flags & F) \
+ { \
+ s = format (s, "\n%U%s (0x%04x) %s", \
+ format_white_space, indent, #F, F, S); \
+ }
+
+ foreach_dpdk_pkt_offload_flag
+
+#undef _
+
+ return s;
+}
+
+static inline u8 * format_dpdk_rte_mbuf (u8 * s, va_list * va)
+{
+ struct rte_mbuf * mb = va_arg (*va, struct rte_mbuf *);
+ uword indent = format_get_indent (s) + 2;
+
+ s = format (s, "PKT MBUF: port %d, nb_segs %d, pkt_len %d"
+ "\n%Ubuf_len %d, data_len %d, ol_flags 0x%x,"
+ "\n%Upacket_type 0x%x",
+ mb->port, mb->nb_segs, mb->pkt_len,
+ format_white_space, indent,
+ mb->buf_len, mb->data_len, mb->ol_flags,
+ format_white_space, indent,
+ mb->packet_type);
+
+ if (mb->ol_flags)
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_dpdk_pkt_offload_flags, &mb->ol_flags);
+
+ if (mb->packet_type)
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_dpdk_pkt_types, &mb->packet_type);
+ return s;
+}
+
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+#define foreach_dpdk_pkt_ext_rx_offload_flag \
+ _ (PKT_EXT_RX_PKT_ERROR, "RX Packet Error") \
+ _ (PKT_EXT_RX_BAD_FCS, "RX Bad FCS checksum") \
+ _ (PKT_EXT_RX_UDP, "RX packet with UDP L4 header") \
+ _ (PKT_EXT_RX_TCP, "RX packet with TCP L4 header") \
+ _ (PKT_EXT_RX_IPV4_FRAGMENT, "RX packet IPv4 Fragment")
+
+#define foreach_dpdk_pkt_ext_offload_flag \
+ foreach_dpdk_pkt_rx_offload_flag \
+ foreach_dpdk_pkt_ext_rx_offload_flag
+
+static inline u8 * format_dpdk_pkt_rx_offload_flags (u8 * s, va_list * va)
+{
+ u16 *ol_flags = va_arg (*va, u16 *);
+ uword indent = format_get_indent (s) + 2;
+
+ if (!*ol_flags)
+ return s;
+
+ s = format (s, "Packet RX Offload Flags");
+
+#define _(F, S) \
+ if (*ol_flags & F) \
+ { \
+ s = format (s, "\n%U%s (0x%04x) %s", \
+ format_white_space, indent, #F, F, S); \
+ }
+
+ foreach_dpdk_pkt_ext_offload_flag
+
+#undef _
+
+ return s;
+}
+
+static inline u8 * format_dpdk_rx_rte_mbuf (u8 * s, va_list * va)
+{
+ struct rte_mbuf * mb = va_arg (*va, struct rte_mbuf *);
+ uword indent = format_get_indent (s) + 2;
+
+ /*
+ * Note: Assumes mb is head of pkt chain -- port, nb_segs, & pkt_len
+ * are only valid for the 1st mbuf segment.
+ */
+ s = format (s, "PKT MBUF: port %d, nb_segs %d, pkt_len %d"
+ "\n%Ubuf_len %d, data_len %d, ol_flags 0x%x"
+ "\n%Upacket_type 0x%x",
+ mb->port, mb->nb_segs, mb->pkt_len,
+ format_white_space, indent,
+ mb->buf_len, mb->data_len, mb->ol_flags,
+ format_white_space, indent,
+ mb->packet_type);
+
+ if (mb->ol_flags)
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_dpdk_pkt_rx_offload_flags, &mb->ol_flags);
+
+ if (mb->packet_type)
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_dpdk_pkt_types, &mb->packet_type);
+ return s;
+}
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+
+/* These args appear by themselves */
+#define foreach_eal_double_hyphen_predicate_arg \
+_(no-shconf) \
+_(no-hpet) \
+_(no-pci) \
+_(no-huge) \
+_(vmware-tsc-map) \
+_(virtio-vhost)
+
+#define foreach_eal_single_hyphen_mandatory_arg \
+_(coremask, c) \
+_(nchannels, n)
+
+#define foreach_eal_single_hyphen_arg \
+_(blacklist, b) \
+_(mem-alloc-request, m) \
+_(force-ranks, r)
+
+/* These args are preceded by "--" and followed by a single string */
+#define foreach_eal_double_hyphen_arg \
+_(huge-dir) \
+_(proc-type) \
+_(file-prefix) \
+_(socket-mem) \
+_(vdev)
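+
+/*
+ * For illustration (the exact strings are produced by the dpdk config
+ * handling, not shown in this hunk): predicate args become bare flags such
+ * as "--no-huge", while the args above take a value, e.g.
+ * "--socket-mem 1024,1024" or "--huge-dir /run/vpp/hugepages".
+ */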
+
+static inline u32
+dpdk_rx_burst ( dpdk_main_t * dm, dpdk_device_t * xd, u16 queue_id)
+{
+ u32 n_buffers;
+ u32 n_left;
+ u32 n_this_chunk;
+
+ n_left = VLIB_FRAME_SIZE;
+ n_buffers = 0;
+
+ if (PREDICT_TRUE(xd->dev_type == VNET_DPDK_DEV_ETH))
+ {
+ while (n_left)
+ {
+ n_this_chunk = rte_eth_rx_burst (xd->device_index, queue_id,
+ xd->rx_vectors[queue_id] + n_buffers, n_left);
+ n_buffers += n_this_chunk;
+ n_left -= n_this_chunk;
+
+ /* Empirically, DPDK r1.8 produces vectors w/ 32 or fewer elts */
+ if (n_this_chunk < 32)
+ break;
+ }
+ }
+ else if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
+ {
+ vlib_main_t * vm = vlib_get_main();
+ vlib_buffer_main_t * bm = vm->buffer_main;
+ unsigned socket_id = rte_socket_id();
+
+ if (PREDICT_FALSE(!xd->vu_is_running))
+ return 0;
+
+ n_buffers = rte_vhost_dequeue_burst(&xd->vu_vhost_dev, VIRTIO_TXQ,
+ bm->pktmbuf_pools[socket_id],
+ xd->rx_vectors[queue_id], VLIB_FRAME_SIZE);
+
+ f64 now = vlib_time_now (dm->vlib_main);
+
+ /* send pending interrupts if needed */
+ if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_TXQ)) {
+ dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_TXQ]);
+ vring->n_since_last_int += n_buffers;
+
+ if ((vring->n_since_last_int && (vring->int_deadline < now))
+ || (vring->n_since_last_int > dm->vhost_coalesce_frames))
+ dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_TXQ);
+ }
+
+ if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_RXQ)) {
+ dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_RXQ]);
+ if (vring->n_since_last_int && (vring->int_deadline < now))
+ dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_RXQ);
+ }
+
+ }
+ else if (xd->dev_type == VNET_DPDK_DEV_KNI)
+ {
+ n_buffers = rte_kni_rx_burst(xd->kni, xd->rx_vectors[queue_id], VLIB_FRAME_SIZE);
+ rte_kni_handle_request(xd->kni);
+ }
+ else
+ {
+ ASSERT(0);
+ }
+
+ return n_buffers;
+}
+
+
+static inline void
+dpdk_update_counters (dpdk_device_t * xd, f64 now)
+{
+ vlib_simple_counter_main_t * cm;
+ vnet_main_t * vnm = vnet_get_main();
+ u32 my_cpu = os_get_cpu_number();
+ u64 rxerrors, last_rxerrors;
+ int len;
+
+ /* only update counters for PMD interfaces */
+ if (xd->dev_type != VNET_DPDK_DEV_ETH)
+ return;
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (xd->admin_up != 0xff)
+ {
+ xd->time_last_stats_update = now ? now : xd->time_last_stats_update;
+ memcpy (&xd->last_stats, &xd->stats, sizeof (xd->last_stats));
+ rte_eth_stats_get (xd->device_index, &xd->stats);
+
+ /* maybe bump interface rx no buffer counter */
+ if (PREDICT_FALSE (xd->stats.rx_nombuf != xd->last_stats.rx_nombuf))
+ {
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_RX_NO_BUF);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+ xd->stats.rx_nombuf -
+ xd->last_stats.rx_nombuf);
+ }
+
+ /* missed pkt counter */
+ if (PREDICT_FALSE (xd->stats.imissed != xd->last_stats.imissed))
+ {
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_RX_MISS);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+ xd->stats.imissed -
+ xd->last_stats.imissed);
+ }
+ rxerrors = xd->stats.ibadcrc
+ + xd->stats.ibadlen + xd->stats.ierrors;
+ last_rxerrors = xd->last_stats.ibadcrc
+ + xd->last_stats.ibadlen + xd->last_stats.ierrors;
+
+ if (PREDICT_FALSE (rxerrors != last_rxerrors))
+ {
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_RX_ERROR);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+ rxerrors - last_rxerrors);
+ }
+ }
+
+ if ((len = rte_eth_xstats_get(xd->device_index, NULL, 0)) > 0)
+ {
+ vec_validate(xd->xstats, len - 1);
+ len = rte_eth_xstats_get(xd->device_index, xd->xstats, vec_len(xd->xstats));
+ ASSERT(vec_len(xd->xstats) == len);
+ _vec_len(xd->xstats) = len;
+ }
+}
diff --git a/vnet/vnet/devices/dpdk/init.c b/vnet/vnet/devices/dpdk/init.c
new file mode 100644
index 00000000000..a4b0f01475f
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/init.c
@@ -0,0 +1,1728 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vlib/unix/physmem.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "dpdk_priv.h"
+
+dpdk_main_t dpdk_main;
+
+/* force linker to link functions used by vlib and declared weak */
+void *vlib_weakly_linked_functions[] = {
+ &rte_pktmbuf_init,
+ &rte_pktmbuf_pool_init,
+};
+
+#define LINK_STATE_ELOGS 0
+
+#define DEFAULT_HUGE_DIR "/run/vpp/hugepages"
+#define VPP_RUN_DIR "/run/vpp"
+
+/* Port configuration, mildly modified Intel app values */
+
+static struct rte_eth_conf port_conf_template = {
+ .rxmode = {
+ .split_hdr_size = 0,
+ .header_split = 0, /**< Header Split disabled */
+ .hw_ip_checksum = 0, /**< IP checksum offload disabled */
+ .hw_vlan_filter = 0, /**< VLAN filtering disabled */
+ .hw_strip_crc = 1, /**< CRC stripped by hardware */
+ },
+ .txmode = {
+ .mq_mode = ETH_MQ_TX_NONE,
+ },
+};
+
+clib_error_t *
+dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd)
+{
+ vlib_main_t * vm = vlib_get_main();
+ vlib_buffer_main_t * bm = vm->buffer_main;
+ int rv;
+ int j;
+
+ ASSERT(os_get_cpu_number() == 0);
+
+ if (xd->admin_up) {
+ vnet_hw_interface_set_flags (dm->vnet_main, xd->vlib_hw_if_index, 0);
+ rte_eth_dev_stop (xd->device_index);
+ }
+
+ rv = rte_eth_dev_configure (xd->device_index, xd->rx_q_used,
+ xd->tx_q_used, &xd->port_conf);
+
+ if (rv < 0)
+ return clib_error_return (0, "rte_eth_dev_configure[%d]: err %d",
+ xd->device_index, rv);
+
+ /* Set up one TX-queue per worker thread */
+ for (j = 0; j < xd->tx_q_used; j++)
+ {
+ rv = rte_eth_tx_queue_setup(xd->device_index, j, xd->nb_tx_desc,
+ xd->cpu_socket, &xd->tx_conf);
+ if (rv < 0)
+ break;
+ }
+
+ if (rv < 0)
+ return clib_error_return (0, "rte_eth_tx_queue_setup[%d]: err %d",
+ xd->device_index, rv);
+
+ for (j = 0; j < xd->rx_q_used; j++)
+ {
+
+ rv = rte_eth_rx_queue_setup(xd->device_index, j, xd->nb_rx_desc,
+ xd->cpu_socket, 0,
+ bm->pktmbuf_pools[xd->cpu_socket_id_by_queue[j]]);
+ if (rv < 0)
+ return clib_error_return (0, "rte_eth_rx_queue_setup[%d]: err %d",
+ xd->device_index, rv);
+ }
+
+ if (xd->admin_up) {
+ rte_eth_dev_start (xd->device_index);
+ }
+ return 0;
+}
+
+static u32 dpdk_flag_change (vnet_main_t * vnm,
+ vnet_hw_interface_t * hi,
+ u32 flags)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+ u32 old = 0;
+
+ if (ETHERNET_INTERFACE_FLAG_CONFIG_PROMISC(flags))
+ {
+ old = xd->promisc;
+ xd->promisc = flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL;
+
+ if (xd->admin_up)
+ {
+ if (xd->promisc)
+ rte_eth_promiscuous_enable(xd->device_index);
+ else
+ rte_eth_promiscuous_disable(xd->device_index);
+ }
+ }
+ else if (ETHERNET_INTERFACE_FLAG_CONFIG_MTU(flags))
+ {
+ /*
+ * DAW-FIXME: The Cisco VIC firmware does not provide an api for a
+ * driver to dynamically change the mtu. If/when the
+ * VIC firmware gets fixed, then this should be removed.
+ */
+ if (xd->pmd == VNET_DPDK_PMD_VICE ||
+ xd->pmd == VNET_DPDK_PMD_ENIC)
+ {
+ struct rte_eth_dev_info dev_info;
+
+ /*
+ * Restore mtu to what has been set by CIMC in the firmware cfg.
+ */
+ rte_eth_dev_info_get(xd->device_index, &dev_info);
+ hi->max_packet_bytes = dev_info.max_rx_pktlen;
+
+ vlib_cli_output (vlib_get_main(),
+ "Cisco VIC mtu can only be changed "
+ "using CIMC then rebooting the server!");
+ }
+ else
+ {
+ int rv;
+
+ /*
+ * DAW-FIXME: The DPDK VMXNET3 driver does not currently support
+ * multi-buffer packets. Max out at 1518 bytes for now.
+ *
+ * If/when the driver gets fixed, then this should be
+ * removed.
+ */
+ if ((xd->pmd == VNET_DPDK_PMD_VMXNET3) &&
+ (hi->max_packet_bytes > 1518))
+ {
+ hi->max_packet_bytes = 1518;
+
+ vlib_cli_output (vlib_get_main(),
+ "VMXNET3 driver does not support jumbo frames "
+ "yet -- setting mtu to 1518!");
+ }
+
+ xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes;
+
+ if (xd->admin_up)
+ rte_eth_dev_stop (xd->device_index);
+
+ rv = rte_eth_dev_configure
+ (xd->device_index,
+ xd->rx_q_used,
+ xd->tx_q_used,
+ &xd->port_conf);
+
+ if (rv < 0)
+ vlib_cli_output (vlib_get_main(),
+ "rte_eth_dev_configure[%d]: err %d",
+ xd->device_index, rv);
+
+ rte_eth_dev_set_mtu(xd->device_index, hi->max_packet_bytes);
+
+ if (xd->admin_up)
+ rte_eth_dev_start (xd->device_index);
+ }
+ }
+ return old;
+}
+
+#ifdef NETMAP
+extern int rte_netmap_probe(void);
+#endif
+
+static clib_error_t *
+dpdk_lib_init (dpdk_main_t * dm)
+{
+ u32 nports;
+ u32 nb_desc = 0;
+ int i;
+ clib_error_t * error;
+ vlib_main_t * vm = vlib_get_main();
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ vnet_sw_interface_t * sw;
+ vnet_hw_interface_t * hi;
+ dpdk_device_t * xd;
+ vlib_thread_registration_t * tr;
+ uword * p;
+
+ u32 next_cpu = 0;
+ u8 af_packet_port_id = 0;
+
+ dm->input_cpu_first_index = 0;
+ dm->input_cpu_count = 1;
+
+ /* find out which cpus will be used for input */
+ p = hash_get_mem (tm->thread_registrations_by_name, "io");
+ tr = p ? (vlib_thread_registration_t *) p[0] : 0;
+
+ if (!tr || tr->count == 0)
+ {
+ /* no io threads, workers doing input */
+ p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ tr = p ? (vlib_thread_registration_t *) p[0] : 0;
+ }
+ else
+ {
+ dm->have_io_threads = 1;
+ }
+
+ if (tr && tr->count > 0)
+ {
+ dm->input_cpu_first_index = tr->first_index;
+ dm->input_cpu_count = tr->count;
+ }
+
+ vec_validate_aligned (dm->devices_by_cpu, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ vec_validate_aligned (dm->workers, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+#ifdef NETMAP
+ if(rte_netmap_probe() < 0)
+ return clib_error_return (0, "rte netmap probe failed");
+#endif
+
+ nports = rte_eth_dev_count();
+ if (nports < 1)
+ {
+ clib_warning ("DPDK drivers found no ports...");
+ }
+
+ if (CLIB_DEBUG > 0)
+ clib_warning ("DPDK drivers found %d ports...", nports);
+
+ /*
+   * All buffers are allocated from the same rte_mempool,
+   * so they all have the same number of data bytes.
+ */
+ dm->vlib_buffer_free_list_index =
+ vlib_buffer_get_or_create_free_list (
+ vm, VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, "dpdk rx");
+
+ for (i = 0; i < nports; i++)
+ {
+ u8 addr[6];
+ int j;
+ struct rte_eth_dev_info dev_info;
+ clib_error_t * rv;
+ struct rte_eth_link l;
+
+ /* Create vnet interface */
+ vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES);
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT;
+ xd->cpu_socket = (i8) rte_eth_dev_socket_id(i);
+ rte_eth_dev_info_get(i, &dev_info);
+
+ memcpy(&xd->tx_conf, &dev_info.default_txconf,
+ sizeof(struct rte_eth_txconf));
+ if (dm->no_multi_seg)
+ {
+ xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+ port_conf_template.rxmode.jumbo_frame = 0;
+ }
+ else
+ {
+ xd->tx_conf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS;
+ port_conf_template.rxmode.jumbo_frame = 1;
+ }
+
+ memcpy(&xd->port_conf, &port_conf_template, sizeof(struct rte_eth_conf));
+
+ xd->tx_q_used = dev_info.max_tx_queues < tm->n_vlib_mains ?
+ 1 : tm->n_vlib_mains;
+
+ if (dm->use_rss > 1 && dev_info.max_rx_queues >= dm->use_rss)
+ {
+ xd->rx_q_used = dm->use_rss;
+ xd->port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+ xd->port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP;
+ }
+ else
+ xd->rx_q_used = 1;
+
+ xd->dev_type = VNET_DPDK_DEV_ETH;
+ if (!xd->pmd) {
+
+
+#define _(s,f) else if (!strcmp(dev_info.driver_name, s)) \
+ xd->pmd = VNET_DPDK_PMD_##f;
+ if (0)
+ ;
+ foreach_dpdk_pmd
+#undef _
+ else
+ xd->pmd = VNET_DPDK_PMD_UNKNOWN;
+
+
+ switch (xd->pmd) {
+ /* 1G adapters */
+ case VNET_DPDK_PMD_E1000EM:
+ case VNET_DPDK_PMD_IGB:
+ case VNET_DPDK_PMD_IGBVF:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ break;
+
+ /* 10G adapters */
+ case VNET_DPDK_PMD_IXGBE:
+ case VNET_DPDK_PMD_IXGBEVF:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_10GE;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_10GE;
+ break;
+
+ /* Cisco VIC */
+ case VNET_DPDK_PMD_VICE:
+ case VNET_DPDK_PMD_ENIC:
+ rte_eth_link_get_nowait(xd->device_index, &l);
+ if (l.link_speed == 40000)
+ {
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_40GE;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_40GE;
+ }
+ else
+ {
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_10GE;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_10GE;
+ }
+ break;
+
+ /* Intel Fortville */
+ case VNET_DPDK_PMD_I40E:
+ case VNET_DPDK_PMD_I40EVF:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_40GE;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_40GE;
+
+ switch (dev_info.pci_dev->id.device_id) {
+ case I40E_DEV_ID_10G_BASE_T:
+ case I40E_DEV_ID_SFP_XL710:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+ case I40E_DEV_ID_QSFP_A:
+ case I40E_DEV_ID_QSFP_B:
+ case I40E_DEV_ID_QSFP_C:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ break;
+ case I40E_DEV_ID_VF:
+ rte_eth_link_get_nowait(xd->device_index, &l);
+ xd->port_type = l.link_speed == 10000 ?
+ VNET_DPDK_PORT_TYPE_ETH_10G : VNET_DPDK_PORT_TYPE_ETH_40G;
+ break;
+ default:
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ }
+ break;
+
+ /* Intel Red Rock Canyon */
+ case VNET_DPDK_PMD_FM10K:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_SWITCH;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_40GE;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_40GE;
+ break;
+
+ /* virtio */
+ case VNET_DPDK_PMD_VIRTIO:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_VIRTIO;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_VIRTIO;
+ break;
+
+ /* vmxnet3 */
+ case VNET_DPDK_PMD_VMXNET3:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+ break;
+
+ case VNET_DPDK_PMD_AF_PACKET:
+ xd->port_type = VNET_DPDK_PORT_TYPE_AF_PACKET;
+ xd->af_packet_port_id = af_packet_port_id++;
+ break;
+
+ default:
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ }
+
+ #ifdef NETMAP
+ if(strncmp(dev_info.driver_name, "vale", 4) == 0
+ || strncmp(dev_info.driver_name, "netmap", 6) == 0)
+ {
+ xd->pmd = VNET_DPDK_PMD_NETMAP;
+ xd->port_type = VNET_DPDK_PORT_TYPE_NETMAP;
+ }
+ #endif
+
+ }
+
+ /*
+ * Ensure default mtu is not > the mtu read from the hardware.
+ * Otherwise rte_eth_dev_configure() will fail and the port will
+ * not be available.
+ */
+ xd->port_conf.rxmode.max_rx_pkt_len =
+ (ETHERNET_MAX_PACKET_BYTES > dev_info.max_rx_pktlen) ?
+ dev_info.max_rx_pktlen : ETHERNET_MAX_PACKET_BYTES;
+
+ /*
+ * DAW-FIXME: VMXNET3 driver doesn't support jumbo / multi-buffer pkts
+ */
+ if (xd->pmd == VNET_DPDK_PMD_VMXNET3)
+ {
+ xd->port_conf.rxmode.max_rx_pkt_len = 1518;
+ xd->port_conf.rxmode.jumbo_frame = 0;
+ }
+
+ if (xd->pmd == VNET_DPDK_PMD_AF_PACKET)
+ {
+ f64 now = vlib_time_now(vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+ memcpy (addr+2, &rnd, sizeof(rnd));
+ addr[0] = 2;
+ addr[1] = 0xfe;
+ }
+ else
+ rte_eth_macaddr_get(i,(struct ether_addr *)addr);
+
+ if (xd->tx_q_used < tm->n_vlib_mains)
+ {
+ xd->lockp = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
+ CLIB_CACHE_LINE_BYTES);
+ memset ((void *) xd->lockp, 0, CLIB_CACHE_LINE_BYTES);
+ }
+
+ xd->device_index = xd - dm->devices;
+ ASSERT(i == xd->device_index);
+ xd->per_interface_next_index = ~0;
+
+ /* assign interface to input thread */
+ dpdk_device_and_queue_t * dq;
+ int q;
+
+ for (q = 0; q < xd->rx_q_used; q++)
+ {
+ int cpu = dm->input_cpu_first_index + next_cpu;
+ unsigned lcore = vlib_worker_threads[cpu].dpdk_lcore_id;
+
+ /*
+ * numa node for worker thread handling this queue
+ * needed for taking buffers from the right mempool
+ */
+ vec_validate(xd->cpu_socket_id_by_queue, q);
+ xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id(lcore);
+
+ /*
+ * construct vector of (device,queue) pairs for each worker thread
+ */
+ vec_add2(dm->devices_by_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = q;
+
+ next_cpu++;
+ if (next_cpu == dm->input_cpu_count)
+ next_cpu = 0;
+ }
+
+ vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j < tm->n_vlib_mains; j++)
+ {
+ vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE,
+ sizeof(tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->tx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->rx_vectors, xd->rx_q_used,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j< xd->rx_q_used; j++)
+ {
+ vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE-1,
+ CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->rx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->frames, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+
+ rv = dpdk_port_setup(dm, xd);
+
+ if (rv < 0)
+ return rv;
+
+ /* count the number of descriptors used for this device */
+ nb_desc += xd->nb_rx_desc + xd->nb_tx_desc * xd->tx_q_used;
+
+ error = ethernet_register_interface
+ (dm->vnet_main,
+ dpdk_device_class.index,
+ xd->device_index,
+ /* ethernet address */ addr,
+ &xd->vlib_hw_if_index,
+ dpdk_flag_change);
+ if (error)
+ return error;
+
+ sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+ xd->vlib_sw_if_index = sw->sw_if_index;
+ hi = vnet_get_hw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+
+ /*
+ * DAW-FIXME: The Cisco VIC firmware does not provide an api for a
+ * driver to dynamically change the mtu. If/when the
+ * VIC firmware gets fixed, then this should be removed.
+ */
+ if (xd->pmd == VNET_DPDK_PMD_VICE ||
+ xd->pmd == VNET_DPDK_PMD_ENIC)
+ {
+ /*
+ * Initialize mtu to what has been set by CIMC in the firmware cfg.
+ */
+ hi->max_packet_bytes = dev_info.max_rx_pktlen;
+ /*
+ * remove vlan tag from VIC port to fix VLAN0 issue.
+ * TODO Handle VLAN tagged traffic
+ */
+ int vlan_off;
+ vlan_off = rte_eth_dev_get_vlan_offload(xd->device_index);
+ vlan_off |= ETH_VLAN_STRIP_OFFLOAD;
+ rte_eth_dev_set_vlan_offload(xd->device_index, vlan_off);
+ }
+ /*
+ * DAW-FIXME: VMXNET3 driver doesn't support jumbo / multi-buffer pkts
+ */
+ else if (xd->pmd == VNET_DPDK_PMD_VMXNET3)
+ hi->max_packet_bytes = 1518;
+
+ hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] =
+ xd->port_conf.rxmode.max_rx_pkt_len - sizeof(ethernet_header_t);
+
+ rte_eth_dev_set_mtu(xd->device_index, hi->max_packet_bytes);
+ }
+
+ if (dm->num_kni) {
+ clib_warning("Initializing KNI interfaces...");
+ rte_kni_init(dm->num_kni);
+ for (i = 0; i < dm->num_kni; i++)
+ {
+ u8 addr[6];
+ int j;
+
+ /* Create vnet interface */
+ vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES);
+ xd->dev_type = VNET_DPDK_DEV_KNI;
+
+ xd->device_index = xd - dm->devices;
+ ASSERT(nports + i == xd->device_index);
+ xd->per_interface_next_index = ~0;
+ xd->kni_port_id = i;
+ xd->cpu_socket = -1;
+ hash_set (dm->dpdk_device_by_kni_port_id, i, xd - dm->devices);
+ xd->rx_q_used = 1;
+
+ /* assign interface to input thread */
+ dpdk_device_and_queue_t * dq;
+ vec_add2(dm->devices_by_cpu[dm->input_cpu_first_index], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = 0;
+
+ vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j < tm->n_vlib_mains; j++)
+ {
+ vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE,
+ sizeof(tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->tx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->rx_vectors, xd->rx_q_used,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j< xd->rx_q_used; j++)
+ {
+ vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE-1,
+ CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->rx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->frames, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+
+ /* FIXME Set up one TX-queue per worker thread */
+
+ {
+ f64 now = vlib_time_now(vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+
+ memcpy (addr+2, &rnd, sizeof(rnd));
+ addr[0] = 2;
+ addr[1] = 0xfe;
+ }
+
+ error = ethernet_register_interface
+ (dm->vnet_main,
+ dpdk_device_class.index,
+ xd->device_index,
+ /* ethernet address */ addr,
+ &xd->vlib_hw_if_index,
+ dpdk_flag_change);
+
+ if (error)
+ return error;
+
+ sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+ xd->vlib_sw_if_index = sw->sw_if_index;
+ hi = vnet_get_hw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+ }
+ }
+
+ if (nb_desc > dm->num_mbufs)
+ clib_warning ("%d mbufs allocated but total rx/tx ring size is %d\n",
+ dm->num_mbufs, nb_desc);
+
+ /* init next vhost-user if index */
+ dm->next_vu_if_id = 0;
+
+ return 0;
+}
+
+/*
+ * Tell the vlib physical memory allocator that we've handled
+ * the initialization. We don't actually do so until
+ * vlib_main(...) calls the dpdk config function.
+ */
+int vlib_app_physmem_init (vlib_main_t * vm, physmem_main_t * pm,
+ int physmem_required)
+{
+ return 1;
+}
+
+static clib_error_t *
+write_sys_fs (char * file_name, char * fmt, ...)
+{
+ u8 * s;
+ int fd;
+
+ fd = open (file_name, O_WRONLY);
+ if (fd < 0)
+ return clib_error_return_unix (0, "open `%s'", file_name);
+
+ va_list va;
+ va_start (va, fmt);
+ s = va_format (0, fmt, &va);
+ va_end (va);
+ vec_add1 (s, 0); // terminate c string
+
+ if (write (fd, s, vec_len (s)) < 0)
+ return clib_error_return_unix (0, "write '%s' to '%s'", s, file_name);
+
+ vec_free (s);
+ close (fd);
+ return 0;
+}
+
+#define VIRTIO_PCI_NAME "virtio-pci"
+
+static clib_error_t * dpdk_bind_eth_kernel_drivers (vlib_main_t * vm,
+ char * pci_dev_id,
+ char * kernel_driver)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ unformat_input_t _in;
+ unformat_input_t * in = &_in;
+ clib_error_t * error = 0;
+ u8 * line = 0, * modcmd = 0, * path = 0;
+ u8 * pci_vid = 0, *pci_did = 0, * devname = 0;
+ char *driver_name = kernel_driver;
+ FILE * fp;
+
+ /*
+ * Bail out now if we're not running as root.
+ * This allows non-privileged use of the packet generator, etc.
+ */
+ if (geteuid() != 0)
+ return 0;
+
+ /*
+ * Get all ethernet pci device numbers for the device type specified.
+ */
+ modcmd = format (0, "lspci -nDd %s | grep 0200 | "
+ "awk '{ print $1, $3 }'%c", pci_dev_id, 0);
+ if ((fp = popen ((const char *)modcmd, "r")) == NULL)
+ {
+ error = clib_error_return_unix (0,
+ "Unable to get %s ethernet pci devices.",
+ pci_dev_id);
+ goto done;
+ }
+
+ vec_validate (line, BUFSIZ);
+ vec_validate (path, BUFSIZ);
+ while (fgets ((char *)line, BUFSIZ, fp) != NULL)
+ {
+ struct stat st;
+ u8 bind_uio = 1;
+ line[strlen ((char *)line) - 1] = 0; // chomp trailing newline.
+
+ unformat_init_string (in, (char *)line, strlen((char *)line) + 1);
+ unformat(in, "%s %s:%s", &devname, &pci_vid, &pci_did);
+ unformat_free (in);
+
+ /*
+ * Blacklist all ethernet interfaces in the
+ * linux IP routing tables (route --inet --inet6)
+ */
+ if (strstr ((char *)dm->eth_if_blacklist, (char *)devname))
+ continue;
+
+ /*
+ * If there are any devices whitelisted, then blacklist all devices
+ * which are not explicitly whitelisted.
+ */
+ if (dm->eth_if_whitelist &&
+ !strstr ((char *)dm->eth_if_whitelist, (char *)devname))
+ continue;
+
+#ifdef NETMAP
+ /*
+ * Optimistically open the device as a netmap device.
+ */
+ if (eth_nm_open((char *)devname))
+ continue;
+#endif
+
+ _vec_len (path) = 0;
+ path = format (path, "/sys/bus/pci/devices/%s/driver/unbind%c",
+ devname, 0);
+
+ /*
+ * If the device is bound to a driver...
+ */
+ if (stat ((const char *)path, &st) == 0)
+ {
+ u8 * device_path;
+
+ /*
+ * If the interface is not a virtio...
+ */
+ if (!driver_name || strcmp(driver_name, VIRTIO_PCI_NAME))
+ {
+ /*
+ * If it is already bound to driver, don't unbind/bind it.
+ */
+ device_path = format (0, "/sys/bus/pci/drivers/%s/%s/device%c",
+ driver_name, devname, 0);
+ if (stat ((const char *)device_path, &st) == 0)
+ bind_uio = 0;
+
+ vec_free (device_path);
+ }
+
+ /*
+ * unbind it from the current driver
+ */
+ if (bind_uio)
+ {
+ _vec_len (path) -= 1;
+ path = format (path, "%c", 0);
+ error = write_sys_fs ((char *)path, "%s", devname);
+ if (error)
+ goto done;
+ }
+ }
+
+ /*
+ * DAW-FIXME: The following bind/unbind dance is necessary for the dpdk
+ * virtio poll-mode driver to work.
+ */
+
+ if (driver_name && !strcmp(driver_name, VIRTIO_PCI_NAME))
+ {
+ /*
+ * bind interface to the native kernel module
+ */
+ _vec_len (path) = 0;
+ path = format (path, "/sys/bus/pci/drivers/%s/bind%c",
+ driver_name, 0);
+ error = write_sys_fs ((char *)path, "%s", devname);
+ if (error)
+ goto done;
+
+ /*
+ * unbind interface from the native kernel module
+ */
+ _vec_len (path) -= 5;
+ path = format (path, "unbind%c", 0);
+ error = write_sys_fs ((char *)path, "%s", devname);
+ if (error)
+ goto done;
+ }
+
+ /*
+ * bind the interface to igb_uio
+ */
+ if (bind_uio)
+ {
+ int pci_vendor_id = strtol((char *) pci_vid, NULL, 16);
+ int pci_device_id = strtol((char *) pci_did, NULL, 16);
+
+ /*
+ * Set PCI ID to ".../virtio-pci/new_id" for Intel Fortville adapters
+ */
+ if (pci_vendor_id == 0x8086 &&
+ (pci_device_id == I40E_DEV_ID_10G_BASE_T ||
+ pci_device_id == I40E_DEV_ID_SFP_XL710 ||
+ pci_device_id == I40E_DEV_ID_QSFP_A ||
+ pci_device_id == I40E_DEV_ID_QSFP_B ||
+ pci_device_id == I40E_DEV_ID_QSFP_C))
+ {
+ _vec_len (path) = 0;
+ path = format (path, "/sys/bus/pci/drivers/%s/new_id%c", driver_name, 0);
+ error = write_sys_fs ((char *) path, "%s %s", pci_vid, pci_did);
+ if (error)
+ continue;
+ }
+
+ _vec_len (path) = 0;
+ path = format (path, "/sys/bus/pci/drivers/%s/bind%c", driver_name, 0);
+ error = write_sys_fs ((char *) path, "%s", devname);
+ if (error)
+ {
+ error = 0;
+ continue;
+ }
+ }
+ }
+
+ done:
+ vec_free (line);
+ vec_free (path);
+ vec_free (devname);
+ vec_free (pci_vid);
+ vec_free (pci_did);
+ vec_free (modcmd);
+ if (fp)
+   pclose (fp);
+ return error;
+}
+
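+/*
+ * Parse the EAL "socket-mem" string: one comma-separated value (in MB) per
+ * NUMA socket, in socket order. An empty field defaults to 1024 MB. The
+ * results are stored in a hash keyed by socket index.
+ */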
+static uword
+unformat_socket_mem (unformat_input_t * input, va_list * va)
+{
+ uword ** r = va_arg (* va, uword **);
+ int i = 0;
+ u32 mem;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, ","))
+ hash_set (*r, i, 1024);
+ else if (unformat (input, "%u,", &mem))
+ hash_set (*r, i, mem);
+ else if (unformat (input, "%u", &mem))
+ hash_set (*r, i, mem);
+ else
+ {
+ unformat_put_input (input);
+ goto done;
+ }
+ i++;
+ }
+
+done:
+ return 1;
+}
+
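+/*
+ * Read the number of free huge pages of the given size (in kB) for a NUMA
+ * node from sysfs; returns 0 if the file cannot be read.
+ */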
+static u32
+get_node_free_hugepages_num (u32 node, u32 page_size)
+{
+ FILE * fp;
+ u8 * tmp;
+
+ tmp = format (0, "/sys/devices/system/node/node%u/hugepages/hugepages-%ukB/"
+ "free_hugepages%c", node, page_size, 0);
+ fp = fopen ((char *) tmp, "r");
+ vec_free(tmp);
+
+ if (fp != NULL)
+ {
+ u8 * buffer = 0;
+ u32 pages_avail = 0;
+
+ vec_validate (buffer, 256-1);
+ if (fgets ((char *)buffer, 256, fp))
+ {
+ unformat_input_t in;
+ unformat_init_string (&in, (char *) buffer, strlen ((char *) buffer));
+ unformat(&in, "%u", &pages_avail);
+ unformat_free (&in);
+ }
+ vec_free(buffer);
+ fclose(fp);
+ return pages_avail;
+ }
+
+ return 0;
+}
+
+static clib_error_t *
+dpdk_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ clib_error_t * error = 0;
+ dpdk_main_t * dm = &dpdk_main;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ u8 * s, * tmp = 0;
+ u8 * pci_dev_id = 0;
+ u8 * rte_cmd = 0, * ethname = 0;
+ FILE * rte_fp;
+ u32 log_level;
+ int ret, i;
+ char * fmt;
+#ifdef NETMAP
+ int rxrings, txrings, rxslots, txslots, txburst;
+ char * nmname;
+#endif
+ unformat_input_t _in;
+ unformat_input_t * in = &_in;
+ u8 no_pci = 0;
+ u8 no_huge = 0;
+ u8 huge_dir = 0;
+ u8 file_prefix = 0;
+ u8 * socket_mem = 0;
+
+ // MATT-FIXME: inverted virtio-vhost logic to use virtio by default
+ dm->use_virtio_vhost = 1;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+ /* Prime the pump */
+ if (unformat (input, "no-hugetlb"))
+ {
+ vec_add1 (dm->eal_init_args, (u8 *) "no-huge");
+ no_huge = 1;
+ }
+
+ else if (unformat (input, "decimal-interface-names"))
+ dm->interface_name_format_decimal = 1;
+
+ else if (unformat (input, "no-multi-seg"))
+ dm->no_multi_seg = 1;
+
+ else if (unformat (input, "dev %s", &pci_dev_id))
+ {
+ if (dm->eth_if_whitelist)
+ {
+ /*
+ * Don't add duplicate device id's.
+ */
+ if (strstr ((char *)dm->eth_if_whitelist, (char *)pci_dev_id))
+ continue;
+
+ _vec_len (dm->eth_if_whitelist) -= 1; // chomp trailing NULL.
+ dm->eth_if_whitelist = format (dm->eth_if_whitelist, " %s%c",
+ pci_dev_id, 0);
+ }
+ else
+ dm->eth_if_whitelist = format (0, "%s%c", pci_dev_id, 0);
+ }
+
+#ifdef NETMAP
+ else if (unformat(input, "netmap %s/%d:%d/%d:%d/%d",
+ &nmname, &rxrings, &rxslots, &txrings, &txslots, &txburst)) {
+ char * rv;
+ rv = (char *)
+ eth_nm_args(nmname, rxrings, rxslots, txrings, txslots, txburst);
+ if (rv) {
+ error = clib_error_return (0, "%s", rv);
+ goto done;
+ }
+ } else if (unformat(input, "netmap %s", &nmname)) {
+ char * rv;
+ rv = (char *)
+ eth_nm_args(nmname, 0, 0, 0, 0, 0);
+ if (rv) {
+ error = clib_error_return (0, "%s", rv);
+ goto done;
+ }
+ }
+#endif
+
+ else if (unformat (input, "num-mbufs %d", &dm->num_mbufs))
+ ;
+ else if (unformat (input, "kni %d", &dm->num_kni))
+ ;
+ else if (unformat (input, "uio-driver %s", &dm->uio_driver_name))
+ ;
+ else if (unformat (input, "vhost-user-coalesce-frames %d", &dm->vhost_coalesce_frames))
+ ;
+ else if (unformat (input, "vhost-user-coalesce-time %f", &dm->vhost_coalesce_time))
+ ;
+ else if (unformat (input, "enable-vhost-user"))
+ dm->use_virtio_vhost = 0;
+ else if (unformat (input, "rss %d", &dm->use_rss))
+ ;
+
+#define _(a) \
+ else if (unformat(input, #a)) \
+ { \
+ if (!strncmp(#a, "no-pci", 6)) \
+ no_pci = 1; \
+ tmp = format (0, "--%s%c", #a, 0); \
+ vec_add1 (dm->eal_init_args, tmp); \
+ }
+ foreach_eal_double_hyphen_predicate_arg
+#undef _
+
+#define _(a) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ if (!strncmp(#a, "huge-dir", 8)) \
+ huge_dir = 1; \
+ else if (!strncmp(#a, "file-prefix", 11)) \
+ file_prefix = 1; \
+ else if (!strncmp(#a, "socket-mem", 10)) \
+ socket_mem = vec_dup (s); \
+ tmp = format (0, "--%s%c", #a, 0); \
+ vec_add1 (dm->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (dm->eal_init_args, s); \
+ }
+ foreach_eal_double_hyphen_arg
+#undef _
+
+#define _(a,b) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ tmp = format (0, "-%s%c", #b, 0); \
+ vec_add1 (dm->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (dm->eal_init_args, s); \
+ }
+ foreach_eal_single_hyphen_arg
+#undef _
+
+#define _(a,b) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ tmp = format (0, "-%s%c", #b, 0); \
+ vec_add1 (dm->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (dm->eal_init_args, s); \
+ dm->a##_set_manually = 1; \
+ }
+ foreach_eal_single_hyphen_mandatory_arg
+#undef _
+
+ else if (unformat(input, "default"))
+ ;
+
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ }
+
+ if (!dm->uio_driver_name)
+ dm->uio_driver_name = format (0, "igb_uio");
+
+ /*
+ * Use 1G huge pages if available.
+ */
+ if (!no_huge && !huge_dir)
+ {
+ uword * mem_by_socket = hash_create (0, sizeof (uword));
+ uword c;
+ u8 use_1g = 1;
+ u8 use_2m = 1;
+ int rv;
+
+ umount(DEFAULT_HUGE_DIR);
+
+ /* Process "socket-mem" parameter value */
+ if (vec_len (socket_mem))
+ {
+ unformat_input_t in;
+ unformat_init_vector(&in, socket_mem);
+ unformat(&in, "%U", unformat_socket_mem, &mem_by_socket);
+ unformat_free(&in);
+ }
+ else
+ use_1g = 0;
+
+ /* check whether enough 1GB (or 2MB) huge pages are available on each socket */
+ clib_bitmap_foreach (c, tm->cpu_socket_bitmap, ({
+ uword * p = hash_get (mem_by_socket, c);
+ if (p)
+ {
+ u32 mem = p[0];
+ if (mem)
+ {
+ u32 pages_num_1g = mem / 1024;
+ u32 pages_num_2m = mem / 2;
+ u32 pages_avail;
+
+ pages_avail = get_node_free_hugepages_num(c, 1048576);
+ if (!(pages_avail >= pages_num_1g))
+ use_1g = 0;
+
+ pages_avail = get_node_free_hugepages_num(c, 2048);
+ if (!(pages_avail >= pages_num_2m))
+ use_2m = 0;
+ }
+ }
+ }));
+
+ hash_free (mem_by_socket);
+
+ rv = mkdir(VPP_RUN_DIR, 0755);
+ if (rv && errno != EEXIST)
+ {
+ error = clib_error_return (0, "mkdir '%s' failed errno %d",
+ VPP_RUN_DIR, errno);
+ goto done;
+ }
+
+ rv = mkdir(DEFAULT_HUGE_DIR, 0755);
+ if (rv && errno != EEXIST)
+ {
+ error = clib_error_return (0, "mkdir '%s' failed errno %d",
+ DEFAULT_HUGE_DIR, errno);
+ goto done;
+ }
+
+ if (use_1g)
+ {
+ rv = mount("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, "pagesize=1G");
+ }
+ else if (use_2m)
+ {
+ rv = mount("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, NULL);
+ }
+ else
+ {
+ return clib_error_return (0, "not enough free huge pages");
+ }
+
+ if (rv)
+ {
+ error = clib_error_return (0, "mount failed %d", errno);
+ goto done;
+ }
+
+ tmp = format (0, "--huge-dir%c", 0);
+ vec_add1 (dm->eal_init_args, tmp);
+ tmp = format (0, "%s%c", DEFAULT_HUGE_DIR, 0);
+ vec_add1 (dm->eal_init_args, tmp);
+ if (!file_prefix)
+ {
+ tmp = format (0, "--file-prefix%c", 0);
+ vec_add1 (dm->eal_init_args, tmp);
+ tmp = format (0, "vpp%c", 0);
+ vec_add1 (dm->eal_init_args, tmp);
+ }
+ }
+
+ /*
+ * Blacklist all ethernet interfaces in the linux IP routing tables.
+ */
+ dm->eth_if_blacklist = format (0, "%c", 0);
+ rte_cmd = format (0, "route --inet --inet6 -n|awk '{print $7}'|sort -u|"
+ "egrep $(echo $(ls -1d /sys/class/net/*/device|"
+ "cut -d/ -f5)|sed -s 's/ /|/g')%c", 0);
+ if ((rte_fp = popen ((const char *)rte_cmd, "r")) == NULL)
+ {
+ error = clib_error_return_unix (0, "Unable to find blacklist ethernet"
+ " interface(s) in linux routing tables.");
+ goto rte_cmd_err;
+
+ }
+
+ vec_validate (ethname, BUFSIZ);
+ while (fgets ((char *)ethname, BUFSIZ, rte_fp) != NULL)
+ {
+ FILE *rlnk_fp;
+ u8 * rlnk_cmd = 0, * devname = 0;
+
+ ethname[strlen ((char *)ethname) - 1] = 0; // chomp trailing newline.
+
+ rlnk_cmd = format (0, "readlink /sys/class/net/%s%c",
+ ethname, 0);
+
+ if ((rlnk_fp = popen ((const char *)rlnk_cmd, "r")) == NULL)
+ {
+ error = clib_error_return_unix (0, "Unable to read %s link.",
+ ethname);
+ goto rlnk_cmd_err;
+ }
+
+ vec_validate (devname, BUFSIZ);
+ while (fgets ((char *)devname, BUFSIZ, rlnk_fp) != NULL)
+ {
+ char * pci_id = 0;
+
+ /*
+ * Extract the device PCI ID name from the link. It is the first
+ * PCI ID searching backwards from the end of the link pathname.
+ * For example:
+ * readlink /sys/class/net/eth0
+ * ../../devices/pci0000:00/0000:00:0a.0/virtio4/net/eth0
+ */
+ for (pci_id = (char *)((devname + strlen((char *)devname)));
+ ((u8 *)pci_id > devname) && *pci_id != '.'; pci_id--)
+ ;
+
+ /*
+ * Verify that the field found is a valid PCI ID.
+ */
+ if ((*(pci_id - 1) == '.') || ((u8 *)(pci_id - 11) < devname) ||
+ (*(pci_id - 11) != '/') || (*(pci_id - 3) != ':') ||
+ (*(pci_id - 6) != ':'))
+ {
+ devname[strlen ((char *)devname) - 1] = 0; // chomp trailing newline.
+ clib_warning ("Unable to extract %s PCI ID (0x%llx \"%s\") "
+ "from 0x%llx \"%s\"", ethname, pci_id, pci_id,
+ devname, devname);
+ continue;
+ }
+
+ pci_id[2] = 0;
+ pci_id -= 10;
+
+ /* Don't blacklist any interfaces which have been whitelisted.
+ */
+ if (dm->eth_if_whitelist &&
+ strstr ((char *)dm->eth_if_whitelist, (char *)pci_id))
+ continue;
+
+ _vec_len (dm->eth_if_blacklist) -= 1; // chomp trailing NULL.
+ dm->eth_if_blacklist = format (dm->eth_if_blacklist, " %s%c",
+ pci_id, 0);
+ }
+
+ rlnk_cmd_err:
+ if (rlnk_fp)
+   pclose (rlnk_fp);
+ vec_free (rlnk_cmd);
+ vec_free (devname);
+ }
+
+ rte_cmd_err:
+ if (rte_fp)
+   pclose (rte_fp);
+ vec_free (rte_cmd);
+ vec_free (ethname);
+
+ if (error)
+ return error;
+
+ /* I'll bet that -c and -n must be the first and second args... */
+ if (!dm->coremask_set_manually)
+ {
+ vlib_thread_registration_t * tr;
+ uword coremask;
+ int i;
+
+ /* main thread core */
+ coremask = 1 << tm->main_lcore;
+
+ for (i = 0; i < vec_len (tm->registrations); i++)
+ {
+ tr = tm->registrations[i];
+ if (clib_bitmap_is_zero(tr->coremask))
+ continue;
+ coremask |= tr->coremask[0];
+ }
+
+ vec_insert (dm->eal_init_args, 2, 1);
+ dm->eal_init_args[1] = (u8 *) "-c";
+ tmp = format (0, "%x%c", coremask, 0);
+ dm->eal_init_args[2] = tmp;
+ }
+
+ if (!dm->nchannels_set_manually)
+ {
+ vec_insert (dm->eal_init_args, 2, 3);
+ dm->eal_init_args[3] = (u8 *) "-n";
+ tmp = format (0, "%d", dm->nchannels);
+ dm->eal_init_args[4] = tmp;
+ }
+
+ /*
+ * If there are whitelisted devices,
+ * add the whitelist option & device list to the dpdk arg list...
+ */
+ if (dm->eth_if_whitelist)
+ {
+ unformat_init_string (in, (char *)dm->eth_if_whitelist,
+ vec_len(dm->eth_if_whitelist) - 1);
+ fmt = "-w%c";
+ }
+
+ /*
+ * Otherwise add the blacklisted devices to the dpdk arg list.
+ */
+ else
+ {
+ unformat_init_string (in, (char *)dm->eth_if_blacklist,
+ vec_len(dm->eth_if_blacklist) - 1);
+ fmt = "-b%c";
+ }
+
+ while (unformat_check_input (in) != UNFORMAT_END_OF_INPUT)
+ {
+ tmp = format (0, fmt, 0);
+ vec_add1 (dm->eal_init_args, tmp);
+ unformat (in, "%s", &pci_dev_id);
+ vec_add1 (dm->eal_init_args, pci_dev_id);
+ }
+
+ if (no_pci == 0)
+ {
+ /*
+ * Bind Virtio pci devices to the igb_uio kernel driver.
+ */
+ error = dpdk_bind_eth_kernel_drivers (vm, "1af4:1000", VIRTIO_PCI_NAME);
+ if (error)
+ return error;
+
+ /*
+ * Bind vmxnet3 pci devices to the igb_uio kernel driver.
+ */
+ error = dpdk_bind_eth_kernel_drivers (vm, "15ad:07b0",
+ (char *) dm->uio_driver_name);
+ if (error)
+ return error;
+
+ /*
+ * Bind Intel ethernet pci devices to igb_uio kernel driver.
+ */
+ error = dpdk_bind_eth_kernel_drivers (vm, "8086:",
+ (char *) dm->uio_driver_name);
+ /*
+ * Bind Cisco VIC ethernet pci devices to igb_uio kernel driver.
+ */
+ error = dpdk_bind_eth_kernel_drivers (vm, "1137:0043",
+ (char *) dm->uio_driver_name);
+ }
+
+ /* set master-lcore */
+ tmp = format (0, "--master-lcore%c", 0);
+ vec_add1 (dm->eal_init_args, tmp);
+ tmp = format (0, "%u%c", tm->main_lcore, 0);
+ vec_add1 (dm->eal_init_args, tmp);
+
+ /* NULL terminate the "argv" vector, in case of stupidity */
+ vec_add1 (dm->eal_init_args, 0);
+ _vec_len(dm->eal_init_args) -= 1;
+
+ /* Set up DPDK eal and packet mbuf pool early. */
+
+ log_level = (CLIB_DEBUG > 0) ? RTE_LOG_DEBUG : RTE_LOG_NOTICE;
+
+ rte_set_log_level (log_level);
+
+ vm = dm->vlib_main;
+
+ ret = rte_eal_init(vec_len(dm->eal_init_args), (char **) dm->eal_init_args);
+
+ /* lazy umount hugepages */
+ umount2(DEFAULT_HUGE_DIR, MNT_DETACH);
+
+ if (ret < 0)
+ return clib_error_return (0, "rte_eal_init returned %d", ret);
+
+ /* main thread 1st */
+ error = vlib_buffer_pool_create(vm, dm->num_mbufs, MBUF_SIZE, rte_socket_id());
+ if (error)
+ return error;
+
+ for (i = 0; i < RTE_MAX_LCORE; i++)
+ {
+ error = vlib_buffer_pool_create(vm, dm->num_mbufs, MBUF_SIZE,
+ rte_lcore_to_socket_id(i));
+ if (error)
+ return error;
+ }
+
+ if (dm->use_rss)
+ {
+ vlib_node_runtime_t * rt = vlib_node_get_runtime (vm, dpdk_input_node.index);
+ rt->function = dpdk_input_rss;
+ }
+ done:
+ return error;
+}
+
+VLIB_CONFIG_FUNCTION (dpdk_config, "dpdk");
+
+void dpdk_update_link_state (dpdk_device_t * xd, f64 now)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ struct rte_eth_link prev_link = xd->link;
+ u32 hw_flags = 0;
+ u8 hw_flags_chg = 0;
+
+ /* only update link state for PMD interfaces */
+ if (xd->dev_type != VNET_DPDK_DEV_ETH)
+ return;
+
+ xd->time_last_link_update = now ? now : xd->time_last_link_update;
+ memset(&xd->link, 0, sizeof(xd->link));
+ rte_eth_link_get_nowait (xd->device_index, &xd->link);
+
+ if (LINK_STATE_ELOGS)
+ {
+ vlib_main_t * vm = vlib_get_main();
+ ELOG_TYPE_DECLARE(e) = {
+ .format =
+ "update-link-state: sw_if_index %d, admin_up %d,"
+ "old link_state %d new link_state %d",
+ .format_args = "i4i1i1i1",
+ };
+
+ struct { u32 sw_if_index; u8 admin_up;
+ u8 old_link_state; u8 new_link_state;} *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->sw_if_index = xd->vlib_sw_if_index;
+ ed->admin_up = xd->admin_up;
+ ed->old_link_state = (u8)
+ vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index);
+ ed->new_link_state = (u8) xd->link.link_status;
+ }
+
+ if ((xd->admin_up == 1) &&
+ ((xd->link.link_status != 0) ^
+ vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index)))
+ {
+ hw_flags_chg = 1;
+ hw_flags |= (xd->link.link_status ?
+ VNET_HW_INTERFACE_FLAG_LINK_UP: 0);
+ }
+
+ if (hw_flags_chg || (xd->link.link_duplex != prev_link.link_duplex))
+ {
+ hw_flags_chg = 1;
+ switch (xd->link.link_duplex)
+ {
+ case ETH_LINK_HALF_DUPLEX:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_HALF_DUPLEX;
+ break;
+ case ETH_LINK_FULL_DUPLEX:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_FULL_DUPLEX;
+ break;
+ default:
+ break;
+ }
+ }
+ if (hw_flags_chg || (xd->link.link_speed != prev_link.link_speed))
+ {
+ hw_flags_chg = 1;
+ switch (xd->link.link_speed)
+ {
+ case ETH_LINK_SPEED_10:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10M;
+ break;
+ case ETH_LINK_SPEED_100:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_100M;
+ break;
+ case ETH_LINK_SPEED_1000:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_1G;
+ break;
+ case ETH_LINK_SPEED_10000:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10G;
+ break;
+ case ETH_LINK_SPEED_40G:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_40G;
+ break;
+ case 0:
+ break;
+ default:
+ clib_warning("unknown link speed %d", xd->link.link_speed);
+ break;
+ }
+ }
+ if (hw_flags_chg)
+ {
+ if (LINK_STATE_ELOGS)
+ {
+ vlib_main_t * vm = vlib_get_main();
+
+ ELOG_TYPE_DECLARE(e) = {
+ .format = "update-link-state: sw_if_index %d, new flags %d",
+ .format_args = "i4i4",
+ };
+
+ struct { u32 sw_if_index; u32 flags; } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->sw_if_index = xd->vlib_sw_if_index;
+ ed->flags = hw_flags;
+ }
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, hw_flags);
+ }
+}
+
+static uword
+dpdk_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt,
+ vlib_frame_t * f)
+{
+ clib_error_t * error;
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ void *vu_state;
+ int i;
+
+ error = dpdk_lib_init (dm);
+
+ /*
+ * Turn on the input node if we found some devices to drive
+ * and we're not running worker threads or i/o threads
+ */
+
+ if (error == 0 && vec_len(dm->devices) > 0)
+ {
+ if (tm->n_vlib_mains == 1)
+ vlib_node_set_state (vm, dpdk_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ else if (tm->main_thread_is_io_node)
+ vlib_node_set_state (vm, dpdk_io_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ else if (!dm->have_io_threads)
+ for (i=0; i < tm->n_vlib_mains; i++)
+ if (vec_len(dm->devices_by_cpu[i]) > 0)
+ vlib_node_set_state (vlib_mains[i], dpdk_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ }
+
+ if (error)
+ clib_error_report (error);
+
+ dpdk_vhost_user_process_init(&vu_state);
+
+ dm->io_thread_release = 1;
+
+ f64 now = vlib_time_now (vm);
+ vec_foreach (xd, dm->devices)
+ {
+ dpdk_update_link_state (xd, now);
+ }
+
+ while (1)
+ {
+ vlib_process_wait_for_event_or_clock (vm, 5.0);
+
+ if (dpdk_get_admin_up_down_in_progress())
+ /* skip the poll if an admin up down is in progress (on any interface) */
+ continue;
+
+ vec_foreach (xd, dm->devices)
+ {
+ f64 now = vlib_time_now (vm);
+ if ((now - xd->time_last_stats_update) >= DPDK_STATS_POLL_INTERVAL)
+ dpdk_update_counters (xd, now);
+ if ((now - xd->time_last_link_update) >= DPDK_LINK_POLL_INTERVAL)
+ dpdk_update_link_state (xd, now);
+
+ if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
+ if (dpdk_vhost_user_process_if(vm, xd, vu_state) != 0)
+ continue;
+ }
+ }
+
+ dpdk_vhost_user_process_cleanup(vu_state);
+
+ return 0;
+}
+
+VLIB_REGISTER_NODE (dpdk_process_node,static) = {
+ .function = dpdk_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "dpdk-process",
+ .process_log2_n_stack_bytes = 17,
+};
+
+clib_error_t *
+dpdk_init (vlib_main_t * vm)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ vlib_node_t * ei;
+ clib_error_t * error = 0;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+
+ /* verify that structs are cacheline aligned */
+ ASSERT(offsetof(dpdk_device_t, cacheline0) == 0);
+ ASSERT(offsetof(dpdk_device_t, cacheline1) == CLIB_CACHE_LINE_BYTES);
+ ASSERT(offsetof(dpdk_worker_t, cacheline0) == 0);
+ ASSERT(offsetof(frame_queue_trace_t, cacheline0) == 0);
+
+ /* Add references to DPDK Driver Constructor functions to get the dynamic
+ * loader to pull in the driver library & run the constructors.
+ */
+#define _(d) \
+do { \
+ void devinitfn_ ##d(void); \
+ __attribute__((unused)) void (* volatile pf)(void); \
+ pf = devinitfn_ ##d; \
+} while(0);
+
+#ifdef RTE_LIBRTE_EM_PMD
+ _(em_pmd_drv)
+#endif
+
+#ifdef RTE_LIBRTE_IGB_PMD
+ _(pmd_igb_drv)
+#endif
+
+#ifdef RTE_LIBRTE_IXGBE_PMD
+ _(rte_ixgbe_driver)
+#endif
+
+#ifdef RTE_LIBRTE_I40E_PMD
+ _(rte_i40e_driver)
+ _(rte_i40evf_driver)
+#endif
+
+#ifdef RTE_LIBRTE_FM10K_PMD
+ _(rte_fm10k_driver)
+#endif
+
+#ifdef RTE_LIBRTE_VIRTIO_PMD
+ _(rte_virtio_driver)
+#endif
+
+#ifdef RTE_LIBRTE_VMXNET3_PMD
+ _(rte_vmxnet3_driver)
+#endif
+
+#ifdef RTE_LIBRTE_VICE_PMD
+ _(rte_vice_driver)
+#endif
+
+#ifdef RTE_LIBRTE_ENIC_PMD
+ _(rte_enic_driver)
+#endif
+
+#ifdef RTE_LIBRTE_PMD_AF_PACKET
+ _(pmd_af_packet_drv)
+#endif
+
+#undef _
+
+ dm->vlib_main = vm;
+ dm->vnet_main = vnet_get_main();
+
+ ei = vlib_get_node_by_name (vm, (u8 *) "ethernet-input");
+ if (ei == 0)
+ return clib_error_return (0, "ethernet-input node AWOL");
+
+ dm->ethernet_input_node_index = ei->index;
+
+ dm->nchannels = 4;
+ dm->num_mbufs = dm->num_mbufs ? dm->num_mbufs : NB_MBUF;
+ vec_add1 (dm->eal_init_args, (u8 *) "vnet");
+
+ dm->dpdk_device_by_kni_port_id = hash_create (0, sizeof (uword));
+ dm->vu_sw_if_index_by_listener_fd = hash_create (0, sizeof (uword));
+ dm->vu_sw_if_index_by_sock_fd = hash_create (0, sizeof (uword));
+
+ /* $$$ use n_thread_stacks since it's known-good at this point */
+ vec_validate (dm->recycle, tm->n_thread_stacks - 1);
+
+ /* initialize EFD (early fast discard) default settings */
+ dm->efd.enabled = DPDK_EFD_DISABLED;
+ dm->efd.queue_hi_thresh = ((DPDK_EFD_DEFAULT_DEVICE_QUEUE_HI_THRESH_PCT *
+ DPDK_NB_RX_DESC_10GE)/100);
+ dm->efd.consec_full_frames_hi_thresh =
+ DPDK_EFD_DEFAULT_CONSEC_FULL_FRAMES_HI_THRESH;
+
+ /* vhost-user coalesce-frames and coalesce-time defaults */
+ dm->vhost_coalesce_frames = 32;
+ dm->vhost_coalesce_time = 1e-3;
+
+ /* init CLI */
+ if ((error = vlib_call_init_function (vm, dpdk_cli_init)))
+ return error;
+
+ return error;
+}
+
+VLIB_INIT_FUNCTION (dpdk_init);
+
diff --git a/vnet/vnet/devices/dpdk/node.c b/vnet/vnet/devices/dpdk/node.c
new file mode 100644
index 00000000000..fde0eb23e14
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/node.c
@@ -0,0 +1,2010 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/xxhash.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/mpls-gre/packet.h>
+
+#include "dpdk_priv.h"
+
+#ifndef MAX
+#define MAX(a,b) ((a) < (b) ? (b) : (a))
+#endif
+
+#ifndef MIN
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
+/*
+ * At least in certain versions of ESXi, vmware e1000's don't honor the
+ * "strip rx CRC" bit. Set this flag to work around that bug FOR UNIT TEST ONLY.
+ *
+ * If wireshark complains like so:
+ *
+ * "Frame check sequence: 0x00000000 [incorrect, should be <hex-num>]"
+ * and you're using ESXi emulated e1000's, set this flag FOR UNIT TEST ONLY.
+ *
+ * Note: do NOT check in this file with this workaround enabled! You'll lose
+ * actual data from e.g. 10xGE interfaces. The extra 4 bytes annoy
+ * wireshark, but they're harmless...
+ */
+#define VMWARE_LENGTH_BUG_WORKAROUND 0
+
+typedef struct {
+ u32 cached_next_index;
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} handoff_dispatch_main_t;
+
+typedef struct {
+ u32 buffer_index;
+ u32 next_index;
+ u32 sw_if_index;
+} handoff_dispatch_trace_t;
+
+/* packet trace format function */
+static u8 * format_handoff_dispatch_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ handoff_dispatch_trace_t * t = va_arg (*args, handoff_dispatch_trace_t *);
+
+ s = format (s, "HANDOFF_DISPATCH: sw_if_index %d next_index %d buffer 0x%x",
+ t->sw_if_index,
+ t->next_index,
+ t->buffer_index);
+ return s;
+}
+
+handoff_dispatch_main_t handoff_dispatch_main;
+
+vlib_node_registration_t handoff_dispatch_node;
+
+#define foreach_handoff_dispatch_error \
+_(EXAMPLE, "example packets")
+
+typedef enum {
+#define _(sym,str) HANDOFF_DISPATCH_ERROR_##sym,
+ foreach_handoff_dispatch_error
+#undef _
+ HANDOFF_DISPATCH_N_ERROR,
+} handoff_dispatch_error_t;
+
+static char * handoff_dispatch_error_strings[] = {
+#define _(sym,string) string,
+ foreach_handoff_dispatch_error
+#undef _
+};
+
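+/*
+ * Publish a completed handoff-queue element. The memory barrier ensures the
+ * frame contents are visible to the consumer before valid is set.
+ */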
+static inline
+void vlib_put_handoff_queue_elt (vlib_frame_queue_elt_t * hf)
+{
+ CLIB_MEMORY_BARRIER();
+ hf->valid = 1;
+}
+
+static uword
+handoff_dispatch_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ dpdk_rx_next_t next_index;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ next0 = vnet_buffer(b0)->io_handoff.next_index;
+ next1 = vnet_buffer(b1)->io_handoff.next_index;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
+ handoff_dispatch_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ t->buffer_index = bi0;
+ }
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vlib_trace_buffer (vm, node, next1, b1, /* follow_chain */ 0);
+ handoff_dispatch_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ t->buffer_index = bi1;
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ next0 = vnet_buffer(b0)->io_handoff.next_index;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
+ handoff_dispatch_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ t->buffer_index = bi0;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (handoff_dispatch_node) = {
+ .function = handoff_dispatch_node_fn,
+ .name = "handoff-dispatch",
+ .vector_size = sizeof (u32),
+ .format_trace = format_handoff_dispatch_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+ .flags = VLIB_NODE_FLAG_IS_HANDOFF,
+
+ .n_errors = ARRAY_LEN(handoff_dispatch_error_strings),
+ .error_strings = handoff_dispatch_error_strings,
+
+ .n_next_nodes = DPDK_RX_N_NEXT,
+
+ .next_nodes = {
+ [DPDK_RX_NEXT_DROP] = "error-drop",
+ [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input",
+ [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
+ [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
+ },
+};
+
+clib_error_t *handoff_dispatch_init (vlib_main_t *vm)
+{
+ handoff_dispatch_main_t * mp = &handoff_dispatch_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = &vnet_main;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (handoff_dispatch_init);
+
+u32 dpdk_get_handoff_node_index (void)
+{
+ return handoff_dispatch_node.index;
+}
+
+static char * dpdk_error_strings[] = {
+#define _(n,s) s,
+ foreach_dpdk_error
+#undef _
+};
+
+typedef struct {
+ u32 buffer_index;
+ u16 device_index;
+ u16 queue_index;
+ struct rte_mbuf mb;
+ vlib_buffer_t buffer; /* Copy of VLIB buffer; pkt data stored in pre_data. */
+} dpdk_rx_dma_trace_t;
+
+static u8 * format_dpdk_rx_dma_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main();
+ dpdk_rx_dma_trace_t * t = va_arg (*va, dpdk_rx_dma_trace_t *);
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, t->device_index);
+ format_function_t * f;
+ uword indent = format_get_indent (s);
+ vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);
+
+ s = format (s, "%U rx queue %d",
+ format_vnet_sw_interface_name, vnm, sw,
+ t->queue_index);
+
+ s = format (s, "\n%Ubuffer 0x%x: %U",
+ format_white_space, indent,
+ t->buffer_index,
+ format_vlib_buffer, &t->buffer);
+
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ format_dpdk_rx_rte_mbuf, &t->mb);
+#else
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ format_dpdk_rte_mbuf, &t->mb);
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+ f = node->format_buffer;
+ if (!f)
+ f = format_hex_bytes;
+ s = format (s, "\n%U%U", format_white_space, indent,
+ f, t->buffer.pre_data, sizeof (t->buffer.pre_data));
+
+ return s;
+}
+
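+/*
+ * Map mbuf offload flags (and, on DPDK >= 2.1, packet_type) to a next node
+ * and error code. Packets with flagged errors go to error-drop; otherwise
+ * they are steered to the per-interface next node if one is set, or
+ * classified to the ethernet/ip4/ip6/mpls input nodes.
+ */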
+always_inline void
+dpdk_rx_next_and_error_from_mb_flags_x1 (dpdk_device_t *xd, struct rte_mbuf *mb,
+ vlib_buffer_t *b0,
+ u8 * next0, u8 * error0)
+{
+ u8 is0_ip4, is0_ip6, is0_mpls, n0;
+ uint16_t mb_flags = mb->ol_flags;
+
+ if (PREDICT_FALSE(mb_flags & (
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ PKT_EXT_RX_PKT_ERROR | PKT_EXT_RX_BAD_FCS |
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+ PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
+ )))
+ {
+ /* some error was flagged. determine the drop reason */
+ n0 = DPDK_RX_NEXT_DROP;
+ *error0 =
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ (mb_flags & PKT_EXT_RX_PKT_ERROR) ? DPDK_ERROR_RX_PACKET_ERROR :
+ (mb_flags & PKT_EXT_RX_BAD_FCS) ? DPDK_ERROR_RX_BAD_FCS :
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+ (mb_flags & PKT_RX_IP_CKSUM_BAD) ? DPDK_ERROR_IP_CHECKSUM_ERROR :
+ (mb_flags & PKT_RX_L4_CKSUM_BAD) ? DPDK_ERROR_L4_CHECKSUM_ERROR :
+ DPDK_ERROR_NONE;
+ }
+ else
+ {
+ *error0 = DPDK_ERROR_NONE;
+ if (xd->per_interface_next_index != ~0)
+ n0 = xd->per_interface_next_index;
+ else if (mb_flags & PKT_RX_VLAN_PKT)
+ n0 = DPDK_RX_NEXT_ETHERNET_INPUT;
+ else
+ {
+ n0 = DPDK_RX_NEXT_ETHERNET_INPUT;
+#if RTE_VERSION >= RTE_VERSION_NUM(2, 1, 0, 0)
+ is0_ip4 = (mb->packet_type & (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L3_IPV4_EXT)) != 0;
+#else
+ is0_ip4 = (mb_flags & (PKT_RX_IPV4_HDR | PKT_RX_IPV4_HDR_EXT)) != 0;
+#endif
+
+ if (PREDICT_TRUE(is0_ip4))
+ n0 = DPDK_RX_NEXT_IP4_INPUT;
+ else
+ {
+#if RTE_VERSION >= RTE_VERSION_NUM(2, 1, 0, 0)
+ is0_ip6 =
+ (mb->packet_type & (RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L3_IPV6_EXT)) != 0;
+#else
+ is0_ip6 =
+ (mb_flags & (PKT_RX_IPV6_HDR | PKT_RX_IPV6_HDR_EXT)) != 0;
+#endif
+ if (PREDICT_TRUE(is0_ip6))
+ n0 = DPDK_RX_NEXT_IP6_INPUT;
+ else
+ {
+ ethernet_header_t *h0 = (ethernet_header_t *) b0->data;
+ is0_mpls = (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST));
+ n0 = is0_mpls ? DPDK_RX_NEXT_MPLS_INPUT : n0;
+ }
+ }
+ }
+ }
+ *next0 = n0;
+}
+
+void dpdk_rx_trace (dpdk_main_t * dm,
+ vlib_node_runtime_t * node,
+ dpdk_device_t * xd,
+ u16 queue_id,
+ u32 * buffers,
+ uword n_buffers)
+{
+ vlib_main_t * vm = vlib_get_main();
+ u32 * b, n_left;
+ u8 next0;
+
+ n_left = n_buffers;
+ b = buffers;
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ dpdk_rx_dma_trace_t * t0;
+ struct rte_mbuf *mb;
+ u8 error0;
+
+ bi0 = b[0];
+ n_left -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ mb = ((struct rte_mbuf *)b0) - 1;
+ dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
+ &next0, &error0);
+ vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->queue_index = queue_id;
+ t0->device_index = xd->device_index;
+ t0->buffer_index = bi0;
+
+ memcpy (&t0->mb, mb, sizeof (t0->mb));
+ memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
+ memcpy (t0->buffer.pre_data, b0->data, sizeof (t0->buffer.pre_data));
+
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ /*
+ * Clear overloaded TX offload flags when a DPDK driver
+ * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
+ */
+ mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+
+ b += 1;
+ }
+}
+
+/*
+ * dpdk_efd_update_counters()
+ * Update EFD (early-fast-discard) counters
+ */
+void dpdk_efd_update_counters (dpdk_device_t *xd,
+ u32 n_buffers,
+ u16 enabled)
+{
+ if (enabled & DPDK_EFD_MONITOR_ENABLED)
+ {
+ u64 now = clib_cpu_time_now();
+ if (xd->efd_agent.last_poll_time > 0)
+ {
+ u64 elapsed_time = (now - xd->efd_agent.last_poll_time);
+ if (elapsed_time > xd->efd_agent.max_poll_delay)
+ xd->efd_agent.max_poll_delay = elapsed_time;
+ }
+ xd->efd_agent.last_poll_time = now;
+ }
+
+ xd->efd_agent.total_packet_cnt += n_buffers;
+ xd->efd_agent.last_burst_sz = n_buffers;
+
+ if (n_buffers > xd->efd_agent.max_burst_sz)
+ xd->efd_agent.max_burst_sz = n_buffers;
+
+ if (PREDICT_FALSE(n_buffers == VLIB_FRAME_SIZE))
+ {
+ xd->efd_agent.full_frames_cnt++;
+ xd->efd_agent.consec_full_frames_cnt++;
+ }
+ else
+ {
+ xd->efd_agent.consec_full_frames_cnt = 0;
+ }
+}
+
+/* is_efd_discardable()
+ * returns a non-zero DPDK error if the packet meets early-fast-discard criteria,
+ * zero otherwise
+ */
+u32 is_efd_discardable (vlib_thread_main_t *tm,
+ vlib_buffer_t * b0,
+ struct rte_mbuf *mb)
+{
+ ethernet_header_t *eh = (ethernet_header_t *) b0->data;
+
+ if (eh->type == clib_host_to_net_u16(ETHERNET_TYPE_IP4))
+ {
+ ip4_header_t *ipv4 =
+ (ip4_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
+ u8 pkt_prec = (ipv4->tos >> 5);
+
+ return (tm->efd.ip_prec_bitmap & (1 << pkt_prec) ?
+ DPDK_ERROR_IPV4_EFD_DROP_PKTS : DPDK_ERROR_NONE);
+ }
+ else if (eh->type == clib_net_to_host_u16(ETHERNET_TYPE_IP6))
+ {
+ ip6_header_t *ipv6 =
+ (ip6_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
+ u8 pkt_tclass =
+ ((ipv6->ip_version_traffic_class_and_flow_label >> 20) & 0xff);
+
+ return (tm->efd.ip_prec_bitmap & (1 << pkt_tclass) ?
+ DPDK_ERROR_IPV6_EFD_DROP_PKTS : DPDK_ERROR_NONE);
+ }
+ else if (eh->type == clib_net_to_host_u16(ETHERNET_TYPE_MPLS_UNICAST))
+ {
+ mpls_unicast_header_t *mpls =
+ (mpls_unicast_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
+ u8 pkt_exp = ((mpls->label_exp_s_ttl >> 9) & 0x07);
+
+ return (tm->efd.mpls_exp_bitmap & (1 << pkt_exp) ?
+ DPDK_ERROR_MPLS_EFD_DROP_PKTS : DPDK_ERROR_NONE);
+ }
+ else if ((eh->type == clib_net_to_host_u16(ETHERNET_TYPE_VLAN)) ||
+ (eh->type == clib_net_to_host_u16(ETHERNET_TYPE_DOT1AD)))
+ {
+ ethernet_vlan_header_t *vlan =
+ (ethernet_vlan_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
+ u8 pkt_cos = ((vlan->priority_cfi_and_id >> 13) & 0x07);
+
+ return (tm->efd.vlan_cos_bitmap & (1 << pkt_cos) ?
+ DPDK_ERROR_VLAN_EFD_DROP_PKTS : DPDK_ERROR_NONE);
+ }
+
+ return DPDK_ERROR_NONE;
+}
+
+/*
+ * This function is used when there are no worker threads.
+ * The main thread performs IO and forwards the packets.
+ */
+static inline u32 dpdk_device_input ( dpdk_main_t * dm,
+ dpdk_device_t * xd,
+ vlib_node_runtime_t * node,
+ u32 cpu_index,
+ u16 queue_id)
+{
+ u32 n_buffers;
+ u32 next_index = DPDK_RX_NEXT_ETHERNET_INPUT;
+ u32 n_left_to_next, * to_next;
+ u32 mb_index;
+ vlib_main_t * vm = vlib_get_main();
+ uword n_rx_bytes = 0;
+ u32 n_trace, trace_cnt __attribute__((unused));
+ vlib_buffer_free_list_t * fl;
+ u8 efd_discard_burst = 0;
+
+ if (xd->admin_up == 0)
+ return 0;
+
+ n_buffers = dpdk_rx_burst(dm, xd, queue_id);
+
+ if (n_buffers == 0)
+ {
+ /* check if EFD (dpdk) is enabled */
+ if (PREDICT_FALSE(dm->efd.enabled))
+ {
+ /* reset a few stats */
+ xd->efd_agent.last_poll_time = 0;
+ xd->efd_agent.last_burst_sz = 0;
+ }
+ return 0;
+ }
+
+ vec_reset_length (xd->d_trace_buffers);
+ trace_cnt = n_trace = vlib_get_trace_count (vm, node);
+
+ fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (PREDICT_FALSE(xd->admin_up != 1))
+ {
+ for (mb_index = 0; mb_index < n_buffers; mb_index++)
+ rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
+
+ return 0;
+ }
+
+ /* Check for congestion if EFD (Early-Fast-Discard) is enabled
+ * in any mode (e.g. dpdk, monitor, or drop_all)
+ */
+ if (PREDICT_FALSE(dm->efd.enabled))
+ {
+ /* update EFD counters */
+ dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);
+
+ if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED))
+ {
+ /* discard all received packets */
+ for (mb_index = 0; mb_index < n_buffers; mb_index++)
+ rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);
+
+ xd->efd_agent.discard_cnt += n_buffers;
+ increment_efd_drop_counter(vm,
+ DPDK_ERROR_VLAN_EFD_DROP_PKTS,
+ n_buffers);
+
+ return 0;
+ }
+
+ if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
+ dm->efd.consec_full_frames_hi_thresh))
+ {
+ u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
+ queue_id);
+ if (device_queue_sz >= dm->efd.queue_hi_thresh)
+ {
+ /* dpdk device queue has reached the critical threshold */
+ xd->efd_agent.congestion_cnt++;
+
+ /* apply EFD to packets from the burst */
+ efd_discard_burst = 1;
+ }
+ }
+ }
+
+ mb_index = 0;
+
+ while (n_buffers > 0)
+ {
+ u32 bi0;
+ u8 next0, error0;
+ u32 l3_offset0;
+ vlib_buffer_t * b0, * b_seg, * b_chain = 0;
+ u32 cntr_type;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_buffers > 0 && n_left_to_next > 0)
+ {
+ u8 nb_seg = 1;
+ struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
+ struct rte_mbuf *mb_seg = mb->next;
+
+ if (PREDICT_TRUE(n_buffers > 2))
+ {
+ struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
+ vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
+ CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ ASSERT(mb);
+
+ b0 = (vlib_buffer_t *)(mb+1);
+
+ /* check whether EFD is looking for packets to discard */
+ if (PREDICT_FALSE(efd_discard_burst))
+ {
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+
+ if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
+ {
+ rte_pktmbuf_free(mb);
+ xd->efd_agent.discard_cnt++;
+ increment_efd_drop_counter(vm,
+ cntr_type,
+ 1);
+ n_buffers--;
+ mb_index++;
+ continue;
+ }
+ }
+
+ /* Prefetch one next segment if it exists. */
+ if (PREDICT_FALSE(mb->nb_segs > 1))
+ {
+ struct rte_mbuf *pfmb = mb->next;
+ vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
+ CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
+ b_chain = b0;
+ }
+
+ vlib_buffer_init_for_free_list (b0, fl);
+ b0->clone_count = 0;
+
+ bi0 = vlib_get_buffer_index (vm, b0);
+
+ to_next[0] = bi0;
+ to_next++;
+ n_left_to_next--;
+
+ dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
+ &next0, &error0);
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ /*
+ * Clear overloaded TX offload flags when a DPDK driver
+ * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
+ */
+
+ if (PREDICT_TRUE(trace_cnt == 0))
+ mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
+ else
+ trace_cnt--;
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+
+ b0->error = node->errors[error0];
+
+ l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
+ next0 == DPDK_RX_NEXT_IP6_INPUT ||
+ next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
+ sizeof (ethernet_header_t) : 0);
+
+ b0->current_data = l3_offset0;
+ b0->current_length = mb->data_len - l3_offset0;
+ b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ if (VMWARE_LENGTH_BUG_WORKAROUND)
+ b0->current_length -= 4;
+
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
+ n_rx_bytes += mb->pkt_len;
+
+ /* Process subsequent segments of multi-segment packets */
+ while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
+ {
+ ASSERT(mb_seg != 0);
+
+ b_seg = (vlib_buffer_t *)(mb_seg+1);
+ vlib_buffer_init_for_free_list (b_seg, fl);
+ b_seg->clone_count = 0;
+
+ ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
+ ASSERT(b_seg->current_data == 0);
+
+ /*
+ * The driver (e.g. virtio) may not put the packet data at the start
+ * of the segment, so don't assume b_seg->current_data == 0 is correct.
+ */
+ b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;
+
+ b_seg->current_length = mb_seg->data_len;
+ b0->total_length_not_including_first_buffer +=
+ mb_seg->data_len;
+
+ b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
+
+ b_chain = b_seg;
+ mb_seg = mb_seg->next;
+ nb_seg++;
+ }
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ if (PREDICT_FALSE (n_trace > mb_index))
+ vec_add1 (xd->d_trace_buffers, bi0);
+ n_buffers--;
+ mb_index++;
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
+ {
+ dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers,
+ vec_len (xd->d_trace_buffers));
+ vlib_set_trace_count (vm, node, n_trace - vec_len (xd->d_trace_buffers));
+ }
+
+ vlib_increment_combined_counter
+ (vnet_get_main()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ xd->vlib_sw_if_index,
+ mb_index, n_rx_bytes);
+
+ dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
+ dw->aggregate_rx_packets += mb_index;
+
+ return mb_index;
+}
+
+#if VIRL > 0
+#define VIRL_SPEED_LIMIT() \
+ /* Limit the input rate to 1000 vectors / sec */ \
+ { \
+ struct timespec ts, tsrem; \
+ \
+ ts.tv_sec = 0; \
+ ts.tv_nsec = 1000*1000; /* 1ms */ \
+ \
+ while (nanosleep(&ts, &tsrem) < 0) \
+ { \
+ ts = tsrem; \
+ } \
+ }
+#else
+#define VIRL_SPEED_LIMIT()
+#endif
+
+
+static uword
+dpdk_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ uword n_rx_packets = 0;
+ dpdk_device_and_queue_t * dq;
+ u32 cpu_index = os_get_cpu_number();
+
+ /*
+ * Poll all devices on this cpu for input/interrupts.
+ */
+ vec_foreach (dq, dm->devices_by_cpu[cpu_index])
+ {
+ xd = vec_elt_at_index(dm->devices, dq->device);
+ ASSERT(dq->queue_id == 0);
+ n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, 0);
+ }
+
+ VIRL_SPEED_LIMIT()
+
+ return n_rx_packets;
+}
+
+uword
+dpdk_input_rss (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ uword n_rx_packets = 0;
+ dpdk_device_and_queue_t * dq;
+ u32 cpu_index = os_get_cpu_number();
+
+ /*
+ * Poll all devices on this cpu for input/interrupts.
+ */
+ vec_foreach (dq, dm->devices_by_cpu[cpu_index])
+ {
+ xd = vec_elt_at_index(dm->devices, dq->device);
+ n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id);
+ }
+
+ VIRL_SPEED_LIMIT()
+
+ return n_rx_packets;
+}
+
+VLIB_REGISTER_NODE (dpdk_input_node) = {
+ .function = dpdk_input,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .name = "dpdk-input",
+
+ /* Will be enabled if/when hardware is detected. */
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ .format_buffer = format_ethernet_header_with_length,
+ .format_trace = format_dpdk_rx_dma_trace,
+
+ .n_errors = DPDK_N_ERROR,
+ .error_strings = dpdk_error_strings,
+
+ .n_next_nodes = DPDK_RX_N_NEXT,
+ .next_nodes = {
+ [DPDK_RX_NEXT_DROP] = "error-drop",
+ [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
+ [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
+ [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
+ },
+};
+
+/*
+ * Override the next nodes for the dpdk input nodes.
+ * Must be invoked prior to VLIB_INIT_FUNCTION calls.
+ */
+void dpdk_set_next_node (dpdk_rx_next_t next, char *name)
+{
+ vlib_node_registration_t *r = &dpdk_input_node;
+ vlib_node_registration_t *r_io = &dpdk_io_input_node;
+ vlib_node_registration_t *r_handoff = &handoff_dispatch_node;
+
+ switch (next)
+ {
+ case DPDK_RX_NEXT_IP4_INPUT:
+ case DPDK_RX_NEXT_IP6_INPUT:
+ case DPDK_RX_NEXT_MPLS_INPUT:
+ case DPDK_RX_NEXT_ETHERNET_INPUT:
+ r->next_nodes[next] = name;
+ r_io->next_nodes[next] = name;
+ r_handoff->next_nodes[next] = name;
+ break;
+
+ default:
+ clib_warning ("%s: illegal next %d\n", __FUNCTION__, next);
+ break;
+ }
+}
+
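+/*
+ * Reserve the next slot in a worker's handoff frame queue: atomically
+ * advance the tail, then spin until the ring has space and the slot's
+ * previous contents have been consumed.
+ */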
+inline vlib_frame_queue_elt_t *
+vlib_get_handoff_queue_elt (u32 vlib_worker_index)
+{
+ vlib_frame_queue_t *fq;
+ vlib_frame_queue_elt_t *elt;
+ u64 new_tail;
+
+ fq = vlib_frame_queues[vlib_worker_index];
+ ASSERT (fq);
+
+ new_tail = __sync_add_and_fetch (&fq->tail, 1);
+
+ /* Wait until a ring slot is available */
+ while (new_tail >= fq->head_hint + fq->nelts)
+ vlib_worker_thread_barrier_check ();
+
+ elt = fq->elts + (new_tail & (fq->nelts-1));
+
+ /* this would be very bad... */
+ while (elt->valid)
+ ;
+
+ elt->msg_type = VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME;
+ elt->last_n_vectors = elt->n_vectors = 0;
+
+ return elt;
+}
+
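+/*
+ * Return the handoff element already in progress for the given worker,
+ * allocating a fresh ring slot on first use.
+ */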
+inline vlib_frame_queue_elt_t *
+dpdk_get_handoff_queue_elt (
+ u32 vlib_worker_index,
+ vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index)
+{
+ vlib_frame_queue_elt_t *elt;
+
+ if (handoff_queue_elt_by_worker_index [vlib_worker_index])
+ return handoff_queue_elt_by_worker_index [vlib_worker_index];
+
+ elt = vlib_get_handoff_queue_elt (vlib_worker_index);
+
+ handoff_queue_elt_by_worker_index [vlib_worker_index] = elt;
+
+ return elt;
+}
+
+static inline vlib_frame_queue_t *
+is_vlib_handoff_queue_congested (
+ u32 vlib_worker_index,
+ u32 queue_hi_thresh,
+ vlib_frame_queue_t ** handoff_queue_by_worker_index)
+{
+ vlib_frame_queue_t *fq;
+
+ fq = handoff_queue_by_worker_index [vlib_worker_index];
+ if (fq != (vlib_frame_queue_t *)(~0))
+ return fq;
+
+ fq = vlib_frame_queues[vlib_worker_index];
+ ASSERT (fq);
+
+ if (PREDICT_FALSE(fq->tail >= (fq->head_hint + queue_hi_thresh))) {
+ /* a valid entry in the array will indicate the queue has reached
+ * the specified threshold and is congested
+ */
+ handoff_queue_by_worker_index [vlib_worker_index] = fq;
+ fq->enqueue_full_events++;
+ return fq;
+ }
+
+ return NULL;
+}
+
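+/*
+ * Flow hash keys used to spread packets across worker threads: the
+ * source/destination addresses XORed with the IP protocol.
+ */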
+static inline u64 ipv4_get_key (ip4_header_t *ip)
+{
+ u64 hash_key;
+
+ hash_key = *((u64*)(&ip->address_pair)) ^ ip->protocol;
+
+ return hash_key;
+}
+
+static inline u64 ipv6_get_key (ip6_header_t *ip)
+{
+ u64 hash_key;
+
+ hash_key = ip->src_address.as_u64[0] ^
+ ip->src_address.as_u64[1] ^
+ ip->dst_address.as_u64[0] ^
+ ip->dst_address.as_u64[1] ^
+ ip->protocol;
+
+ return hash_key;
+}
+
+
+#define MPLS_BOTTOM_OF_STACK_BIT_MASK 0x00000100U
+#define MPLS_LABEL_MASK 0xFFFFF000U
+
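+/*
+ * Walk at most five labels looking for the bottom of the MPLS stack, then
+ * hash the inner IPv4/IPv6 header; fall back to hashing the label itself if
+ * no IP header follows or the bottom of stack is not found.
+ */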
+static inline u64 mpls_get_key (mpls_unicast_header_t *m)
+{
+ u64 hash_key;
+ u8 ip_ver;
+
+
+ /* find the bottom of the MPLS label stack. */
+ if (PREDICT_TRUE(m->label_exp_s_ttl &
+ clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK))) {
+ goto bottom_lbl_found;
+ }
+ m++;
+
+ if (PREDICT_TRUE(m->label_exp_s_ttl &
+ clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK))) {
+ goto bottom_lbl_found;
+ }
+ m++;
+
+ if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) {
+ goto bottom_lbl_found;
+ }
+ m++;
+
+ if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) {
+ goto bottom_lbl_found;
+ }
+ m++;
+
+ if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) {
+ goto bottom_lbl_found;
+ }
+
+ /* the bottom label was not found - use the last label */
+ hash_key = m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_LABEL_MASK);
+
+ return hash_key;
+
+
+bottom_lbl_found:
+ m++;
+ ip_ver = (*((u8 *)m) >> 4);
+
+ /* find out if it is IPV4 or IPV6 header */
+ if (PREDICT_TRUE(ip_ver == 4)) {
+ hash_key = ipv4_get_key((ip4_header_t *)m);
+ } else if (PREDICT_TRUE(ip_ver == 6)) {
+ hash_key = ipv6_get_key((ip6_header_t *)m);
+ } else {
+ /* use the bottom label */
+ hash_key = (m-1)->label_exp_s_ttl & clib_net_to_host_u32(MPLS_LABEL_MASK);
+ }
+
+ return hash_key;
+
+}
+
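+/*
+ * Compute the flow hash key from the ethernet payload, dispatching on
+ * ethertype and looking through at most two VLAN tags.
+ */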
+static inline u64 eth_get_key (ethernet_header_t *h0)
+{
+ u64 hash_key;
+
+
+ if (PREDICT_TRUE(h0->type == clib_host_to_net_u16(ETHERNET_TYPE_IP4))) {
+ hash_key = ipv4_get_key((ip4_header_t *)(h0+1));
+ } else if (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_IP6)) {
+ hash_key = ipv6_get_key((ip6_header_t *)(h0+1));
+ } else if (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST)) {
+ hash_key = mpls_get_key((mpls_unicast_header_t *)(h0+1));
+ } else if ((h0->type == clib_host_to_net_u16(ETHERNET_TYPE_VLAN)) ||
+ (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_DOT1AD))) {
+ ethernet_vlan_header_t * outer = (ethernet_vlan_header_t *)(h0 + 1);
+
+ outer = (outer->type == clib_host_to_net_u16(ETHERNET_TYPE_VLAN)) ?
+ outer+1 : outer;
+ if (PREDICT_TRUE(outer->type == clib_host_to_net_u16(ETHERNET_TYPE_IP4))) {
+ hash_key = ipv4_get_key((ip4_header_t *)(outer+1));
+ } else if (outer->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP6)) {
+ hash_key = ipv6_get_key((ip6_header_t *)(outer+1));
+ } else if (outer->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST)) {
+ hash_key = mpls_get_key((mpls_unicast_header_t *)(outer+1));
+ } else {
+ hash_key = outer->type;
+ }
+ } else {
+ hash_key = 0;
+ }
+
+ return hash_key;
+}
+
+/*
+ * This function is used when dedicated IO threads feed the worker threads.
+ *
+ * Devices are allocated to this thread based on instances and instance_id.
+ * If instances==0 then the function automatically determines the number
+ * of instances of this thread, and allocates devices between them.
+ * If instances != 0, then instance_id must be in the range 0..instances-1.
+ * The function allocates devices among the specified number of instances,
+ * with this thread having the given instance id. This option is used for
+ * splitting devices among differently named "io"-type threads.
+ */
+void dpdk_io_thread (vlib_worker_thread_t * w,
+ u32 instances,
+ u32 instance_id,
+ char *worker_name,
+ dpdk_io_thread_callback_t callback)
+{
+ vlib_main_t * vm = vlib_get_main();
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ vlib_thread_registration_t * tr;
+ dpdk_main_t * dm = &dpdk_main;
+ char *io_name = w->registration->name;
+ dpdk_device_t * xd;
+ dpdk_device_t ** my_devices = 0;
+ vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index = 0;
+ vlib_frame_queue_t ** congested_handoff_queue_by_worker_index = 0;
+ vlib_frame_queue_elt_t * hf = 0;
+ int i;
+ u32 n_left_to_next_worker = 0, * to_next_worker = 0;
+ u32 next_worker_index = 0;
+ u32 current_worker_index = ~0;
+ u32 cpu_index = os_get_cpu_number();
+ u32 num_workers = 0;
+ u32 num_devices = 0;
+ uword * p;
+ u16 queue_id = 0;
+ vlib_node_runtime_t * node_trace;
+ u32 first_worker_index = 0;
+
+ /* Wait until the dpdk init sequence is complete */
+ while (dm->io_thread_release == 0)
+ vlib_worker_thread_barrier_check();
+
+ clib_time_init (&vm->clib_time);
+
+ p = hash_get_mem (tm->thread_registrations_by_name, worker_name);
+ ASSERT (p);
+ tr = (vlib_thread_registration_t *) p[0];
+ if (tr)
+ {
+ num_workers = tr->count;
+ first_worker_index = tr->first_index;
+ }
+
+ /* Allocate devices to this thread */
+ if (instances == 0)
+ {
+ /* auto-assign */
+ instance_id = w->instance_id;
+
+ p = hash_get_mem (tm->thread_registrations_by_name, io_name);
+ tr = (vlib_thread_registration_t *) p[0];
+ /* Otherwise, how did we get here */
+ ASSERT (tr && tr->count);
+ instances = tr->count;
+ }
+ else
+ {
+ /* manually assign */
+ ASSERT (instance_id < instances);
+ }
+
+ vec_validate (handoff_queue_elt_by_worker_index,
+ first_worker_index + num_workers - 1);
+
+ vec_validate_init_empty (congested_handoff_queue_by_worker_index,
+ first_worker_index + num_workers - 1,
+ (vlib_frame_queue_t *)(~0));
+
+ /* packet tracing is triggered on the dpdk-input node for ease-of-use */
+ node_trace = vlib_node_get_runtime (vm, dpdk_input_node.index);
+
+ /* And handle them... */
+ while (1)
+ {
+ u32 n_buffers;
+ u32 mb_index;
+ uword n_rx_bytes = 0;
+ u32 n_trace, trace_cnt __attribute__((unused));
+ vlib_buffer_free_list_t * fl;
+ u32 hash;
+ u64 hash_key;
+ u8 efd_discard_burst;
+
+ vlib_worker_thread_barrier_check ();
+
+ /* Invoke callback if supplied */
+ if (PREDICT_FALSE(callback != NULL))
+ callback(vm);
+
+ if (PREDICT_FALSE(vec_len(dm->devices) != num_devices))
+ {
+ vec_reset_length(my_devices);
+ vec_foreach (xd, dm->devices)
+ {
+ if (((xd - dm->devices) % tr->count) == instance_id)
+ {
+ fprintf(stderr, "i/o thread %d (cpu %d) takes port %d\n",
+ instance_id, (int) os_get_cpu_number(), (int) (xd - dm->devices));
+ vec_add1 (my_devices, xd);
+ }
+ }
+ num_devices = vec_len(dm->devices);
+ }
+
+ for (i = 0; i < vec_len (my_devices); i++)
+ {
+ xd = my_devices[i];
+
+ if (!xd->admin_up)
+ continue;
+
+ n_buffers = dpdk_rx_burst(dm, xd, 0 /* queue_id */);
+
+ if (n_buffers == 0)
+ {
+ /* check if EFD (dpdk) is enabled */
+ if (PREDICT_FALSE(dm->efd.enabled))
+ {
+ /* reset a few stats */
+ xd->efd_agent.last_poll_time = 0;
+ xd->efd_agent.last_burst_sz = 0;
+ }
+ continue;
+ }
+
+ vec_reset_length (xd->d_trace_buffers);
+ trace_cnt = n_trace = vlib_get_trace_count (vm, node_trace);
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (PREDICT_FALSE(xd->admin_up != 1))
+ {
+ for (mb_index = 0; mb_index < n_buffers; mb_index++)
+ rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
+ continue;
+ }
+
+ /* reset EFD action for the burst */
+ efd_discard_burst = 0;
+
+ /* Check for congestion if EFD (Early-Fast-Discard) is enabled
+ * in any mode (e.g. dpdk, monitor, or drop_all)
+ */
+ if (PREDICT_FALSE(dm->efd.enabled))
+ {
+ /* update EFD counters */
+ dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);
+
+ if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED))
+ {
+ /* drop all received packets */
+ for (mb_index = 0; mb_index < n_buffers; mb_index++)
+ rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);
+
+ xd->efd_agent.discard_cnt += n_buffers;
+ increment_efd_drop_counter(vm,
+ DPDK_ERROR_VLAN_EFD_DROP_PKTS,
+ n_buffers);
+
+ continue;
+ }
+
+ if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
+ dm->efd.consec_full_frames_hi_thresh))
+ {
+ u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
+ queue_id);
+ if (device_queue_sz >= dm->efd.queue_hi_thresh)
+ {
+ /* dpdk device queue has reached the critical threshold */
+ xd->efd_agent.congestion_cnt++;
+
+ /* apply EFD to packets from the burst */
+ efd_discard_burst = 1;
+ }
+ }
+ }
+
+ fl = vlib_buffer_get_free_list
+ (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ mb_index = 0;
+
+ while (n_buffers > 0)
+ {
+ u32 bi0;
+ u8 next0, error0;
+ u32 l3_offset0;
+ vlib_buffer_t * b0, * b_seg, * b_chain = 0;
+ ethernet_header_t * h0;
+ u8 nb_seg = 1;
+ struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
+ struct rte_mbuf *mb_seg = mb->next;
+
+ if (PREDICT_TRUE(n_buffers > 1))
+ {
+ struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
+ vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
+ CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (bp->data, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ b0 = (vlib_buffer_t *)(mb+1);
+
+ /* check whether EFD is looking for packets to discard */
+ if (PREDICT_FALSE(efd_discard_burst))
+ {
+ u32 cntr_type;
+ if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
+ {
+ rte_pktmbuf_free(mb);
+ xd->efd_agent.discard_cnt++;
+ increment_efd_drop_counter(vm,
+ cntr_type,
+ 1);
+
+ n_buffers--;
+ mb_index++;
+ continue;
+ }
+ }
+
+ /* Prefetch one next segment if it exists */
+ if (PREDICT_FALSE(mb->nb_segs > 1))
+ {
+ struct rte_mbuf *pfmb = mb->next;
+ vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
+ CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
+ b_chain = b0;
+ }
+
+ bi0 = vlib_get_buffer_index (vm, b0);
+ vlib_buffer_init_for_free_list (b0, fl);
+ b0->clone_count = 0;
+
+ dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
+ &next0, &error0);
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ /*
+ * Clear overloaded TX offload flags when a DPDK driver
+ * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
+ */
+ if (PREDICT_TRUE(trace_cnt == 0))
+ mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
+ else
+ trace_cnt--;
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+
+ if (error0)
+ clib_warning ("bi %d error %d", bi0, error0);
+
+ b0->error = 0;
+
+ l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
+ next0 == DPDK_RX_NEXT_IP6_INPUT ||
+ next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
+ sizeof (ethernet_header_t) : 0);
+
+ b0->current_data = l3_offset0;
+ b0->current_length = mb->data_len - l3_offset0;
+
+ b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ if (VMWARE_LENGTH_BUG_WORKAROUND)
+ b0->current_length -= 4;
+
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
+ vnet_buffer(b0)->io_handoff.next_index = next0;
+ n_rx_bytes += mb->pkt_len;
+
+ /* Process subsequent segments of multi-segment packets */
+ while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
+ {
+ ASSERT(mb_seg != 0);
+
+ b_seg = (vlib_buffer_t *)(mb_seg+1);
+ vlib_buffer_init_for_free_list (b_seg, fl);
+ b_seg->clone_count = 0;
+
+ ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
+ ASSERT(b_seg->current_data == 0);
+
+ /*
+ * The driver (e.g. virtio) may not put the packet data at the start
+ * of the segment, so don't assume b_seg->current_data == 0 is correct.
+ */
+ b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;
+
+ b_seg->current_length = mb_seg->data_len;
+ b0->total_length_not_including_first_buffer +=
+ mb_seg->data_len;
+
+ b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
+
+ b_chain = b_seg;
+ mb_seg = mb_seg->next;
+ nb_seg++;
+ }
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);
+
+ if (PREDICT_FALSE (n_trace > mb_index))
+ vec_add1 (xd->d_trace_buffers, bi0);
+
+ next_worker_index = first_worker_index;
+
+ /*
+ * Force unknown traffic onto worker 0,
+ * and into ethernet-input. $$$$ add more hashes.
+ */
+ h0 = (ethernet_header_t *) b0->data;
+
+ /* Compute ingress LB hash */
+ hash_key = eth_get_key(h0);
+ hash = (u32)clib_xxhash(hash_key);
+
+ if (PREDICT_TRUE (is_pow2(num_workers)))
+ next_worker_index += hash & (num_workers - 1);
+ else
+ next_worker_index += hash % num_workers;
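+          /*
+           * Worked example (illustrative): with 4 workers whose first
+           * thread index is 1, a hash of 0x2b selects worker
+           * 1 + (0x2b & 3) = 4.
+           */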
+
+ /* if EFD is enabled and not already discarding from dpdk,
+ * check the worker ring/queue for congestion
+ */
+ if (PREDICT_FALSE(tm->efd.enabled && !efd_discard_burst))
+ {
+ vlib_frame_queue_t *fq;
+
+ /* fq will be valid if the ring is congested */
+ fq = is_vlib_handoff_queue_congested(
+ next_worker_index, tm->efd.queue_hi_thresh,
+ congested_handoff_queue_by_worker_index);
+
+ if (PREDICT_FALSE(fq != NULL))
+ {
+ u32 cntr_type;
+ if (PREDICT_TRUE(cntr_type =
+ is_efd_discardable(tm, b0, mb)))
+ {
+ /* discard the packet */
+ fq->enqueue_efd_discards++;
+ increment_efd_drop_counter(vm, cntr_type, 1);
+ rte_pktmbuf_free(mb);
+ n_buffers--;
+ mb_index++;
+ continue;
+ }
+ }
+ }
+
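+          /*
+           * Handoff frame elements are cached per worker: a new element is
+           * fetched only when the target worker changes, and a full element
+           * (VLIB_FRAME_SIZE buffers) is shipped immediately.
+           */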
+ if (next_worker_index != current_worker_index)
+ {
+ if (hf)
+ hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
+
+ hf = dpdk_get_handoff_queue_elt(
+ next_worker_index,
+ handoff_queue_elt_by_worker_index);
+
+ n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors;
+ to_next_worker = &hf->buffer_index[hf->n_vectors];
+ current_worker_index = next_worker_index;
+ }
+
+ /* enqueue to correct worker thread */
+ to_next_worker[0] = bi0;
+ to_next_worker++;
+ n_left_to_next_worker--;
+
+ if (n_left_to_next_worker == 0)
+ {
+ hf->n_vectors = VLIB_FRAME_SIZE;
+ vlib_put_handoff_queue_elt(hf);
+ current_worker_index = ~0;
+ handoff_queue_elt_by_worker_index[next_worker_index] = 0;
+ hf = 0;
+ }
+
+ n_buffers--;
+ mb_index++;
+ }
+
+ if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
+ {
+ /* credit the trace to the trace node */
+ dpdk_rx_trace (dm, node_trace, xd, queue_id, xd->d_trace_buffers,
+ vec_len (xd->d_trace_buffers));
+ vlib_set_trace_count (vm, node_trace, n_trace - vec_len (xd->d_trace_buffers));
+ }
+
+ vlib_increment_combined_counter
+ (vnet_get_main()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ xd->vlib_sw_if_index,
+ mb_index, n_rx_bytes);
+
+ dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
+ dw->aggregate_rx_packets += mb_index;
+ }
+
+ if (hf)
+ hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
+
+ /* Ship frames to the worker nodes */
+ for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++)
+ {
+ if (handoff_queue_elt_by_worker_index[i])
+ {
+ hf = handoff_queue_elt_by_worker_index[i];
+ /*
+ * It works better to let the handoff node
+ * rate-adapt, always ship the handoff queue element.
+ */
+ if (1 || hf->n_vectors == hf->last_n_vectors)
+ {
+ vlib_put_handoff_queue_elt(hf);
+ handoff_queue_elt_by_worker_index[i] = 0;
+ }
+ else
+ hf->last_n_vectors = hf->n_vectors;
+ }
+ congested_handoff_queue_by_worker_index[i] = (vlib_frame_queue_t *)(~0);
+ }
+ hf = 0;
+ current_worker_index = ~0;
+
+ vlib_increment_main_loop_counter (vm);
+ }
+}
+
+/*
+ * This function is used when the main thread performs IO and feeds the
+ * worker threads.
+ */
+static uword
+dpdk_io_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ uword n_rx_packets = 0;
+ static vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index;
+ static vlib_frame_queue_t ** congested_handoff_queue_by_worker_index = 0;
+ vlib_frame_queue_elt_t * hf = 0;
+ int i;
+ u32 n_left_to_next_worker = 0, * to_next_worker = 0;
+ u32 next_worker_index = 0;
+ u32 current_worker_index = ~0;
+ u32 cpu_index = os_get_cpu_number();
+ static int num_workers_set;
+ static u32 num_workers;
+ u16 queue_id = 0;
+ vlib_node_runtime_t * node_trace;
+ static u32 first_worker_index;
+
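+  /*
+   * Worker bookkeeping and handoff queues live in function-scope statics:
+   * this node is used when the main thread performs IO, so they are
+   * initialized once on the first call and reused thereafter.
+   */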
+ if (PREDICT_FALSE(num_workers_set == 0))
+ {
+ uword * p;
+ vlib_thread_registration_t * tr;
+ /* Only the standard vnet worker threads are supported */
+ p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ tr = (vlib_thread_registration_t *) p[0];
+ if (tr)
+ {
+ num_workers = tr->count;
+ first_worker_index = tr->first_index;
+ }
+ num_workers_set = 1;
+ }
+
+ if (PREDICT_FALSE(handoff_queue_elt_by_worker_index == 0))
+ {
+ vec_validate (handoff_queue_elt_by_worker_index, tm->n_vlib_mains - 1);
+
+ vec_validate_init_empty (congested_handoff_queue_by_worker_index,
+ first_worker_index + num_workers - 1,
+ (vlib_frame_queue_t *)(~0));
+ }
+
+ /* packet tracing is triggered on the dpdk-input node for ease-of-use */
+ node_trace = vlib_node_get_runtime (vm, dpdk_input_node.index);
+
+ vec_foreach (xd, dm->devices)
+ {
+ u32 n_buffers;
+ u32 mb_index;
+ uword n_rx_bytes = 0;
+ u32 n_trace, trace_cnt __attribute__((unused));
+ vlib_buffer_free_list_t * fl;
+ u32 hash;
+ u64 hash_key;
+ u8 efd_discard_burst = 0;
+
+ if (!xd->admin_up)
+ continue;
+
+ n_buffers = dpdk_rx_burst(dm, xd, queue_id );
+
+ if (n_buffers == 0)
+ {
+ /* check if EFD (dpdk) is enabled */
+ if (PREDICT_FALSE(dm->efd.enabled))
+ {
+ /* reset a few stats */
+ xd->efd_agent.last_poll_time = 0;
+ xd->efd_agent.last_burst_sz = 0;
+ }
+ continue;
+ }
+
+ vec_reset_length (xd->d_trace_buffers);
+ trace_cnt = n_trace = vlib_get_trace_count (vm, node_trace);
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (PREDICT_FALSE(xd->admin_up != 1))
+ {
+ for (mb_index = 0; mb_index < n_buffers; mb_index++)
+ rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
+ continue;
+ }
+
+ /* Check for congestion if EFD (Early-Fast-Discard) is enabled
+ * in any mode (e.g. dpdk, monitor, or drop_all)
+ */
+ if (PREDICT_FALSE(dm->efd.enabled))
+ {
+ /* update EFD counters */
+ dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);
+
+ if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED))
+ {
+ /* discard all received packets */
+ for (mb_index = 0; mb_index < n_buffers; mb_index++)
+ rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);
+
+ xd->efd_agent.discard_cnt += n_buffers;
+ increment_efd_drop_counter(vm,
+ DPDK_ERROR_VLAN_EFD_DROP_PKTS,
+ n_buffers);
+
+ continue;
+ }
+
+ if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
+ dm->efd.consec_full_frames_hi_thresh))
+ {
+ u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
+ queue_id);
+ if (device_queue_sz >= dm->efd.queue_hi_thresh)
+ {
+ /* dpdk device queue has reached the critical threshold */
+ xd->efd_agent.congestion_cnt++;
+
+ /* apply EFD to packets from the burst */
+ efd_discard_burst = 1;
+ }
+ }
+ }
+
+ fl = vlib_buffer_get_free_list
+ (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ mb_index = 0;
+
+ while (n_buffers > 0)
+ {
+ u32 bi0;
+ u8 next0, error0;
+ u32 l3_offset0;
+ vlib_buffer_t * b0, * b_seg, * b_chain = 0;
+ ethernet_header_t * h0;
+ u8 nb_seg = 1;
+ struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
+ struct rte_mbuf *mb_seg = mb->next;
+
+ if (PREDICT_TRUE(n_buffers > 1))
+ {
+ struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
+ vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
+ CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (bp->data, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ b0 = (vlib_buffer_t *)(mb+1);
+
+ /* check whether EFD is looking for packets to discard */
+ if (PREDICT_FALSE(efd_discard_burst))
+ {
+ u32 cntr_type;
+ if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
+ {
+ rte_pktmbuf_free(mb);
+ xd->efd_agent.discard_cnt++;
+ increment_efd_drop_counter(vm,
+ cntr_type,
+ 1);
+
+ n_buffers--;
+ mb_index++;
+ continue;
+ }
+ }
+
+ /* Prefetch one next segment if it exists */
+ if (PREDICT_FALSE(mb->nb_segs > 1))
+ {
+ struct rte_mbuf *pfmb = mb->next;
+ vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
+ CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
+ b_chain = b0;
+ }
+
+ bi0 = vlib_get_buffer_index (vm, b0);
+ vlib_buffer_init_for_free_list (b0, fl);
+ b0->clone_count = 0;
+
+ dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
+ &next0, &error0);
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ /*
+ * Clear overloaded TX offload flags when a DPDK driver
+ * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
+ */
+ if (PREDICT_TRUE(trace_cnt == 0))
+ mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
+ else
+ trace_cnt--;
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+
+ if (error0)
+ clib_warning ("bi %d error %d", bi0, error0);
+
+ b0->error = 0;
+
+ l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
+ next0 == DPDK_RX_NEXT_IP6_INPUT ||
+ next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
+ sizeof (ethernet_header_t) : 0);
+
+ b0->current_data = l3_offset0;
+ b0->current_length = mb->data_len - l3_offset0;
+
+ b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ if (VMWARE_LENGTH_BUG_WORKAROUND)
+ b0->current_length -= 4;
+
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
+ vnet_buffer(b0)->io_handoff.next_index = next0;
+ n_rx_bytes += mb->pkt_len;
+
+ /* Process subsequent segments of multi-segment packets */
+ while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
+ {
+ ASSERT(mb_seg != 0);
+
+ b_seg = (vlib_buffer_t *)(mb_seg+1);
+ vlib_buffer_init_for_free_list (b_seg, fl);
+ b_seg->clone_count = 0;
+
+ ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
+ ASSERT(b_seg->current_data == 0);
+
+ /*
+ * The driver (e.g. virtio) may not put the packet data at the start
+ * of the segment, so don't assume b_seg->current_data == 0 is correct.
+ */
+ b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;
+
+ b_seg->current_length = mb_seg->data_len;
+ b0->total_length_not_including_first_buffer +=
+ mb_seg->data_len;
+
+ b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
+
+ b_chain = b_seg;
+ mb_seg = mb_seg->next;
+ nb_seg++;
+ }
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);
+
+ if (PREDICT_FALSE (n_trace > mb_index))
+ vec_add1 (xd->d_trace_buffers, bi0);
+
+ next_worker_index = first_worker_index;
+
+ /*
+ * Force unknown traffic onto worker 0,
+ * and into ethernet-input. $$$$ add more hashes.
+ */
+ h0 = (ethernet_header_t *) b0->data;
+
+ /* Compute ingress LB hash */
+ hash_key = eth_get_key(h0);
+ hash = (u32)clib_xxhash(hash_key);
+
+ if (PREDICT_TRUE (is_pow2(num_workers)))
+ next_worker_index += hash & (num_workers - 1);
+ else
+ next_worker_index += hash % num_workers;
+
+ /* if EFD is enabled and not already discarding from dpdk,
+ * check the worker ring/queue for congestion
+ */
+ if (PREDICT_FALSE(tm->efd.enabled && !efd_discard_burst))
+ {
+ vlib_frame_queue_t *fq;
+
+ /* fq will be valid if the ring is congested */
+ fq = is_vlib_handoff_queue_congested(
+ next_worker_index, tm->efd.queue_hi_thresh,
+ congested_handoff_queue_by_worker_index);
+
+ if (PREDICT_FALSE(fq != NULL))
+ {
+ u32 cntr_type;
+ if (PREDICT_TRUE(cntr_type =
+ is_efd_discardable(tm, b0, mb)))
+ {
+ /* discard the packet */
+ fq->enqueue_efd_discards++;
+ increment_efd_drop_counter(vm, cntr_type, 1);
+ rte_pktmbuf_free(mb);
+ n_buffers--;
+ mb_index++;
+ continue;
+ }
+ }
+ }
+
+ if (next_worker_index != current_worker_index)
+ {
+ if (hf)
+ hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
+
+ hf = dpdk_get_handoff_queue_elt(
+ next_worker_index,
+ handoff_queue_elt_by_worker_index);
+
+ n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors;
+ to_next_worker = &hf->buffer_index[hf->n_vectors];
+ current_worker_index = next_worker_index;
+ }
+
+ /* enqueue to correct worker thread */
+ to_next_worker[0] = bi0;
+ to_next_worker++;
+ n_left_to_next_worker--;
+
+ if (n_left_to_next_worker == 0)
+ {
+ hf->n_vectors = VLIB_FRAME_SIZE;
+ vlib_put_handoff_queue_elt(hf);
+ current_worker_index = ~0;
+ handoff_queue_elt_by_worker_index[next_worker_index] = 0;
+ hf = 0;
+ }
+
+ n_buffers--;
+ mb_index++;
+ }
+
+ if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
+ {
+ /* credit the trace to the trace node */
+ dpdk_rx_trace (dm, node_trace, xd, queue_id, xd->d_trace_buffers,
+ vec_len (xd->d_trace_buffers));
+ vlib_set_trace_count (vm, node_trace, n_trace - vec_len (xd->d_trace_buffers));
+ }
+
+ vlib_increment_combined_counter
+ (vnet_get_main()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ xd->vlib_sw_if_index,
+ mb_index, n_rx_bytes);
+
+ dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
+ dw->aggregate_rx_packets += mb_index;
+ n_rx_packets += mb_index;
+ }
+
+ if (hf)
+ hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
+
+ /* Ship frames to the worker nodes */
+ for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++)
+ {
+ if (handoff_queue_elt_by_worker_index[i])
+ {
+ hf = handoff_queue_elt_by_worker_index[i];
+ /*
+ * It works better to let the handoff node
+ * rate-adapt, always ship the handoff queue element.
+ */
+ if (1 || hf->n_vectors == hf->last_n_vectors)
+ {
+ vlib_put_handoff_queue_elt(hf);
+ handoff_queue_elt_by_worker_index[i] = 0;
+ }
+ else
+ hf->last_n_vectors = hf->n_vectors;
+ }
+ congested_handoff_queue_by_worker_index[i] = (vlib_frame_queue_t *)(~0);
+ }
+ hf = 0;
+ current_worker_index = ~0;
+ return n_rx_packets;
+}
+
+VLIB_REGISTER_NODE (dpdk_io_input_node) = {
+ .function = dpdk_io_input,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .name = "dpdk-io-input",
+
+ /* Will be enabled if/when hardware is detected. */
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ .format_buffer = format_ethernet_header_with_length,
+ .format_trace = format_dpdk_rx_dma_trace,
+
+ .n_errors = DPDK_N_ERROR,
+ .error_strings = dpdk_error_strings,
+
+ .n_next_nodes = DPDK_RX_N_NEXT,
+ .next_nodes = {
+ [DPDK_RX_NEXT_DROP] = "error-drop",
+ [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
+ [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
+ [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
+ },
+};
+
+/*
+ * set_efd_bitmap()
+ * Based on the operation type, set lower/upper bits for the given index value
+ */
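+/*
+ * Example (illustrative): value 3 with EFD_OPERATION_GREATER_OR_EQUAL sets
+ * bits 3..7 (bitmap 0xf8); with EFD_OPERATION_LESS_THAN it sets bits 0..2
+ * (bitmap 0x07).
+ */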
+void
+set_efd_bitmap (u8 *bitmap, u32 value, u32 op)
+{
+ int ix;
+
+ *bitmap = 0;
+ for (ix = 0; ix < 8; ix++) {
+ if (((op == EFD_OPERATION_LESS_THAN) && (ix < value)) ||
+ ((op == EFD_OPERATION_GREATER_OR_EQUAL) && (ix >= value))){
+ (*bitmap) |= (1 << ix);
+ }
+ }
+}
+
+void
+efd_config (u32 enabled,
+ u32 ip_prec, u32 ip_op,
+ u32 mpls_exp, u32 mpls_op,
+ u32 vlan_cos, u32 vlan_op)
+{
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ dpdk_main_t * dm = &dpdk_main;
+
+ if (enabled) {
+ tm->efd.enabled |= VLIB_EFD_DISCARD_ENABLED;
+ dm->efd.enabled |= DPDK_EFD_DISCARD_ENABLED;
+ } else {
+ tm->efd.enabled &= ~VLIB_EFD_DISCARD_ENABLED;
+ dm->efd.enabled &= ~DPDK_EFD_DISCARD_ENABLED;
+ }
+
+ set_efd_bitmap(&tm->efd.ip_prec_bitmap, ip_prec, ip_op);
+ set_efd_bitmap(&tm->efd.mpls_exp_bitmap, mpls_exp, mpls_op);
+ set_efd_bitmap(&tm->efd.vlan_cos_bitmap, vlan_cos, vlan_op);
+
+}
diff --git a/vnet/vnet/devices/dpdk/threads.c b/vnet/vnet/devices/dpdk/threads.c
new file mode 100644
index 00000000000..aa32f1007c3
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/threads.c
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <signal.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include <vlibmemory/api.h>
+#include <vlibmemory/vl_memory_msg_enum.h> /* enumerate all vlib messages */
+
+#define vl_typedefs /* define message structures */
+#include <vlibmemory/vl_memory_api_h.h>
+#undef vl_typedefs
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__)
+#define vl_printfun
+#include <vlibmemory/vl_memory_api_h.h>
+#undef vl_printfun
+
+vlib_thread_main_t vlib_thread_main;
+
+frame_queue_trace_t *frame_queue_traces;
+
+/*
+ * Check the frame queue to see if any frames are available.
+ * If so, pull the packets off the frames and put them to
+ * the handoff node.
+ */
+static inline int vlib_frame_queue_dequeue_internal (vlib_main_t *vm)
+{
+ u32 thread_id = vm->cpu_index;
+ vlib_frame_queue_t *fq = vlib_frame_queues[thread_id];
+ vlib_frame_queue_elt_t *elt;
+ u32 * from, * to;
+ vlib_frame_t * f;
+ int msg_type;
+ int processed = 0;
+ u32 n_left_to_node;
+ u32 vectors = 0;
+
+ ASSERT (fq);
+ ASSERT(vm == vlib_mains[thread_id]);
+
+ /*
+ * Gather trace data for frame queues
+ */
+ if (PREDICT_FALSE(fq->trace))
+ {
+ frame_queue_trace_t *fqt;
+ u32 elix;
+
+ fqt = &frame_queue_traces[thread_id];
+ fqt->nelts = fq->nelts;
+ fqt->head = fq->head;
+ fqt->head_hint = fq->head_hint;
+ fqt->tail = fq->tail;
+ fqt->threshold = fq->vector_threshold;
+ fqt->n_in_use = fqt->tail - fqt->head;
+ if (fqt->n_in_use > fqt->nelts){
+ fqt->n_in_use = 0;
+ }
+
+ for (elix=0; elix<fqt->nelts; elix++) {
+ elt = fq->elts + ((fq->head+1 + elix) & (fq->nelts-1));
+ if (1 || elt->valid)
+ {
+ fqt->n_vectors[elix] = elt->n_vectors;
+ }
+ }
+ fqt->written = 1;
+ }
+
+ while (1)
+ {
+ if (fq->head == fq->tail)
+ {
+ fq->head_hint = fq->head;
+ return processed;
+ }
+
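+      /* The ring size (nelts) is assumed to be a power of two, so masking
+       * with (nelts - 1) wraps the index; head + 1 is the next element to
+       * consume. */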
+ elt = fq->elts + ((fq->head+1) & (fq->nelts-1));
+
+ if (!elt->valid)
+ {
+ fq->head_hint = fq->head;
+ return processed;
+ }
+
+ from = elt->buffer_index;
+ msg_type = elt->msg_type;
+
+ ASSERT (msg_type == VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME);
+ ASSERT (elt->n_vectors <= VLIB_FRAME_SIZE);
+
+ f = vlib_get_frame_to_node
+ (vm, 1 ? handoff_dispatch_node.index : ethernet_input_node.index);
+
+ to = vlib_frame_vector_args (f);
+
+ n_left_to_node = elt->n_vectors;
+
+ while (n_left_to_node >= 4)
+ {
+ to[0] = from[0];
+ to[1] = from[1];
+ to[2] = from[2];
+ to[3] = from[3];
+ to += 4;
+ from += 4;
+ n_left_to_node -= 4;
+ }
+
+ while (n_left_to_node > 0)
+ {
+ to[0] = from[0];
+ to++;
+ from++;
+ n_left_to_node--;
+ }
+
+ vectors += elt->n_vectors;
+ f->n_vectors = elt->n_vectors;
+ vlib_put_frame_to_node
+ (vm, 1 ? handoff_dispatch_node.index : ethernet_input_node.index, f);
+
+ elt->valid = 0;
+ elt->n_vectors = 0;
+ elt->msg_type = 0xfefefefe;
+ CLIB_MEMORY_BARRIER();
+ fq->head++;
+ processed++;
+
+ /*
+ * Limit the number of packets pushed into the graph
+ */
+ if (vectors >= fq->vector_threshold)
+ {
+ fq->head_hint = fq->head;
+ return processed;
+ }
+ }
+ ASSERT(0);
+ return processed;
+}
+
+int dpdk_frame_queue_dequeue (vlib_main_t *vm)
+{
+ return vlib_frame_queue_dequeue_internal (vm);
+}
+
+/*
+ * dpdk_worker_thread - Contains the main loop of a worker thread.
+ *
+ * w
+ * Information for the current thread
+ * io_name
+ *   The name of the thread performing dpdk device IO (if any). If there are no
+ * instances of that thread, then the current thread will do dpdk device
+ * polling. Ports will be divided among instances of the current thread.
+ * callback
+ * If not null, this function will be called once during each main loop.
+ */
+static_always_inline void
+dpdk_worker_thread_internal (vlib_main_t *vm,
+ dpdk_worker_thread_callback_t callback,
+ int have_io_threads)
+{
+ vlib_node_main_t * nm = &vm->node_main;
+ u64 cpu_time_now = clib_cpu_time_now ();
+
+ while (1)
+ {
+ vlib_worker_thread_barrier_check ();
+
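+      /* Drain any frames handed off to this worker before dispatching
+         input nodes */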
+ vlib_frame_queue_dequeue_internal (vm);
+
+ /* Invoke callback if supplied */
+ if (PREDICT_FALSE(callback != NULL))
+ callback(vm);
+
+ if (!have_io_threads)
+ {
+ vlib_node_runtime_t * n;
+ vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_INPUT])
+ {
+ cpu_time_now = dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT,
+ VLIB_NODE_STATE_POLLING, /* frame */ 0,
+ cpu_time_now);
+ }
+
+ }
+
+ if (_vec_len (nm->pending_frames))
+ {
+ int i;
+ cpu_time_now = clib_cpu_time_now ();
+ for (i = 0; i < _vec_len (nm->pending_frames); i++) {
+ vlib_pending_frame_t *p;
+
+ p = nm->pending_frames + i;
+
+ cpu_time_now = dispatch_pending_node (vm, p, cpu_time_now);
+ }
+ _vec_len (nm->pending_frames) = 0;
+ }
+ vlib_increment_main_loop_counter (vm);
+
+ /* Record time stamp in case there are no enabled nodes and above
+ calls do not update time stamp. */
+ cpu_time_now = clib_cpu_time_now ();
+ }
+}
+
+void dpdk_worker_thread (vlib_worker_thread_t * w,
+ char *io_name,
+ dpdk_worker_thread_callback_t callback)
+{
+ vlib_main_t *vm;
+ uword * p;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ vlib_thread_registration_t * tr;
+ dpdk_main_t * dm = &dpdk_main;
+
+ vm = vlib_get_main();
+
+ ASSERT(vm->cpu_index == os_get_cpu_number());
+
+ clib_time_init (&vm->clib_time);
+ clib_mem_set_heap (w->thread_mheap);
+
+ /* Wait until the dpdk init sequence is complete */
+ while (dm->io_thread_release == 0)
+ vlib_worker_thread_barrier_check ();
+
+ /* any I/O threads? */
+ p = hash_get_mem (tm->thread_registrations_by_name, io_name);
+ tr = (vlib_thread_registration_t *)p[0];
+
+ if (tr && tr->count > 0)
+ dpdk_worker_thread_internal(vm, callback, /* have_io_threads */ 1);
+ else
+ dpdk_worker_thread_internal(vm, callback, /* have_io_threads */ 0);
+}
+
+void dpdk_worker_thread_fn (void * arg)
+{
+ vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg;
+ vlib_worker_thread_init (w);
+ dpdk_worker_thread (w, "io", 0);
+}
+
+#if VIRL == 0
+VLIB_REGISTER_THREAD (worker_thread_reg, static) = {
+ .name = "workers",
+ .short_name = "wk",
+ .function = dpdk_worker_thread_fn,
+ .mheap_size = 256<<20,
+};
+#endif
+
+void dpdk_io_thread_fn (void * arg)
+{
+ vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg;
+ vlib_worker_thread_init (w);
+ dpdk_io_thread (w, 0, 0, "workers", 0);
+}
+
+#if VIRL == 0
+VLIB_REGISTER_THREAD (io_thread_reg, static) = {
+ .name = "io",
+ .short_name = "io",
+ .function = dpdk_io_thread_fn,
+ .mheap_size = 256<<20,
+};
+#endif
+
+static void vl_api_rpc_call_t_handler (vl_api_rpc_call_t * mp)
+{
+ vl_api_rpc_reply_t * rmp;
+ int (*fp)(void *);
+ i32 rv = 0;
+ vlib_main_t * vm = vlib_get_main();
+
+ if (mp->function == 0)
+ {
+ rv = -1;
+ clib_warning ("rpc NULL function pointer");
+ }
+
+ else
+ {
+ if (mp->need_barrier_sync)
+ vlib_worker_thread_barrier_sync (vm);
+
+ fp = (void *)(mp->function);
+ rv = (*fp)(mp->data);
+
+ if (mp->need_barrier_sync)
+ vlib_worker_thread_barrier_release (vm);
+ }
+
+ if (mp->send_reply)
+ {
+ unix_shared_memory_queue_t * q =
+ vl_api_client_index_to_input_queue (mp->client_index);
+ if (q)
+ {
+ rmp = vl_msg_api_alloc_as_if_client (sizeof (*rmp));
+ rmp->_vl_msg_id = ntohs (VL_API_RPC_REPLY);
+ rmp->context = mp->context;
+ rmp->retval = rv;
+ vl_msg_api_send_shmem (q, (u8 *)&rmp);
+ }
+ }
+ if (mp->multicast)
+ {
+ clib_warning ("multicast not yet implemented...");
+ }
+}
+
+static void vl_api_rpc_reply_t_handler (vl_api_rpc_reply_t * mp)
+{ clib_warning ("unimplemented"); }
+
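+/*
+ * Ask the main thread to call fp(data): the request is sent over the
+ * shared-memory API queue and executed in the main thread's context
+ * (under a worker barrier, since need_barrier_sync is set).
+ */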
+void vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length)
+{
+ vl_api_rpc_call_t * mp;
+ api_main_t *am = &api_main;
+ vl_shmem_hdr_t *shmem_hdr = am->shmem_hdr;
+
+ mp = vl_msg_api_alloc_as_if_client (sizeof (*mp) + data_length);
+ memset (mp, 0, sizeof (*mp));
+ memcpy (mp->data, data, data_length);
+ mp->_vl_msg_id = ntohs (VL_API_RPC_CALL);
+ mp->function = (u64)fp;
+ mp->need_barrier_sync = 1;
+
+ /* Use the "normal" control-plane mechanism for the main thread */
+ vl_msg_api_send_shmem (shmem_hdr->vl_input_queue, (u8 *)&mp);
+}
+
+
+#define foreach_rpc_api_msg \
+_(RPC_CALL,rpc_call) \
+_(RPC_REPLY,rpc_reply)
+
+static clib_error_t *
+rpc_api_hookup (vlib_main_t *vm)
+{
+#define _(N,n) \
+ vl_msg_api_set_handlers(VL_API_##N, #n, \
+ vl_api_##n##_t_handler, \
+ vl_noop_handler, \
+ vl_noop_handler, \
+ vl_api_##n##_t_print, \
+ sizeof(vl_api_##n##_t), 0 /* do not trace */);
+ foreach_rpc_api_msg;
+#undef _
+ return 0;
+}
+
+VLIB_API_INIT_FUNCTION(rpc_api_hookup);
diff --git a/vnet/vnet/devices/dpdk/threads.h b/vnet/vnet/devices/dpdk/threads.h
new file mode 100644
index 00000000000..8f0fcbdb465
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/threads.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_dpdk_threads_h__
+#define __included_dpdk_threads_h__
+
+#include <vnet/vnet.h>
+
+void vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length);
+
+typedef void (*dpdk_worker_thread_callback_t) (vlib_main_t *vm);
+
+void dpdk_worker_thread (vlib_worker_thread_t * w,
+ char *io_name,
+ dpdk_worker_thread_callback_t callback);
+
+int dpdk_frame_queue_dequeue (vlib_main_t *vm);
+
+#endif /* __included_dpdk_threads_h__ */
diff --git a/vnet/vnet/devices/dpdk/vhost_user.c b/vnet/vnet/devices/dpdk/vhost_user.c
new file mode 100644
index 00000000000..5ab4c22ed3e
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/vhost_user.c
@@ -0,0 +1,1550 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/stat.h>
+#include <sys/vfs.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include <vnet/devices/virtio/vhost-user.h>
+
+#define VHOST_USER_DEBUG_SOCKET 0
+
+#if VHOST_USER_DEBUG_SOCKET == 1
+#define DBG_SOCK(args...) clib_warning(args);
+#else
+#define DBG_SOCK(args...)
+#endif
+
+/*
+ * DPDK vhost-user functions
+ */
+
+/* portions taken from dpdk
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
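+/*
+ * Translate a guest (QEMU process) virtual address into an address valid
+ * in this process, by locating the memory region that contains it and
+ * applying that region's offset.
+ */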
+static uint64_t
+qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
+{
+ struct virtio_memory_regions *region;
+ uint64_t vhost_va = 0;
+ uint32_t regionidx = 0;
+
+ /* Find the region where the address lives. */
+ for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
+ region = &dev->mem->regions[regionidx];
+ if ((qemu_va >= region->userspace_address) &&
+ (qemu_va <= region->userspace_address +
+ region->memory_size)) {
+ vhost_va = qemu_va + region->guest_phys_address +
+ region->address_offset -
+ region->userspace_address;
+ break;
+ }
+ }
+ return vhost_va;
+}
+
+static dpdk_device_t *
+dpdk_vhost_user_device_from_hw_if_index(u32 hw_if_index)
+{
+ vnet_main_t *vnm = vnet_get_main();
+ dpdk_main_t * dm = &dpdk_main;
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, hw_if_index);
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+
+ if (xd->dev_type != VNET_DPDK_DEV_VHOST_USER)
+ return 0;
+
+ return xd;
+}
+
+static dpdk_device_t *
+dpdk_vhost_user_device_from_sw_if_index(u32 sw_if_index)
+{
+ vnet_main_t *vnm = vnet_get_main();
+ vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, sw_if_index);
+ ASSERT (sw->type == VNET_SW_INTERFACE_TYPE_HARDWARE);
+
+ return dpdk_vhost_user_device_from_hw_if_index(sw->hw_if_index);
+}
+
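+/* Map a guest physical address to the corresponding address in this
+ * process's mmap()ed view of guest memory; returns 0 if no region matches. */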
+static inline void * map_guest_mem(dpdk_device_t * xd, u64 addr)
+{
+ dpdk_vu_intf_t * vui = xd->vu_intf;
+ struct virtio_memory * mem = xd->vu_vhost_dev.mem;
+ int i;
+ for (i=0; i<mem->nregions; i++) {
+ if ((mem->regions[i].guest_phys_address <= addr) &&
+ ((mem->regions[i].guest_phys_address + mem->regions[i].memory_size) > addr)) {
+ return (void *) (vui->region_addr[i] + addr - mem->regions[i].guest_phys_address);
+ }
+ }
+ DBG_SOCK("failed to map guest mem addr %llx", addr);
+ return 0;
+}
+
+static clib_error_t *
+dpdk_create_vhost_user_if_internal (u32 * hw_if_index, u32 if_id)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ vlib_main_t * vm = vlib_get_main();
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ vnet_sw_interface_t * sw;
+ clib_error_t * error;
+ dpdk_device_and_queue_t * dq;
+
+ dpdk_device_t * xd = NULL;
+ u8 addr[6];
+ int j;
+
+ vlib_worker_thread_barrier_sync (vm);
+
+ int inactive_cnt = vec_len(dm->vu_inactive_interfaces_device_index);
+ // if there are any inactive ifaces
+ if (inactive_cnt > 0) {
+ // take last
+ u32 vui_idx = dm->vu_inactive_interfaces_device_index[inactive_cnt - 1];
+ if (vec_len(dm->devices) > vui_idx) {
+ xd = vec_elt_at_index (dm->devices, vui_idx);
+ if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER) {
+ DBG_SOCK("reusing inactive vhost-user interface sw_if_index %d", xd->vlib_sw_if_index);
+ } else {
+ clib_warning("error: inactive vhost-user interface sw_if_index %d not VHOST_USER type!",
+ xd->vlib_sw_if_index);
+ // reset so new interface is created
+ xd = NULL;
+ }
+ }
+ // "remove" from inactive list
+ _vec_len(dm->vu_inactive_interfaces_device_index) -= 1;
+ }
+
+ if (xd) {
+ // existing interface used - do not overwrite if_id if not needed
+ if (if_id != (u32)~0)
+ xd->vu_if_id = if_id;
+
+ // reset virtqueues
+ for (j = 0; j < VIRTIO_QNUM; j++)
+ {
+ memset(xd->vu_vhost_dev.virtqueue[j], 0, sizeof(struct vhost_virtqueue));
+ }
+ // reset lockp
+ memset ((void *) xd->lockp, 0, CLIB_CACHE_LINE_BYTES);
+
+ // reset tx vectors
+ for (j = 0; j < tm->n_vlib_mains; j++)
+ {
+ vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE,
+ sizeof(tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->tx_vectors[j]);
+ }
+
+ // reset rx vector
+ for (j = 0; j < xd->rx_q_used; j++)
+ {
+ vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE-1,
+ CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->rx_vectors[j]);
+ }
+ } else {
+ // vui was not retrieved from inactive ifaces - create new
+ vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES);
+ xd->dev_type = VNET_DPDK_DEV_VHOST_USER;
+ xd->rx_q_used = 1;
+ vec_validate_aligned (xd->rx_vectors, xd->rx_q_used, CLIB_CACHE_LINE_BYTES);
+
+ if (if_id == (u32)~0)
+ xd->vu_if_id = dm->next_vu_if_id++;
+ else
+ xd->vu_if_id = if_id;
+
+ xd->device_index = xd - dm->devices;
+ xd->per_interface_next_index = ~0;
+ xd->vu_intf = NULL;
+
+ xd->vu_vhost_dev.mem = clib_mem_alloc (sizeof(struct virtio_memory) +
+ VHOST_MEMORY_MAX_NREGIONS *
+ sizeof(struct virtio_memory_regions));
+
+ for (j = 0; j < VIRTIO_QNUM; j++)
+ {
+ xd->vu_vhost_dev.virtqueue[j] = clib_mem_alloc (sizeof(struct vhost_virtqueue));
+ memset(xd->vu_vhost_dev.virtqueue[j], 0, sizeof(struct vhost_virtqueue));
+ }
+
+ xd->lockp = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
+ CLIB_CACHE_LINE_BYTES);
+ memset ((void *) xd->lockp, 0, CLIB_CACHE_LINE_BYTES);
+
+ vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+
+ for (j = 0; j < tm->n_vlib_mains; j++)
+ {
+ vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE,
+ sizeof(tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->tx_vectors[j]);
+ }
+
+ // reset rx vector
+ for (j = 0; j < xd->rx_q_used; j++)
+ {
+ vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE-1,
+ CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->rx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->frames, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+
+ }
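+  /* Generate a random, locally administered MAC address
+     (02:fe:xx:xx:xx:xx) for the interface */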
+ {
+ f64 now = vlib_time_now(vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+
+ memcpy (addr+2, &rnd, sizeof(rnd));
+ addr[0] = 2;
+ addr[1] = 0xfe;
+ }
+
+ error = ethernet_register_interface
+ (dm->vnet_main,
+ dpdk_device_class.index,
+ xd->device_index,
+ /* ethernet address */ addr,
+ &xd->vlib_hw_if_index,
+ 0);
+
+ if (error)
+ return error;
+
+ sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+ xd->vlib_sw_if_index = sw->sw_if_index;
+
+ if (!xd->vu_intf)
+ xd->vu_intf = clib_mem_alloc (sizeof(*(xd->vu_intf)));
+
+ *hw_if_index = xd->vlib_hw_if_index;
+
+ int cpu = (xd->device_index % dm->input_cpu_count) +
+ dm->input_cpu_first_index;
+
+ vec_add2(dm->devices_by_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = 0;
+
+ // start polling if it was not started yet (because of no phys ifaces)
+ if (tm->n_vlib_mains == 1 && dpdk_input_node.state != VLIB_NODE_STATE_POLLING)
+ vlib_node_set_state (vm, dpdk_input_node.index, VLIB_NODE_STATE_POLLING);
+
+ if (tm->n_vlib_mains > 1 && tm->main_thread_is_io_node)
+ vlib_node_set_state (vm, dpdk_io_input_node.index, VLIB_NODE_STATE_POLLING);
+
+ if (tm->n_vlib_mains > 1 && !tm->main_thread_is_io_node)
+ vlib_node_set_state (vlib_mains[cpu], dpdk_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+
+ vlib_worker_thread_barrier_release (vm);
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_get_features(u32 hw_if_index, u64 * features)
+{
+ *features = rte_vhost_feature_get();
+
+  DBG_SOCK("supported features: 0x%llx", *features);
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_set_features(u32 hw_if_index, u64 features)
+{
+ dpdk_device_t * xd;
+ u16 hdr_len = sizeof(struct virtio_net_hdr);
+
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+
+ xd->vu_vhost_dev.features = features;
+
+ if (xd->vu_vhost_dev.features & (1 << VIRTIO_NET_F_MRG_RXBUF))
+ hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+
+ xd->vu_vhost_dev.virtqueue[VIRTIO_RXQ]->vhost_hlen = hdr_len;
+ xd->vu_vhost_dev.virtqueue[VIRTIO_TXQ]->vhost_hlen = hdr_len;
+
+ xd->vu_is_running = 0;
+
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_set_mem_table(u32 hw_if_index, vhost_user_memory_t * vum, int fd[])
+{
+ struct virtio_memory * mem;
+ int i;
+ dpdk_device_t * xd;
+ dpdk_vu_intf_t * vui;
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+
+ vui = xd->vu_intf;
+ mem = xd->vu_vhost_dev.mem;
+
+ mem->nregions = vum->nregions;
+
+ for (i=0; i < mem->nregions; i++) {
+ u64 mapped_size, mapped_address;
+
+ mem->regions[i].guest_phys_address = vum->regions[i].guest_phys_addr;
+ mem->regions[i].guest_phys_address_end = vum->regions[i].guest_phys_addr +
+ vum->regions[i].memory_size;
+ mem->regions[i].memory_size = vum->regions[i].memory_size;
+ mem->regions[i].userspace_address = vum->regions[i].userspace_addr;
+
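+    /* mmap() the region's backing fd from offset 0 (size + mmap_offset
+       bytes); the announced mmap_offset is then added to get the usable
+       base address */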
+ mapped_size = mem->regions[i].memory_size + vum->regions[i].mmap_offset;
+ mapped_address = (uint64_t)(uintptr_t)mmap(NULL, mapped_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd[i], 0);
+
+ if ((void *)mapped_address == MAP_FAILED)
+ {
+ clib_warning("mmap error");
+ return 0;
+ }
+
+ mapped_address += vum->regions[i].mmap_offset;
+ vui->region_addr[i] = mapped_address;
+ vui->region_fd[i] = fd[i];
+ mem->regions[i].address_offset = mapped_address - mem->regions[i].guest_phys_address;
+
+ if (vum->regions[i].guest_phys_addr == 0) {
+ mem->base_address = vum->regions[i].userspace_addr;
+ mem->mapped_address = mem->regions[i].address_offset;
+ }
+ }
+
+ xd->vu_is_running = 0;
+
+ DBG_SOCK("done");
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_set_vring_num(u32 hw_if_index, u8 idx, u32 num)
+{
+ dpdk_device_t * xd;
+ struct vhost_virtqueue *vq;
+
+ DBG_SOCK("idx %u num %u", idx, num);
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+ vq = xd->vu_vhost_dev.virtqueue[idx];
+ vq->size = num;
+
+ xd->vu_is_running = 0;
+
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_set_vring_addr(u32 hw_if_index, u8 idx, u64 desc, u64 used, u64 avail)
+{
+ dpdk_device_t * xd;
+ struct vhost_virtqueue *vq;
+
+  DBG_SOCK("idx %u desc 0x%llx used 0x%llx avail 0x%llx", idx, desc, used, avail);
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+ vq = xd->vu_vhost_dev.virtqueue[idx];
+
+ vq->desc = (struct vring_desc *) qva_to_vva(&xd->vu_vhost_dev, desc);
+ vq->used = (struct vring_used *) qva_to_vva(&xd->vu_vhost_dev, used);
+ vq->avail = (struct vring_avail *) qva_to_vva(&xd->vu_vhost_dev, avail);
+
+ if (!(vq->desc && vq->used && vq->avail)) {
+    clib_warning("failed to set vring addr");
+ }
+
+ xd->vu_is_running = 0;
+
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_get_vring_base(u32 hw_if_index, u8 idx, u32 * num)
+{
+ dpdk_device_t * xd;
+ struct vhost_virtqueue *vq;
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+
+ vq = xd->vu_vhost_dev.virtqueue[idx];
+ *num = vq->last_used_idx;
+
+ DBG_SOCK("idx %u num %u", idx, *num);
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_set_vring_base(u32 hw_if_index, u8 idx, u32 num)
+{
+ dpdk_device_t * xd;
+ struct vhost_virtqueue *vq;
+
+ DBG_SOCK("idx %u num %u", idx, num);
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+
+ vq = xd->vu_vhost_dev.virtqueue[idx];
+ vq->last_used_idx = num;
+ vq->last_used_idx_res = num;
+
+ xd->vu_is_running = 0;
+
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_set_vring_kick(u32 hw_if_index, u8 idx, int fd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ struct vhost_virtqueue *vq, *vq0, *vq1;
+
+ DBG_SOCK("idx %u fd %d", idx, fd);
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+
+ vq = xd->vu_vhost_dev.virtqueue[idx];
+ vq->kickfd = fd;
+
+ vq0 = xd->vu_vhost_dev.virtqueue[0];
+ vq1 = xd->vu_vhost_dev.virtqueue[1];
+
+ if (vq0->desc && vq0->avail && vq0->used &&
+ vq1->desc && vq1->avail && vq1->used) {
+ xd->vu_is_running = 1;
+ if (xd->admin_up)
+ vnet_hw_interface_set_flags (dm->vnet_main, xd->vlib_hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP |
+ ETH_LINK_FULL_DUPLEX );
+ }
+
+ return 0;
+}
+
+
+static clib_error_t *
+dpdk_vhost_user_set_vring_call(u32 hw_if_index, u8 idx, int fd)
+{
+ dpdk_device_t * xd;
+ struct vhost_virtqueue *vq;
+
+ DBG_SOCK("idx %u fd %d", idx, fd);
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+
+ vq = xd->vu_vhost_dev.virtqueue[idx];
+ /* reset callfd to force no interrupts */
+ vq->callfd = -1;
+
+ return 0;
+}
+
+u8
+dpdk_vhost_user_want_interrupt(dpdk_device_t *xd, int idx)
+{
+ dpdk_vu_intf_t *vui = xd->vu_intf;
+ ASSERT(vui != NULL);
+
+ if (PREDICT_FALSE(vui->num_vrings <= 0))
+ return 0;
+
+ dpdk_vu_vring *vring = &(vui->vrings[idx]);
+ struct vhost_virtqueue *vq = xd->vu_vhost_dev.virtqueue[idx];
+
+  /* return 1 if the guest (VM) wants an interrupt for this vring */
+ return (vring->callfd > 0) && !(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT);
+}
+
+void
+dpdk_vhost_user_send_interrupt(vlib_main_t * vm, dpdk_device_t * xd, int idx)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_vu_intf_t *vui = xd->vu_intf;
+ ASSERT(vui != NULL);
+
+ if (PREDICT_FALSE(vui->num_vrings <= 0))
+ return;
+
+ dpdk_vu_vring *vring = &(vui->vrings[idx]);
+ struct vhost_virtqueue *vq = xd->vu_vhost_dev.virtqueue[idx];
+
+ /* if vm is interested in interrupts */
+ if((vring->callfd > 0) && !(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
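+    /* an 8-byte write to the callfd eventfd injects an interrupt into
+       the guest */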
+ u64 x = 1;
+ int rv __attribute__((unused));
+ /* $$$$ pay attention to rv */
+ rv = write(vring->callfd, &x, sizeof(x));
+ vring->n_since_last_int = 0;
+ vring->int_deadline = vlib_time_now(vm) + dm->vhost_coalesce_time;
+ }
+}
+
+/*
+ * vhost-user interface management functions
+ */
+
+// initialize vui with specified attributes
+static void
+dpdk_vhost_user_vui_init(vnet_main_t * vnm,
+ dpdk_device_t *xd, int sockfd,
+ const char * sock_filename,
+ u8 is_server, u64 feature_mask,
+ u32 * sw_if_index)
+{
+ dpdk_vu_intf_t *vui = xd->vu_intf;
+ memset(vui, 0, sizeof(*vui));
+
+ vui->unix_fd = sockfd;
+ vui->num_vrings = 2;
+ vui->sock_is_server = is_server;
+ strncpy(vui->sock_filename, sock_filename, ARRAY_LEN(vui->sock_filename)-1);
+ vui->sock_errno = 0;
+ vui->is_up = 0;
+ vui->feature_mask = feature_mask;
+ vui->active = 1;
+ vui->unix_file_index = ~0;
+
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
+
+ if (sw_if_index)
+ *sw_if_index = xd->vlib_sw_if_index;
+}
+
+// register vui and start polling on it
+static void
+dpdk_vhost_user_vui_register(vlib_main_t * vm, dpdk_device_t *xd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_vu_intf_t *vui = xd->vu_intf;
+
+ hash_set (dm->vu_sw_if_index_by_listener_fd, vui->unix_fd,
+ xd->vlib_sw_if_index);
+}
+
+static inline void
+dpdk_vhost_user_if_disconnect(dpdk_device_t * xd)
+{
+ dpdk_vu_intf_t *vui = xd->vu_intf;
+ vnet_main_t * vnm = vnet_get_main();
+ dpdk_main_t * dm = &dpdk_main;
+
+ xd->admin_up = 0;
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
+
+ if (vui->unix_file_index != ~0) {
+ unix_file_del (&unix_main, unix_main.file_pool + vui->unix_file_index);
+ vui->unix_file_index = ~0;
+ }
+
+ hash_unset(dm->vu_sw_if_index_by_sock_fd, vui->unix_fd);
+ hash_unset(dm->vu_sw_if_index_by_listener_fd, vui->unix_fd);
+ close(vui->unix_fd);
+ vui->unix_fd = -1;
+ vui->is_up = 0;
+
+ DBG_SOCK("interface ifindex %d disconnected", xd->vlib_sw_if_index);
+}
+
+static clib_error_t * dpdk_vhost_user_callfd_read_ready (unix_file_t * uf)
+{
+ __attribute__((unused)) int n;
+ u8 buff[8];
+ n = read(uf->file_descriptor, ((char*)&buff), 8);
+ return 0;
+}
+
+static clib_error_t * dpdk_vhost_user_socket_read (unix_file_t * uf)
+{
+ int n;
+ int fd, number_of_fds = 0;
+ int fds[VHOST_MEMORY_MAX_NREGIONS];
+ vhost_user_msg_t msg;
+ struct msghdr mh;
+ struct iovec iov[1];
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t *xd;
+ dpdk_vu_intf_t *vui;
+ struct cmsghdr *cmsg;
+ uword * p;
+ u8 q;
+ unix_file_t template = {0};
+ vnet_main_t * vnm = vnet_get_main();
+
+ p = hash_get (dm->vu_sw_if_index_by_sock_fd, uf->file_descriptor);
+ if (p == 0) {
+ DBG_SOCK ("FD %d doesn't belong to any interface",
+ uf->file_descriptor);
+ return 0;
+ }
+ else
+ xd = dpdk_vhost_user_device_from_sw_if_index(p[0]);
+
+ ASSERT(xd != NULL);
+ vui = xd->vu_intf;
+
+ char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))];
+
+ memset(&mh, 0, sizeof(mh));
+ memset(control, 0, sizeof(control));
+
+ /* set the payload */
+ iov[0].iov_base = (void *) &msg;
+ iov[0].iov_len = VHOST_USER_MSG_HDR_SZ;
+
+ mh.msg_iov = iov;
+ mh.msg_iovlen = 1;
+ mh.msg_control = control;
+ mh.msg_controllen = sizeof(control);
+
+ n = recvmsg(uf->file_descriptor, &mh, 0);
+
+ if (n != VHOST_USER_MSG_HDR_SZ)
+ goto close_socket;
+
+ if (mh.msg_flags & MSG_CTRUNC) {
+ goto close_socket;
+ }
+
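+  /* Any file descriptors (memory region fds, kick/call eventfds) arrive as
+     SCM_RIGHTS ancillary data attached to the message header */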
+ cmsg = CMSG_FIRSTHDR(&mh);
+
+ if (cmsg && (cmsg->cmsg_len > 0) && (cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS) &&
+ (cmsg->cmsg_len - CMSG_LEN(0) <= VHOST_MEMORY_MAX_NREGIONS * sizeof(int))) {
+ number_of_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+ memcpy(fds, CMSG_DATA(cmsg), number_of_fds * sizeof(int));
+ }
+
+ /* version 1, no reply bit set*/
+  /* version 1, no reply bit set */
+ DBG_SOCK("malformed message received. closing socket");
+ goto close_socket;
+ }
+
+ {
+ int rv __attribute__((unused));
+ /* $$$$ pay attention to rv */
+ rv = read(uf->file_descriptor, ((char*)&msg) + n, msg.size);
+ }
+
+ switch (msg.request) {
+ case VHOST_USER_GET_FEATURES:
+ DBG_SOCK("if %d msg VHOST_USER_GET_FEATURES",
+ xd->vlib_hw_if_index);
+
+ msg.flags |= 4;
+
+ dpdk_vhost_user_get_features(xd->vlib_hw_if_index, &msg.u64);
+ msg.u64 &= vui->feature_mask;
+ msg.size = sizeof(msg.u64);
+ break;
+
+ case VHOST_USER_SET_FEATURES:
+ DBG_SOCK("if %d msg VHOST_USER_SET_FEATURES features 0x%016llx",
+ xd->vlib_hw_if_index, msg.u64);
+
+ dpdk_vhost_user_set_features(xd->vlib_hw_if_index, msg.u64);
+ break;
+
+ case VHOST_USER_SET_MEM_TABLE:
+ DBG_SOCK("if %d msg VHOST_USER_SET_MEM_TABLE nregions %d",
+ xd->vlib_hw_if_index, msg.memory.nregions);
+
+ if ((msg.memory.nregions < 1) ||
+ (msg.memory.nregions > VHOST_MEMORY_MAX_NREGIONS)) {
+
+ DBG_SOCK("number of mem regions must be between 1 and %i",
+ VHOST_MEMORY_MAX_NREGIONS);
+
+ goto close_socket;
+ }
+
+ if (msg.memory.nregions != number_of_fds) {
+      DBG_SOCK("each memory region must have an fd");
+ goto close_socket;
+ }
+
+ dpdk_vhost_user_set_mem_table(xd->vlib_hw_if_index, &msg.memory, fds);
+ break;
+
+ case VHOST_USER_SET_VRING_NUM:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_NUM idx %d num %d",
+ xd->vlib_hw_if_index, msg.state.index, msg.state.num);
+
+    if ((msg.state.num > 32768) ||               /* maximum ring size is 32768 */
+        (msg.state.num == 0) ||                  /* it cannot be zero */
+        (msg.state.num & (msg.state.num - 1)))   /* must be a power of 2 */
+ goto close_socket;
+
+ dpdk_vhost_user_set_vring_num(xd->vlib_hw_if_index, msg.state.index, msg.state.num);
+ break;
+
+ case VHOST_USER_SET_VRING_ADDR:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_ADDR idx %d",
+ xd->vlib_hw_if_index, msg.state.index);
+
+ dpdk_vhost_user_set_vring_addr(xd->vlib_hw_if_index, msg.state.index,
+ msg.addr.desc_user_addr,
+ msg.addr.used_user_addr,
+ msg.addr.avail_user_addr);
+ break;
+
+ case VHOST_USER_SET_OWNER:
+ DBG_SOCK("if %d msg VHOST_USER_SET_OWNER",
+ xd->vlib_hw_if_index);
+ break;
+
+ case VHOST_USER_RESET_OWNER:
+ DBG_SOCK("if %d msg VHOST_USER_RESET_OWNER",
+ xd->vlib_hw_if_index);
+ break;
+
+ case VHOST_USER_SET_VRING_CALL:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_CALL u64 %d",
+ xd->vlib_hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ goto close_socket;
+
+ /* if there is old fd, delete it */
+ if (vui->vrings[q].callfd) {
+ unix_file_t * uf = pool_elt_at_index (unix_main.file_pool,
+ vui->vrings[q].callfd_idx);
+ unix_file_del (&unix_main, uf);
+ }
+ vui->vrings[q].callfd = fds[0];
+ template.read_function = dpdk_vhost_user_callfd_read_ready;
+ template.file_descriptor = fds[0];
+ vui->vrings[q].callfd_idx = unix_file_add (&unix_main, &template);
+ }
+ else
+ vui->vrings[q].callfd = -1;
+
+ dpdk_vhost_user_set_vring_call(xd->vlib_hw_if_index, q, vui->vrings[q].callfd);
+ break;
+
+ case VHOST_USER_SET_VRING_KICK:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_KICK u64 %d",
+ xd->vlib_hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ goto close_socket;
+
+ vui->vrings[q].kickfd = fds[0];
+ }
+ else
+ vui->vrings[q].kickfd = -1;
+
+ dpdk_vhost_user_set_vring_kick(xd->vlib_hw_if_index, q, vui->vrings[q].kickfd);
+ break;
+
+ case VHOST_USER_SET_VRING_ERR:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_ERR u64 %d",
+ xd->vlib_hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ goto close_socket;
+
+ fd = fds[0];
+ }
+ else
+ fd = -1;
+
+ vui->vrings[q].errfd = fd;
+ break;
+
+ case VHOST_USER_SET_VRING_BASE:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d",
+ xd->vlib_hw_if_index, msg.state.index, msg.state.num);
+
+ dpdk_vhost_user_set_vring_base(xd->vlib_hw_if_index, msg.state.index, msg.state.num);
+ break;
+
+ case VHOST_USER_GET_VRING_BASE:
+ DBG_SOCK("if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d",
+ xd->vlib_hw_if_index, msg.state.index, msg.state.num);
+
+ msg.flags |= 4;
+ msg.size = sizeof(msg.state);
+
+ dpdk_vhost_user_get_vring_base(xd->vlib_hw_if_index, msg.state.index, &msg.state.num);
+ break;
+
+ case VHOST_USER_NONE:
+ DBG_SOCK("if %d msg VHOST_USER_NONE",
+ xd->vlib_hw_if_index);
+ break;
+
+ case VHOST_USER_SET_LOG_BASE:
+ DBG_SOCK("if %d msg VHOST_USER_SET_LOG_BASE",
+ xd->vlib_hw_if_index);
+ break;
+
+ case VHOST_USER_SET_LOG_FD:
+ DBG_SOCK("if %d msg VHOST_USER_SET_LOG_FD",
+ xd->vlib_hw_if_index);
+ break;
+
+ default:
+ DBG_SOCK("unknown vhost-user message %d received. closing socket",
+ msg.request);
+ goto close_socket;
+ }
+
+  /* once we have pointers to the descriptor tables, bring the interface up */
+ if (!vui->is_up &&
+ xd->vu_vhost_dev.virtqueue[VHOST_NET_VRING_IDX_TX]->desc &&
+ xd->vu_vhost_dev.virtqueue[VHOST_NET_VRING_IDX_RX]->desc) {
+
+ DBG_SOCK("interface %d connected", xd->vlib_sw_if_index);
+
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, VNET_HW_INTERFACE_FLAG_LINK_UP);
+ vui->is_up = 1;
+ }
+
+ /* if we need to reply */
+ if (msg.flags & 4)
+ {
+ n = send(uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0);
+ if (n != (msg.size + VHOST_USER_MSG_HDR_SZ))
+ goto close_socket;
+ }
+
+ return 0;
+
+close_socket:
+ DBG_SOCK("error: close_socket");
+ dpdk_vhost_user_if_disconnect(xd);
+ return 0;
+}
+
+static clib_error_t * dpdk_vhost_user_socket_error (unix_file_t * uf)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t *xd;
+ uword * p;
+
+ p = hash_get (dm->vu_sw_if_index_by_sock_fd, uf->file_descriptor);
+ if (p == 0) {
+ DBG_SOCK ("FD %d doesn't belong to any interface",
+ uf->file_descriptor);
+ return 0;
+ }
+ else
+ xd = dpdk_vhost_user_device_from_sw_if_index(p[0]);
+
+ dpdk_vhost_user_if_disconnect(xd);
+ return 0;
+}
+
+static clib_error_t * dpdk_vhost_user_socksvr_accept_ready (unix_file_t * uf)
+{
+ int client_fd, client_len;
+ struct sockaddr_un client;
+ unix_file_t template = {0};
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = NULL;
+ dpdk_vu_intf_t * vui;
+ uword * p;
+
+ p = hash_get (dm->vu_sw_if_index_by_listener_fd,
+ uf->file_descriptor);
+ if (p == 0) {
+ DBG_SOCK ("fd %d doesn't belong to any interface",
+ uf->file_descriptor);
+ return 0;
+ }
+
+ xd = dpdk_vhost_user_device_from_sw_if_index(p[0]);
+ ASSERT(xd != NULL);
+ vui = xd->vu_intf;
+
+ client_len = sizeof(client);
+ client_fd = accept (uf->file_descriptor,
+ (struct sockaddr *)&client,
+ (socklen_t *)&client_len);
+
+ if (client_fd < 0)
+ return clib_error_return_unix (0, "accept");
+
+ template.read_function = dpdk_vhost_user_socket_read;
+ template.error_function = dpdk_vhost_user_socket_error;
+ template.file_descriptor = client_fd;
+ vui->unix_file_index = unix_file_add (&unix_main, &template);
+
+ vui->client_fd = client_fd;
+ hash_set (dm->vu_sw_if_index_by_sock_fd, vui->client_fd,
+ xd->vlib_sw_if_index);
+
+ return 0;
+}
+
+// init server socket on specified sock_filename
+static int dpdk_vhost_user_init_server_sock(const char * sock_filename, int *sockfd)
+{
+ int rv = 0, len;
+ struct sockaddr_un un;
+ int fd;
+ /* create listening socket */
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+
+ if (fd < 0) {
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+ }
+
+ un.sun_family = AF_UNIX;
+ strcpy((char *) un.sun_path, (char *) sock_filename);
+
+ /* remove if exists */
+ unlink( (char *) sock_filename);
+
+  len = sizeof (un.sun_family) + strlen ((char *) sock_filename);
+
+ if (bind(fd, (struct sockaddr *) &un, len) == -1) {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_2;
+ goto error;
+ }
+
+ if (listen(fd, 1) == -1) {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_3;
+ goto error;
+ }
+
+ unix_file_t template = {0};
+ template.read_function = dpdk_vhost_user_socksvr_accept_ready;
+ template.file_descriptor = fd;
+ unix_file_add (&unix_main, &template);
+ *sockfd = fd;
+ return rv;
+
+error:
+ close(fd);
+ return rv;
+}
+
+/*
+ * vhost-user interface control functions used from vpe api
+ */
+
+int dpdk_vhost_user_create_if(vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename,
+ u8 is_server,
+ u32 * sw_if_index,
+ u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t *xd;
+ u32 hw_if_idx = ~0;
+ int sockfd = -1;
+ int rv = 0;
+
+ // using virtio vhost user?
+ if (dm->use_virtio_vhost) {
+ return vhost_user_create_if(vnm, vm, sock_filename, is_server,
+ sw_if_index, feature_mask, renumber, custom_dev_instance);
+ }
+
+ if (is_server) {
+ if ((rv = dpdk_vhost_user_init_server_sock (sock_filename, &sockfd)) != 0) {
+ return rv;
+ }
+ }
+
+ if (renumber) {
+    // bump the next vhost-user interface id if the custom one is higher or equal
+ if (custom_dev_instance >= dm->next_vu_if_id)
+ dm->next_vu_if_id = custom_dev_instance + 1;
+
+ dpdk_create_vhost_user_if_internal(&hw_if_idx, custom_dev_instance);
+ } else
+ dpdk_create_vhost_user_if_internal(&hw_if_idx, (u32)~0);
+ DBG_SOCK("dpdk vhost-user interface created hw_if_index %d", hw_if_idx);
+
+ xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_idx);
+ ASSERT(xd != NULL);
+
+ dpdk_vhost_user_vui_init (vnm, xd, sockfd, sock_filename, is_server,
+ feature_mask, sw_if_index);
+
+ dpdk_vhost_user_vui_register (vm, xd);
+ return rv;
+}
+
+int dpdk_vhost_user_modify_if(vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename,
+ u8 is_server,
+ u32 sw_if_index,
+ u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ dpdk_vu_intf_t * vui = NULL;
+ u32 sw_if_idx = ~0;
+ int sockfd = -1;
+ int rv = 0;
+
+ // using virtio vhost user?
+ if (dm->use_virtio_vhost) {
+ return vhost_user_modify_if(vnm, vm, sock_filename, is_server,
+ sw_if_index, feature_mask, renumber, custom_dev_instance);
+ }
+
+ xd = dpdk_vhost_user_device_from_sw_if_index(sw_if_index);
+
+ if (xd == NULL)
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ vui = xd->vu_intf;
+
+ // interface is inactive
+ vui->active = 0;
+ // disconnect interface sockets
+ dpdk_vhost_user_if_disconnect(xd);
+
+ if (is_server) {
+ if ((rv = dpdk_vhost_user_init_server_sock (sock_filename, &sockfd)) != 0) {
+ return rv;
+ }
+ }
+
+ dpdk_vhost_user_vui_init (vnm, xd, sockfd, sock_filename, is_server,
+ feature_mask, &sw_if_idx);
+
+ if (renumber) {
+ vnet_interface_name_renumber (sw_if_idx, custom_dev_instance);
+ }
+
+ dpdk_vhost_user_vui_register (vm, xd);
+
+ return rv;
+}
+
+int dpdk_vhost_user_delete_if(vnet_main_t * vnm, vlib_main_t * vm,
+ u32 sw_if_index)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = NULL;
+ dpdk_vu_intf_t * vui;
+ int rv = 0;
+
+ // using virtio vhost user?
+ if (dm->use_virtio_vhost) {
+ return vhost_user_delete_if(vnm, vm, sw_if_index);
+ }
+
+ xd = dpdk_vhost_user_device_from_sw_if_index(sw_if_index);
+
+ if (xd == NULL)
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ vui = xd->vu_intf;
+
+ // interface is inactive
+ vui->active = 0;
+ // disconnect interface sockets
+ dpdk_vhost_user_if_disconnect(xd);
+ // add to inactive interface list
+ vec_add1 (dm->vu_inactive_interfaces_device_index, xd->device_index);
+
+ ethernet_delete_interface (vnm, xd->vlib_hw_if_index);
+ DBG_SOCK ("deleted (deactivated) vhost-user interface sw_if_index %d", sw_if_index);
+
+ return rv;
+}
+
+int dpdk_vhost_user_dump_ifs(vnet_main_t * vnm, vlib_main_t * vm, vhost_user_intf_details_t **out_vuids)
+{
+ int rv = 0;
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ dpdk_vu_intf_t * vui;
+ struct virtio_net * vhost_dev;
+ vhost_user_intf_details_t * r_vuids = NULL;
+ vhost_user_intf_details_t * vuid = NULL;
+ u32 * hw_if_indices = 0;
+ vnet_hw_interface_t * hi;
+ u8 *s = NULL;
+ int i;
+
+ if (!out_vuids)
+ return -1;
+
+ // using virtio vhost user?
+ if (dm->use_virtio_vhost) {
+ return vhost_user_dump_ifs(vnm, vm, out_vuids);
+ }
+
+ vec_foreach (xd, dm->devices) {
+ if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER &&
+ xd->vu_intf->active)
+ vec_add1(hw_if_indices, xd->vlib_hw_if_index);
+ }
+
+ for (i = 0; i < vec_len (hw_if_indices); i++) {
+ hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
+ xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_indices[i]);
+ if (!xd) {
+ clib_warning("invalid vhost-user interface hw_if_index %d", hw_if_indices[i]);
+ continue;
+ }
+
+ vui = xd->vu_intf;
+ ASSERT(vui != NULL);
+ vhost_dev = &xd->vu_vhost_dev;
+ u32 virtio_net_hdr_sz = (vui->num_vrings > 0 ?
+ vhost_dev->virtqueue[0]->vhost_hlen : 0);
+
+ vec_add2(r_vuids, vuid, 1);
+ vuid->sw_if_index = xd->vlib_sw_if_index;
+ vuid->virtio_net_hdr_sz = virtio_net_hdr_sz;
+ vuid->features = vhost_dev->features;
+ vuid->is_server = vui->sock_is_server;
+ vuid->num_regions = (vhost_dev->mem != NULL ? vhost_dev->mem->nregions : 0);
+ vuid->sock_errno = vui->sock_errno;
+ strncpy((char *)vuid->sock_filename, (char *)vui->sock_filename,
+ ARRAY_LEN(vuid->sock_filename)-1);
+
+ s = format (s, "%v%c", hi->name, 0);
+
+ strncpy((char *)vuid->if_name, (char *)s,
+ ARRAY_LEN(vuid->if_name)-1);
+ _vec_len(s) = 0;
+ }
+
+ vec_free (s);
+ vec_free (hw_if_indices);
+
+ *out_vuids = r_vuids;
+
+ return rv;
+}
+
+/*
+ * Processing functions called from dpdk process fn
+ */
+
+typedef struct {
+ struct sockaddr_un sun;
+ int sockfd;
+ unix_file_t template;
+ uword *event_data;
+} dpdk_vu_process_state;
+
+void dpdk_vhost_user_process_init (void **ctx)
+{
+ dpdk_vu_process_state *state = clib_mem_alloc (sizeof(dpdk_vu_process_state));
+ memset(state, 0, sizeof(*state));
+ state->sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+ state->sun.sun_family = AF_UNIX;
+ state->template.read_function = dpdk_vhost_user_socket_read;
+ state->template.error_function = dpdk_vhost_user_socket_error;
+ state->event_data = 0;
+ *ctx = state;
+}
+
+void dpdk_vhost_user_process_cleanup (void *ctx)
+{
+ clib_mem_free(ctx);
+}
+
+uword dpdk_vhost_user_process_if (vlib_main_t *vm, dpdk_device_t *xd, void *ctx)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_vu_process_state *state = (dpdk_vu_process_state *)ctx;
+ dpdk_vu_intf_t *vui = xd->vu_intf;
+
+ if (vui->sock_is_server || !vui->active)
+ return 0;
+
+ if (vui->unix_fd == -1) {
+ /* try to connect */
+ strncpy(state->sun.sun_path, (char *) vui->sock_filename, sizeof(state->sun.sun_path) - 1);
+
+ if (connect(state->sockfd, (struct sockaddr *) &(state->sun), sizeof(struct sockaddr_un)) == 0) {
+ vui->sock_errno = 0;
+ vui->unix_fd = state->sockfd;
+ state->template.file_descriptor = state->sockfd;
+ vui->unix_file_index = unix_file_add (&unix_main, &(state->template));
+ hash_set (dm->vu_sw_if_index_by_sock_fd, state->sockfd, xd->vlib_sw_if_index);
+
+ state->sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (state->sockfd < 0)
+ return -1;
+ } else {
+ vui->sock_errno = errno;
+ }
+ } else {
+ /* check if socket is alive */
+ int error = 0;
+ socklen_t len = sizeof (error);
+ int retval = getsockopt(vui->unix_fd, SOL_SOCKET, SO_ERROR, &error, &len);
+
+ if (retval)
+ dpdk_vhost_user_if_disconnect(xd);
+ }
+ return 0;
+}
+
+/*
+ * CLI functions
+ */
+
+static clib_error_t *
+dpdk_vhost_user_connect_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u8 * sock_filename = NULL;
+ u32 sw_if_index;
+ u8 is_server = 0;
+ u64 feature_mask = (u64)~0;
+ u8 renumber = 0;
+ u32 custom_dev_instance = ~0;
+
+ if (dm->use_virtio_vhost) {
+ return vhost_user_connect_command_fn(vm, input, cmd);
+ }
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "socket %s", &sock_filename))
+ ;
+ else if (unformat (line_input, "server"))
+ is_server = 1;
+ else if (unformat (line_input, "feature-mask 0x%llx", &feature_mask))
+ ;
+ else if (unformat (line_input, "renumber %d", &custom_dev_instance)) {
+ renumber = 1;
+ }
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ vnet_main_t *vnm = vnet_get_main();
+ if (sock_filename == NULL)
+ return clib_error_return (0, "missing socket file");
+
+ dpdk_vhost_user_create_if(vnm, vm, (char *)sock_filename,
+ is_server, &sw_if_index, feature_mask,
+ renumber, custom_dev_instance);
+
+ vec_free(sock_filename);
+ return 0;
+}
+
+VLIB_CLI_COMMAND (dpdk_vhost_user_connect_command, static) = {
+ .path = "create vhost-user",
+ .short_help = "create vhost-user socket <socket-filename> [server] [feature-mask <hex>] [renumber <dev_instance>]",
+ .function = dpdk_vhost_user_connect_command_fn,
+};
+
+static clib_error_t *
+dpdk_vhost_user_delete_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ clib_error_t * error = 0;
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u32 sw_if_index = ~0;
+
+ if (dm->use_virtio_vhost) {
+ return vhost_user_delete_command_fn(vm, input, cmd);
+ }
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "sw_if_index %d", &sw_if_index))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ if (sw_if_index == ~0) {
+      error = clib_error_return (0, "invalid sw_if_index");
+ return error;
+ }
+
+ vnet_main_t *vnm = vnet_get_main();
+
+ dpdk_vhost_user_delete_if(vnm, vm, sw_if_index);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (dpdk_vhost_user_delete_command, static) = {
+ .path = "delete vhost-user",
+ .short_help = "delete vhost-user sw_if_index <nn>",
+ .function = dpdk_vhost_user_delete_command_fn,
+};
+
+#define foreach_dpdk_vhost_feature \
+ _ (VIRTIO_NET_F_MRG_RXBUF) \
+ _ (VIRTIO_NET_F_CTRL_VQ) \
+ _ (VIRTIO_NET_F_CTRL_RX)
+
+static clib_error_t *
+show_dpdk_vhost_user_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ clib_error_t * error = 0;
+ dpdk_main_t * dm = &dpdk_main;
+ vnet_main_t * vnm = vnet_get_main();
+ dpdk_device_t * xd;
+ dpdk_vu_intf_t * vui;
+ struct virtio_net * vhost_dev;
+ u32 hw_if_index, * hw_if_indices = 0;
+ vnet_hw_interface_t * hi;
+ int i, j, q;
+ int show_descr = 0;
+ struct virtio_memory * mem;
+ struct feat_struct { u8 bit; char *str;};
+ struct feat_struct *feat_entry;
+
+ static struct feat_struct feat_array[] = {
+#define _(f) { .str = #f, .bit = f, },
+ foreach_dpdk_vhost_feature
+#undef _
+ { .str = NULL }
+ };
+
+ if (dm->use_virtio_vhost) {
+ return show_vhost_user_command_fn(vm, input, cmd);
+ }
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index)) {
+ vec_add1 (hw_if_indices, hw_if_index);
+ vlib_cli_output(vm, "add %d", hw_if_index);
+ }
+ else if (unformat (input, "descriptors") || unformat (input, "desc") )
+ show_descr = 1;
+ else {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ }
+ if (vec_len (hw_if_indices) == 0) {
+ vec_foreach (xd, dm->devices) {
+ if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER && xd->vu_intf->active)
+ vec_add1(hw_if_indices, xd->vlib_hw_if_index);
+ }
+ }
+
+ vlib_cli_output (vm, "DPDK vhost-user interfaces");
+ vlib_cli_output (vm, "Global:\n coalesce frames %d time %e\n\n",
+ dm->vhost_coalesce_frames, dm->vhost_coalesce_time);
+
+ for (i = 0; i < vec_len (hw_if_indices); i++) {
+ hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_indices[i]))) {
+ error = clib_error_return (0, "not dpdk vhost-user interface: '%s'",
+ hi->name);
+ goto done;
+ }
+ vui = xd->vu_intf;
+ vhost_dev = &xd->vu_vhost_dev;
+ mem = vhost_dev->mem;
+ u32 virtio_net_hdr_sz = (vui->num_vrings > 0 ?
+ vhost_dev->virtqueue[0]->vhost_hlen : 0);
+
+ vlib_cli_output (vm, "Interface: %s (ifindex %d)",
+ hi->name, hw_if_indices[i]);
+
+ vlib_cli_output (vm, "virtio_net_hdr_sz %d\n features (0x%llx): \n",
+ virtio_net_hdr_sz, xd->vu_vhost_dev.features);
+
+ feat_entry = (struct feat_struct *) &feat_array;
+ while(feat_entry->str) {
+ if (xd->vu_vhost_dev.features & (1 << feat_entry->bit))
+ vlib_cli_output (vm, " %s (%d)", feat_entry->str, feat_entry->bit);
+ feat_entry++;
+ }
+
+ vlib_cli_output (vm, "\n");
+
+ vlib_cli_output (vm, " socket filename %s type %s errno \"%s\"\n\n",
+ vui->sock_filename, vui->sock_is_server ? "server" : "client",
+ strerror(vui->sock_errno));
+
+ vlib_cli_output (vm, " Memory regions (total %d)\n", mem->nregions);
+
+ if (mem->nregions){
+ vlib_cli_output(vm, " region fd guest_phys_addr memory_size userspace_addr mmap_offset mmap_addr\n");
+ vlib_cli_output(vm, " ====== ===== ================== ================== ================== ================== ==================\n");
+ }
+ for (j = 0; j < mem->nregions; j++) {
+ vlib_cli_output(vm, " %d %-5d 0x%016lx 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", j,
+ vui->region_fd[j],
+ mem->regions[j].guest_phys_address,
+ mem->regions[j].memory_size,
+ mem->regions[j].userspace_address,
+ mem->regions[j].address_offset,
+ vui->region_addr[j]);
+ }
+ for (q = 0; q < vui->num_vrings; q++) {
+ struct vhost_virtqueue *vq = vhost_dev->virtqueue[q];
+
+ vlib_cli_output(vm, "\n Virtqueue %d\n", q);
+
+ vlib_cli_output(vm, " qsz %d last_used_idx %d last_used_idx_res %d\n",
+ vq->size, vq->last_used_idx, vq->last_used_idx_res);
+
+ if (vq->avail && vq->used)
+ vlib_cli_output(vm, " avail.flags %x avail.idx %d used.flags %x used.idx %d\n",
+ vq->avail->flags, vq->avail->idx, vq->used->flags, vq->used->idx);
+
+ vlib_cli_output(vm, " kickfd %d callfd %d errfd %d\n",
+ vui->vrings[q].kickfd,
+ vui->vrings[q].callfd,
+ vui->vrings[q].errfd);
+
+ if (show_descr) {
+ vlib_cli_output(vm, "\n descriptor table:\n");
+ vlib_cli_output(vm, " id addr len flags next user_addr\n");
+ vlib_cli_output(vm, " ===== ================== ===== ====== ===== ==================\n");
+ for(j = 0; j < vq->size; j++) {
+ vlib_cli_output(vm, " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n",
+ j,
+ vq->desc[j].addr,
+ vq->desc[j].len,
+ vq->desc[j].flags,
+ vq->desc[j].next,
+              (u64) map_guest_mem(xd, vq->desc[j].addr));
+          }
+ }
+ }
+ vlib_cli_output (vm, "\n");
+ }
+done:
+ vec_free (hw_if_indices);
+ return error;
+}
+
+VLIB_CLI_COMMAND (show_vhost_user_command, static) = {
+ .path = "show vhost-user",
+ .short_help = "show vhost-user interface",
+ .function = show_dpdk_vhost_user_command_fn,
+};
+
diff --git a/vnet/vnet/devices/ssvm/node.c b/vnet/vnet/devices/ssvm/node.c
new file mode 100644
index 00000000000..fe53d1199a2
--- /dev/null
+++ b/vnet/vnet/devices/ssvm/node.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "ssvm_eth.h"
+
+vlib_node_registration_t ssvm_eth_input_node;
+
+typedef struct {
+ u32 next_index;
+ u32 sw_if_index;
+} ssvm_eth_input_trace_t;
+
+/* packet trace format function */
+static u8 * format_ssvm_eth_input_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ssvm_eth_input_trace_t * t = va_arg (*args, ssvm_eth_input_trace_t *);
+
+ s = format (s, "SSVM_ETH_INPUT: sw_if_index %d, next index %d",
+ t->sw_if_index, t->next_index);
+ return s;
+}
+
+#define foreach_ssvm_eth_input_error \
+_(NO_BUFFERS, "Rx packet drops (no buffers)")
+
+typedef enum {
+#define _(sym,str) SSVM_ETH_INPUT_ERROR_##sym,
+ foreach_ssvm_eth_input_error
+#undef _
+ SSVM_ETH_INPUT_N_ERROR,
+} ssvm_eth_input_error_t;
+
+static char * ssvm_eth_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ssvm_eth_input_error
+#undef _
+};
+
+typedef enum {
+ SSVM_ETH_INPUT_NEXT_DROP,
+ SSVM_ETH_INPUT_NEXT_ETHERNET_INPUT,
+ SSVM_ETH_INPUT_NEXT_IP4_INPUT,
+ SSVM_ETH_INPUT_NEXT_IP6_INPUT,
+ SSVM_ETH_INPUT_NEXT_MPLS_INPUT,
+ SSVM_ETH_INPUT_N_NEXT,
+} ssvm_eth_input_next_t;
+
+static inline uword
+ssvm_eth_device_input (ssvm_eth_main_t * em,
+ ssvm_private_t * intfc,
+ vlib_node_runtime_t * node)
+{
+ ssvm_shared_header_t * sh = intfc->sh;
+ vlib_main_t * vm = em->vlib_main;
+ unix_shared_memory_queue_t * q;
+ ssvm_eth_queue_elt_t * elt, * elts;
+ u32 elt_index;
+ u32 my_pid = intfc->my_pid;
+ int rx_queue_index;
+ u32 n_to_alloc = VLIB_FRAME_SIZE * 2;
+ u32 n_allocated, n_present_in_cache;
+#if DPDK > 0
+ u32 next_index = DPDK_RX_NEXT_ETHERNET_INPUT;
+#else
+ u32 next_index = 0;
+#endif
+ vlib_buffer_free_list_t * fl;
+ u32 n_left_to_next, * to_next;
+ u32 next0;
+ u32 n_buffers;
+ u32 n_available;
+ u32 bi0, saved_bi0;
+ vlib_buffer_t * b0, * prev;
+ u32 saved_cache_size = 0;
+ ethernet_header_t * eh0;
+ u16 type0;
+ u32 n_rx_bytes = 0, l3_offset0;
+ u32 cpu_index = os_get_cpu_number();
+ u32 trace_cnt __attribute__((unused)) = vlib_get_trace_count (vm, node);
+ volatile u32 * lock;
+ u32 * elt_indices;
+
+ /* Either side down? buh-bye... */
+ if ((u64)(sh->opaque [MASTER_ADMIN_STATE_INDEX]) == 0 ||
+ (u64)(sh->opaque [SLAVE_ADMIN_STATE_INDEX]) == 0)
+ return 0;
+
+ if (intfc->i_am_master)
+ q = (unix_shared_memory_queue_t *)(sh->opaque [TO_MASTER_Q_INDEX]);
+ else
+ q = (unix_shared_memory_queue_t *)(sh->opaque [TO_SLAVE_Q_INDEX]);
+
+ /* Nothing to do? */
+ if (q->cursize == 0)
+ return 0;
+
+ fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ vec_reset_length (intfc->rx_queue);
+
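+  /* the first word of the shared queue doubles as a spinlock; drain all
+     pending element indices into our private rx_queue while holding it */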
+ lock = (u32 *) q;
+ while (__sync_lock_test_and_set (lock, 1))
+ ;
+ while (q->cursize > 0)
+ {
+ unix_shared_memory_queue_sub_raw (q, (u8 *)&elt_index);
+ ASSERT(elt_index < 2048);
+ vec_add1 (intfc->rx_queue, elt_index);
+ }
+ CLIB_MEMORY_BARRIER();
+ *lock = 0;
+
+ n_present_in_cache = vec_len (em->buffer_cache);
+
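+  /* keep at least two vlib buffers per queued element on hand, since
+     chained packets consume more than one buffer */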
+ if (vec_len (em->buffer_cache) < vec_len (intfc->rx_queue) * 2)
+ {
+ vec_validate (em->buffer_cache,
+ n_to_alloc + vec_len (em->buffer_cache) - 1);
+ n_allocated =
+ vlib_buffer_alloc (vm, &em->buffer_cache [n_present_in_cache],
+ n_to_alloc);
+
+ n_present_in_cache += n_allocated;
+ _vec_len (em->buffer_cache) = n_present_in_cache;
+ }
+
+ elts = (ssvm_eth_queue_elt_t *) (sh->opaque [CHUNK_POOL_INDEX]);
+
+ n_buffers = vec_len (intfc->rx_queue);
+ rx_queue_index = 0;
+
+ while (n_buffers > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_buffers > 0 && n_left_to_next > 0)
+ {
+ elt = elts + intfc->rx_queue[rx_queue_index];
+
+ saved_cache_size = n_present_in_cache;
+ if (PREDICT_FALSE(saved_cache_size == 0))
+ {
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ goto out;
+ }
+ saved_bi0 = bi0 = em->buffer_cache [--n_present_in_cache];
+ b0 = vlib_get_buffer (vm, bi0);
+ prev = 0;
+
+ while (1)
+ {
+ vlib_buffer_init_for_free_list (b0, fl);
+ b0->clone_count = 0;
+
+ b0->current_data = elt->current_data_hint;
+ b0->current_length = elt->length_this_buffer;
+ b0->total_length_not_including_first_buffer =
+ elt->total_length_not_including_first_buffer;
+
+ memcpy (b0->data + b0->current_data, elt->data,
+ b0->current_length);
+
+ if (PREDICT_FALSE(prev != 0))
+ prev->next_buffer = bi0;
+
+ if (PREDICT_FALSE(elt->flags & SSVM_BUFFER_NEXT_PRESENT))
+ {
+ prev = b0;
+ if (PREDICT_FALSE(n_present_in_cache == 0))
+ {
+ vlib_put_next_frame (vm, node, next_index,
+ n_left_to_next);
+ goto out;
+ }
+ bi0 = em->buffer_cache [--n_present_in_cache];
+ b0 = vlib_get_buffer (vm, bi0);
+ }
+ else
+ break;
+ }
+
+ saved_cache_size = n_present_in_cache;
+
+ to_next[0] = saved_bi0;
+ to_next++;
+ n_left_to_next--;
+
+ b0 = vlib_get_buffer (vm, saved_bi0);
+ eh0 = vlib_buffer_get_current (b0);
+
+ type0 = clib_net_to_host_u16 (eh0->type);
+
+ next0 = SSVM_ETH_INPUT_NEXT_ETHERNET_INPUT;
+
+ if (type0 == ETHERNET_TYPE_IP4)
+ next0 = SSVM_ETH_INPUT_NEXT_IP4_INPUT;
+ else if (type0 == ETHERNET_TYPE_IP6)
+ next0 = SSVM_ETH_INPUT_NEXT_IP6_INPUT;
+ else if (type0 == ETHERNET_TYPE_MPLS_UNICAST)
+ next0 = SSVM_ETH_INPUT_NEXT_MPLS_INPUT;
+
+ l3_offset0 = ((next0 == SSVM_ETH_INPUT_NEXT_IP4_INPUT ||
+ next0 == SSVM_ETH_INPUT_NEXT_IP6_INPUT ||
+ next0 == SSVM_ETH_INPUT_NEXT_MPLS_INPUT) ?
+ sizeof (ethernet_header_t) : 0);
+
+ n_rx_bytes += b0->current_length
+ + b0->total_length_not_including_first_buffer;
+
+ b0->current_data += l3_offset0;
+ b0->current_length -= l3_offset0;
+ b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] = intfc->vlib_hw_if_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);
+
+ /* $$$$ tracing */
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ n_buffers--;
+ rx_queue_index++;
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ out:
+ if (em->buffer_cache)
+ _vec_len (em->buffer_cache) = saved_cache_size;
+ else
+ ASSERT (saved_cache_size == 0);
+
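+  /* return the consumed chunk indices to the shared free list, under the
+     segment lock, so the peer can reuse them */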
+ ssvm_lock (sh, my_pid, 2);
+
+ ASSERT(vec_len(intfc->rx_queue) > 0);
+
+ n_available = (u32)(u64)(sh->opaque[CHUNK_POOL_NFREE]);
+ elt_indices = (u32 *)(sh->opaque[CHUNK_POOL_FREELIST_INDEX]);
+
+ memcpy (&elt_indices[n_available], intfc->rx_queue,
+ vec_len (intfc->rx_queue) * sizeof (u32));
+
+ n_available += vec_len (intfc->rx_queue);
+ sh->opaque[CHUNK_POOL_NFREE] = (void *) (u64) n_available;
+
+ ssvm_unlock (sh);
+
+ vlib_error_count (vm, node->node_index, SSVM_ETH_INPUT_ERROR_NO_BUFFERS,
+ n_buffers);
+
+ vlib_increment_combined_counter
+ (vnet_get_main()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX, cpu_index,
+ intfc->vlib_hw_if_index,
+ rx_queue_index, n_rx_bytes);
+
+ return rx_queue_index;
+}
+
+static uword
+ssvm_eth_input_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+ ssvm_private_t * intfc;
+ uword n_rx_packets = 0;
+
+ vec_foreach (intfc, em->intfcs)
+ {
+ n_rx_packets += ssvm_eth_device_input (em, intfc, node);
+ }
+
+ return n_rx_packets;
+}
+
+VLIB_REGISTER_NODE (ssvm_eth_input_node) = {
+ .function = ssvm_eth_input_node_fn,
+ .name = "ssvm_eth_input",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ssvm_eth_input_trace,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ .n_errors = ARRAY_LEN(ssvm_eth_input_error_strings),
+ .error_strings = ssvm_eth_input_error_strings,
+
+ .n_next_nodes = SSVM_ETH_INPUT_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [SSVM_ETH_INPUT_NEXT_DROP] = "error-drop",
+ [SSVM_ETH_INPUT_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ [SSVM_ETH_INPUT_NEXT_IP4_INPUT] = "ip4-input",
+ [SSVM_ETH_INPUT_NEXT_IP6_INPUT] = "ip6-input",
+ [SSVM_ETH_INPUT_NEXT_MPLS_INPUT] = "mpls-gre-input",
+ },
+};
+
diff --git a/vnet/vnet/devices/ssvm/ssvm_eth.c b/vnet/vnet/devices/ssvm/ssvm_eth.c
new file mode 100644
index 00000000000..aad63f02bba
--- /dev/null
+++ b/vnet/vnet/devices/ssvm/ssvm_eth.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "ssvm_eth.h"
+
+ssvm_eth_main_t ssvm_eth_main;
+
+#define foreach_ssvm_eth_tx_func_error \
+_(RING_FULL, "Tx packet drops (ring full)") \
+_(NO_BUFFERS, "Tx packet drops (no buffers)") \
+_(ADMIN_DOWN, "Tx packet drops (admin down)")
+
+typedef enum {
+#define _(f,s) SSVM_ETH_TX_ERROR_##f,
+ foreach_ssvm_eth_tx_func_error
+#undef _
+ SSVM_ETH_TX_N_ERROR,
+} ssvm_eth_tx_func_error_t;
+
+static u32 ssvm_eth_flag_change (vnet_main_t * vnm,
+ vnet_hw_interface_t * hi,
+ u32 flags);
+
+int ssvm_eth_create (ssvm_eth_main_t * em, u8 * name, int is_master)
+{
+ ssvm_private_t * intfc;
+ void * oldheap;
+ clib_error_t * e;
+ unix_shared_memory_queue_t * q;
+ ssvm_shared_header_t * sh;
+ ssvm_eth_queue_elt_t * elts;
+ u32 * elt_indices;
+ u8 enet_addr[6];
+ int i, rv;
+
+ vec_add2 (em->intfcs, intfc, 1);
+
+ intfc->ssvm_size = em->segment_size;
+ intfc->i_am_master = 1;
+ intfc->name = name;
+ if (is_master == 0)
+ {
+ rv = ssvm_slave_init (intfc, 20 /* timeout in seconds */);
+ if (rv < 0)
+ return rv;
+ goto create_vnet_interface;
+ }
+
+ intfc->requested_va = em->next_base_va;
+ em->next_base_va += em->segment_size;
+ rv = ssvm_master_init (intfc, intfc - em->intfcs /* master index */);
+
+ if (rv < 0)
+ return rv;
+
+ /* OK, segment created, set up queues and so forth. */
+
+ sh = intfc->sh;
+ oldheap = ssvm_push_heap (sh);
+
+ q = unix_shared_memory_queue_init (em->queue_elts, sizeof (u32),
+ 0 /* consumer pid not interesting */,
+ 0 /* signal not sent */);
+ sh->opaque [TO_MASTER_Q_INDEX] = (void *)q;
+ q = unix_shared_memory_queue_init (em->queue_elts, sizeof (u32),
+ 0 /* consumer pid not interesting */,
+ 0 /* signal not sent */);
+ sh->opaque [TO_SLAVE_Q_INDEX] = (void *)q;
+
+ /*
+ * Preallocate the requested number of buffer chunks
+ * There must be a better way to do this, etc.
+ * Add some slop to avoid pool reallocation, which will not go well
+ */
+ elts = 0;
+ elt_indices = 0;
+
+ vec_validate_aligned (elts, em->nbuffers - 1, CLIB_CACHE_LINE_BYTES);
+ vec_validate_aligned (elt_indices, em->nbuffers - 1, CLIB_CACHE_LINE_BYTES);
+
+ for (i = 0; i < em->nbuffers; i++)
+ elt_indices[i] = i;
+
+ sh->opaque [CHUNK_POOL_INDEX] = (void *) elts;
+ sh->opaque [CHUNK_POOL_FREELIST_INDEX] = (void *) elt_indices;
+ sh->opaque [CHUNK_POOL_NFREE] = (void *) em->nbuffers;
+
+ ssvm_pop_heap (oldheap);
+
+ create_vnet_interface:
+
+ sh = intfc->sh;
+
+ memset (enet_addr, 0, sizeof (enet_addr));
+ enet_addr[0] = 2;
+ enet_addr[1] = 0xFE;
+ enet_addr[2] = is_master;
+ enet_addr[5] = sh->master_index;
+
+ e = ethernet_register_interface
+ (em->vnet_main, ssvm_eth_device_class.index,
+ intfc - em->intfcs,
+ /* ethernet address */ enet_addr,
+ &intfc->vlib_hw_if_index,
+ ssvm_eth_flag_change);
+
+ if (e)
+ {
+ clib_error_report (e);
+ /* $$$$ unmap offending region? */
+ return VNET_API_ERROR_INVALID_INTERFACE;
+ }
+
+ /* Declare link up */
+ vnet_hw_interface_set_flags (em->vnet_main, intfc->vlib_hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP);
+
+ /* Let the games begin... */
+ if (is_master)
+ sh->ready = 1;
+ return 0;
+}
+
+static clib_error_t *
+ssvm_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ u8 * name;
+ int is_master = 1;
+ int i, rv;
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "base-va %llx", &em->next_base_va))
+ ;
+ else if (unformat (input, "segment-size %lld", &em->segment_size))
+ em->segment_size = 1ULL << (max_log2 (em->segment_size));
+ else if (unformat (input, "nbuffers %lld", &em->nbuffers))
+ ;
+ else if (unformat (input, "queue-elts %lld", &em->queue_elts))
+ ;
+ else if (unformat (input, "slave"))
+ is_master = 0;
+ else if (unformat (input, "%s", &name))
+ vec_add1 (em->names, name);
+ else
+ break;
+ }
+
+ /* No configured instances, we're done... */
+ if (vec_len (em->names) == 0)
+ return 0;
+
+ for (i = 0; i < vec_len (em->names); i++)
+ {
+ rv = ssvm_eth_create (em, em->names[i], is_master);
+ if (rv < 0)
+ return clib_error_return (0, "ssvm_eth_create '%s' failed, error %d",
+ em->names[i], rv);
+ }
+
+ vlib_node_set_state (vm, ssvm_eth_input_node.index, VLIB_NODE_STATE_POLLING);
+
+ return 0;
+}
+
+VLIB_CONFIG_FUNCTION (ssvm_config, "ssvm_eth");
+
+
+static clib_error_t * ssvm_eth_init (vlib_main_t * vm)
+{
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+
+ if (((sizeof(ssvm_eth_queue_elt_t) / CLIB_CACHE_LINE_BYTES)
+ * CLIB_CACHE_LINE_BYTES) != sizeof(ssvm_eth_queue_elt_t))
+ clib_warning ("ssvm_eth_queue_elt_t size %d not a multiple of %d",
+ sizeof(ssvm_eth_queue_elt_t), CLIB_CACHE_LINE_BYTES);
+
+ em->vlib_main = vm;
+ em->vnet_main = vnet_get_main();
+ em->elog_main = &vm->elog_main;
+
+ /* default config param values... */
+
+ em->next_base_va = 0x600000000ULL;
+ /*
+ * Allocate 2 full superframes in each dir (256 x 2 x 2 x 2048 bytes),
+   * 2 MB; the default segment size is 8 MB so we have plenty of headroom
+ */
+ em->segment_size = 8<<20;
+ em->nbuffers = 1024;
+ em->queue_elts = 512;
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ssvm_eth_init);
+
+static char * ssvm_eth_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_ssvm_eth_tx_func_error
+#undef _
+};
+
+static u8 * format_ssvm_eth_device_name (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+
+ s = format (s, "ssvmEthernet%d", i);
+ return s;
+}
+
+static u8 * format_ssvm_eth_device (u8 * s, va_list * args)
+{
+ s = format (s, "SSVM Ethernet");
+ return s;
+}
+
+static u8 * format_ssvm_eth_tx_trace (u8 * s, va_list * args)
+{
+ s = format (s, "Unimplemented...");
+ return s;
+}
+
+
+static uword
+ssvm_eth_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+ vnet_interface_output_runtime_t * rd = (void *) node->runtime_data;
+ ssvm_private_t * intfc = vec_elt_at_index (em->intfcs, rd->dev_instance);
+ ssvm_shared_header_t * sh = intfc->sh;
+ unix_shared_memory_queue_t * q;
+ u32 * from;
+ u32 n_left;
+ ssvm_eth_queue_elt_t * elts, * elt, * prev_elt;
+ u32 my_pid = intfc->my_pid;
+ vlib_buffer_t * b0;
+ u32 bi0;
+ u32 size_this_buffer;
+ u32 chunks_this_buffer;
+ u8 i_am_master = intfc->i_am_master;
+ u32 elt_index;
+ int is_ring_full, interface_down;
+ int i;
+ volatile u32 *queue_lock;
+ u32 n_to_alloc = VLIB_FRAME_SIZE;
+ u32 n_allocated, n_present_in_cache, n_available;
+ u32 * elt_indices;
+
+ if (i_am_master)
+ q = (unix_shared_memory_queue_t *)sh->opaque [TO_SLAVE_Q_INDEX];
+ else
+ q = (unix_shared_memory_queue_t *)sh->opaque [TO_MASTER_Q_INDEX];
+
+ queue_lock = (u32 *) q;
+
+ from = vlib_frame_vector_args (f);
+ n_left = f->n_vectors;
+ is_ring_full = 0;
+ interface_down = 0;
+
+ n_present_in_cache = vec_len (em->chunk_cache);
+
+ /* admin / link up/down check */
+ if ((u64)(sh->opaque [MASTER_ADMIN_STATE_INDEX]) == 0 ||
+ (u64)(sh->opaque [SLAVE_ADMIN_STATE_INDEX]) == 0)
+ {
+ interface_down = 1;
+ goto out;
+ }
+
+ ssvm_lock (sh, my_pid, 1);
+
+ elts = (ssvm_eth_queue_elt_t *) (sh->opaque [CHUNK_POOL_INDEX]);
+ elt_indices = (u32 *) (sh->opaque [CHUNK_POOL_FREELIST_INDEX]);
+ n_available = (u32) (u64) (sh->opaque [CHUNK_POOL_NFREE]);
+
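+  /* top up the private chunk cache from the shared free list if it may not
+     cover this frame */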
+ if (n_present_in_cache < n_left*2)
+ {
+ vec_validate (em->chunk_cache,
+ n_to_alloc + n_present_in_cache - 1);
+
+ n_allocated = n_to_alloc < n_available ? n_to_alloc : n_available;
+
+ if (PREDICT_TRUE(n_allocated > 0))
+ {
+ memcpy (&em->chunk_cache[n_present_in_cache],
+ &elt_indices[n_available - n_allocated],
+ sizeof(u32) * n_allocated);
+ }
+
+ n_present_in_cache += n_allocated;
+ n_available -= n_allocated;
+ sh->opaque [CHUNK_POOL_NFREE] = (void *) (u64) n_available;
+ _vec_len (em->chunk_cache) = n_present_in_cache;
+ }
+
+ ssvm_unlock (sh);
+
+ while (n_left)
+ {
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
+
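+      /* a packet may span several fixed-size shared-memory chunks; round up */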
+ size_this_buffer = vlib_buffer_length_in_chain (vm, b0);
+ chunks_this_buffer = (size_this_buffer + (SSVM_BUFFER_SIZE - 1))
+ / SSVM_BUFFER_SIZE;
+
+ /* If we're not going to be able to enqueue the buffer, tail drop. */
+ if (q->cursize >= q->maxsize)
+ {
+ is_ring_full = 1;
+ break;
+ }
+
+ prev_elt = 0;
+ elt_index = ~0;
+ for (i = 0; i < chunks_this_buffer; i++)
+ {
+ if (PREDICT_FALSE (n_present_in_cache == 0))
+ goto out;
+
+ elt_index = em->chunk_cache[--n_present_in_cache];
+ elt = elts + elt_index;
+
+ elt->type = SSVM_PACKET_TYPE;
+ elt->flags = 0;
+ elt->total_length_not_including_first_buffer =
+ b0->total_length_not_including_first_buffer;
+ elt->length_this_buffer = b0->current_length;
+ elt->current_data_hint = b0->current_data;
+ elt->owner = !i_am_master;
+ elt->tag = 1;
+
+ memcpy (elt->data, b0->data + b0->current_data, b0->current_length);
+
+ if (PREDICT_FALSE (prev_elt != 0))
+ prev_elt->next_index = elt - elts;
+
+ if (PREDICT_FALSE(i < (chunks_this_buffer-1)))
+ {
+ elt->flags = SSVM_BUFFER_NEXT_PRESENT;
+ ASSERT (b0->flags & VLIB_BUFFER_NEXT_PRESENT);
+ b0 = vlib_get_buffer (vm, b0->next_buffer);
+ }
+ prev_elt = elt;
+ }
+
+ while (__sync_lock_test_and_set (queue_lock, 1))
+ ;
+
+ unix_shared_memory_queue_add_raw (q, (u8 *)&elt_index);
+ CLIB_MEMORY_BARRIER();
+ *queue_lock = 0;
+
+ from++;
+ n_left--;
+ }
+
+ out:
+ if (PREDICT_FALSE(n_left))
+ {
+ if (is_ring_full)
+ vlib_error_count (vm, node->node_index, SSVM_ETH_TX_ERROR_RING_FULL,
+ n_left);
+ else if (interface_down)
+ vlib_error_count (vm, node->node_index, SSVM_ETH_TX_ERROR_ADMIN_DOWN,
+ n_left);
+ else
+ vlib_error_count (vm, node->node_index, SSVM_ETH_TX_ERROR_NO_BUFFERS,
+ n_left);
+
+ vlib_buffer_free (vm, from, n_left);
+ }
+ else
+ vlib_buffer_free (vm, vlib_frame_vector_args (f), f->n_vectors);
+
+ if (PREDICT_TRUE(vec_len(em->chunk_cache)))
+ _vec_len(em->chunk_cache) = n_present_in_cache;
+
+ return f->n_vectors;
+}
+
+static void ssvm_eth_clear_hw_interface_counters (u32 instance)
+{
+ /* Nothing for now */
+}
+
+static clib_error_t *
+ssvm_eth_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ vnet_hw_interface_t * hif = vnet_get_hw_interface (vnm, hw_if_index);
+ uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+ ssvm_private_t * intfc = vec_elt_at_index (em->intfcs, hif->dev_instance);
+ ssvm_shared_header_t * sh;
+
+ /* publish link-state in shared-memory, to discourage buffer-wasting */
+ sh = intfc->sh;
+ if (intfc->i_am_master)
+ sh->opaque [MASTER_ADMIN_STATE_INDEX] = (void *) is_up;
+ else
+ sh->opaque [SLAVE_ADMIN_STATE_INDEX] = (void *) is_up;
+
+ return 0;
+}
+
+static clib_error_t *
+ssvm_eth_subif_add_del_function (vnet_main_t * vnm,
+ u32 hw_if_index,
+ struct vnet_sw_interface_t * st,
+ int is_add)
+{
+ /* Nothing for now */
+ return 0;
+}
+
+/*
+ * Dynamically redirect all pkts from a specific interface
+ * to the specified node
+ */
+static void
+ssvm_eth_set_interface_next_node (vnet_main_t *vnm, u32 hw_if_index,
+ u32 node_index)
+{
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ ssvm_private_t * intfc = pool_elt_at_index (em->intfcs, hw->dev_instance);
+
+ /* Shut off redirection */
+ if (node_index == ~0)
+ {
+ intfc->per_interface_next_index = node_index;
+ return;
+ }
+
+ intfc->per_interface_next_index =
+ vlib_node_add_next (em->vlib_main, ssvm_eth_input_node.index, node_index);
+}
+
+static u32 ssvm_eth_flag_change (vnet_main_t * vnm,
+ vnet_hw_interface_t * hi,
+ u32 flags)
+{
+ /* nothing for now */
+ return 0;
+}
+
+VNET_DEVICE_CLASS (ssvm_eth_device_class) = {
+ .name = "ssvm-eth",
+ .tx_function = ssvm_eth_interface_tx,
+ .tx_function_n_errors = SSVM_ETH_TX_N_ERROR,
+ .tx_function_error_strings = ssvm_eth_tx_func_error_strings,
+ .format_device_name = format_ssvm_eth_device_name,
+ .format_device = format_ssvm_eth_device,
+ .format_tx_trace = format_ssvm_eth_tx_trace,
+ .clear_counters = ssvm_eth_clear_hw_interface_counters,
+ .admin_up_down_function = ssvm_eth_interface_admin_up_down,
+ .subif_add_del_function = ssvm_eth_subif_add_del_function,
+ .rx_redirect_to_node = ssvm_eth_set_interface_next_node,
+ .no_flatten_output_chains = 1,
+};
diff --git a/vnet/vnet/devices/ssvm/ssvm_eth.h b/vnet/vnet/devices/ssvm/ssvm_eth.h
new file mode 100644
index 00000000000..1b077220305
--- /dev/null
+++ b/vnet/vnet/devices/ssvm/ssvm_eth.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_ssvm_eth_h__
+#define __included_ssvm_eth_h__
+
+#include <vnet/vnet.h>
+
+#include <vppinfra/elog.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/elog.h>
+#include <vlib/vlib.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/pg/pg.h>
+#include <vlibmemory/unix_shared_memory_queue.h>
+
+#include <ssvm.h>
+
+vnet_device_class_t ssvm_eth_device_class;
+vlib_node_registration_t ssvm_eth_input_node;
+
+#define SSVM_BUFFER_SIZE \
+ (VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES + VLIB_BUFFER_PRE_DATA_SIZE)
+#define SSVM_PACKET_TYPE 1
+
+typedef struct {
+ /* Type of queue element */
+ u8 type;
+ u8 flags;
+#define SSVM_BUFFER_NEXT_PRESENT (1<<0)
+ u8 owner;
+ u8 tag;
+ i16 current_data_hint;
+ u16 length_this_buffer;
+ u16 total_length_not_including_first_buffer;
+ u16 pad;
+ u32 next_index;
+ /* offset 16 */
+ u8 data [SSVM_BUFFER_SIZE];
+ /* pad to an even multiple of 64 octets */
+ u8 pad2[CLIB_CACHE_LINE_BYTES - 16];
+} ssvm_eth_queue_elt_t;
+
+typedef struct {
+ /* vector of point-to-point connections */
+ ssvm_private_t * intfcs;
+
+ u32 * buffer_cache;
+ u32 * chunk_cache;
+
+ /* Configurable parameters */
+ /* base address for next placement */
+ u64 next_base_va;
+ u64 segment_size;
+ u64 nbuffers;
+ u64 queue_elts;
+
+ /* Segment names */
+ u8 ** names;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+ elog_main_t * elog_main;
+} ssvm_eth_main_t;
+
+ssvm_eth_main_t ssvm_eth_main;
+
+typedef enum {
+ CHUNK_POOL_FREELIST_INDEX = 0,
+ CHUNK_POOL_INDEX,
+ CHUNK_POOL_NFREE,
+ TO_MASTER_Q_INDEX,
+ TO_SLAVE_Q_INDEX,
+ MASTER_ADMIN_STATE_INDEX,
+ SLAVE_ADMIN_STATE_INDEX,
+} ssvm_eth_opaque_index_t;
+
+/*
+ * debug scaffolding.
+ */
+static inline void ssvm_eth_validate_freelists (int need_lock)
+{
+#if CLIB_DEBUG > 0
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+ ssvm_private_t * intfc;
+ ssvm_shared_header_t * sh;
+ u32 * elt_indices;
+ u32 n_available;
+  int i, j;
+
+ for (i = 0; i < vec_len (em->intfcs); i++)
+ {
+ intfc = em->intfcs + i;
+ sh = intfc->sh;
+ u32 my_pid = intfc->my_pid;
+
+ if (need_lock)
+ ssvm_lock (sh, my_pid, 15);
+
+ elt_indices = (u32 *) (sh->opaque [CHUNK_POOL_FREELIST_INDEX]);
+ n_available = (u32) (u64) (sh->opaque [CHUNK_POOL_NFREE]);
+
+      for (j = 0; j < n_available; j++)
+        ASSERT (elt_indices[j] < 2048);
+
+ if (need_lock)
+ ssvm_unlock (sh);
+ }
+#endif
+}
+
+#endif /* __included_ssvm_eth_h__ */
diff --git a/vnet/vnet/devices/virtio/vhost-user.c b/vnet/vnet/devices/virtio/vhost-user.c
new file mode 100644
index 00000000000..4df025c21b6
--- /dev/null
+++ b/vnet/vnet/devices/virtio/vhost-user.c
@@ -0,0 +1,1957 @@
+/*
+ *------------------------------------------------------------------
+ * vhost.c - vhost-user
+ *
+ * Copyright (c) 2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <fcntl.h> /* for open */
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h> /* for iovec */
+#include <netinet/in.h>
+#include <sys/vfs.h>
+
+#include <linux/if_arp.h>
+#include <linux/if_tun.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+
+#include <vnet/ip/ip.h>
+
+#include <vnet/ethernet/ethernet.h>
+
+#include <vnet/devices/virtio/vhost-user.h>
+
+#define VHOST_USER_DEBUG_SOCKET 0
+#define VHOST_USER_DEBUG_VQ 0
+
+/* Set to get virtio_net_hdr in buffer pre-data;
+   details will be shown in the packet trace */
+#define VHOST_USER_COPY_TX_HDR 0
+
+#if VHOST_USER_DEBUG_SOCKET == 1
+#define DBG_SOCK(args...) clib_warning(args);
+#else
+#define DBG_SOCK(args...)
+#endif
+
+#if VHOST_USER_DEBUG_VQ == 1
+#define DBG_VQ(args...) clib_warning(args);
+#else
+#define DBG_VQ(args...)
+#endif
+
+vlib_node_registration_t vhost_user_input_node;
+
+#define foreach_vhost_user_tx_func_error \
+ _(PKT_DROP_NOBUF, "tx packet drops (no available descriptors)") \
+ _(MMAP_FAIL, "mmap failure")
+
+typedef enum {
+#define _(f,s) VHOST_USER_TX_FUNC_ERROR_##f,
+ foreach_vhost_user_tx_func_error
+#undef _
+ VHOST_USER_TX_FUNC_N_ERROR,
+} vhost_user_tx_func_error_t;
+
+static char * vhost_user_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_vhost_user_tx_func_error
+#undef _
+};
+
+#define foreach_vhost_user_input_func_error \
+ _(NO_ERROR, "no error") \
+ _(UNDERSIZED_FRAME, "undersized ethernet frame received (< 14 bytes)")
+
+typedef enum {
+#define _(f,s) VHOST_USER_INPUT_FUNC_ERROR_##f,
+ foreach_vhost_user_input_func_error
+#undef _
+ VHOST_USER_INPUT_FUNC_N_ERROR,
+} vhost_user_input_func_error_t;
+
+static char * vhost_user_input_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_vhost_user_input_func_error
+#undef _
+};
+
+static vhost_user_main_t vhost_user_main = {
+ .mtu_bytes = 1518,
+};
+
+VNET_HW_INTERFACE_CLASS (vhost_interface_class, static) = {
+ .name = "vhost-user",
+};
+
+static u8 * format_vhost_user_interface_name (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+ u32 show_dev_instance = ~0;
+ vhost_user_main_t * vum = &vhost_user_main;
+
+ if (i < vec_len (vum->show_dev_instance_by_real_dev_instance))
+ show_dev_instance = vum->show_dev_instance_by_real_dev_instance[i];
+
+ if (show_dev_instance != ~0)
+ i = show_dev_instance;
+
+ s = format (s, "VirtualEthernet0/0/%d", i);
+ return s;
+}
+
+static int vhost_user_name_renumber (vnet_hw_interface_t * hi,
+ u32 new_dev_instance)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+
+ vec_validate_init_empty (vum->show_dev_instance_by_real_dev_instance,
+ hi->dev_instance, ~0);
+
+ vum->show_dev_instance_by_real_dev_instance [hi->dev_instance] =
+ new_dev_instance;
+
+ DBG_SOCK("renumbered vhost-user interface dev_instance %d to %d",
+ hi->dev_instance, new_dev_instance);
+
+ return 0;
+}
+
+
+static inline void * map_guest_mem(vhost_user_intf_t * vui, u64 addr)
+{
+ int i;
+ for (i=0; i<vui->nregions; i++) {
+ if ((vui->regions[i].guest_phys_addr <= addr) &&
+ ((vui->regions[i].guest_phys_addr + vui->regions[i].memory_size) > addr)) {
+ return (void *) (vui->region_mmap_addr[i] + addr - vui->regions[i].guest_phys_addr);
+ }
+ }
+ DBG_VQ("failed to map guest mem addr %llx", addr);
+ return 0;
+}
+
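+/* translate an address from the driver process (e.g. qemu) virtual address space to our local mapping */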
+static inline void * map_user_mem(vhost_user_intf_t * vui, u64 addr)
+{
+ int i;
+ for (i=0; i<vui->nregions; i++) {
+ if ((vui->regions[i].userspace_addr <= addr) &&
+ ((vui->regions[i].userspace_addr + vui->regions[i].memory_size) > addr)) {
+ return (void *) (vui->region_mmap_addr[i] + addr - vui->regions[i].userspace_addr);
+ }
+ }
+ return 0;
+}
+
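+/* hugetlbfs reports its huge page size as the filesystem block size, so fstatfs() on the region fd yields the page size */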
+static long get_huge_page_size(int fd)
+{
+ struct statfs s;
+ fstatfs(fd, &s);
+ return s.f_bsize;
+}
+
+static void unmap_all_mem_regions(vhost_user_intf_t * vui)
+{
+ int i,r;
+ for (i=0; i<vui->nregions; i++) {
+ if (vui->region_mmap_addr[i] != (void *) -1) {
+
+ long page_sz = get_huge_page_size(vui->region_mmap_fd[i]);
+
+ ssize_t map_sz = (vui->regions[i].memory_size +
+ vui->regions[i].mmap_offset + page_sz) & ~(page_sz - 1);
+
+ r = munmap(vui->region_mmap_addr[i] - vui->regions[i].mmap_offset, map_sz);
+
+ DBG_SOCK("unmap memory region %d addr 0x%lx len 0x%lx page_sz 0x%x", i,
+ vui->region_mmap_addr[i], map_sz, page_sz);
+
+ vui->region_mmap_addr[i]= (void *) -1;
+
+ if (r == -1) {
+ clib_warning("failed to unmap memory region (errno %d)", errno);
+ }
+ close(vui->region_mmap_fd[i]);
+ }
+ }
+ vui->nregions = 0;
+}
+
+
+static clib_error_t * vhost_user_callfd_read_ready (unix_file_t * uf)
+{
+ __attribute__((unused)) int n;
+ u8 buff[8];
+ n = read(uf->file_descriptor, ((char*)&buff), 8);
+ return 0;
+}
+
+static inline void vhost_user_if_disconnect(vhost_user_intf_t * vui)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vnet_main_t * vnm = vnet_get_main();
+ int q;
+
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0);
+
+ if (vui->unix_file_index != ~0) {
+ unix_file_del (&unix_main, unix_main.file_pool + vui->unix_file_index);
+ vui->unix_file_index = ~0;
+ }
+
+ hash_unset(vum->vhost_user_interface_index_by_sock_fd, vui->unix_fd);
+ hash_unset(vum->vhost_user_interface_index_by_listener_fd, vui->unix_fd);
+ close(vui->unix_fd);
+ vui->unix_fd = -1;
+ vui->is_up = 0;
+ for (q = 0; q < vui->num_vrings; q++) {
+ vui->vrings[q].desc = NULL;
+ vui->vrings[q].avail = NULL;
+ vui->vrings[q].used = NULL;
+ }
+
+ unmap_all_mem_regions(vui);
+ DBG_SOCK("interface ifindex %d disconnected", vui->sw_if_index);
+}
+
+static clib_error_t * vhost_user_socket_read (unix_file_t * uf)
+{
+ int n, i;
+ int fd, number_of_fds = 0;
+ int fds[VHOST_MEMORY_MAX_NREGIONS];
+ vhost_user_msg_t msg;
+ struct msghdr mh;
+ struct iovec iov[1];
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ struct cmsghdr *cmsg;
+ uword * p;
+ u8 q;
+ unix_file_t template = {0};
+ vnet_main_t * vnm = vnet_get_main();
+
+ p = hash_get (vum->vhost_user_interface_index_by_sock_fd,
+ uf->file_descriptor);
+ if (p == 0) {
+ DBG_SOCK ("FD %d doesn't belong to any interface",
+ uf->file_descriptor);
+ return 0;
+ }
+ else
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, p[0]);
+
+ char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))];
+
+ memset(&mh, 0, sizeof(mh));
+ memset(control, 0, sizeof(control));
+
+ /* set the payload */
+ iov[0].iov_base = (void *) &msg;
+ iov[0].iov_len = VHOST_USER_MSG_HDR_SZ;
+
+ mh.msg_iov = iov;
+ mh.msg_iovlen = 1;
+ mh.msg_control = control;
+ mh.msg_controllen = sizeof(control);
+
+ n = recvmsg(uf->file_descriptor, &mh, 0);
+
+ if (n != VHOST_USER_MSG_HDR_SZ)
+ goto close_socket;
+
+ if (mh.msg_flags & MSG_CTRUNC) {
+ goto close_socket;
+ }
+
+ cmsg = CMSG_FIRSTHDR(&mh);
+
+ if (cmsg && (cmsg->cmsg_len > 0) && (cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS) &&
+ (cmsg->cmsg_len - CMSG_LEN(0) <= VHOST_MEMORY_MAX_NREGIONS * sizeof(int))) {
+ number_of_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+ memcpy(fds, CMSG_DATA(cmsg), number_of_fds * sizeof(int));
+ }
+
+  /* require protocol version 1 with the reply bit clear */
+ if ((msg.flags & 7) != 1) {
+ DBG_SOCK("malformed message received. closing socket");
+ goto close_socket;
+ }
+
+ {
+ int rv __attribute__((unused));
+ /* $$$$ pay attention to rv */
+ rv = read(uf->file_descriptor, ((char*)&msg) + n, msg.size);
+ }
+
+ switch (msg.request) {
+ case VHOST_USER_GET_FEATURES:
+ DBG_SOCK("if %d msg VHOST_USER_GET_FEATURES",
+ vui->hw_if_index);
+
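+    /* bit 2 (value 4) of flags marks this message as needing a reply; it is sent back after the switch */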
+ msg.flags |= 4;
+ msg.u64 = (1 << FEAT_VIRTIO_NET_F_MRG_RXBUF) |
+ (1 << FEAT_VIRTIO_F_ANY_LAYOUT);
+ msg.u64 &= vui->feature_mask;
+
+ msg.size = sizeof(msg.u64);
+ break;
+
+ case VHOST_USER_SET_FEATURES:
+ DBG_SOCK("if %d msg VHOST_USER_SET_FEATURES features 0x%016llx",
+ vui->hw_if_index, msg.u64);
+
+ vui->features = msg.u64;
+ if (vui->features & (1 << FEAT_VIRTIO_NET_F_MRG_RXBUF))
+ vui->virtio_net_hdr_sz = 12;
+ else
+ vui->virtio_net_hdr_sz = 10;
+
+ vui->is_any_layout = (vui->features & (1 << FEAT_VIRTIO_F_ANY_LAYOUT)) ? 1 : 0;
+
+ ASSERT (vui->virtio_net_hdr_sz < VLIB_BUFFER_PRE_DATA_SIZE);
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0);
+ vui->is_up = 0;
+
+ for (q = 0; q < 2; q++) {
+ vui->vrings[q].desc = 0;
+ vui->vrings[q].avail = 0;
+ vui->vrings[q].used = 0;
+ }
+
+ DBG_SOCK("interface %d disconnected", vui->sw_if_index);
+
+ break;
+
+ case VHOST_USER_SET_MEM_TABLE:
+ DBG_SOCK("if %d msg VHOST_USER_SET_MEM_TABLE nregions %d",
+ vui->hw_if_index, msg.memory.nregions);
+
+ if ((msg.memory.nregions < 1) ||
+ (msg.memory.nregions > VHOST_MEMORY_MAX_NREGIONS)) {
+
+ DBG_SOCK("number of mem regions must be between 1 and %i",
+ VHOST_MEMORY_MAX_NREGIONS);
+
+ goto close_socket;
+ }
+
+ if (msg.memory.nregions != number_of_fds) {
+ DBG_SOCK("each memory region must have FD");
+ goto close_socket;
+ }
+ unmap_all_mem_regions(vui);
+ for(i=0; i < msg.memory.nregions; i++) {
+ memcpy(&(vui->regions[i]), &msg.memory.regions[i],
+ sizeof(vhost_user_memory_region_t));
+
+ long page_sz = get_huge_page_size(fds[i]);
+
+      /* round the region size up to a multiple of the huge page size */
+ ssize_t map_sz = (vui->regions[i].memory_size +
+ vui->regions[i].mmap_offset + page_sz) & ~(page_sz - 1);
+
+ vui->region_mmap_addr[i] = mmap(0, map_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fds[i], 0);
+
+ DBG_SOCK("map memory region %d addr 0 len 0x%lx fd %d mapped 0x%lx "
+ "page_sz 0x%x", i, map_sz, fds[i], vui->region_mmap_addr[i], page_sz);
+
+ if (vui->region_mmap_addr[i] == MAP_FAILED) {
+ clib_warning("failed to map memory. errno is %d", errno);
+ goto close_socket;
+ }
+ vui->region_mmap_addr[i] += vui->regions[i].mmap_offset;
+ vui->region_mmap_fd[i] = fds[i];
+ }
+ vui->nregions = msg.memory.nregions;
+ break;
+
+ case VHOST_USER_SET_VRING_NUM:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_NUM idx %d num %d",
+ vui->hw_if_index, msg.state.index, msg.state.num);
+
+ if ((msg.state.num > 32768) || /* maximum ring size is 32768 */
+ (msg.state.num == 0) || /* it cannot be zero */
+        (msg.state.num & (msg.state.num - 1))) /* must be a power of 2 */
+ goto close_socket;
+ vui->vrings[msg.state.index].qsz = msg.state.num;
+ break;
+
+ case VHOST_USER_SET_VRING_ADDR:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_ADDR idx %d",
+ vui->hw_if_index, msg.state.index);
+
+ vui->vrings[msg.state.index].desc = (vring_desc_t *)
+ map_user_mem(vui, msg.addr.desc_user_addr);
+ vui->vrings[msg.state.index].used = (vring_used_t *)
+ map_user_mem(vui, msg.addr.used_user_addr);
+ vui->vrings[msg.state.index].avail = (vring_avail_t *)
+ map_user_mem(vui, msg.addr.avail_user_addr);
+
+ if ((vui->vrings[msg.state.index].desc == NULL) ||
+ (vui->vrings[msg.state.index].used == NULL) ||
+ (vui->vrings[msg.state.index].avail == NULL)) {
+ DBG_SOCK("failed to map user memory for hw_if_index %d",
+ vui->hw_if_index);
+ goto close_socket;
+ }
+
+ vui->vrings[msg.state.index].last_used_idx =
+ vui->vrings[msg.state.index].used->idx;
+
+ /* tell driver that we don't want interrupts */
+ vui->vrings[msg.state.index].used->flags |= 1;
+ break;
+
+ case VHOST_USER_SET_OWNER:
+ DBG_SOCK("if %d msg VHOST_USER_SET_OWNER",
+ vui->hw_if_index);
+ break;
+
+ case VHOST_USER_RESET_OWNER:
+ DBG_SOCK("if %d msg VHOST_USER_RESET_OWNER",
+ vui->hw_if_index);
+ break;
+
+ case VHOST_USER_SET_VRING_CALL:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_CALL u64 %d",
+ vui->hw_if_index, msg.u64);
+
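+    /* low byte of u64 selects the vring; if bit 8 is clear, exactly one fd is expected in the ancillary data */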
+ q = (u8) (msg.u64 & 0xFF);
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ goto close_socket;
+
+        /* if there is an old fd, delete it */
+ if (vui->vrings[q].callfd) {
+ unix_file_t * uf = pool_elt_at_index (unix_main.file_pool,
+ vui->vrings[q].callfd_idx);
+ unix_file_del (&unix_main, uf);
+ }
+ vui->vrings[q].callfd = fds[0];
+ template.read_function = vhost_user_callfd_read_ready;
+ template.file_descriptor = fds[0];
+ vui->vrings[q].callfd_idx = unix_file_add (&unix_main, &template);
+ }
+ else
+ vui->vrings[q].callfd = -1;
+ break;
+
+ case VHOST_USER_SET_VRING_KICK:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_KICK u64 %d",
+ vui->hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ goto close_socket;
+
+ vui->vrings[q].kickfd = fds[0];
+ }
+ else
+ vui->vrings[q].kickfd = -1;
+ break;
+
+ case VHOST_USER_SET_VRING_ERR:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_ERR u64 %d",
+ vui->hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ goto close_socket;
+
+ fd = fds[0];
+ }
+ else
+ fd = -1;
+
+ vui->vrings[q].errfd = fd;
+ break;
+
+ case VHOST_USER_SET_VRING_BASE:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d",
+ vui->hw_if_index, msg.state.index, msg.state.num);
+
+ vui->vrings[msg.state.index].last_avail_idx = msg.state.num;
+ break;
+
+ case VHOST_USER_GET_VRING_BASE:
+ DBG_SOCK("if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d",
+ vui->hw_if_index, msg.state.index, msg.state.num);
+
+ msg.state.num = vui->vrings[msg.state.index].last_used_idx;
+ msg.flags |= 4;
+ msg.size = sizeof(msg.state);
+ break;
+
+ case VHOST_USER_NONE:
+ DBG_SOCK("if %d msg VHOST_USER_NONE",
+ vui->hw_if_index);
+
+ break;
+
+ case VHOST_USER_SET_LOG_BASE:
+ DBG_SOCK("if %d msg VHOST_USER_SET_LOG_BASE",
+ vui->hw_if_index);
+
+ break;
+
+ case VHOST_USER_SET_LOG_FD:
+ DBG_SOCK("if %d msg VHOST_USER_SET_LOG_FD",
+ vui->hw_if_index);
+
+ break;
+
+ default:
+ DBG_SOCK("unknown vhost-user message %d received. closing socket",
+ msg.request);
+ goto close_socket;
+ }
+
+  /* if we have pointers to the descriptor tables, bring the interface up */
+ if (!vui->is_up &&
+ vui->vrings[VHOST_NET_VRING_IDX_TX].desc &&
+ vui->vrings[VHOST_NET_VRING_IDX_RX].desc) {
+
+ DBG_SOCK("interface %d connected", vui->sw_if_index);
+
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index, VNET_HW_INTERFACE_FLAG_LINK_UP);
+ vui->is_up = 1;
+
+ }
+
+ /* if we need to reply */
+ if (msg.flags & 4)
+ {
+ n = send(uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0);
+ if (n != (msg.size + VHOST_USER_MSG_HDR_SZ))
+ goto close_socket;
+ }
+
+ return 0;
+
+close_socket:
+ vhost_user_if_disconnect(vui);
+ return 0;
+}
+
+static clib_error_t * vhost_user_socket_error (unix_file_t * uf)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ uword * p;
+
+ p = hash_get (vum->vhost_user_interface_index_by_sock_fd,
+ uf->file_descriptor);
+ if (p == 0) {
+ DBG_SOCK ("fd %d doesn't belong to any interface",
+ uf->file_descriptor);
+ return 0;
+ }
+ else
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, p[0]);
+
+ vhost_user_if_disconnect(vui);
+ return 0;
+}
+
+static clib_error_t * vhost_user_socksvr_accept_ready (unix_file_t * uf)
+{
+ int client_fd, client_len;
+ struct sockaddr_un client;
+ unix_file_t template = {0};
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ uword * p;
+
+ p = hash_get (vum->vhost_user_interface_index_by_listener_fd,
+ uf->file_descriptor);
+ if (p == 0) {
+ DBG_SOCK ("fd %d doesn't belong to any interface",
+ uf->file_descriptor);
+ return 0;
+ }
+ else
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, p[0]);
+
+ client_len = sizeof(client);
+ client_fd = accept (uf->file_descriptor,
+ (struct sockaddr *)&client,
+ (socklen_t *)&client_len);
+
+ if (client_fd < 0)
+ return clib_error_return_unix (0, "accept");
+
+ template.read_function = vhost_user_socket_read;
+ template.error_function = vhost_user_socket_error;
+ template.file_descriptor = client_fd;
+ vui->unix_file_index = unix_file_add (&unix_main, &template);
+
+ vui->client_fd = client_fd;
+ hash_set (vum->vhost_user_interface_index_by_sock_fd, vui->client_fd,
+ vui - vum->vhost_user_interfaces);
+
+ return 0;
+}
+
+static clib_error_t *
+vhost_user_init (vlib_main_t * vm)
+{
+ clib_error_t * error;
+ vhost_user_main_t * vum = &vhost_user_main;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+
+ error = vlib_call_init_function (vm, ip4_init);
+ if (error)
+ return error;
+
+ vum->vhost_user_interface_index_by_listener_fd = hash_create (0, sizeof (uword));
+ vum->vhost_user_interface_index_by_sock_fd = hash_create (0, sizeof (uword));
+ vum->vhost_user_interface_index_by_sw_if_index = hash_create (0, sizeof (uword));
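+ /* defaults: interrupt the guest at most every 32 frames or 1 ms,
+    whichever comes first */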
+ vum->coalesce_frames = 32;
+ vum->coalesce_time = 1e-3;
+
+ vec_validate_aligned (vum->rx_buffers, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (vhost_user_init);
+
+static clib_error_t *
+vhost_user_exit (vlib_main_t * vm)
+{
+ /* TODO cleanup */
+ return 0;
+}
+
+VLIB_MAIN_LOOP_EXIT_FUNCTION (vhost_user_exit);
+
+enum {
+ VHOST_USER_RX_NEXT_ETHERNET_INPUT,
+ VHOST_USER_RX_NEXT_DROP,
+ VHOST_USER_RX_N_NEXT,
+};
+
+
+typedef struct {
+ u16 virtqueue;
+ u16 device_index;
+#if VHOST_USER_COPY_TX_HDR == 1
+ virtio_net_hdr_t hdr;
+#endif
+} vhost_user_input_trace_t;
+
+static u8 * format_vhost_user_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main();
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_input_trace_t * t = va_arg (*va, vhost_user_input_trace_t *);
+ vhost_user_intf_t * vui = vec_elt_at_index (vum->vhost_user_interfaces,
+ t->device_index);
+
+ vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, vui->sw_if_index);
+
+#if VHOST_USER_COPY_TX_HDR == 1
+ uword indent = format_get_indent (s);
+#endif
+
+ s = format (s, "%U virtqueue %d",
+ format_vnet_sw_interface_name, vnm, sw,
+ t->virtqueue);
+
+#if VHOST_USER_COPY_TX_HDR == 1
+ s = format (s, "\n%Uvirtio_net_hdr flags 0x%02x gso_type %u hdr_len %u",
+ format_white_space, indent,
+ t->hdr.flags,
+ t->hdr.gso_type,
+ t->hdr.hdr_len);
+#endif
+
+ return s;
+}
+
+void vhost_user_rx_trace (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vhost_user_intf_t *vui,
+ i16 virtqueue)
+{
+ u32 * b, n_left;
+ vhost_user_main_t * vum = &vhost_user_main;
+
+ u32 next_index = VHOST_USER_RX_NEXT_ETHERNET_INPUT;
+
+ n_left = vec_len(vui->d_trace_buffers);
+ b = vui->d_trace_buffers;
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ vhost_user_input_trace_t * t0;
+
+ bi0 = b[0];
+ n_left -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ vlib_trace_buffer (vm, node, next_index, b0, /* follow_chain */ 0);
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->virtqueue = virtqueue;
+ t0->device_index = vui - vum->vhost_user_interfaces;
+#if VHOST_USER_COPY_TX_HDR == 1
+ rte_memcpy(&t0->hdr, b0->pre_data, sizeof(virtio_net_hdr_t));
+#endif
+
+ b+=1;
+ }
+}
+
+static inline void vhost_user_send_call(vlib_main_t * vm, vhost_user_vring_t * vq)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ u64 x = 1;
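+ /* callfd is an eventfd: writing a nonzero counter value signals the
+    guest that new buffers have been used */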
+ int rv __attribute__((unused));
+ /* $$$$ pay attention to rv */
+ rv = write(vq->callfd, &x, sizeof(x));
+ vq->n_since_last_int = 0;
+ vq->int_deadline = vlib_time_now(vm) + vum->coalesce_time;
+}
+
+static u32 vhost_user_if_input ( vlib_main_t * vm,
+ vhost_user_main_t * vum,
+ vhost_user_intf_t * vui,
+ vlib_node_runtime_t * node)
+{
+ vhost_user_vring_t * txvq = &vui->vrings[VHOST_NET_VRING_IDX_TX];
+ vhost_user_vring_t * rxvq = &vui->vrings[VHOST_NET_VRING_IDX_RX];
+ uword n_rx_packets = 0;
+ uword n_left;
+ u32 bi;
+ u32 n_left_to_next, * to_next;
+ u32 next_index = VHOST_USER_RX_NEXT_ETHERNET_INPUT;
+ uword n_rx_bytes = 0;
+ uword n_trace = vlib_get_trace_count (vm, node);
+ u16 qsz_mask;
+ f64 now = vlib_time_now (vm);
+ u32 cpu_index;
+
+ vec_reset_length (vui->d_trace_buffers);
+ u32 free_list_index = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX;
+
+ /* no descriptor ptr - bail out */
+ if (PREDICT_FALSE(!txvq->desc))
+ return 0;
+
+ /* do we have pending interrupts? */
+ if ((txvq->n_since_last_int) && (txvq->int_deadline < now))
+ vhost_user_send_call(vm, txvq);
+
+ if ((rxvq->n_since_last_int) && (rxvq->int_deadline < now))
+ vhost_user_send_call(vm, rxvq);
+
+ /* only bit 0 of avail.flags is used so we don't want to deal with this
+ interface if any other bit is set */
+ if (PREDICT_FALSE(txvq->avail->flags & 0xFFFE))
+ return 0;
+
+ /* nothing to do */
+ if (txvq->avail->idx == txvq->last_avail_idx)
+ return 0;
+
+ cpu_index = os_get_cpu_number();
+
+ if (PREDICT_TRUE(txvq->avail->idx > txvq->last_avail_idx))
+ n_left = txvq->avail->idx - txvq->last_avail_idx;
+ else /* wrapped */
+ n_left = (u16) -1 - txvq->last_avail_idx + txvq->avail->idx;
+
+ if (PREDICT_FALSE(!vui->admin_up)) {
+ /* if intf is admin down, just drop all packets waiting in the ring */
+ txvq->last_avail_idx = txvq->last_used_idx = txvq->avail->idx;
+ CLIB_MEMORY_BARRIER();
+ txvq->used->idx = txvq->last_used_idx;
+ vhost_user_send_call(vm, txvq);
+
+ return 0;
+ }
+
+ if (PREDICT_FALSE(n_left > txvq->qsz)) {
+ return 0;
+ }
+
+ if (PREDICT_FALSE(n_left > VLIB_FRAME_SIZE))
+ n_left = VLIB_FRAME_SIZE;
+
+ /* Make sure we have some RX buffers. */
+ {
+ uword l = vec_len (vum->rx_buffers[cpu_index]);
+ uword n_alloc;
+
+ if (l < n_left)
+ {
+ if (! vum->rx_buffers[cpu_index]) {
+ vec_alloc (vum->rx_buffers[cpu_index], 2 * VLIB_FRAME_SIZE );
+ }
+
+ n_alloc = vlib_buffer_alloc_from_free_list
+ (vm, vum->rx_buffers[cpu_index] + l, 2 * VLIB_FRAME_SIZE - l,
+ free_list_index);
+ if (n_alloc == 0)
+ return 0;
+ _vec_len (vum->rx_buffers[cpu_index]) = l + n_alloc;
+ }
+ }
+
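+ /* qsz is a power of 2, so ring indices can be wrapped with a simple mask */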
+ qsz_mask = txvq->qsz - 1;
+
+ while (n_left > 0) {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left > 0 && n_left_to_next > 0) {
+ vlib_buffer_t * b;
+ u16 desc_chain_head = txvq->avail->ring[txvq->last_avail_idx & qsz_mask];
+ u16 desc_current = desc_chain_head;
+ uword i_rx = vec_len (vum->rx_buffers[cpu_index]) - 1;
+
+ bi = vum->rx_buffers[cpu_index][i_rx];
+ b = vlib_get_buffer (vm, bi);
+
+ vlib_prefetch_buffer_with_index (vm, vum->rx_buffers[cpu_index][i_rx-1], STORE);
+
+ uword offset;
+ if (PREDICT_TRUE(vui->is_any_layout))
+ offset = vui->virtio_net_hdr_sz;
+ else if (!(txvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT))
+ /* WSA case, no ANYLAYOUT but single buffer */
+ offset = vui->virtio_net_hdr_sz;
+ else
+ /* CSR case without ANYLAYOUT, skip 1st buffer */
+ offset = txvq->desc[desc_current].len;
+
+ uword ptr=0;
+
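+ /* walk the guest's descriptor chain, copying each segment (minus the
+    virtio-net header) into the vlib buffer */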
+ while(1) {
+ void * buffer_addr = map_guest_mem(vui, txvq->desc[desc_current].addr);
+ CLIB_PREFETCH (&txvq->desc[txvq->desc[desc_current].next], sizeof (vring_desc_t), READ);
+
+#if VHOST_USER_COPY_TX_HDR == 1
+ if (PREDICT_TRUE(offset)) {
+ rte_memcpy(b->pre_data, buffer_addr, sizeof(virtio_net_hdr_t)); /* 12 byte hdr is not used on tx */
+ }
+#endif
+
+ if (txvq->desc[desc_current].len > offset) {
+ u16 len = txvq->desc[desc_current].len - offset;
+
+ if (PREDICT_FALSE(len > VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES))
+ len = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES;
+
+ rte_memcpy(vlib_buffer_get_current (b) + ptr,
+ buffer_addr + offset, len);
+ }
+ ptr += txvq->desc[desc_current].len - offset;
+ offset = 0;
+
+ /* if next flag is set, take next desc in the chain */
+ if (txvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT )
+ desc_current = txvq->desc[desc_current].next;
+ else
+ break;
+ }
+
+ txvq->last_avail_idx++;
+
+ /* returning buffer */
+ txvq->used->ring[txvq->last_used_idx & qsz_mask].id = desc_chain_head;
+ txvq->used->ring[txvq->last_used_idx & qsz_mask].len = ptr + vui->virtio_net_hdr_sz;
+
+ txvq->last_used_idx++;
+
+ b->current_length = ptr;
+
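+ /* drop anything shorter than an ethernet header (14 bytes) */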
+ if(PREDICT_FALSE(b->current_length < 14)) {
+ vlib_error_count(vm, vhost_user_input_node.index,
+ VHOST_USER_INPUT_FUNC_ERROR_UNDERSIZED_FRAME, 1);
+ goto skip_frame;
+ }
+
+ b->flags = 0;
+ b->current_data = 0;
+ b->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ n_rx_bytes += ptr;
+ _vec_len (vum->rx_buffers[cpu_index]) = i_rx;
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See .../vlib/vlib/buffer.h
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b);
+
+ vnet_buffer (b)->sw_if_index[VLIB_RX] = vui->sw_if_index;
+ vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32)~0;
+ b->error = node->errors[0];
+
+ to_next[0] = bi;
+ to_next++;
+ n_left_to_next--;
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi, next_index);
+
+ if (PREDICT_FALSE (n_trace > n_rx_packets))
+ vec_add1 (vui->d_trace_buffers, bi);
+
+ n_rx_packets++;
+skip_frame:
+ n_left--;
+ }
+
+ /* give buffers back to driver */
+ CLIB_MEMORY_BARRIER();
+ txvq->used->idx = txvq->last_used_idx;
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ if (PREDICT_FALSE (vec_len (vui->d_trace_buffers) > 0))
+ {
+ vhost_user_rx_trace (vm, node, vui, VHOST_NET_VRING_IDX_TX);
+ vlib_set_trace_count (vm, node, n_trace - vec_len (vui->d_trace_buffers));
+ }
+
+ /* if no packets received we're done */
+ if(!n_rx_packets)
+ return 0;
+
+ /* interrupt (call) handling */
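+ /* only signal the guest if it has not set VRING_AVAIL_F_NO_INTERRUPT
+    (bit 0 of avail->flags) and enough frames have accumulated; the
+    deadline check at the top of this function flushes any remainder */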
+ if((txvq->callfd > 0) && !(txvq->avail->flags & 1)) {
+ txvq->n_since_last_int += n_rx_packets;
+
+ if(txvq->n_since_last_int > vum->coalesce_frames)
+ vhost_user_send_call(vm, txvq);
+ }
+
+ /* increase rx counters */
+ vlib_increment_combined_counter
+ (vnet_main.interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ os_get_cpu_number(),
+ vui->sw_if_index,
+ n_rx_packets, n_rx_bytes);
+
+ return n_rx_packets;
+}
+
+static uword
+vhost_user_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ dpdk_main_t * dm = &dpdk_main;
+ vhost_user_intf_t * vui;
+ uword n_rx_packets = 0;
+ u32 cpu_index = os_get_cpu_number();
+ int i;
+
+ for(i = 0; i < vec_len(vum->vhost_user_interfaces); i++ )
+ {
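+ /* interfaces are assigned round-robin to dpdk input cpus (see
+    vhost_user_vui_register); each thread only polls its own share */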
+ vui = vec_elt_at_index(vum->vhost_user_interfaces, i);
+ if (!vui->is_up ||
+ (i % dm->input_cpu_count) != (cpu_index - dm->input_cpu_first_index))
+ continue;
+ n_rx_packets +=
+ vhost_user_if_input (vm, vum, vui, node);
+ }
+ return n_rx_packets;
+}
+
+VLIB_REGISTER_NODE (vhost_user_input_node) = {
+ .function = vhost_user_input,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .name = "vhost-user-input",
+
+ /* Will be enabled if/when hardware is detected. */
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ .format_buffer = format_ethernet_header_with_length,
+ .format_trace = format_vhost_user_input_trace,
+
+ .n_errors = VHOST_USER_INPUT_FUNC_N_ERROR,
+ .error_strings = vhost_user_input_func_error_strings,
+
+ .n_next_nodes = VHOST_USER_RX_N_NEXT,
+ .next_nodes = {
+ [VHOST_USER_RX_NEXT_DROP] = "error-drop",
+ [VHOST_USER_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ },
+};
+
+static uword
+vhost_user_intfc_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * buffers = vlib_frame_args (frame);
+ u32 n_left = 0;
+ u16 used_index;
+ vhost_user_main_t * vum = &vhost_user_main;
+ uword n_packets = 0;
+ uword n_avail_desc;
+ vnet_interface_output_runtime_t * rd = (void *) node->runtime_data;
+ vhost_user_intf_t * vui = vec_elt_at_index (vum->vhost_user_interfaces, rd->dev_instance);
+ vhost_user_vring_t * rxvq = &vui->vrings[VHOST_NET_VRING_IDX_RX];
+ u16 qsz_mask;
+
+ if (PREDICT_FALSE(!vui->is_up))
+ goto done2;
+
+ if (PREDICT_FALSE(!rxvq->desc))
+ goto done2;
+
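+ /* with multiple worker threads, serialize access to this interface's
+    vring with a simple spinlock */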
+ if (PREDICT_FALSE(vui->lockp != 0))
+ {
+ while (__sync_lock_test_and_set (vui->lockp, 1))
+ ;
+ }
+
+
+ /* only bit 0 of avail.flags is used so we don't want to deal with this
+ interface if any other bit is set */
+ if (PREDICT_FALSE(rxvq->avail->flags & 0xFFFE))
+ goto done2;
+
+ if (PREDICT_FALSE((rxvq->avail->idx == rxvq->last_avail_idx) ||
+ vui->sock_errno != 0)) {
+ vlib_simple_counter_main_t * cm;
+ vnet_main_t * vnm = vnet_get_main();
+
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_TX_ERROR);
+ vlib_increment_simple_counter (cm, os_get_cpu_number(),
+ 0, frame->n_vectors);
+
+ vlib_error_count (vm, node->node_index,
+ VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF,
+ frame->n_vectors);
+ goto done2;
+ }
+
+ if (PREDICT_TRUE(rxvq->avail->idx > rxvq->last_avail_idx))
+ n_avail_desc = rxvq->avail->idx - rxvq->last_avail_idx;
+ else /* wrapped */
+ n_avail_desc = (u16) -1 - rxvq->last_avail_idx + rxvq->avail->idx;
+
+ DBG_VQ("rxvq->avail->idx %d rxvq->last_avail_idx %d n_avail_desc %d",
+ rxvq->avail->idx, rxvq->last_avail_idx, n_avail_desc);
+
+ n_left = n_packets = frame->n_vectors;
+ if (PREDICT_FALSE(n_packets > n_avail_desc)) {
+ vlib_simple_counter_main_t * cm;
+ vnet_main_t * vnm = vnet_get_main();
+
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_TX_ERROR);
+ vlib_increment_simple_counter (cm, os_get_cpu_number(),
+ 0, frame->n_vectors);
+
+ vlib_error_count (vm, node->node_index,
+ VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF,
+ n_packets - n_avail_desc);
+ n_left = n_packets = n_avail_desc;
+ }
+
+ used_index = rxvq->used->idx;
+ qsz_mask = rxvq->qsz - 1; /* qsz is always power of 2 */
+
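+ /* dual-packet loop: copy two frames per iteration while prefetching the
+    next two vlib buffers; the single-packet loop below handles the rest */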
+ while (n_left >= 4)
+ {
+ vlib_buffer_t * b0, * b1;
+ u16 desc_chain_head0,desc_chain_head1;
+ u16 desc_current0,desc_current1;
+ uword offset0, offset1;
+ u16 bytes_left0, bytes_left1;
+ void *buffer_addr0, *buffer_addr1;
+
+ vlib_prefetch_buffer_with_index (vm, buffers[2], LOAD);
+ vlib_prefetch_buffer_with_index (vm, buffers[3], LOAD);
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ b1 = vlib_get_buffer (vm, buffers[1]);
+ buffers+=2;
+ n_left-=2;
+
+ desc_current0 = desc_chain_head0 = rxvq->avail->ring[rxvq->last_avail_idx & qsz_mask];
+ desc_current1 = desc_chain_head1 = rxvq->avail->ring[(rxvq->last_avail_idx+1) & qsz_mask];
+
+ offset0 = vui->virtio_net_hdr_sz;
+
+ offset1 = vui->virtio_net_hdr_sz;
+
+ bytes_left0 = b0->current_length;
+ bytes_left1 = b1->current_length;
+
+ buffer_addr0 = map_guest_mem(vui, rxvq->desc[desc_current0].addr);
+ buffer_addr1 = map_guest_mem(vui, rxvq->desc[desc_current1].addr);
+
+ if (PREDICT_FALSE(!buffer_addr0)) {
+ vlib_error_count (vm, node->node_index, VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
+ goto done;
+ }
+ if (PREDICT_FALSE(!buffer_addr1)) {
+ vlib_error_count (vm, node->node_index, VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
+ goto done;
+ }
+
+ virtio_net_hdr_mrg_rxbuf_t * hdr0 = (virtio_net_hdr_mrg_rxbuf_t *) buffer_addr0;
+ virtio_net_hdr_mrg_rxbuf_t * hdr1 = (virtio_net_hdr_mrg_rxbuf_t *) buffer_addr1;
+ hdr0->hdr.flags = 0;
+ hdr1->hdr.flags = 0;
+ hdr0->hdr.gso_type = 0;
+ hdr1->hdr.gso_type = 0;
+
+ if (vui->virtio_net_hdr_sz == 12) {
+ hdr0->num_buffers = 1;
+ hdr1->num_buffers = 1;
+ }
+
+ buffer_addr0 += offset0;
+ buffer_addr1 += offset1;
+
+ if (PREDICT_FALSE(!vui->is_any_layout && rxvq->desc[desc_current0].flags & VIRTQ_DESC_F_NEXT))
+ rxvq->desc[desc_current0].len = vui->virtio_net_hdr_sz;
+
+ if (PREDICT_FALSE(!vui->is_any_layout && rxvq->desc[desc_current1].flags & VIRTQ_DESC_F_NEXT))
+ rxvq->desc[desc_current1].len = vui->virtio_net_hdr_sz;
+
+ while(1) {
+ if (rxvq->desc[desc_current0].len - offset0 > 0 ) {
+ u16 bytes_to_copy = bytes_left0 > (rxvq->desc[desc_current0].len - offset0) ? (rxvq->desc[desc_current0].len - offset0) : bytes_left0;
+ rte_memcpy(buffer_addr0, vlib_buffer_get_current (b0) + b0->current_length - bytes_left0, bytes_to_copy);
+ bytes_left0 -= bytes_to_copy;
+ }
+
+ if (rxvq->desc[desc_current0].flags & VIRTQ_DESC_F_NEXT ) {
+ offset0 = 0;
+ desc_current0 = rxvq->desc[desc_current0].next;
+ buffer_addr0 = map_guest_mem(vui, rxvq->desc[desc_current0].addr);
+ if (PREDICT_FALSE(!buffer_addr0)) {
+ vlib_error_count (vm, node->node_index, VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
+ goto done;
+ }
+ }
+ else
+ break;
+ }
+
+ while(1) {
+ if (rxvq->desc[desc_current1].len - offset1 > 0 ) {
+ u16 bytes_to_copy = bytes_left1 > (rxvq->desc[desc_current1].len - offset1) ? (rxvq->desc[desc_current1].len - offset1) : bytes_left1;
+ rte_memcpy(buffer_addr1, vlib_buffer_get_current (b1) + b1->current_length - bytes_left1, bytes_to_copy);
+ bytes_left1 -= bytes_to_copy;
+ }
+
+ if (rxvq->desc[desc_current1].flags & VIRTQ_DESC_F_NEXT ) {
+ offset1 = 0;
+ desc_current1 = rxvq->desc[desc_current1].next;
+ buffer_addr1 = map_guest_mem(vui, rxvq->desc[desc_current1].addr);
+ if (PREDICT_FALSE(!buffer_addr1)) {
+ vlib_error_count (vm, node->node_index, VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
+ goto done;
+ }
+ }
+ else
+ break;
+ }
+
+ rxvq->used->ring[used_index & qsz_mask].id = desc_chain_head0;
+ rxvq->used->ring[used_index & qsz_mask].len = b0->current_length + vui->virtio_net_hdr_sz;
+ used_index+=1;
+ rxvq->used->ring[used_index & qsz_mask].id = desc_chain_head1;
+ rxvq->used->ring[used_index & qsz_mask].len = b1->current_length + vui->virtio_net_hdr_sz;
+ used_index+=1;
+ rxvq->last_avail_idx+=2;
+ }
+
+ while (n_left > 0)
+ {
+ vlib_buffer_t * b0;
+ u16 desc_chain_head;
+ u16 desc_current;
+ void *buffer_addr;
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ buffers++;
+ n_left--;
+
+ desc_chain_head = rxvq->avail->ring[rxvq->last_avail_idx & qsz_mask];
+ desc_current = desc_chain_head;
+
+ uword offset = vui->virtio_net_hdr_sz;
+
+ u16 bytes_left = b0->current_length;
+ buffer_addr = map_guest_mem(vui, rxvq->desc[desc_current].addr);
+ if (PREDICT_FALSE(!buffer_addr)) {
+ vlib_error_count (vm, node->node_index, VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
+ goto done;
+ }
+
+ virtio_net_hdr_mrg_rxbuf_t * hdr = (virtio_net_hdr_mrg_rxbuf_t *) buffer_addr;
+ hdr->hdr.flags = 0;
+ hdr->hdr.gso_type = 0;
+
+ if (vui->virtio_net_hdr_sz == 12) {
+ hdr->num_buffers = 1;
+ }
+
+ buffer_addr += offset;
+
+ if (PREDICT_FALSE(!vui->is_any_layout && rxvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT))
+ rxvq->desc[desc_current].len = vui->virtio_net_hdr_sz;
+
+ while(1) {
+ if (rxvq->desc[desc_current].len - offset > 0 ) {
+ u16 bytes_to_copy = bytes_left > (rxvq->desc[desc_current].len - offset) ? (rxvq->desc[desc_current].len - offset) : bytes_left;
+ rte_memcpy(buffer_addr, vlib_buffer_get_current (b0) + b0->current_length - bytes_left, bytes_to_copy);
+ bytes_left -= bytes_to_copy;
+ }
+
+ if (rxvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT ) {
+ offset = 0;
+ desc_current = rxvq->desc[desc_current].next;
+ buffer_addr = map_guest_mem(vui, rxvq->desc[desc_current].addr);
+ if (PREDICT_FALSE(!buffer_addr)) {
+ vlib_error_count (vm, node->node_index, VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
+ goto done;
+ }
+ }
+ else
+ break;
+ }
+
+ rxvq->used->ring[used_index & qsz_mask].id = desc_chain_head;
+ rxvq->used->ring[used_index & qsz_mask].len = b0->current_length + vui->virtio_net_hdr_sz;
+
+ used_index++;
+ rxvq->last_avail_idx++;
+ }
+
+done:
+ CLIB_MEMORY_BARRIER();
+ rxvq->used->idx = used_index;
+
+ /* interrupt (call) handling */
+ if((rxvq->callfd > 0) && !(rxvq->avail->flags & 1)) {
+ rxvq->n_since_last_int += n_packets - n_left;
+
+ if(rxvq->n_since_last_int > vum->coalesce_frames)
+ vhost_user_send_call(vm, rxvq);
+ }
+
+done2:
+
+ if (PREDICT_FALSE(vui->lockp != 0))
+ *vui->lockp = 0;
+
+ vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
+ return frame->n_vectors;
+}
+
+static clib_error_t *
+vhost_user_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ vnet_hw_interface_t * hif = vnet_get_hw_interface (vnm, hw_if_index);
+ uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui = vec_elt_at_index (vum->vhost_user_interfaces, hif->dev_instance);
+
+ vui->admin_up = is_up;
+
+ if (is_up)
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP);
+
+ return /* no error */ 0;
+}
+
+VNET_DEVICE_CLASS (vhost_user_dev_class,static) = {
+ .name = "vhost-user",
+ .tx_function = vhost_user_intfc_tx,
+ .tx_function_n_errors = VHOST_USER_TX_FUNC_N_ERROR,
+ .tx_function_error_strings = vhost_user_tx_func_error_strings,
+ .format_device_name = format_vhost_user_interface_name,
+ .name_renumber = vhost_user_name_renumber,
+ .admin_up_down_function = vhost_user_interface_admin_up_down,
+};
+
+static uword
+vhost_user_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt,
+ vlib_frame_t * f)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ struct sockaddr_un sun;
+ int sockfd;
+ unix_file_t template = {0};
+ f64 timeout = 3153600000.0 /* 100 years */;
+ uword *event_data = 0;
+
+ sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+ sun.sun_family = AF_UNIX;
+ template.read_function = vhost_user_socket_read;
+ template.error_function = vhost_user_socket_error;
+
+
+ if (sockfd < 0)
+ return 0;
+
+ while (1) {
+ vlib_process_wait_for_event_or_clock (vm, timeout);
+ vlib_process_get_events (vm, &event_data);
+ vec_reset_length (event_data);
+
+ timeout = 3.0;
+
+ vec_foreach (vui, vum->vhost_user_interfaces) {
+
+ if (vui->sock_is_server || !vui->active)
+ continue;
+
+ if (vui->unix_fd == -1) {
+ /* try to connect */
+
+ strncpy(sun.sun_path, (char *) vui->sock_filename, sizeof(sun.sun_path) - 1);
+
+ if (connect(sockfd, (struct sockaddr *) &sun, sizeof(struct sockaddr_un)) == 0) {
+ vui->sock_errno = 0;
+ vui->unix_fd = sockfd;
+ template.file_descriptor = sockfd;
+ vui->unix_file_index = unix_file_add (&unix_main, &template);
+ hash_set (vum->vhost_user_interface_index_by_sock_fd, sockfd, vui - vum->vhost_user_interfaces);
+
+ sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (sockfd < 0)
+ return 0;
+ }
+ else {
+ vui->sock_errno = errno;
+ }
+ } else {
+ /* check if socket is alive */
+ int error = 0;
+ socklen_t len = sizeof (error);
+ int retval = getsockopt(vui->unix_fd, SOL_SOCKET, SO_ERROR, &error, &len);
+
+ if (retval)
+ vhost_user_if_disconnect(vui);
+ }
+ }
+ }
+ return 0;
+}
+
+VLIB_REGISTER_NODE (vhost_user_process_node,static) = {
+ .function = vhost_user_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "vhost-user-process",
+};
+
+int vhost_user_delete_if(vnet_main_t * vnm, vlib_main_t * vm,
+ u32 sw_if_index)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ uword *p = NULL;
+ int rv = 0;
+
+ p = hash_get (vum->vhost_user_interface_index_by_sw_if_index,
+ sw_if_index);
+ if (p == 0) {
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+ } else {
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, p[0]);
+ }
+
+ // mark the interface inactive
+ vui->active = 0;
+ // disconnect interface sockets
+ vhost_user_if_disconnect(vui);
+ // add to inactive interface list
+ vec_add1 (vum->vhost_user_inactive_interfaces_index, p[0]);
+
+ // reset renumbered iface
+ if (p[0] < vec_len (vum->show_dev_instance_by_real_dev_instance))
+ vum->show_dev_instance_by_real_dev_instance[p[0]] = ~0;
+
+ ethernet_delete_interface (vnm, vui->hw_if_index);
+ DBG_SOCK ("deleted (deactivated) vhost-user interface instance %d", p[0]);
+
+ return rv;
+}
+
+// init server socket on specified sock_filename
+static int vhost_user_init_server_sock(const char * sock_filename, int *sockfd)
+{
+ int rv = 0, len;
+ struct sockaddr_un un;
+ int fd;
+ /* create listening socket */
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+
+ if (fd < 0) {
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+ }
+
+ un.sun_family = AF_UNIX;
+ strncpy((char *) un.sun_path, (char *) sock_filename, sizeof(un.sun_path) - 1);
+ un.sun_path[sizeof(un.sun_path) - 1] = 0;
+
+ /* remove if exists */
+ unlink( (char *) sock_filename);
+
+ len = sizeof(un.sun_family) + strlen((char *) un.sun_path);
+
+ if (bind(fd, (struct sockaddr *) &un, len) == -1) {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_2;
+ goto error;
+ }
+
+ if (listen(fd, 1) == -1) {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_3;
+ goto error;
+ }
+
+ unix_file_t template = {0};
+ template.read_function = vhost_user_socksvr_accept_ready;
+ template.file_descriptor = fd;
+ unix_file_add (&unix_main, &template);
+ *sockfd = fd;
+ return rv;
+
+error:
+ close(fd);
+ return rv;
+}
+
+// get new vhost_user_intf_t from inactive interfaces or create new one
+static vhost_user_intf_t *vhost_user_vui_new()
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui = NULL;
+ int inactive_cnt = vec_len(vum->vhost_user_inactive_interfaces_index);
+ // if there are any inactive ifaces
+ if (inactive_cnt > 0) {
+ // take last
+ u32 vui_idx = vum->vhost_user_inactive_interfaces_index[inactive_cnt - 1];
+ if (vec_len(vum->vhost_user_interfaces) > vui_idx) {
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, vui_idx);
+ DBG_SOCK("reusing inactive vhost-user interface index %d", vui_idx);
+ }
+ // "remove" from inactive list
+ _vec_len(vum->vhost_user_inactive_interfaces_index) -= 1;
+ }
+
+ // vui was not retrieved from inactive ifaces - create new
+ if (!vui)
+ vec_add2 (vum->vhost_user_interfaces, vui, 1);
+ return vui;
+}
+
+// create ethernet interface for vhost user intf
+static void vhost_user_create_ethernet(vnet_main_t * vnm, vlib_main_t * vm,
+ vhost_user_intf_t *vui)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ u8 hwaddr[6];
+ clib_error_t * error;
+
+ /* create hw and sw interface */
+ {
+ f64 now = vlib_time_now(vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+
+ memcpy (hwaddr+2, &rnd, sizeof(rnd));
+ hwaddr[0] = 2;
+ hwaddr[1] = 0xfe;
+ }
+
+ error = ethernet_register_interface
+ (vnm,
+ vhost_user_dev_class.index,
+ vui - vum->vhost_user_interfaces /* device instance */,
+ hwaddr /* ethernet address */,
+ &vui->hw_if_index,
+ 0 /* flag change */);
+ if (error)
+ clib_error_report (error);
+}
+
+// initialize vui with specified attributes
+static void vhost_user_vui_init(vnet_main_t * vnm,
+ vhost_user_intf_t *vui, int sockfd,
+ const char * sock_filename,
+ u8 is_server, u64 feature_mask,
+ u32 * sw_if_index)
+{
+ vnet_sw_interface_t * sw;
+ sw = vnet_get_hw_sw_interface (vnm, vui->hw_if_index);
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+
+ vui->unix_fd = sockfd;
+ vui->sw_if_index = sw->sw_if_index;
+ vui->num_vrings = 2;
+ vui->sock_is_server = is_server;
+ strncpy(vui->sock_filename, sock_filename, ARRAY_LEN(vui->sock_filename)-1);
+ vui->sock_errno = 0;
+ vui->is_up = 0;
+ vui->feature_mask = feature_mask;
+ vui->active = 1;
+ vui->unix_file_index = ~0;
+
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0);
+
+ if (sw_if_index)
+ *sw_if_index = vui->sw_if_index;
+
+ if (tm->n_vlib_mains > 1)
+ {
+ vui->lockp = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
+ CLIB_CACHE_LINE_BYTES);
+ memset ((void *) vui->lockp, 0, CLIB_CACHE_LINE_BYTES);
+ }
+}
+
+// register vui and start polling on it
+static void vhost_user_vui_register(vlib_main_t * vm, vhost_user_intf_t *vui)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ dpdk_main_t * dm = &dpdk_main;
+ int cpu_index;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+
+ hash_set (vum->vhost_user_interface_index_by_listener_fd, vui->unix_fd,
+ vui - vum->vhost_user_interfaces);
+ hash_set (vum->vhost_user_interface_index_by_sw_if_index, vui->sw_if_index,
+ vui - vum->vhost_user_interfaces);
+
+ /* start polling */
+ cpu_index = dm->input_cpu_first_index +
+ (vui - vum->vhost_user_interfaces) % dm->input_cpu_count;
+
+ if (tm->n_vlib_mains == 1)
+ vlib_node_set_state (vm, vhost_user_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ else if (!dm->have_io_threads)
+ vlib_node_set_state (vlib_mains[cpu_index], vhost_user_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+
+ /* tell process to start polling for sockets */
+ vlib_process_signal_event(vm, vhost_user_process_node.index, 0, 0);
+}
+
+int vhost_user_create_if(vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename,
+ u8 is_server,
+ u32 * sw_if_index,
+ u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance)
+{
+ vhost_user_intf_t * vui = NULL;
+ dpdk_main_t * dm = &dpdk_main;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ u32 sw_if_idx = ~0;
+ int sockfd = -1;
+ int rv = 0;
+
+ if (tm->n_vlib_mains > 1 && dm->have_io_threads)
+ {
+ clib_warning("vhost-user interfaces are not supported with multiple io threads");
+ return -1;
+ }
+
+ if (is_server) {
+ if ((rv = vhost_user_init_server_sock (sock_filename, &sockfd)) != 0) {
+ return rv;
+ }
+ }
+
+ vui = vhost_user_vui_new ();
+ ASSERT(vui != NULL);
+
+ vhost_user_create_ethernet (vnm, vm, vui);
+ vhost_user_vui_init (vnm, vui, sockfd, sock_filename, is_server,
+ feature_mask, &sw_if_idx);
+
+ if (renumber) {
+ vnet_interface_name_renumber (sw_if_idx, custom_dev_instance);
+ }
+
+ vhost_user_vui_register (vm, vui);
+
+ if (sw_if_index)
+ *sw_if_index = sw_if_idx;
+
+ return rv;
+}
+
+int vhost_user_modify_if(vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename,
+ u8 is_server,
+ u32 sw_if_index,
+ u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui = NULL;
+ u32 sw_if_idx = ~0;
+ int sockfd = -1;
+ int rv = 0;
+ uword *p = NULL;
+
+ p = hash_get (vum->vhost_user_interface_index_by_sw_if_index,
+ sw_if_index);
+ if (p == 0) {
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+ } else {
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, p[0]);
+ }
+
+ // mark the interface inactive
+ vui->active = 0;
+ // disconnect interface sockets
+ vhost_user_if_disconnect(vui);
+
+ if (is_server) {
+ if ((rv = vhost_user_init_server_sock (sock_filename, &sockfd)) != 0) {
+ return rv;
+ }
+ }
+
+ vhost_user_vui_init (vnm, vui, sockfd, sock_filename, is_server,
+ feature_mask, &sw_if_idx);
+
+ if (renumber) {
+ vnet_interface_name_renumber (sw_if_idx, custom_dev_instance);
+ }
+
+ vhost_user_vui_register (vm, vui);
+
+ return rv;
+}
+
+clib_error_t *
+vhost_user_connect_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u8 * sock_filename = NULL;
+ u32 sw_if_index;
+ u8 is_server = 0;
+ u64 feature_mask = (u64)~0;
+ u8 renumber = 0;
+ u32 custom_dev_instance = ~0;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "socket %s", &sock_filename))
+ ;
+ else if (unformat (line_input, "server"))
+ is_server = 1;
+ else if (unformat (line_input, "feature-mask 0x%llx", &feature_mask))
+ ;
+ else if (unformat (line_input, "renumber %d", &custom_dev_instance)) {
+ renumber = 1;
+ }
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ vnet_main_t *vnm = vnet_get_main();
+
+ vhost_user_create_if(vnm, vm, (char *)sock_filename,
+ is_server, &sw_if_index, feature_mask,
+ renumber, custom_dev_instance);
+
+ vec_free(sock_filename);
+
+ return 0;
+}
+
+clib_error_t *
+vhost_user_delete_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u32 sw_if_index = ~0;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "sw_if_index %d", &sw_if_index))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ vnet_main_t *vnm = vnet_get_main();
+
+ vhost_user_delete_if(vnm, vm, sw_if_index);
+
+ return 0;
+}
+
+int vhost_user_dump_ifs(vnet_main_t * vnm, vlib_main_t * vm, vhost_user_intf_details_t **out_vuids)
+{
+ int rv = 0;
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ vhost_user_intf_details_t * r_vuids = NULL;
+ vhost_user_intf_details_t * vuid = NULL;
+ u32 * hw_if_indices = 0;
+ vnet_hw_interface_t * hi;
+ u8 *s = NULL;
+ int i;
+
+ if (!out_vuids)
+ return -1;
+
+ vec_foreach (vui, vum->vhost_user_interfaces) {
+ if (vui->active)
+ vec_add1(hw_if_indices, vui->hw_if_index);
+ }
+
+ for (i = 0; i < vec_len (hw_if_indices); i++) {
+ hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, hi->dev_instance);
+
+ vec_add2(r_vuids, vuid, 1);
+ vuid->sw_if_index = vui->sw_if_index;
+ vuid->virtio_net_hdr_sz = vui->virtio_net_hdr_sz;
+ vuid->features = vui->features;
+ vuid->is_server = vui->sock_is_server;
+ vuid->num_regions = vui->nregions;
+ vuid->sock_errno = vui->sock_errno;
+ strncpy((char *)vuid->sock_filename, (char *)vui->sock_filename,
+ ARRAY_LEN(vuid->sock_filename)-1);
+
+ s = format (s, "%v%c", hi->name, 0);
+
+ strncpy((char *)vuid->if_name, (char *)s,
+ ARRAY_LEN(vuid->if_name)-1);
+ _vec_len(s) = 0;
+ }
+
+ vec_free (s);
+ vec_free (hw_if_indices);
+
+ *out_vuids = r_vuids;
+
+ return rv;
+}
+
+clib_error_t *
+show_vhost_user_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ clib_error_t * error = 0;
+ vnet_main_t * vnm = vnet_get_main();
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ u32 hw_if_index, * hw_if_indices = 0;
+ vnet_hw_interface_t * hi;
+ int i, j, q;
+ int show_descr = 0;
+ struct feat_struct { u8 bit; char *str;};
+ struct feat_struct *feat_entry;
+
+ static struct feat_struct feat_array[] = {
+#define _(s,b) { .str = #s, .bit = b, },
+ foreach_virtio_net_feature
+#undef _
+ { .str = NULL }
+ };
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index)) {
+ vec_add1 (hw_if_indices, hw_if_index);
+ vlib_cli_output(vm, "add %d", hw_if_index);
+ }
+ else if (unformat (input, "descriptors") || unformat (input, "desc") )
+ show_descr = 1;
+ else {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ }
+ if (vec_len (hw_if_indices) == 0) {
+ vec_foreach (vui, vum->vhost_user_interfaces) {
+ if (vui->active)
+ vec_add1(hw_if_indices, vui->hw_if_index);
+ }
+ }
+ vlib_cli_output (vm, "Virtio vhost-user interfaces");
+ vlib_cli_output (vm, "Global:\n coalesce frames %d time %e\n\n",
+ vum->coalesce_frames, vum->coalesce_time);
+
+ for (i = 0; i < vec_len (hw_if_indices); i++) {
+ hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, hi->dev_instance);
+ vlib_cli_output (vm, "Interface: %s (ifindex %d)",
+ hi->name, hw_if_indices[i]);
+
+ vlib_cli_output (vm, "virtio_net_hdr_sz %d\n features (0x%llx): \n",
+ vui->virtio_net_hdr_sz, vui->features);
+
+ feat_entry = (struct feat_struct *) &feat_array;
+ while(feat_entry->str) {
+ if (vui->features & (1 << feat_entry->bit))
+ vlib_cli_output (vm, " %s (%d)", feat_entry->str, feat_entry->bit);
+ feat_entry++;
+ }
+
+ vlib_cli_output (vm, "\n");
+
+
+ vlib_cli_output (vm, " socket filename %s type %s errno \"%s\"\n\n",
+ vui->sock_filename, vui->sock_is_server ? "server" : "client",
+ strerror(vui->sock_errno));
+
+ vlib_cli_output (vm, " Memory regions (total %d)\n", vui->nregions);
+
+ if (vui->nregions){
+ vlib_cli_output(vm, " region fd guest_phys_addr memory_size userspace_addr mmap_offset mmap_addr\n");
+ vlib_cli_output(vm, " ====== ===== ================== ================== ================== ================== ==================\n");
+ }
+ for (j = 0; j < vui->nregions; j++) {
+ vlib_cli_output(vm, " %d %-5d 0x%016lx 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", j,
+ vui->region_mmap_fd[j],
+ vui->regions[j].guest_phys_addr,
+ vui->regions[j].memory_size,
+ vui->regions[j].userspace_addr,
+ vui->regions[j].mmap_offset,
+ (u64) vui->region_mmap_addr[j]);
+ }
+ for (q = 0; q < vui->num_vrings; q++) {
+ vlib_cli_output(vm, "\n Virtqueue %d\n", q);
+
+ vlib_cli_output(vm, " qsz %d last_avail_idx %d last_used_idx %d\n",
+ vui->vrings[q].qsz,
+ vui->vrings[q].last_avail_idx,
+ vui->vrings[q].last_used_idx);
+
+ if (vui->vrings[q].avail && vui->vrings[q].used)
+ vlib_cli_output(vm, " avail.flags %x avail.idx %d used.flags %x used.idx %d\n",
+ vui->vrings[q].avail->flags,
+ vui->vrings[q].avail->idx,
+ vui->vrings[q].used->flags,
+ vui->vrings[q].used->idx);
+
+ vlib_cli_output(vm, " kickfd %d callfd %d errfd %d\n",
+ vui->vrings[q].kickfd,
+ vui->vrings[q].callfd,
+ vui->vrings[q].errfd);
+
+ if (show_descr) {
+ vlib_cli_output(vm, "\n descriptor table:\n");
+ vlib_cli_output(vm, " id addr len flags next user_addr\n");
+ vlib_cli_output(vm, " ===== ================== ===== ====== ===== ==================\n");
+ for(j = 0; j < vui->vrings[q].qsz; j++) {
+ vlib_cli_output(vm, " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n",
+ j,
+ vui->vrings[q].desc[j].addr,
+ vui->vrings[q].desc[j].len,
+ vui->vrings[q].desc[j].flags,
+ vui->vrings[q].desc[j].next,
+ (u64) map_guest_mem(vui, vui->vrings[q].desc[j].addr));}
+ }
+ }
+ vlib_cli_output (vm, "\n");
+ }
+done:
+ vec_free (hw_if_indices);
+ return error;
+}
+
+static clib_error_t *
+vhost_user_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "coalesce-frames %d", &vum->coalesce_frames))
+ ;
+ else if (unformat (input, "coalesce-time %f", &vum->coalesce_time))
+ ;
+ else if (unformat (input, "dont-dump-memory"))
+ vum->dont_dump_vhost_user_memory = 1;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ return 0;
+}
+
+/* vhost-user { ... } configuration. */
+VLIB_CONFIG_FUNCTION (vhost_user_config, "vhost-user");
+
+void
+vhost_user_unmap_all (void)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+
+ if (vum->dont_dump_vhost_user_memory)
+ {
+ vec_foreach (vui, vum->vhost_user_interfaces)
+ {
+ unmap_all_mem_regions(vui);
+ }
+ }
+}
diff --git a/vnet/vnet/devices/virtio/vhost-user.h b/vnet/vnet/devices/virtio/vhost-user.h
new file mode 100644
index 00000000000..3b57bcbfc16
--- /dev/null
+++ b/vnet/vnet/devices/virtio/vhost-user.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __VIRTIO_VHOST_USER_H__
+#define __VIRTIO_VHOST_USER_H__
+/* vhost-user data structures */
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+#define VHOST_USER_MSG_HDR_SZ 12
+#define VHOST_VRING_MAX_SIZE 32768
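+/* vring indices are named from the guest's point of view: index 0 (RX) is
+   the ring the guest receives on (vpp transmits into it), index 1 (TX) is
+   the ring the guest transmits on (vpp receives from it) */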
+#define VHOST_NET_VRING_IDX_RX 0
+#define VHOST_NET_VRING_IDX_TX 1
+#define VHOST_NET_VRING_NUM 2
+
+#define VIRTQ_DESC_F_NEXT 1
+
+#define foreach_virtio_net_feature \
+ _ (VIRTIO_NET_F_MRG_RXBUF, 15) \
+ _ (VIRTIO_F_ANY_LAYOUT, 27)
+
+typedef enum {
+#define _(f,n) FEAT_##f = (n),
+ foreach_virtio_net_feature
+#undef _
+} virtio_net_feature_t;
+
+int vhost_user_create_if(vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename, u8 is_server,
+ u32 * sw_if_index, u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance);
+int vhost_user_modify_if(vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename, u8 is_server,
+ u32 sw_if_index, u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance);
+int vhost_user_delete_if(vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index);
+
+typedef struct vhost_user_memory_region {
+ u64 guest_phys_addr;
+ u64 memory_size;
+ u64 userspace_addr;
+ u64 mmap_offset;
+} vhost_user_memory_region_t;
+
+typedef struct vhost_user_memory {
+ u32 nregions;
+ u32 padding;
+ vhost_user_memory_region_t regions[VHOST_MEMORY_MAX_NREGIONS];
+} vhost_user_memory_t;
+
+typedef struct vhost_vring_state {
+ unsigned int index, num;
+} vhost_vring_state_t;
+
+typedef struct vhost_vring_addr {
+ unsigned int index, flags;
+ u64 desc_user_addr, used_user_addr, avail_user_addr, log_guest_addr;
+} vhost_vring_addr_t;
+
+typedef enum vhost_user_req {
+ VHOST_USER_NONE = 0,
+ VHOST_USER_GET_FEATURES = 1,
+ VHOST_USER_SET_FEATURES = 2,
+ VHOST_USER_SET_OWNER = 3,
+ VHOST_USER_RESET_OWNER = 4,
+ VHOST_USER_SET_MEM_TABLE = 5,
+ VHOST_USER_SET_LOG_BASE = 6,
+ VHOST_USER_SET_LOG_FD = 7,
+ VHOST_USER_SET_VRING_NUM = 8,
+ VHOST_USER_SET_VRING_ADDR = 9,
+ VHOST_USER_SET_VRING_BASE = 10,
+ VHOST_USER_GET_VRING_BASE = 11,
+ VHOST_USER_SET_VRING_KICK = 12,
+ VHOST_USER_SET_VRING_CALL = 13,
+ VHOST_USER_SET_VRING_ERR = 14,
+ VHOST_USER_MAX
+} vhost_user_req_t;
+
+// vring_desc I/O buffer descriptor
+typedef struct {
+ uint64_t addr; // packet data buffer address
+ uint32_t len; // packet data buffer size
+ uint16_t flags; // (see below)
+ uint16_t next; // optional index next descriptor in chain
+} __attribute ((packed)) vring_desc_t;
+
+typedef struct {
+ uint16_t flags;
+ uint16_t idx;
+ uint16_t ring[VHOST_VRING_MAX_SIZE];
+} __attribute ((packed)) vring_avail_t;
+
+typedef struct {
+ uint16_t flags;
+ uint16_t idx;
+ struct /* vring_used_elem */ {
+ uint32_t id;
+ uint32_t len;
+ } ring[VHOST_VRING_MAX_SIZE];
+} __attribute ((packed)) vring_used_t;
+
+typedef struct {
+ u8 flags;
+ u8 gso_type;
+ u16 hdr_len;
+ u16 gso_size;
+ u16 csum_start;
+ u16 csum_offset;
+} __attribute ((packed)) virtio_net_hdr_t;
+
+typedef struct {
+ virtio_net_hdr_t hdr;
+ u16 num_buffers;
+} __attribute ((packed)) virtio_net_hdr_mrg_rxbuf_t;
+
+typedef struct vhost_user_msg {
+ vhost_user_req_t request;
+ u32 flags;
+ u32 size;
+ union {
+ u64 u64;
+ vhost_vring_state_t state;
+ vhost_vring_addr_t addr;
+ vhost_user_memory_t memory;
+ };
+} __attribute ((packed)) vhost_user_msg_t;
+
+typedef struct {
+ u32 qsz;
+ u16 last_avail_idx;
+ u16 last_used_idx;
+ vring_desc_t *desc;
+ vring_avail_t *avail;
+ vring_used_t *used;
+ int callfd;
+ int kickfd;
+ int errfd;
+ u32 callfd_idx;
+ u32 n_since_last_int;
+ f64 int_deadline;
+} vhost_user_vring_t;
+
+typedef struct {
+ CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
+ volatile u32 * lockp;
+ u32 is_up;
+ u32 admin_up;
+ u32 unix_fd;
+ u32 unix_file_index;
+ u32 client_fd;
+ char sock_filename[256];
+ int sock_errno;
+ u8 sock_is_server;
+ u32 hw_if_index, sw_if_index;
+ u8 active;
+
+ u32 nregions;
+ u64 features;
+ u64 feature_mask;
+ u32 num_vrings;
+ vhost_user_memory_region_t regions[VHOST_MEMORY_MAX_NREGIONS];
+ void * region_mmap_addr[VHOST_MEMORY_MAX_NREGIONS];
+ u32 region_mmap_fd[VHOST_MEMORY_MAX_NREGIONS];
+ vhost_user_vring_t vrings[2];
+ int virtio_net_hdr_sz;
+ int is_any_layout;
+ u32 * d_trace_buffers;
+} vhost_user_intf_t;
+
+typedef struct {
+ u32 ** rx_buffers;
+ u32 mtu_bytes;
+ vhost_user_intf_t * vhost_user_interfaces;
+ u32 * vhost_user_inactive_interfaces_index;
+ uword * vhost_user_interface_index_by_listener_fd;
+ uword * vhost_user_interface_index_by_sock_fd;
+ uword * vhost_user_interface_index_by_sw_if_index;
+ u32 * show_dev_instance_by_real_dev_instance;
+ u32 coalesce_frames;
+ f64 coalesce_time;
+ int dont_dump_vhost_user_memory;
+} vhost_user_main_t;
+
+typedef struct {
+ u8 if_name[64];
+ u32 sw_if_index;
+ u32 virtio_net_hdr_sz;
+ u64 features;
+ u8 is_server;
+ u8 sock_filename[256];
+ u32 num_regions;
+ int sock_errno;
+} vhost_user_intf_details_t;
+
+int vhost_user_dump_ifs(vnet_main_t * vnm, vlib_main_t * vm,
+ vhost_user_intf_details_t **out_vuids);
+
+// CLI commands to be used from dpdk
+clib_error_t *
+vhost_user_connect_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd);
+clib_error_t *
+vhost_user_delete_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd);
+clib_error_t *
+show_vhost_user_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd);
+
+#endif