summaryrefslogtreecommitdiffstats
path: root/vnet
diff options
context:
space:
mode:
authorDamjan Marion <damarion@cisco.com>2016-04-20 05:04:20 +0200
committerDave Barach <openvpp@barachs.net>2016-04-25 14:24:03 +0000
commit108c7313854953ee3b66069b902f9fabb097ed25 (patch)
tree6588cffc536b8a6a0e3c33dd0325502d0db785ed /vnet
parent68adab9267e6194e96d3ca4c325a9f8dae2e3120 (diff)
Add native NETMAP driver
This is first drop of native NETMAP driver. It is mainly tested with NETMAP pipes but also support for native interfaces should work. New CLI: create netmap [<intf name>|valeXXX:YYY] [hw-addr <mac>] [pipe] [master|slave] Following example creates NETMAP pipe where VPP acts as master: create netmap name vale00:vpp1 pipe master then NETMAP pkt-gen tool can be used to send traffic: pkt-gen -i vale00:vpp1}0 -f tx Change-Id: Ie0ddaa5facc75285b78467420e8a9f9c8dfc39e5 Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'vnet')
-rw-r--r--vnet/Makefile.am14
-rw-r--r--vnet/vnet/devices/dpdk/threads.c1
-rw-r--r--vnet/vnet/devices/netmap/cli.c121
-rw-r--r--vnet/vnet/devices/netmap/device.c221
-rw-r--r--vnet/vnet/devices/netmap/net_netmap.h650
-rw-r--r--vnet/vnet/devices/netmap/netmap.c240
-rw-r--r--vnet/vnet/devices/netmap/netmap.h131
-rw-r--r--vnet/vnet/devices/netmap/node.c284
-rw-r--r--vnet/vnet/vcgn/cnat_cli_handler.c4
-rw-r--r--vnet/vnet/vnet.h5
10 files changed, 1666 insertions, 5 deletions
diff --git a/vnet/Makefile.am b/vnet/Makefile.am
index f7c1f30b520..5fdffcaa13d 100644
--- a/vnet/Makefile.am
+++ b/vnet/Makefile.am
@@ -684,6 +684,20 @@ nobase_include_HEADERS += \
vnet/devices/af_packet/af_packet.h
########################################
+# NETMAP interface
+########################################
+
+libvnet_la_SOURCES += \
+ vnet/devices/netmap/netmap.c \
+ vnet/devices/netmap/device.c \
+ vnet/devices/netmap/node.c \
+ vnet/devices/netmap/cli.c
+
+nobase_include_HEADERS += \
+ vnet/devices/netmap/netmap.h
+
+
+########################################
# Unix kernel related
########################################
diff --git a/vnet/vnet/devices/dpdk/threads.c b/vnet/vnet/devices/dpdk/threads.c
index b1f13ef3791..eeb440e2851 100644
--- a/vnet/vnet/devices/dpdk/threads.c
+++ b/vnet/vnet/devices/dpdk/threads.c
@@ -20,6 +20,7 @@
#include <vnet/ethernet/ethernet.h>
#include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/devices/dpdk/threads.h>
#include <vlibmemory/api.h>
#include <vlibmemory/vl_memory_msg_enum.h> /* enumerate all vlib messages */
diff --git a/vnet/vnet/devices/netmap/cli.c b/vnet/vnet/devices/netmap/cli.c
new file mode 100644
index 00000000000..9c8fb783326
--- /dev/null
+++ b/vnet/vnet/devices/netmap/cli.c
@@ -0,0 +1,121 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <vnet/devices/netmap/netmap.h>
+
+static clib_error_t *
+netmap_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u8 * host_if_name = NULL;
+ u8 hwaddr [6];
+ u8 * hw_addr_ptr = 0;
+ int r;
+ u8 is_pipe = 0;
+ u8 is_master = 0;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "name %s", &host_if_name))
+ ;
+ else if (unformat (line_input, "hw-addr %U", unformat_ethernet_address, hwaddr))
+ hw_addr_ptr = hwaddr;
+ else if (unformat (line_input, "pipe"))
+ is_pipe = 1;
+ else if (unformat (line_input, "master"))
+ is_master = 1;
+ else if (unformat (line_input, "slave"))
+ is_master = 0;
+ else
+ return clib_error_return (0, "unknown input `%U'", format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ if (host_if_name == NULL)
+ return clib_error_return (0, "missing host interface name");
+
+ r = netmap_create_if(vm, host_if_name, hw_addr_ptr, is_pipe, is_master);
+
+ if (r == VNET_API_ERROR_SYSCALL_ERROR_1)
+ return clib_error_return(0, "%s (errno %d)", strerror (errno), errno);
+
+ if (r == VNET_API_ERROR_INVALID_INTERFACE)
+ return clib_error_return(0, "Invalid interface name");
+
+ if (r == VNET_API_ERROR_SUBIF_ALREADY_EXISTS)
+ return clib_error_return(0, "Interface elready exists");
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (netmap_create_command, static) = {
+ .path = "create netmap",
+ .short_help = "create netmap name [<intf name>|valeXXX:YYY] "
+ "[hw-addr <mac>] [pipe] [master|slave]",
+ .function = netmap_create_command_fn,
+};
+
+static clib_error_t *
+netmap_delete_command_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u8 * host_if_name = NULL;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "name %s", &host_if_name))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'", format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ if (host_if_name == NULL)
+ return clib_error_return (0, "missing host interface name");
+
+ netmap_delete_if(vm, host_if_name);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (netmap_delete_command, static) = {
+ .path = "delete netmap",
+ .short_help = "delete netmap name <interface name>",
+ .function = netmap_delete_command_fn,
+};
+
+clib_error_t *
+netmap_cli_init (vlib_main_t * vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (netmap_cli_init);
diff --git a/vnet/vnet/devices/netmap/device.c b/vnet/vnet/devices/netmap/device.c
new file mode 100644
index 00000000000..36e17da2a22
--- /dev/null
+++ b/vnet/vnet/devices/netmap/device.c
@@ -0,0 +1,221 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <vnet/devices/netmap/netmap.h>
+
+#define foreach_netmap_tx_func_error \
+_(NO_FREE_SLOTS, "no free tx slots")
+
+typedef enum {
+#define _(f,s) NETMAP_TX_ERROR_##f,
+ foreach_netmap_tx_func_error
+#undef _
+ NETMAP_TX_N_ERROR,
+} netmap_tx_func_error_t;
+
+static char * netmap_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_netmap_tx_func_error
+#undef _
+};
+
+
+static u8 * format_netmap_device_name (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+ netmap_main_t * apm = &netmap_main;
+ netmap_if_t * nif = pool_elt_at_index (apm->interfaces, i);
+
+ s = format (s, "netmap-%s", nif->host_if_name);
+ return s;
+}
+
+static u8 * format_netmap_device (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ int verbose = va_arg (*args, int);
+ netmap_main_t * nm = &netmap_main;
+ netmap_if_t * nif = vec_elt_at_index (nm->interfaces, dev_instance);
+ uword indent = format_get_indent (s);
+
+ s = format (s, "NETMAP interface");
+ if (verbose)
+ {
+ s = format (s, "\n%U version %d flags 0x%x"
+ "\n%U region %u memsize 0x%x offset 0x%x"
+ "\n%U tx_slots %u rx_slots %u tx_rings %u rx_rings %u",
+ format_white_space, indent + 2,
+ nif->req->nr_version,
+ nif->req->nr_flags,
+ format_white_space, indent + 2,
+ nif->mem_region,
+ nif->req->nr_memsize,
+ nif->req->nr_offset,
+ format_white_space, indent + 2,
+ nif->req->nr_tx_slots,
+ nif->req->nr_rx_slots,
+ nif->req->nr_tx_rings,
+ nif->req->nr_rx_rings);
+ }
+ return s;
+}
+
+static u8 * format_netmap_tx_trace (u8 * s, va_list * args)
+{
+ s = format (s, "Unimplemented...");
+ return s;
+}
+
+static uword
+netmap_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ netmap_main_t * nm = &netmap_main;
+ u32 * buffers = vlib_frame_args (frame);
+ u32 n_left = frame->n_vectors;
+ vnet_interface_output_runtime_t * rd = (void *) node->runtime_data;
+ netmap_if_t * nif = pool_elt_at_index (nm->interfaces, rd->dev_instance);
+ int cur_ring;
+
+ cur_ring = nif->first_tx_ring;
+
+ while(n_left && cur_ring <= nif->last_tx_ring)
+ {
+ struct netmap_ring * ring = NETMAP_TXRING(nif->nifp, cur_ring);
+ int n_free_slots = nm_ring_space(ring);
+ uint cur = ring->cur;
+
+ if (!n_free_slots)
+ {
+ cur_ring++;
+ continue;
+ }
+
+ while (n_left && n_free_slots)
+ {
+ vlib_buffer_t * b0;
+ u32 bi = buffers[0];
+ u32 len;
+ u32 offset = 0;
+ buffers++;
+
+ struct netmap_slot * slot = &ring->slot[cur];
+
+ do
+ {
+ b0 = vlib_get_buffer (vm, bi);
+ len = b0->current_length;
+ /* memcpy */
+ clib_memcpy ((u8 *) NETMAP_BUF(ring, slot->buf_idx) + offset,
+ vlib_buffer_get_current(b0), len);
+ offset += len;
+ }
+ while ((bi = b0->next_buffer));
+
+ slot->len = offset;
+ cur = (cur + 1) % ring->num_slots;
+ n_free_slots--;
+ n_left--;
+ }
+ CLIB_MEMORY_BARRIER();
+ ring->head = ring->cur = cur;
+ }
+
+ if (n_left < frame->n_vectors)
+ ioctl(nif->fd, NIOCTXSYNC, NULL);
+
+ if (n_left)
+ vlib_error_count (vm, node->node_index, NETMAP_TX_ERROR_NO_FREE_SLOTS,
+ n_left);
+
+ vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
+ return frame->n_vectors;
+}
+
+static void
+netmap_set_interface_next_node (vnet_main_t *vnm, u32 hw_if_index,
+ u32 node_index)
+{
+ netmap_main_t * apm = &netmap_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ netmap_if_t * nif = pool_elt_at_index (apm->interfaces, hw->dev_instance);
+
+ /* Shut off redirection */
+ if (node_index == ~0)
+ {
+ nif->per_interface_next_index = node_index;
+ return;
+ }
+
+ nif->per_interface_next_index =
+ vlib_node_add_next (vlib_get_main(), netmap_input_node.index, node_index);
+}
+
+static void netmap_clear_hw_interface_counters (u32 instance)
+{
+ /* Nothing for now */
+}
+
+static clib_error_t *
+netmap_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ netmap_main_t * apm = &netmap_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ netmap_if_t * nif = pool_elt_at_index (apm->interfaces, hw->dev_instance);
+ u32 hw_flags;
+
+ nif->is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+
+ if (nif->is_admin_up)
+ hw_flags = VNET_HW_INTERFACE_FLAG_LINK_UP;
+ else
+ hw_flags = 0;
+
+ vnet_hw_interface_set_flags(vnm, hw_if_index, hw_flags);
+
+ return 0;
+}
+
+static clib_error_t *
+netmap_subif_add_del_function (vnet_main_t * vnm,
+ u32 hw_if_index,
+ struct vnet_sw_interface_t * st,
+ int is_add)
+{
+ /* Nothing for now */
+ return 0;
+}
+
+VNET_DEVICE_CLASS (netmap_device_class) = {
+ .name = "netmap",
+ .tx_function = netmap_interface_tx,
+ .format_device_name = format_netmap_device_name,
+ .format_device = format_netmap_device,
+ .format_tx_trace = format_netmap_tx_trace,
+ .tx_function_n_errors = NETMAP_TX_N_ERROR,
+ .tx_function_error_strings = netmap_tx_func_error_strings,
+ .rx_redirect_to_node = netmap_set_interface_next_node,
+ .clear_counters = netmap_clear_hw_interface_counters,
+ .admin_up_down_function = netmap_interface_admin_up_down,
+ .subif_add_del_function = netmap_subif_add_del_function,
+ .no_flatten_output_chains = 1,
+};
diff --git a/vnet/vnet/devices/netmap/net_netmap.h b/vnet/vnet/devices/netmap/net_netmap.h
new file mode 100644
index 00000000000..fd4253b7c0c
--- /dev/null
+++ b/vnet/vnet/devices/netmap/net_netmap.h
@@ -0,0 +1,650 @@
+/*
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``S IS''AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD: head/sys/net/netmap.h 251139 2013-05-30 14:07:14Z luigi $
+ *
+ * Definitions of constants and the structures used by the netmap
+ * framework, for the part visible to both kernel and userspace.
+ * Detailed info on netmap is available with "man netmap" or at
+ *
+ * http://info.iet.unipi.it/~luigi/netmap/
+ *
+ * This API is also used to communicate with the VALE software switch
+ */
+
+#ifndef _NET_NETMAP_H_
+#define _NET_NETMAP_H_
+
+#define NETMAP_API 11 /* current API version */
+
+#define NETMAP_MIN_API 11 /* min and max versions accepted */
+#define NETMAP_MAX_API 15
+/*
+ * Some fields should be cache-aligned to reduce contention.
+ * The alignment is architecture and OS dependent, but rather than
+ * digging into OS headers to find the exact value we use an estimate
+ * that should cover most architectures.
+ */
+#define NM_CACHE_ALIGN 128
+
+/*
+ * --- Netmap data structures ---
+ *
+ * The userspace data structures used by netmap are shown below.
+ * They are allocated by the kernel and mmap()ed by userspace threads.
+ * Pointers are implemented as memory offsets or indexes,
+ * so that they can be easily dereferenced in kernel and userspace.
+
+ KERNEL (opaque, obviously)
+
+ ====================================================================
+ |
+ USERSPACE | struct netmap_ring
+ +---->+---------------+
+ / | head,cur,tail |
+ struct netmap_if (nifp, 1 per fd) / | buf_ofs |
+ +---------------+ / | other fields |
+ | ni_tx_rings | / +===============+
+ | ni_rx_rings | / | buf_idx, len | slot[0]
+ | | / | flags, ptr |
+ | | / +---------------+
+ +===============+ / | buf_idx, len | slot[1]
+ | txring_ofs[0] | (rel.to nifp)--' | flags, ptr |
+ | txring_ofs[1] | +---------------+
+ (tx+1 entries) (num_slots entries)
+ | txring_ofs[t] | | buf_idx, len | slot[n-1]
+ +---------------+ | flags, ptr |
+ | rxring_ofs[0] | +---------------+
+ | rxring_ofs[1] |
+ (rx+1 entries)
+ | rxring_ofs[r] |
+ +---------------+
+
+ * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to
+ * a file descriptor, the mmap()ed region contains a (logically readonly)
+ * struct netmap_if pointing to struct netmap_ring's.
+ *
+ * There is one netmap_ring per physical NIC ring, plus one tx/rx ring
+ * pair attached to the host stack (this pair is unused for non-NIC ports).
+ *
+ * All physical/host stack ports share the same memory region,
+ * so that zero-copy can be implemented between them.
+ * VALE switch ports instead have separate memory regions.
+ *
+ * The netmap_ring is the userspace-visible replica of the NIC ring.
+ * Each slot has the index of a buffer (MTU-sized and residing in the
+ * mmapped region), its length and some flags. An extra 64-bit pointer
+ * is provided for user-supplied buffers in the tx path.
+ *
+ * In user space, the buffer address is computed as
+ * (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE
+ *
+ * Added in NETMAP_API 11:
+ *
+ * + NIOCREGIF can request the allocation of extra spare buffers from
+ * the same memory pool. The desired number of buffers must be in
+ * nr_arg3. The ioctl may return fewer buffers, depending on memory
+ * availability. nr_arg3 will return the actual value, and, once
+ * mapped, nifp->ni_bufs_head will be the index of the first buffer.
+ *
+ * The buffers are linked to each other using the first uint32_t
+ * as the index. On close, ni_bufs_head must point to the list of
+ * buffers to be released.
+ *
+ * + NIOCREGIF can request space for extra rings (and buffers)
+ * allocated in the same memory space. The number of extra rings
+ * is in nr_arg1, and is advisory. This is a no-op on NICs where
+ * the size of the memory space is fixed.
+ *
+ * + NIOCREGIF can attach to PIPE rings sharing the same memory
+ * space with a parent device. The ifname indicates the parent device,
+ * which must already exist. Flags in nr_flags indicate if we want to
+ * bind the master or slave side, the index (from nr_ringid)
+ * is just a cookie and does not need to be sequential.
+ *
+ * + NIOCREGIF can also attach to 'monitor' rings that replicate
+ * the content of specific rings, also from the same memory space.
+ *
+ * Extra flags in nr_flags support the above functions.
+ * Application libraries may use the following naming scheme:
+ * netmap:foo all NIC ring pairs
+ * netmap:foo^ only host ring pair
+ * netmap:foo+ all NIC ring + host ring pairs
+ * netmap:foo-k the k-th NIC ring pair
+ * netmap:foo{k PIPE ring pair k, master side
+ * netmap:foo}k PIPE ring pair k, slave side
+ */
+
+/*
+ * struct netmap_slot is a buffer descriptor
+ */
+struct netmap_slot {
+ uint32_t buf_idx; /* buffer index */
+ uint16_t len; /* length for this slot */
+ uint16_t flags; /* buf changed, etc. */
+ uint64_t ptr; /* pointer for indirect buffers */
+};
+
+/*
+ * The following flags control how the slot is used
+ */
+
+#define NS_BUF_CHANGED 0x0001 /* buf_idx changed */
+ /*
+ * must be set whenever buf_idx is changed (as it might be
+ * necessary to recompute the physical address and mapping)
+ *
+ * It is also set by the kernel whenever the buf_idx is
+ * changed internally (e.g., by pipes). Applications may
+ * use this information to know when they can reuse the
+ * contents of previously prepared buffers.
+ */
+
+#define NS_REPORT 0x0002 /* ask the hardware to report results */
+ /*
+ * Request notification when slot is used by the hardware.
+ * Normally transmit completions are handled lazily and
+ * may be unreported. This flag lets us know when a slot
+ * has been sent (e.g. to terminate the sender).
+ */
+
+#define NS_FORWARD 0x0004 /* pass packet 'forward' */
+ /*
+ * (Only for physical ports, rx rings with NR_FORWARD set).
+ * Slot released to the kernel (i.e. before ring->head) with
+ * this flag set are passed to the peer ring (host/NIC),
+ * thus restoring the host-NIC connection for these slots.
+ * This supports efficient traffic monitoring or firewalling.
+ */
+
+#define NS_NO_LEARN 0x0008 /* disable bridge learning */
+ /*
+ * On a VALE switch, do not 'learn' the source port for
+ * this buffer.
+ */
+
+#define NS_INDIRECT 0x0010 /* userspace buffer */
+ /*
+ * (VALE tx rings only) data is in a userspace buffer,
+ * whose address is in the 'ptr' field in the slot.
+ */
+
+#define NS_MOREFRAG 0x0020 /* packet has more fragments */
+ /*
+ * (VALE ports only)
+ * Set on all but the last slot of a multi-segment packet.
+ * The 'len' field refers to the individual fragment.
+ */
+
+#define NS_PORT_SHIFT 8
+#define NS_PORT_MASK (0xff << NS_PORT_SHIFT)
+ /*
+ * The high 8 bits of the flag, if not zero, indicate the
+ * destination port for the VALE switch, overriding
+ * the lookup table.
+ */
+
+#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff)
+ /*
+ * (VALE rx rings only) the high 8 bits
+ * are the number of fragments.
+ */
+
+
+/*
+ * struct netmap_ring
+ *
+ * Netmap representation of a TX or RX ring (also known as "queue").
+ * This is a queue implemented as a fixed-size circular array.
+ * At the software level the important fields are: head, cur, tail.
+ *
+ * In TX rings:
+ *
+ * head first slot available for transmission.
+ * cur wakeup point. select() and poll() will unblock
+ * when 'tail' moves past 'cur'
+ * tail (readonly) first slot reserved to the kernel
+ *
+ * [head .. tail-1] can be used for new packets to send;
+ * 'head' and 'cur' must be incremented as slots are filled
+ * with new packets to be sent;
+ * 'cur' can be moved further ahead if we need more space
+ * for new transmissions. XXX todo (2014-03-12)
+ *
+ * In RX rings:
+ *
+ * head first valid received packet
+ * cur wakeup point. select() and poll() will unblock
+ * when 'tail' moves past 'cur'
+ * tail (readonly) first slot reserved to the kernel
+ *
+ * [head .. tail-1] contain received packets;
+ * 'head' and 'cur' must be incremented as slots are consumed
+ * and can be returned to the kernel;
+ * 'cur' can be moved further ahead if we want to wait for
+ * new packets without returning the previous ones.
+ *
+ * DATA OWNERSHIP/LOCKING:
+ * The netmap_ring, and all slots and buffers in the range
+ * [head .. tail-1] are owned by the user program;
+ * the kernel only accesses them during a netmap system call
+ * and in the user thread context.
+ *
+ * Other slots and buffers are reserved for use by the kernel
+ */
+struct netmap_ring {
+ /*
+ * buf_ofs is meant to be used through macros.
+ * It contains the offset of the buffer region from this
+ * descriptor.
+ */
+ const int64_t buf_ofs;
+ const uint32_t num_slots; /* number of slots in the ring. */
+ const uint32_t nr_buf_size;
+ const uint16_t ringid;
+ const uint16_t dir; /* 0: tx, 1: rx */
+
+ uint32_t head; /* (u) first user slot */
+ uint32_t cur; /* (u) wakeup point */
+ uint32_t tail; /* (k) first kernel slot */
+
+ uint32_t flags;
+
+ struct timeval ts; /* (k) time of last *sync() */
+
+ /* opaque room for a mutex or similar object */
+#if !defined(_WIN32) || defined(__CYGWIN__)
+ uint8_t __attribute__((__aligned__(NM_CACHE_ALIGN))) sem[128];
+#else
+ uint8_t __declspec(align(NM_CACHE_ALIGN)) sem[128];
+#endif
+
+ /* the slots follow. This struct has variable size */
+ struct netmap_slot slot[0]; /* array of slots. */
+};
+
+
+/*
+ * RING FLAGS
+ */
+#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */
+ /*
+ * updates the 'ts' field on each netmap syscall. This saves
+ * saves a separate gettimeofday(), and is not much worse than
+ * software timestamps generated in the interrupt handler.
+ */
+
+#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */
+ /*
+ * Enables the NS_FORWARD slot flag for the ring.
+ */
+
+
+/*
+ * Netmap representation of an interface and its queue(s).
+ * This is initialized by the kernel when binding a file
+ * descriptor to a port, and should be considered as readonly
+ * by user programs. The kernel never uses it.
+ *
+ * There is one netmap_if for each file descriptor on which we want
+ * to select/poll.
+ * select/poll operates on one or all pairs depending on the value of
+ * nmr_queueid passed on the ioctl.
+ */
+struct netmap_if {
+ char ni_name[IFNAMSIZ]; /* name of the interface. */
+ const uint32_t ni_version; /* API version, currently unused */
+ const uint32_t ni_flags; /* properties */
+#define NI_PRIV_MEM 0x1 /* private memory region */
+
+ /*
+ * The number of packet rings available in netmap mode.
+ * Physical NICs can have different numbers of tx and rx rings.
+ * Physical NICs also have a 'host' ring pair.
+ * Additionally, clients can request additional ring pairs to
+ * be used for internal communication.
+ */
+ const uint32_t ni_tx_rings; /* number of HW tx rings */
+ const uint32_t ni_rx_rings; /* number of HW rx rings */
+
+ uint32_t ni_bufs_head; /* head index for extra bufs */
+ uint32_t ni_spare1[5];
+ /*
+ * The following array contains the offset of each netmap ring
+ * from this structure, in the following order:
+ * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings;
+ * NIC rx rings (ni_rx_rings); host tx ring (1); extra rx rings.
+ *
+ * The area is filled up by the kernel on NIOCREGIF,
+ * and then only read by userspace code.
+ */
+ const ssize_t ring_ofs[0];
+};
+
+
+#ifndef NIOCREGIF
+/*
+ * ioctl names and related fields
+ *
+ * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
+ * whose identity is set in NIOCREGIF through nr_ringid.
+ * These are non blocking and take no argument.
+ *
+ * NIOCGINFO takes a struct ifreq, the interface name is the input,
+ * the outputs are number of queues and number of descriptor
+ * for each queue (useful to set number of threads etc.).
+ * The info returned is only advisory and may change before
+ * the interface is bound to a file descriptor.
+ *
+ * NIOCREGIF takes an interface name within a struct nmre,
+ * and activates netmap mode on the interface (if possible).
+ *
+ * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we
+ * can pass it down to other NIC-related ioctls.
+ *
+ * The actual argument (struct nmreq) has a number of options to request
+ * different functions.
+ * The following are used in NIOCREGIF when nr_cmd == 0:
+ *
+ * nr_name (in)
+ * The name of the port (em0, valeXXX:YYY, etc.)
+ * limited to IFNAMSIZ for backward compatibility.
+ *
+ * nr_version (in/out)
+ * Must match NETMAP_API as used in the kernel, error otherwise.
+ * Always returns the desired value on output.
+ *
+ * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings (in/out)
+ * On input, non-zero values may be used to reconfigure the port
+ * according to the requested values, but this is not guaranteed.
+ * On output the actual values in use are reported.
+ *
+ * nr_ringid (in)
+ * Indicates how rings should be bound to the file descriptors.
+ * If nr_flags != 0, then the low bits (in NETMAP_RING_MASK)
+ * are used to indicate the ring number, and nr_flags specifies
+ * the actual rings to bind. NETMAP_NO_TX_POLL is unaffected.
+ *
+ * NOTE: THE FOLLOWING (nr_flags == 0) IS DEPRECATED:
+ * If nr_flags == 0, NETMAP_HW_RING and NETMAP_SW_RING control
+ * the binding as follows:
+ * 0 (default) binds all physical rings
+ * NETMAP_HW_RING | ring number binds a single ring pair
+ * NETMAP_SW_RING binds only the host tx/rx rings
+ *
+ * NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push
+ * packets on tx rings only if POLLOUT is set.
+ * The default is to push any pending packet.
+ *
+ * NETMAP_DO_RX_POLL can be OR-ed to make select()/poll() release
+ * packets on rx rings also when POLLIN is NOT set.
+ * The default is to touch the rx ring only with POLLIN.
+ * Note that this is the opposite of TX because it
+ * reflects the common usage.
+ *
+ * NOTE: NETMAP_PRIV_MEM IS DEPRECATED, use nr_arg2 instead.
+ * NETMAP_PRIV_MEM is set on return for ports that do not use
+ * the global memory allocator.
+ * This information is not significant and applications
+ * should look at the region id in nr_arg2
+ *
+ * nr_flags is the recommended mode to indicate which rings should
+ * be bound to a file descriptor. Values are NR_REG_*
+ *
+ * nr_arg1 (in) The number of extra rings to be reserved.
+ * Especially when allocating a VALE port the system only
+ * allocates the amount of memory needed for the port.
+ * If more shared memory rings are desired (e.g. for pipes),
+ * the first invocation for the same basename/allocator
+ * should specify a suitable number. Memory cannot be
+ * extended after the first allocation without closing
+ * all ports on the same region.
+ *
+ * nr_arg2 (in/out) The identity of the memory region used.
+ * On input, 0 means the system decides autonomously,
+ * other values may try to select a specific region.
+ * On return the actual value is reported.
+ * Region '1' is the global allocator, normally shared
+ * by all interfaces. Other values are private regions.
+ * If two ports the same region zero-copy is possible.
+ *
+ * nr_arg3 (in/out) number of extra buffers to be allocated.
+ *
+ *
+ *
+ * nr_cmd (in) if non-zero indicates a special command:
+ * NETMAP_BDG_ATTACH and nr_name = vale*:ifname
+ * attaches the NIC to the switch; nr_ringid specifies
+ * which rings to use. Used by vale-ctl -a ...
+ * nr_arg1 = NETMAP_BDG_HOST also attaches the host port
+ * as in vale-ctl -h ...
+ *
+ * NETMAP_BDG_DETACH and nr_name = vale*:ifname
+ * disconnects a previously attached NIC.
+ * Used by vale-ctl -d ...
+ *
+ * NETMAP_BDG_LIST
+ * list the configuration of VALE switches.
+ *
+ * NETMAP_BDG_VNET_HDR
+ * Set the virtio-net header length used by the client
+ * of a VALE switch port.
+ *
+ * NETMAP_BDG_NEWIF
+ * create a persistent VALE port with name nr_name.
+ * Used by vale-ctl -n ...
+ *
+ * NETMAP_BDG_DELIF
+ * delete a persistent VALE port. Used by vale-ctl -d ...
+ *
+ * nr_arg1, nr_arg2, nr_arg3 (in/out) command specific
+ *
+ *
+ *
+ */
+
+
+/*
+ * struct nmreq overlays a struct ifreq (just the name)
+ */
+struct nmreq {
+ char nr_name[IFNAMSIZ];
+ uint32_t nr_version; /* API version */
+ uint32_t nr_offset; /* nifp offset in the shared region */
+ uint32_t nr_memsize; /* size of the shared region */
+ uint32_t nr_tx_slots; /* slots in tx rings */
+ uint32_t nr_rx_slots; /* slots in rx rings */
+ uint16_t nr_tx_rings; /* number of tx rings */
+ uint16_t nr_rx_rings; /* number of rx rings */
+
+ uint16_t nr_ringid; /* ring(s) we care about */
+#define NETMAP_HW_RING 0x4000 /* single NIC ring pair */
+#define NETMAP_SW_RING 0x2000 /* only host ring pair */
+
+#define NETMAP_RING_MASK 0x0fff /* the ring number */
+
+#define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */
+
+#define NETMAP_DO_RX_POLL 0x8000 /* DO automatic rxsync on poll */
+
+ uint16_t nr_cmd;
+#define NETMAP_BDG_ATTACH 1 /* attach the NIC */
+#define NETMAP_BDG_DETACH 2 /* detach the NIC */
+#define NETMAP_BDG_REGOPS 3 /* register bridge callbacks */
+#define NETMAP_BDG_LIST 4 /* get bridge's info */
+#define NETMAP_BDG_VNET_HDR 5 /* set the port virtio-net-hdr length */
+#define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */
+#define NETMAP_BDG_NEWIF 6 /* create a virtual port */
+#define NETMAP_BDG_DELIF 7 /* destroy a virtual port */
+#define NETMAP_PT_HOST_CREATE 8 /* create ptnetmap kthreads */
+#define NETMAP_PT_HOST_DELETE 9 /* delete ptnetmap kthreads */
+#define NETMAP_BDG_POLLING_ON 10 /* delete polling kthread */
+#define NETMAP_BDG_POLLING_OFF 11 /* delete polling kthread */
+#define NETMAP_VNET_HDR_GET 12 /* get the port virtio-net-hdr length */
+ uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */
+#define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */
+
+ uint16_t nr_arg2;
+ uint32_t nr_arg3; /* req. extra buffers in NIOCREGIF */
+ uint32_t nr_flags;
+ /* various modes, extends nr_ringid */
+ uint32_t spare2[1];
+};
+
+#define NR_REG_MASK 0xf /* values for nr_flags */
+enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */
+ NR_REG_ALL_NIC = 1,
+ NR_REG_SW = 2,
+ NR_REG_NIC_SW = 3,
+ NR_REG_ONE_NIC = 4,
+ NR_REG_PIPE_MASTER = 5,
+ NR_REG_PIPE_SLAVE = 6,
+};
+/* monitor uses the NR_REG to select the rings to monitor */
+#define NR_MONITOR_TX 0x100
+#define NR_MONITOR_RX 0x200
+#define NR_ZCOPY_MON 0x400
+/* request exclusive access to the selected rings */
+#define NR_EXCLUSIVE 0x800
+/* request ptnetmap host support */
+#define NR_PASSTHROUGH_HOST NR_PTNETMAP_HOST /* deprecated */
+#define NR_PTNETMAP_HOST 0x1000
+#define NR_RX_RINGS_ONLY 0x2000
+#define NR_TX_RINGS_ONLY 0x4000
+/* Applications set this flag if they are able to deal with virtio-net headers,
+ * that is send/receive frames that start with a virtio-net header.
+ * If not set, NIOCREGIF will fail with netmap ports that require applications
+ * to use those headers. If the flag is set, the application can use the
+ * NETMAP_VNET_HDR_GET command to figure out the header length. */
+#define NR_ACCEPT_VNET_HDR 0x8000
+
+
+/*
+ * Windows does not have _IOWR(). _IO(), _IOW() and _IOR() are defined
+ * in ws2def.h but not sure if they are in the form we need.
+ * XXX so we redefine them
+ * in a convenient way to use for DeviceIoControl signatures
+ */
+#ifdef _WIN32
+#undef _IO // ws2def.h
+#define _WIN_NM_IOCTL_TYPE 40000
+#define _IO(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \
+ METHOD_BUFFERED, FILE_ANY_ACCESS )
+#define _IO_direct(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \
+ METHOD_OUT_DIRECT, FILE_ANY_ACCESS )
+
+#define _IOWR(_c, _n, _s) _IO(_c, _n)
+
+/* We havesome internal sysctl in addition to the externally visible ones */
+#define NETMAP_MMAP _IO_direct('i', 160) // note METHOD_OUT_DIRECT
+#define NETMAP_POLL _IO('i', 162)
+
+/* and also two setsockopt for sysctl emulation */
+#define NETMAP_SETSOCKOPT _IO('i', 140)
+#define NETMAP_GETSOCKOPT _IO('i', 141)
+
+
+//These linknames are for the Netmap Core Driver
+#define NETMAP_NT_DEVICE_NAME L"\\Device\\NETMAP"
+#define NETMAP_DOS_DEVICE_NAME L"\\DosDevices\\netmap"
+
+//Definition of a structure used to pass a virtual address within an IOCTL
+typedef struct _MEMORY_ENTRY {
+ PVOID pUsermodeVirtualAddress;
+} MEMORY_ENTRY, *PMEMORY_ENTRY;
+
+typedef struct _POLL_REQUEST_DATA {
+ int events;
+ int timeout;
+ int revents;
+} POLL_REQUEST_DATA;
+
+#endif /* _WIN32 */
+
+/*
+ * FreeBSD uses the size value embedded in the _IOWR to determine
+ * how much to copy in/out. So we need it to match the actual
+ * data structure we pass. We put some spares in the structure
+ * to ease compatibility with other versions
+ */
+#define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */
+#define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */
+#define NIOCTXSYNC _IO('i', 148) /* sync tx queues */
+#define NIOCRXSYNC _IO('i', 149) /* sync rx queues */
+#define NIOCCONFIG _IOWR('i',150, struct nm_ifreq) /* for ext. modules */
+#endif /* !NIOCREGIF */
+
+
+/*
+ * Helper functions for kernel and userspace
+ */
+
+/*
+ * check if space is available in the ring.
+ */
+static inline int
+nm_ring_empty(struct netmap_ring *ring)
+{
+ return (ring->cur == ring->tail);
+}
+
+/*
+ * Opaque structure that is passed to an external kernel
+ * module via ioctl(fd, NIOCCONFIG, req) for a user-owned
+ * bridge port (at this point ephemeral VALE interface).
+ */
+#define NM_IFRDATA_LEN 256
+struct nm_ifreq {
+ char nifr_name[IFNAMSIZ];
+ char data[NM_IFRDATA_LEN];
+};
+
+/*
+ * netmap kernel thread configuration
+ */
+/* bhyve/vmm.ko MSIX parameters for IOCTL */
+struct ptn_vmm_ioctl_msix {
+ uint64_t msg;
+ uint64_t addr;
+};
+
+/* IOCTL parameters */
+struct nm_kth_ioctl {
+ u_long com;
+ /* TODO: use union */
+ union {
+ struct ptn_vmm_ioctl_msix msix;
+ } data;
+};
+
+/* Configuration of a ptnetmap ring */
+struct ptnet_ring_cfg {
+ uint64_t ioeventfd; /* eventfd in linux, tsleep() parameter in FreeBSD */
+ uint64_t irqfd; /* eventfd in linux, ioctl fd in FreeBSD */
+ struct nm_kth_ioctl ioctl; /* ioctl parameter to send irq (only used in bhyve/FreeBSD) */
+};
+#endif /* _NET_NETMAP_H_ */
diff --git a/vnet/vnet/devices/netmap/netmap.c b/vnet/vnet/devices/netmap/netmap.c
new file mode 100644
index 00000000000..2f3233a28e5
--- /dev/null
+++ b/vnet/vnet/devices/netmap/netmap.c
@@ -0,0 +1,240 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <stdint.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <vnet/devices/netmap/net_netmap.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/netmap/netmap.h>
+
+static u32
+netmap_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags)
+{
+ /* nothing for now */
+ return 0;
+}
+
+static clib_error_t * netmap_fd_read_ready (unix_file_t * uf)
+{
+ vlib_main_t * vm = vlib_get_main();
+ netmap_main_t * nm = &netmap_main;
+ u32 idx = uf->private_data;
+
+ nm->pending_input_bitmap = clib_bitmap_set (nm->pending_input_bitmap, idx, 1);
+
+ /* Schedule the rx node */
+ vlib_node_set_interrupt_pending (vm, netmap_input_node.index);
+
+ return 0;
+}
+
+static void
+close_netmap_if(netmap_main_t * nm, netmap_if_t * nif)
+{
+ if (nif->unix_file_index != ~0) {
+ unix_file_del(&unix_main, unix_main.file_pool + nif->unix_file_index);
+ nif->unix_file_index = ~0;
+ }
+
+ if (nif->fd > -1)
+ close(nif->fd);
+
+ if (nif->mem_region)
+ {
+ netmap_mem_region_t * reg = &nm->mem_regions[nif->mem_region];
+ if (--reg->refcnt == 0)
+ {
+ munmap(reg->mem, reg->region_size);
+ reg->region_size = 0;
+ }
+ }
+
+
+ mhash_unset(&nm->if_index_by_host_if_name, nif->host_if_name, &nif->if_index);
+ vec_free(nif->host_if_name);
+ vec_free(nif->req);
+
+ memset(nif, 0, sizeof(*nif));
+ pool_put(nm->interfaces, nif);
+}
+
+int
+netmap_create_if(vlib_main_t * vm, u8 * if_name, u8 * hw_addr_set,
+ u8 is_pipe, u8 is_master)
+{
+ netmap_main_t * nm = &netmap_main;
+ int ret = 0;
+ netmap_if_t * nif = 0;
+ u8 hw_addr[6];
+ clib_error_t * error = 0;
+ vnet_sw_interface_t * sw;
+ vnet_main_t *vnm = vnet_get_main();
+ uword * p;
+ struct nmreq * req = 0;
+ netmap_mem_region_t * reg;
+ int fd;
+
+ p = mhash_get (&nm->if_index_by_host_if_name, if_name);
+ if (p)
+ return VNET_API_ERROR_SUBIF_ALREADY_EXISTS;
+
+ fd = open("/dev/netmap", O_RDWR);
+ if (fd < 0)
+ return VNET_API_ERROR_SUBIF_ALREADY_EXISTS;
+
+ pool_get (nm->interfaces, nif);
+ nif->if_index = nif - nm->interfaces;
+ nif->fd = fd;
+ nif->unix_file_index = ~0;
+
+ vec_validate(req, 0);
+ nif->req = req;
+ req->nr_version = NETMAP_API;
+ req->nr_flags = NR_REG_ALL_NIC;
+
+ if (is_pipe)
+ req->nr_flags = is_master ? NR_REG_PIPE_MASTER : NR_REG_PIPE_SLAVE;
+ else
+ req->nr_flags = NR_REG_ALL_NIC;
+
+ req->nr_flags |= NR_ACCEPT_VNET_HDR;
+ snprintf(req->nr_name, IFNAMSIZ, "%s", if_name);
+ req->nr_name[IFNAMSIZ-1] = 0;
+
+ if (ioctl(nif->fd, NIOCREGIF, req))
+ {
+ ret = VNET_API_ERROR_NOT_CONNECTED;
+ goto error;
+ }
+
+ nif->mem_region = req->nr_arg2;
+ vec_validate (nm->mem_regions, nif->mem_region);
+ reg = &nm->mem_regions[nif->mem_region];
+ if (reg->region_size == 0)
+ {
+ reg->mem = mmap(NULL, req->nr_memsize, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ clib_warning("mem %p", reg->mem);
+ if (reg->mem == MAP_FAILED)
+ {
+ ret = VNET_API_ERROR_NOT_CONNECTED;
+ goto error;
+ }
+ reg->region_size = req->nr_memsize;
+ }
+ reg->refcnt++;
+
+ nif->nifp = NETMAP_IF(reg->mem, req->nr_offset);
+ nif->first_rx_ring = 0;
+ nif->last_rx_ring = 0;
+ nif->first_tx_ring = 0;
+ nif->last_tx_ring = 0;
+ nif->host_if_name = if_name;
+ nif->per_interface_next_index = ~0;
+
+ {
+ unix_file_t template = {0};
+ template.read_function = netmap_fd_read_ready;
+ template.file_descriptor = nif->fd;
+ template.private_data = nif->if_index;
+ nif->unix_file_index = unix_file_add (&unix_main, &template);
+ }
+
+ /*use configured or generate random MAC address */
+ if (hw_addr_set)
+ memcpy(hw_addr, hw_addr_set, 6);
+ else
+ {
+ f64 now = vlib_time_now(vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+
+ memcpy (hw_addr+2, &rnd, sizeof(rnd));
+ hw_addr[0] = 2;
+ hw_addr[1] = 0xfe;
+ }
+
+ error = ethernet_register_interface(vnm, netmap_device_class.index,
+ nif->if_index, hw_addr, &nif->hw_if_index,
+ netmap_eth_flag_change);
+
+ if (error)
+ {
+ clib_error_report (error);
+ ret = VNET_API_ERROR_SYSCALL_ERROR_1;
+ goto error;
+ }
+
+ sw = vnet_get_hw_sw_interface (vnm, nif->hw_if_index);
+ nif->sw_if_index = sw->sw_if_index;
+
+ vnet_hw_interface_set_flags (vnm, nif->hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP);
+
+ mhash_set_mem (&nm->if_index_by_host_if_name, if_name, &nif->if_index, 0);
+
+ return 0;
+
+error:
+ close_netmap_if(nm, nif);
+ return ret;
+}
+
+int
+netmap_delete_if(vlib_main_t *vm, u8 *host_if_name)
+{
+ vnet_main_t *vnm = vnet_get_main();
+ netmap_main_t *nm = &netmap_main;
+ netmap_if_t *nif;
+ uword *p;
+
+ p = mhash_get(&nm->if_index_by_host_if_name, host_if_name);
+ if (p == NULL) {
+ clib_warning("Host interface %s does not exist", host_if_name);
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+ }
+ nif = pool_elt_at_index(nm->interfaces, p[0]);
+
+ /* bring down the interface */
+ vnet_hw_interface_set_flags(vnm, nif->hw_if_index, 0);
+
+ ethernet_delete_interface(vnm, nif->hw_if_index);
+
+ close_netmap_if(nm, nif);
+ return 0;
+}
+
+static clib_error_t *
+netmap_init (vlib_main_t * vm)
+{
+ netmap_main_t * nm = &netmap_main;
+
+ memset (nm, 0, sizeof (netmap_main_t));
+
+ mhash_init_vec_string (&nm->if_index_by_host_if_name, sizeof (uword));
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (netmap_init);
diff --git a/vnet/vnet/devices/netmap/netmap.h b/vnet/vnet/devices/netmap/netmap.h
new file mode 100644
index 00000000000..d9a476b6959
--- /dev/null
+++ b/vnet/vnet/devices/netmap/netmap.h
@@ -0,0 +1,131 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+/*
+ * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <vnet/devices/netmap/net_netmap.h>
+
+typedef struct {
+ CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
+ u8 * host_if_name;
+ uword if_index;
+ u32 hw_if_index;
+ u32 sw_if_index;
+ u32 unix_file_index;
+
+ u32 per_interface_next_index;
+ u8 is_admin_up;
+
+ /* netmap */
+ struct nmreq * req;
+ u16 mem_region;
+ int fd;
+ struct netmap_if * nifp;
+ u16 first_tx_ring;
+ u16 last_tx_ring;
+ u16 first_rx_ring;
+ u16 last_rx_ring;
+
+} netmap_if_t;
+
+typedef struct {
+ char * mem;
+ u32 region_size;
+ int refcnt;
+} netmap_mem_region_t;
+
+typedef struct {
+ CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
+ netmap_if_t * interfaces;
+
+ /* bitmap of pending rx interfaces */
+ uword * pending_input_bitmap;
+
+ /* rx buffer cache */
+ u32 * rx_buffers;
+
+ /* hash of host interface names */
+ mhash_t if_index_by_host_if_name;
+
+ /* vector of memory regions */
+ netmap_mem_region_t * mem_regions;
+} netmap_main_t;
+
+netmap_main_t netmap_main;
+extern vnet_device_class_t netmap_device_class;
+extern vlib_node_registration_t netmap_input_node;
+
+int netmap_create_if(vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set, u8 is_pipe, u8 is_master);
+int netmap_delete_if(vlib_main_t * vm, u8 * host_if_name);
+
+
+/* Macros and helper functions from sys/net/netmap_user.h */
+
+#define _NETMAP_OFFSET(type, ptr, offset) \
+ ((type)(void *)((char *)(ptr) + (offset)))
+
+#define NETMAP_IF(_base, _ofs) _NETMAP_OFFSET(struct netmap_if *, _base, _ofs)
+
+#define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \
+ nifp, (nifp)->ring_ofs[index] )
+
+#define NETMAP_RXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \
+ nifp, (nifp)->ring_ofs[index + (nifp)->ni_tx_rings + 1] )
+
+#define NETMAP_BUF(ring, index) \
+ ((char *)(ring) + (ring)->buf_ofs + ((index)*(ring)->nr_buf_size))
+
+#define NETMAP_BUF_IDX(ring, buf) \
+ ( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \
+ (ring)->nr_buf_size )
+
+static inline uint32_t
+nm_ring_space(struct netmap_ring *ring)
+{
+ int ret = ring->tail - ring->cur;
+ if (ret < 0)
+ ret += ring->num_slots;
+ return ret;
+}
+
+
diff --git a/vnet/vnet/devices/netmap/node.c b/vnet/vnet/devices/netmap/node.c
new file mode 100644
index 00000000000..f866a282cfb
--- /dev/null
+++ b/vnet/vnet/devices/netmap/node.c
@@ -0,0 +1,284 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <vnet/devices/netmap/netmap.h>
+
+#define foreach_netmap_input_error
+
+typedef enum {
+#define _(f,s) NETMAP_INPUT_ERROR_##f,
+ foreach_netmap_input_error
+#undef _
+ NETMAP_INPUT_N_ERROR,
+} netmap_input_error_t;
+
+static char * netmap_input_error_strings[] = {
+#define _(n,s) s,
+ foreach_netmap_input_error
+#undef _
+};
+
+enum {
+ NETMAP_INPUT_NEXT_DROP,
+ NETMAP_INPUT_NEXT_ETHERNET_INPUT,
+ NETMAP_INPUT_N_NEXT,
+};
+
+typedef struct {
+ u32 next_index;
+ u32 hw_if_index;
+ struct netmap_slot slot;
+} netmap_input_trace_t;
+
+static u8 * format_netmap_input_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ netmap_input_trace_t * t = va_arg (*args, netmap_input_trace_t *);
+ uword indent = format_get_indent (s);
+
+ s = format (s, "netmap: hw_if_index %d next-index %d",
+ t->hw_if_index, t->next_index);
+ s = format (s, "\n%Uslot: flags 0x%x len %u buf_idx %u",
+ format_white_space, indent + 2,
+ t->slot.flags, t->slot.len, t->slot.buf_idx);
+ return s;
+}
+
+always_inline void
+buffer_add_to_chain(vlib_main_t *vm, u32 bi, u32 first_bi, u32 prev_bi)
+{
+ vlib_buffer_t * b = vlib_get_buffer (vm, bi);
+ vlib_buffer_t * first_b = vlib_get_buffer (vm, first_bi);
+ vlib_buffer_t * prev_b = vlib_get_buffer (vm, prev_bi);
+
+ /* update first buffer */
+ first_b->total_length_not_including_first_buffer += b->current_length;
+
+ /* update previous buffer */
+ prev_b->next_buffer = bi;
+ prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+
+ /* update current buffer */
+ b->next_buffer = 0;
+
+#if DPDK > 0
+ struct rte_mbuf * mbuf = rte_mbuf_from_vlib_buffer(b);
+ struct rte_mbuf * first_mbuf = rte_mbuf_from_vlib_buffer(first_b);
+ struct rte_mbuf * prev_mbuf = rte_mbuf_from_vlib_buffer(prev_b);
+ first_mbuf->nb_segs++;
+ prev_mbuf->next = mbuf;
+ mbuf->data_len = b->current_length;
+ mbuf->data_off = RTE_PKTMBUF_HEADROOM + b->current_data;
+ mbuf->next = 0;
+#endif
+}
+
+always_inline uword
+netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * frame, u32 device_idx)
+{
+ u32 next_index = NETMAP_INPUT_NEXT_ETHERNET_INPUT;
+ uword n_trace = vlib_get_trace_count (vm, node);
+ netmap_main_t * nm = &netmap_main;
+ netmap_if_t * nif = pool_elt_at_index(nm->interfaces, device_idx);
+ u32 n_rx_packets = 0;
+ u32 n_rx_bytes = 0;
+ u32 * to_next = 0;
+ u32 n_free_bufs;
+ struct netmap_ring * ring;
+ int cur_ring;
+ u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm,
+ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ if (nif->per_interface_next_index != ~0)
+ next_index = nif->per_interface_next_index;
+
+ n_free_bufs = vec_len (nm->rx_buffers);
+ if (PREDICT_FALSE(n_free_bufs < VLIB_FRAME_SIZE))
+ {
+ vec_validate(nm->rx_buffers, VLIB_FRAME_SIZE + n_free_bufs - 1);
+ n_free_bufs += vlib_buffer_alloc(vm, &nm->rx_buffers[n_free_bufs], VLIB_FRAME_SIZE);
+ _vec_len (nm->rx_buffers) = n_free_bufs;
+ }
+
+ cur_ring = nif->first_rx_ring;
+ while (cur_ring <= nif->last_rx_ring && n_free_bufs)
+ {
+ int r = 0;
+ u32 cur_slot_index;
+ ring = NETMAP_RXRING(nif->nifp, cur_ring);
+ r = nm_ring_space(ring);
+
+ if (!r)
+ {
+ cur_ring++;
+ continue;
+ }
+
+ if (r > n_free_bufs)
+ r = n_free_bufs;
+
+ cur_slot_index = ring->cur;
+ while (r)
+ {
+ u32 n_left_to_next;
+ u32 next0 = next_index;
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (r && n_left_to_next)
+ {
+ vlib_buffer_t * b0, * first_b0 = 0;
+ u32 offset = 0;
+ u32 bi0 = 0, first_bi0 = 0, prev_bi0;
+ u32 next_slot_index = (cur_slot_index + 1) % ring->num_slots;
+ u32 next2_slot_index = (cur_slot_index + 2) % ring->num_slots;
+ struct netmap_slot * slot = &ring->slot[cur_slot_index];
+ u32 data_len = slot->len;
+
+ /* prefetch 2 slots in advance */
+ CLIB_PREFETCH (&ring->slot[next2_slot_index], CLIB_CACHE_LINE_BYTES, LOAD);
+ /* prefetch start of next packet */
+ CLIB_PREFETCH (NETMAP_BUF(ring, ring->slot[next_slot_index].buf_idx),
+ CLIB_CACHE_LINE_BYTES, LOAD);
+
+ while (data_len && n_free_bufs)
+ {
+ /* grab free buffer */
+ u32 last_empty_buffer = vec_len (nm->rx_buffers) - 1;
+ prev_bi0 = bi0;
+ bi0 = nm->rx_buffers[last_empty_buffer];
+ b0 = vlib_get_buffer (vm, bi0);
+ _vec_len (nm->rx_buffers) = last_empty_buffer;
+ n_free_bufs--;
+
+ /* copy data */
+ u32 bytes_to_copy = data_len > n_buffer_bytes ? n_buffer_bytes : data_len;
+ b0->current_data = 0;
+ clib_memcpy (vlib_buffer_get_current (b0),
+ (u8 *) NETMAP_BUF(ring, slot->buf_idx) + offset,
+ bytes_to_copy);
+
+ /* fill buffer header */
+ b0->clone_count = 0;
+ b0->current_length = bytes_to_copy;
+
+ if (offset == 0)
+ {
+#if DPDK > 0
+ struct rte_mbuf * mb = rte_mbuf_from_vlib_buffer(b0);
+ rte_pktmbuf_data_len (mb) = b0->current_length;
+ rte_pktmbuf_pkt_len (mb) = b0->current_length;
+#endif
+ b0->total_length_not_including_first_buffer = 0;
+ b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] = nif->sw_if_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
+ first_bi0 = bi0;
+ first_b0 = vlib_get_buffer(vm, first_bi0);
+ }
+ else
+ buffer_add_to_chain(vm, bi0, first_bi0, prev_bi0);
+
+ offset += bytes_to_copy;
+ data_len -= bytes_to_copy;
+ }
+
+ /* trace */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT(first_b0);
+ if (PREDICT_FALSE(n_trace > 0))
+ {
+ netmap_input_trace_t *tr;
+ vlib_trace_buffer (vm, node, next0, first_b0, /* follow_chain */ 0);
+ vlib_set_trace_count (vm, node, --n_trace);
+ tr = vlib_add_trace (vm, node, first_b0, sizeof (*tr));
+ tr->next_index = next0;
+ tr->hw_if_index = nif->hw_if_index;
+ memcpy (&tr->slot, slot, sizeof (struct netmap_slot));
+ }
+ /* enque and take next packet */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, first_bi0, next0);
+
+ /* next packet */
+ n_rx_packets++;
+ n_rx_bytes += slot->len;
+ to_next[0] = first_bi0;
+ to_next += 1;
+ n_left_to_next--;
+ cur_slot_index = next_slot_index;
+
+ r--;
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ ring->head = ring->cur = cur_slot_index;
+ cur_ring++;
+ }
+
+ if (n_rx_packets)
+ ioctl(nif->fd, NIOCTXSYNC, NULL);
+
+ vlib_increment_combined_counter
+ (vnet_get_main()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ os_get_cpu_number(),
+ nif->hw_if_index,
+ n_rx_packets, n_rx_bytes);
+
+ return n_rx_packets;
+}
+
+static uword
+netmap_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ int i;
+ u32 n_rx_packets = 0;
+
+ netmap_main_t * nm = &netmap_main;
+
+ clib_bitmap_foreach (i, nm->pending_input_bitmap,
+ ({
+ clib_bitmap_set (nm->pending_input_bitmap, i, 0);
+ n_rx_packets += netmap_device_input_fn(vm, node, frame, i);
+ }));
+
+ return n_rx_packets;
+}
+
+
+VLIB_REGISTER_NODE (netmap_input_node) = {
+ .function = netmap_input_fn,
+ .name = "netmap-input",
+ .format_trace = format_netmap_input_trace,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .state = VLIB_NODE_STATE_INTERRUPT,
+ .n_errors = NETMAP_INPUT_N_ERROR,
+ .error_strings = netmap_input_error_strings,
+
+ .n_next_nodes = NETMAP_INPUT_N_NEXT,
+ .next_nodes = {
+ [NETMAP_INPUT_NEXT_DROP] = "error-drop",
+ [NETMAP_INPUT_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ },
+};
+
diff --git a/vnet/vnet/vcgn/cnat_cli_handler.c b/vnet/vnet/vcgn/cnat_cli_handler.c
index 46c8493501b..e76d7cc4248 100644
--- a/vnet/vnet/vcgn/cnat_cli_handler.c
+++ b/vnet/vnet/vcgn/cnat_cli_handler.c
@@ -37,6 +37,10 @@
#include <arpa/inet.h>
+#if DPDK
+#include <vnet/devices/dpdk/dpdk.h>
+#endif
+
u32 show_debug_level = 0;
u32
diff --git a/vnet/vnet/vnet.h b/vnet/vnet/vnet.h
index 2378c2420b8..9254da62372 100644
--- a/vnet/vnet/vnet.h
+++ b/vnet/vnet/vnet.h
@@ -84,9 +84,4 @@ vnet_main_t **vnet_mains;
#include <vnet/interface_funcs.h>
#include <vnet/global_funcs.h>
-#if DPDK > 0
-#include <vnet/devices/dpdk/threads.h>
-#include <vnet/dpdk_replication.h>
-#endif
-
#endif /* included_vnet_vnet_h */