diff options
Diffstat (limited to 'extras')
-rw-r--r-- | extras/deprecated/ixge/CMakeLists.txt | 20 | ||||
-rw-r--r-- | extras/deprecated/ixge/ixge.c | 2938 | ||||
-rw-r--r-- | extras/deprecated/ixge/ixge.h | 1292 | ||||
-rw-r--r-- | extras/deprecated/netmap/FEATURE.yaml | 12 | ||||
-rw-r--r-- | extras/deprecated/netmap/cli.c | 236 | ||||
-rw-r--r-- | extras/deprecated/netmap/device.c | 252 | ||||
-rw-r--r-- | extras/deprecated/netmap/dir.dox | 27 | ||||
-rw-r--r-- | extras/deprecated/netmap/net_netmap.h | 650 | ||||
-rw-r--r-- | extras/deprecated/netmap/netmap.api | 56 | ||||
-rw-r--r-- | extras/deprecated/netmap/netmap.c | 314 | ||||
-rw-r--r-- | extras/deprecated/netmap/netmap.h | 166 | ||||
-rw-r--r-- | extras/deprecated/netmap/netmap_api.c | 137 | ||||
-rw-r--r-- | extras/deprecated/netmap/node.c | 298 |
13 files changed, 6398 insertions, 0 deletions
diff --git a/extras/deprecated/ixge/CMakeLists.txt b/extras/deprecated/ixge/CMakeLists.txt new file mode 100644 index 00000000000..226652c72e2 --- /dev/null +++ b/extras/deprecated/ixge/CMakeLists.txt @@ -0,0 +1,20 @@ +# Copyright (c) 2018 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_vpp_plugin(ixge + SOURCES + ixge.c + + INSTALL_HEADERS + ixge.h +) diff --git a/extras/deprecated/ixge/ixge.c b/extras/deprecated/ixge/ixge.c new file mode 100644 index 00000000000..6ab79c9872c --- /dev/null +++ b/extras/deprecated/ixge/ixge.c @@ -0,0 +1,2938 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * WARNING! + * This driver is not intended for production use and it is unsupported. + * It is provided for educational use only. + * Please use supported DPDK driver instead. + */ + +#if __x86_64__ || __i386__ || __aarch64__ +#include <vppinfra/vector.h> + +#ifndef CLIB_HAVE_VEC128 +#warning HACK: ixge driver wont really work, missing u32x4 +typedef unsigned long long u32x4; +#endif + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vlib/pci/pci.h> +#include <vnet/vnet.h> +#include <ixge/ixge.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/plugin/plugin.h> +#include <vpp/app/version.h> + +#define IXGE_ALWAYS_POLL 0 + +#define EVENT_SET_FLAGS 0 +#define IXGE_HWBP_RACE_ELOG 0 + +#define PCI_VENDOR_ID_INTEL 0x8086 + +/* 10 GIG E (XGE) PHY IEEE 802.3 clause 45 definitions. */ +#define XGE_PHY_DEV_TYPE_PMA_PMD 1 +#define XGE_PHY_DEV_TYPE_PHY_XS 4 +#define XGE_PHY_ID1 0x2 +#define XGE_PHY_ID2 0x3 +#define XGE_PHY_CONTROL 0x0 +#define XGE_PHY_CONTROL_RESET (1 << 15) + +ixge_main_t ixge_main; +static vlib_node_registration_t ixge_input_node; +static vlib_node_registration_t ixge_process_node; + +static void +ixge_semaphore_get (ixge_device_t * xd) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_regs_t *r = xd->regs; + u32 i; + + i = 0; + while (!(r->software_semaphore & (1 << 0))) + { + if (i > 0) + vlib_process_suspend (vm, 100e-6); + i++; + } + do + { + r->software_semaphore |= 1 << 1; + } + while (!(r->software_semaphore & (1 << 1))); +} + +static void +ixge_semaphore_release (ixge_device_t * xd) +{ + ixge_regs_t *r = xd->regs; + r->software_semaphore &= ~3; +} + +static void +ixge_software_firmware_sync (ixge_device_t * xd, u32 sw_mask) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_regs_t *r = xd->regs; + u32 fw_mask = sw_mask << 5; + u32 m, done = 0; + + while (!done) + { + ixge_semaphore_get (xd); + m = r->software_firmware_sync; + done = (m & fw_mask) == 0; + if (done) + r->software_firmware_sync = m | sw_mask; + ixge_semaphore_release (xd); + if (!done) + vlib_process_suspend (vm, 10e-3); + } +} + +static void +ixge_software_firmware_sync_release (ixge_device_t * xd, u32 sw_mask) +{ + ixge_regs_t *r = xd->regs; + ixge_semaphore_get (xd); + r->software_firmware_sync &= ~sw_mask; + ixge_semaphore_release (xd); +} + +u32 +ixge_read_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index, + u32 v, u32 is_read) +{ + ixge_regs_t *r = xd->regs; + const u32 busy_bit = 1 << 30; + u32 x; + + ASSERT (xd->phy_index < 2); + ixge_software_firmware_sync (xd, 1 << (1 + xd->phy_index)); + + ASSERT (reg_index < (1 << 16)); + ASSERT (dev_type < (1 << 5)); + if (!is_read) + r->xge_mac.phy_data = v; + + /* Address cycle. */ + x = + reg_index | (dev_type << 16) | (xd-> + phys[xd->phy_index].mdio_address << 21); + r->xge_mac.phy_command = x | busy_bit; + /* Busy wait timed to take 28e-6 secs. No suspend. */ + while (r->xge_mac.phy_command & busy_bit) + ; + + r->xge_mac.phy_command = x | ((is_read ? 2 : 1) << 26) | busy_bit; + while (r->xge_mac.phy_command & busy_bit) + ; + + if (is_read) + v = r->xge_mac.phy_data >> 16; + + ixge_software_firmware_sync_release (xd, 1 << (1 + xd->phy_index)); + + return v; +} + +static u32 +ixge_read_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index) +{ + return ixge_read_write_phy_reg (xd, dev_type, reg_index, 0, /* is_read */ + 1); +} + +static void +ixge_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index, u32 v) +{ + (void) ixge_read_write_phy_reg (xd, dev_type, reg_index, v, /* is_read */ + 0); +} + +static void +ixge_i2c_put_bits (i2c_bus_t * b, int scl, int sda) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data); + u32 v; + + v = 0; + v |= (sda != 0) << 3; + v |= (scl != 0) << 1; + xd->regs->i2c_control = v; +} + +static void +ixge_i2c_get_bits (i2c_bus_t * b, int *scl, int *sda) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data); + u32 v; + + v = xd->regs->i2c_control; + *sda = (v & (1 << 2)) != 0; + *scl = (v & (1 << 0)) != 0; +} + +static u16 +ixge_read_eeprom (ixge_device_t * xd, u32 address) +{ + ixge_regs_t *r = xd->regs; + u32 v; + r->eeprom_read = (( /* start bit */ (1 << 0)) | (address << 2)); + /* Wait for done bit. */ + while (!((v = r->eeprom_read) & (1 << 1))) + ; + return v >> 16; +} + +static void +ixge_sfp_enable_disable_laser (ixge_device_t * xd, uword enable) +{ + u32 tx_disable_bit = 1 << 3; + if (enable) + xd->regs->sdp_control &= ~tx_disable_bit; + else + xd->regs->sdp_control |= tx_disable_bit; +} + +static void +ixge_sfp_enable_disable_10g (ixge_device_t * xd, uword enable) +{ + u32 is_10g_bit = 1 << 5; + if (enable) + xd->regs->sdp_control |= is_10g_bit; + else + xd->regs->sdp_control &= ~is_10g_bit; +} + +static clib_error_t * +ixge_sfp_phy_init_from_eeprom (ixge_device_t * xd, u16 sfp_type) +{ + u16 a, id, reg_values_addr = 0; + + a = ixge_read_eeprom (xd, 0x2b); + if (a == 0 || a == 0xffff) + return clib_error_create ("no init sequence in eeprom"); + + while (1) + { + id = ixge_read_eeprom (xd, ++a); + if (id == 0xffff) + break; + reg_values_addr = ixge_read_eeprom (xd, ++a); + if (id == sfp_type) + break; + } + if (id != sfp_type) + return clib_error_create ("failed to find id 0x%x", sfp_type); + + ixge_software_firmware_sync (xd, 1 << 3); + while (1) + { + u16 v = ixge_read_eeprom (xd, ++reg_values_addr); + if (v == 0xffff) + break; + xd->regs->core_analog_config = v; + } + ixge_software_firmware_sync_release (xd, 1 << 3); + + /* Make sure laser is off. We'll turn on the laser when + the interface is brought up. */ + ixge_sfp_enable_disable_laser (xd, /* enable */ 0); + ixge_sfp_enable_disable_10g (xd, /* is_10g */ 1); + + return 0; +} + +static void +ixge_sfp_device_up_down (ixge_device_t * xd, uword is_up) +{ + u32 v; + + if (is_up) + { + /* pma/pmd 10g serial SFI. */ + xd->regs->xge_mac.auto_negotiation_control2 &= ~(3 << 16); + xd->regs->xge_mac.auto_negotiation_control2 |= 2 << 16; + + v = xd->regs->xge_mac.auto_negotiation_control; + v &= ~(7 << 13); + v |= (0 << 13); + /* Restart autoneg. */ + v |= (1 << 12); + xd->regs->xge_mac.auto_negotiation_control = v; + + while (!(xd->regs->xge_mac.link_partner_ability[0] & 0xf0000)) + ; + + v = xd->regs->xge_mac.auto_negotiation_control; + + /* link mode 10g sfi serdes */ + v &= ~(7 << 13); + v |= (3 << 13); + + /* Restart autoneg. */ + v |= (1 << 12); + xd->regs->xge_mac.auto_negotiation_control = v; + + xd->regs->xge_mac.link_status; + } + + ixge_sfp_enable_disable_laser (xd, /* enable */ is_up); + + /* Give time for link partner to notice that we're up. */ + if (is_up && vlib_in_process_context (vlib_get_main ())) + { + vlib_process_suspend (vlib_get_main (), 300e-3); + } +} + +always_inline ixge_dma_regs_t * +get_dma_regs (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 qi) +{ + ixge_regs_t *r = xd->regs; + ASSERT (qi < 128); + if (rt == VLIB_RX) + return qi < 64 ? &r->rx_dma0[qi] : &r->rx_dma1[qi - 64]; + else + return &r->tx_dma[qi]; +} + +static clib_error_t * +ixge_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index); + uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, hif->dev_instance); + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0); + + if (is_up) + { + xd->regs->rx_enable |= 1; + xd->regs->tx_dma_control |= 1; + dr->control |= 1 << 25; + while (!(dr->control & (1 << 25))) + ; + } + else + { + xd->regs->rx_enable &= ~1; + xd->regs->tx_dma_control &= ~1; + } + + ixge_sfp_device_up_down (xd, is_up); + + return /* no error */ 0; +} + +static void +ixge_sfp_phy_init (ixge_device_t * xd) +{ + ixge_phy_t *phy = xd->phys + xd->phy_index; + i2c_bus_t *ib = &xd->i2c_bus; + + ib->private_data = xd->device_index; + ib->put_bits = ixge_i2c_put_bits; + ib->get_bits = ixge_i2c_get_bits; + vlib_i2c_init (ib); + + vlib_i2c_read_eeprom (ib, 0x50, 0, 128, (u8 *) & xd->sfp_eeprom); + + if (vlib_i2c_bus_timed_out (ib) || !sfp_eeprom_is_valid (&xd->sfp_eeprom)) + xd->sfp_eeprom.id = SFP_ID_UNKNOWN; + else + { + /* FIXME 5 => SR/LR eeprom ID. */ + clib_error_t *e = + ixge_sfp_phy_init_from_eeprom (xd, 5 + xd->pci_function); + if (e) + clib_error_report (e); + } + + phy->mdio_address = ~0; +} + +static void +ixge_phy_init (ixge_device_t * xd) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_phy_t *phy = xd->phys + xd->phy_index; + + switch (xd->device_id) + { + case IXGE_82599_sfp: + case IXGE_82599_sfp_em: + case IXGE_82599_sfp_fcoe: + /* others? */ + return ixge_sfp_phy_init (xd); + + default: + break; + } + + /* Probe address of phy. */ + { + u32 i, v; + + phy->mdio_address = ~0; + for (i = 0; i < 32; i++) + { + phy->mdio_address = i; + v = ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1); + if (v != 0xffff && v != 0) + break; + } + + /* No PHY found? */ + if (i >= 32) + return; + } + + phy->id = + ((ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1) << 16) | + ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID2)); + + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, phy id 0x%d mdio address %d",.format_args = "i4i4i4",}; + struct + { + u32 instance, id, address; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->id = phy->id; + ed->address = phy->mdio_address; + } + + /* Reset phy. */ + ixge_write_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL, + XGE_PHY_CONTROL_RESET); + + /* Wait for self-clearning reset bit to clear. */ + do + { + vlib_process_suspend (vm, 1e-3); + } + while (ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL) & + XGE_PHY_CONTROL_RESET); +} + +static u8 * +format_ixge_rx_from_hw_descriptor (u8 * s, va_list * va) +{ + ixge_rx_from_hw_descriptor_t *d = + va_arg (*va, ixge_rx_from_hw_descriptor_t *); + u32 s0 = d->status[0], s2 = d->status[2]; + u32 is_ip4, is_ip6, is_ip, is_tcp, is_udp; + u32 indent = format_get_indent (s); + + s = format (s, "%s-owned", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE) ? "sw" : + "hw"); + s = + format (s, ", length this descriptor %d, l3 offset %d", + d->n_packet_bytes_this_descriptor, + IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s0)); + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) + s = format (s, ", end-of-packet"); + + s = format (s, "\n%U", format_white_space, indent); + + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR) + s = format (s, "layer2 error"); + + if (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2) + { + s = format (s, "layer 2 type %d", (s0 & 0x1f)); + return s; + } + + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN) + s = format (s, "vlan header 0x%x\n%U", d->vlan_tag, + format_white_space, indent); + + if ((is_ip4 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4))) + { + s = format (s, "ip4%s", + (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT) ? " options" : + ""); + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED) + s = format (s, " checksum %s", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) ? + "bad" : "ok"); + } + if ((is_ip6 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6))) + s = format (s, "ip6%s", + (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT) ? " extended" : + ""); + is_tcp = is_udp = 0; + if ((is_ip = (is_ip4 | is_ip6))) + { + is_tcp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP) != 0; + is_udp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP) != 0; + if (is_tcp) + s = format (s, ", tcp"); + if (is_udp) + s = format (s, ", udp"); + } + + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED) + s = format (s, ", tcp checksum %s", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR) ? "bad" : + "ok"); + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED) + s = format (s, ", udp checksum %s", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR) ? "bad" : + "ok"); + + return s; +} + +static u8 * +format_ixge_tx_descriptor (u8 * s, va_list * va) +{ + ixge_tx_descriptor_t *d = va_arg (*va, ixge_tx_descriptor_t *); + u32 s0 = d->status0, s1 = d->status1; + u32 indent = format_get_indent (s); + u32 v; + + s = format (s, "buffer 0x%Lx, %d packet bytes, %d bytes this buffer", + d->buffer_address, s1 >> 14, d->n_bytes_this_buffer); + + s = format (s, "\n%U", format_white_space, indent); + + if ((v = (s0 >> 0) & 3)) + s = format (s, "reserved 0x%x, ", v); + + if ((v = (s0 >> 2) & 3)) + s = format (s, "mac 0x%x, ", v); + + if ((v = (s0 >> 4) & 0xf) != 3) + s = format (s, "type 0x%x, ", v); + + s = format (s, "%s%s%s%s%s%s%s%s", + (s0 & (1 << 8)) ? "eop, " : "", + (s0 & (1 << 9)) ? "insert-fcs, " : "", + (s0 & (1 << 10)) ? "reserved26, " : "", + (s0 & (1 << 11)) ? "report-status, " : "", + (s0 & (1 << 12)) ? "reserved28, " : "", + (s0 & (1 << 13)) ? "is-advanced, " : "", + (s0 & (1 << 14)) ? "vlan-enable, " : "", + (s0 & (1 << 15)) ? "tx-segmentation, " : ""); + + if ((v = s1 & 0xf) != 0) + s = format (s, "status 0x%x, ", v); + + if ((v = (s1 >> 4) & 0xf)) + s = format (s, "context 0x%x, ", v); + + if ((v = (s1 >> 8) & 0x3f)) + s = format (s, "options 0x%x, ", v); + + return s; +} + +typedef struct +{ + ixge_descriptor_t before, after; + + u32 buffer_index; + + u16 device_index; + + u8 queue_index; + + u8 is_start_of_packet; + + /* Copy of VLIB buffer; packet data stored in pre_data. */ + vlib_buffer_t buffer; +} ixge_rx_dma_trace_t; + +static u8 * +format_ixge_rx_dma_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + vlib_node_t *node = va_arg (*va, vlib_node_t *); + vnet_main_t *vnm = vnet_get_main (); + ixge_rx_dma_trace_t *t = va_arg (*va, ixge_rx_dma_trace_t *); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index); + format_function_t *f; + u32 indent = format_get_indent (s); + + { + vnet_sw_interface_t *sw = + vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); + s = + format (s, "%U rx queue %d", format_vnet_sw_interface_name, vnm, sw, + t->queue_index); + } + + s = format (s, "\n%Ubefore: %U", + format_white_space, indent, + format_ixge_rx_from_hw_descriptor, &t->before); + s = format (s, "\n%Uafter : head/tail address 0x%Lx/0x%Lx", + format_white_space, indent, + t->after.rx_to_hw.head_address, t->after.rx_to_hw.tail_address); + + s = format (s, "\n%Ubuffer 0x%x: %U", + format_white_space, indent, + t->buffer_index, format_vnet_buffer, &t->buffer); + + s = format (s, "\n%U", format_white_space, indent); + + f = node->format_buffer; + if (!f || !t->is_start_of_packet) + f = format_hex_bytes; + s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); + + return s; +} + +#define foreach_ixge_error \ + _ (none, "no error") \ + _ (tx_full_drops, "tx ring full drops") \ + _ (ip4_checksum_error, "ip4 checksum errors") \ + _ (rx_alloc_fail, "rx buf alloc from free list failed") \ + _ (rx_alloc_no_physmem, "rx buf alloc failed no physmem") + +typedef enum +{ +#define _(f,s) IXGE_ERROR_##f, + foreach_ixge_error +#undef _ + IXGE_N_ERROR, +} ixge_error_t; + +always_inline void +ixge_rx_next_and_error_from_status_x1 (ixge_device_t * xd, + u32 s00, u32 s02, + u8 * next0, u8 * error0, u32 * flags0) +{ + u8 is0_ip4, is0_ip6, n0, e0; + u32 f0; + + e0 = IXGE_ERROR_none; + n0 = IXGE_RX_NEXT_ETHERNET_INPUT; + + is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; + n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0; + + e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) + ? IXGE_ERROR_ip4_checksum_error : e0); + + is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; + n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0; + + n0 = (xd->per_interface_next_index != ~0) ? + xd->per_interface_next_index : n0; + + /* Check for error. */ + n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0; + + f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED + | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) + ? VNET_BUFFER_F_L4_CHECKSUM_COMPUTED : 0); + + f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR + | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) + ? 0 : VNET_BUFFER_F_L4_CHECKSUM_CORRECT); + + *error0 = e0; + *next0 = n0; + *flags0 = f0; +} + +always_inline void +ixge_rx_next_and_error_from_status_x2 (ixge_device_t * xd, + u32 s00, u32 s02, + u32 s10, u32 s12, + u8 * next0, u8 * error0, u32 * flags0, + u8 * next1, u8 * error1, u32 * flags1) +{ + u8 is0_ip4, is0_ip6, n0, e0; + u8 is1_ip4, is1_ip6, n1, e1; + u32 f0, f1; + + e0 = e1 = IXGE_ERROR_none; + n0 = n1 = IXGE_RX_NEXT_IP4_INPUT; + + is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; + is1_ip4 = s12 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; + + n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0; + n1 = is1_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n1; + + e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) + ? IXGE_ERROR_ip4_checksum_error : e0); + e1 = (is1_ip4 && (s12 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) + ? IXGE_ERROR_ip4_checksum_error : e1); + + is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; + is1_ip6 = s10 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; + + n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0; + n1 = is1_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n1; + + n0 = (xd->per_interface_next_index != ~0) ? + xd->per_interface_next_index : n0; + n1 = (xd->per_interface_next_index != ~0) ? + xd->per_interface_next_index : n1; + + /* Check for error. */ + n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0; + n1 = e1 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n1; + + *error0 = e0; + *error1 = e1; + + *next0 = n0; + *next1 = n1; + + f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED + | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) + ? VNET_BUFFER_F_L4_CHECKSUM_COMPUTED : 0); + f1 = ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED + | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) + ? VNET_BUFFER_F_L4_CHECKSUM_COMPUTED : 0); + + f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR + | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) + ? 0 : VNET_BUFFER_F_L4_CHECKSUM_CORRECT); + f1 |= ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR + | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) + ? 0 : VNET_BUFFER_F_L4_CHECKSUM_CORRECT); + + *flags0 = f0; + *flags1 = f1; +} + +static void +ixge_rx_trace (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + ixge_descriptor_t * before_descriptors, + u32 * before_buffers, + ixge_descriptor_t * after_descriptors, uword n_descriptors) +{ + vlib_main_t *vm = xm->vlib_main; + vlib_node_runtime_t *node = dq->rx.node; + ixge_rx_from_hw_descriptor_t *bd; + ixge_rx_to_hw_descriptor_t *ad; + u32 *b, n_left, is_sop, next_index_sop; + + n_left = n_descriptors; + b = before_buffers; + bd = &before_descriptors->rx_from_hw; + ad = &after_descriptors->rx_to_hw; + is_sop = dq->rx.is_start_of_packet; + next_index_sop = dq->rx.saved_start_of_packet_next_index; + + while (n_left >= 2) + { + u32 bi0, bi1, flags0, flags1; + vlib_buffer_t *b0, *b1; + ixge_rx_dma_trace_t *t0, *t1; + u8 next0, error0, next1, error1; + + bi0 = b[0]; + bi1 = b[1]; + n_left -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + ixge_rx_next_and_error_from_status_x2 (xd, + bd[0].status[0], bd[0].status[2], + bd[1].status[0], bd[1].status[2], + &next0, &error0, &flags0, + &next1, &error1, &flags1); + + next_index_sop = is_sop ? next0 : next_index_sop; + vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0); + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + next_index_sop = is_sop ? next1 : next_index_sop; + vlib_trace_buffer (vm, node, next_index_sop, b1, /* follow_chain */ 0); + t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); + t1->is_start_of_packet = is_sop; + is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t1->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t1->device_index = xd->device_index; + t0->before.rx_from_hw = bd[0]; + t1->before.rx_from_hw = bd[1]; + t0->after.rx_to_hw = ad[0]; + t1->after.rx_to_hw = ad[1]; + t0->buffer_index = bi0; + t1->buffer_index = bi1; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + memcpy (t1->buffer.pre_data, b1->data + b1->current_data, + sizeof (t1->buffer.pre_data)); + + b += 2; + bd += 2; + ad += 2; + } + + while (n_left >= 1) + { + u32 bi0, flags0; + vlib_buffer_t *b0; + ixge_rx_dma_trace_t *t0; + u8 next0, error0; + + bi0 = b[0]; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + ixge_rx_next_and_error_from_status_x1 (xd, + bd[0].status[0], bd[0].status[2], + &next0, &error0, &flags0); + + next_index_sop = is_sop ? next0 : next_index_sop; + vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0); + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t0->before.rx_from_hw = bd[0]; + t0->after.rx_to_hw = ad[0]; + t0->buffer_index = bi0; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + + b += 1; + bd += 1; + ad += 1; + } +} + +typedef struct +{ + ixge_tx_descriptor_t descriptor; + + u32 buffer_index; + + u16 device_index; + + u8 queue_index; + + u8 is_start_of_packet; + + /* Copy of VLIB buffer; packet data stored in pre_data. */ + vlib_buffer_t buffer; +} ixge_tx_dma_trace_t; + +static u8 * +format_ixge_tx_dma_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + ixge_tx_dma_trace_t *t = va_arg (*va, ixge_tx_dma_trace_t *); + vnet_main_t *vnm = vnet_get_main (); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index); + format_function_t *f; + u32 indent = format_get_indent (s); + + { + vnet_sw_interface_t *sw = + vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); + s = + format (s, "%U tx queue %d", format_vnet_sw_interface_name, vnm, sw, + t->queue_index); + } + + s = format (s, "\n%Udescriptor: %U", + format_white_space, indent, + format_ixge_tx_descriptor, &t->descriptor); + + s = format (s, "\n%Ubuffer 0x%x: %U", + format_white_space, indent, + t->buffer_index, format_vnet_buffer, &t->buffer); + + s = format (s, "\n%U", format_white_space, indent); + + f = format_ethernet_header_with_length; + if (!f || !t->is_start_of_packet) + f = format_hex_bytes; + s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); + + return s; +} + +typedef struct +{ + vlib_node_runtime_t *node; + + u32 is_start_of_packet; + + u32 n_bytes_in_packet; + + ixge_tx_descriptor_t *start_of_packet_descriptor; +} ixge_tx_state_t; + +static void +ixge_tx_trace (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + ixge_tx_state_t * tx_state, + ixge_tx_descriptor_t * descriptors, + u32 * buffers, uword n_descriptors) +{ + vlib_main_t *vm = xm->vlib_main; + vlib_node_runtime_t *node = tx_state->node; + ixge_tx_descriptor_t *d; + u32 *b, n_left, is_sop; + + n_left = n_descriptors; + b = buffers; + d = descriptors; + is_sop = tx_state->is_start_of_packet; + + while (n_left >= 2) + { + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + ixge_tx_dma_trace_t *t0, *t1; + + bi0 = b[0]; + bi1 = b[1]; + n_left -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); + t1->is_start_of_packet = is_sop; + is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t1->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t1->device_index = xd->device_index; + t0->descriptor = d[0]; + t1->descriptor = d[1]; + t0->buffer_index = bi0; + t1->buffer_index = bi1; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + memcpy (t1->buffer.pre_data, b1->data + b1->current_data, + sizeof (t1->buffer.pre_data)); + + b += 2; + d += 2; + } + + while (n_left >= 1) + { + u32 bi0; + vlib_buffer_t *b0; + ixge_tx_dma_trace_t *t0; + + bi0 = b[0]; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t0->descriptor = d[0]; + t0->buffer_index = bi0; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + + b += 1; + d += 1; + } +} + +always_inline uword +ixge_ring_sub (ixge_dma_queue_t * q, u32 i0, u32 i1) +{ + i32 d = i1 - i0; + ASSERT (i0 < q->n_descriptors); + ASSERT (i1 < q->n_descriptors); + return d < 0 ? q->n_descriptors + d : d; +} + +always_inline uword +ixge_ring_add (ixge_dma_queue_t * q, u32 i0, u32 i1) +{ + u32 d = i0 + i1; + ASSERT (i0 < q->n_descriptors); + ASSERT (i1 < q->n_descriptors); + d -= d >= q->n_descriptors ? q->n_descriptors : 0; + return d; +} + +always_inline uword +ixge_tx_descriptor_matches_template (ixge_main_t * xm, + ixge_tx_descriptor_t * d) +{ + u32 cmp; + + cmp = ((d->status0 & xm->tx_descriptor_template_mask.status0) + ^ xm->tx_descriptor_template.status0); + if (cmp) + return 0; + cmp = ((d->status1 & xm->tx_descriptor_template_mask.status1) + ^ xm->tx_descriptor_template.status1); + if (cmp) + return 0; + + return 1; +} + +static uword +ixge_tx_no_wrap (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + u32 * buffers, + u32 start_descriptor_index, + u32 n_descriptors, ixge_tx_state_t * tx_state) +{ + vlib_main_t *vm = xm->vlib_main; + ixge_tx_descriptor_t *d, *d_sop; + u32 n_left = n_descriptors; + u32 *to_free = vec_end (xm->tx_buffers_pending_free); + u32 *to_tx = + vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index); + u32 is_sop = tx_state->is_start_of_packet; + u32 len_sop = tx_state->n_bytes_in_packet; + u16 template_status = xm->tx_descriptor_template.status0; + u32 descriptor_prefetch_rotor = 0; + + ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors); + d = &dq->descriptors[start_descriptor_index].tx; + d_sop = is_sop ? d : tx_state->start_of_packet_descriptor; + + while (n_left >= 4) + { + vlib_buffer_t *b0, *b1; + u32 bi0, fi0, len0; + u32 bi1, fi1, len1; + u8 is_eop0, is_eop1; + + /* Prefetch next iteration. */ + vlib_prefetch_buffer_with_index (vm, buffers[2], LOAD); + vlib_prefetch_buffer_with_index (vm, buffers[3], LOAD); + + if ((descriptor_prefetch_rotor & 0x3) == 0) + CLIB_PREFETCH (d + 4, CLIB_CACHE_LINE_BYTES, STORE); + + descriptor_prefetch_rotor += 2; + + bi0 = buffers[0]; + bi1 = buffers[1]; + + to_free[0] = fi0 = to_tx[0]; + to_tx[0] = bi0; + to_free += fi0 != 0; + + to_free[0] = fi1 = to_tx[1]; + to_tx[1] = bi1; + to_free += fi1 != 0; + + buffers += 2; + n_left -= 2; + to_tx += 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + is_eop1 = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + len0 = b0->current_length; + len1 = b1->current_length; + + ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0)); + ASSERT (ixge_tx_descriptor_matches_template (xm, d + 1)); + + d[0].buffer_address = vlib_buffer_get_pa (vm, b0); + d[1].buffer_address = vlib_buffer_get_pa (vm, b1); + + d[0].n_bytes_this_buffer = len0; + d[1].n_bytes_this_buffer = len1; + + d[0].status0 = + template_status | (is_eop0 << + IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); + d[1].status0 = + template_status | (is_eop1 << + IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); + + len_sop = (is_sop ? 0 : len_sop) + len0; + d_sop[0].status1 = + IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); + d += 1; + d_sop = is_eop0 ? d : d_sop; + + is_sop = is_eop0; + + len_sop = (is_sop ? 0 : len_sop) + len1; + d_sop[0].status1 = + IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); + d += 1; + d_sop = is_eop1 ? d : d_sop; + + is_sop = is_eop1; + } + + while (n_left > 0) + { + vlib_buffer_t *b0; + u32 bi0, fi0, len0; + u8 is_eop0; + + bi0 = buffers[0]; + + to_free[0] = fi0 = to_tx[0]; + to_tx[0] = bi0; + to_free += fi0 != 0; + + buffers += 1; + n_left -= 1; + to_tx += 1; + + b0 = vlib_get_buffer (vm, bi0); + + is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + len0 = b0->current_length; + + ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0)); + + d[0].buffer_address = vlib_buffer_get_pa (vm, b0); + d[0].n_bytes_this_buffer = len0; + + d[0].status0 = + template_status | (is_eop0 << + IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); + + len_sop = (is_sop ? 0 : len_sop) + len0; + d_sop[0].status1 = + IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); + d += 1; + d_sop = is_eop0 ? d : d_sop; + + is_sop = is_eop0; + } + + if (tx_state->node->flags & VLIB_NODE_FLAG_TRACE) + { + to_tx = + vec_elt_at_index (dq->descriptor_buffer_indices, + start_descriptor_index); + ixge_tx_trace (xm, xd, dq, tx_state, + &dq->descriptors[start_descriptor_index].tx, to_tx, + n_descriptors); + } + + _vec_len (xm->tx_buffers_pending_free) = + to_free - xm->tx_buffers_pending_free; + + /* When we are done d_sop can point to end of ring. Wrap it if so. */ + { + ixge_tx_descriptor_t *d_start = &dq->descriptors[0].tx; + + ASSERT (d_sop - d_start <= dq->n_descriptors); + d_sop = d_sop - d_start == dq->n_descriptors ? d_start : d_sop; + } + + tx_state->is_start_of_packet = is_sop; + tx_state->start_of_packet_descriptor = d_sop; + tx_state->n_bytes_in_packet = len_sop; + + return n_descriptors; +} + +static uword +ixge_interface_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + ixge_main_t *xm = &ixge_main; + vnet_interface_output_runtime_t *rd = (void *) node->runtime_data; + ixge_device_t *xd = vec_elt_at_index (xm->devices, rd->dev_instance); + ixge_dma_queue_t *dq; + u32 *from, n_left_tx, n_descriptors_to_tx, n_tail_drop; + u32 queue_index = 0; /* fixme parameter */ + ixge_tx_state_t tx_state; + + tx_state.node = node; + tx_state.is_start_of_packet = 1; + tx_state.start_of_packet_descriptor = 0; + tx_state.n_bytes_in_packet = 0; + + from = vlib_frame_vector_args (f); + + dq = vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index); + + dq->head_index = dq->tx.head_index_write_back[0]; + + /* Since head == tail means ring is empty we can send up to dq->n_descriptors - 1. */ + n_left_tx = dq->n_descriptors - 1; + n_left_tx -= ixge_ring_sub (dq, dq->head_index, dq->tail_index); + + _vec_len (xm->tx_buffers_pending_free) = 0; + + n_descriptors_to_tx = f->n_vectors; + n_tail_drop = 0; + if (PREDICT_FALSE (n_descriptors_to_tx > n_left_tx)) + { + i32 i, n_ok, i_eop, i_sop; + + i_sop = i_eop = ~0; + for (i = n_left_tx - 1; i >= 0; i--) + { + vlib_buffer_t *b = vlib_get_buffer (vm, from[i]); + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + if (i_sop != ~0 && i_eop != ~0) + break; + i_eop = i; + i_sop = i + 1; + } + } + if (i == 0) + n_ok = 0; + else + n_ok = i_eop + 1; + + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, ring full to tx %d head %d tail %d",.format_args = + "i2i2i2i2",}; + struct + { + u16 instance, to_tx, head, tail; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->to_tx = n_descriptors_to_tx; + ed->head = dq->head_index; + ed->tail = dq->tail_index; + } + + if (n_ok < n_descriptors_to_tx) + { + n_tail_drop = n_descriptors_to_tx - n_ok; + vec_add (xm->tx_buffers_pending_free, from + n_ok, n_tail_drop); + vlib_error_count (vm, ixge_input_node.index, + IXGE_ERROR_tx_full_drops, n_tail_drop); + } + + n_descriptors_to_tx = n_ok; + } + + dq->tx.n_buffers_on_ring += n_descriptors_to_tx; + + /* Process from tail to end of descriptor ring. */ + if (n_descriptors_to_tx > 0 && dq->tail_index < dq->n_descriptors) + { + u32 n = + clib_min (dq->n_descriptors - dq->tail_index, n_descriptors_to_tx); + n = ixge_tx_no_wrap (xm, xd, dq, from, dq->tail_index, n, &tx_state); + from += n; + n_descriptors_to_tx -= n; + dq->tail_index += n; + ASSERT (dq->tail_index <= dq->n_descriptors); + if (dq->tail_index == dq->n_descriptors) + dq->tail_index = 0; + } + + if (n_descriptors_to_tx > 0) + { + u32 n = + ixge_tx_no_wrap (xm, xd, dq, from, 0, n_descriptors_to_tx, &tx_state); + from += n; + ASSERT (n == n_descriptors_to_tx); + dq->tail_index += n; + ASSERT (dq->tail_index <= dq->n_descriptors); + if (dq->tail_index == dq->n_descriptors) + dq->tail_index = 0; + } + + /* We should only get full packets. */ + ASSERT (tx_state.is_start_of_packet); + + /* Report status when last descriptor is done. */ + { + u32 i = dq->tail_index == 0 ? dq->n_descriptors - 1 : dq->tail_index - 1; + ixge_tx_descriptor_t *d = &dq->descriptors[i].tx; + d->status0 |= IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS; + } + + /* Give new descriptors to hardware. */ + { + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_TX, queue_index); + + CLIB_MEMORY_BARRIER (); + + dr->tail_index = dq->tail_index; + } + + /* Free any buffers that are done. */ + { + u32 n = _vec_len (xm->tx_buffers_pending_free); + if (n > 0) + { + vlib_buffer_free_no_next (vm, xm->tx_buffers_pending_free, n); + _vec_len (xm->tx_buffers_pending_free) = 0; + ASSERT (dq->tx.n_buffers_on_ring >= n); + dq->tx.n_buffers_on_ring -= (n - n_tail_drop); + } + } + + return f->n_vectors; +} + +static uword +ixge_rx_queue_no_wrap (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + u32 start_descriptor_index, u32 n_descriptors) +{ + vlib_main_t *vm = xm->vlib_main; + vlib_node_runtime_t *node = dq->rx.node; + ixge_descriptor_t *d; + static ixge_descriptor_t *d_trace_save; + static u32 *d_trace_buffers; + u32 n_descriptors_left = n_descriptors; + u32 *to_rx = + vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index); + u32 *to_add; + u32 bi_sop = dq->rx.saved_start_of_packet_buffer_index; + u32 bi_last = dq->rx.saved_last_buffer_index; + u32 next_index_sop = dq->rx.saved_start_of_packet_next_index; + u32 is_sop = dq->rx.is_start_of_packet; + u32 next_index, n_left_to_next, *to_next; + u32 n_packets = 0; + u32 n_bytes = 0; + u32 n_trace = vlib_get_trace_count (vm, node); + vlib_buffer_t *b_last, b_dummy; + + ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors); + d = &dq->descriptors[start_descriptor_index]; + + b_last = bi_last != ~0 ? vlib_get_buffer (vm, bi_last) : &b_dummy; + next_index = dq->rx.next_index; + + if (n_trace > 0) + { + u32 n = clib_min (n_trace, n_descriptors); + if (d_trace_save) + { + _vec_len (d_trace_save) = 0; + _vec_len (d_trace_buffers) = 0; + } + vec_add (d_trace_save, (ixge_descriptor_t *) d, n); + vec_add (d_trace_buffers, to_rx, n); + } + + { + uword l = vec_len (xm->rx_buffers_to_add); + + if (l < n_descriptors_left) + { + u32 n_to_alloc = 2 * dq->n_descriptors - l; + u32 n_allocated; + + vec_resize (xm->rx_buffers_to_add, n_to_alloc); + + _vec_len (xm->rx_buffers_to_add) = l; + n_allocated = + vlib_buffer_alloc (vm, xm->rx_buffers_to_add + l, n_to_alloc); + _vec_len (xm->rx_buffers_to_add) += n_allocated; + + /* Handle transient allocation failure */ + if (PREDICT_FALSE (l + n_allocated <= n_descriptors_left)) + { + if (n_allocated == 0) + vlib_error_count (vm, ixge_input_node.index, + IXGE_ERROR_rx_alloc_no_physmem, 1); + else + vlib_error_count (vm, ixge_input_node.index, + IXGE_ERROR_rx_alloc_fail, 1); + + n_descriptors_left = l + n_allocated; + } + n_descriptors = n_descriptors_left; + } + + /* Add buffers from end of vector going backwards. */ + to_add = vec_end (xm->rx_buffers_to_add) - 1; + } + + while (n_descriptors_left > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_descriptors_left >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t *b0, *b1; + vlib_buffer_t *f0, *f1; + u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0; + u32 bi1, fi1, len1, l3_offset1, s21, s01, flags1; + u8 is_eop0, error0, next0; + u8 is_eop1, error1, next1; + ixge_descriptor_t d0, d1; + + vlib_prefetch_buffer_with_index (vm, to_rx[2], STORE); + vlib_prefetch_buffer_with_index (vm, to_rx[3], STORE); + + CLIB_PREFETCH (d + 2, 32, STORE); + + d0.as_u32x4 = d[0].as_u32x4; + d1.as_u32x4 = d[1].as_u32x4; + + s20 = d0.rx_from_hw.status[2]; + s21 = d1.rx_from_hw.status[2]; + + s00 = d0.rx_from_hw.status[0]; + s01 = d1.rx_from_hw.status[0]; + + if (! + ((s20 & s21) & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE)) + goto found_hw_owned_descriptor_x2; + + bi0 = to_rx[0]; + bi1 = to_rx[1]; + + ASSERT (to_add - 1 >= xm->rx_buffers_to_add); + fi0 = to_add[0]; + fi1 = to_add[-1]; + + to_rx[0] = fi0; + to_rx[1] = fi1; + to_rx += 2; + to_add -= 2; + +#if 0 + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == vlib_buffer_is_known (bi0)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == vlib_buffer_is_known (bi1)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == vlib_buffer_is_known (fi0)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == vlib_buffer_is_known (fi1)); +#endif + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... See main.c... + */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); + + CLIB_PREFETCH (b0->data, CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (b1->data, CLIB_CACHE_LINE_BYTES, LOAD); + + is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; + is_eop1 = (s21 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; + + ixge_rx_next_and_error_from_status_x2 (xd, s00, s20, s01, s21, + &next0, &error0, &flags0, + &next1, &error1, &flags1); + + next0 = is_sop ? next0 : next_index_sop; + next1 = is_eop0 ? next1 : next0; + next_index_sop = next1; + + b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT); + b1->flags |= flags1 | (!is_eop1 << VLIB_BUFFER_LOG2_NEXT_PRESENT); + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b1)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; + + b0->error = node->errors[error0]; + b1->error = node->errors[error1]; + + len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor; + len1 = d1.rx_from_hw.n_packet_bytes_this_descriptor; + n_bytes += len0 + len1; + n_packets += is_eop0 + is_eop1; + + /* Give new buffers to hardware. */ + f0 = vlib_get_buffer (vm, fi0); + f1 = vlib_get_buffer (vm, fi1); + d0.rx_to_hw.tail_address = vlib_buffer_get_pa (vm, f0); + d1.rx_to_hw.tail_address = vlib_buffer_get_pa (vm, f1); + d0.rx_to_hw.head_address = d[0].rx_to_hw.tail_address; + d1.rx_to_hw.head_address = d[1].rx_to_hw.tail_address; + d[0].as_u32x4 = d0.as_u32x4; + d[1].as_u32x4 = d1.as_u32x4; + + d += 2; + n_descriptors_left -= 2; + + /* Point to either l2 or l3 header depending on next. */ + l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT)) + ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0; + l3_offset1 = (is_eop0 && (next1 != IXGE_RX_NEXT_ETHERNET_INPUT)) + ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s01) : 0; + + b0->current_length = len0 - l3_offset0; + b1->current_length = len1 - l3_offset1; + b0->current_data = l3_offset0; + b1->current_data = l3_offset1; + + b_last->next_buffer = is_sop ? ~0 : bi0; + b0->next_buffer = is_eop0 ? ~0 : bi1; + bi_last = bi1; + b_last = b1; + + if (CLIB_DEBUG > 0) + { + u32 bi_sop0 = is_sop ? bi0 : bi_sop; + u32 bi_sop1 = is_eop0 ? bi1 : bi_sop0; + + if (is_eop0) + { + u8 *msg = vlib_validate_buffer (vm, bi_sop0, + /* follow_buffer_next */ 1); + ASSERT (!msg); + } + if (is_eop1) + { + u8 *msg = vlib_validate_buffer (vm, bi_sop1, + /* follow_buffer_next */ 1); + ASSERT (!msg); + } + } + if (0) /* "Dave" version */ + { + u32 bi_sop0 = is_sop ? bi0 : bi_sop; + u32 bi_sop1 = is_eop0 ? bi1 : bi_sop0; + + if (is_eop0) + { + to_next[0] = bi_sop0; + to_next++; + n_left_to_next--; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi_sop0, next0); + } + if (is_eop1) + { + to_next[0] = bi_sop1; + to_next++; + n_left_to_next--; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi_sop1, next1); + } + is_sop = is_eop1; + bi_sop = bi_sop1; + } + if (1) /* "Eliot" version */ + { + /* Speculatively enqueue to cached next. */ + u8 saved_is_sop = is_sop; + u32 bi_sop_save = bi_sop; + + bi_sop = saved_is_sop ? bi0 : bi_sop; + to_next[0] = bi_sop; + to_next += is_eop0; + n_left_to_next -= is_eop0; + + bi_sop = is_eop0 ? bi1 : bi_sop; + to_next[0] = bi_sop; + to_next += is_eop1; + n_left_to_next -= is_eop1; + + is_sop = is_eop1; + + if (PREDICT_FALSE + (!(next0 == next_index && next1 == next_index))) + { + /* Undo speculation. */ + to_next -= is_eop0 + is_eop1; + n_left_to_next += is_eop0 + is_eop1; + + /* Re-do both descriptors being careful about where we enqueue. */ + bi_sop = saved_is_sop ? bi0 : bi_sop_save; + if (is_eop0) + { + if (next0 != next_index) + vlib_set_next_frame_buffer (vm, node, next0, bi_sop); + else + { + to_next[0] = bi_sop; + to_next += 1; + n_left_to_next -= 1; + } + } + + bi_sop = is_eop0 ? bi1 : bi_sop; + if (is_eop1) + { + if (next1 != next_index) + vlib_set_next_frame_buffer (vm, node, next1, bi_sop); + else + { + to_next[0] = bi_sop; + to_next += 1; + n_left_to_next -= 1; + } + } + + /* Switch cached next index when next for both packets is the same. */ + if (is_eop0 && is_eop1 && next0 == next1) + { + vlib_put_next_frame (vm, node, next_index, + n_left_to_next); + next_index = next0; + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + } + } + } + } + + /* Bail out of dual loop and proceed with single loop. */ + found_hw_owned_descriptor_x2: + + while (n_descriptors_left > 0 && n_left_to_next > 0) + { + vlib_buffer_t *b0; + vlib_buffer_t *f0; + u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0; + u8 is_eop0, error0, next0; + ixge_descriptor_t d0; + + d0.as_u32x4 = d[0].as_u32x4; + + s20 = d0.rx_from_hw.status[2]; + s00 = d0.rx_from_hw.status[0]; + + if (!(s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE)) + goto found_hw_owned_descriptor_x1; + + bi0 = to_rx[0]; + ASSERT (to_add >= xm->rx_buffers_to_add); + fi0 = to_add[0]; + + to_rx[0] = fi0; + to_rx += 1; + to_add -= 1; + +#if 0 + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == vlib_buffer_is_known (bi0)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == vlib_buffer_is_known (fi0)); +#endif + + b0 = vlib_get_buffer (vm, bi0); + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... + */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + + is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; + ixge_rx_next_and_error_from_status_x1 + (xd, s00, s20, &next0, &error0, &flags0); + + next0 = is_sop ? next0 : next_index_sop; + next_index_sop = next0; + + b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT); + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + + b0->error = node->errors[error0]; + + len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor; + n_bytes += len0; + n_packets += is_eop0; + + /* Give new buffer to hardware. */ + f0 = vlib_get_buffer (vm, fi0); + d0.rx_to_hw.tail_address = vlib_buffer_get_pa (vm, f0); + d0.rx_to_hw.head_address = d0.rx_to_hw.tail_address; + d[0].as_u32x4 = d0.as_u32x4; + + d += 1; + n_descriptors_left -= 1; + + /* Point to either l2 or l3 header depending on next. */ + l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT)) + ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0; + b0->current_length = len0 - l3_offset0; + b0->current_data = l3_offset0; + + b_last->next_buffer = is_sop ? ~0 : bi0; + bi_last = bi0; + b_last = b0; + + bi_sop = is_sop ? bi0 : bi_sop; + + if (CLIB_DEBUG > 0 && is_eop0) + { + u8 *msg = + vlib_validate_buffer (vm, bi_sop, /* follow_buffer_next */ 1); + ASSERT (!msg); + } + + if (0) /* "Dave" version */ + { + if (is_eop0) + { + to_next[0] = bi_sop; + to_next++; + n_left_to_next--; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi_sop, next0); + } + } + if (1) /* "Eliot" version */ + { + if (PREDICT_TRUE (next0 == next_index)) + { + to_next[0] = bi_sop; + to_next += is_eop0; + n_left_to_next -= is_eop0; + } + else + { + if (next0 != next_index && is_eop0) + vlib_set_next_frame_buffer (vm, node, next0, bi_sop); + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + next_index = next0; + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + } + } + is_sop = is_eop0; + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + +found_hw_owned_descriptor_x1: + if (n_descriptors_left > 0) + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + + _vec_len (xm->rx_buffers_to_add) = (to_add + 1) - xm->rx_buffers_to_add; + + { + u32 n_done = n_descriptors - n_descriptors_left; + + if (n_trace > 0 && n_done > 0) + { + u32 n = clib_min (n_trace, n_done); + ixge_rx_trace (xm, xd, dq, + d_trace_save, + d_trace_buffers, + &dq->descriptors[start_descriptor_index], n); + vlib_set_trace_count (vm, node, n_trace - n); + } + if (d_trace_save) + { + _vec_len (d_trace_save) = 0; + _vec_len (d_trace_buffers) = 0; + } + + /* Don't keep a reference to b_last if we don't have to. + Otherwise we can over-write a next_buffer pointer after already haven + enqueued a packet. */ + if (is_sop) + { + b_last->next_buffer = ~0; + bi_last = ~0; + } + + dq->rx.n_descriptors_done_this_call = n_done; + dq->rx.n_descriptors_done_total += n_done; + dq->rx.is_start_of_packet = is_sop; + dq->rx.saved_start_of_packet_buffer_index = bi_sop; + dq->rx.saved_last_buffer_index = bi_last; + dq->rx.saved_start_of_packet_next_index = next_index_sop; + dq->rx.next_index = next_index; + dq->rx.n_bytes += n_bytes; + + return n_packets; + } +} + +static uword +ixge_rx_queue (ixge_main_t * xm, + ixge_device_t * xd, + vlib_node_runtime_t * node, u32 queue_index) +{ + ixge_dma_queue_t *dq = + vec_elt_at_index (xd->dma_queues[VLIB_RX], queue_index); + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, dq->queue_index); + uword n_packets = 0; + u32 hw_head_index, sw_head_index; + + /* One time initialization. */ + if (!dq->rx.node) + { + dq->rx.node = node; + dq->rx.is_start_of_packet = 1; + dq->rx.saved_start_of_packet_buffer_index = ~0; + dq->rx.saved_last_buffer_index = ~0; + } + + dq->rx.next_index = node->cached_next_index; + + dq->rx.n_descriptors_done_total = 0; + dq->rx.n_descriptors_done_this_call = 0; + dq->rx.n_bytes = 0; + + /* Fetch head from hardware and compare to where we think we are. */ + hw_head_index = dr->head_index; + sw_head_index = dq->head_index; + + if (hw_head_index == sw_head_index) + goto done; + + if (hw_head_index < sw_head_index) + { + u32 n_tried = dq->n_descriptors - sw_head_index; + n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried); + sw_head_index = + ixge_ring_add (dq, sw_head_index, + dq->rx.n_descriptors_done_this_call); + + if (dq->rx.n_descriptors_done_this_call != n_tried) + goto done; + } + if (hw_head_index >= sw_head_index) + { + u32 n_tried = hw_head_index - sw_head_index; + n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried); + sw_head_index = + ixge_ring_add (dq, sw_head_index, + dq->rx.n_descriptors_done_this_call); + } + +done: + dq->head_index = sw_head_index; + dq->tail_index = + ixge_ring_add (dq, dq->tail_index, dq->rx.n_descriptors_done_total); + + /* Give tail back to hardware. */ + CLIB_MEMORY_BARRIER (); + + dr->tail_index = dq->tail_index; + + vlib_increment_combined_counter (vnet_main. + interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + 0 /* thread_index */ , + xd->vlib_sw_if_index, n_packets, + dq->rx.n_bytes); + + return n_packets; +} + +static void +ixge_interrupt (ixge_main_t * xm, ixge_device_t * xd, u32 i) +{ + vlib_main_t *vm = xm->vlib_main; + ixge_regs_t *r = xd->regs; + + if (i != 20) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, %s",.format_args = "i1t1",.n_enum_strings = + 16,.enum_strings = + { + "flow director", + "rx miss", + "pci exception", + "mailbox", + "link status change", + "linksec key exchange", + "manageability event", + "reserved23", + "sdp0", + "sdp1", + "sdp2", + "sdp3", + "ecc", "descriptor handler error", "tcp timer", "other",},}; + struct + { + u8 instance; + u8 index; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->index = i - 16; + } + else + { + u32 v = r->xge_mac.link_status; + uword is_up = (v & (1 << 30)) != 0; + + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, link status change 0x%x",.format_args = "i4i4",}; + struct + { + u32 instance, link_status; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->link_status = v; + xd->link_status_at_last_link_change = v; + + vlib_process_signal_event (vm, ixge_process_node.index, + EVENT_SET_FLAGS, + ((is_up << 31) | xd->vlib_hw_if_index)); + } +} + +always_inline u32 +clean_block (u32 * b, u32 * t, u32 n_left) +{ + u32 *t0 = t; + + while (n_left >= 4) + { + u32 bi0, bi1, bi2, bi3; + + t[0] = bi0 = b[0]; + b[0] = 0; + t += bi0 != 0; + + t[0] = bi1 = b[1]; + b[1] = 0; + t += bi1 != 0; + + t[0] = bi2 = b[2]; + b[2] = 0; + t += bi2 != 0; + + t[0] = bi3 = b[3]; + b[3] = 0; + t += bi3 != 0; + + b += 4; + n_left -= 4; + } + + while (n_left > 0) + { + u32 bi0; + + t[0] = bi0 = b[0]; + b[0] = 0; + t += bi0 != 0; + b += 1; + n_left -= 1; + } + + return t - t0; +} + +static void +ixge_tx_queue (ixge_main_t * xm, ixge_device_t * xd, u32 queue_index) +{ + vlib_main_t *vm = xm->vlib_main; + ixge_dma_queue_t *dq = + vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index); + u32 n_clean, *b, *t, *t0; + i32 n_hw_owned_descriptors; + i32 first_to_clean, last_to_clean; + u64 hwbp_race = 0; + + /* Handle case where head write back pointer update + * arrives after the interrupt during high PCI bus loads. + */ + while ((dq->head_index == dq->tx.head_index_write_back[0]) && + dq->tx.n_buffers_on_ring && (dq->head_index != dq->tail_index)) + { + hwbp_race++; + if (IXGE_HWBP_RACE_ELOG && (hwbp_race == 1)) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d tx head index race: head %4d, tail %4d, buffs %4d",.format_args + = "i4i4i4i4",}; + struct + { + u32 instance, head_index, tail_index, n_buffers_on_ring; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->head_index = dq->head_index; + ed->tail_index = dq->tail_index; + ed->n_buffers_on_ring = dq->tx.n_buffers_on_ring; + } + } + + dq->head_index = dq->tx.head_index_write_back[0]; + n_hw_owned_descriptors = ixge_ring_sub (dq, dq->head_index, dq->tail_index); + ASSERT (dq->tx.n_buffers_on_ring >= n_hw_owned_descriptors); + n_clean = dq->tx.n_buffers_on_ring - n_hw_owned_descriptors; + + if (IXGE_HWBP_RACE_ELOG && hwbp_race) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d tx head index race: head %4d, hw_owned %4d, n_clean %4d, retries %d",.format_args + = "i4i4i4i4i4",}; + struct + { + u32 instance, head_index, n_hw_owned_descriptors, n_clean, retries; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->head_index = dq->head_index; + ed->n_hw_owned_descriptors = n_hw_owned_descriptors; + ed->n_clean = n_clean; + ed->retries = hwbp_race; + } + + /* + * This function used to wait until hardware owned zero descriptors. + * At high PPS rates, that doesn't happen until the TX ring is + * completely full of descriptors which need to be cleaned up. + * That, in turn, causes TX ring-full drops and/or long RX service + * interruptions. + */ + if (n_clean == 0) + return; + + /* Clean the n_clean descriptors prior to the reported hardware head */ + last_to_clean = dq->head_index - 1; + last_to_clean = (last_to_clean < 0) ? last_to_clean + dq->n_descriptors : + last_to_clean; + + first_to_clean = (last_to_clean) - (n_clean - 1); + first_to_clean = (first_to_clean < 0) ? first_to_clean + dq->n_descriptors : + first_to_clean; + + vec_resize (xm->tx_buffers_pending_free, dq->n_descriptors - 1); + t0 = t = xm->tx_buffers_pending_free; + b = dq->descriptor_buffer_indices + first_to_clean; + + /* Wrap case: clean from first to end, then start to last */ + if (first_to_clean > last_to_clean) + { + t += clean_block (b, t, (dq->n_descriptors - 1) - first_to_clean); + first_to_clean = 0; + b = dq->descriptor_buffer_indices; + } + + /* Typical case: clean from first to last */ + if (first_to_clean <= last_to_clean) + t += clean_block (b, t, (last_to_clean - first_to_clean) + 1); + + if (t > t0) + { + u32 n = t - t0; + vlib_buffer_free_no_next (vm, t0, n); + ASSERT (dq->tx.n_buffers_on_ring >= n); + dq->tx.n_buffers_on_ring -= n; + _vec_len (xm->tx_buffers_pending_free) = 0; + } +} + +/* RX queue interrupts 0 thru 7; TX 8 thru 15. */ +always_inline uword +ixge_interrupt_is_rx_queue (uword i) +{ + return i < 8; +} + +always_inline uword +ixge_interrupt_is_tx_queue (uword i) +{ + return i >= 8 && i < 16; +} + +always_inline uword +ixge_tx_queue_to_interrupt (uword i) +{ + return 8 + i; +} + +always_inline uword +ixge_rx_queue_to_interrupt (uword i) +{ + return 0 + i; +} + +always_inline uword +ixge_interrupt_rx_queue (uword i) +{ + ASSERT (ixge_interrupt_is_rx_queue (i)); + return i - 0; +} + +always_inline uword +ixge_interrupt_tx_queue (uword i) +{ + ASSERT (ixge_interrupt_is_tx_queue (i)); + return i - 8; +} + +static uword +ixge_device_input (ixge_main_t * xm, + ixge_device_t * xd, vlib_node_runtime_t * node) +{ + ixge_regs_t *r = xd->regs; + u32 i, s; + uword n_rx_packets = 0; + + s = r->interrupt.status_write_1_to_set; + if (s) + r->interrupt.status_write_1_to_clear = s; + + /* *INDENT-OFF* */ + foreach_set_bit (i, s, ({ + if (ixge_interrupt_is_rx_queue (i)) + n_rx_packets += ixge_rx_queue (xm, xd, node, ixge_interrupt_rx_queue (i)); + + else if (ixge_interrupt_is_tx_queue (i)) + ixge_tx_queue (xm, xd, ixge_interrupt_tx_queue (i)); + + else + ixge_interrupt (xm, xd, i); + })); + /* *INDENT-ON* */ + + return n_rx_packets; +} + +static uword +ixge_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd; + uword n_rx_packets = 0; + + if (node->state == VLIB_NODE_STATE_INTERRUPT) + { + uword i; + + /* Loop over devices with interrupts. */ + /* *INDENT-OFF* */ + foreach_set_bit (i, node->runtime_data[0], ({ + xd = vec_elt_at_index (xm->devices, i); + n_rx_packets += ixge_device_input (xm, xd, node); + + /* Re-enable interrupts since we're going to stay in interrupt mode. */ + if (! (node->flags & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)) + xd->regs->interrupt.enable_write_1_to_set = ~0; + })); + /* *INDENT-ON* */ + + /* Clear mask of devices with pending interrupts. */ + node->runtime_data[0] = 0; + } + else + { + /* Poll all devices for input/interrupts. */ + vec_foreach (xd, xm->devices) + { + n_rx_packets += ixge_device_input (xm, xd, node); + + /* Re-enable interrupts when switching out of polling mode. */ + if (node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) + xd->regs->interrupt.enable_write_1_to_set = ~0; + } + } + + return n_rx_packets; +} + +static char *ixge_error_strings[] = { +#define _(n,s) s, + foreach_ixge_error +#undef _ +}; + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ixge_input_node, static) = { + .function = ixge_input, + .type = VLIB_NODE_TYPE_INPUT, + .name = "ixge-input", + .flags = VLIB_NODE_FLAG_TRACE_SUPPORTED, + + /* Will be enabled if/when hardware is detected. */ + .state = VLIB_NODE_STATE_DISABLED, + + .format_buffer = format_ethernet_header_with_length, + .format_trace = format_ixge_rx_dma_trace, + + .n_errors = IXGE_N_ERROR, + .error_strings = ixge_error_strings, + + .n_next_nodes = IXGE_RX_N_NEXT, + .next_nodes = { + [IXGE_RX_NEXT_DROP] = "error-drop", + [IXGE_RX_NEXT_ETHERNET_INPUT] = "ethernet-input", + [IXGE_RX_NEXT_IP4_INPUT] = "ip4-input", + [IXGE_RX_NEXT_IP6_INPUT] = "ip6-input", + }, +}; + +/* *INDENT-ON* */ + +static u8 * +format_ixge_device_name (u8 * s, va_list * args) +{ + vlib_main_t *vm = vlib_get_main (); + u32 i = va_arg (*args, u32); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, i); + vlib_pci_addr_t *addr = vlib_pci_get_addr (vm, xd->pci_dev_handle); + return format (s, "TenGigabitEthernet%x/%x/%x/%x", + addr->domain, addr->bus, addr->slot, addr->function); +} + +#define IXGE_COUNTER_IS_64_BIT (1 << 0) +#define IXGE_COUNTER_NOT_CLEAR_ON_READ (1 << 1) + +static u8 ixge_counter_flags[] = { +#define _(a,f) 0, +#define _64(a,f) IXGE_COUNTER_IS_64_BIT, + foreach_ixge_counter +#undef _ +#undef _64 +}; + +static void +ixge_update_counters (ixge_device_t * xd) +{ + /* Byte offset for counter registers. */ + static u32 reg_offsets[] = { +#define _(a,f) (a) / sizeof (u32), +#define _64(a,f) _(a,f) + foreach_ixge_counter +#undef _ +#undef _64 + }; + volatile u32 *r = (volatile u32 *) xd->regs; + int i; + + for (i = 0; i < ARRAY_LEN (xd->counters); i++) + { + u32 o = reg_offsets[i]; + xd->counters[i] += r[o]; + if (ixge_counter_flags[i] & IXGE_COUNTER_NOT_CLEAR_ON_READ) + r[o] = 0; + if (ixge_counter_flags[i] & IXGE_COUNTER_IS_64_BIT) + xd->counters[i] += (u64) r[o + 1] << (u64) 32; + } +} + +static u8 * +format_ixge_device_id (u8 * s, va_list * args) +{ + u32 device_id = va_arg (*args, u32); + char *t = 0; + switch (device_id) + { +#define _(f,n) case n: t = #f; break; + foreach_ixge_pci_device_id; +#undef _ + default: + t = 0; + break; + } + if (t == 0) + s = format (s, "unknown 0x%x", device_id); + else + s = format (s, "%s", t); + return s; +} + +static u8 * +format_ixge_link_status (u8 * s, va_list * args) +{ + ixge_device_t *xd = va_arg (*args, ixge_device_t *); + u32 v = xd->link_status_at_last_link_change; + + s = format (s, "%s", (v & (1 << 30)) ? "up" : "down"); + + { + char *modes[] = { + "1g", "10g parallel", "10g serial", "autoneg", + }; + char *speeds[] = { + "unknown", "100m", "1g", "10g", + }; + s = format (s, ", mode %s, speed %s", + modes[(v >> 26) & 3], speeds[(v >> 28) & 3]); + } + + return s; +} + +static u8 * +format_ixge_device (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + CLIB_UNUSED (int verbose) = va_arg (*args, int); + vlib_main_t *vm = vlib_get_main (); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, dev_instance); + ixge_phy_t *phy = xd->phys + xd->phy_index; + u32 indent = format_get_indent (s); + + ixge_update_counters (xd); + xd->link_status_at_last_link_change = xd->regs->xge_mac.link_status; + + s = format (s, "Intel 8259X: id %U\n%Ulink %U", + format_ixge_device_id, xd->device_id, + format_white_space, indent + 2, format_ixge_link_status, xd); + + { + + vlib_pci_addr_t *addr = vlib_pci_get_addr (vm, xd->pci_dev_handle); + vlib_pci_device_info_t *d = vlib_pci_get_device_info (vm, addr, 0); + + if (d) + s = format (s, "\n%UPCIe %U", format_white_space, indent + 2, + format_vlib_pci_link_speed, d); + } + + s = format (s, "\n%U", format_white_space, indent + 2); + if (phy->mdio_address != ~0) + s = format (s, "PHY address %d, id 0x%x", phy->mdio_address, phy->id); + else if (xd->sfp_eeprom.id == SFP_ID_SFP) + s = format (s, "SFP %U", format_sfp_eeprom, &xd->sfp_eeprom); + else + s = format (s, "PHY not found"); + + /* FIXME */ + { + ixge_dma_queue_t *dq = vec_elt_at_index (xd->dma_queues[VLIB_RX], 0); + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0); + u32 hw_head_index = dr->head_index; + u32 sw_head_index = dq->head_index; + u32 nitems; + + nitems = ixge_ring_sub (dq, hw_head_index, sw_head_index); + s = format (s, "\n%U%d unprocessed, %d total buffers on rx queue 0 ring", + format_white_space, indent + 2, nitems, dq->n_descriptors); + + s = format (s, "\n%U%d buffers in driver rx cache", + format_white_space, indent + 2, + vec_len (xm->rx_buffers_to_add)); + + s = format (s, "\n%U%d buffers on tx queue 0 ring", + format_white_space, indent + 2, + xd->dma_queues[VLIB_TX][0].tx.n_buffers_on_ring); + } + { + u32 i; + u64 v; + static char *names[] = { +#define _(a,f) #f, +#define _64(a,f) _(a,f) + foreach_ixge_counter +#undef _ +#undef _64 + }; + + for (i = 0; i < ARRAY_LEN (names); i++) + { + v = xd->counters[i] - xd->counters_last_clear[i]; + if (v != 0) + s = format (s, "\n%U%-40U%16Ld", + format_white_space, indent + 2, + format_c_identifier, names[i], v); + } + } + + return s; +} + +static void +ixge_clear_hw_interface_counters (u32 instance) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, instance); + ixge_update_counters (xd); + memcpy (xd->counters_last_clear, xd->counters, sizeof (xd->counters)); +} + +/* + * Dynamically redirect all pkts from a specific interface + * to the specified node + */ +static void +ixge_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, + u32 node_index) +{ + ixge_main_t *xm = &ixge_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + ixge_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance); + + /* Shut off redirection */ + if (node_index == ~0) + { + xd->per_interface_next_index = node_index; + return; + } + + xd->per_interface_next_index = + vlib_node_add_next (xm->vlib_main, ixge_input_node.index, node_index); +} + + +/* *INDENT-OFF* */ +VNET_DEVICE_CLASS (ixge_device_class) = { + .name = "ixge", + .tx_function = ixge_interface_tx, + .format_device_name = format_ixge_device_name, + .format_device = format_ixge_device, + .format_tx_trace = format_ixge_tx_dma_trace, + .clear_counters = ixge_clear_hw_interface_counters, + .admin_up_down_function = ixge_interface_admin_up_down, + .rx_redirect_to_node = ixge_set_interface_next_node, +}; +/* *INDENT-ON* */ + +#define IXGE_N_BYTES_IN_RX_BUFFER (2048) // DAW-HACK: Set Rx buffer size so all packets < ETH_MTU_SIZE fit in the buffer (i.e. sop & eop for all descriptors). + +static clib_error_t * +ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_dma_queue_t *dq; + clib_error_t *error = 0; + + vec_validate (xd->dma_queues[rt], queue_index); + dq = vec_elt_at_index (xd->dma_queues[rt], queue_index); + + if (!xm->n_descriptors_per_cache_line) + xm->n_descriptors_per_cache_line = + CLIB_CACHE_LINE_BYTES / sizeof (dq->descriptors[0]); + + if (!xm->n_bytes_in_rx_buffer) + xm->n_bytes_in_rx_buffer = IXGE_N_BYTES_IN_RX_BUFFER; + xm->n_bytes_in_rx_buffer = round_pow2 (xm->n_bytes_in_rx_buffer, 1024); + + if (!xm->n_descriptors[rt]) + xm->n_descriptors[rt] = 4 * VLIB_FRAME_SIZE; + + dq->queue_index = queue_index; + dq->n_descriptors = + round_pow2 (xm->n_descriptors[rt], xm->n_descriptors_per_cache_line); + dq->head_index = dq->tail_index = 0; + + dq->descriptors = vlib_physmem_alloc_aligned (vm, dq->n_descriptors * + sizeof (dq->descriptors[0]), + 128 /* per chip spec */ ); + if (!dq->descriptors) + return vlib_physmem_last_error (vm); + + clib_memset (dq->descriptors, 0, + dq->n_descriptors * sizeof (dq->descriptors[0])); + vec_resize (dq->descriptor_buffer_indices, dq->n_descriptors); + + if (rt == VLIB_RX) + { + u32 n_alloc, i; + + n_alloc = vlib_buffer_alloc (vm, dq->descriptor_buffer_indices, + vec_len (dq->descriptor_buffer_indices)); + ASSERT (n_alloc == vec_len (dq->descriptor_buffer_indices)); + for (i = 0; i < n_alloc; i++) + { + dq->descriptors[i].rx_to_hw.tail_address = + vlib_buffer_get_pa + (vm, vlib_get_buffer (vm, dq->descriptor_buffer_indices[i])); + } + } + else + { + u32 i; + + dq->tx.head_index_write_back = + vlib_physmem_alloc (vm, CLIB_CACHE_LINE_BYTES); + if (!dq->tx.head_index_write_back) + return vlib_physmem_last_error (vm); + + for (i = 0; i < dq->n_descriptors; i++) + dq->descriptors[i].tx = xm->tx_descriptor_template; + + vec_validate (xm->tx_buffers_pending_free, dq->n_descriptors - 1); + } + + { + ixge_dma_regs_t *dr = get_dma_regs (xd, rt, queue_index); + u64 a; + + a = vlib_physmem_get_pa (vm, dq->descriptors); + dr->descriptor_address[0] = a & 0xFFFFFFFF; + dr->descriptor_address[1] = a >> (u64) 32; + dr->n_descriptor_bytes = dq->n_descriptors * sizeof (dq->descriptors[0]); + dq->head_index = dq->tail_index = 0; + + if (rt == VLIB_RX) + { + ASSERT ((xm->n_bytes_in_rx_buffer / 1024) < 32); + dr->rx_split_control = + ( /* buffer size */ ((xm->n_bytes_in_rx_buffer / 1024) << 0) + | ( /* lo free descriptor threshold (units of 64 descriptors) */ + (1 << 22)) | ( /* descriptor type: advanced one buffer */ + (1 << 25)) | ( /* drop if no descriptors available */ + (1 << 28))); + + /* Give hardware all but last 16 cache lines' worth of descriptors. */ + dq->tail_index = dq->n_descriptors - + 16 * xm->n_descriptors_per_cache_line; + } + else + { + /* Make sure its initialized before hardware can get to it. */ + dq->tx.head_index_write_back[0] = dq->head_index; + + a = vlib_physmem_get_pa (vm, dq->tx.head_index_write_back); + dr->tx.head_index_write_back_address[0] = /* enable bit */ 1 | a; + dr->tx.head_index_write_back_address[1] = (u64) a >> (u64) 32; + } + + /* DMA on 82599 does not work with [13] rx data write relaxed ordering + and [12] undocumented set. */ + if (rt == VLIB_RX) + dr->dca_control &= ~((1 << 13) | (1 << 12)); + + CLIB_MEMORY_BARRIER (); + + if (rt == VLIB_TX) + { + xd->regs->tx_dma_control |= (1 << 0); + dr->control |= ((32 << 0) /* prefetch threshold */ + | (64 << 8) /* host threshold */ + | (0 << 16) /* writeback threshold */ ); + } + + /* Enable this queue and wait for hardware to initialize + before adding to tail. */ + if (rt == VLIB_TX) + { + dr->control |= 1 << 25; + while (!(dr->control & (1 << 25))) + ; + } + + /* Set head/tail indices and enable DMA. */ + dr->head_index = dq->head_index; + dr->tail_index = dq->tail_index; + } + + return error; +} + +static u32 +ixge_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags) +{ + ixge_device_t *xd; + ixge_regs_t *r; + u32 old; + ixge_main_t *xm = &ixge_main; + + xd = vec_elt_at_index (xm->devices, hw->dev_instance); + r = xd->regs; + + old = r->filter_control; + + if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) + r->filter_control = old | (1 << 9) /* unicast promiscuous */ ; + else + r->filter_control = old & ~(1 << 9); + + return old; +} + +static void +ixge_device_init (ixge_main_t * xm) +{ + vnet_main_t *vnm = vnet_get_main (); + ixge_device_t *xd; + + /* Reset chip(s). */ + vec_foreach (xd, xm->devices) + { + ixge_regs_t *r = xd->regs; + const u32 reset_bit = (1 << 26) | (1 << 3); + + r->control |= reset_bit; + + /* No need to suspend. Timed to take ~1e-6 secs */ + while (r->control & reset_bit) + ; + + /* Software loaded. */ + r->extended_control |= (1 << 28); + + ixge_phy_init (xd); + + /* Register ethernet interface. */ + { + u8 addr8[6]; + u32 i, addr32[2]; + clib_error_t *error; + + addr32[0] = r->rx_ethernet_address0[0][0]; + addr32[1] = r->rx_ethernet_address0[0][1]; + for (i = 0; i < 6; i++) + addr8[i] = addr32[i / 4] >> ((i % 4) * 8); + + error = ethernet_register_interface + (vnm, ixge_device_class.index, xd->device_index, + /* ethernet address */ addr8, + &xd->vlib_hw_if_index, ixge_flag_change); + if (error) + clib_error_report (error); + } + + { + vnet_sw_interface_t *sw = + vnet_get_hw_sw_interface (vnm, xd->vlib_hw_if_index); + xd->vlib_sw_if_index = sw->sw_if_index; + } + + ixge_dma_init (xd, VLIB_RX, /* queue_index */ 0); + + xm->n_descriptors[VLIB_TX] = 20 * VLIB_FRAME_SIZE; + + ixge_dma_init (xd, VLIB_TX, /* queue_index */ 0); + + /* RX/TX queue 0 gets mapped to interrupt bits 0 & 8. */ + r->interrupt.queue_mapping[0] = (( /* valid bit */ (1 << 7) | + ixge_rx_queue_to_interrupt (0)) << 0); + + r->interrupt.queue_mapping[0] |= (( /* valid bit */ (1 << 7) | + ixge_tx_queue_to_interrupt (0)) << 8); + + /* No use in getting too many interrupts. + Limit them to one every 3/4 ring size at line rate + min sized packets. + No need for this since kernel/vlib main loop provides adequate interrupt + limiting scheme. */ + if (0) + { + f64 line_rate_max_pps = + 10e9 / (8 * (64 + /* interframe padding */ 20)); + ixge_throttle_queue_interrupt (r, 0, + .75 * xm->n_descriptors[VLIB_RX] / + line_rate_max_pps); + } + + /* Accept all multicast and broadcast packets. Should really add them + to the dst_ethernet_address register array. */ + r->filter_control |= (1 << 10) | (1 << 8); + + /* Enable frames up to size in mac frame size register. */ + r->xge_mac.control |= 1 << 2; + r->xge_mac.rx_max_frame_size = (9216 + 14) << 16; + + /* Enable all interrupts. */ + if (!IXGE_ALWAYS_POLL) + r->interrupt.enable_write_1_to_set = ~0; + } +} + +static uword +ixge_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + vnet_main_t *vnm = vnet_get_main (); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd; + uword event_type, *event_data = 0; + f64 timeout, link_debounce_deadline; + + ixge_device_init (xm); + + /* Clear all counters. */ + vec_foreach (xd, xm->devices) + { + ixge_update_counters (xd); + clib_memset (xd->counters, 0, sizeof (xd->counters)); + } + + timeout = 30.0; + link_debounce_deadline = 1e70; + + while (1) + { + /* 36 bit stat counters could overflow in ~50 secs. + We poll every 30 secs to be conservative. */ + vlib_process_wait_for_event_or_clock (vm, timeout); + + event_type = vlib_process_get_events (vm, &event_data); + + switch (event_type) + { + case EVENT_SET_FLAGS: + /* 1 ms */ + link_debounce_deadline = vlib_time_now (vm) + 1e-3; + timeout = 1e-3; + break; + + case ~0: + /* No events found: timer expired. */ + if (vlib_time_now (vm) > link_debounce_deadline) + { + vec_foreach (xd, xm->devices) + { + ixge_regs_t *r = xd->regs; + u32 v = r->xge_mac.link_status; + uword is_up = (v & (1 << 30)) != 0; + + vnet_hw_interface_set_flags + (vnm, xd->vlib_hw_if_index, + is_up ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0); + } + link_debounce_deadline = 1e70; + timeout = 30.0; + } + break; + + default: + ASSERT (0); + } + + if (event_data) + _vec_len (event_data) = 0; + + /* Query stats every 30 secs. */ + { + f64 now = vlib_time_now (vm); + if (now - xm->time_last_stats_update > 30) + { + xm->time_last_stats_update = now; + vec_foreach (xd, xm->devices) ixge_update_counters (xd); + } + } + } + + return 0; +} + +static vlib_node_registration_t ixge_process_node = { + .function = ixge_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "ixge-process", +}; + +clib_error_t * +ixge_init (vlib_main_t * vm) +{ + ixge_main_t *xm = &ixge_main; + + xm->vlib_main = vm; + clib_memset (&xm->tx_descriptor_template, 0, + sizeof (xm->tx_descriptor_template)); + clib_memset (&xm->tx_descriptor_template_mask, 0, + sizeof (xm->tx_descriptor_template_mask)); + xm->tx_descriptor_template.status0 = + (IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED | + IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED | + IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS); + xm->tx_descriptor_template_mask.status0 = 0xffff; + xm->tx_descriptor_template_mask.status1 = 0x00003fff; + + xm->tx_descriptor_template_mask.status0 &= + ~(IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET + | IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS); + xm->tx_descriptor_template_mask.status1 &= + ~(IXGE_TX_DESCRIPTOR_STATUS1_DONE); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_INIT_FUNCTION (ixge_init) = +{ + .runs_before = VLIB_INITS("pci_bus_init"), +}; +/* *INDENT-ON* */ + + +static void +ixge_pci_intr_handler (vlib_main_t * vm, vlib_pci_dev_handle_t h) +{ + uword private_data = vlib_pci_get_private_data (vm, h); + + vlib_node_set_interrupt_pending (vm, ixge_input_node.index); + + /* Let node know which device is interrupting. */ + { + vlib_node_runtime_t *rt = + vlib_node_get_runtime (vm, ixge_input_node.index); + rt->runtime_data[0] |= 1 << private_data; + } +} + +static clib_error_t * +ixge_pci_init (vlib_main_t * vm, vlib_pci_dev_handle_t h) +{ + ixge_main_t *xm = &ixge_main; + clib_error_t *error = 0; + void *r; + ixge_device_t *xd; + vlib_pci_addr_t *addr = vlib_pci_get_addr (vm, h); + vlib_pci_device_info_t *d = vlib_pci_get_device_info (vm, addr, 0); + + error = vlib_pci_map_region (vm, h, 0, &r); + if (error) + return error; + + vec_add2 (xm->devices, xd, 1); + + if (vec_len (xm->devices) == 1) + { + ixge_input_node.function = ixge_input; + } + + xd->pci_dev_handle = h; + xd->device_id = d->device_id; + xd->regs = r; + xd->device_index = xd - xm->devices; + xd->pci_function = addr->function; + xd->per_interface_next_index = ~0; + + vlib_pci_set_private_data (vm, h, xd->device_index); + + /* Chip found so enable node. */ + { + vlib_node_set_state (vm, ixge_input_node.index, + (IXGE_ALWAYS_POLL + ? VLIB_NODE_STATE_POLLING + : VLIB_NODE_STATE_INTERRUPT)); + + //dev->private_data = xd->device_index; + } + + if (vec_len (xm->devices) == 1) + { + vlib_register_node (vm, &ixge_process_node); + xm->process_node_index = ixge_process_node.index; + } + + error = vlib_pci_bus_master_enable (vm, h); + + if (error) + return error; + + return vlib_pci_intr_enable (vm, h); +} + +/* *INDENT-OFF* */ +PCI_REGISTER_DEVICE (ixge_pci_device_registration,static) = { + .init_function = ixge_pci_init, + .interrupt_handler = ixge_pci_intr_handler, + .supported_devices = { +#define _(t,i) { .vendor_id = PCI_VENDOR_ID_INTEL, .device_id = i, }, + foreach_ixge_pci_device_id +#undef _ + { 0 }, + }, +}; +/* *INDENT-ON* */ + +void +ixge_set_next_node (ixge_rx_next_t next, char *name) +{ + vlib_node_registration_t *r = &ixge_input_node; + + switch (next) + { + case IXGE_RX_NEXT_IP4_INPUT: + case IXGE_RX_NEXT_IP6_INPUT: + case IXGE_RX_NEXT_ETHERNET_INPUT: + r->next_nodes[next] = name; + break; + + default: + clib_warning ("%s: illegal next %d\n", __FUNCTION__, next); + break; + } +} + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .default_disabled = 1, + .description = "Intel 82599 Family Native Driver (experimental)", +}; +#endif + +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/extras/deprecated/ixge/ixge.h b/extras/deprecated/ixge/ixge.h new file mode 100644 index 00000000000..f80d9c0e7cf --- /dev/null +++ b/extras/deprecated/ixge/ixge.h @@ -0,0 +1,1292 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_ixge_h +#define included_ixge_h + +#include <vnet/vnet.h> +#include <vlib/pci/pci.h> +#include <vlib/i2c.h> +#include <vnet/ethernet/sfp.h> +#include <vnet/ip/ip4_packet.h> +#include <vnet/ip/ip6_packet.h> + +typedef volatile struct +{ + /* [31:7] 128 byte aligned. */ + u32 descriptor_address[2]; + u32 n_descriptor_bytes; + + /* [5] rx/tx descriptor dca enable + [6] rx packet head dca enable + [7] rx packet tail dca enable + [9] rx/tx descriptor relaxed order + [11] rx/tx descriptor write back relaxed order + [13] rx/tx data write/read relaxed order + [15] rx head data write relaxed order + [31:24] apic id for cpu's cache. */ + u32 dca_control; + + u32 head_index; + + /* [4:0] tail buffer size (in 1k byte units) + [13:8] head buffer size (in 64 byte units) + [24:22] lo free descriptors threshold (units of 64 descriptors) + [27:25] descriptor type 0 = legacy, 1 = advanced one buffer (e.g. tail), + 2 = advanced header splitting (head + tail), 5 = advanced header + splitting (head only). + [28] drop if no descriptors available. */ + u32 rx_split_control; + + u32 tail_index; + CLIB_PAD_FROM_TO (0x1c, 0x28); + + /* [7:0] rx/tx prefetch threshold + [15:8] rx/tx host threshold + [24:16] rx/tx write back threshold + [25] rx/tx enable + [26] tx descriptor writeback flush + [30] rx strip vlan enable */ + u32 control; + + u32 rx_coallesce_control; + + union + { + struct + { + /* packets bytes lo hi */ + u32 stats[3]; + + u32 unused; + } rx; + + struct + { + u32 unused[2]; + + /* [0] enables head write back. */ + u32 head_index_write_back_address[2]; + } tx; + }; +} ixge_dma_regs_t; + +/* Only advanced descriptors are supported. */ +typedef struct +{ + u64 tail_address; + u64 head_address; +} ixge_rx_to_hw_descriptor_t; + +typedef struct +{ + u32 status[3]; + u16 n_packet_bytes_this_descriptor; + u16 vlan_tag; +} ixge_rx_from_hw_descriptor_t; + +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2 (1 << (4 + 11)) +/* Valid if not layer2. */ +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4 (1 << (4 + 0)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT (1 << (4 + 1)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6 (1 << (4 + 2)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT (1 << (4 + 3)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP (1 << (4 + 4)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP (1 << (4 + 5)) +#define IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET(s) (((s) >> 21) & 0x3ff) + +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE (1 << (0 + 0)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET (1 << (0 + 1)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN (1 << (0 + 3)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED (1 << (0 + 4)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED (1 << (0 + 5)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED (1 << (0 + 6)) +#define IXGE_RX_DESCRIPTOR_STATUS2_NOT_UNICAST (1 << (0 + 7)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_DOUBLE_VLAN (1 << (0 + 9)) +#define IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR (1 << (0 + 10)) +#define IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR (1 << (20 + 9)) +#define IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR (1 << (20 + 10)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR (1 << (20 + 11)) + +/* For layer2 packets stats0 bottom 3 bits give ether type index from filter. */ +#define IXGE_RX_DESCRIPTOR_STATUS0_LAYER2_ETHERNET_TYPE(s) ((s) & 7) + +typedef struct +{ + u64 buffer_address; + u16 n_bytes_this_buffer; + u16 status0; + u32 status1; +#define IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED (3 << 4) +#define IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED (1 << (8 + 5)) +#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS (8 + 3) +#define IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS) +#define IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS (1 << (8 + 1)) +#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET (8 + 0) +#define IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET) +#define IXGE_TX_DESCRIPTOR_STATUS1_DONE (1 << 0) +#define IXGE_TX_DESCRIPTOR_STATUS1_CONTEXT(i) (/* valid */ (1 << 7) | ((i) << 4)) +#define IXGE_TX_DESCRIPTOR_STATUS1_IPSEC_OFFLOAD (1 << (8 + 2)) +#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_TCP_UDP_CHECKSUM (1 << (8 + 1)) +#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_IP4_CHECKSUM (1 << (8 + 0)) +#define IXGE_TX_DESCRIPTOR_STATUS0_N_BYTES_THIS_BUFFER(l) ((l) << 0) +#define IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET(l) ((l) << 14) +} ixge_tx_descriptor_t; + +typedef struct +{ + struct + { + u8 checksum_start_offset; + u8 checksum_insert_offset; + u16 checksum_end_offset; + } ip, tcp; + u32 status0; + + u8 status1; + + /* Byte offset after UDP/TCP header. */ + u8 payload_offset; + + u16 max_tcp_segment_size; +} __attribute__ ((packed)) ixge_tx_context_descriptor_t; + +typedef union +{ + ixge_rx_to_hw_descriptor_t rx_to_hw; + ixge_rx_from_hw_descriptor_t rx_from_hw; + ixge_tx_descriptor_t tx; + u32x4 as_u32x4; +} ixge_descriptor_t; + +typedef volatile struct +{ + /* [2] pcie master disable + [3] mac reset + [26] global device reset */ + u32 control; + u32 control_alias; + /* [3:2] device id (0 or 1 for dual port chips) + [7] link is up + [17:10] num vfs + [18] io active + [19] pcie master enable status */ + u32 status_read_only; + CLIB_PAD_FROM_TO (0xc, 0x18); + /* [14] pf reset done + [17] relaxed ordering disable + [26] extended vlan enable + [28] driver loaded */ + u32 extended_control; + CLIB_PAD_FROM_TO (0x1c, 0x20); + + /* software definable pins. + sdp_data [7:0] + sdp_is_output [15:8] + sdp_is_native [23:16] + sdp_function [31:24]. + */ + u32 sdp_control; + CLIB_PAD_FROM_TO (0x24, 0x28); + + /* [0] i2c clock in + [1] i2c clock out + [2] i2c data in + [3] i2c data out */ + u32 i2c_control; + CLIB_PAD_FROM_TO (0x2c, 0x4c); + u32 tcp_timer; + + CLIB_PAD_FROM_TO (0x50, 0x200); + + u32 led_control; + + CLIB_PAD_FROM_TO (0x204, 0x600); + u32 core_spare; + CLIB_PAD_FROM_TO (0x604, 0x700); + + struct + { + u32 vflr_events_clear[4]; + u32 mailbox_interrupt_status[4]; + u32 mailbox_interrupt_enable[4]; + CLIB_PAD_FROM_TO (0x730, 0x800); + } pf_foo; + + struct + { + u32 status_write_1_to_clear; + CLIB_PAD_FROM_TO (0x804, 0x808); + u32 status_write_1_to_set; + CLIB_PAD_FROM_TO (0x80c, 0x810); + u32 status_auto_clear_enable; + CLIB_PAD_FROM_TO (0x814, 0x820); + + /* [11:3] minimum inter-interrupt interval + (2e-6 units; 20e-6 units for fast ethernet). + [15] low-latency interrupt moderation enable + [20:16] low-latency interrupt credit + [27:21] interval counter + [31] write disable for credit and counter (write only). */ + u32 throttle0[24]; + + u32 enable_write_1_to_set; + CLIB_PAD_FROM_TO (0x884, 0x888); + u32 enable_write_1_to_clear; + CLIB_PAD_FROM_TO (0x88c, 0x890); + u32 enable_auto_clear; + u32 msi_to_eitr_select; + /* [3:0] spd 0-3 interrupt detection enable + [4] msi-x enable + [5] other clear disable (makes other bits in status not clear on read) + etc. */ + u32 control; + CLIB_PAD_FROM_TO (0x89c, 0x900); + + /* Defines interrupt mapping for 128 rx + 128 tx queues. + 64 x 4 8 bit entries. + For register [i]: + [5:0] bit in interrupt status for rx queue 2*i + 0 + [7] valid bit + [13:8] bit for tx queue 2*i + 0 + [15] valid bit + similar for rx 2*i + 1 and tx 2*i + 1. */ + u32 queue_mapping[64]; + + /* tcp timer [7:0] and other interrupts [15:8] */ + u32 misc_mapping; + CLIB_PAD_FROM_TO (0xa04, 0xa90); + + /* 64 interrupts determined by mappings. */ + u32 status1_write_1_to_clear[4]; + u32 enable1_write_1_to_set[4]; + u32 enable1_write_1_to_clear[4]; + CLIB_PAD_FROM_TO (0xac0, 0xad0); + u32 status1_enable_auto_clear[4]; + CLIB_PAD_FROM_TO (0xae0, 0x1000); + } interrupt; + + ixge_dma_regs_t rx_dma0[64]; + + CLIB_PAD_FROM_TO (0x2000, 0x2140); + u32 dcb_rx_packet_plane_t4_config[8]; + u32 dcb_rx_packet_plane_t4_status[8]; + CLIB_PAD_FROM_TO (0x2180, 0x2300); + + /* reg i defines mapping for 4 rx queues starting at 4*i + 0. */ + u32 rx_queue_stats_mapping[32]; + u32 rx_queue_stats_control; + + CLIB_PAD_FROM_TO (0x2384, 0x2410); + u32 fc_user_descriptor_ptr[2]; + u32 fc_buffer_control; + CLIB_PAD_FROM_TO (0x241c, 0x2420); + u32 fc_rx_dma; + CLIB_PAD_FROM_TO (0x2424, 0x2430); + u32 dcb_packet_plane_control; + CLIB_PAD_FROM_TO (0x2434, 0x2f00); + + u32 rx_dma_control; + u32 pf_queue_drop_enable; + CLIB_PAD_FROM_TO (0x2f08, 0x2f20); + u32 rx_dma_descriptor_cache_config; + CLIB_PAD_FROM_TO (0x2f24, 0x3000); + + /* 1 bit. */ + u32 rx_enable; + CLIB_PAD_FROM_TO (0x3004, 0x3008); + /* [15:0] ether type (little endian) + [31:16] opcode (big endian) */ + u32 flow_control_control; + CLIB_PAD_FROM_TO (0x300c, 0x3020); + /* 3 bit traffic class for each of 8 priorities. */ + u32 rx_priority_to_traffic_class; + CLIB_PAD_FROM_TO (0x3024, 0x3028); + u32 rx_coallesce_data_buffer_control; + CLIB_PAD_FROM_TO (0x302c, 0x3190); + u32 rx_packet_buffer_flush_detect; + CLIB_PAD_FROM_TO (0x3194, 0x3200); + u32 flow_control_tx_timers[4]; /* 2 timer values */ + CLIB_PAD_FROM_TO (0x3210, 0x3220); + u32 flow_control_rx_threshold_lo[8]; + CLIB_PAD_FROM_TO (0x3240, 0x3260); + u32 flow_control_rx_threshold_hi[8]; + CLIB_PAD_FROM_TO (0x3280, 0x32a0); + u32 flow_control_refresh_threshold; + CLIB_PAD_FROM_TO (0x32a4, 0x3c00); + /* For each of 8 traffic classes (units of bytes). */ + u32 rx_packet_buffer_size[8]; + CLIB_PAD_FROM_TO (0x3c20, 0x3d00); + u32 flow_control_config; + CLIB_PAD_FROM_TO (0x3d04, 0x4200); + + struct + { + u32 pcs_config; + CLIB_PAD_FROM_TO (0x4204, 0x4208); + u32 link_control; + u32 link_status; + u32 pcs_debug[2]; + u32 auto_negotiation; + u32 link_partner_ability; + u32 auto_negotiation_tx_next_page; + u32 auto_negotiation_link_partner_next_page; + CLIB_PAD_FROM_TO (0x4228, 0x4240); + } gige_mac; + + struct + { + /* [0] tx crc enable + [2] enable frames up to max frame size register [31:16] + [10] pad frames < 64 bytes if specified by user + [15] loopback enable + [16] mdc hi speed + [17] turn off mdc between mdio packets */ + u32 control; + + /* [5] rx symbol error (all bits clear on read) + [6] rx illegal symbol + [7] rx idle error + [8] rx local fault + [9] rx remote fault */ + u32 status; + + u32 pause_and_pace_control; + CLIB_PAD_FROM_TO (0x424c, 0x425c); + u32 phy_command; + u32 phy_data; + CLIB_PAD_FROM_TO (0x4264, 0x4268); + + /* [31:16] max frame size in bytes. */ + u32 rx_max_frame_size; + CLIB_PAD_FROM_TO (0x426c, 0x4288); + + /* [0] + [2] pcs receive link up? (latch lo) + [7] local fault + [1] + [0] pcs 10g base r capable + [1] pcs 10g base x capable + [2] pcs 10g base w capable + [10] rx local fault + [11] tx local fault + [15:14] 2 => device present at this address (else not present) */ + u32 xgxs_status[2]; + + u32 base_x_pcs_status; + + /* [0] pass unrecognized flow control frames + [1] discard pause frames + [2] rx priority flow control enable (only in dcb mode) + [3] rx flow control enable. */ + u32 flow_control; + + /* [3:0] tx lanes change polarity + [7:4] rx lanes change polarity + [11:8] swizzle tx lanes + [15:12] swizzle rx lanes + 4 x 2 bit tx lane swap + 4 x 2 bit rx lane swap. */ + u32 serdes_control; + + u32 fifo_control; + + /* [0] force link up + [1] autoneg ack2 bit to transmit + [6:2] autoneg selector field to transmit + [8:7] 10g pma/pmd type 0 => xaui, 1 kx4, 2 cx4 + [9] 1g pma/pmd type 0 => sfi, 1 => kx/bx + [10] disable 10g on without main power + [11] restart autoneg on transition to dx power state + [12] restart autoneg + [15:13] link mode: + 0 => 1g no autoneg + 1 => 10g kx4 parallel link no autoneg + 2 => 1g bx autoneg + 3 => 10g sfi serdes + 4 => kx4/kx/kr + 5 => xgmii 1g/100m + 6 => kx4/kx/kr 1g an + 7 kx4/kx/kr sgmii. + [16] kr support + [17] fec requested + [18] fec ability + etc. */ + u32 auto_negotiation_control; + + /* [0] signal detect 1g/100m + [1] fec signal detect + [2] 10g serial pcs fec block lock + [3] 10g serial high error rate + [4] 10g serial pcs block lock + [5] kx/kx4/kr autoneg next page received + [6] kx/kx4/kr backplane autoneg next page received + [7] link status clear to read + [11:8] 10g signal detect (4 lanes) (for serial just lane 0) + [12] 10g serial signal detect + [16:13] 10g parallel lane sync status + [17] 10g parallel align status + [18] 1g sync status + [19] kx/kx4/kr backplane autoneg is idle + [20] 1g autoneg enabled + [21] 1g pcs enabled for sgmii + [22] 10g xgxs enabled + [23] 10g serial fec enabled (forward error detection) + [24] 10g kr pcs enabled + [25] sgmii enabled + [27:26] mac link mode + 0 => 1g + 1 => 10g parallel + 2 => 10g serial + 3 => autoneg + [29:28] link speed + 1 => 100m + 2 => 1g + 3 => 10g + [30] link is up + [31] kx/kx4/kr backplane autoneg completed successfully. */ + u32 link_status; + + /* [17:16] pma/pmd for 10g serial + 0 => kr, 2 => sfi + [18] disable dme pages */ + u32 auto_negotiation_control2; + + CLIB_PAD_FROM_TO (0x42ac, 0x42b0); + u32 link_partner_ability[2]; + CLIB_PAD_FROM_TO (0x42b8, 0x42d0); + u32 manageability_control; + u32 link_partner_next_page[2]; + CLIB_PAD_FROM_TO (0x42dc, 0x42e0); + u32 kr_pcs_control; + u32 kr_pcs_status; + u32 fec_status[2]; + CLIB_PAD_FROM_TO (0x42f0, 0x4314); + u32 sgmii_control; + CLIB_PAD_FROM_TO (0x4318, 0x4324); + u32 link_status2; + CLIB_PAD_FROM_TO (0x4328, 0x4900); + } xge_mac; + + u32 tx_dcb_control; + u32 tx_dcb_descriptor_plane_queue_select; + u32 tx_dcb_descriptor_plane_t1_config; + u32 tx_dcb_descriptor_plane_t1_status; + CLIB_PAD_FROM_TO (0x4910, 0x4950); + + /* For each TC in units of 1k bytes. */ + u32 tx_packet_buffer_thresholds[8]; + CLIB_PAD_FROM_TO (0x4970, 0x4980); + struct + { + u32 mmw; + u32 config; + u32 status; + u32 rate_drift; + } dcb_tx_rate_scheduler; + CLIB_PAD_FROM_TO (0x4990, 0x4a80); + u32 tx_dma_control; + CLIB_PAD_FROM_TO (0x4a84, 0x4a88); + u32 tx_dma_tcp_flags_control[2]; + CLIB_PAD_FROM_TO (0x4a90, 0x4b00); + u32 pf_mailbox[64]; + CLIB_PAD_FROM_TO (0x4c00, 0x5000); + + /* RX */ + u32 checksum_control; + CLIB_PAD_FROM_TO (0x5004, 0x5008); + u32 rx_filter_control; + CLIB_PAD_FROM_TO (0x500c, 0x5010); + u32 management_vlan_tag[8]; + u32 management_udp_tcp_ports[8]; + CLIB_PAD_FROM_TO (0x5050, 0x5078); + /* little endian. */ + u32 extended_vlan_ether_type; + CLIB_PAD_FROM_TO (0x507c, 0x5080); + /* [1] store/dma bad packets + [8] accept all multicast + [9] accept all unicast + [10] accept all broadcast. */ + u32 filter_control; + CLIB_PAD_FROM_TO (0x5084, 0x5088); + /* [15:0] vlan ethernet type (0x8100) little endian + [28] cfi bit expected + [29] drop packets with unexpected cfi bit + [30] vlan filter enable. */ + u32 vlan_control; + CLIB_PAD_FROM_TO (0x508c, 0x5090); + /* [1:0] hi bit of ethernet address for 12 bit index into multicast table + 0 => 47, 1 => 46, 2 => 45, 3 => 43. + [2] enable multicast filter + */ + u32 multicast_control; + CLIB_PAD_FROM_TO (0x5094, 0x5100); + u32 fcoe_rx_control; + CLIB_PAD_FROM_TO (0x5104, 0x5108); + u32 fc_flt_context; + CLIB_PAD_FROM_TO (0x510c, 0x5110); + u32 fc_filter_control; + CLIB_PAD_FROM_TO (0x5114, 0x5120); + u32 rx_message_type_lo; + CLIB_PAD_FROM_TO (0x5124, 0x5128); + /* [15:0] ethernet type (little endian) + [18:16] matche pri in vlan tag + [19] priority match enable + [25:20] virtualization pool + [26] pool enable + [27] is fcoe + [30] ieee 1588 timestamp enable + [31] filter enable. + (See ethernet_type_queue_select.) */ + u32 ethernet_type_queue_filter[8]; + CLIB_PAD_FROM_TO (0x5148, 0x5160); + /* [7:0] l2 ethernet type and + [15:8] l2 ethernet type or */ + u32 management_decision_filters1[8]; + u32 vf_vm_tx_switch_loopback_enable[2]; + u32 rx_time_sync_control; + CLIB_PAD_FROM_TO (0x518c, 0x5190); + u32 management_ethernet_type_filters[4]; + u32 rx_timestamp_attributes_lo; + u32 rx_timestamp_hi; + u32 rx_timestamp_attributes_hi; + CLIB_PAD_FROM_TO (0x51ac, 0x51b0); + u32 pf_virtual_control; + CLIB_PAD_FROM_TO (0x51b4, 0x51d8); + u32 fc_offset_parameter; + CLIB_PAD_FROM_TO (0x51dc, 0x51e0); + u32 vf_rx_enable[2]; + u32 rx_timestamp_lo; + CLIB_PAD_FROM_TO (0x51ec, 0x5200); + /* 12 bits determined by multicast_control + lookup bits in this vector. */ + u32 multicast_enable[128]; + + /* [0] ethernet address [31:0] + [1] [15:0] ethernet address [47:32] + [31] valid bit. + Index 0 is read from eeprom after reset. */ + u32 rx_ethernet_address0[16][2]; + + CLIB_PAD_FROM_TO (0x5480, 0x5800); + u32 wake_up_control; + CLIB_PAD_FROM_TO (0x5804, 0x5808); + u32 wake_up_filter_control; + CLIB_PAD_FROM_TO (0x580c, 0x5818); + u32 multiple_rx_queue_command_82598; + CLIB_PAD_FROM_TO (0x581c, 0x5820); + u32 management_control; + u32 management_filter_control; + CLIB_PAD_FROM_TO (0x5828, 0x5838); + u32 wake_up_ip4_address_valid; + CLIB_PAD_FROM_TO (0x583c, 0x5840); + u32 wake_up_ip4_address_table[4]; + u32 management_control_to_host; + CLIB_PAD_FROM_TO (0x5854, 0x5880); + u32 wake_up_ip6_address_table[4]; + + /* unicast_and broadcast_and vlan_and ip_address_and + etc. */ + u32 management_decision_filters[8]; + + u32 management_ip4_or_ip6_address_filters[4][4]; + CLIB_PAD_FROM_TO (0x58f0, 0x5900); + u32 wake_up_packet_length; + CLIB_PAD_FROM_TO (0x5904, 0x5910); + u32 management_ethernet_address_filters[4][2]; + CLIB_PAD_FROM_TO (0x5930, 0x5a00); + u32 wake_up_packet_memory[32]; + CLIB_PAD_FROM_TO (0x5a80, 0x5c00); + u32 redirection_table_82598[32]; + u32 rss_random_keys_82598[10]; + CLIB_PAD_FROM_TO (0x5ca8, 0x6000); + + ixge_dma_regs_t tx_dma[128]; + + u32 pf_vm_vlan_insert[64]; + u32 tx_dma_tcp_max_alloc_size_requests; + CLIB_PAD_FROM_TO (0x8104, 0x8110); + u32 vf_tx_enable[2]; + CLIB_PAD_FROM_TO (0x8118, 0x8120); + /* [0] dcb mode enable + [1] virtualization mode enable + [3:2] number of tcs/qs per pool. */ + u32 multiple_tx_queues_command; + CLIB_PAD_FROM_TO (0x8124, 0x8200); + u32 pf_vf_anti_spoof[8]; + u32 pf_dma_tx_switch_control; + CLIB_PAD_FROM_TO (0x8224, 0x82e0); + u32 tx_strict_low_latency_queues[4]; + CLIB_PAD_FROM_TO (0x82f0, 0x8600); + u32 tx_queue_stats_mapping_82599[32]; + u32 tx_queue_packet_counts[32]; + u32 tx_queue_byte_counts[32][2]; + + struct + { + u32 control; + u32 status; + u32 buffer_almost_full; + CLIB_PAD_FROM_TO (0x880c, 0x8810); + u32 buffer_min_ifg; + CLIB_PAD_FROM_TO (0x8814, 0x8900); + } tx_security; + + struct + { + u32 index; + u32 salt; + u32 key[4]; + CLIB_PAD_FROM_TO (0x8918, 0x8a00); + } tx_ipsec; + + struct + { + u32 capabilities; + u32 control; + u32 tx_sci[2]; + u32 sa; + u32 sa_pn[2]; + u32 key[2][4]; + /* untagged packets, encrypted packets, protected packets, + encrypted bytes, protected bytes */ + u32 stats[5]; + CLIB_PAD_FROM_TO (0x8a50, 0x8c00); + } tx_link_security; + + struct + { + u32 control; + u32 timestamp_value[2]; + u32 system_time[2]; + u32 increment_attributes; + u32 time_adjustment_offset[2]; + u32 aux_control; + u32 target_time[2][2]; + CLIB_PAD_FROM_TO (0x8c34, 0x8c3c); + u32 aux_time_stamp[2][2]; + CLIB_PAD_FROM_TO (0x8c4c, 0x8d00); + } tx_timesync; + + struct + { + u32 control; + u32 status; + CLIB_PAD_FROM_TO (0x8d08, 0x8e00); + } rx_security; + + struct + { + u32 index; + u32 ip_address[4]; + u32 spi; + u32 ip_index; + u32 key[4]; + u32 salt; + u32 mode; + CLIB_PAD_FROM_TO (0x8e34, 0x8f00); + } rx_ipsec; + + struct + { + u32 capabilities; + u32 control; + u32 sci[2]; + u32 sa[2]; + u32 sa_pn[2]; + u32 key[2][4]; + /* see datasheet */ + u32 stats[17]; + CLIB_PAD_FROM_TO (0x8f84, 0x9000); + } rx_link_security; + + /* 4 wake up, 2 management, 2 wake up. */ + u32 flexible_filters[8][16][4]; + CLIB_PAD_FROM_TO (0x9800, 0xa000); + + /* 4096 bits. */ + u32 vlan_filter[128]; + + /* [0] ethernet address [31:0] + [1] [15:0] ethernet address [47:32] + [31] valid bit. + Index 0 is read from eeprom after reset. */ + u32 rx_ethernet_address1[128][2]; + + /* select one of 64 pools for each rx address. */ + u32 rx_ethernet_address_pool_select[128][2]; + CLIB_PAD_FROM_TO (0xaa00, 0xc800); + u32 tx_priority_to_traffic_class; + CLIB_PAD_FROM_TO (0xc804, 0xcc00); + + /* In bytes units of 1k. Total packet buffer is 160k. */ + u32 tx_packet_buffer_size[8]; + + CLIB_PAD_FROM_TO (0xcc20, 0xcd10); + u32 tx_manageability_tc_mapping; + CLIB_PAD_FROM_TO (0xcd14, 0xcd20); + u32 dcb_tx_packet_plane_t2_config[8]; + u32 dcb_tx_packet_plane_t2_status[8]; + CLIB_PAD_FROM_TO (0xcd60, 0xce00); + + u32 tx_flow_control_status; + CLIB_PAD_FROM_TO (0xce04, 0xd000); + + ixge_dma_regs_t rx_dma1[64]; + + struct + { + /* Bigendian ip4 src/dst address. */ + u32 src_address[128]; + u32 dst_address[128]; + + /* TCP/UDP ports [15:0] src [31:16] dst; bigendian. */ + u32 tcp_udp_port[128]; + + /* [1:0] protocol tcp, udp, sctp, other + [4:2] match priority (highest wins) + [13:8] pool + [25] src address match disable + [26] dst address match disable + [27] src port match disable + [28] dst port match disable + [29] protocol match disable + [30] pool match disable + [31] enable. */ + u32 control[128]; + + /* [12] size bypass + [19:13] must be 0x80 + [20] low-latency interrupt + [27:21] rx queue. */ + u32 interrupt[128]; + } ip4_filters; + + CLIB_PAD_FROM_TO (0xea00, 0xeb00); + /* 4 bit rss output index indexed by 7 bit hash. + 128 8 bit fields = 32 registers. */ + u32 redirection_table_82599[32]; + + u32 rss_random_key_82599[10]; + CLIB_PAD_FROM_TO (0xeba8, 0xec00); + /* [15:0] reserved + [22:16] rx queue index + [29] low-latency interrupt on match + [31] enable */ + u32 ethernet_type_queue_select[8]; + CLIB_PAD_FROM_TO (0xec20, 0xec30); + u32 syn_packet_queue_filter; + CLIB_PAD_FROM_TO (0xec34, 0xec60); + u32 immediate_interrupt_rx_vlan_priority; + CLIB_PAD_FROM_TO (0xec64, 0xec70); + u32 rss_queues_per_traffic_class; + CLIB_PAD_FROM_TO (0xec74, 0xec90); + u32 lli_size_threshold; + CLIB_PAD_FROM_TO (0xec94, 0xed00); + + struct + { + u32 control; + CLIB_PAD_FROM_TO (0xed04, 0xed10); + u32 table[8]; + CLIB_PAD_FROM_TO (0xed30, 0xee00); + } fcoe_redirection; + + struct + { + /* [1:0] packet buffer allocation 0 => disabled, else 64k*2^(f-1) + [3] packet buffer initialization done + [4] perfetch match mode + [5] report status in rss field of rx descriptors + [7] report status always + [14:8] drop queue + [20:16] flex 2 byte packet offset (units of 2 bytes) + [27:24] max linked list length + [31:28] full threshold. */ + u32 control; + CLIB_PAD_FROM_TO (0xee04, 0xee0c); + + u32 data[8]; + + /* [1:0] 0 => no action, 1 => add, 2 => remove, 3 => query. + [2] valid filter found by query command + [3] filter update override + [4] ip6 adress table + [6:5] l4 protocol reserved, udp, tcp, sctp + [7] is ip6 + [8] clear head/tail + [9] packet drop action + [10] matched packet generates low-latency interrupt + [11] last in linked list + [12] collision + [15] rx queue enable + [22:16] rx queue + [29:24] pool. */ + u32 command; + + CLIB_PAD_FROM_TO (0xee30, 0xee3c); + /* ip4 dst/src address, tcp ports, udp ports. + set bits mean bit is ignored. */ + u32 ip4_masks[4]; + u32 filter_length; + u32 usage_stats; + u32 failed_usage_stats; + u32 filters_match_stats; + u32 filters_miss_stats; + CLIB_PAD_FROM_TO (0xee60, 0xee68); + /* Lookup, signature. */ + u32 hash_keys[2]; + /* [15:0] ip6 src address 1 bit per byte + [31:16] ip6 dst address. */ + u32 ip6_mask; + /* [0] vlan id + [1] vlan priority + [2] pool + [3] ip protocol + [4] flex + [5] dst ip6. */ + u32 other_mask; + CLIB_PAD_FROM_TO (0xee78, 0xf000); + } flow_director; + + struct + { + u32 l2_control[64]; + u32 vlan_pool_filter[64]; + u32 vlan_pool_filter_bitmap[128]; + u32 dst_ethernet_address[128]; + u32 mirror_rule[4]; + u32 mirror_rule_vlan[8]; + u32 mirror_rule_pool[8]; + CLIB_PAD_FROM_TO (0xf650, 0x10010); + } pf_bar; + + u32 eeprom_flash_control; + /* [0] start + [1] done + [15:2] address + [31:16] read data. */ + u32 eeprom_read; + CLIB_PAD_FROM_TO (0x10018, 0x1001c); + u32 flash_access; + CLIB_PAD_FROM_TO (0x10020, 0x10114); + u32 flash_data; + u32 flash_control; + u32 flash_read_data; + CLIB_PAD_FROM_TO (0x10120, 0x1013c); + u32 flash_opcode; + u32 software_semaphore; + CLIB_PAD_FROM_TO (0x10144, 0x10148); + u32 firmware_semaphore; + CLIB_PAD_FROM_TO (0x1014c, 0x10160); + u32 software_firmware_sync; + CLIB_PAD_FROM_TO (0x10164, 0x10200); + u32 general_rx_control; + CLIB_PAD_FROM_TO (0x10204, 0x11000); + + struct + { + u32 control; + CLIB_PAD_FROM_TO (0x11004, 0x11010); + /* [3:0] enable counters + [7:4] leaky bucket counter mode + [29] reset + [30] stop + [31] start. */ + u32 counter_control; + /* [7:0],[15:8],[23:16],[31:24] event for counters 0-3. + event codes: + 0x0 bad tlp + 0x10 reqs that reached timeout + etc. */ + u32 counter_event; + CLIB_PAD_FROM_TO (0x11018, 0x11020); + u32 counters_clear_on_read[4]; + u32 counter_config[4]; + struct + { + u32 address; + u32 data; + } indirect_access; + CLIB_PAD_FROM_TO (0x11048, 0x11050); + u32 extended_control; + CLIB_PAD_FROM_TO (0x11054, 0x11064); + u32 mirrored_revision_id; + CLIB_PAD_FROM_TO (0x11068, 0x11070); + u32 dca_requester_id_information; + + /* [0] global disable + [4:1] mode: 0 => legacy, 1 => dca 1.0. */ + u32 dca_control; + CLIB_PAD_FROM_TO (0x11078, 0x110b0); + /* [0] pci completion abort + [1] unsupported i/o address + [2] wrong byte enable + [3] pci timeout */ + u32 pcie_interrupt_status; + CLIB_PAD_FROM_TO (0x110b4, 0x110b8); + u32 pcie_interrupt_enable; + CLIB_PAD_FROM_TO (0x110bc, 0x110c0); + u32 msi_x_pba_clear[8]; + CLIB_PAD_FROM_TO (0x110e0, 0x12300); + } pcie; + + u32 interrupt_throttle1[128 - 24]; + CLIB_PAD_FROM_TO (0x124a0, 0x14f00); + + u32 core_analog_config; + CLIB_PAD_FROM_TO (0x14f04, 0x14f10); + u32 core_common_config; + CLIB_PAD_FROM_TO (0x14f14, 0x15f14); + + u32 link_sec_software_firmware_interface; +} ixge_regs_t; + +typedef union +{ + struct + { + /* Addresses bigendian. */ + union + { + struct + { + ip6_address_t src_address; + u32 unused[1]; + } ip6; + struct + { + u32 unused[3]; + ip4_address_t src_address, dst_address; + } ip4; + }; + + /* [15:0] src port (little endian). + [31:16] dst port. */ + u32 tcp_udp_ports; + + /* [15:0] vlan (cfi bit set to 0). + [31:16] flex bytes. bigendian. */ + u32 vlan_and_flex_word; + + /* [14:0] hash + [15] bucket valid + [31:16] signature (signature filers)/sw-index (perfect match). */ + u32 hash; + }; + + u32 as_u32[8]; +} ixge_flow_director_key_t; + +always_inline void +ixge_throttle_queue_interrupt (ixge_regs_t * r, + u32 queue_interrupt_index, + f64 inter_interrupt_interval_in_secs) +{ + volatile u32 *tr = + (queue_interrupt_index < ARRAY_LEN (r->interrupt.throttle0) + ? &r->interrupt.throttle0[queue_interrupt_index] + : &r->interrupt_throttle1[queue_interrupt_index]); + ASSERT (queue_interrupt_index < 128); + u32 v; + i32 i, mask = (1 << 9) - 1; + + i = flt_round_nearest (inter_interrupt_interval_in_secs / 2e-6); + i = i < 1 ? 1 : i; + i = i >= mask ? mask : i; + + v = tr[0]; + v &= ~(mask << 3); + v |= i << 3; + tr[0] = v; +} + +#define foreach_ixge_counter \ + _ (0x40d0, rx_total_packets) \ + _64 (0x40c0, rx_total_bytes) \ + _ (0x41b0, rx_good_packets_before_filtering) \ + _64 (0x41b4, rx_good_bytes_before_filtering) \ + _ (0x2f50, rx_dma_good_packets) \ + _64 (0x2f54, rx_dma_good_bytes) \ + _ (0x2f5c, rx_dma_duplicated_good_packets) \ + _64 (0x2f60, rx_dma_duplicated_good_bytes) \ + _ (0x2f68, rx_dma_good_loopback_packets) \ + _64 (0x2f6c, rx_dma_good_loopback_bytes) \ + _ (0x2f74, rx_dma_good_duplicated_loopback_packets) \ + _64 (0x2f78, rx_dma_good_duplicated_loopback_bytes) \ + _ (0x4074, rx_good_packets) \ + _64 (0x4088, rx_good_bytes) \ + _ (0x407c, rx_multicast_packets) \ + _ (0x4078, rx_broadcast_packets) \ + _ (0x405c, rx_64_byte_packets) \ + _ (0x4060, rx_65_127_byte_packets) \ + _ (0x4064, rx_128_255_byte_packets) \ + _ (0x4068, rx_256_511_byte_packets) \ + _ (0x406c, rx_512_1023_byte_packets) \ + _ (0x4070, rx_gt_1023_byte_packets) \ + _ (0x4000, rx_crc_errors) \ + _ (0x4120, rx_ip_checksum_errors) \ + _ (0x4004, rx_illegal_symbol_errors) \ + _ (0x4008, rx_error_symbol_errors) \ + _ (0x4034, rx_mac_local_faults) \ + _ (0x4038, rx_mac_remote_faults) \ + _ (0x4040, rx_length_errors) \ + _ (0x41a4, rx_xons) \ + _ (0x41a8, rx_xoffs) \ + _ (0x40a4, rx_undersize_packets) \ + _ (0x40a8, rx_fragments) \ + _ (0x40ac, rx_oversize_packets) \ + _ (0x40b0, rx_jabbers) \ + _ (0x40b4, rx_management_packets) \ + _ (0x40b8, rx_management_drops) \ + _ (0x3fa0, rx_missed_packets_pool_0) \ + _ (0x40d4, tx_total_packets) \ + _ (0x4080, tx_good_packets) \ + _64 (0x4090, tx_good_bytes) \ + _ (0x40f0, tx_multicast_packets) \ + _ (0x40f4, tx_broadcast_packets) \ + _ (0x87a0, tx_dma_good_packets) \ + _64 (0x87a4, tx_dma_good_bytes) \ + _ (0x40d8, tx_64_byte_packets) \ + _ (0x40dc, tx_65_127_byte_packets) \ + _ (0x40e0, tx_128_255_byte_packets) \ + _ (0x40e4, tx_256_511_byte_packets) \ + _ (0x40e8, tx_512_1023_byte_packets) \ + _ (0x40ec, tx_gt_1023_byte_packets) \ + _ (0x4010, tx_undersize_drops) \ + _ (0x8780, switch_security_violation_packets) \ + _ (0x5118, fc_crc_errors) \ + _ (0x241c, fc_rx_drops) \ + _ (0x2424, fc_last_error_count) \ + _ (0x2428, fcoe_rx_packets) \ + _ (0x242c, fcoe_rx_dwords) \ + _ (0x8784, fcoe_tx_packets) \ + _ (0x8788, fcoe_tx_dwords) \ + _ (0x1030, queue_0_rx_count) \ + _ (0x1430, queue_0_drop_count) \ + _ (0x1070, queue_1_rx_count) \ + _ (0x1470, queue_1_drop_count) \ + _ (0x10b0, queue_2_rx_count) \ + _ (0x14b0, queue_2_drop_count) \ + _ (0x10f0, queue_3_rx_count) \ + _ (0x14f0, queue_3_drop_count) \ + _ (0x1130, queue_4_rx_count) \ + _ (0x1530, queue_4_drop_count) \ + _ (0x1170, queue_5_rx_count) \ + _ (0x1570, queue_5_drop_count) \ + _ (0x11b0, queue_6_rx_count) \ + _ (0x15b0, queue_6_drop_count) \ + _ (0x11f0, queue_7_rx_count) \ + _ (0x15f0, queue_7_drop_count) \ + _ (0x1230, queue_8_rx_count) \ + _ (0x1630, queue_8_drop_count) \ + _ (0x1270, queue_9_rx_count) \ + _ (0x1270, queue_9_drop_count) + + + + +typedef enum +{ +#define _(a,f) IXGE_COUNTER_##f, +#define _64(a,f) _(a,f) + foreach_ixge_counter +#undef _ +#undef _64 + IXGE_N_COUNTER, +} ixge_counter_type_t; + +typedef struct +{ + u32 mdio_address; + + /* 32 bit ID read from ID registers. */ + u32 id; +} ixge_phy_t; + +typedef struct +{ + /* Cache aligned descriptors. */ + ixge_descriptor_t *descriptors; + + /* Number of descriptors in table. */ + u32 n_descriptors; + + /* Software head and tail pointers into descriptor ring. */ + u32 head_index, tail_index; + + /* Index into dma_queues vector. */ + u32 queue_index; + + /* Buffer indices corresponding to each active descriptor. */ + u32 *descriptor_buffer_indices; + + union + { + struct + { + u32 *volatile head_index_write_back; + + u32 n_buffers_on_ring; + } tx; + + struct + { + /* Buffer indices to use to replenish each descriptor. */ + u32 *replenish_buffer_indices; + + vlib_node_runtime_t *node; + u32 next_index; + + u32 saved_start_of_packet_buffer_index; + + u32 saved_start_of_packet_next_index; + u32 saved_last_buffer_index; + + u32 is_start_of_packet; + + u32 n_descriptors_done_total; + + u32 n_descriptors_done_this_call; + + u32 n_bytes; + } rx; + }; +} ixge_dma_queue_t; + +#define foreach_ixge_pci_device_id \ + _ (82598, 0x10b6) \ + _ (82598_bx, 0x1508) \ + _ (82598af_dual_port, 0x10c6) \ + _ (82598af_single_port, 0x10c7) \ + _ (82598at, 0x10c8) \ + _ (82598at2, 0x150b) \ + _ (82598eb_sfp_lom, 0x10db) \ + _ (82598eb_cx4, 0x10dd) \ + _ (82598_cx4_dual_port, 0x10ec) \ + _ (82598_da_dual_port, 0x10f1) \ + _ (82598_sr_dual_port_em, 0x10e1) \ + _ (82598eb_xf_lr, 0x10f4) \ + _ (82599_kx4, 0x10f7) \ + _ (82599_kx4_mezz, 0x1514) \ + _ (82599_kr, 0x1517) \ + _ (82599_combo_backplane, 0x10f8) \ + _ (82599_cx4, 0x10f9) \ + _ (82599_sfp, 0x10fb) \ + _ (82599_backplane_fcoe, 0x152a) \ + _ (82599_sfp_fcoe, 0x1529) \ + _ (82599_sfp_em, 0x1507) \ + _ (82599_xaui_lom, 0x10fc) \ + _ (82599_t3_lom, 0x151c) \ + _ (x540t, 0x1528) + +typedef enum +{ +#define _(f,n) IXGE_##f = n, + foreach_ixge_pci_device_id +#undef _ +} ixge_pci_device_id_t; + +typedef struct +{ + /* registers */ + ixge_regs_t *regs; + + /* Specific next index when using dynamic redirection */ + u32 per_interface_next_index; + + /* PCI bus info. */ + vlib_pci_dev_handle_t pci_dev_handle; + + /* From PCI config space header. */ + ixge_pci_device_id_t device_id; + + u16 device_index; + + /* 0 or 1. */ + u16 pci_function; + + /* VLIB interface for this instance. */ + u32 vlib_hw_if_index, vlib_sw_if_index; + + ixge_dma_queue_t *dma_queues[VLIB_N_RX_TX]; + + /* Phy index (0 or 1) and address on MDI bus. */ + u32 phy_index; + ixge_phy_t phys[2]; + + /* Value of link_status register at last link change. */ + u32 link_status_at_last_link_change; + + i2c_bus_t i2c_bus; + sfp_eeprom_t sfp_eeprom; + + /* Counters. */ + u64 counters[IXGE_N_COUNTER], counters_last_clear[IXGE_N_COUNTER]; +} ixge_device_t; + +typedef struct +{ + vlib_main_t *vlib_main; + + /* Vector of devices. */ + ixge_device_t *devices; + + /* Descriptor ring sizes. */ + u32 n_descriptors[VLIB_N_RX_TX]; + + /* RX buffer size. Must be at least 1k; will be rounded to + next largest 1k size. */ + u32 n_bytes_in_rx_buffer; + + u32 n_descriptors_per_cache_line; + + u32 process_node_index; + + /* Template and mask for initializing/validating TX descriptors. */ + ixge_tx_descriptor_t tx_descriptor_template, tx_descriptor_template_mask; + + /* Vector of buffers for which TX is done and can be freed. */ + u32 *tx_buffers_pending_free; + + u32 *rx_buffers_to_add; + + f64 time_last_stats_update; + +} ixge_main_t; + +extern ixge_main_t ixge_main; +extern vnet_device_class_t ixge_device_class; + +typedef enum +{ + IXGE_RX_NEXT_IP4_INPUT, + IXGE_RX_NEXT_IP6_INPUT, + IXGE_RX_NEXT_ETHERNET_INPUT, + IXGE_RX_NEXT_DROP, + IXGE_RX_N_NEXT, +} ixge_rx_next_t; + +void ixge_set_next_node (ixge_rx_next_t, char *); + +#endif /* included_ixge_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/extras/deprecated/netmap/FEATURE.yaml b/extras/deprecated/netmap/FEATURE.yaml new file mode 100644 index 00000000000..e23e5c243e7 --- /dev/null +++ b/extras/deprecated/netmap/FEATURE.yaml @@ -0,0 +1,12 @@ +--- +name: Netmap Device +maintainer: Damjan Marion <damarion@cisco.com> +features: + - L4 checksum offload +description: "Create a netmap interface, which is a high speed user-space + interface that allows VPP to patch into a linux namespace, + a linux container, or a physical NIC without the use of DPDK." +missing: + - API dump +state: production +properties: [API, CLI, STATS, MULTITHREAD] diff --git a/extras/deprecated/netmap/cli.c b/extras/deprecated/netmap/cli.c new file mode 100644 index 00000000000..713632947a1 --- /dev/null +++ b/extras/deprecated/netmap/cli.c @@ -0,0 +1,236 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ +#include <stdint.h> +#include <net/if.h> +#include <sys/ioctl.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vnet/ethernet/ethernet.h> + +#include <vnet/devices/netmap/net_netmap.h> +#include <vnet/devices/netmap/netmap.h> + +static clib_error_t * +netmap_create_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + u8 *host_if_name = NULL; + u8 hwaddr[6]; + u8 *hw_addr_ptr = 0; + int r; + u8 is_pipe = 0; + u8 is_master = 0; + u32 sw_if_index = ~0; + clib_error_t *error = NULL; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "name %s", &host_if_name)) + ; + else + if (unformat + (line_input, "hw-addr %U", unformat_ethernet_address, hwaddr)) + hw_addr_ptr = hwaddr; + else if (unformat (line_input, "pipe")) + is_pipe = 1; + else if (unformat (line_input, "master")) + is_master = 1; + else if (unformat (line_input, "slave")) + is_master = 0; + else + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (host_if_name == NULL) + { + error = clib_error_return (0, "missing host interface name"); + goto done; + } + + r = + netmap_create_if (vm, host_if_name, hw_addr_ptr, is_pipe, is_master, + &sw_if_index); + + if (r == VNET_API_ERROR_SYSCALL_ERROR_1) + { + error = clib_error_return (0, "%s (errno %d)", strerror (errno), errno); + goto done; + } + + if (r == VNET_API_ERROR_INVALID_INTERFACE) + { + error = clib_error_return (0, "Invalid interface name"); + goto done; + } + + if (r == VNET_API_ERROR_SUBIF_ALREADY_EXISTS) + { + error = clib_error_return (0, "Interface already exists"); + goto done; + } + + vlib_cli_output (vm, "%U\n", format_vnet_sw_if_index_name, vnet_get_main (), + sw_if_index); + +done: + unformat_free (line_input); + + return error; +} + +/*? + * '<em>netmap</em>' is a framework for very fast packet I/O from userspace. + * '<em>VALE</em>' is an equally fast in-kernel software switch using the + * netmap API. '<em>netmap</em>' includes '<em>netmap pipes</em>', a shared + * memory packet transport channel. Together, they provide a high speed + * user-space interface that allows VPP to patch into a linux namespace, a + * linux container, or a physical NIC without the use of DPDK. Netmap/VALE + * generates the '<em>netmap.ko</em>' kernel module that needs to be loaded + * before netmap interfaces can be created. + * - https://github.com/luigirizzo/netmap - Netmap/VALE repo. + * - https://github.com/vpp-dev/netmap - VPP development package for Netmap/VALE, + * which is a snapshot of the Netmap/VALE repo with minor changes to work + * with containers and modified kernel drivers to work with NICs. + * + * Create a netmap interface that will attach to a linux interface. + * The interface must already exist. Once created, a new netmap interface + * will exist in VPP with the name '<em>netmap-<ifname></em>', where + * '<em><ifname></em>' takes one of two forms: + * - <b>ifname</b> - Linux interface to bind too. + * - <b>valeXXX:YYY</b> - + * - Where '<em>valeXXX</em>' is an arbitrary name for a VALE + * interface that must start with '<em>vale</em>' and is less + * than 16 characters. + * - Where '<em>YYY</em>' is an existing linux namespace. + * + * This command has the following optional parameters: + * + * - <b>hw-addr <mac-addr></b> - Optional ethernet address, can be in either + * X:X:X:X:X:X unix or X.X.X cisco format. + * + * - <b>pipe</b> - Optional flag to indicate that a '<em>netmap pipe</em>' + * instance should be created. + * + * - <b>master | slave</b> - Optional flag to indicate whether VPP should + * be the master or slave of the '<em>netmap pipe</em>'. Only considered + * if '<em>pipe</em>' is entered. Defaults to '<em>slave</em>' if not entered. + * + * @cliexpar + * Example of how to create a netmap interface tied to the linux + * namespace '<em>vpp1</em>': + * @cliexstart{create netmap name vale00:vpp1 hw-addr 02:FE:3F:34:15:9B pipe master} + * netmap-vale00:vpp1 + * @cliexend + * Once the netmap interface is created, enable the interface using: + * @cliexcmd{set interface state netmap-vale00:vpp1 up} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (netmap_create_command, static) = { + .path = "create netmap", + .short_help = "create netmap name <ifname>|valeXXX:YYY " + "[hw-addr <mac-addr>] [pipe] [master|slave]", + .function = netmap_create_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +netmap_delete_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + u8 *host_if_name = NULL; + clib_error_t *error = NULL; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "name %s", &host_if_name)) + ; + else + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (host_if_name == NULL) + { + error = clib_error_return (0, "missing host interface name"); + goto done; + } + + netmap_delete_if (vm, host_if_name); + +done: + unformat_free (line_input); + + return error; +} + +/*? + * Delete a netmap interface. Use the '<em><ifname></em>' to identify + * the netmap interface to be deleted. In VPP, netmap interfaces are + * named as '<em>netmap-<ifname></em>', where '<em><ifname></em>' + * takes one of two forms: + * - <b>ifname</b> - Linux interface to bind too. + * - <b>valeXXX:YYY</b> - + * - Where '<em>valeXXX</em>' is an arbitrary name for a VALE + * interface that must start with '<em>vale</em>' and is less + * than 16 characters. + * - Where '<em>YYY</em>' is an existing linux namespace. + * + * @cliexpar + * Example of how to delete a netmap interface named '<em>netmap-vale00:vpp1</em>': + * @cliexcmd{delete netmap name vale00:vpp1} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (netmap_delete_command, static) = { + .path = "delete netmap", + .short_help = "delete netmap name <ifname>|valeXXX:YYY", + .function = netmap_delete_command_fn, +}; +/* *INDENT-ON* */ + +clib_error_t * +netmap_cli_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (netmap_cli_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/extras/deprecated/netmap/device.c b/extras/deprecated/netmap/device.c new file mode 100644 index 00000000000..47407aaaa26 --- /dev/null +++ b/extras/deprecated/netmap/device.c @@ -0,0 +1,252 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <stdint.h> +#include <net/if.h> +#include <sys/ioctl.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vnet/ethernet/ethernet.h> + +#include <vnet/devices/netmap/net_netmap.h> +#include <vnet/devices/netmap/netmap.h> + +#define foreach_netmap_tx_func_error \ +_(NO_FREE_SLOTS, "no free tx slots") \ +_(PENDING_MSGS, "pending msgs in tx ring") + +typedef enum +{ +#define _(f,s) NETMAP_TX_ERROR_##f, + foreach_netmap_tx_func_error +#undef _ + NETMAP_TX_N_ERROR, +} netmap_tx_func_error_t; + +static char *netmap_tx_func_error_strings[] = { +#define _(n,s) s, + foreach_netmap_tx_func_error +#undef _ +}; + + +static u8 * +format_netmap_device_name (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + netmap_main_t *apm = &netmap_main; + netmap_if_t *nif = pool_elt_at_index (apm->interfaces, i); + + s = format (s, "netmap-%s", nif->host_if_name); + return s; +} + +static u8 * +format_netmap_device (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + int verbose = va_arg (*args, int); + netmap_main_t *nm = &netmap_main; + netmap_if_t *nif = vec_elt_at_index (nm->interfaces, dev_instance); + u32 indent = format_get_indent (s); + + s = format (s, "NETMAP interface"); + if (verbose) + { + s = format (s, "\n%U version %d flags 0x%x" + "\n%U region %u memsize 0x%x offset 0x%x" + "\n%U tx_slots %u rx_slots %u tx_rings %u rx_rings %u", + format_white_space, indent + 2, + nif->req->nr_version, + nif->req->nr_flags, + format_white_space, indent + 2, + nif->mem_region, + nif->req->nr_memsize, + nif->req->nr_offset, + format_white_space, indent + 2, + nif->req->nr_tx_slots, + nif->req->nr_rx_slots, + nif->req->nr_tx_rings, nif->req->nr_rx_rings); + } + return s; +} + +static u8 * +format_netmap_tx_trace (u8 * s, va_list * args) +{ + s = format (s, "Unimplemented..."); + return s; +} + +VNET_DEVICE_CLASS_TX_FN (netmap_device_class) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + netmap_main_t *nm = &netmap_main; + u32 *buffers = vlib_frame_vector_args (frame); + u32 n_left = frame->n_vectors; + f64 const time_constant = 1e3; + vnet_interface_output_runtime_t *rd = (void *) node->runtime_data; + netmap_if_t *nif = pool_elt_at_index (nm->interfaces, rd->dev_instance); + int cur_ring; + + clib_spinlock_lock_if_init (&nif->lockp); + + cur_ring = nif->first_tx_ring; + + while (n_left && cur_ring <= nif->last_tx_ring) + { + struct netmap_ring *ring = NETMAP_TXRING (nif->nifp, cur_ring); + int n_free_slots = nm_ring_space (ring); + uint cur = ring->cur; + + if (nm_tx_pending (ring)) + { + if (ioctl (nif->fd, NIOCTXSYNC, NULL) < 0) + clib_unix_warning ("NIOCTXSYNC"); + clib_cpu_time_wait (time_constant); + + if (nm_tx_pending (ring) && !n_free_slots) + { + cur_ring++; + continue; + } + } + + while (n_left && n_free_slots) + { + vlib_buffer_t *b0 = 0; + u32 bi = buffers[0]; + u32 len; + u32 offset = 0; + buffers++; + + struct netmap_slot *slot = &ring->slot[cur]; + + do + { + b0 = vlib_get_buffer (vm, bi); + len = b0->current_length; + /* memcpy */ + clib_memcpy_fast ((u8 *) NETMAP_BUF (ring, slot->buf_idx) + + offset, vlib_buffer_get_current (b0), len); + offset += len; + } + while ((bi = b0->next_buffer)); + + slot->len = offset; + cur = (cur + 1) % ring->num_slots; + n_free_slots--; + n_left--; + } + CLIB_MEMORY_BARRIER (); + ring->head = ring->cur = cur; + } + + if (n_left < frame->n_vectors) + ioctl (nif->fd, NIOCTXSYNC, NULL); + + clib_spinlock_unlock_if_init (&nif->lockp); + + if (n_left) + vlib_error_count (vm, node->node_index, + (n_left == + frame->n_vectors ? NETMAP_TX_ERROR_PENDING_MSGS : + NETMAP_TX_ERROR_NO_FREE_SLOTS), n_left); + + vlib_buffer_free (vm, vlib_frame_vector_args (frame), frame->n_vectors); + return frame->n_vectors; +} + +static void +netmap_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, + u32 node_index) +{ + netmap_main_t *apm = &netmap_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + netmap_if_t *nif = pool_elt_at_index (apm->interfaces, hw->dev_instance); + + /* Shut off redirection */ + if (node_index == ~0) + { + nif->per_interface_next_index = node_index; + return; + } + + nif->per_interface_next_index = + vlib_node_add_next (vlib_get_main (), netmap_input_node.index, + node_index); +} + +static void +netmap_clear_hw_interface_counters (u32 instance) +{ + /* Nothing for now */ +} + +static clib_error_t * +netmap_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + netmap_main_t *apm = &netmap_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + netmap_if_t *nif = pool_elt_at_index (apm->interfaces, hw->dev_instance); + u32 hw_flags; + + nif->is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + + if (nif->is_admin_up) + hw_flags = VNET_HW_INTERFACE_FLAG_LINK_UP; + else + hw_flags = 0; + + vnet_hw_interface_set_flags (vnm, hw_if_index, hw_flags); + + return 0; +} + +static clib_error_t * +netmap_subif_add_del_function (vnet_main_t * vnm, + u32 hw_if_index, + struct vnet_sw_interface_t *st, int is_add) +{ + /* Nothing for now */ + return 0; +} + +/* *INDENT-OFF* */ +VNET_DEVICE_CLASS (netmap_device_class) = { + .name = "netmap", + .format_device_name = format_netmap_device_name, + .format_device = format_netmap_device, + .format_tx_trace = format_netmap_tx_trace, + .tx_function_n_errors = NETMAP_TX_N_ERROR, + .tx_function_error_strings = netmap_tx_func_error_strings, + .rx_redirect_to_node = netmap_set_interface_next_node, + .clear_counters = netmap_clear_hw_interface_counters, + .admin_up_down_function = netmap_interface_admin_up_down, + .subif_add_del_function = netmap_subif_add_del_function, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/extras/deprecated/netmap/dir.dox b/extras/deprecated/netmap/dir.dox new file mode 100644 index 00000000000..7ddbf947c29 --- /dev/null +++ b/extras/deprecated/netmap/dir.dox @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Doxygen directory documentation */ + +/** +@dir +@brief netmap Interface Implementation. + +This directory contains the source code for the netmap driver. + +*/ +/*? %%clicmd:group_label netmap %% ?*/ +/*? %%syscfg:group_label netmap %% ?*/ diff --git a/extras/deprecated/netmap/net_netmap.h b/extras/deprecated/netmap/net_netmap.h new file mode 100644 index 00000000000..fd4253b7c0c --- /dev/null +++ b/extras/deprecated/netmap/net_netmap.h @@ -0,0 +1,650 @@ +/* + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``S IS''AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD: head/sys/net/netmap.h 251139 2013-05-30 14:07:14Z luigi $ + * + * Definitions of constants and the structures used by the netmap + * framework, for the part visible to both kernel and userspace. + * Detailed info on netmap is available with "man netmap" or at + * + * http://info.iet.unipi.it/~luigi/netmap/ + * + * This API is also used to communicate with the VALE software switch + */ + +#ifndef _NET_NETMAP_H_ +#define _NET_NETMAP_H_ + +#define NETMAP_API 11 /* current API version */ + +#define NETMAP_MIN_API 11 /* min and max versions accepted */ +#define NETMAP_MAX_API 15 +/* + * Some fields should be cache-aligned to reduce contention. + * The alignment is architecture and OS dependent, but rather than + * digging into OS headers to find the exact value we use an estimate + * that should cover most architectures. + */ +#define NM_CACHE_ALIGN 128 + +/* + * --- Netmap data structures --- + * + * The userspace data structures used by netmap are shown below. + * They are allocated by the kernel and mmap()ed by userspace threads. + * Pointers are implemented as memory offsets or indexes, + * so that they can be easily dereferenced in kernel and userspace. + + KERNEL (opaque, obviously) + + ==================================================================== + | + USERSPACE | struct netmap_ring + +---->+---------------+ + / | head,cur,tail | + struct netmap_if (nifp, 1 per fd) / | buf_ofs | + +---------------+ / | other fields | + | ni_tx_rings | / +===============+ + | ni_rx_rings | / | buf_idx, len | slot[0] + | | / | flags, ptr | + | | / +---------------+ + +===============+ / | buf_idx, len | slot[1] + | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | + | txring_ofs[1] | +---------------+ + (tx+1 entries) (num_slots entries) + | txring_ofs[t] | | buf_idx, len | slot[n-1] + +---------------+ | flags, ptr | + | rxring_ofs[0] | +---------------+ + | rxring_ofs[1] | + (rx+1 entries) + | rxring_ofs[r] | + +---------------+ + + * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to + * a file descriptor, the mmap()ed region contains a (logically readonly) + * struct netmap_if pointing to struct netmap_ring's. + * + * There is one netmap_ring per physical NIC ring, plus one tx/rx ring + * pair attached to the host stack (this pair is unused for non-NIC ports). + * + * All physical/host stack ports share the same memory region, + * so that zero-copy can be implemented between them. + * VALE switch ports instead have separate memory regions. + * + * The netmap_ring is the userspace-visible replica of the NIC ring. + * Each slot has the index of a buffer (MTU-sized and residing in the + * mmapped region), its length and some flags. An extra 64-bit pointer + * is provided for user-supplied buffers in the tx path. + * + * In user space, the buffer address is computed as + * (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE + * + * Added in NETMAP_API 11: + * + * + NIOCREGIF can request the allocation of extra spare buffers from + * the same memory pool. The desired number of buffers must be in + * nr_arg3. The ioctl may return fewer buffers, depending on memory + * availability. nr_arg3 will return the actual value, and, once + * mapped, nifp->ni_bufs_head will be the index of the first buffer. + * + * The buffers are linked to each other using the first uint32_t + * as the index. On close, ni_bufs_head must point to the list of + * buffers to be released. + * + * + NIOCREGIF can request space for extra rings (and buffers) + * allocated in the same memory space. The number of extra rings + * is in nr_arg1, and is advisory. This is a no-op on NICs where + * the size of the memory space is fixed. + * + * + NIOCREGIF can attach to PIPE rings sharing the same memory + * space with a parent device. The ifname indicates the parent device, + * which must already exist. Flags in nr_flags indicate if we want to + * bind the master or slave side, the index (from nr_ringid) + * is just a cookie and does not need to be sequential. + * + * + NIOCREGIF can also attach to 'monitor' rings that replicate + * the content of specific rings, also from the same memory space. + * + * Extra flags in nr_flags support the above functions. + * Application libraries may use the following naming scheme: + * netmap:foo all NIC ring pairs + * netmap:foo^ only host ring pair + * netmap:foo+ all NIC ring + host ring pairs + * netmap:foo-k the k-th NIC ring pair + * netmap:foo{k PIPE ring pair k, master side + * netmap:foo}k PIPE ring pair k, slave side + */ + +/* + * struct netmap_slot is a buffer descriptor + */ +struct netmap_slot { + uint32_t buf_idx; /* buffer index */ + uint16_t len; /* length for this slot */ + uint16_t flags; /* buf changed, etc. */ + uint64_t ptr; /* pointer for indirect buffers */ +}; + +/* + * The following flags control how the slot is used + */ + +#define NS_BUF_CHANGED 0x0001 /* buf_idx changed */ + /* + * must be set whenever buf_idx is changed (as it might be + * necessary to recompute the physical address and mapping) + * + * It is also set by the kernel whenever the buf_idx is + * changed internally (e.g., by pipes). Applications may + * use this information to know when they can reuse the + * contents of previously prepared buffers. + */ + +#define NS_REPORT 0x0002 /* ask the hardware to report results */ + /* + * Request notification when slot is used by the hardware. + * Normally transmit completions are handled lazily and + * may be unreported. This flag lets us know when a slot + * has been sent (e.g. to terminate the sender). + */ + +#define NS_FORWARD 0x0004 /* pass packet 'forward' */ + /* + * (Only for physical ports, rx rings with NR_FORWARD set). + * Slot released to the kernel (i.e. before ring->head) with + * this flag set are passed to the peer ring (host/NIC), + * thus restoring the host-NIC connection for these slots. + * This supports efficient traffic monitoring or firewalling. + */ + +#define NS_NO_LEARN 0x0008 /* disable bridge learning */ + /* + * On a VALE switch, do not 'learn' the source port for + * this buffer. + */ + +#define NS_INDIRECT 0x0010 /* userspace buffer */ + /* + * (VALE tx rings only) data is in a userspace buffer, + * whose address is in the 'ptr' field in the slot. + */ + +#define NS_MOREFRAG 0x0020 /* packet has more fragments */ + /* + * (VALE ports only) + * Set on all but the last slot of a multi-segment packet. + * The 'len' field refers to the individual fragment. + */ + +#define NS_PORT_SHIFT 8 +#define NS_PORT_MASK (0xff << NS_PORT_SHIFT) + /* + * The high 8 bits of the flag, if not zero, indicate the + * destination port for the VALE switch, overriding + * the lookup table. + */ + +#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff) + /* + * (VALE rx rings only) the high 8 bits + * are the number of fragments. + */ + + +/* + * struct netmap_ring + * + * Netmap representation of a TX or RX ring (also known as "queue"). + * This is a queue implemented as a fixed-size circular array. + * At the software level the important fields are: head, cur, tail. + * + * In TX rings: + * + * head first slot available for transmission. + * cur wakeup point. select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel + * + * [head .. tail-1] can be used for new packets to send; + * 'head' and 'cur' must be incremented as slots are filled + * with new packets to be sent; + * 'cur' can be moved further ahead if we need more space + * for new transmissions. XXX todo (2014-03-12) + * + * In RX rings: + * + * head first valid received packet + * cur wakeup point. select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel + * + * [head .. tail-1] contain received packets; + * 'head' and 'cur' must be incremented as slots are consumed + * and can be returned to the kernel; + * 'cur' can be moved further ahead if we want to wait for + * new packets without returning the previous ones. + * + * DATA OWNERSHIP/LOCKING: + * The netmap_ring, and all slots and buffers in the range + * [head .. tail-1] are owned by the user program; + * the kernel only accesses them during a netmap system call + * and in the user thread context. + * + * Other slots and buffers are reserved for use by the kernel + */ +struct netmap_ring { + /* + * buf_ofs is meant to be used through macros. + * It contains the offset of the buffer region from this + * descriptor. + */ + const int64_t buf_ofs; + const uint32_t num_slots; /* number of slots in the ring. */ + const uint32_t nr_buf_size; + const uint16_t ringid; + const uint16_t dir; /* 0: tx, 1: rx */ + + uint32_t head; /* (u) first user slot */ + uint32_t cur; /* (u) wakeup point */ + uint32_t tail; /* (k) first kernel slot */ + + uint32_t flags; + + struct timeval ts; /* (k) time of last *sync() */ + + /* opaque room for a mutex or similar object */ +#if !defined(_WIN32) || defined(__CYGWIN__) + uint8_t __attribute__((__aligned__(NM_CACHE_ALIGN))) sem[128]; +#else + uint8_t __declspec(align(NM_CACHE_ALIGN)) sem[128]; +#endif + + /* the slots follow. This struct has variable size */ + struct netmap_slot slot[0]; /* array of slots. */ +}; + + +/* + * RING FLAGS + */ +#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ + /* + * updates the 'ts' field on each netmap syscall. This saves + * saves a separate gettimeofday(), and is not much worse than + * software timestamps generated in the interrupt handler. + */ + +#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */ + /* + * Enables the NS_FORWARD slot flag for the ring. + */ + + +/* + * Netmap representation of an interface and its queue(s). + * This is initialized by the kernel when binding a file + * descriptor to a port, and should be considered as readonly + * by user programs. The kernel never uses it. + * + * There is one netmap_if for each file descriptor on which we want + * to select/poll. + * select/poll operates on one or all pairs depending on the value of + * nmr_queueid passed on the ioctl. + */ +struct netmap_if { + char ni_name[IFNAMSIZ]; /* name of the interface. */ + const uint32_t ni_version; /* API version, currently unused */ + const uint32_t ni_flags; /* properties */ +#define NI_PRIV_MEM 0x1 /* private memory region */ + + /* + * The number of packet rings available in netmap mode. + * Physical NICs can have different numbers of tx and rx rings. + * Physical NICs also have a 'host' ring pair. + * Additionally, clients can request additional ring pairs to + * be used for internal communication. + */ + const uint32_t ni_tx_rings; /* number of HW tx rings */ + const uint32_t ni_rx_rings; /* number of HW rx rings */ + + uint32_t ni_bufs_head; /* head index for extra bufs */ + uint32_t ni_spare1[5]; + /* + * The following array contains the offset of each netmap ring + * from this structure, in the following order: + * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings; + * NIC rx rings (ni_rx_rings); host tx ring (1); extra rx rings. + * + * The area is filled up by the kernel on NIOCREGIF, + * and then only read by userspace code. + */ + const ssize_t ring_ofs[0]; +}; + + +#ifndef NIOCREGIF +/* + * ioctl names and related fields + * + * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, + * whose identity is set in NIOCREGIF through nr_ringid. + * These are non blocking and take no argument. + * + * NIOCGINFO takes a struct ifreq, the interface name is the input, + * the outputs are number of queues and number of descriptor + * for each queue (useful to set number of threads etc.). + * The info returned is only advisory and may change before + * the interface is bound to a file descriptor. + * + * NIOCREGIF takes an interface name within a struct nmre, + * and activates netmap mode on the interface (if possible). + * + * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we + * can pass it down to other NIC-related ioctls. + * + * The actual argument (struct nmreq) has a number of options to request + * different functions. + * The following are used in NIOCREGIF when nr_cmd == 0: + * + * nr_name (in) + * The name of the port (em0, valeXXX:YYY, etc.) + * limited to IFNAMSIZ for backward compatibility. + * + * nr_version (in/out) + * Must match NETMAP_API as used in the kernel, error otherwise. + * Always returns the desired value on output. + * + * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings (in/out) + * On input, non-zero values may be used to reconfigure the port + * according to the requested values, but this is not guaranteed. + * On output the actual values in use are reported. + * + * nr_ringid (in) + * Indicates how rings should be bound to the file descriptors. + * If nr_flags != 0, then the low bits (in NETMAP_RING_MASK) + * are used to indicate the ring number, and nr_flags specifies + * the actual rings to bind. NETMAP_NO_TX_POLL is unaffected. + * + * NOTE: THE FOLLOWING (nr_flags == 0) IS DEPRECATED: + * If nr_flags == 0, NETMAP_HW_RING and NETMAP_SW_RING control + * the binding as follows: + * 0 (default) binds all physical rings + * NETMAP_HW_RING | ring number binds a single ring pair + * NETMAP_SW_RING binds only the host tx/rx rings + * + * NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push + * packets on tx rings only if POLLOUT is set. + * The default is to push any pending packet. + * + * NETMAP_DO_RX_POLL can be OR-ed to make select()/poll() release + * packets on rx rings also when POLLIN is NOT set. + * The default is to touch the rx ring only with POLLIN. + * Note that this is the opposite of TX because it + * reflects the common usage. + * + * NOTE: NETMAP_PRIV_MEM IS DEPRECATED, use nr_arg2 instead. + * NETMAP_PRIV_MEM is set on return for ports that do not use + * the global memory allocator. + * This information is not significant and applications + * should look at the region id in nr_arg2 + * + * nr_flags is the recommended mode to indicate which rings should + * be bound to a file descriptor. Values are NR_REG_* + * + * nr_arg1 (in) The number of extra rings to be reserved. + * Especially when allocating a VALE port the system only + * allocates the amount of memory needed for the port. + * If more shared memory rings are desired (e.g. for pipes), + * the first invocation for the same basename/allocator + * should specify a suitable number. Memory cannot be + * extended after the first allocation without closing + * all ports on the same region. + * + * nr_arg2 (in/out) The identity of the memory region used. + * On input, 0 means the system decides autonomously, + * other values may try to select a specific region. + * On return the actual value is reported. + * Region '1' is the global allocator, normally shared + * by all interfaces. Other values are private regions. + * If two ports the same region zero-copy is possible. + * + * nr_arg3 (in/out) number of extra buffers to be allocated. + * + * + * + * nr_cmd (in) if non-zero indicates a special command: + * NETMAP_BDG_ATTACH and nr_name = vale*:ifname + * attaches the NIC to the switch; nr_ringid specifies + * which rings to use. Used by vale-ctl -a ... + * nr_arg1 = NETMAP_BDG_HOST also attaches the host port + * as in vale-ctl -h ... + * + * NETMAP_BDG_DETACH and nr_name = vale*:ifname + * disconnects a previously attached NIC. + * Used by vale-ctl -d ... + * + * NETMAP_BDG_LIST + * list the configuration of VALE switches. + * + * NETMAP_BDG_VNET_HDR + * Set the virtio-net header length used by the client + * of a VALE switch port. + * + * NETMAP_BDG_NEWIF + * create a persistent VALE port with name nr_name. + * Used by vale-ctl -n ... + * + * NETMAP_BDG_DELIF + * delete a persistent VALE port. Used by vale-ctl -d ... + * + * nr_arg1, nr_arg2, nr_arg3 (in/out) command specific + * + * + * + */ + + +/* + * struct nmreq overlays a struct ifreq (just the name) + */ +struct nmreq { + char nr_name[IFNAMSIZ]; + uint32_t nr_version; /* API version */ + uint32_t nr_offset; /* nifp offset in the shared region */ + uint32_t nr_memsize; /* size of the shared region */ + uint32_t nr_tx_slots; /* slots in tx rings */ + uint32_t nr_rx_slots; /* slots in rx rings */ + uint16_t nr_tx_rings; /* number of tx rings */ + uint16_t nr_rx_rings; /* number of rx rings */ + + uint16_t nr_ringid; /* ring(s) we care about */ +#define NETMAP_HW_RING 0x4000 /* single NIC ring pair */ +#define NETMAP_SW_RING 0x2000 /* only host ring pair */ + +#define NETMAP_RING_MASK 0x0fff /* the ring number */ + +#define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */ + +#define NETMAP_DO_RX_POLL 0x8000 /* DO automatic rxsync on poll */ + + uint16_t nr_cmd; +#define NETMAP_BDG_ATTACH 1 /* attach the NIC */ +#define NETMAP_BDG_DETACH 2 /* detach the NIC */ +#define NETMAP_BDG_REGOPS 3 /* register bridge callbacks */ +#define NETMAP_BDG_LIST 4 /* get bridge's info */ +#define NETMAP_BDG_VNET_HDR 5 /* set the port virtio-net-hdr length */ +#define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */ +#define NETMAP_BDG_NEWIF 6 /* create a virtual port */ +#define NETMAP_BDG_DELIF 7 /* destroy a virtual port */ +#define NETMAP_PT_HOST_CREATE 8 /* create ptnetmap kthreads */ +#define NETMAP_PT_HOST_DELETE 9 /* delete ptnetmap kthreads */ +#define NETMAP_BDG_POLLING_ON 10 /* delete polling kthread */ +#define NETMAP_BDG_POLLING_OFF 11 /* delete polling kthread */ +#define NETMAP_VNET_HDR_GET 12 /* get the port virtio-net-hdr length */ + uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */ +#define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */ + + uint16_t nr_arg2; + uint32_t nr_arg3; /* req. extra buffers in NIOCREGIF */ + uint32_t nr_flags; + /* various modes, extends nr_ringid */ + uint32_t spare2[1]; +}; + +#define NR_REG_MASK 0xf /* values for nr_flags */ +enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */ + NR_REG_ALL_NIC = 1, + NR_REG_SW = 2, + NR_REG_NIC_SW = 3, + NR_REG_ONE_NIC = 4, + NR_REG_PIPE_MASTER = 5, + NR_REG_PIPE_SLAVE = 6, +}; +/* monitor uses the NR_REG to select the rings to monitor */ +#define NR_MONITOR_TX 0x100 +#define NR_MONITOR_RX 0x200 +#define NR_ZCOPY_MON 0x400 +/* request exclusive access to the selected rings */ +#define NR_EXCLUSIVE 0x800 +/* request ptnetmap host support */ +#define NR_PASSTHROUGH_HOST NR_PTNETMAP_HOST /* deprecated */ +#define NR_PTNETMAP_HOST 0x1000 +#define NR_RX_RINGS_ONLY 0x2000 +#define NR_TX_RINGS_ONLY 0x4000 +/* Applications set this flag if they are able to deal with virtio-net headers, + * that is send/receive frames that start with a virtio-net header. + * If not set, NIOCREGIF will fail with netmap ports that require applications + * to use those headers. If the flag is set, the application can use the + * NETMAP_VNET_HDR_GET command to figure out the header length. */ +#define NR_ACCEPT_VNET_HDR 0x8000 + + +/* + * Windows does not have _IOWR(). _IO(), _IOW() and _IOR() are defined + * in ws2def.h but not sure if they are in the form we need. + * XXX so we redefine them + * in a convenient way to use for DeviceIoControl signatures + */ +#ifdef _WIN32 +#undef _IO // ws2def.h +#define _WIN_NM_IOCTL_TYPE 40000 +#define _IO(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \ + METHOD_BUFFERED, FILE_ANY_ACCESS ) +#define _IO_direct(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \ + METHOD_OUT_DIRECT, FILE_ANY_ACCESS ) + +#define _IOWR(_c, _n, _s) _IO(_c, _n) + +/* We havesome internal sysctl in addition to the externally visible ones */ +#define NETMAP_MMAP _IO_direct('i', 160) // note METHOD_OUT_DIRECT +#define NETMAP_POLL _IO('i', 162) + +/* and also two setsockopt for sysctl emulation */ +#define NETMAP_SETSOCKOPT _IO('i', 140) +#define NETMAP_GETSOCKOPT _IO('i', 141) + + +//These linknames are for the Netmap Core Driver +#define NETMAP_NT_DEVICE_NAME L"\\Device\\NETMAP" +#define NETMAP_DOS_DEVICE_NAME L"\\DosDevices\\netmap" + +//Definition of a structure used to pass a virtual address within an IOCTL +typedef struct _MEMORY_ENTRY { + PVOID pUsermodeVirtualAddress; +} MEMORY_ENTRY, *PMEMORY_ENTRY; + +typedef struct _POLL_REQUEST_DATA { + int events; + int timeout; + int revents; +} POLL_REQUEST_DATA; + +#endif /* _WIN32 */ + +/* + * FreeBSD uses the size value embedded in the _IOWR to determine + * how much to copy in/out. So we need it to match the actual + * data structure we pass. We put some spares in the structure + * to ease compatibility with other versions + */ +#define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */ +#define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */ +#define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ +#define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ +#define NIOCCONFIG _IOWR('i',150, struct nm_ifreq) /* for ext. modules */ +#endif /* !NIOCREGIF */ + + +/* + * Helper functions for kernel and userspace + */ + +/* + * check if space is available in the ring. + */ +static inline int +nm_ring_empty(struct netmap_ring *ring) +{ + return (ring->cur == ring->tail); +} + +/* + * Opaque structure that is passed to an external kernel + * module via ioctl(fd, NIOCCONFIG, req) for a user-owned + * bridge port (at this point ephemeral VALE interface). + */ +#define NM_IFRDATA_LEN 256 +struct nm_ifreq { + char nifr_name[IFNAMSIZ]; + char data[NM_IFRDATA_LEN]; +}; + +/* + * netmap kernel thread configuration + */ +/* bhyve/vmm.ko MSIX parameters for IOCTL */ +struct ptn_vmm_ioctl_msix { + uint64_t msg; + uint64_t addr; +}; + +/* IOCTL parameters */ +struct nm_kth_ioctl { + u_long com; + /* TODO: use union */ + union { + struct ptn_vmm_ioctl_msix msix; + } data; +}; + +/* Configuration of a ptnetmap ring */ +struct ptnet_ring_cfg { + uint64_t ioeventfd; /* eventfd in linux, tsleep() parameter in FreeBSD */ + uint64_t irqfd; /* eventfd in linux, ioctl fd in FreeBSD */ + struct nm_kth_ioctl ioctl; /* ioctl parameter to send irq (only used in bhyve/FreeBSD) */ +}; +#endif /* _NET_NETMAP_H_ */ diff --git a/extras/deprecated/netmap/netmap.api b/extras/deprecated/netmap/netmap.api new file mode 100644 index 00000000000..a14753cad9c --- /dev/null +++ b/extras/deprecated/netmap/netmap.api @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2015-2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +option version = "1.0.0"; + +/** \brief Create netmap + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param netmap_if_name - interface name + @param hw_addr - interface MAC + @param use_random_hw_addr - use random generated MAC + @param is_pipe - is pipe + @param is_master - 0=slave, 1=master +*/ +autoreply define netmap_create +{ + u32 client_index; + u32 context; + + u8 netmap_if_name[64]; + u8 hw_addr[6]; + u8 use_random_hw_addr; + u8 is_pipe; + u8 is_master; +}; + +/** \brief Delete netmap + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param netmap_if_name - interface name +*/ +autoreply define netmap_delete +{ + u32 client_index; + u32 context; + + u8 netmap_if_name[64]; +}; + +/* + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/extras/deprecated/netmap/netmap.c b/extras/deprecated/netmap/netmap.c new file mode 100644 index 00000000000..03d96216bb0 --- /dev/null +++ b/extras/deprecated/netmap/netmap.c @@ -0,0 +1,314 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <stdint.h> +#include <net/if.h> +#include <sys/ioctl.h> +#include <sys/types.h> +#include <fcntl.h> +#include <vnet/devices/netmap/net_netmap.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/devices/netmap/netmap.h> + +netmap_main_t netmap_main; + +static u32 +netmap_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, + u32 flags) +{ + /* nothing for now */ + return 0; +} + +static clib_error_t * +netmap_fd_read_ready (clib_file_t * uf) +{ + vlib_main_t *vm = vlib_get_main (); + netmap_main_t *nm = &netmap_main; + u32 idx = uf->private_data; + + nm->pending_input_bitmap = + clib_bitmap_set (nm->pending_input_bitmap, idx, 1); + + /* Schedule the rx node */ + vlib_node_set_interrupt_pending (vm, netmap_input_node.index); + + return 0; +} + +static void +close_netmap_if (netmap_main_t * nm, netmap_if_t * nif) +{ + if (nif->clib_file_index != ~0) + { + clib_file_del (&file_main, file_main.file_pool + nif->clib_file_index); + nif->clib_file_index = ~0; + } + else if (nif->fd > -1) + close (nif->fd); + + if (nif->mem_region) + { + netmap_mem_region_t *reg = &nm->mem_regions[nif->mem_region]; + if (--reg->refcnt == 0) + { + munmap (reg->mem, reg->region_size); + reg->region_size = 0; + } + } + + + mhash_unset (&nm->if_index_by_host_if_name, nif->host_if_name, + &nif->if_index); + vec_free (nif->host_if_name); + vec_free (nif->req); + + clib_memset (nif, 0, sizeof (*nif)); + pool_put (nm->interfaces, nif); +} + +int +netmap_worker_thread_enable () +{ + /* if worker threads are enabled, switch to polling mode */ + foreach_vlib_main (( + { + vlib_node_set_state (this_vlib_main, + netmap_input_node.index, + VLIB_NODE_STATE_POLLING); + })); + + return 0; +} + +int +netmap_worker_thread_disable () +{ + foreach_vlib_main (( + { + vlib_node_set_state (this_vlib_main, + netmap_input_node.index, + VLIB_NODE_STATE_INTERRUPT); + })); + + return 0; +} + +int +netmap_create_if (vlib_main_t * vm, u8 * if_name, u8 * hw_addr_set, + u8 is_pipe, u8 is_master, u32 * sw_if_index) +{ + netmap_main_t *nm = &netmap_main; + int ret = 0; + netmap_if_t *nif = 0; + u8 hw_addr[6]; + clib_error_t *error = 0; + vnet_sw_interface_t *sw; + vnet_main_t *vnm = vnet_get_main (); + uword *p; + struct nmreq *req = 0; + netmap_mem_region_t *reg; + vlib_thread_main_t *tm = vlib_get_thread_main (); + int fd; + + p = mhash_get (&nm->if_index_by_host_if_name, if_name); + if (p) + return VNET_API_ERROR_SUBIF_ALREADY_EXISTS; + + fd = open ("/dev/netmap", O_RDWR); + if (fd < 0) + return VNET_API_ERROR_SUBIF_ALREADY_EXISTS; + + pool_get (nm->interfaces, nif); + nif->if_index = nif - nm->interfaces; + nif->fd = fd; + nif->clib_file_index = ~0; + + vec_validate (req, 0); + nif->req = req; + req->nr_version = NETMAP_API; + req->nr_flags = NR_REG_ALL_NIC; + + if (is_pipe) + req->nr_flags = is_master ? NR_REG_PIPE_MASTER : NR_REG_PIPE_SLAVE; + else + req->nr_flags = NR_REG_ALL_NIC; + + req->nr_flags |= NR_ACCEPT_VNET_HDR; + snprintf (req->nr_name, IFNAMSIZ, "%s", if_name); + req->nr_name[IFNAMSIZ - 1] = 0; + + if (ioctl (nif->fd, NIOCREGIF, req)) + { + ret = VNET_API_ERROR_NOT_CONNECTED; + goto error; + } + + nif->mem_region = req->nr_arg2; + vec_validate (nm->mem_regions, nif->mem_region); + reg = &nm->mem_regions[nif->mem_region]; + if (reg->region_size == 0) + { + reg->mem = mmap (NULL, req->nr_memsize, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + clib_warning ("mem %p", reg->mem); + if (reg->mem == MAP_FAILED) + { + ret = VNET_API_ERROR_NOT_CONNECTED; + goto error; + } + reg->region_size = req->nr_memsize; + } + reg->refcnt++; + + nif->nifp = NETMAP_IF (reg->mem, req->nr_offset); + nif->first_rx_ring = 0; + nif->last_rx_ring = 0; + nif->first_tx_ring = 0; + nif->last_tx_ring = 0; + nif->host_if_name = if_name; + nif->per_interface_next_index = ~0; + + if (tm->n_vlib_mains > 1) + clib_spinlock_init (&nif->lockp); + + { + clib_file_t template = { 0 }; + template.read_function = netmap_fd_read_ready; + template.file_descriptor = nif->fd; + template.private_data = nif->if_index; + nif->clib_file_index = clib_file_add (&file_main, &template); + } + + /*use configured or generate random MAC address */ + if (hw_addr_set) + memcpy (hw_addr, hw_addr_set, 6); + else + { + f64 now = vlib_time_now (vm); + u32 rnd; + rnd = (u32) (now * 1e6); + rnd = random_u32 (&rnd); + + memcpy (hw_addr + 2, &rnd, sizeof (rnd)); + hw_addr[0] = 2; + hw_addr[1] = 0xfe; + } + + error = ethernet_register_interface (vnm, netmap_device_class.index, + nif->if_index, hw_addr, + &nif->hw_if_index, + netmap_eth_flag_change); + + if (error) + { + clib_error_report (error); + ret = VNET_API_ERROR_SYSCALL_ERROR_1; + goto error; + } + + sw = vnet_get_hw_sw_interface (vnm, nif->hw_if_index); + nif->sw_if_index = sw->sw_if_index; + + mhash_set_mem (&nm->if_index_by_host_if_name, if_name, &nif->if_index, 0); + + if (sw_if_index) + *sw_if_index = nif->sw_if_index; + + if (tm->n_vlib_mains > 1 && pool_elts (nm->interfaces) == 1) + netmap_worker_thread_enable (); + + return 0; + +error: + close_netmap_if (nm, nif); + return ret; +} + +int +netmap_delete_if (vlib_main_t * vm, u8 * host_if_name) +{ + vnet_main_t *vnm = vnet_get_main (); + netmap_main_t *nm = &netmap_main; + netmap_if_t *nif; + uword *p; + vlib_thread_main_t *tm = vlib_get_thread_main (); + + p = mhash_get (&nm->if_index_by_host_if_name, host_if_name); + if (p == NULL) + { + clib_warning ("Host interface %s does not exist", host_if_name); + return VNET_API_ERROR_SYSCALL_ERROR_1; + } + nif = pool_elt_at_index (nm->interfaces, p[0]); + + /* bring down the interface */ + vnet_hw_interface_set_flags (vnm, nif->hw_if_index, 0); + + ethernet_delete_interface (vnm, nif->hw_if_index); + + close_netmap_if (nm, nif); + + if (tm->n_vlib_mains > 1 && pool_elts (nm->interfaces) == 0) + netmap_worker_thread_disable (); + + return 0; +} + +static clib_error_t * +netmap_init (vlib_main_t * vm) +{ + netmap_main_t *nm = &netmap_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_thread_registration_t *tr; + uword *p; + + clib_memset (nm, 0, sizeof (netmap_main_t)); + + nm->input_cpu_first_index = 0; + nm->input_cpu_count = 1; + + /* find out which cpus will be used for input */ + p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + tr = p ? (vlib_thread_registration_t *) p[0] : 0; + + if (tr && tr->count > 0) + { + nm->input_cpu_first_index = tr->first_index; + nm->input_cpu_count = tr->count; + } + + mhash_init_vec_string (&nm->if_index_by_host_if_name, sizeof (uword)); + + vec_validate_aligned (nm->rx_buffers, tm->n_vlib_mains - 1, + CLIB_CACHE_LINE_BYTES); + + return 0; +} + +VLIB_INIT_FUNCTION (netmap_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/extras/deprecated/netmap/netmap.h b/extras/deprecated/netmap/netmap.h new file mode 100644 index 00000000000..29f855fda8e --- /dev/null +++ b/extras/deprecated/netmap/netmap.h @@ -0,0 +1,166 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ +/* + * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <vppinfra/lock.h> + +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + clib_spinlock_t lockp; + u8 *host_if_name; + uword if_index; + u32 hw_if_index; + u32 sw_if_index; + u32 clib_file_index; + + u32 per_interface_next_index; + u8 is_admin_up; + + /* netmap */ + struct nmreq *req; + u16 mem_region; + int fd; + struct netmap_if *nifp; + u16 first_tx_ring; + u16 last_tx_ring; + u16 first_rx_ring; + u16 last_rx_ring; + +} netmap_if_t; + +typedef struct +{ + char *mem; + u32 region_size; + int refcnt; +} netmap_mem_region_t; + +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + netmap_if_t *interfaces; + + /* bitmap of pending rx interfaces */ + uword *pending_input_bitmap; + + /* rx buffer cache */ + u32 **rx_buffers; + + /* hash of host interface names */ + mhash_t if_index_by_host_if_name; + + /* vector of memory regions */ + netmap_mem_region_t *mem_regions; + + /* first cpu index */ + u32 input_cpu_first_index; + + /* total cpu count */ + u32 input_cpu_count; +} netmap_main_t; + +extern netmap_main_t netmap_main; +extern vnet_device_class_t netmap_device_class; +extern vlib_node_registration_t netmap_input_node; + +int netmap_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set, + u8 is_pipe, u8 is_master, u32 * sw_if_index); +int netmap_delete_if (vlib_main_t * vm, u8 * host_if_name); + + +/* Macros and helper functions from sys/net/netmap_user.h */ + +#ifdef _NET_NETMAP_H_ + +#define _NETMAP_OFFSET(type, ptr, offset) \ + ((type)(void *)((char *)(ptr) + (offset))) + +#define NETMAP_IF(_base, _ofs) _NETMAP_OFFSET(struct netmap_if *, _base, _ofs) + +#define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \ + nifp, (nifp)->ring_ofs[index] ) + +#define NETMAP_RXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \ + nifp, (nifp)->ring_ofs[index + (nifp)->ni_tx_rings + 1] ) + +#define NETMAP_BUF(ring, index) \ + ((char *)(ring) + (ring)->buf_ofs + ((index)*(ring)->nr_buf_size)) + +#define NETMAP_BUF_IDX(ring, buf) \ + ( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \ + (ring)->nr_buf_size ) + +static inline uint32_t +nm_ring_next (struct netmap_ring *ring, uint32_t i) +{ + return (PREDICT_FALSE (i + 1 == ring->num_slots) ? 0 : i + 1); +} + + +/* + * Return 1 if we have pending transmissions in the tx ring. + * When everything is complete ring->head = ring->tail + 1 (modulo ring size) + */ +static inline int +nm_tx_pending (struct netmap_ring *ring) +{ + return nm_ring_next (ring, ring->tail) != ring->head; +} + +static inline uint32_t +nm_ring_space (struct netmap_ring *ring) +{ + int ret = ring->tail - ring->cur; + if (ret < 0) + ret += ring->num_slots; + return ret; +} +#endif + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/extras/deprecated/netmap/netmap_api.c b/extras/deprecated/netmap/netmap_api.c new file mode 100644 index 00000000000..ee05ec22d25 --- /dev/null +++ b/extras/deprecated/netmap/netmap_api.c @@ -0,0 +1,137 @@ +/* + *------------------------------------------------------------------ + * netmap_api.c - netmap api + * + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <vnet/vnet.h> +#include <vlibmemory/api.h> + +#include <vnet/interface.h> +#include <vnet/api_errno.h> +#include <vnet/devices/netmap/netmap.h> + +#include <vnet/vnet_msg_enum.h> + +#define vl_typedefs /* define message structures */ +#include <vnet/vnet_all_api_h.h> +#undef vl_typedefs + +#define vl_endianfun /* define message structures */ +#include <vnet/vnet_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <vnet/vnet_all_api_h.h> +#undef vl_printfun + +#include <vlibapi/api_helper_macros.h> + +#define foreach_vpe_api_msg \ +_(NETMAP_CREATE, netmap_create) \ +_(NETMAP_DELETE, netmap_delete) \ + +static void +vl_api_netmap_create_t_handler (vl_api_netmap_create_t * mp) +{ + vlib_main_t *vm = vlib_get_main (); + vl_api_netmap_create_reply_t *rmp; + int rv = 0; + u8 *if_name = NULL; + + if_name = format (0, "%s", mp->netmap_if_name); + vec_add1 (if_name, 0); + + rv = + netmap_create_if (vm, if_name, mp->use_random_hw_addr ? 0 : mp->hw_addr, + mp->is_pipe, mp->is_master, 0); + + vec_free (if_name); + + REPLY_MACRO (VL_API_NETMAP_CREATE_REPLY); +} + +static void +vl_api_netmap_delete_t_handler (vl_api_netmap_delete_t * mp) +{ + vlib_main_t *vm = vlib_get_main (); + vl_api_netmap_delete_reply_t *rmp; + int rv = 0; + u8 *if_name = NULL; + + if_name = format (0, "%s", mp->netmap_if_name); + vec_add1 (if_name, 0); + + rv = netmap_delete_if (vm, if_name); + + vec_free (if_name); + + REPLY_MACRO (VL_API_NETMAP_DELETE_REPLY); +} + +/* + * netmap_api_hookup + * Add vpe's API message handlers to the table. + * vlib has already mapped shared memory and + * added the client registration handlers. + * See .../vlib-api/vlibmemory/memclnt_vlib.c:memclnt_process() + */ +#define vl_msg_name_crc_list +#include <vnet/vnet_all_api_h.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (api_main_t * am) +{ +#define _(id,n,crc) vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id); + foreach_vl_msg_name_crc_netmap; +#undef _ +} + +static clib_error_t * +netmap_api_hookup (vlib_main_t * vm) +{ + api_main_t *am = vlibapi_get_main (); + +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_msg; +#undef _ + + /* + * Set up the (msg_name, crc, message-id) table + */ + setup_message_id_table (am); + + return 0; +} + +VLIB_API_INIT_FUNCTION (netmap_api_hookup); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/extras/deprecated/netmap/node.c b/extras/deprecated/netmap/node.c new file mode 100644 index 00000000000..bc55ecc8eb1 --- /dev/null +++ b/extras/deprecated/netmap/node.c @@ -0,0 +1,298 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <stdint.h> +#include <net/if.h> +#include <sys/ioctl.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/devices/devices.h> +#include <vnet/feature/feature.h> + +#include <vnet/devices/netmap/net_netmap.h> +#include <vnet/devices/netmap/netmap.h> + +#define foreach_netmap_input_error + +typedef enum +{ +#define _(f,s) NETMAP_INPUT_ERROR_##f, + foreach_netmap_input_error +#undef _ + NETMAP_INPUT_N_ERROR, +} netmap_input_error_t; + +static char *netmap_input_error_strings[] = { +#define _(n,s) s, + foreach_netmap_input_error +#undef _ +}; + +typedef struct +{ + u32 next_index; + u32 hw_if_index; + struct netmap_slot slot; +} netmap_input_trace_t; + +static u8 * +format_netmap_input_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + netmap_input_trace_t *t = va_arg (*args, netmap_input_trace_t *); + u32 indent = format_get_indent (s); + + s = format (s, "netmap: hw_if_index %d next-index %d", + t->hw_if_index, t->next_index); + s = format (s, "\n%Uslot: flags 0x%x len %u buf_idx %u", + format_white_space, indent + 2, + t->slot.flags, t->slot.len, t->slot.buf_idx); + return s; +} + +always_inline void +buffer_add_to_chain (vlib_main_t * vm, u32 bi, u32 first_bi, u32 prev_bi) +{ + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + vlib_buffer_t *first_b = vlib_get_buffer (vm, first_bi); + vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_bi); + + /* update first buffer */ + first_b->total_length_not_including_first_buffer += b->current_length; + + /* update previous buffer */ + prev_b->next_buffer = bi; + prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT; + + /* update current buffer */ + b->next_buffer = 0; +} + +always_inline uword +netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame, netmap_if_t * nif) +{ + u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; + uword n_trace = vlib_get_trace_count (vm, node); + netmap_main_t *nm = &netmap_main; + u32 n_rx_packets = 0; + u32 n_rx_bytes = 0; + u32 *to_next = 0; + u32 n_free_bufs; + struct netmap_ring *ring; + int cur_ring; + u32 thread_index = vm->thread_index; + u32 n_buffer_bytes = vlib_buffer_get_default_data_size (vm); + + if (nif->per_interface_next_index != ~0) + next_index = nif->per_interface_next_index; + + n_free_bufs = vec_len (nm->rx_buffers[thread_index]); + if (PREDICT_FALSE (n_free_bufs < VLIB_FRAME_SIZE)) + { + vec_validate (nm->rx_buffers[thread_index], + VLIB_FRAME_SIZE + n_free_bufs - 1); + n_free_bufs += + vlib_buffer_alloc (vm, &nm->rx_buffers[thread_index][n_free_bufs], + VLIB_FRAME_SIZE); + _vec_len (nm->rx_buffers[thread_index]) = n_free_bufs; + } + + cur_ring = nif->first_rx_ring; + while (cur_ring <= nif->last_rx_ring && n_free_bufs) + { + int r = 0; + u32 cur_slot_index; + ring = NETMAP_RXRING (nif->nifp, cur_ring); + r = nm_ring_space (ring); + + if (!r) + { + cur_ring++; + continue; + } + + if (r > n_free_bufs) + r = n_free_bufs; + + cur_slot_index = ring->cur; + while (r) + { + u32 n_left_to_next; + u32 next0 = next_index; + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (r && n_left_to_next) + { + vlib_buffer_t *first_b0 = 0; + u32 offset = 0; + u32 bi0 = 0, first_bi0 = 0, prev_bi0; + u32 next_slot_index = (cur_slot_index + 1) % ring->num_slots; + u32 next2_slot_index = (cur_slot_index + 2) % ring->num_slots; + struct netmap_slot *slot = &ring->slot[cur_slot_index]; + u32 data_len = slot->len; + + /* prefetch 2 slots in advance */ + CLIB_PREFETCH (&ring->slot[next2_slot_index], + CLIB_CACHE_LINE_BYTES, LOAD); + /* prefetch start of next packet */ + CLIB_PREFETCH (NETMAP_BUF + (ring, ring->slot[next_slot_index].buf_idx), + CLIB_CACHE_LINE_BYTES, LOAD); + + while (data_len && n_free_bufs) + { + vlib_buffer_t *b0; + /* grab free buffer */ + u32 last_empty_buffer = + vec_len (nm->rx_buffers[thread_index]) - 1; + prev_bi0 = bi0; + bi0 = nm->rx_buffers[thread_index][last_empty_buffer]; + b0 = vlib_get_buffer (vm, bi0); + _vec_len (nm->rx_buffers[thread_index]) = last_empty_buffer; + n_free_bufs--; + + /* copy data */ + u32 bytes_to_copy = + data_len > n_buffer_bytes ? n_buffer_bytes : data_len; + b0->current_data = 0; + clib_memcpy_fast (vlib_buffer_get_current (b0), + (u8 *) NETMAP_BUF (ring, slot->buf_idx) + + offset, bytes_to_copy); + + /* fill buffer header */ + b0->current_length = bytes_to_copy; + + if (offset == 0) + { + b0->total_length_not_including_first_buffer = 0; + b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID; + vnet_buffer (b0)->sw_if_index[VLIB_RX] = + nif->sw_if_index; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + first_bi0 = bi0; + first_b0 = vlib_get_buffer (vm, first_bi0); + } + else + buffer_add_to_chain (vm, bi0, first_bi0, prev_bi0); + + offset += bytes_to_copy; + data_len -= bytes_to_copy; + } + + /* trace */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (first_b0); + if (PREDICT_FALSE (n_trace > 0)) + { + if (PREDICT_TRUE (first_b0 != 0)) + { + netmap_input_trace_t *tr; + vlib_trace_buffer (vm, node, next0, first_b0, + /* follow_chain */ 0); + vlib_set_trace_count (vm, node, --n_trace); + tr = vlib_add_trace (vm, node, first_b0, sizeof (*tr)); + tr->next_index = next0; + tr->hw_if_index = nif->hw_if_index; + memcpy (&tr->slot, slot, sizeof (struct netmap_slot)); + } + } + + /* redirect if feature path enabled */ + vnet_feature_start_device_input_x1 (nif->sw_if_index, &next0, + first_b0); + + /* enque and take next packet */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, first_bi0, + next0); + + /* next packet */ + n_rx_packets++; + n_rx_bytes += slot->len; + to_next[0] = first_bi0; + to_next += 1; + n_left_to_next--; + cur_slot_index = next_slot_index; + + r--; + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + ring->head = ring->cur = cur_slot_index; + cur_ring++; + } + + if (n_rx_packets) + ioctl (nif->fd, NIOCRXSYNC, NULL); + + vlib_increment_combined_counter + (vnet_get_main ()->interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + vlib_get_thread_index (), nif->hw_if_index, n_rx_packets, n_rx_bytes); + + vnet_device_increment_rx_packets (thread_index, n_rx_packets); + + return n_rx_packets; +} + +VLIB_NODE_FN (netmap_input_node) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + int i; + u32 n_rx_packets = 0; + u32 thread_index = vm->thread_index; + netmap_main_t *nm = &netmap_main; + netmap_if_t *nmi; + + for (i = 0; i < vec_len (nm->interfaces); i++) + { + nmi = vec_elt_at_index (nm->interfaces, i); + if (nmi->is_admin_up && + (i % nm->input_cpu_count) == + (thread_index - nm->input_cpu_first_index)) + n_rx_packets += netmap_device_input_fn (vm, node, frame, nmi); + } + + return n_rx_packets; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (netmap_input_node) = { + .name = "netmap-input", + .sibling_of = "device-input", + .flags = VLIB_NODE_FLAG_TRACE_SUPPORTED, + .format_trace = format_netmap_input_trace, + .type = VLIB_NODE_TYPE_INPUT, + /* default state is INTERRUPT mode, switch to POLLING if worker threads are enabled */ + .state = VLIB_NODE_STATE_INTERRUPT, + .n_errors = NETMAP_INPUT_N_ERROR, + .error_strings = netmap_input_error_strings, +}; +/* *INDENT-ON* */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ |