diff options
Diffstat (limited to 'examples/tep_termination')
-rw-r--r-- | examples/tep_termination/Makefile | 56 | ||||
-rw-r--r-- | examples/tep_termination/main.c | 1275 | ||||
-rw-r--r-- | examples/tep_termination/main.h | 129 | ||||
-rw-r--r-- | examples/tep_termination/vxlan.c | 259 | ||||
-rw-r--r-- | examples/tep_termination/vxlan.h | 86 | ||||
-rw-r--r-- | examples/tep_termination/vxlan_setup.c | 457 | ||||
-rw-r--r-- | examples/tep_termination/vxlan_setup.h | 87 |
7 files changed, 2349 insertions, 0 deletions
diff --git a/examples/tep_termination/Makefile b/examples/tep_termination/Makefile new file mode 100644 index 00000000..448e6183 --- /dev/null +++ b/examples/tep_termination/Makefile @@ -0,0 +1,56 @@ +# BSD LICENSE +# +# Copyright(c) 2010-2015 Intel Corporation. All rights reserved. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +# Default target, can be overridden by command line or environment +RTE_TARGET ?= x86_64-native-linuxapp-gcc + +include $(RTE_SDK)/mk/rte.vars.mk + +ifneq ($(CONFIG_RTE_EXEC_ENV),"linuxapp") +$(error This application can only operate in a linuxapp environment, \ +please change the definition of the RTE_TARGET environment variable) +endif + +# binary name +APP = tep_termination + +# all source are stored in SRCS-y +SRCS-y := main.c vxlan_setup.c vxlan.c + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -D_GNU_SOURCE + +include $(RTE_SDK)/mk/rte.extapp.mk diff --git a/examples/tep_termination/main.c b/examples/tep_termination/main.c new file mode 100644 index 00000000..f97d552a --- /dev/null +++ b/examples/tep_termination/main.c @@ -0,0 +1,1275 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <arpa/inet.h> +#include <getopt.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <linux/virtio_net.h> +#include <linux/virtio_ring.h> +#include <signal.h> +#include <stdint.h> +#include <sys/eventfd.h> +#include <sys/param.h> +#include <unistd.h> + +#include <rte_atomic.h> +#include <rte_cycles.h> +#include <rte_ethdev.h> +#include <rte_log.h> +#include <rte_string_fns.h> +#include <rte_malloc.h> +#include <rte_virtio_net.h> + +#include "main.h" +#include "vxlan.h" +#include "vxlan_setup.h" + +/* the maximum number of external ports supported */ +#define MAX_SUP_PORTS 1 + +/** + * Calculate the number of buffers needed per port + */ +#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +\ + (nb_switching_cores * MAX_PKT_BURST) +\ + (nb_switching_cores * \ + RTE_TEST_TX_DESC_DEFAULT) +\ + (nb_switching_cores * MBUF_CACHE_SIZE)) + +#define MBUF_CACHE_SIZE 128 +#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) + +#define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ +#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ + +/* Defines how long we wait between retries on RX */ +#define BURST_RX_WAIT_US 15 + +#define BURST_RX_RETRIES 4 /* Number of retries on RX. */ + +#define JUMBO_FRAME_MAX_SIZE 0x2600 + +/* State of virtio device. 
*/ +#define DEVICE_MAC_LEARNING 0 +#define DEVICE_RX 1 +#define DEVICE_SAFE_REMOVE 2 + +/* Config_core_flag status definitions. */ +#define REQUEST_DEV_REMOVAL 1 +#define ACK_DEV_REMOVAL 0 + +/* Configurable number of RX/TX ring descriptors */ +#define RTE_TEST_RX_DESC_DEFAULT 1024 +#define RTE_TEST_TX_DESC_DEFAULT 512 + +/* Get first 4 bytes in mbuf headroom. */ +#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ + + sizeof(struct rte_mbuf))) + +#define INVALID_PORT_ID 0xFF + +/* Size of buffers used for snprintfs. */ +#define MAX_PRINT_BUFF 6072 + +/* Maximum character device basename size. */ +#define MAX_BASENAME_SZ 20 + +/* Maximum long option length for option parsing. */ +#define MAX_LONG_OPT_SZ 64 + +/* Used to compare MAC addresses. */ +#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL + +#define CMD_LINE_OPT_NB_DEVICES "nb-devices" +#define CMD_LINE_OPT_UDP_PORT "udp-port" +#define CMD_LINE_OPT_TX_CHECKSUM "tx-checksum" +#define CMD_LINE_OPT_TSO_SEGSZ "tso-segsz" +#define CMD_LINE_OPT_FILTER_TYPE "filter-type" +#define CMD_LINE_OPT_ENCAP "encap" +#define CMD_LINE_OPT_DECAP "decap" +#define CMD_LINE_OPT_RX_RETRY "rx-retry" +#define CMD_LINE_OPT_RX_RETRY_DELAY "rx-retry-delay" +#define CMD_LINE_OPT_RX_RETRY_NUM "rx-retry-num" +#define CMD_LINE_OPT_STATS "stats" +#define CMD_LINE_OPT_DEV_BASENAME "dev-basename" + +/* mask of enabled ports */ +static uint32_t enabled_port_mask; + +/*Number of switching cores enabled*/ +static uint32_t nb_switching_cores; + +/* number of devices/queues to support*/ +uint16_t nb_devices = 2; + +/* max ring descriptor, ixgbe, i40e, e1000 all are 4096. 
*/ +#define MAX_RING_DESC 4096 + +struct vpool { + struct rte_mempool *pool; + struct rte_ring *ring; + uint32_t buf_size; +} vpool_array[MAX_QUEUES+MAX_QUEUES]; + +/* UDP tunneling port */ +uint16_t udp_port = 4789; + +/* enable/disable inner TX checksum */ +uint8_t tx_checksum = 0; + +/* TCP segment size */ +uint16_t tso_segsz = 0; + +/* enable/disable decapsulation */ +uint8_t rx_decap = 1; + +/* enable/disable encapsulation */ +uint8_t tx_encap = 1; + +/* RX filter type for tunneling packet */ +uint8_t filter_idx = 1; + +/* overlay packet operation */ +struct ol_switch_ops overlay_options = { + .port_configure = vxlan_port_init, + .tunnel_setup = vxlan_link, + .tunnel_destroy = vxlan_unlink, + .tx_handle = vxlan_tx_pkts, + .rx_handle = vxlan_rx_pkts, + .param_handle = NULL, +}; + +/* Enable stats. */ +uint32_t enable_stats = 0; +/* Enable retries on RX. */ +static uint32_t enable_retry = 1; +/* Specify timeout (in useconds) between retries on RX. */ +static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; +/* Specify the number of retries on RX. */ +static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; + +/* Character device basename. Can be set by user. */ +static char dev_basename[MAX_BASENAME_SZ] = "vhost-net"; + +static unsigned lcore_ids[RTE_MAX_LCORE]; +uint8_t ports[RTE_MAX_ETHPORTS]; + +static unsigned nb_ports; /**< The number of ports specified in command line */ + +/* ethernet addresses of ports */ +struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS]; + +/* heads for the main used and free linked lists for the data path. */ +static struct virtio_net_data_ll *ll_root_used; +static struct virtio_net_data_ll *ll_root_free; + +/** + * Array of data core structures containing information on + * individual core linked lists. + */ +static struct lcore_info lcore_info[RTE_MAX_LCORE]; + +/* Used for queueing bursts of TX packets. 
*/ +struct mbuf_table { + unsigned len; + unsigned txq_id; + struct rte_mbuf *m_table[MAX_PKT_BURST]; +}; + +/* TX queue for each data core. */ +struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE]; + +struct device_statistics dev_statistics[MAX_DEVICES]; + +/** + * Set character device basename. + */ +static int +us_vhost_parse_basename(const char *q_arg) +{ + /* parse number string */ + if (strlen(q_arg) >= MAX_BASENAME_SZ) + return -1; + else + snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg); + + return 0; +} + +/** + * Parse the portmask provided at run time. + */ +static int +parse_portmask(const char *portmask) +{ + char *end = NULL; + unsigned long pm; + + /* parse hexadecimal string */ + pm = strtoul(portmask, &end, 16); + if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0')) + return -1; + + if (pm == 0) + return -1; + + return pm; +} + +/** + * Parse num options at run time. + */ +static int +parse_num_opt(const char *q_arg, uint32_t max_valid_value) +{ + char *end = NULL; + unsigned long num; + + /* parse unsigned int string */ + num = strtoul(q_arg, &end, 10); + if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0')) + return -1; + + if (num > max_valid_value) + return -1; + + return num; +} + +/** + * Display usage + */ +static void +tep_termination_usage(const char *prgname) +{ + RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n" + " --udp-port: UDP destination port for VXLAN packet\n" + " --nb-devices[1-64]: The number of virtIO device\n" + " --tx-checksum [0|1]: inner Tx checksum offload\n" + " --tso-segsz [0-N]: TCP segment size\n" + " --decap [0|1]: tunneling packet decapsulation\n" + " --encap [0|1]: tunneling packet encapsulation\n" + " --filter-type[1-3]: filter type for tunneling packet\n" + " 1: Inner MAC and tenent ID\n" + " 2: Inner MAC and VLAN, and tenent ID\n" + " 3: Outer MAC, Inner MAC and tenent ID\n" + " -p PORTMASK: Set mask for ports to be used by application\n" + " --rx-retry [0|1]: 
disable/enable(default) retries on rx." + " Enable retry if destintation queue is full\n" + " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX." + " This makes effect only if retries on rx enabled\n" + " --rx-retry-num [0-N]: the number of retries on rx." + " This makes effect only if retries on rx enabled\n" + " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n" + " --dev-basename: The basename to be used for the character device.\n", + prgname); +} + +/** + * Parse the arguments given in the command line of the application. + */ +static int +tep_termination_parse_args(int argc, char **argv) +{ + int opt, ret; + int option_index; + unsigned i; + const char *prgname = argv[0]; + static struct option long_option[] = { + {CMD_LINE_OPT_NB_DEVICES, required_argument, NULL, 0}, + {CMD_LINE_OPT_UDP_PORT, required_argument, NULL, 0}, + {CMD_LINE_OPT_TX_CHECKSUM, required_argument, NULL, 0}, + {CMD_LINE_OPT_TSO_SEGSZ, required_argument, NULL, 0}, + {CMD_LINE_OPT_DECAP, required_argument, NULL, 0}, + {CMD_LINE_OPT_ENCAP, required_argument, NULL, 0}, + {CMD_LINE_OPT_FILTER_TYPE, required_argument, NULL, 0}, + {CMD_LINE_OPT_RX_RETRY, required_argument, NULL, 0}, + {CMD_LINE_OPT_RX_RETRY_DELAY, required_argument, NULL, 0}, + {CMD_LINE_OPT_RX_RETRY_NUM, required_argument, NULL, 0}, + {CMD_LINE_OPT_STATS, required_argument, NULL, 0}, + {CMD_LINE_OPT_DEV_BASENAME, required_argument, NULL, 0}, + {NULL, 0, 0, 0}, + }; + + /* Parse command line */ + while ((opt = getopt_long(argc, argv, "p:", + long_option, &option_index)) != EOF) { + switch (opt) { + /* Portmask */ + case 'p': + enabled_port_mask = parse_portmask(optarg); + if (enabled_port_mask == 0) { + RTE_LOG(INFO, VHOST_CONFIG, + "Invalid portmask\n"); + tep_termination_usage(prgname); + return -1; + } + break; + case 0: + if (!strncmp(long_option[option_index].name, + CMD_LINE_OPT_NB_DEVICES, + sizeof(CMD_LINE_OPT_NB_DEVICES))) { + ret = parse_num_opt(optarg, MAX_DEVICES); + if (ret == -1) 
{ + RTE_LOG(INFO, VHOST_CONFIG, + "Invalid argument for nb-devices [0-%d]\n", + MAX_DEVICES); + tep_termination_usage(prgname); + return -1; + } else + nb_devices = ret; + } + + /* Enable/disable retries on RX. */ + if (!strncmp(long_option[option_index].name, + CMD_LINE_OPT_RX_RETRY, + sizeof(CMD_LINE_OPT_RX_RETRY))) { + ret = parse_num_opt(optarg, 1); + if (ret == -1) { + RTE_LOG(INFO, VHOST_CONFIG, + "Invalid argument for rx-retry [0|1]\n"); + tep_termination_usage(prgname); + return -1; + } else + enable_retry = ret; + } + + if (!strncmp(long_option[option_index].name, + CMD_LINE_OPT_TSO_SEGSZ, + sizeof(CMD_LINE_OPT_TSO_SEGSZ))) { + ret = parse_num_opt(optarg, INT16_MAX); + if (ret == -1) { + RTE_LOG(INFO, VHOST_CONFIG, + "Invalid argument for TCP segment size [0-N]\n"); + tep_termination_usage(prgname); + return -1; + } else + tso_segsz = ret; + } + + if (!strncmp(long_option[option_index].name, + CMD_LINE_OPT_UDP_PORT, + sizeof(CMD_LINE_OPT_UDP_PORT))) { + ret = parse_num_opt(optarg, INT16_MAX); + if (ret == -1) { + RTE_LOG(INFO, VHOST_CONFIG, + "Invalid argument for UDP port [0-N]\n"); + tep_termination_usage(prgname); + return -1; + } else + udp_port = ret; + } + + /* Specify the retries delay time (in useconds) on RX.*/ + if (!strncmp(long_option[option_index].name, + CMD_LINE_OPT_RX_RETRY_DELAY, + sizeof(CMD_LINE_OPT_RX_RETRY_DELAY))) { + ret = parse_num_opt(optarg, INT32_MAX); + if (ret == -1) { + RTE_LOG(INFO, VHOST_CONFIG, + "Invalid argument for rx-retry-delay [0-N]\n"); + tep_termination_usage(prgname); + return -1; + } else + burst_rx_delay_time = ret; + } + + /* Specify the retries number on RX. 
*/ + if (!strncmp(long_option[option_index].name, + CMD_LINE_OPT_RX_RETRY_NUM, + sizeof(CMD_LINE_OPT_RX_RETRY_NUM))) { + ret = parse_num_opt(optarg, INT32_MAX); + if (ret == -1) { + RTE_LOG(INFO, VHOST_CONFIG, + "Invalid argument for rx-retry-num [0-N]\n"); + tep_termination_usage(prgname); + return -1; + } else + burst_rx_retry_num = ret; + } + + if (!strncmp(long_option[option_index].name, + CMD_LINE_OPT_TX_CHECKSUM, + sizeof(CMD_LINE_OPT_TX_CHECKSUM))) { + ret = parse_num_opt(optarg, 1); + if (ret == -1) { + RTE_LOG(INFO, VHOST_CONFIG, + "Invalid argument for tx-checksum [0|1]\n"); + tep_termination_usage(prgname); + return -1; + } else + tx_checksum = ret; + } + + if (!strncmp(long_option[option_index].name, + CMD_LINE_OPT_FILTER_TYPE, + sizeof(CMD_LINE_OPT_FILTER_TYPE))) { + ret = parse_num_opt(optarg, 3); + if ((ret == -1) || (ret == 0)) { + RTE_LOG(INFO, VHOST_CONFIG, + "Invalid argument for filter type [1-3]\n"); + tep_termination_usage(prgname); + return -1; + } else + filter_idx = ret - 1; + } + + /* Enable/disable encapsulation on RX. */ + if (!strncmp(long_option[option_index].name, + CMD_LINE_OPT_DECAP, + sizeof(CMD_LINE_OPT_DECAP))) { + ret = parse_num_opt(optarg, 1); + if (ret == -1) { + RTE_LOG(INFO, VHOST_CONFIG, + "Invalid argument for decap [0|1]\n"); + tep_termination_usage(prgname); + return -1; + } else + rx_decap = ret; + } + + /* Enable/disable encapsulation on TX. */ + if (!strncmp(long_option[option_index].name, + CMD_LINE_OPT_ENCAP, + sizeof(CMD_LINE_OPT_ENCAP))) { + ret = parse_num_opt(optarg, 1); + if (ret == -1) { + RTE_LOG(INFO, VHOST_CONFIG, + "Invalid argument for encap [0|1]\n"); + tep_termination_usage(prgname); + return -1; + } else + tx_encap = ret; + } + + /* Enable/disable stats. 
*/ + if (!strncmp(long_option[option_index].name, + CMD_LINE_OPT_STATS, + sizeof(CMD_LINE_OPT_STATS))) { + ret = parse_num_opt(optarg, INT32_MAX); + if (ret == -1) { + RTE_LOG(INFO, VHOST_CONFIG, + "Invalid argument for stats [0..N]\n"); + tep_termination_usage(prgname); + return -1; + } else + enable_stats = ret; + } + + /* Set character device basename. */ + if (!strncmp(long_option[option_index].name, + CMD_LINE_OPT_DEV_BASENAME, + sizeof(CMD_LINE_OPT_DEV_BASENAME))) { + if (us_vhost_parse_basename(optarg) == -1) { + RTE_LOG(INFO, VHOST_CONFIG, + "Invalid argument for character " + "device basename (Max %d characters)\n", + MAX_BASENAME_SZ); + tep_termination_usage(prgname); + return -1; + } + } + + break; + + /* Invalid option - print options. */ + default: + tep_termination_usage(prgname); + return -1; + } + } + + for (i = 0; i < RTE_MAX_ETHPORTS; i++) { + if (enabled_port_mask & (1 << i)) + ports[nb_ports++] = (uint8_t)i; + } + + if ((nb_ports == 0) || (nb_ports > MAX_SUP_PORTS)) { + RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," + "but only %u port can be enabled\n", nb_ports, + MAX_SUP_PORTS); + return -1; + } + + return 0; +} + +/** + * Update the global var NB_PORTS and array PORTS + * according to system ports number and return valid ports number + */ +static unsigned +check_ports_num(unsigned max_nb_ports) +{ + unsigned valid_nb_ports = nb_ports; + unsigned portid; + + if (nb_ports > max_nb_ports) { + RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) " + " exceeds total system port number(%u)\n", + nb_ports, max_nb_ports); + nb_ports = max_nb_ports; + } + + for (portid = 0; portid < nb_ports; portid++) { + if (ports[portid] >= max_nb_ports) { + RTE_LOG(INFO, VHOST_PORT, + "\nSpecified port ID(%u) exceeds max " + " system port ID(%u)\n", + ports[portid], (max_nb_ports - 1)); + ports[portid] = INVALID_PORT_ID; + valid_nb_ports--; + } + } + return valid_nb_ports; +} + +/** + * This function routes the TX packet to the correct 
interface. This may be a local device + * or the physical port. + */ +static inline void __attribute__((always_inline)) +virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m) +{ + struct mbuf_table *tx_q; + struct rte_mbuf **m_table; + unsigned len, ret = 0; + const uint16_t lcore_id = rte_lcore_id(); + struct virtio_net *dev = vdev->dev; + + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", + dev->device_fh); + + /* Add packet to the port tx queue */ + tx_q = &lcore_tx_queue[lcore_id]; + len = tx_q->len; + + tx_q->m_table[len] = m; + len++; + if (enable_stats) { + dev_statistics[dev->device_fh].tx_total++; + dev_statistics[dev->device_fh].tx++; + } + + if (unlikely(len == MAX_PKT_BURST)) { + m_table = (struct rte_mbuf **)tx_q->m_table; + ret = overlay_options.tx_handle(ports[0], + (uint16_t)tx_q->txq_id, m_table, + (uint16_t)tx_q->len); + + /* Free any buffers not handled by TX and update + * the port stats. + */ + if (unlikely(ret < len)) { + do { + rte_pktmbuf_free(m_table[ret]); + } while (++ret < len); + } + + len = 0; + } + + tx_q->len = len; + return; +} + +/** + * This function is called by each data core. It handles all + * RX/TX registered with the core. For TX the specific lcore + * linked list is used. For RX, MAC addresses are compared + * with all devices in the main linked list. 
+ */ +static int +switch_worker(__rte_unused void *arg) +{ + struct rte_mempool *mbuf_pool = arg; + struct virtio_net *dev = NULL; + struct vhost_dev *vdev = NULL; + struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; + struct virtio_net_data_ll *dev_ll; + struct mbuf_table *tx_q; + volatile struct lcore_ll_info *lcore_ll; + const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) + / US_PER_S * BURST_TX_DRAIN_US; + uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; + unsigned i, ret = 0; + const uint16_t lcore_id = rte_lcore_id(); + const uint16_t num_cores = (uint16_t)rte_lcore_count(); + uint16_t rx_count = 0; + uint16_t tx_count; + uint32_t retry = 0; + + RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); + lcore_ll = lcore_info[lcore_id].lcore_ll; + prev_tsc = 0; + + tx_q = &lcore_tx_queue[lcore_id]; + for (i = 0; i < num_cores; i++) { + if (lcore_ids[i] == lcore_id) { + tx_q->txq_id = i; + break; + } + } + + while (1) { + cur_tsc = rte_rdtsc(); + /* + * TX burst queue drain + */ + diff_tsc = cur_tsc - prev_tsc; + if (unlikely(diff_tsc > drain_tsc)) { + + if (tx_q->len) { + LOG_DEBUG(VHOST_DATA, "TX queue drained after " + "timeout with burst size %u\n", + tx_q->len); + ret = overlay_options.tx_handle(ports[0], + (uint16_t)tx_q->txq_id, + (struct rte_mbuf **)tx_q->m_table, + (uint16_t)tx_q->len); + if (unlikely(ret < tx_q->len)) { + do { + rte_pktmbuf_free(tx_q->m_table[ret]); + } while (++ret < tx_q->len); + } + + tx_q->len = 0; + } + + prev_tsc = cur_tsc; + + } + + rte_prefetch0(lcore_ll->ll_root_used); + + /** + * Inform the configuration core that we have exited + * the linked list and that no devices are + * in use if requested. 
+ */ + if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) + lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; + + /* + * Process devices + */ + dev_ll = lcore_ll->ll_root_used; + + while (dev_ll != NULL) { + vdev = dev_ll->vdev; + dev = vdev->dev; + + if (unlikely(vdev->remove)) { + dev_ll = dev_ll->next; + overlay_options.tunnel_destroy(vdev); + vdev->ready = DEVICE_SAFE_REMOVE; + continue; + } + if (likely(vdev->ready == DEVICE_RX)) { + /* Handle guest RX */ + rx_count = rte_eth_rx_burst(ports[0], + vdev->rx_q, pkts_burst, MAX_PKT_BURST); + + if (rx_count) { + /* + * Retry is enabled and the queue is + * full then we wait and retry to + * avoid packet loss. Here MAX_PKT_BURST + * must be less than virtio queue size + */ + if (enable_retry && unlikely(rx_count > + rte_vring_available_entries(dev, VIRTIO_RXQ))) { + for (retry = 0; retry < burst_rx_retry_num; + retry++) { + rte_delay_us(burst_rx_delay_time); + if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ)) + break; + } + } + + ret_count = overlay_options.rx_handle(dev, pkts_burst, rx_count); + if (enable_stats) { + rte_atomic64_add( + &dev_statistics[dev->device_fh].rx_total_atomic, + rx_count); + rte_atomic64_add( + &dev_statistics[dev->device_fh].rx_atomic, ret_count); + } + while (likely(rx_count)) { + rx_count--; + rte_pktmbuf_free(pkts_burst[rx_count]); + } + + } + } + + if (likely(!vdev->remove)) { + /* Handle guest TX*/ + tx_count = rte_vhost_dequeue_burst(dev, + VIRTIO_TXQ, mbuf_pool, + pkts_burst, MAX_PKT_BURST); + /* If this is the first received packet we need to learn the MAC */ + if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) { + if (vdev->remove || + (overlay_options.tunnel_setup(vdev, pkts_burst[0]) == -1)) { + while (tx_count) + rte_pktmbuf_free(pkts_burst[--tx_count]); + } + } + while (tx_count) + virtio_tx_route(vdev, pkts_burst[--tx_count]); + } + + /* move to the next device in the list */ + dev_ll = dev_ll->next; + } + } + + return 0; +} + +/** + * Add an entry to a 
used linked list. A free entry must first be found + * in the free linked list using get_data_ll_free_entry(); + */ +static void +add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, + struct virtio_net_data_ll *ll_dev) +{ + struct virtio_net_data_ll *ll = *ll_root_addr; + + /* Set next as NULL and use a compiler barrier to avoid reordering. */ + ll_dev->next = NULL; + rte_compiler_barrier(); + + /* If ll == NULL then this is the first device. */ + if (ll) { + /* Increment to the tail of the linked list. */ + while (ll->next != NULL) + ll = ll->next; + + ll->next = ll_dev; + } else { + *ll_root_addr = ll_dev; + } +} + +/** + * Remove an entry from a used linked list. The entry must then be added to + * the free linked list using put_data_ll_free_entry(). + */ +static void +rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, + struct virtio_net_data_ll *ll_dev, + struct virtio_net_data_ll *ll_dev_last) +{ + struct virtio_net_data_ll *ll = *ll_root_addr; + + if (unlikely((ll == NULL) || (ll_dev == NULL))) + return; + + if (ll_dev == ll) + *ll_root_addr = ll_dev->next; + else + if (likely(ll_dev_last != NULL)) + ll_dev_last->next = ll_dev->next; + else + RTE_LOG(ERR, VHOST_CONFIG, + "Remove entry form ll failed.\n"); +} + +/** + * Find and return an entry from the free linked list. + */ +static struct virtio_net_data_ll * +get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) +{ + struct virtio_net_data_ll *ll_free = *ll_root_addr; + struct virtio_net_data_ll *ll_dev; + + if (ll_free == NULL) + return NULL; + + ll_dev = ll_free; + *ll_root_addr = ll_free->next; + + return ll_dev; +} + +/** + * Place an entry back on to the free linked list. 
+ */ +static void +put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, + struct virtio_net_data_ll *ll_dev) +{ + struct virtio_net_data_ll *ll_free = *ll_root_addr; + + if (ll_dev == NULL) + return; + + ll_dev->next = ll_free; + *ll_root_addr = ll_dev; +} + +/** + * Creates a linked list of a given size. + */ +static struct virtio_net_data_ll * +alloc_data_ll(uint32_t size) +{ + struct virtio_net_data_ll *ll_new; + uint32_t i; + + /* Malloc and then chain the linked list. */ + ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); + if (ll_new == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for ll_new.\n"); + return NULL; + } + + for (i = 0; i < size - 1; i++) { + ll_new[i].vdev = NULL; + ll_new[i].next = &ll_new[i+1]; + } + ll_new[i].next = NULL; + + return ll_new; +} + +/** + * Create the main linked list along with each individual cores + * linked list. A used and a free list are created to manage entries. + */ +static int +init_data_ll(void) +{ + int lcore; + + RTE_LCORE_FOREACH_SLAVE(lcore) { + lcore_info[lcore].lcore_ll = + malloc(sizeof(struct lcore_ll_info)); + if (lcore_info[lcore].lcore_ll == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for lcore_ll.\n"); + return -1; + } + + lcore_info[lcore].lcore_ll->device_num = 0; + lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; + lcore_info[lcore].lcore_ll->ll_root_used = NULL; + if (nb_devices % nb_switching_cores) + lcore_info[lcore].lcore_ll->ll_root_free = + alloc_data_ll((nb_devices / nb_switching_cores) + + 1); + else + lcore_info[lcore].lcore_ll->ll_root_free = + alloc_data_ll(nb_devices / nb_switching_cores); + } + + /* Allocate devices up to a maximum of MAX_DEVICES. */ + ll_root_free = alloc_data_ll(MIN((nb_devices), MAX_DEVICES)); + + return 0; +} + +/** + * Remove a device from the specific data core linked list and + * from the main linked list. Synchonization occurs through the use + * of the lcore dev_removal_flag. 
Device is made volatile here
 * to avoid re-ordering of dev->remove=1 which can cause an infinite
 * loop in the rte_pause loop.
 */
static void
destroy_device(volatile struct virtio_net *dev)
{
	struct virtio_net_data_ll *core_cur;
	struct virtio_net_data_ll *core_prev = NULL;
	struct virtio_net_data_ll *main_cur;
	struct virtio_net_data_ll *main_prev = NULL;
	struct vhost_dev *vdev;
	int lcore;

	dev->flags &= ~VIRTIO_DEV_RUNNING;

	vdev = (struct vhost_dev *)dev->priv;

	/* Ask the data core to let go of the device and wait until it
	 * reports DEVICE_SAFE_REMOVE.
	 */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE)
		rte_pause();

	/* Locate the device (and its predecessor) in the lcore list. */
	for (core_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
			(core_cur != NULL) && (core_cur->vdev != vdev);
			core_cur = core_cur->next)
		core_prev = core_cur;

	if (core_cur == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%"PRIu64") Failed to find the dev to be destroy.\n",
			dev->device_fh);
		return;
	}

	/* Locate the device (and its predecessor) in the main list. */
	for (main_cur = ll_root_used;
			(main_cur != NULL) && (main_cur->vdev != vdev);
			main_cur = main_cur->next)
		main_prev = main_cur;

	/* Unlink from the lcore list first, then from the main list. */
	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used,
			core_cur, core_prev);
	rm_data_ll_entry(&ll_root_used, main_cur, main_prev);

	/* Raise the removal request on every worker lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll->dev_removal_flag =
			REQUEST_DEV_REMOVAL;
	}

	/*
	 * Wait for each worker to answer with ACK_DEV_REMOVAL. After
	 * that no worker can still be walking over the entries that
	 * were unlinked above, so they may be recycled.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].lcore_ll->dev_removal_flag
			!= ACK_DEV_REMOVAL)
			rte_pause();
	}

	/* Recycle both list entries. */
	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free,
				core_cur);
	put_data_ll_free_entry(&ll_root_free, main_cur);

	/* One device less on this lcore. */
	lcore_info[vdev->coreid].lcore_ll->device_num--;

	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed "
		"from data core\n", dev->device_fh);

	rte_free(vdev);

}

/**
 * A new device is added to a data core. First the device is added
 * to the main linked list and the allocated to a specific data core.
+ */ +static int +new_device(struct virtio_net *dev) +{ + struct virtio_net_data_ll *ll_dev; + int lcore, core_add = 0; + uint32_t device_num_min = nb_devices; + struct vhost_dev *vdev; + + vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); + if (vdev == NULL) { + RTE_LOG(INFO, VHOST_DATA, + "(%"PRIu64") Couldn't allocate memory for vhost dev\n", + dev->device_fh); + return -1; + } + vdev->dev = dev; + dev->priv = vdev; + /* Add device to main ll */ + ll_dev = get_data_ll_free_entry(&ll_root_free); + if (ll_dev == NULL) { + RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in" + " linked list Device limit of %d devices per core" + " has been reached\n", dev->device_fh, nb_devices); + if (vdev->regions_hpa) + rte_free(vdev->regions_hpa); + rte_free(vdev); + return -1; + } + ll_dev->vdev = vdev; + add_data_ll_entry(&ll_root_used, ll_dev); + vdev->rx_q = dev->device_fh; + + /* reset ready flag */ + vdev->ready = DEVICE_MAC_LEARNING; + vdev->remove = 0; + + /* Find a suitable lcore to add the device. */ + RTE_LCORE_FOREACH_SLAVE(lcore) { + if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { + device_num_min = lcore_info[lcore].lcore_ll->device_num; + core_add = lcore; + } + } + /* Add device to lcore ll */ + ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); + if (ll_dev == NULL) { + RTE_LOG(INFO, VHOST_DATA, + "(%"PRIu64") Failed to add device to data core\n", + dev->device_fh); + vdev->ready = DEVICE_SAFE_REMOVE; + destroy_device(dev); + rte_free(vdev->regions_hpa); + rte_free(vdev); + return -1; + } + ll_dev->vdev = vdev; + vdev->coreid = core_add; + + add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, + ll_dev); + + /* Initialize device stats */ + memset(&dev_statistics[dev->device_fh], 0, + sizeof(struct device_statistics)); + + /* Disable notifications. 
*/ + rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); + rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); + lcore_info[vdev->coreid].lcore_ll->device_num++; + dev->flags |= VIRTIO_DEV_RUNNING; + + RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", + dev->device_fh, vdev->coreid); + + return 0; +} + +/** + * These callback allow devices to be added to the data core when configuration + * has been fully complete. + */ +static const struct virtio_net_device_ops virtio_net_device_ops = { + .new_device = new_device, + .destroy_device = destroy_device, +}; + +/** + * This is a thread will wake up after a period to print stats if the user has + * enabled them. + */ +static void +print_stats(void) +{ + struct virtio_net_data_ll *dev_ll; + uint64_t tx_dropped, rx_dropped; + uint64_t tx, tx_total, rx, rx_total, rx_ip_csum, rx_l4_csum; + uint32_t device_fh; + const char clr[] = { 27, '[', '2', 'J', '\0' }; + const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' }; + + while (1) { + sleep(enable_stats); + + /* Clear screen and move to top left */ + printf("%s%s", clr, top_left); + + printf("\nDevice statistics ================================"); + + dev_ll = ll_root_used; + while (dev_ll != NULL) { + device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; + tx_total = dev_statistics[device_fh].tx_total; + tx = dev_statistics[device_fh].tx; + tx_dropped = tx_total - tx; + + rx_total = rte_atomic64_read( + &dev_statistics[device_fh].rx_total_atomic); + rx = rte_atomic64_read( + &dev_statistics[device_fh].rx_atomic); + rx_dropped = rx_total - rx; + rx_ip_csum = rte_atomic64_read( + &dev_statistics[device_fh].rx_bad_ip_csum); + rx_l4_csum = rte_atomic64_read( + &dev_statistics[device_fh].rx_bad_l4_csum); + + printf("\nStatistics for device %"PRIu32" ----------" + "\nTX total: %"PRIu64"" + "\nTX dropped: %"PRIu64"" + "\nTX successful: %"PRIu64"" + "\nRX total: %"PRIu64"" + "\nRX bad IP csum: %"PRIu64"" + "\nRX bad L4 csum: %"PRIu64"" + 
"\nRX dropped: %"PRIu64"" + "\nRX successful: %"PRIu64"", + device_fh, + tx_total, + tx_dropped, + tx, + rx_total, + rx_ip_csum, + rx_l4_csum, + rx_dropped, + rx); + + dev_ll = dev_ll->next; + } + printf("\n================================================\n"); + } +} + +/** + * Main function, does initialisation and calls the per-lcore functions. The CUSE + * device is also registered here to handle the IOCTLs. + */ +int +main(int argc, char *argv[]) +{ + struct rte_mempool *mbuf_pool = NULL; + unsigned lcore_id, core_id = 0; + unsigned nb_ports, valid_nb_ports; + int ret; + uint8_t portid; + uint16_t queue_id; + static pthread_t tid; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + /* init EAL */ + ret = rte_eal_init(argc, argv); + if (ret < 0) + rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); + argc -= ret; + argv += ret; + + /* parse app arguments */ + ret = tep_termination_parse_args(argc, argv); + if (ret < 0) + rte_exit(EXIT_FAILURE, "Invalid argument\n"); + + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) + if (rte_lcore_is_enabled(lcore_id)) + lcore_ids[core_id++] = lcore_id; + + /* set the number of swithcing cores available */ + nb_switching_cores = rte_lcore_count()-1; + + /* Get the number of physical ports. */ + nb_ports = rte_eth_dev_count(); + if (nb_ports > RTE_MAX_ETHPORTS) + nb_ports = RTE_MAX_ETHPORTS; + + /* + * Update the global var NB_PORTS and global array PORTS + * and get value of var VALID_NB_PORTS according to system ports number + */ + valid_nb_ports = check_ports_num(nb_ports); + + if ((valid_nb_ports == 0) || (valid_nb_ports > MAX_SUP_PORTS)) { + rte_exit(EXIT_FAILURE, "Current enabled port number is %u," + "but only %u port can be enabled\n", nb_ports, + MAX_SUP_PORTS); + } + /* Create the mbuf pool. 
*/ + mbuf_pool = rte_mempool_create( + "MBUF_POOL", + NUM_MBUFS_PER_PORT + * valid_nb_ports, + MBUF_SIZE, MBUF_CACHE_SIZE, + sizeof(struct rte_pktmbuf_pool_private), + rte_pktmbuf_pool_init, NULL, + rte_pktmbuf_init, NULL, + rte_socket_id(), 0); + if (mbuf_pool == NULL) + rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); + + for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) + vpool_array[queue_id].pool = mbuf_pool; + + /* Set log level. */ + rte_set_log_level(LOG_LEVEL); + + /* initialize all ports */ + for (portid = 0; portid < nb_ports; portid++) { + /* skip ports that are not enabled */ + if ((enabled_port_mask & (1 << portid)) == 0) { + RTE_LOG(INFO, VHOST_PORT, + "Skipping disabled port %d\n", portid); + continue; + } + if (overlay_options.port_configure(portid, mbuf_pool) != 0) + rte_exit(EXIT_FAILURE, + "Cannot initialize network ports\n"); + } + + /* Initialise all linked lists. */ + if (init_data_ll() == -1) + rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); + + /* Initialize device stats */ + memset(&dev_statistics, 0, sizeof(dev_statistics)); + + /* Enable stats if the user option is set. */ + if (enable_stats) { + ret = pthread_create(&tid, NULL, (void *)print_stats, NULL); + if (ret != 0) + rte_exit(EXIT_FAILURE, "Cannot create print-stats thread\n"); + snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats"); + ret = rte_thread_setname(tid, thread_name); + if (ret != 0) + RTE_LOG(ERR, VHOST_CONFIG, "Cannot set print-stats name\n"); + } + + /* Launch all data cores. */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + rte_eal_remote_launch(switch_worker, + mbuf_pool, lcore_id); + } + rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); + + /* Register CUSE device to handle IOCTLs. */ + ret = rte_vhost_driver_register((char *)&dev_basename); + if (ret != 0) + rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n"); + + rte_vhost_driver_callback_register(&virtio_net_device_ops); + + /* Start CUSE session. 
*/ + rte_vhost_driver_session_start(); + + return 0; +} diff --git a/examples/tep_termination/main.h b/examples/tep_termination/main.h new file mode 100644 index 00000000..a34301ad --- /dev/null +++ b/examples/tep_termination/main.h @@ -0,0 +1,129 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _MAIN_H_ +#define _MAIN_H_ + +#include <rte_ether.h> + +#ifdef DEBUG +#define LOG_LEVEL RTE_LOG_DEBUG +#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) +#else +#define LOG_LEVEL RTE_LOG_INFO +#define LOG_DEBUG(log_type, fmt, args...) do {} while (0) +#endif + +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 +#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER2 +#define RTE_LOGTYPE_VHOST_PORT RTE_LOGTYPE_USER3 + +/* State of virtio device. */ +#define DEVICE_MAC_LEARNING 0 +#define DEVICE_RX 1 +#define DEVICE_SAFE_REMOVE 2 + +#define MAX_QUEUES 512 + +/* Max burst size for RX/TX */ +#define MAX_PKT_BURST 32 + +/* Max number of devices. Limited by the application. */ +#define MAX_DEVICES 64 + +/* Per-device statistics struct */ +struct device_statistics { + uint64_t tx_total; + rte_atomic64_t rx_total_atomic; + uint64_t rx_total; + uint64_t tx; + rte_atomic64_t rx_atomic; + /**< Bad inner IP csum for tunneling pkt */ + rte_atomic64_t rx_bad_ip_csum; + /**< Bad inner L4 csum for tunneling pkt */ + rte_atomic64_t rx_bad_l4_csum; +} __rte_cache_aligned; + +/** + * Device linked list structure for data path. + */ +struct vhost_dev { + /**< Pointer to device created by vhost lib. */ + struct virtio_net *dev; + /**< Number of memory regions for gpa to hpa translation. */ + uint32_t nregions_hpa; + /**< Memory region information for gpa to hpa translation. */ + struct virtio_memory_regions_hpa *regions_hpa; + /**< Device MAC address (Obtained on first TX packet). */ + struct ether_addr mac_address; + /**< RX queue number. */ + uint16_t rx_q; + /**< Data core that the device is added to. */ + uint16_t coreid; + /**< A device is set as ready if the MAC address has been set. */ + volatile uint8_t ready; + /**< Device is marked for removal from the data core. */ + volatile uint8_t remove; +} __rte_cache_aligned; + +/** + * Structure containing data core specific information. 
+ */ +struct lcore_ll_info { + /**< Pointer to head in free linked list. */ + struct virtio_net_data_ll *ll_root_free; + /**< Pointer to head of used linked list. */ + struct virtio_net_data_ll *ll_root_used; + /**< Number of devices on lcore. */ + uint32_t device_num; + /**< Flag to synchronize device removal. */ + volatile uint8_t dev_removal_flag; +}; + +struct lcore_info { + /**< Pointer to data core specific lcore_ll_info struct */ + struct lcore_ll_info *lcore_ll; +}; + +struct virtio_net_data_ll { + /**< Pointer to device created by configuration core. */ + struct vhost_dev *vdev; + /**< Pointer to next device in linked list. */ + struct virtio_net_data_ll *next; +}; + +uint32_t +virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count); + +#endif /* _MAIN_H_ */ diff --git a/examples/tep_termination/vxlan.c b/examples/tep_termination/vxlan.c new file mode 100644 index 00000000..5ee1f956 --- /dev/null +++ b/examples/tep_termination/vxlan.c @@ -0,0 +1,259 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <stdint.h> +#include <rte_mbuf.h> +#include <rte_hash_crc.h> +#include <rte_byteorder.h> +#include <rte_udp.h> +#include <rte_tcp.h> +#include <rte_sctp.h> + +#include "main.h" +#include "vxlan.h" + +static uint16_t +get_psd_sum(void *l3_hdr, uint16_t ethertype, uint64_t ol_flags) +{ + if (ethertype == ETHER_TYPE_IPv4) + return rte_ipv4_phdr_cksum(l3_hdr, ol_flags); + else /* assume ethertype == ETHER_TYPE_IPv6 */ + return rte_ipv6_phdr_cksum(l3_hdr, ol_flags); +} + +/** + * Parse an ethernet header to fill the ethertype, outer_l2_len, outer_l3_len and + * ipproto. This function is able to recognize IPv4/IPv6 with one optional vlan + * header. 
+ */ +static void +parse_ethernet(struct ether_hdr *eth_hdr, union tunnel_offload_info *info, + uint8_t *l4_proto) +{ + struct ipv4_hdr *ipv4_hdr; + struct ipv6_hdr *ipv6_hdr; + uint16_t ethertype; + + info->outer_l2_len = sizeof(struct ether_hdr); + ethertype = rte_be_to_cpu_16(eth_hdr->ether_type); + + if (ethertype == ETHER_TYPE_VLAN) { + struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1); + info->outer_l2_len += sizeof(struct vlan_hdr); + ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto); + } + + switch (ethertype) { + case ETHER_TYPE_IPv4: + ipv4_hdr = (struct ipv4_hdr *) + ((char *)eth_hdr + info->outer_l2_len); + info->outer_l3_len = sizeof(struct ipv4_hdr); + *l4_proto = ipv4_hdr->next_proto_id; + break; + case ETHER_TYPE_IPv6: + ipv6_hdr = (struct ipv6_hdr *) + ((char *)eth_hdr + info->outer_l2_len); + info->outer_l3_len = sizeof(struct ipv6_hdr); + *l4_proto = ipv6_hdr->proto; + break; + default: + info->outer_l3_len = 0; + *l4_proto = 0; + break; + } +} + +/** + * Calculate the checksum of a packet in hardware + */ +static uint64_t +process_inner_cksums(struct ether_hdr *eth_hdr, union tunnel_offload_info *info) +{ + void *l3_hdr = NULL; + uint8_t l4_proto; + uint16_t ethertype; + struct ipv4_hdr *ipv4_hdr; + struct ipv6_hdr *ipv6_hdr; + struct udp_hdr *udp_hdr; + struct tcp_hdr *tcp_hdr; + struct sctp_hdr *sctp_hdr; + uint64_t ol_flags = 0; + + info->l2_len = sizeof(struct ether_hdr); + ethertype = rte_be_to_cpu_16(eth_hdr->ether_type); + + if (ethertype == ETHER_TYPE_VLAN) { + struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1); + info->l2_len += sizeof(struct vlan_hdr); + ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto); + } + + l3_hdr = (char *)eth_hdr + info->l2_len; + + if (ethertype == ETHER_TYPE_IPv4) { + ipv4_hdr = (struct ipv4_hdr *)l3_hdr; + ipv4_hdr->hdr_checksum = 0; + ol_flags |= PKT_TX_IPV4; + ol_flags |= PKT_TX_IP_CKSUM; + info->l3_len = sizeof(struct ipv4_hdr); + l4_proto = ipv4_hdr->next_proto_id; + } else if 
(ethertype == ETHER_TYPE_IPv6) { + ipv6_hdr = (struct ipv6_hdr *)l3_hdr; + info->l3_len = sizeof(struct ipv6_hdr); + l4_proto = ipv6_hdr->proto; + ol_flags |= PKT_TX_IPV6; + } else + return 0; /* packet type not supported, nothing to do */ + + if (l4_proto == IPPROTO_UDP) { + udp_hdr = (struct udp_hdr *)((char *)l3_hdr + info->l3_len); + ol_flags |= PKT_TX_UDP_CKSUM; + udp_hdr->dgram_cksum = get_psd_sum(l3_hdr, + ethertype, ol_flags); + } else if (l4_proto == IPPROTO_TCP) { + tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + info->l3_len); + ol_flags |= PKT_TX_TCP_CKSUM; + tcp_hdr->cksum = get_psd_sum(l3_hdr, ethertype, + ol_flags); + if (tso_segsz != 0) { + ol_flags |= PKT_TX_TCP_SEG; + info->tso_segsz = tso_segsz; + info->l4_len = sizeof(struct tcp_hdr); + } + + } else if (l4_proto == IPPROTO_SCTP) { + sctp_hdr = (struct sctp_hdr *)((char *)l3_hdr + info->l3_len); + sctp_hdr->cksum = 0; + ol_flags |= PKT_TX_SCTP_CKSUM; + } + + return ol_flags; +} + +int +decapsulation(struct rte_mbuf *pkt) +{ + uint8_t l4_proto = 0; + uint16_t outer_header_len; + struct udp_hdr *udp_hdr; + union tunnel_offload_info info = { .data = 0 }; + struct ether_hdr *phdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *); + + parse_ethernet(phdr, &info, &l4_proto); + + if (l4_proto != IPPROTO_UDP) + return -1; + + udp_hdr = (struct udp_hdr *)((char *)phdr + + info.outer_l2_len + info.outer_l3_len); + + /** check udp destination port, 4789 is the default vxlan port + * (rfc7348) or that the rx offload flag is set (i40e only + * currently)*/ + if (udp_hdr->dst_port != rte_cpu_to_be_16(DEFAULT_VXLAN_PORT) && + (pkt->packet_type & RTE_PTYPE_TUNNEL_MASK) == 0) + return -1; + outer_header_len = info.outer_l2_len + info.outer_l3_len + + sizeof(struct udp_hdr) + sizeof(struct vxlan_hdr); + + rte_pktmbuf_adj(pkt, outer_header_len); + + return 0; +} + +void +encapsulation(struct rte_mbuf *m, uint8_t queue_id) +{ + uint vport_id; + uint64_t ol_flags = 0; + uint32_t old_len = m->pkt_len, hash; + union 
tunnel_offload_info tx_offload = { .data = 0 }; + struct ether_hdr *phdr = rte_pktmbuf_mtod(m, struct ether_hdr *); + + /*Allocate space for new ethernet, IPv4, UDP and VXLAN headers*/ + struct ether_hdr *pneth = (struct ether_hdr *) rte_pktmbuf_prepend(m, + sizeof(struct ether_hdr) + sizeof(struct ipv4_hdr) + + sizeof(struct udp_hdr) + sizeof(struct vxlan_hdr)); + + struct ipv4_hdr *ip = (struct ipv4_hdr *) &pneth[1]; + struct udp_hdr *udp = (struct udp_hdr *) &ip[1]; + struct vxlan_hdr *vxlan = (struct vxlan_hdr *) &udp[1]; + + /* convert TX queue ID to vport ID */ + vport_id = queue_id - 1; + + /* replace original Ethernet header with ours */ + pneth = rte_memcpy(pneth, &app_l2_hdr[vport_id], + sizeof(struct ether_hdr)); + + /* copy in IP header */ + ip = rte_memcpy(ip, &app_ip_hdr[vport_id], + sizeof(struct ipv4_hdr)); + ip->total_length = rte_cpu_to_be_16(m->data_len + - sizeof(struct ether_hdr)); + + /* outer IP checksum */ + ol_flags |= PKT_TX_OUTER_IP_CKSUM; + ip->hdr_checksum = 0; + + /* inner IP checksum offload */ + if (tx_checksum) { + ol_flags |= process_inner_cksums(phdr, &tx_offload); + m->l2_len = tx_offload.l2_len; + m->l3_len = tx_offload.l3_len; + m->l4_len = tx_offload.l4_len; + m->l2_len += ETHER_VXLAN_HLEN; + } + + m->outer_l2_len = sizeof(struct ether_hdr); + m->outer_l3_len = sizeof(struct ipv4_hdr); + + m->ol_flags |= ol_flags; + m->tso_segsz = tx_offload.tso_segsz; + + /*VXLAN HEADER*/ + vxlan->vx_flags = rte_cpu_to_be_32(VXLAN_HF_VNI); + vxlan->vx_vni = rte_cpu_to_be_32(vxdev.out_key << 8); + + /*UDP HEADER*/ + udp->dgram_cksum = 0; + udp->dgram_len = rte_cpu_to_be_16(old_len + + sizeof(struct udp_hdr) + + sizeof(struct vxlan_hdr)); + + udp->dst_port = rte_cpu_to_be_16(vxdev.dst_port); + hash = rte_hash_crc(phdr, 2 * ETHER_ADDR_LEN, phdr->ether_type); + udp->src_port = rte_cpu_to_be_16((((uint64_t) hash * PORT_RANGE) >> 32) + + PORT_MIN); + + return; +} diff --git a/examples/tep_termination/vxlan.h b/examples/tep_termination/vxlan.h new 
file mode 100644 index 00000000..4242e111 --- /dev/null +++ b/examples/tep_termination/vxlan.h @@ -0,0 +1,86 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _VXLAN_H_ +#define _VXLAN_H_ + +#include <rte_ether.h> +#include <rte_ip.h> + +#define PORT_MIN 49152 +#define PORT_MAX 65535 +#define PORT_RANGE ((PORT_MAX - PORT_MIN) + 1) + +#define VXLAN_N_PORTS 2 +#define VXLAN_HF_VNI 0x08000000 +#define DEFAULT_VXLAN_PORT 4789 + +extern struct ipv4_hdr app_ip_hdr[VXLAN_N_PORTS]; +extern struct ether_hdr app_l2_hdr[VXLAN_N_PORTS]; +extern uint8_t tx_checksum; +extern uint16_t tso_segsz; + +struct vxlan_port { + uint32_t vport_id; /**< VirtIO port id */ + uint32_t peer_ip; /**< remote VTEP IP address */ + struct ether_addr peer_mac; /**< remote VTEP MAC address */ + struct ether_addr vport_mac; /**< VirtIO port MAC address */ +} __rte_cache_aligned; + +struct vxlan_conf { + uint16_t dst_port; /**< VXLAN UDP destination port */ + uint32_t port_ip; /**< DPDK port IP address*/ + uint32_t in_key; /**< VLAN ID */ + uint32_t out_key; /**< VXLAN VNI */ + struct vxlan_port port[VXLAN_N_PORTS]; /**< VXLAN configuration */ +} __rte_cache_aligned; + +extern struct vxlan_conf vxdev; + +/* structure that caches offload info for the current packet */ +union tunnel_offload_info { + uint64_t data; + struct { + uint64_t l2_len:7; /**< L2 (MAC) Header Length. */ + uint64_t l3_len:9; /**< L3 (IP) Header Length. */ + uint64_t l4_len:8; /**< L4 Header Length. */ + uint64_t tso_segsz:16; /**< TCP TSO segment size */ + uint64_t outer_l2_len:7; /**< outer L2 Header Length */ + uint64_t outer_l3_len:16; /**< outer L3 Header Length */ + }; +} __rte_cache_aligned; + +int decapsulation(struct rte_mbuf *pkt); +void encapsulation(struct rte_mbuf *m, uint8_t queue_id); + +#endif /* _VXLAN_H_ */ diff --git a/examples/tep_termination/vxlan_setup.c b/examples/tep_termination/vxlan_setup.c new file mode 100644 index 00000000..2a48e142 --- /dev/null +++ b/examples/tep_termination/vxlan_setup.c @@ -0,0 +1,457 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <getopt.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <linux/virtio_net.h> +#include <linux/virtio_ring.h> +#include <sys/param.h> +#include <unistd.h> + +#include <rte_ethdev.h> +#include <rte_log.h> +#include <rte_string_fns.h> +#include <rte_mbuf.h> +#include <rte_malloc.h> +#include <rte_ip.h> +#include <rte_udp.h> +#include <rte_tcp.h> + +#include "main.h" +#include "rte_virtio_net.h" +#include "vxlan.h" +#include "vxlan_setup.h" + +#define IPV4_HEADER_LEN 20 +#define UDP_HEADER_LEN 8 +#define VXLAN_HEADER_LEN 8 + +#define IP_VERSION 0x40 +#define IP_HDRLEN 0x05 /* default IP header length == five 32-bits words. */ +#define IP_DEFTTL 64 /* from RFC 1340. */ +#define IP_VHL_DEF (IP_VERSION | IP_HDRLEN) + +#define IP_DN_FRAGMENT_FLAG 0x0040 + +/* Used to compare MAC addresses. */ +#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL + +/* Configurable number of RX/TX ring descriptors */ +#define RTE_TEST_RX_DESC_DEFAULT 1024 +#define RTE_TEST_TX_DESC_DEFAULT 512 + +/* Default inner VLAN ID */ +#define INNER_VLAN_ID 100 + +/* VXLAN device */ +struct vxlan_conf vxdev; + +struct ipv4_hdr app_ip_hdr[VXLAN_N_PORTS]; +struct ether_hdr app_l2_hdr[VXLAN_N_PORTS]; + +/* local VTEP IP address */ +uint8_t vxlan_multicast_ips[2][4] = { {239, 1, 1, 1 }, {239, 1, 2, 1 } }; + +/* Remote VTEP IP address */ +uint8_t vxlan_overlay_ips[2][4] = { {192, 168, 10, 1}, {192, 168, 30, 1} }; + +/* Remote VTEP MAC address */ +uint8_t peer_mac[6] = {0x00, 0x11, 0x01, 0x00, 0x00, 0x01}; + +/* VXLAN RX filter type */ +uint8_t tep_filter_type[] = {RTE_TUNNEL_FILTER_IMAC_TENID, + RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID, + RTE_TUNNEL_FILTER_OMAC_TENID_IMAC,}; + +/* Options for configuring ethernet port */ +static const struct rte_eth_conf port_conf = { + .rxmode = { + .split_hdr_size = 0, + .header_split = 0, /**< Header Split disabled */ + .hw_ip_checksum = 0, /**< IP checksum offload disabled */ + .hw_vlan_filter = 0, /**< VLAN filtering disabled */ + .jumbo_frame = 0, 
/**< Jumbo Frame Support disabled */ + .hw_strip_crc = 0, /**< CRC stripped by hardware */ + }, + .txmode = { + .mq_mode = ETH_MQ_TX_NONE, + }, +}; + +/** + * The one or two device(s) that belongs to the same tenant ID can + * be assigned in a VM. + */ +const uint16_t tenant_id_conf[] = { + 1000, 1000, 1001, 1001, 1002, 1002, 1003, 1003, + 1004, 1004, 1005, 1005, 1006, 1006, 1007, 1007, + 1008, 1008, 1009, 1009, 1010, 1010, 1011, 1011, + 1012, 1012, 1013, 1013, 1014, 1014, 1015, 1015, + 1016, 1016, 1017, 1017, 1018, 1018, 1019, 1019, + 1020, 1020, 1021, 1021, 1022, 1022, 1023, 1023, + 1024, 1024, 1025, 1025, 1026, 1026, 1027, 1027, + 1028, 1028, 1029, 1029, 1030, 1030, 1031, 1031, +}; + +/** + * Initialises a given port using global settings and with the rx buffers + * coming from the mbuf_pool passed as parameter + */ +int +vxlan_port_init(uint8_t port, struct rte_mempool *mbuf_pool) +{ + int retval; + uint16_t q; + struct rte_eth_dev_info dev_info; + uint16_t rx_rings, tx_rings = (uint16_t)rte_lcore_count(); + const uint16_t rx_ring_size = RTE_TEST_RX_DESC_DEFAULT; + const uint16_t tx_ring_size = RTE_TEST_TX_DESC_DEFAULT; + struct rte_eth_udp_tunnel tunnel_udp; + struct rte_eth_rxconf *rxconf; + struct rte_eth_txconf *txconf; + struct vxlan_conf *pconf = &vxdev; + + pconf->dst_port = udp_port; + + rte_eth_dev_info_get(port, &dev_info); + + if (dev_info.max_rx_queues > MAX_QUEUES) { + rte_exit(EXIT_FAILURE, + "please define MAX_QUEUES no less than %u in %s\n", + dev_info.max_rx_queues, __FILE__); + } + + rxconf = &dev_info.default_rxconf; + txconf = &dev_info.default_txconf; + txconf->txq_flags = 0; + + if (port >= rte_eth_dev_count()) + return -1; + + rx_rings = nb_devices; + + /* Configure ethernet device. */ + retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); + if (retval != 0) + return retval; + + /* Setup the queues. 
*/ + for (q = 0; q < rx_rings; q++) { + retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, + rte_eth_dev_socket_id(port), + rxconf, + mbuf_pool); + if (retval < 0) + return retval; + } + for (q = 0; q < tx_rings; q++) { + retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, + rte_eth_dev_socket_id(port), + txconf); + if (retval < 0) + return retval; + } + + /* Start the device. */ + retval = rte_eth_dev_start(port); + if (retval < 0) + return retval; + + /* Configure UDP port for UDP tunneling */ + tunnel_udp.udp_port = udp_port; + tunnel_udp.prot_type = RTE_TUNNEL_TYPE_VXLAN; + retval = rte_eth_dev_udp_tunnel_port_add(port, &tunnel_udp); + if (retval < 0) + return retval; + rte_eth_macaddr_get(port, &ports_eth_addr[port]); + RTE_LOG(INFO, PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 + " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", + (unsigned)port, + ports_eth_addr[port].addr_bytes[0], + ports_eth_addr[port].addr_bytes[1], + ports_eth_addr[port].addr_bytes[2], + ports_eth_addr[port].addr_bytes[3], + ports_eth_addr[port].addr_bytes[4], + ports_eth_addr[port].addr_bytes[5]); + + if (tso_segsz != 0) { + struct rte_eth_dev_info dev_info; + rte_eth_dev_info_get(port, &dev_info); + if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) == 0) + RTE_LOG(WARNING, PORT, + "hardware TSO offload is not supported\n"); + } + return 0; +} + +static int +vxlan_rx_process(struct rte_mbuf *pkt) +{ + int ret = 0; + + if (rx_decap) + ret = decapsulation(pkt); + + return ret; +} + +static void +vxlan_tx_process(uint8_t queue_id, struct rte_mbuf *pkt) +{ + if (tx_encap) + encapsulation(pkt, queue_id); + + return; +} + +/* + * This function learns the MAC address of the device and set init + * L2 header and L3 header info. 
+ */
int
vxlan_link(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	int i, ret;
	struct ether_hdr *pkt_hdr;
	struct virtio_net *dev = vdev->dev;
	uint64_t portid = dev->device_fh;
	uint32_t local_ip = 0, remote_ip = 0;
	struct ipv4_hdr *ip;

	struct rte_eth_tunnel_filter_conf tunnel_filter_conf;

	/*
	 * portid indexes arrays of VXLAN_N_PORTS entries (vxdev.port[],
	 * app_l2_hdr[], app_ip_hdr[], the VTEP address tables), so the last
	 * valid value is VXLAN_N_PORTS - 1.
	 */
	if (unlikely(portid >= VXLAN_N_PORTS)) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%"PRIu64") WARNING: Not configuring device,"
			"as already have %d ports for VXLAN.",
			dev->device_fh, VXLAN_N_PORTS);
		return -1;
	}

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (is_same_ether_addr(&(pkt_hdr->s_addr), &vdev->mac_address)) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%"PRIu64") WARNING: This device is using an existing"
			" MAC address and has not been registered.\n",
			dev->device_fh);
		return -1;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++) {
		vdev->mac_address.addr_bytes[i] =
			vxdev.port[portid].vport_mac.addr_bytes[i] =
			pkt_hdr->s_addr.addr_bytes[i];
		vxdev.port[portid].peer_mac.addr_bytes[i] = peer_mac[i];
	}

	memset(&tunnel_filter_conf, 0,
		sizeof(struct rte_eth_tunnel_filter_conf));

	ether_addr_copy(&ports_eth_addr[0], &tunnel_filter_conf.outer_mac);
	tunnel_filter_conf.filter_type = tep_filter_type[filter_idx];

	/* inner MAC */
	ether_addr_copy(&vdev->mac_address, &tunnel_filter_conf.inner_mac);

	tunnel_filter_conf.queue_id = vdev->rx_q;
	tunnel_filter_conf.tenant_id = tenant_id_conf[vdev->rx_q];

	if (tep_filter_type[filter_idx] == RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID)
		tunnel_filter_conf.inner_vlan = INNER_VLAN_ID;

	tunnel_filter_conf.tunnel_type = RTE_TUNNEL_TYPE_VXLAN;

	ret = rte_eth_dev_filter_ctrl(ports[0],
		RTE_ETH_FILTER_TUNNEL,
		RTE_ETH_FILTER_ADD,
		&tunnel_filter_conf);
	if (ret) {
		RTE_LOG(ERR, VHOST_DATA,
			"%d Failed to add device MAC address to cloud filter\n",
			vdev->rx_q);
		return -1;
	}

	/* Print out inner MAC and VNI info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VNI %d registered\n",
		vdev->rx_q,
		vdev->mac_address.addr_bytes[0],
		vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2],
		vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4],
		vdev->mac_address.addr_bytes[5],
		tenant_id_conf[vdev->rx_q]);

	vxdev.port[portid].vport_id = portid;

	/*
	 * Assemble both addresses in locals and assign them afterwards:
	 * OR-ing straight into vxdev.port_ip would merge the bits of the
	 * address of a previously linked device (or of an earlier link of
	 * the same device) into the new value. The cast keeps the shift in
	 * unsigned arithmetic.
	 */
	for (i = 0; i < 4; i++) {
		/* Local VTEP IP */
		local_ip |= (uint32_t)vxlan_multicast_ips[portid][i] << (8 * i);
		/* Remote VTEP IP */
		remote_ip |= (uint32_t)vxlan_overlay_ips[portid][i] << (8 * i);
	}
	vxdev.port_ip = local_ip;
	vxdev.port[portid].peer_ip = remote_ip;

	vxdev.out_key = tenant_id_conf[vdev->rx_q];
	ether_addr_copy(&vxdev.port[portid].peer_mac,
			&app_l2_hdr[portid].d_addr);
	ether_addr_copy(&ports_eth_addr[0],
			&app_l2_hdr[portid].s_addr);
	app_l2_hdr[portid].ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);

	/* Template of the outer IPv4 header used for this port. */
	ip = &app_ip_hdr[portid];
	ip->version_ihl = IP_VHL_DEF;
	ip->type_of_service = 0;
	ip->total_length = 0;
	ip->packet_id = 0;
	ip->fragment_offset = IP_DN_FRAGMENT_FLAG;
	ip->time_to_live = IP_DEFTTL;
	ip->next_proto_id = IPPROTO_UDP;
	ip->hdr_checksum = 0;
	ip->src_addr = vxdev.port_ip;
	ip->dst_addr = vxdev.port[portid].peer_ip;

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/**
 * Removes cloud filter. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
+ */ +void +vxlan_unlink(struct vhost_dev *vdev) +{ + unsigned i = 0, rx_count; + int ret; + struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; + struct rte_eth_tunnel_filter_conf tunnel_filter_conf; + + if (vdev->ready == DEVICE_RX) { + memset(&tunnel_filter_conf, 0, + sizeof(struct rte_eth_tunnel_filter_conf)); + + ether_addr_copy(&ports_eth_addr[0], &tunnel_filter_conf.outer_mac); + ether_addr_copy(&vdev->mac_address, &tunnel_filter_conf.inner_mac); + tunnel_filter_conf.tenant_id = tenant_id_conf[vdev->rx_q]; + tunnel_filter_conf.filter_type = tep_filter_type[filter_idx]; + + if (tep_filter_type[filter_idx] == + RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID) + tunnel_filter_conf.inner_vlan = INNER_VLAN_ID; + + tunnel_filter_conf.queue_id = vdev->rx_q; + tunnel_filter_conf.tunnel_type = RTE_TUNNEL_TYPE_VXLAN; + + ret = rte_eth_dev_filter_ctrl(ports[0], + RTE_ETH_FILTER_TUNNEL, + RTE_ETH_FILTER_DELETE, + &tunnel_filter_conf); + if (ret) { + RTE_LOG(ERR, VHOST_DATA, + "%d Failed to add device MAC address to cloud filter\n", + vdev->rx_q); + return; + } + for (i = 0; i < ETHER_ADDR_LEN; i++) + vdev->mac_address.addr_bytes[i] = 0; + + /* Clear out the receive buffers */ + rx_count = rte_eth_rx_burst(ports[0], + (uint16_t)vdev->rx_q, + pkts_burst, MAX_PKT_BURST); + + while (rx_count) { + for (i = 0; i < rx_count; i++) + rte_pktmbuf_free(pkts_burst[i]); + + rx_count = rte_eth_rx_burst(ports[0], + (uint16_t)vdev->rx_q, + pkts_burst, MAX_PKT_BURST); + } + vdev->ready = DEVICE_MAC_LEARNING; + } +} + +/* Transmit packets after encapsulating */ +int +vxlan_tx_pkts(uint8_t port_id, uint16_t queue_id, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts) { + int ret = 0; + uint16_t i; + + for (i = 0; i < nb_pkts; i++) + vxlan_tx_process(queue_id, tx_pkts[i]); + + ret = rte_eth_tx_burst(port_id, queue_id, tx_pkts, nb_pkts); + + return ret; +} + +/* Check for decapsulation and pass packets directly to VIRTIO device */ +int +vxlan_rx_pkts(struct virtio_net *dev, struct rte_mbuf **pkts_burst, + uint32_t 
rx_count) +{ + uint32_t i = 0; + uint32_t count = 0; + int ret; + struct rte_mbuf *pkts_valid[rx_count]; + + for (i = 0; i < rx_count; i++) { + if (enable_stats) { + rte_atomic64_add( + &dev_statistics[dev->device_fh].rx_bad_ip_csum, + (pkts_burst[i]->ol_flags & PKT_RX_IP_CKSUM_BAD) + != 0); + rte_atomic64_add( + &dev_statistics[dev->device_fh].rx_bad_ip_csum, + (pkts_burst[i]->ol_flags & PKT_RX_L4_CKSUM_BAD) + != 0); + } + ret = vxlan_rx_process(pkts_burst[i]); + if (unlikely(ret < 0)) + continue; + + pkts_valid[count] = pkts_burst[i]; + count++; + } + + ret = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_valid, count); + return ret; +} diff --git a/examples/tep_termination/vxlan_setup.h b/examples/tep_termination/vxlan_setup.h new file mode 100644 index 00000000..1846540f --- /dev/null +++ b/examples/tep_termination/vxlan_setup.h @@ -0,0 +1,87 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Declarations for the VXLAN tunnel setup code: variables defined in other
+ * translation units, the ol_* callback types and the vxlan_* entry points.
+ */
+#ifndef VXLAN_SETUP_H_
+#define VXLAN_SETUP_H_
+/*
+ * Variables defined elsewhere. This header includes nothing itself, so the
+ * includer must already provide the fixed-width integer types,
+ * RTE_MAX_ETHPORTS, MAX_DEVICES, struct ether_addr and
+ * struct device_statistics.
+ */
+extern uint16_t nb_devices;
+extern uint16_t udp_port;	/* presumably the VXLAN UDP port - confirm */
+extern uint8_t filter_idx;	/* index into tep_filter_type[] for tunnel filters */
+extern uint8_t ports[RTE_MAX_ETHPORTS];	/* ports[0] is the port used for filters and RX drain */
+extern struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];	/* entry 0: outer MAC of the tunnel filter */
+extern uint32_t enable_stats;	/* non-zero: vxlan_rx_pkts() updates dev_statistics */
+extern struct device_statistics dev_statistics[MAX_DEVICES];	/* indexed by virtio_net device_fh */
+extern uint8_t rx_decap;
+extern uint8_t tx_encap;
+/* Port configuration callback; vxlan_port_init() has this signature. */
+typedef int (*ol_port_configure_t)(uint8_t port,
+		struct rte_mempool *mbuf_pool);
+/* Tunnel setup callback; vxlan_link() has this signature. */
+typedef int (*ol_tunnel_setup_t)(struct vhost_dev *vdev,
+		struct rte_mbuf *m);
+/* Tunnel teardown callback; vxlan_unlink() has this signature. */
+typedef void (*ol_tunnel_destroy_t)(struct vhost_dev *vdev);
+/* Transmit callback; vxlan_tx_pkts() has this signature. */
+typedef int (*ol_tx_handle_t)(uint8_t port_id, uint16_t queue_id,
+		struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
+/* Receive callback; vxlan_rx_pkts() has this signature. */
+typedef int (*ol_rx_handle_t)(struct virtio_net *dev, struct rte_mbuf **pkts,
+		uint32_t count);
+/* Callback taking only the virtio device; no implementation is declared in this header. */
+typedef int (*ol_param_handle)(struct virtio_net *dev);
+/* Callback table holding one function pointer of each ol_* type above. */
+struct ol_switch_ops {
+	ol_port_configure_t port_configure;
+	ol_tunnel_setup_t tunnel_setup;
+	ol_tunnel_destroy_t tunnel_destroy;
+	ol_tx_handle_t tx_handle;
+	ol_rx_handle_t rx_handle;
+	ol_param_handle param_handle;
+};
+/* Signature matches ol_port_configure_t. */
+int
+vxlan_port_init(uint8_t port, struct rte_mempool *mbuf_pool);
+/*
+ * Learns the guest MAC from m, adds the VXLAN tunnel filter and marks vdev
+ * ready for RX. Returns 0 on success and -1 on failure.
+ */
+int
+vxlan_link(struct vhost_dev *vdev, struct rte_mbuf *m);
+/*
+ * For a device in the DEVICE_RX state: deletes the tunnel filter, clears the
+ * learned MAC, drains the RX queue and returns to DEVICE_MAC_LEARNING.
+ */
+void
+vxlan_unlink(struct vhost_dev *vdev);
+/*
+ * vxlan_tx_pkts() runs vxlan_tx_process() on every packet and returns the
+ * result of rte_eth_tx_burst(). vxlan_rx_pkts() runs vxlan_rx_process() on
+ * every packet, enqueues those it accepts and returns the result of
+ * rte_vhost_enqueue_burst().
+ */
+int
+vxlan_tx_pkts(uint8_t port_id, uint16_t queue_id,
+		struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
+int
+vxlan_rx_pkts(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count);
+
+#endif /* VXLAN_SETUP_H_ */ |