/* *------------------------------------------------------------------ * tuntap.c - kernel stack (reverse) punt/inject path * * Copyright (c) 2009 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *------------------------------------------------------------------ */ /** * @file * @brief TunTap Kernel stack (reverse) punt/inject path. * * This driver runs in one of two distinct modes: * - "punt/inject" mode, where we send pkts not otherwise processed * by the forwarding to the Linux kernel stack, and * * - "normal interface" mode, where we treat the Linux kernel stack * as a peer. * * By default, we select punt/inject mode. */ #include /* for open */ #include #include #include #include #include /* for iovec */ #include #include #include #include #include #include #include #include #include #include static vnet_device_class_t tuntap_dev_class; static vnet_hw_interface_class_t tuntap_interface_class; static void tuntap_punt_frame (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame); static void tuntap_nopunt_frame (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame); typedef struct { u32 sw_if_index; u8 is_v6; u8 addr[16]; } subif_address_t; /** * @brief TUNTAP per thread struct */ typedef struct { /** Vector of VLIB rx buffers to use. We allocate them in blocks of VLIB_FRAME_SIZE (256). */ u32 *rx_buffers; /** Vector of iovecs for readv/writev calls. */ struct iovec *iovecs; } tuntap_per_thread_t; /** * @brief TUNTAP node main state */ typedef struct { /** per thread variables */ tuntap_per_thread_t *threads; /** File descriptors for /dev/net/tun and provisioning socket. */ int dev_net_tun_fd, dev_tap_fd; /** Create a "tap" [ethernet] encaps device */ int is_ether; /** 1 if a "normal" routed intfc, 0 if a punt/inject interface */ int have_normal_interface; /** tap device destination MAC address. Required, or Linux drops pkts */ u8 ether_dst_mac[6]; /** Interface MTU in bytes and # of default sized buffers. */ u32 mtu_bytes, mtu_buffers; /** Linux interface name for tun device. */ char *tun_name; /** Pool of subinterface addresses */ subif_address_t *subifs; /** Hash for subif addresses */ mhash_t subif_mhash; /** Unix file index */ u32 clib_file_index; /** For the "normal" interface, if configured */ u32 hw_if_index, sw_if_index; } tuntap_main_t; static tuntap_main_t tuntap_main = { .tun_name = "vnet", /** Suitable defaults for an Ethernet-like tun/tap device */ .mtu_bytes = 4096 + 256, }; /** * @brief tuntap_tx * @node tuntap-tx * * Output node, writes the buffers comprising the incoming frame * to the tun/tap device, aka hands them to the Linux kernel stack. * * @param *vm - vlib_main_t * @param *node - vlib_node_runtime_t * @param *frame - vlib_frame_t * * @return rc - uword * */ static uword tuntap_tx (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { u32 *buffers = vlib_frame_vector_args (frame); uword n_packets = frame->n_vectors; tuntap_main_t *tm = &tuntap_main; vnet_main_t *vnm = vnet_get_main (); vnet_interface_main_t *im = &vnm->interface_main; u32 n_bytes = 0; int i; u16 thread_index = vm->thread_index; for (i = 0; i < n_packets; i++) { struct iovec *iov; vlib_buffer_t *b; uword l; b = vlib_get_buffer (vm, buffers[i]); if (tm->is_ether && (!tm->have_normal_interface)) { vlib_buffer_reset (b); clib_memcpy_fast (vlib_buffer_get_current (b), tm->ether_dst_mac, 6); } /* Re-set iovecs if present. */ if (tm->threads[thread_index].iovecs) _vec_len (tm->threads[thread_index].iovecs) = 0; /** VLIB buffer chain -> Unix iovec(s). */ vec_add2 (tm->threads[thread_index].iovecs, iov, 1); iov->iov_base = b->data + b->current_data; iov->iov_len = l = b->current_length; if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT)) { do { b = vlib_get_buffer (vm, b->next_buffer); vec_add2 (tm->threads[thread_index].iovecs, iov, 1); iov->iov_base = b->data + b->current_data; iov->iov_len = b->current_length; l += b->current_length; } while (b->flags & VLIB_BUFFER_NEXT_PRESENT); } if (writev (tm->dev_net_tun_fd, tm->threads[thread_index].iovecs, vec_len (tm->threads[thread_index].iovecs)) < l) clib_unix_warning ("writev"); n_bytes += l; } /* Update tuntap interface output stats. */ vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, vm->thread_index, tm->sw_if_index, n_packets, n_bytes); /** The normal interface path flattens the buffer chain */ if (tm->have_normal_interface) vlib_buffer_free_no_next (vm, buffers, n_packets); else vlib_buffer_free (vm, buffers, n_packets); return n_packets; } /* *INDENT-OFF* */ VLIB_REGISTER_NODE (tuntap_tx_node,static) = { .function = tuntap_tx, .name = "tuntap-tx", .type = VLIB_NODE_TYPE_INTERNAL, .vector_size = 4, }; /* *INDENT-ON* */ /** * @brief TUNTAP receive node * @node tuntap-rx * * @param *vm - vlib_main_t * @param *node - vlib_node_runtime_t * @param *frame - vlib_frame_t * * @return rc - uword * */ static uword tuntap_rx (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { tuntap_main_t *tm = &tuntap_main; vlib_buffer_t *b; u32 bi; const uword buffer_size = vlib_buffer_get_default_data_size (vm); u16 thread_index = vm->thread_index; /** Make sure we have some RX buffers. */ { uword n_left = vec_len (tm->threads[thread_index].rx_buffers); uword n_alloc; if (n_left < VLIB_FRAME_SIZE / 2) { if (!tm->threads[thread_index].rx_buffers) vec_alloc (tm->threads[thread_index].rx_buffers, VLIB_FRAME_SIZE); n_alloc = vlib_buffer_alloc (vm, tm->threads[thread_index].rx_buffers + n_left, VLIB_FRAME_SIZE - n_left); _vec_len (tm->threads[thread_index].rx_buffers) = n_left + n_alloc; } } /** Allocate RX buffers from end of rx_buffers. Turn them into iovecs to pass to readv. */ { uword i_rx = vec_len (tm->threads[thread_index].rx_buffers) - 1; vlib_buffer_t *b; word i, n_bytes_left, n_bytes_in_packet; /** We should have enough buffers left for an MTU sized packet. */ ASSERT (vec_len (tm->threads[thread_index].rx_buffers) >= tm->mtu_buffers); vec_validate (tm->threads[thread_index].iovecs, tm->mtu_buffers - 1); for (i = 0; i < tm->mtu_buffers; i++) { b = vlib_get_buffer (vm, tm->threads[thread_index].rx_buffers[i_rx - i]); tm->threads[thread_index].iovecs[i].iov_base = b->data; tm->threads[thread_index].iovecs[i].iov_len = buffer_size; } n_bytes_left = readv (tm->dev_net_tun_fd, tm->threads[thread_index].iovecs, tm->mtu_buffers); n_bytes_in_packet = n_bytes_left; if (n_bytes_left <= 0) { if (errno != EAGAIN) clib_unix_warning ("readv %d", n_bytes_left); return 0; } bi = tm->threads[thread_index].rx_buffers[i_rx]; while (1) { b = vlib_get_buffer (vm, tm->threads[thread_index].rx_buffers[i_rx]); b->flags = 0; b->current_data = 0; b->current_length = n_bytes_left < buffer_size ? n_bytes_left : buffer_size; n_bytes_left -= buffer_size; if (n_bytes_left <= 0) { break; } i_rx--; b->flags |= VLIB_BUFFER_NEXT_PRESENT; b->next_buffer = tm->threads[thread_index].rx_buffers[i_rx]; } /** Interface counters for tuntap interface. */ vlib_increment_combined_counter (vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, thread_index, tm->sw_if_index, 1, n_bytes_in_packet); _vec_len (tm->threads[thread_index].rx_buffers) = i_rx; } b = vlib_get_buffer (vm, bi); { u32 next_index; uword n_trace = vlib_get_trace_count (vm, node); vnet_buffer (b)->sw_if_index[VLIB_RX] = tm->sw_if_index; vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32) ~ 0; /* * Turn this on if you run into * "bad monkey" contexts, and you want to know exactly * which nodes they've visited... */ if (VLIB_BUFFER_TRACE_TRAJECTORY) b->pre_data[0] = 0; b->error = node->errors[0]; if (tm->is_ether) { next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; } else switch (b->data[0] & 0xf0) { case 0x40: next_index = VNET_DEVICE_INPUT_NEXT_IP4_INPUT; break; case 0x60: next_index = VNET_DEVICE_INPUT_NEXT_IP6_INPUT; break; default: next_index = VNET_DEVICE_INPUT_NEXT_DROP; break; } /* The linux kernel couldn't care less if our interface is up */ if (tm->have_normal_interface) { vnet_main_t *vnm = vnet_get_main (); vnet_sw_interface_t *si; si = vnet_get_sw_interface (vnm, tm->sw_if_index); if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)) next_index = VNET_DEVICE_INPUT_NEXT_DROP; } vnet_feature_start_device_input_x1 (tm->sw_if_index, &next_index, b); vlib_set_next_frame_buffer (vm, node, next_index, bi); if (n_trace > 0) { vlib_trace_buffer (vm, node, next_index, b, /* follow_chain */ 1); vlib_set_trace_count (vm, node, n_trace - 1); } } return 1; } /** * @brief TUNTAP_RX error strings */ static char *tuntap_rx_error_strings[] = { "unknown packet type", }; /* *INDENT-OFF* */ VLIB_REGISTER_NODE (tuntap_rx_node,static) = { .function = tuntap_rx, .flags = VLIB_NODE_FLAG_TRACE_SUPPORTED, .name = "tuntap-rx", .sibling_of = "device-input", .type = VLIB_NODE_TYPE_INPUT, .state = VLIB_NODE_STATE_INTERRUPT, .vector_size = 4, .n_errors = 1, .error_strings = tuntap_rx_error_strings, }; /* *INDENT-ON* */ /** * @brief Gets called when file descriptor is ready from epoll. * * @param *uf - clib_file_t * * @return error - clib_error_t */ static clib_error_t * tuntap_read_ready (clib_file_t * uf) { vlib_main_t *vm = vlib_get_main (); vlib_node_set_interrupt_pending (vm, tuntap_rx_node.index); return 0; } /** * @brief Clean up the tun/tap device * * @param *vm - vlib_main_t * * @return error - clib_error_t * */ static clib_error_t * tuntap_exit (vlib_main_t * vm) { tuntap_main_t *tm = &tuntap_main; struct ifreq ifr; int sfd; /* Not present. */ if (!tm->dev_net_tun_fd || tm->dev_net_tun_fd < 0) return 0; sfd = socket (AF_INET, SOCK_STREAM, 0); if (sfd < 0) clib_unix_warning ("provisioning socket"); clib_memset (&ifr, 0, sizeof (ifr)); strncpy (ifr.ifr_name, tm->tun_name, sizeof (ifr.ifr_name) - 1); /* get flags, modify to bring down interface... */ if (ioctl (sfd, SIOCGIFFLAGS, &ifr) < 0) clib_unix_warning ("SIOCGIFFLAGS"); ifr.ifr_flags &= ~(IFF_UP | IFF_RUNNING); if (ioctl (sfd, SIOCSIFFLAGS, &ifr) < 0) clib_unix_warning ("SIOCSIFFLAGS"); /* Turn off persistence */ if (ioctl (tm->dev_net_tun_fd, TUNSETPERSIST, 0) < 0) clib_unix_warning ("TUNSETPERSIST"); close (tm->dev_tap_fd); if (tm->dev_net_tun_fd >= 0) close (tm->dev_net_tun_fd); if (sfd >= 0) close (sfd); return 0; } VLIB_MAIN_LOOP_EXIT_FUNCTION (tuntap_exit); /** * @brief CLI function for tun/tap config * * @param *vm - vlib_main_t * @param *input - unformat_input_t * * @return error - clib_error_t * */ static clib_error_t * tuntap_config (vlib_main_t * vm, unformat_input_t * input) { tuntap_main_t *tm = &tuntap_main; clib_error_t *error = 0; struct ifreq ifr; u8 *name; int flags = IFF_TUN | IFF_NO_PI; int is_enabled = 0, is_ether = 0, have_normal_interface = 0; const uword buffer_size = vlib_buffer_get_default_data_size (vm); while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { if (unformat (input, "mtu %d", &tm->mtu_bytes)) ; else if (unformat (input, "enable")) is_enabled = 1; else if (unformat (input, "disable")) is_enabled = 0; else if (unformat (input, "ethernet") || unformat (input, "ether")) is_ether = 1; else if (unformat (input, "have-normal-interface") || unformat (input, "have-normal")) have_normal_interface = 1; else if (unformat (input, "name %s", &name)) tm->tun_name = (char *) name; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); } tm->dev_net_tun_fd = -1; tm->dev_tap_fd = -1; if (is_enabled == 0) return 0; if (geteuid ()) { clib_warning ("tuntap disabled: must be superuser"); return 0; } tm->is_ether = is_ether; tm->have_normal_interface = have_normal_interface; if (is_ether) flags = IFF_TAP | IFF_NO_PI; if ((tm->dev_net_tun_fd = open ("/dev/net/tun", O_RDWR)) < 0) { error = clib_error_return_unix (0, "open /dev/net/tun"); goto done; } clib_memset (&ifr, 0, sizeof (ifr)); strncpy (ifr.ifr_name, tm->tun_name, sizeof (ifr.ifr_name) - 1); ifr.ifr_flags = flags; if (ioctl (tm->dev_net_tun_fd, TUNSETIFF, (void *) &ifr) < 0) { error = clib_error_return_unix (0, "ioctl TUNSETIFF"); goto done; } /* Make it persistent, at least until we split. */ if (ioctl (tm->dev_net_tun_fd, TUNSETPERSIST, 1) < 0) { error = clib_error_return_unix (0, "TUNSETPERSIST"); goto done; } /* Open a provisioning socket */ if ((tm->dev_tap_fd = socket (PF_PACKET, SOCK_RAW, htons (ETH_P_ALL))) < 0) { error = clib_error_return_unix (0, "socket"); goto done; } /* Find the interface index. */ { struct ifreq ifr; struct sockaddr_ll sll; clib_memset (&ifr, 0, sizeof (ifr)); strncpy (ifr.ifr_name, tm->tun_name, sizeof (ifr.ifr_name) - 1); if (ioctl (tm->dev_tap_fd, SIOCGIFINDEX, &ifr) < 0) { error = clib_error_return_unix (0, "ioctl SIOCGIFINDEX"); goto done; } /* Bind the provisioning socket to the interface. */ clib_memset (&sll, 0, sizeof (sll)); sll.sll_family = AF_PACKET; sll.sll_ifindex = ifr.ifr_ifindex; sll.sll_protocol = htons (ETH_P_ALL); if (bind (tm->dev_tap_fd, (struct sockaddr *) &sll, sizeof (sll)) < 0) { error = clib_error_return_unix (0, "bind"); goto done; } } /* non-blocking I/O on /dev/tapX */ { int one = 1; if (ioctl (tm->dev_net_tun_fd, FIONBIO, &one) < 0) { error = clib_error_return_unix (0, "ioctl FIONBIO"); goto done; } } tm->mtu_buffers = (tm->mtu_bytes + (buffer_size - 1)) / buffer_size; ifr.ifr_mtu = tm->mtu_bytes; if (ioctl (tm->dev_tap_fd, SIOCSIFMTU, &ifr) < 0) { error = clib_error_return_unix (0, "ioctl SIOCSIFMTU"); goto done; } /* get flags, modify to bring up interface... */ if (ioctl (tm->dev_tap_fd, SIOCGIFFLAGS, &ifr) < 0) { error = clib_error_return_unix (0, "ioctl SIOCGIFFLAGS"); goto done; } ifr.ifr_flags |= (IFF_UP | IFF_RUNNING); if (ioctl (tm->dev_tap_fd, SIOCSIFFLAGS, &ifr) < 0) { error = clib_error_return_unix (0, "ioctl SIOCSIFFLAGS"); goto done; } if (is_ether) { if (ioctl (tm->dev_tap_fd, SIOCGIFHWADDR, &ifr) < 0) { error = clib_error_return_unix (0, "ioctl SIOCGIFHWADDR"); goto done; } else clib_memcpy_fast (tm->ether_dst_mac, ifr.ifr_hwaddr.sa_data, 6); } if (have_normal_interface) { vnet_main_t *vnm = vnet_get_main (); error = ethernet_register_interface (vnm, tuntap_dev_class.index, 0 /* device instance */ , tm->ether_dst_mac /* ethernet address */ , &tm->hw_if_index, 0 /* flag change */ ); if (error) clib_error_report (error); tm->sw_if_index = tm->hw_if_index; vm->os_punt_frame = tuntap_nopunt_frame; } else { vnet_main_t *vnm = vnet_get_main (); vnet_hw_interface_t *hi; vm->os_punt_frame = tuntap_punt_frame; tm->hw_if_index = vnet_register_interface (vnm, tuntap_dev_class.index, 0 /* device instance */ , tuntap_interface_class.index, 0); hi = vnet_get_hw_interface (vnm, tm->hw_if_index); tm->sw_if_index = hi->sw_if_index; /* Interface is always up. */ vnet_hw_interface_set_flags (vnm, tm->hw_if_index, VNET_HW_INTERFACE_FLAG_LINK_UP); vnet_sw_interface_set_flags (vnm, tm->sw_if_index, VNET_SW_INTERFACE_FLAG_ADMIN_UP); } { clib_file_t template = { 0 }; template.
# Copyright (c) 2021 Cisco and/or its affiliates.
#
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
#
# Licensed under the Apache License 2.0 or
# GNU General Public License v2.0 or later;  you may not use this file
# except in compliance with one of these Licenses. You
# may obtain a copy of the Licenses at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#     https://www.gnu.org/licenses/old-licenses/gpl-2.0-standalone.html
#
# Note: If this file is linked with Scapy, which is GPLv2+, your use of it
# must be under GPLv2+.  If at any point in the future it is no longer linked
# with Scapy (or other GPLv2+ licensed software), you are free to choose
# Apache 2.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distribute