diff options
Diffstat (limited to 'vnet/vnet/unix')
-rw-r--r-- | vnet/vnet/unix/gdb_funcs.c | 117 | ||||
-rw-r--r-- | vnet/vnet/unix/pcap.c | 213 | ||||
-rw-r--r-- | vnet/vnet/unix/pcap.h | 187 | ||||
-rw-r--r-- | vnet/vnet/unix/pcap2pg.c | 155 | ||||
-rw-r--r-- | vnet/vnet/unix/tapcli.c | 1200 | ||||
-rw-r--r-- | vnet/vnet/unix/tapcli.h | 29 | ||||
-rw-r--r-- | vnet/vnet/unix/tuntap.c | 907 | ||||
-rw-r--r-- | vnet/vnet/unix/tuntap.h | 37 |
8 files changed, 2845 insertions, 0 deletions
diff --git a/vnet/vnet/unix/gdb_funcs.c b/vnet/vnet/unix/gdb_funcs.c new file mode 100644 index 00000000000..9a2e4599a2b --- /dev/null +++ b/vnet/vnet/unix/gdb_funcs.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vppinfra/format.h> +#include <vlib/vlib.h> + +#include <vlib/threads.h> + +/* Functions to call from gdb */ + +u32 vl(void *p) +{ + return vec_len (p); +} + +uword pe (void *v) +{ + return (pool_elts(v)); +} + +int pifi (void *p, u32 index) +{ + return pool_is_free_index (p, index); +} + +void debug_hex_bytes (u8 *s, u32 n) +{ + fformat (stderr, "%U\n", format_hex_bytes, s, n); +} + +void vlib_dump_frame_ownership (void) +{ + vlib_main_t * vm = vlib_get_main(); + vlib_node_main_t * nm = &vm->node_main; + vlib_node_runtime_t * this_node_runtime; + vlib_next_frame_t * nf; + u32 first_nf_index; + u32 index; + + vec_foreach(this_node_runtime, nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]) + { + first_nf_index = this_node_runtime->next_frame_index; + + for (index = first_nf_index; index < first_nf_index + + this_node_runtime->n_next_nodes; index++) + { + vlib_node_runtime_t * owned_runtime; + nf = vec_elt_at_index (vm->node_main.next_frames, index); + if (nf->flags & VLIB_FRAME_OWNER) + { + owned_runtime = vec_elt_at_index (nm->nodes_by_type[0], + nf->node_runtime_index); + fformat(stderr, + "%s next index %d owns enqueue rights to %s\n", + 
nm->nodes[this_node_runtime->node_index]->name, + index - first_nf_index, + nm->nodes[owned_runtime->node_index]->name); + fformat (stderr, " nf index %d nf->frame_index %d\n", + nf - vm->node_main.next_frames, + nf->frame_index); + } + } + } +} + +void vlib_runtime_index_to_node_name (u32 index) +{ + vlib_main_t * vm = vlib_get_main(); + vlib_node_main_t * nm = &vm->node_main; + + if (index > vec_len (nm->nodes)) + { + fformat(stderr, "%d out of range, max %d\n", vec_len(nm->nodes)); + return; + } + + fformat(stderr, "node runtime index %d name %s\n", index, nm->nodes[index]->name); +} + + +static clib_error_t * +show_gdb_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_cli_output (vm, "vl(p) returns vec_len(p)"); + vlib_cli_output (vm, "pe(p) returns pool_elts(p)"); + vlib_cli_output (vm, "pifi(p, i) returns pool_is_free_index(p, i)"); + vlib_cli_output (vm, "debug_hex_bytes (ptr, n_bytes) dumps n_bytes in hex"); + vlib_cli_output (vm, "vlib_dump_frame_ownership() does what it says"); + vlib_cli_output (vm, "vlib_runtime_index_to_node_name (index) prints NN"); + + return 0; +} + +VLIB_CLI_COMMAND (show_gdb_funcs_command, static) = { + .path = "show gdb", + .short_help = "Describe functions which can be called from gdb", + .function = show_gdb_command_fn, +}; + +/* Cafeteria plan, maybe you don't want these functions */ +clib_error_t * +gdb_func_init (vlib_main_t * vm) { return 0; } + +VLIB_INIT_FUNCTION (gdb_func_init); diff --git a/vnet/vnet/unix/pcap.c b/vnet/vnet/unix/pcap.c new file mode 100644 index 00000000000..16b8443085b --- /dev/null +++ b/vnet/vnet/unix/pcap.c @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pcap.c: libpcap packet capture format + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vnet/unix/pcap.h> +#include <sys/fcntl.h> + +/* Usage + +#include <vnet/unix/pcap.h> + +static pcap_main_t pcap = { + .file_name = "/tmp/ip4", + .n_packets_to_capture = 2, + .packet_type = PCAP_PACKET_TYPE_ip, +}; + +To add a buffer: + + pcap_add_buffer (&pcap, vm, pi0, 128); + +file will be written after n_packets_to_capture or call to pcap_write (&pcap). 
+ +*/ + +clib_error_t * +pcap_write (pcap_main_t * pm) +{ + clib_error_t * error = 0; + + if (! (pm->flags & PCAP_MAIN_INIT_DONE)) + { + pcap_file_header_t fh; + int n; + + if (! pm->file_name) + pm->file_name = "/tmp/vnet.pcap"; + + pm->file_descriptor = open (pm->file_name, O_CREAT | O_TRUNC | O_WRONLY, 0664); + if (pm->file_descriptor < 0) + { + error = clib_error_return_unix (0, "failed to open `%s'", pm->file_name); + goto done; + } + + pm->flags |= PCAP_MAIN_INIT_DONE; + pm->n_packets_captured = 0; + pm->n_pcap_data_written = 0; + + /* Write file header. */ + memset (&fh, 0, sizeof (fh)); + fh.magic = 0xa1b2c3d4; + fh.major_version = 2; + fh.minor_version = 4; + fh.time_zone = 0; + fh.max_packet_size_in_bytes = 1 << 16; + fh.packet_type = pm->packet_type; + n = write (pm->file_descriptor, &fh, sizeof (fh)); + if (n != sizeof (fh)) + { + if (n < 0) + error = clib_error_return_unix (0, "write file header `%s'", pm->file_name); + else + error = clib_error_return (0, "short write of file header `%s'", pm->file_name); + goto done; + } + } + + do { + int n = vec_len (pm->pcap_data) - pm->n_pcap_data_written; + + if (n > 0) + { + n = write (pm->file_descriptor, + vec_elt_at_index (pm->pcap_data, pm->n_pcap_data_written), + n); + if (n < 0 && unix_error_is_fatal (errno)) + { + error = clib_error_return_unix (0, "write `%s'", pm->file_name); + goto done; + } + } + pm->n_pcap_data_written += n; + if (pm->n_pcap_data_written >= vec_len (pm->pcap_data)) + { + vec_reset_length (pm->pcap_data); + break; + } + } while (pm->n_packets_captured >= pm->n_packets_to_capture); + + if (pm->n_packets_captured >= pm->n_packets_to_capture) + { + close (pm->file_descriptor); + pm->flags &= ~PCAP_MAIN_INIT_DONE; + pm->file_descriptor = -1; + } + + done: + if (error) + { + if (pm->file_descriptor >= 0) + close (pm->file_descriptor); + } + return error; +} + +clib_error_t * pcap_read (pcap_main_t * pm) +{ + clib_error_t * error = 0; + int fd, need_swap, n; + pcap_file_header_t fh; + 
pcap_packet_header_t ph; + + fd = open (pm->file_name, O_RDONLY); + if (fd < 0) + { + error = clib_error_return_unix (0, "open `%s'", pm->file_name); + goto done; + } + + if (read (fd, &fh, sizeof (fh)) != sizeof (fh)) + { + error = clib_error_return_unix (0, "read file header `%s'", pm->file_name); + goto done; + } + + need_swap = 0; + if (fh.magic == 0xd4c3b2a1) + { + need_swap = 1; +#define _(t,f) fh.f = clib_byte_swap_##t (fh.f); + foreach_pcap_file_header; +#undef _ + } + + if (fh.magic != 0xa1b2c3d4) + { + error = clib_error_return (0, "bad magic `%s'", pm->file_name); + goto done; + } + + pm->min_packet_bytes = 0; + pm->max_packet_bytes = 0; + while ((n = read (fd, &ph, sizeof (ph))) != 0) + { + u8 * data; + + if (need_swap) + { +#define _(t,f) ph.f = clib_byte_swap_##t (ph.f); + foreach_pcap_packet_header; +#undef _ + } + + data = vec_new (u8, ph.n_bytes_in_packet); + if (read (fd, data, ph.n_packet_bytes_stored_in_file) != ph.n_packet_bytes_stored_in_file) + { + error = clib_error_return (0, "short read `%s'", pm->file_name); + goto done; + } + + if (vec_len (pm->packets_read) == 0) + pm->min_packet_bytes = pm->max_packet_bytes = ph.n_bytes_in_packet; + else + { + pm->min_packet_bytes = clib_min (pm->min_packet_bytes, ph.n_bytes_in_packet); + pm->max_packet_bytes = clib_max (pm->max_packet_bytes, ph.n_bytes_in_packet); + } + + vec_add1 (pm->packets_read, data); + } + + done: + if (fd >= 0) + close (fd); + return error; + +} diff --git a/vnet/vnet/unix/pcap.h b/vnet/vnet/unix/pcap.h new file mode 100644 index 00000000000..6e8e69191f5 --- /dev/null +++ b/vnet/vnet/unix/pcap.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pcap.h: libpcap packet capture format + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef included_vnet_pcap_h +#define included_vnet_pcap_h + +#include <vlib/vlib.h> + +#define foreach_vnet_pcap_packet_type \ + _ (null, 0) \ + _ (ethernet, 1) \ + _ (ppp, 9) \ + _ (ip, 12) \ + _ (hdlc, 104) + +typedef enum { +#define _(f,n) PCAP_PACKET_TYPE_##f = (n), + foreach_vnet_pcap_packet_type +#undef _ +} pcap_packet_type_t; + +#define foreach_pcap_file_header \ + /* 0xa1b2c3d4 host byte order. \ + 0xd4c3b2a1 => need to byte swap everything. */ \ + _ (u32, magic) \ + \ + /* Currently major 2 minor 4. */ \ + _ (u16, major_version) \ + _ (u16, minor_version) \ + \ + /* 0 for GMT. */ \ + _ (u32, time_zone) \ + \ + /* Accuracy of timestamps. Typically set to 0. */ \ + _ (u32, sigfigs) \ + \ + /* Size of largest packet in file. */ \ + _ (u32, max_packet_size_in_bytes) \ + \ + /* One of vnet_pcap_packet_type_t. */ \ + _ (u32, packet_type) + +/* File header. */ +typedef struct { +#define _(t, f) t f; + foreach_pcap_file_header +#undef _ +} pcap_file_header_t; + +#define foreach_pcap_packet_header \ + /* Time stamp in seconds and microseconds. */ \ + _ (u32, time_in_sec) \ + _ (u32, time_in_usec) \ + \ + /* Number of bytes stored in file and size of actual packet. */ \ + _ (u32, n_packet_bytes_stored_in_file) \ + _ (u32, n_bytes_in_packet) + +/* Packet header. */ +typedef struct { +#define _(t, f) t f; + foreach_pcap_packet_header +#undef _ + + /* Packet data follows. */ + u8 data[0]; +} pcap_packet_header_t; + +typedef struct { + /* File name of pcap output. */ + char * file_name; + + /* Number of packets to capture. */ + u32 n_packets_to_capture; + + pcap_packet_type_t packet_type; + + /* Number of packets currently captured. */ + u32 n_packets_captured; + + u32 flags; +#define PCAP_MAIN_INIT_DONE (1 << 0) + + /* File descriptor for reading/writing. */ + int file_descriptor; + + u32 n_pcap_data_written; + + /* Vector of pcap data. */ + u8 * pcap_data; + + /* Packets read from file. 
*/ + u8 ** packets_read; + + u32 min_packet_bytes, max_packet_bytes; +} pcap_main_t; + +/* Write out data to output file. */ +clib_error_t * pcap_write (pcap_main_t * pm); + +clib_error_t * pcap_read (pcap_main_t * pm); + +static inline void * +pcap_add_packet (pcap_main_t * pm, + f64 time_now, + u32 n_bytes_in_trace, + u32 n_bytes_in_packet) +{ + pcap_packet_header_t * h; + u8 * d; + + vec_add2 (pm->pcap_data, d, sizeof (h[0]) + n_bytes_in_trace); + h = (void *) (d); + h->time_in_sec = time_now; + h->time_in_usec = 1e6*(time_now - h->time_in_sec); + h->n_packet_bytes_stored_in_file = n_bytes_in_trace; + h->n_bytes_in_packet = n_bytes_in_packet; + pm->n_packets_captured++; + return h->data; +} + +static inline void +pcap_add_buffer (pcap_main_t * pm, + vlib_main_t * vm, u32 buffer_index, + u32 n_bytes_in_trace) +{ + vlib_buffer_t * b = vlib_get_buffer (vm, buffer_index); + u32 n = vlib_buffer_length_in_chain (vm, b); + i32 n_left = clib_min (n_bytes_in_trace, n); + f64 time_now = vlib_time_now (vm); + void * d; + + d = pcap_add_packet (pm, time_now, n_bytes_in_trace, n_left); + while (1) + { + memcpy (d, b->data + b->current_data, b->current_length); + n_left -= b->current_length; + if (n_left <= 0) + break; + d += b->current_length; + ASSERT (b->flags & VLIB_BUFFER_NEXT_PRESENT); + b = vlib_get_buffer (vm, b->next_buffer); + } + + /* Flush output vector. */ + if (vec_len (pm->pcap_data) >= 64*1024 + || pm->n_packets_captured >= pm->n_packets_to_capture) + pcap_write (pm); +} + +#endif /* included_vnet_pcap_h */ diff --git a/vnet/vnet/unix/pcap2pg.c b/vnet/vnet/unix/pcap2pg.c new file mode 100644 index 00000000000..10b47c976dc --- /dev/null +++ b/vnet/vnet/unix/pcap2pg.c @@ -0,0 +1,155 @@ +/* + * pcap2pg.c: convert pcap input to pg input + * + * Copyright (c) 2013 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/unix/pcap.h> +#include <vnet/ethernet/packet.h> +#include <stdio.h> + +pcap_main_t pcap_main; + +static char * pg_fmt = + "packet-generator new {\n" + " name s%d\n" + " limit 1\n" + " size %d-%d\n" + " node ethernet-input\n"; + + +void stream_boilerplate (FILE *ofp, int i, u8 * pkt) +{ + fformat(ofp, pg_fmt, i, vec_len(pkt), vec_len(pkt)); +} + +int pcap2pg (pcap_main_t * pm, FILE *ofp) +{ + int i, j; + u8 *pkt; + + for (i = 0; i < vec_len (pm->packets_read); i++) + { + int offset; + ethernet_header_t * h; + u64 ethertype; + + pkt = pm->packets_read[i]; + h = (ethernet_header_t *)pkt; + + stream_boilerplate (ofp, i, pkt); + + fformat (ofp, " data {\n"); + + ethertype = clib_net_to_host_u16 (h->type); + + /* + * In vnet terms, packet generator interfaces are not ethernets. + * They don't have vlan tables. + * This dance transforms captured 802.1q VLAN packets into + * regular Ethernet packets. 
+ */ + if (ethertype == 0x8100 /* 802.1q vlan */) + { + u16 * vlan_ethertype = (u16 *)(h+1); + ethertype = clib_net_to_host_u16(vlan_ethertype[0]); + offset = 18; + } + else + offset = 14; + + fformat (ofp, + " 0x%04x: %02x%02x.%02x%02x.%02x%02x" + " -> %02x%02x.%02x%02x.%02x%02x\n", + ethertype, + h->src_address[0], + h->src_address[1], + h->src_address[2], + h->src_address[3], + h->src_address[4], + h->src_address[5], + h->dst_address[0], + h->dst_address[1], + h->dst_address[2], + h->dst_address[3], + h->dst_address[4], + h->dst_address[5]); + + fformat (ofp, " hex 0x"); + + for (j = offset; j < vec_len (pkt); j++) + fformat (ofp, "%02x", pkt[j]); + + fformat (ofp, " }\n"); + fformat (ofp, "}\n\n"); + } + return 0; +} + +int main (int argc, char **argv) +{ + unformat_input_t input; + pcap_main_t * pm = &pcap_main; + u8 * input_file = 0, * output_file = 0; + FILE * ofp; + clib_error_t * error; + + unformat_init_command_line (&input, argv); + + while (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT) + { + if (unformat(&input, "-i %s", &input_file) + || unformat (&input, "input %s", &input_file)) + ; + else if (unformat (&input, "-o %s", &output_file) + || unformat (&input, "output %s", &output_file)) + ; + else + { + usage: + fformat(stderr, + "usage: pcap2pg -i <input-file> [-o <output-file>]\n"); + exit (1); + } + } + + if (input_file == 0) + goto usage; + + pm->file_name = (char *)input_file; + error = pcap_read (pm); + + if (error) + { + clib_error_report (error); + exit (1); + } + + if (output_file) + { + ofp = fopen ((char *)output_file, "rw"); + if (ofp == NULL) + clib_unix_warning ("Couldn't create '%s'", output_file); + exit (1); + } + else + { + ofp = stdout; + } + + pcap2pg (pm, ofp); + + fclose (ofp); + exit (0); +} diff --git a/vnet/vnet/unix/tapcli.c b/vnet/vnet/unix/tapcli.c new file mode 100644 index 00000000000..44af321f796 --- /dev/null +++ b/vnet/vnet/unix/tapcli.c @@ -0,0 +1,1200 @@ +/* + 
*------------------------------------------------------------------ + * tapcli.c - dynamic tap interface hookup + * + * Copyright (c) 2009 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <fcntl.h> /* for open */ +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/uio.h> /* for iovec */ +#include <netinet/in.h> + +#include <linux/if_arp.h> +#include <linux/if_tun.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> + +#include <vnet/ip/ip.h> + +#include <vnet/ethernet/ethernet.h> + +#if DPDK == 1 +#include <vnet/devices/dpdk/dpdk.h> +#endif + +#include <vnet/unix/tapcli.h> + +static vnet_device_class_t tapcli_dev_class; +static vnet_hw_interface_class_t tapcli_interface_class; + +static void tapcli_nopunt_frame (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame); +typedef struct { + u32 unix_fd; + u32 unix_file_index; + u32 provision_fd; + u32 sw_if_index; /* for counters */ + u32 hw_if_index; + u32 is_promisc; + struct ifreq ifr; + u32 per_interface_next_index; + u8 active; /* for delete */ +} tapcli_interface_t; + +typedef struct { + /* Vector of iovecs for readv/writev calls. */ + struct iovec * iovecs; + + /* Vector of VLIB rx buffers to use. We allocate them in blocks + of VLIB_FRAME_SIZE (256). 
*/ + u32 * rx_buffers; + + /* tap device destination MAC address. Required, or Linux drops pkts */ + u8 ether_dst_mac[6]; + + /* Interface MTU in bytes and # of default sized buffers. */ + u32 mtu_bytes, mtu_buffers; + + /* Vector of tap interfaces */ + tapcli_interface_t * tapcli_interfaces; + + /* Vector of deleted tap interfaces */ + u32 * tapcli_inactive_interfaces; + + /* Bitmap of tap interfaces with pending reads */ + uword * pending_read_bitmap; + + /* Hash table to find tapcli interface given hw_if_index */ + uword * tapcli_interface_index_by_sw_if_index; + + /* Hash table to find tapcli interface given unix fd */ + uword * tapcli_interface_index_by_unix_fd; + + /* renumbering table */ + u32 * show_dev_instance_by_real_dev_instance; + + /* 1 => disable CLI */ + int is_disabled; + + /* convenience */ + vlib_main_t * vlib_main; + vnet_main_t * vnet_main; + unix_main_t * unix_main; +} tapcli_main_t; + +static tapcli_main_t tapcli_main; + +/* + * tapcli_tx + * Output node, writes the buffers comprising the incoming frame + * to the tun/tap device, aka hands them to the Linux kernel stack. 
+ * + */ +static uword +tapcli_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 * buffers = vlib_frame_args (frame); + uword n_packets = frame->n_vectors; + tapcli_main_t * tm = &tapcli_main; + tapcli_interface_t * ti; + int i; + + for (i = 0; i < n_packets; i++) + { + struct iovec * iov; + vlib_buffer_t * b; + uword l; + vnet_hw_interface_t * hw; + uword * p; + u32 tx_sw_if_index; + + b = vlib_get_buffer (vm, buffers[i]); + + tx_sw_if_index = vnet_buffer(b)->sw_if_index[VLIB_TX]; + if (tx_sw_if_index == (u32)~0) + tx_sw_if_index = vnet_buffer(b)->sw_if_index[VLIB_RX]; + + ASSERT(tx_sw_if_index != (u32)~0); + + /* Use the sup intfc to finesse vlan subifs */ + hw = vnet_get_sup_hw_interface (tm->vnet_main, tx_sw_if_index); + tx_sw_if_index = hw->sw_if_index; + + p = hash_get (tm->tapcli_interface_index_by_sw_if_index, + tx_sw_if_index); + if (p == 0) + { + clib_warning ("sw_if_index %d unknown", tx_sw_if_index); + /* $$$ leak, but this should never happen... */ + continue; + } + else + ti = vec_elt_at_index (tm->tapcli_interfaces, p[0]); + + /* Re-set iovecs if present. */ + if (tm->iovecs) + _vec_len (tm->iovecs) = 0; + + /* VLIB buffer chain -> Unix iovec(s). 
*/ + vec_add2 (tm->iovecs, iov, 1); + iov->iov_base = b->data + b->current_data; + iov->iov_len = l = b->current_length; + + if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + do { + b = vlib_get_buffer (vm, b->next_buffer); + + vec_add2 (tm->iovecs, iov, 1); + + iov->iov_base = b->data + b->current_data; + iov->iov_len = b->current_length; + l += b->current_length; + } while (b->flags & VLIB_BUFFER_NEXT_PRESENT); + } + + if (writev (ti->unix_fd, tm->iovecs, vec_len (tm->iovecs)) < l) + clib_unix_warning ("writev"); + } + + /* interface output path flattens buffer chains */ + vlib_buffer_free_no_next (vm, buffers, n_packets); + + return n_packets; +} + +VLIB_REGISTER_NODE (tapcli_tx_node,static) = { + .function = tapcli_tx, + .name = "tapcli-tx", + .type = VLIB_NODE_TYPE_INTERNAL, + .vector_size = 4, +}; + +enum { + TAPCLI_RX_NEXT_IP4_INPUT, + TAPCLI_RX_NEXT_IP6_INPUT, + TAPCLI_RX_NEXT_ETHERNET_INPUT, + TAPCLI_RX_NEXT_DROP, + TAPCLI_RX_N_NEXT, +}; + +static uword +tapcli_rx (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + tapcli_main_t * tm = &tapcli_main; + vlib_buffer_t * b; + u32 bi; +#if DPDK == 0 + const uword buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES; + u32 free_list_index = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX; +#else + dpdk_main_t * dm = &dpdk_main; + const uword buffer_size = MBUF_SIZE; + u32 free_list_index = dm->vlib_buffer_free_list_index; +#endif + static u32 * ready_interface_indices; + tapcli_interface_t * ti; + int i; + + vec_reset_length (ready_interface_indices); + + clib_bitmap_foreach (i, tm->pending_read_bitmap, + ({ + vec_add1 (ready_interface_indices, i); + })); + + if (vec_len (ready_interface_indices) == 0) + return 1; + + for (i = 0; i < vec_len(ready_interface_indices); i++) + { + /* Clear the "interrupt" bit */ + tm->pending_read_bitmap = + clib_bitmap_set (tm->pending_read_bitmap, + ready_interface_indices[i], 0); + + ti = vec_elt_at_index (tm->tapcli_interfaces, 
ready_interface_indices[i]); + + /* Make sure we have some RX buffers. */ + { + uword n_left = vec_len (tm->rx_buffers); + uword n_alloc; + + if (n_left < VLIB_FRAME_SIZE / 2) + { + if (! tm->rx_buffers) + vec_alloc (tm->rx_buffers, VLIB_FRAME_SIZE); + + n_alloc = vlib_buffer_alloc_from_free_list + (vm, tm->rx_buffers + n_left, VLIB_FRAME_SIZE - n_left, + free_list_index); + _vec_len (tm->rx_buffers) = n_left + n_alloc; + } + } + + /* Allocate RX buffers from end of rx_buffers. + Turn them into iovecs to pass to readv. */ + { + uword i_rx = vec_len (tm->rx_buffers) - 1; + vlib_buffer_t * b; + word j, n_bytes_left, n_bytes_in_packet; +#if DPDK == 1 + u8 out_of_dpdk_buffers = 0; +#endif + + /* We need enough buffers left for an MTU sized packet. */ + if (PREDICT_FALSE(vec_len (tm->rx_buffers) < tm->mtu_buffers)) + { + clib_bitmap_set (tm->pending_read_bitmap, + ready_interface_indices[i], 1); + clib_warning ("buffer allocation failure"); + continue; + } + + vec_validate (tm->iovecs, tm->mtu_buffers - 1); + for (j = 0; j < tm->mtu_buffers; j++) + { + b = vlib_get_buffer (vm, tm->rx_buffers[i_rx - j]); + tm->iovecs[j].iov_base = b->data; + tm->iovecs[j].iov_len = buffer_size; + } + +#if DPDK == 1 + if (PREDICT_FALSE(out_of_dpdk_buffers == 1)) + continue; +#endif + + n_bytes_left = readv (ti->unix_fd, tm->iovecs, tm->mtu_buffers); + n_bytes_in_packet = n_bytes_left; + if (n_bytes_left <= 0) + { + if (errno != EAGAIN) + clib_unix_warning ("readv %d", n_bytes_left); + return 0; + } + + bi = tm->rx_buffers[i_rx]; + while (1) + { + b = vlib_get_buffer (vm, tm->rx_buffers[i_rx]); + + b->flags = 0; + b->current_data = 0; + b->current_length = n_bytes_left < buffer_size + ? 
n_bytes_left : buffer_size; + + n_bytes_left -= buffer_size; + + if (n_bytes_left <= 0) + { +#if DPDK == 1 + struct rte_mbuf *mb = (struct rte_mbuf *)(b - 1); + rte_pktmbuf_data_len (mb) = n_bytes_in_packet; + rte_pktmbuf_pkt_len (mb) = n_bytes_in_packet; +#endif + break; + } + + i_rx--; + b->flags |= VLIB_BUFFER_NEXT_PRESENT; + b->next_buffer = tm->rx_buffers[i_rx]; +#if DPDK == 1 + ASSERT(0); /* $$$$ fixme */ + /* ((struct rte_pktmbuf *)(b->mb))->next = + vlib_get_buffer (vm, tm->rx_buffers[i_rx])->mb; */ +#endif + } + + /* Interface counters for tapcli interface. */ + vlib_increment_combined_counter + (vnet_main.interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + os_get_cpu_number(), + ti->sw_if_index, + 1, n_bytes_in_packet); + + _vec_len (tm->rx_buffers) = i_rx; + } + + b = vlib_get_buffer (vm, bi); + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... See .../vlib/vlib/buffer.h + */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b); + + { + u32 next_index; + uword n_trace = vlib_get_trace_count (vm, node); + + vnet_buffer (b)->sw_if_index[VLIB_RX] = ti->sw_if_index; + vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32)~0; + + b->error = node->errors[0]; + + { + next_index = TAPCLI_RX_NEXT_ETHERNET_INPUT; + + next_index = (ti->per_interface_next_index != ~0) ? 
+ ti->per_interface_next_index : next_index; + } + { + vnet_main_t *vnm = vnet_get_main(); + vnet_sw_interface_t * si; + si = vnet_get_sw_interface (vnm, ti->sw_if_index); + if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)) + next_index = TAPCLI_RX_NEXT_DROP; + } + + + vlib_set_next_frame_buffer (vm, node, next_index, bi); + + if (n_trace > 0) + { + vlib_trace_buffer (vm, node, next_index, + b, /* follow_chain */ 1); + vlib_set_trace_count (vm, node, n_trace - 1); + } + } + } + + return 1; +} + +static char * tapcli_rx_error_strings[] = { + "Interface down", +}; + +VLIB_REGISTER_NODE (tapcli_rx_node,static) = { + .function = tapcli_rx, + .name = "tapcli-rx", + .type = VLIB_NODE_TYPE_INPUT, + .state = VLIB_NODE_STATE_INTERRUPT, + .vector_size = 4, + .n_errors = 1, + .error_strings = tapcli_rx_error_strings, + + .n_next_nodes = TAPCLI_RX_N_NEXT, + .next_nodes = { + [TAPCLI_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum", + [TAPCLI_RX_NEXT_IP6_INPUT] = "ip6-input", + [TAPCLI_RX_NEXT_DROP] = "error-drop", + [TAPCLI_RX_NEXT_ETHERNET_INPUT] = "ethernet-input", + }, +}; + +/* Gets called when file descriptor is ready from epoll. 
*/ +static clib_error_t * tapcli_read_ready (unix_file_t * uf) +{ + vlib_main_t * vm = vlib_get_main(); + tapcli_main_t * tm = &tapcli_main; + uword * p; + + /* Schedule the rx node */ + vlib_node_set_interrupt_pending (vm, tapcli_rx_node.index); + + p = hash_get (tm->tapcli_interface_index_by_unix_fd, uf->file_descriptor); + + /* Mark the specific tap interface ready-to-read */ + if (p) + tm->pending_read_bitmap = clib_bitmap_set (tm->pending_read_bitmap, + p[0], 1); + else + clib_warning ("fd %d not in hash table", uf->file_descriptor); + + return 0; +} + +static clib_error_t * +tapcli_config (vlib_main_t * vm, unformat_input_t * input) +{ + tapcli_main_t *tm = &tapcli_main; +#if DPDK == 0 + const uword buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES; +#else + const uword buffer_size = MBUF_SIZE; +#endif + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "mtu %d", &tm->mtu_bytes)) + ; + else if (unformat (input, "disable")) + tm->is_disabled = 1; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (tm->is_disabled) + return 0; + + if (geteuid()) + { + clib_warning ("tapcli disabled: must be superuser"); + tm->is_disabled = 1; + return 0; + } + + tm->mtu_buffers = (tm->mtu_bytes + (buffer_size - 1)) / buffer_size; + + return 0; +} + +static int tap_name_renumber (vnet_hw_interface_t * hi, + u32 new_dev_instance) +{ + tapcli_main_t *tm = &tapcli_main; + + vec_validate_init_empty (tm->show_dev_instance_by_real_dev_instance, + hi->dev_instance, ~0); + + tm->show_dev_instance_by_real_dev_instance [hi->dev_instance] = + new_dev_instance; + + return 0; +} + +VLIB_CONFIG_FUNCTION (tapcli_config, "tapcli"); + +static void +tapcli_nopunt_frame (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 * buffers = vlib_frame_args (frame); + uword n_packets = frame->n_vectors; + vlib_buffer_free (vm, buffers, n_packets); + vlib_frame_free (vm, node, frame); 
+} + +VNET_HW_INTERFACE_CLASS (tapcli_interface_class,static) = { + .name = "tapcli", +}; + +static u8 * format_tapcli_interface_name (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + u32 show_dev_instance = ~0; + tapcli_main_t * tm = &tapcli_main; + + if (i < vec_len (tm->show_dev_instance_by_real_dev_instance)) + show_dev_instance = tm->show_dev_instance_by_real_dev_instance[i]; + + if (show_dev_instance != ~0) + i = show_dev_instance; + + s = format (s, "tap-%d", i); + return s; +} + +static u32 tapcli_flag_change (vnet_main_t * vnm, + vnet_hw_interface_t * hw, + u32 flags) +{ + tapcli_main_t *tm = &tapcli_main; + tapcli_interface_t *ti; + struct ifreq ifr; + u32 want_promisc; + + ti = vec_elt_at_index (tm->tapcli_interfaces, hw->dev_instance); + + memcpy (&ifr, &ti->ifr, sizeof (ifr)); + + /* get flags, modify to bring up interface... */ + if (ioctl (ti->provision_fd, SIOCGIFFLAGS, &ifr) < 0) + { + clib_unix_warning ("Couldn't get interface flags for %s", hw->name); + return 0; + } + + want_promisc = (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) != 0; + + if (want_promisc == ti->is_promisc) + return 0; + + + if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) + ifr.ifr_flags |= IFF_PROMISC; + else + ifr.ifr_flags &= ~(IFF_PROMISC); + + /* get flags, modify to bring up interface... 
*/ + if (ioctl (ti->provision_fd, SIOCSIFFLAGS, &ifr) < 0) + { + clib_unix_warning ("Couldn't set interface flags for %s", hw->name); + return 0; + } + + ti->is_promisc = want_promisc; + + return 0; +} + +static void tapcli_set_interface_next_node (vnet_main_t *vnm, + u32 hw_if_index, + u32 node_index) +{ + tapcli_main_t *tm = &tapcli_main; + tapcli_interface_t *ti; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + + ti = vec_elt_at_index (tm->tapcli_interfaces, hw->dev_instance); + + /* Shut off redirection */ + if (node_index == ~0) + { + ti->per_interface_next_index = node_index; + return; + } + + ti->per_interface_next_index = + vlib_node_add_next (tm->vlib_main, tapcli_rx_node.index, node_index); +} + +/* + * Mainly exists to set link_state == admin_state + * otherwise, e.g. ip6 neighbor discovery breaks + */ +static clib_error_t * +tapcli_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + uword is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + u32 hw_flags; + u32 speed_duplex = VNET_HW_INTERFACE_FLAG_FULL_DUPLEX + | VNET_HW_INTERFACE_FLAG_SPEED_1G; + + if (is_admin_up) + hw_flags = VNET_HW_INTERFACE_FLAG_LINK_UP | speed_duplex; + else + hw_flags = speed_duplex; + + vnet_hw_interface_set_flags (vnm, hw_if_index, hw_flags); + return 0; +} + +VNET_DEVICE_CLASS (tapcli_dev_class,static) = { + .name = "tapcli", + .tx_function = tapcli_tx, + .format_device_name = format_tapcli_interface_name, + .rx_redirect_to_node = tapcli_set_interface_next_node, + .name_renumber = tap_name_renumber, + .admin_up_down_function = tapcli_interface_admin_up_down, +}; + +int vnet_tap_dump_ifs (tapcli_interface_details_t **out_tapids) +{ + tapcli_main_t * tm = &tapcli_main; + tapcli_interface_t * ti; + + tapcli_interface_details_t * r_tapids = NULL; + tapcli_interface_details_t * tapid = NULL; + + vec_foreach (ti, tm->tapcli_interfaces) { + if (!ti->active) + continue; + vec_add2(r_tapids, tapid, 1); + tapid->sw_if_index = 
ti->sw_if_index; + strncpy((char *)tapid->dev_name, ti->ifr.ifr_name, sizeof (ti->ifr.ifr_name)-1); + } + + *out_tapids = r_tapids; + + return 0; +} + +/* get tap interface from inactive interfaces or create new */ +static tapcli_interface_t *tapcli_get_new_tapif() +{ + tapcli_main_t * tm = &tapcli_main; + tapcli_interface_t *ti = NULL; + + int inactive_cnt = vec_len(tm->tapcli_inactive_interfaces); + // if there are any inactive ifaces + if (inactive_cnt > 0) { + // take last + u32 ti_idx = tm->tapcli_inactive_interfaces[inactive_cnt - 1]; + if (vec_len(tm->tapcli_interfaces) > ti_idx) { + ti = vec_elt_at_index (tm->tapcli_interfaces, ti_idx); + clib_warning("reusing tap interface"); + } + // "remove" from inactive list + _vec_len(tm->tapcli_inactive_interfaces) -= 1; + } + + // ti was not retrieved from inactive ifaces - create new + if (!ti) + vec_add2 (tm->tapcli_interfaces, ti, 1); + + return ti; +} + +int vnet_tap_connect (vlib_main_t * vm, u8 * intfc_name, u8 *hwaddr_arg, + u32 * sw_if_indexp) +{ + tapcli_main_t * tm = &tapcli_main; + tapcli_interface_t * ti = NULL; + struct ifreq ifr; + int flags; + int dev_net_tun_fd; + int dev_tap_fd = -1; + clib_error_t * error; + u8 hwaddr [6]; + int rv = 0; + + if (tm->is_disabled) + { + return VNET_API_ERROR_FEATURE_DISABLED; + } + + flags = IFF_TAP | IFF_NO_PI; + + if ((dev_net_tun_fd = open ("/dev/net/tun", O_RDWR)) < 0) + return VNET_API_ERROR_SYSCALL_ERROR_1; + + memset (&ifr, 0, sizeof (ifr)); + strncpy(ifr.ifr_name, (char *) intfc_name, sizeof (ifr.ifr_name)-1); + ifr.ifr_flags = flags; + if (ioctl (dev_net_tun_fd, TUNSETIFF, (void *)&ifr) < 0) + { + rv = VNET_API_ERROR_SYSCALL_ERROR_2; + goto error; + } + + /* Open a provisioning socket */ + if ((dev_tap_fd = socket(PF_PACKET, SOCK_RAW, + htons(ETH_P_ALL))) < 0 ) + { + rv = VNET_API_ERROR_SYSCALL_ERROR_3; + goto error; + } + + /* Find the interface index. 
*/ + { + struct ifreq ifr; + struct sockaddr_ll sll; + + memset (&ifr, 0, sizeof(ifr)); + strncpy (ifr.ifr_name, (char *) intfc_name, sizeof (ifr.ifr_name)-1); + if (ioctl (dev_tap_fd, SIOCGIFINDEX, &ifr) < 0 ) + { + rv = VNET_API_ERROR_SYSCALL_ERROR_4; + goto error; + } + + /* Bind the provisioning socket to the interface. */ + memset(&sll, 0, sizeof(sll)); + sll.sll_family = AF_PACKET; + sll.sll_ifindex = ifr.ifr_ifindex; + sll.sll_protocol = htons(ETH_P_ALL); + + if (bind(dev_tap_fd, (struct sockaddr*) &sll, sizeof(sll)) < 0) + { + rv = VNET_API_ERROR_SYSCALL_ERROR_5; + goto error; + } + } + + /* non-blocking I/O on /dev/tapX */ + { + int one = 1; + if (ioctl (dev_net_tun_fd, FIONBIO, &one) < 0) + { + rv = VNET_API_ERROR_SYSCALL_ERROR_6; + goto error; + } + } + ifr.ifr_mtu = tm->mtu_bytes; + if (ioctl (dev_tap_fd, SIOCSIFMTU, &ifr) < 0) + { + rv = VNET_API_ERROR_SYSCALL_ERROR_7; + goto error; + } + + /* get flags, modify to bring up interface... */ + if (ioctl (dev_tap_fd, SIOCGIFFLAGS, &ifr) < 0) + { + rv = VNET_API_ERROR_SYSCALL_ERROR_8; + goto error; + } + + ifr.ifr_flags |= (IFF_UP | IFF_RUNNING); + + if (ioctl (dev_tap_fd, SIOCSIFFLAGS, &ifr) < 0) + { + rv = VNET_API_ERROR_SYSCALL_ERROR_9; + goto error; + } + + if (ioctl (dev_tap_fd, SIOCGIFHWADDR, &ifr) < 0) + { + rv = VNET_API_ERROR_SYSCALL_ERROR_1; + goto error; + } + + ti = tapcli_get_new_tapif(); + + if (hwaddr_arg != 0) + memcpy(hwaddr, hwaddr_arg, 6); + + error = ethernet_register_interface + (tm->vnet_main, + tapcli_dev_class.index, + ti - tm->tapcli_interfaces /* device instance */, + hwaddr_arg != 0 ? 
hwaddr : + (u8 *) ifr.ifr_hwaddr.sa_data /* ethernet address */, + &ti->hw_if_index, + tapcli_flag_change); + + if (error) + { + clib_error_report (error); + rv = VNET_API_ERROR_INVALID_REGISTRATION; + goto error; + } + + { + unix_file_t template = {0}; + template.read_function = tapcli_read_ready; + template.file_descriptor = dev_net_tun_fd; + ti->unix_file_index = unix_file_add (&unix_main, &template); + ti->unix_fd = dev_net_tun_fd; + ti->provision_fd = dev_tap_fd; + memcpy (&ti->ifr, &ifr, sizeof (ifr)); + } + + { + vnet_hw_interface_t * hw; + hw = vnet_get_hw_interface (tm->vnet_main, ti->hw_if_index); + ti->sw_if_index = hw->sw_if_index; + if (sw_if_indexp) + *sw_if_indexp = hw->sw_if_index; + } + + ti->active = 1; + + hash_set (tm->tapcli_interface_index_by_sw_if_index, ti->sw_if_index, + ti - tm->tapcli_interfaces); + + hash_set (tm->tapcli_interface_index_by_unix_fd, ti->unix_fd, + ti - tm->tapcli_interfaces); + + return rv; + + error: + close (dev_net_tun_fd); + close (dev_tap_fd); + + return rv; +} + +int vnet_tap_connect_renumber (vlib_main_t * vm, u8 * intfc_name, + u8 *hwaddr_arg, u32 * sw_if_indexp, + u8 renumber, u32 custom_dev_instance) +{ + int rv = vnet_tap_connect(vm, intfc_name, hwaddr_arg, sw_if_indexp); + + if (!rv && renumber) + vnet_interface_name_renumber (*sw_if_indexp, custom_dev_instance); + + return rv; +} + +static int tapcli_tap_disconnect (tapcli_interface_t *ti) +{ + int rv = 0; + vnet_main_t * vnm = vnet_get_main(); + tapcli_main_t * tm = &tapcli_main; + u32 sw_if_index = ti->sw_if_index; + + // bring interface down + vnet_sw_interface_set_flags (vnm, sw_if_index, 0); + + if (ti->unix_file_index != ~0) { + unix_file_del (&unix_main, unix_main.file_pool + ti->unix_file_index); + ti->unix_file_index = ~0; + } + + hash_unset (tm->tapcli_interface_index_by_unix_fd, ti->unix_fd); + hash_unset (tm->tapcli_interface_index_by_sw_if_index, ti->sw_if_index); + close(ti->unix_fd); + close(ti->provision_fd); + ti->unix_fd = -1; + 
ti->provision_fd = -1; + + return rv; +} + +int vnet_tap_delete(vlib_main_t *vm, u32 sw_if_index) +{ + int rv = 0; + tapcli_main_t * tm = &tapcli_main; + tapcli_interface_t *ti; + uword *p = NULL; + + p = hash_get (tm->tapcli_interface_index_by_sw_if_index, + sw_if_index); + if (p == 0) { + clib_warning ("sw_if_index %d unknown", sw_if_index); + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + } + ti = vec_elt_at_index (tm->tapcli_interfaces, p[0]); + + // inactive + ti->active = 0; + tapcli_tap_disconnect(ti); + // add to inactive list + vec_add1(tm->tapcli_inactive_interfaces, ti - tm->tapcli_interfaces); + + // reset renumbered iface + if (p[0] < vec_len (tm->show_dev_instance_by_real_dev_instance)) + tm->show_dev_instance_by_real_dev_instance[p[0]] = ~0; + + ethernet_delete_interface (tm->vnet_main, ti->hw_if_index); + return rv; +} + +static clib_error_t * +tap_delete_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + tapcli_main_t * tm = &tapcli_main; + u32 sw_if_index = ~0; + + if (tm->is_disabled) + { + return clib_error_return (0, "device disabled..."); + } + + if (unformat (input, "%U", unformat_vnet_sw_interface, tm->vnet_main, + &sw_if_index)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + + + int rc = vnet_tap_delete (vm, sw_if_index); + + if (!rc) { + vlib_cli_output (vm, "Deleted."); + } else { + vlib_cli_output (vm, "Error during deletion of tap interface. 
(rc: %d)", rc); + } + + return 0; +} + +VLIB_CLI_COMMAND (tap_delete_command, static) = { + .path = "tap delete", + .short_help = "tap delete <vpp-tap-intfc-name>", + .function = tap_delete_command_fn, +}; + +/* modifies tap interface - can result in new interface being created */ +int vnet_tap_modify (vlib_main_t * vm, u32 orig_sw_if_index, + u8 * intfc_name, u8 *hwaddr_arg, + u32 * sw_if_indexp, + u8 renumber, u32 custom_dev_instance) +{ + int rv = vnet_tap_delete (vm, orig_sw_if_index); + + if (rv) + return rv; + + rv = vnet_tap_connect_renumber(vm, intfc_name, hwaddr_arg, sw_if_indexp, + renumber, custom_dev_instance); + + return rv; +} + +static clib_error_t * +tap_modify_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 * intfc_name; + tapcli_main_t * tm = &tapcli_main; + u32 sw_if_index = ~0; + u32 new_sw_if_index = ~0; + int user_hwaddr = 0; + u8 hwaddr[6]; + + if (tm->is_disabled) + { + return clib_error_return (0, "device disabled..."); + } + + if (unformat (input, "%U", unformat_vnet_sw_interface, tm->vnet_main, + &sw_if_index)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + + if (unformat (input, "%s", &intfc_name)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + + if (unformat(input, "hwaddr %U", unformat_ethernet_address, + &hwaddr)) + user_hwaddr = 1; + + + int rc = vnet_tap_modify (vm, sw_if_index, intfc_name, + (user_hwaddr == 1 ? hwaddr : 0), + &new_sw_if_index, 0, 0); + + if (!rc) { + vlib_cli_output (vm, "Modified %U for Linux tap '%s'", + format_vnet_sw_if_index_name, tm->vnet_main, + new_sw_if_index, intfc_name); + } else { + vlib_cli_output (vm, "Error during modification of tap interface. 
(rc: %d)", rc); + } + + return 0; +} + +VLIB_CLI_COMMAND (tap_modify_command, static) = { + .path = "tap modify", + .short_help = "tap modify <vpp-tap-intfc-name> <linux-intfc-name> [hwaddr [<addr> | random]]", + .function = tap_modify_command_fn, +}; + +static clib_error_t * +tap_connect_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 * intfc_name; + tapcli_main_t * tm = &tapcli_main; + tapcli_interface_t * ti; + struct ifreq ifr; + int flags; + int dev_net_tun_fd; + int dev_tap_fd = -1; + clib_error_t * error; + int user_hwaddr = 0; + u8 hwaddr[6]; + + if (tm->is_disabled) + { + return clib_error_return (0, "device disabled..."); + } + + if (unformat (input, "%s", &intfc_name)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + + if (unformat(input, "hwaddr %U", unformat_ethernet_address, + &hwaddr)) + user_hwaddr = 1; + + flags = IFF_TAP | IFF_NO_PI; + + if ((dev_net_tun_fd = open ("/dev/net/tun", O_RDWR)) < 0) + { + vlib_cli_output (vm, "Couldn't open /dev/net/tun"); + return 0; + } + + memset (&ifr, 0, sizeof (ifr)); + strncpy(ifr.ifr_name, (char *) intfc_name, sizeof (ifr.ifr_name)-1); + ifr.ifr_flags = flags; + if (ioctl (dev_net_tun_fd, TUNSETIFF, (void *)&ifr) < 0) + { + vlib_cli_output (vm, "Error setting flags on '%s'", intfc_name); + goto error; + } + + /* Open a provisioning socket */ + if ((dev_tap_fd = socket(PF_PACKET, SOCK_RAW, + htons(ETH_P_ALL))) < 0 ) + { + vlib_cli_output (vm, "Couldn't open provisioning socket"); + goto error; + } + + /* Find the interface index. */ + { + struct ifreq ifr; + struct sockaddr_ll sll; + + memset (&ifr, 0, sizeof(ifr)); + strncpy (ifr.ifr_name, (char *) intfc_name, sizeof (ifr.ifr_name)-1); + if (ioctl (dev_tap_fd, SIOCGIFINDEX, &ifr) < 0 ) + { + vlib_cli_output (vm, "Couldn't get if_index"); + goto error; + } + + /* Bind the provisioning socket to the interface. 
*/ + memset(&sll, 0, sizeof(sll)); + sll.sll_family = AF_PACKET; + sll.sll_ifindex = ifr.ifr_ifindex; + sll.sll_protocol = htons(ETH_P_ALL); + + if (bind(dev_tap_fd, (struct sockaddr*) &sll, sizeof(sll)) < 0) + { + vlib_cli_output (vm, "Couldn't bind provisioning socket"); + goto error; + } + } + + /* non-blocking I/O on /dev/tapX */ + { + int one = 1; + if (ioctl (dev_net_tun_fd, FIONBIO, &one) < 0) + { + vlib_cli_output (0, "Couldn't set device non-blocking flag"); + goto error; + } + } + ifr.ifr_mtu = tm->mtu_bytes; + if (ioctl (dev_tap_fd, SIOCSIFMTU, &ifr) < 0) + { + vlib_cli_output (0, "Couldn't set device MTU"); + goto error; + } + + /* get flags, modify to bring up interface... */ + if (ioctl (dev_tap_fd, SIOCGIFFLAGS, &ifr) < 0) + { + vlib_cli_output (0, "Couldn't get interface flags"); + goto error; + } + + ifr.ifr_flags |= (IFF_UP | IFF_RUNNING); + + if (ioctl (dev_tap_fd, SIOCSIFFLAGS, &ifr) < 0) + { + vlib_cli_output (0, "Couldn't set intfc admin state up"); + goto error; + } + + if (ioctl (dev_tap_fd, SIOCGIFHWADDR, &ifr) < 0) + { + vlib_cli_output (0, "Couldn't get intfc MAC address"); + goto error; + } + + ti = tapcli_get_new_tapif(); + ti->per_interface_next_index = ~0; + + if (unformat(input, "hwaddr random")) + { + f64 now = vlib_time_now(vm); + u32 rnd; + rnd = (u32) (now * 1e6); + rnd = random_u32 (&rnd); + + memcpy (hwaddr+2, &rnd, sizeof(rnd)); + hwaddr[0] = 2; + hwaddr[1] = 0xfe; + user_hwaddr = 1; + } + + error = ethernet_register_interface + (tm->vnet_main, + tapcli_dev_class.index, + ti - tm->tapcli_interfaces /* device instance */, + user_hwaddr ? 
hwaddr : + (u8 *) ifr.ifr_hwaddr.sa_data /* ethernet address */, + &ti->hw_if_index, + tapcli_flag_change); + + if (error) + clib_error_report (error); + + { + unix_file_t template = {0}; + template.read_function = tapcli_read_ready; + template.file_descriptor = dev_net_tun_fd; + ti->unix_file_index = unix_file_add (&unix_main, &template); + ti->unix_fd = dev_net_tun_fd; + ti->provision_fd = dev_tap_fd; + memcpy (&ti->ifr, &ifr, sizeof (ifr)); + } + + { + vnet_hw_interface_t * hw; + hw = vnet_get_hw_interface (tm->vnet_main, ti->hw_if_index); + ti->sw_if_index = hw->sw_if_index; + } + + ti->active = 1; + + hash_set (tm->tapcli_interface_index_by_sw_if_index, ti->sw_if_index, + ti - tm->tapcli_interfaces); + + hash_set (tm->tapcli_interface_index_by_unix_fd, ti->unix_fd, + ti - tm->tapcli_interfaces); + + vlib_cli_output (vm, "Created %U for Linux tap '%s'", + format_vnet_sw_if_index_name, tm->vnet_main, + ti->sw_if_index, intfc_name); + + return 0; + + error: + close (dev_net_tun_fd); + close (dev_tap_fd); + + return 0; +} + +VLIB_CLI_COMMAND (tap_connect_command, static) = { + .path = "tap connect", + .short_help = "tap connect <intfc-name> [hwaddr [<addr> | random]]", + .function = tap_connect_command_fn, +}; + +clib_error_t * +tapcli_init (vlib_main_t * vm) +{ + tapcli_main_t * tm = &tapcli_main; + + tm->vlib_main = vm; + tm->vnet_main = vnet_get_main(); + tm->unix_main = &unix_main; + tm->mtu_bytes = 4096 + 256; + tm->tapcli_interface_index_by_sw_if_index = hash_create (0, sizeof(uword)); + tm->tapcli_interface_index_by_unix_fd = hash_create (0, sizeof (uword)); + vm->os_punt_frame = tapcli_nopunt_frame; + + return 0; +} + +VLIB_INIT_FUNCTION (tapcli_init); + + diff --git a/vnet/vnet/unix/tapcli.h b/vnet/vnet/unix/tapcli.h new file mode 100644 index 00000000000..1f5f4c3ee73 --- /dev/null +++ b/vnet/vnet/unix/tapcli.h @@ -0,0 +1,29 @@ +/* + * tapcli.h : tap support + * + * Copyright (c) 2013 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __included_tapcli_h__ +#define __included_tapcli_h__ + + +typedef struct { + u32 sw_if_index; + u8 dev_name[64]; +} tapcli_interface_details_t; + +int vnet_tap_dump_ifs (tapcli_interface_details_t **out_tapids); + +#endif /* __included_tapcli_h__ */ diff --git a/vnet/vnet/unix/tuntap.c b/vnet/vnet/unix/tuntap.c new file mode 100644 index 00000000000..77c60fd6ee2 --- /dev/null +++ b/vnet/vnet/unix/tuntap.c @@ -0,0 +1,907 @@ +/* + *------------------------------------------------------------------ + * tuntap.c - kernel stack (reverse) punt/inject path + * + * Copyright (c) 2009 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#include <fcntl.h> /* for open */ +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/uio.h> /* for iovec */ +#include <netinet/in.h> + +#include <linux/if_arp.h> +#include <linux/if_tun.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> + +#include <vnet/ip/ip.h> + +#include <vnet/ethernet/ethernet.h> + +#if DPDK == 1 +#include <vnet/devices/dpdk/dpdk.h> +#endif + +static vnet_device_class_t tuntap_dev_class; +static vnet_hw_interface_class_t tuntap_interface_class; + +static void tuntap_punt_frame (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame); +static void tuntap_nopunt_frame (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame); + +/* + * This driver runs in one of two distinct modes: + * "punt/inject" mode, where we send pkts not otherwise processed + * by the forwarding to the Linux kernel stack, and + * "normal interface" mode, where we treat the Linux kernel stack + * as a peer. + * + * By default, we select punt/inject mode. + */ + +typedef struct { + u32 sw_if_index; + u8 is_v6; + u8 addr[16]; +} subif_address_t; + +typedef struct { + /* Vector of iovecs for readv/writev calls. */ + struct iovec * iovecs; + + /* Vector of VLIB rx buffers to use. We allocate them in blocks + of VLIB_FRAME_SIZE (256). */ + u32 * rx_buffers; + + /* File descriptors for /dev/net/tun and provisioning socket. */ + int dev_net_tun_fd, dev_tap_fd; + + /* Create a "tap" [ethernet] encaps device */ + int is_ether; + + /* 1 if a "normal" routed intfc, 0 if a punt/inject interface */ + + int have_normal_interface; + + /* tap device destination MAC address. Required, or Linux drops pkts */ + u8 ether_dst_mac[6]; + + /* Interface MTU in bytes and # of default sized buffers. */ + u32 mtu_bytes, mtu_buffers; + + /* Linux interface name for tun device. 
*/ + char * tun_name; + + /* Pool of subinterface addresses */ + subif_address_t *subifs; + + /* Hash for subif addresses */ + mhash_t subif_mhash; + + u32 unix_file_index; + + /* For the "normal" interface, if configured */ + u32 hw_if_index, sw_if_index; + +} tuntap_main_t; + +static tuntap_main_t tuntap_main = { + .tun_name = "vnet", + + /* Suitable defaults for an Ethernet-like tun/tap device */ + .mtu_bytes = 4096 + 256, +}; + +/* + * tuntap_tx + * Output node, writes the buffers comprising the incoming frame + * to the tun/tap device, aka hands them to the Linux kernel stack. + * + */ +static uword +tuntap_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 * buffers = vlib_frame_args (frame); + uword n_packets = frame->n_vectors; + tuntap_main_t * tm = &tuntap_main; + int i; + + for (i = 0; i < n_packets; i++) + { + struct iovec * iov; + vlib_buffer_t * b; + uword l; + + b = vlib_get_buffer (vm, buffers[i]); + + if (tm->is_ether && (!tm->have_normal_interface)) + { + vlib_buffer_reset(b); + memcpy (vlib_buffer_get_current (b), tm->ether_dst_mac, 6); + } + + /* Re-set iovecs if present. */ + if (tm->iovecs) + _vec_len (tm->iovecs) = 0; + + /* VLIB buffer chain -> Unix iovec(s). 
*/ + vec_add2 (tm->iovecs, iov, 1); + iov->iov_base = b->data + b->current_data; + iov->iov_len = l = b->current_length; + + if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + do { + b = vlib_get_buffer (vm, b->next_buffer); + + vec_add2 (tm->iovecs, iov, 1); + + iov->iov_base = b->data + b->current_data; + iov->iov_len = b->current_length; + l += b->current_length; + } while (b->flags & VLIB_BUFFER_NEXT_PRESENT); + } + + if (writev (tm->dev_net_tun_fd, tm->iovecs, vec_len (tm->iovecs)) < l) + clib_unix_warning ("writev"); + } + + /* The normal interface path flattens the buffer chain */ + if (tm->have_normal_interface) + vlib_buffer_free_no_next (vm, buffers, n_packets); + else + vlib_buffer_free (vm, buffers, n_packets); + + return n_packets; +} + +VLIB_REGISTER_NODE (tuntap_tx_node,static) = { + .function = tuntap_tx, + .name = "tuntap-tx", + .type = VLIB_NODE_TYPE_INTERNAL, + .vector_size = 4, +}; + +enum { + TUNTAP_RX_NEXT_IP4_INPUT, + TUNTAP_RX_NEXT_IP6_INPUT, + TUNTAP_RX_NEXT_ETHERNET_INPUT, + TUNTAP_RX_NEXT_DROP, + TUNTAP_RX_N_NEXT, +}; + +static uword +tuntap_rx (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + tuntap_main_t * tm = &tuntap_main; + vlib_buffer_t * b; + u32 bi; +#if DPDK == 0 + const uword buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES; + u32 free_list_index = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX; +#else + dpdk_main_t * dm = &dpdk_main; + const uword buffer_size = MBUF_SIZE; + u32 free_list_index = dm->vlib_buffer_free_list_index; +#endif + + /* Make sure we have some RX buffers. */ + { + uword n_left = vec_len (tm->rx_buffers); + uword n_alloc; + + if (n_left < VLIB_FRAME_SIZE / 2) + { + if (! tm->rx_buffers) + vec_alloc (tm->rx_buffers, VLIB_FRAME_SIZE); + + n_alloc = vlib_buffer_alloc_from_free_list + (vm, tm->rx_buffers + n_left, VLIB_FRAME_SIZE - n_left, + free_list_index); + _vec_len (tm->rx_buffers) = n_left + n_alloc; + } + } + + /* Allocate RX buffers from end of rx_buffers. 
+ Turn them into iovecs to pass to readv. */ + { + uword i_rx = vec_len (tm->rx_buffers) - 1; + vlib_buffer_t * b; + word i, n_bytes_left, n_bytes_in_packet; + + /* We should have enough buffers left for an MTU sized packet. */ + ASSERT (vec_len (tm->rx_buffers) >= tm->mtu_buffers); + + vec_validate (tm->iovecs, tm->mtu_buffers - 1); + for (i = 0; i < tm->mtu_buffers; i++) + { + b = vlib_get_buffer (vm, tm->rx_buffers[i_rx - i]); + tm->iovecs[i].iov_base = b->data; + tm->iovecs[i].iov_len = buffer_size; + } + + n_bytes_left = readv (tm->dev_net_tun_fd, tm->iovecs, tm->mtu_buffers); + n_bytes_in_packet = n_bytes_left; + if (n_bytes_left <= 0) + { + if (errno != EAGAIN) + clib_unix_warning ("readv %d", n_bytes_left); + return 0; + } + + bi = tm->rx_buffers[i_rx]; + + while (1) + { +#if DPDK == 1 + struct rte_mbuf * mb; +#endif + b = vlib_get_buffer (vm, tm->rx_buffers[i_rx]); +#if DPDK == 1 + mb = (((struct rte_mbuf *)b)-1); +#endif + b->flags = 0; + b->current_data = 0; + b->current_length = n_bytes_left < buffer_size ? n_bytes_left : buffer_size; + + n_bytes_left -= buffer_size; +#if DPDK == 1 + rte_pktmbuf_data_len (mb) = b->current_length; +#endif + + if (n_bytes_left <= 0) + { +#if DPDK == 1 + rte_pktmbuf_pkt_len (mb) = n_bytes_in_packet; +#endif + break; + } + + i_rx--; + b->flags |= VLIB_BUFFER_NEXT_PRESENT; + b->next_buffer = tm->rx_buffers[i_rx]; +#if DPDK == 1 + ASSERT(0); + // ((struct rte_pktmbuf *)(b->mb))->next = + // vlib_get_buffer (vm, tm->rx_buffers[i_rx])->mb; +#endif + } + + /* Interface counters for tuntap interface. 
*/ + vlib_increment_combined_counter + (vnet_main.interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + os_get_cpu_number(), + tm->sw_if_index, + 1, n_bytes_in_packet); + + _vec_len (tm->rx_buffers) = i_rx; + } + + b = vlib_get_buffer (vm, bi); + + { + u32 next_index; + uword n_trace = vlib_get_trace_count (vm, node); + + vnet_buffer (b)->sw_if_index[VLIB_RX] = tm->sw_if_index; + vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32)~0; + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... + */ + if (VLIB_BUFFER_TRACE_TRAJECTORY) + b->pre_data[0] = 0; + + b->error = node->errors[0]; + + if (tm->is_ether) + { + next_index = TUNTAP_RX_NEXT_ETHERNET_INPUT; + } + else + switch (b->data[0] & 0xf0) + { + case 0x40: + next_index = TUNTAP_RX_NEXT_IP4_INPUT; + break; + case 0x60: + next_index = TUNTAP_RX_NEXT_IP6_INPUT; + break; + default: + next_index = TUNTAP_RX_NEXT_DROP; + break; + } + + /* The linux kernel couldn't care less if our interface is up */ + if (tm->have_normal_interface) + { + vnet_main_t *vnm = vnet_get_main(); + vnet_sw_interface_t * si; + si = vnet_get_sw_interface (vnm, tm->sw_if_index); + if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)) + next_index = TUNTAP_RX_NEXT_DROP; + } + + vlib_set_next_frame_buffer (vm, node, next_index, bi); + + if (n_trace > 0) + { + vlib_trace_buffer (vm, node, next_index, + b, /* follow_chain */ 1); + vlib_set_trace_count (vm, node, n_trace - 1); + } + } + + return 1; +} + +static char * tuntap_rx_error_strings[] = { + "unknown packet type", +}; + +VLIB_REGISTER_NODE (tuntap_rx_node,static) = { + .function = tuntap_rx, + .name = "tuntap-rx", + .type = VLIB_NODE_TYPE_INPUT, + .state = VLIB_NODE_STATE_INTERRUPT, + .vector_size = 4, + .n_errors = 1, + .error_strings = tuntap_rx_error_strings, + + .n_next_nodes = TUNTAP_RX_N_NEXT, + .next_nodes = { + [TUNTAP_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum", + [TUNTAP_RX_NEXT_IP6_INPUT] = 
"ip6-input", + [TUNTAP_RX_NEXT_DROP] = "error-drop", + [TUNTAP_RX_NEXT_ETHERNET_INPUT] = "ethernet-input", + }, +}; + +/* Gets called when file descriptor is ready from epoll. */ +static clib_error_t * tuntap_read_ready (unix_file_t * uf) +{ + vlib_main_t * vm = vlib_get_main(); + vlib_node_set_interrupt_pending (vm, tuntap_rx_node.index); + return 0; +} + +/* + * tuntap_exit + * Clean up the tun/tap device + */ + +static clib_error_t * +tuntap_exit (vlib_main_t * vm) +{ + tuntap_main_t *tm = &tuntap_main; + struct ifreq ifr; + int sfd; + + /* Not present. */ + if (! tm->dev_net_tun_fd || tm->dev_net_tun_fd < 0) + return 0; + + sfd = socket (AF_INET, SOCK_STREAM, 0); + if (sfd < 0) + clib_unix_warning("provisioning socket"); + + memset(&ifr, 0, sizeof (ifr)); + strncpy (ifr.ifr_name, tm->tun_name, sizeof (ifr.ifr_name)-1); + + /* get flags, modify to bring down interface... */ + if (ioctl (sfd, SIOCGIFFLAGS, &ifr) < 0) + clib_unix_warning ("SIOCGIFFLAGS"); + + ifr.ifr_flags &= ~(IFF_UP | IFF_RUNNING); + + if (ioctl (sfd, SIOCSIFFLAGS, &ifr) < 0) + clib_unix_warning ("SIOCSIFFLAGS"); + + /* Turn off persistence */ + if (ioctl (tm->dev_net_tun_fd, TUNSETPERSIST, 0) < 0) + clib_unix_warning ("TUNSETPERSIST"); + close(tm->dev_tap_fd); + close(tm->dev_net_tun_fd); + close (sfd); + + return 0; +} + +VLIB_MAIN_LOOP_EXIT_FUNCTION (tuntap_exit); + +static clib_error_t * +tuntap_config (vlib_main_t * vm, unformat_input_t * input) +{ + tuntap_main_t *tm = &tuntap_main; + clib_error_t * error = 0; + struct ifreq ifr; + u8 * name; + int flags = IFF_TUN | IFF_NO_PI; + int is_enabled = 0, is_ether = 0, have_normal_interface = 0; +#if DPDK == 0 + const uword buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES; +#else + const uword buffer_size = MBUF_SIZE; +#endif + + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "mtu %d", &tm->mtu_bytes)) + ; + else if (unformat (input, "enable")) + is_enabled = 1; + else if (unformat (input, "disable")) 
+ is_enabled = 0; + else if (unformat (input, "ethernet") || + unformat (input, "ether")) + is_ether = 1; + else if (unformat (input, "have-normal-interface") || + unformat (input, "have-normal")) + have_normal_interface = 1; + else if (unformat (input, "name %s", &name)) + tm->tun_name = (char *) name; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + tm->dev_net_tun_fd = -1; + tm->dev_tap_fd = -1; + + if (is_enabled == 0) + return 0; + + if (geteuid()) + { + clib_warning ("tuntap disabled: must be superuser"); + return 0; + } + + tm->is_ether = is_ether; + tm->have_normal_interface = have_normal_interface; + + if (is_ether) + flags = IFF_TAP | IFF_NO_PI; + + if ((tm->dev_net_tun_fd = open ("/dev/net/tun", O_RDWR)) < 0) + { + error = clib_error_return_unix (0, "open /dev/net/tun"); + goto done; + } + + memset (&ifr, 0, sizeof (ifr)); + strncpy(ifr.ifr_name, tm->tun_name, sizeof(ifr.ifr_name)-1); + ifr.ifr_flags = flags; + if (ioctl (tm->dev_net_tun_fd, TUNSETIFF, (void *)&ifr) < 0) + { + error = clib_error_return_unix (0, "ioctl TUNSETIFF"); + goto done; + } + + /* Make it persistent, at least until we split. */ + if (ioctl (tm->dev_net_tun_fd, TUNSETPERSIST, 1) < 0) + { + error = clib_error_return_unix (0, "TUNSETPERSIST"); + goto done; + } + + /* Open a provisioning socket */ + if ((tm->dev_tap_fd = socket(PF_PACKET, SOCK_RAW, + htons(ETH_P_ALL))) < 0 ) + { + error = clib_error_return_unix (0, "socket"); + goto done; + } + + /* Find the interface index. */ + { + struct ifreq ifr; + struct sockaddr_ll sll; + + memset (&ifr, 0, sizeof(ifr)); + strncpy (ifr.ifr_name, tm->tun_name, sizeof(ifr.ifr_name)-1); + if (ioctl (tm->dev_tap_fd, SIOCGIFINDEX, &ifr) < 0 ) + { + error = clib_error_return_unix (0, "ioctl SIOCGIFINDEX"); + goto done; + } + + /* Bind the provisioning socket to the interface. 
*/ + memset(&sll, 0, sizeof(sll)); + sll.sll_family = AF_PACKET; + sll.sll_ifindex = ifr.ifr_ifindex; + sll.sll_protocol = htons(ETH_P_ALL); + + if (bind(tm->dev_tap_fd, (struct sockaddr*) &sll, sizeof(sll)) < 0) + { + error = clib_error_return_unix (0, "bind"); + goto done; + } + } + + /* non-blocking I/O on /dev/tapX */ + { + int one = 1; + if (ioctl (tm->dev_net_tun_fd, FIONBIO, &one) < 0) + { + error = clib_error_return_unix (0, "ioctl FIONBIO"); + goto done; + } + } + + tm->mtu_buffers = (tm->mtu_bytes + (buffer_size - 1)) / buffer_size; + + ifr.ifr_mtu = tm->mtu_bytes; + if (ioctl (tm->dev_tap_fd, SIOCSIFMTU, &ifr) < 0) + { + error = clib_error_return_unix (0, "ioctl SIOCSIFMTU"); + goto done; + } + + /* get flags, modify to bring up interface... */ + if (ioctl (tm->dev_tap_fd, SIOCGIFFLAGS, &ifr) < 0) + { + error = clib_error_return_unix (0, "ioctl SIOCGIFFLAGS"); + goto done; + } + + ifr.ifr_flags |= (IFF_UP | IFF_RUNNING); + + if (ioctl (tm->dev_tap_fd, SIOCSIFFLAGS, &ifr) < 0) + { + error = clib_error_return_unix (0, "ioctl SIOCSIFFLAGS"); + goto done; + } + + if (is_ether) + { + if (ioctl (tm->dev_tap_fd, SIOCGIFHWADDR, &ifr) < 0) + { + error = clib_error_return_unix (0, "ioctl SIOCGIFHWADDR"); + goto done; + } + else + memcpy (tm->ether_dst_mac, ifr.ifr_hwaddr.sa_data, 6); + } + + if (have_normal_interface) + { + vnet_main_t *vnm = vnet_get_main(); + error = ethernet_register_interface + (vnm, + tuntap_dev_class.index, + 0 /* device instance */, + tm->ether_dst_mac /* ethernet address */, + &tm->hw_if_index, + 0 /* flag change */); + if (error) + clib_error_report (error); + tm->sw_if_index = tm->hw_if_index; + vm->os_punt_frame = tuntap_nopunt_frame; + } + else + { + vnet_main_t *vnm = vnet_get_main(); + vnet_hw_interface_t * hi; + + vm->os_punt_frame = tuntap_punt_frame; + + tm->hw_if_index = vnet_register_interface + (vnm, + tuntap_dev_class.index, 0 /* device instance */, + tuntap_interface_class.index, 0); + hi = vnet_get_hw_interface (vnm, 
tm->hw_if_index); + tm->sw_if_index = hi->sw_if_index; + + /* Interface is always up. */ + vnet_hw_interface_set_flags (vnm, tm->hw_if_index, + VNET_HW_INTERFACE_FLAG_LINK_UP); + vnet_sw_interface_set_flags (vnm, tm->sw_if_index, + VNET_SW_INTERFACE_FLAG_ADMIN_UP); + } + /* Register the tun fd with unix_main; tuntap_read_ready is its read callback. */ + { + unix_file_t template = {0}; + template.read_function = tuntap_read_ready; + template.file_descriptor = tm->dev_net_tun_fd; + tm->unix_file_index = unix_file_add (&unix_main, &template); + } + + done: + /* NOTE(review): on error the descriptors are closed but keep their old non-negative values, while the address callbacks test dev_tap_fd < 0 to detect a disabled tuntap. Confirm whether they should be reset to -1 here. */ if (error) + { + if (tm->dev_net_tun_fd >= 0) + close (tm->dev_net_tun_fd); + if (tm->dev_tap_fd >= 0) + close (tm->dev_tap_fd); + } + + return error; +} + +VLIB_CONFIG_FUNCTION (tuntap_config, "tuntap"); + /* IPv4 interface-address add/delete callback (installed by tuntap_init). Looks up, or creates, the subif pool entry keyed by (sw_if_index, address) and programs the kernel alias device named <tun_name>:<pool index> through ioctls on dev_tap_fd. Does nothing when have_normal_interface is set or dev_tap_fd is negative. The opaque and if_address_index arguments are not used by the body. ioctl failures are only reported as warnings. */ +void +tuntap_ip4_add_del_interface_address (ip4_main_t * im, + uword opaque, + u32 sw_if_index, + ip4_address_t * address, + u32 address_length, + u32 if_address_index, + u32 is_delete) +{ + tuntap_main_t * tm = &tuntap_main; + struct ifreq ifr; + subif_address_t subif_addr, * ap; + uword * p; + + /* Tuntap disabled, or using a "normal" interface. */ + if (tm->have_normal_interface || tm->dev_tap_fd < 0) + return; + + /* See if we already know about this subif. The key is zeroed first so unset fields compare equal. */ + memset (&subif_addr, 0, sizeof (subif_addr)); + subif_addr.sw_if_index = sw_if_index; + memcpy (&subif_addr.addr, address, sizeof (*address)); + + p = mhash_get (&tm->subif_mhash, &subif_addr); + + if (p) + ap = pool_elt_at_index (tm->subifs, p[0]); + else + { /* Not known yet: take a pool slot and record its index in the mhash. */ + pool_get (tm->subifs, ap); + *ap = subif_addr; + mhash_set (&tm->subif_mhash, ap, ap - tm->subifs, 0); + } + + /* Use subif pool index to select alias device. */ + memset (&ifr, 0, sizeof (ifr)); + snprintf (ifr.ifr_name, sizeof(ifr.ifr_name), + "%s:%d", tm->tun_name, (int)(ap - tm->subifs)); + + if (! is_delete) + { + struct sockaddr_in * sin; + + sin = (struct sockaddr_in *)&ifr.ifr_addr; + + /* Set ipv4 address, netmask. 
*/ + sin->sin_family = AF_INET; + memcpy (&sin->sin_addr.s_addr, address, 4); + if (ioctl (tm->dev_tap_fd, SIOCSIFADDR, &ifr) < 0) + clib_unix_warning ("ioctl SIOCSIFADDR"); + + /* Same ifreq is reused: overwrite the address field with the mask for this prefix length. */ sin->sin_addr.s_addr = im->fib_masks[address_length]; + if (ioctl (tm->dev_tap_fd, SIOCSIFNETMASK, &ifr) < 0) + clib_unix_warning ("ioctl SIOCSIFNETMASK"); + } + else + { + /* Delete: forget the subif and return its pool slot. */ mhash_unset (&tm->subif_mhash, &subif_addr, 0 /* old value ptr */); + pool_put (tm->subifs, ap); + } + + /* Get the alias flags, then bring it up (add) or down (delete). */ + if (ioctl (tm->dev_tap_fd, SIOCGIFFLAGS, &ifr) < 0) + clib_unix_warning ("ioctl SIOCGIFFLAGS"); + + if (is_delete) + ifr.ifr_flags &= ~(IFF_UP | IFF_RUNNING); + else + ifr.ifr_flags |= (IFF_UP | IFF_RUNNING); + + if (ioctl (tm->dev_tap_fd, SIOCSIFFLAGS, &ifr) < 0) + clib_unix_warning ("ioctl SIOCSIFFLAGS"); +} + +/* + * $$$$ gross workaround for a known #include bug + * #include <linux/ipv6.h> causes multiple definitions if + * netinet/in.h is also included. + * This local copy is handed to ioctl() by the v6 address callback, + * so it presumably has to stay layout-compatible with the kernel + * definition - verify against <linux/ipv6.h> before changing it. + */ +struct in6_ifreq { + struct in6_addr ifr6_addr; + u32 ifr6_prefixlen; + int ifr6_ifindex; +}; + +/* + * Both the v6 interface address API and the way ifconfig + * displays subinterfaces differ from their v4 counterparts. + * The code given here seems to work but YMMV. + * + * IPv6 interface-address add/delete callback (installed by + * tuntap_init). Does nothing when have_normal_interface is set + * or dev_tap_fd is negative. The im, opaque and if_address_index + * arguments are not used by the body. + */ +void +tuntap_ip6_add_del_interface_address (ip6_main_t * im, + uword opaque, + u32 sw_if_index, + ip6_address_t * address, + u32 address_length, + u32 if_address_index, + u32 is_delete) +{ + tuntap_main_t * tm = &tuntap_main; + struct ifreq ifr; + struct in6_ifreq ifr6; + subif_address_t subif_addr, * ap; + uword * p; + + /* Tuntap disabled, or using a "normal" interface. 
*/ + if (tm->have_normal_interface || tm->dev_tap_fd < 0) + return; + + /* See if we already know about this subif. The key is zeroed first, then marked as v6. */ + memset (&subif_addr, 0, sizeof (subif_addr)); + subif_addr.sw_if_index = sw_if_index; + subif_addr.is_v6 = 1; + memcpy (&subif_addr.addr, address, sizeof (*address)); + + p = mhash_get (&tm->subif_mhash, &subif_addr); + + if (p) + ap = pool_elt_at_index (tm->subifs, p[0]); + else + { + /* Not known yet: take a pool slot and record its index in the mhash. */ + pool_get (tm->subifs, ap); + *ap = subif_addr; + mhash_set (&tm->subif_mhash, ap, ap - tm->subifs, 0); + } + + /* Use subif pool index to select alias device. */ + memset (&ifr, 0, sizeof (ifr)); + memset (&ifr6, 0, sizeof (ifr6)); + snprintf (ifr.ifr_name, sizeof(ifr.ifr_name), + "%s:%d", tm->tun_name, (int)(ap - tm->subifs)); + + if (! is_delete) + { + int sockfd = socket (AF_INET6, SOCK_STREAM, 0); + if (sockfd < 0) + clib_unix_warning ("get ifindex socket"); + else + { + /* Only use the socket when socket() succeeded; previously the ioctls and close() were still issued on fd -1 after a failure. */ + if (ioctl (sockfd, SIOGIFINDEX, &ifr) < 0) + clib_unix_warning ("get ifindex"); + + ifr6.ifr6_ifindex = ifr.ifr_ifindex; + ifr6.ifr6_prefixlen = address_length; + memcpy (&ifr6.ifr6_addr, address, 16); + + if (ioctl (sockfd, SIOCSIFADDR, &ifr6) < 0) + clib_unix_warning ("set address"); + + close (sockfd); + } + } + else + { + int sockfd = socket (AF_INET6, SOCK_STREAM, 0); + if (sockfd < 0) + clib_unix_warning ("get ifindex socket"); + else + { + /* Same guard as the add path: no ioctl or close on an invalid descriptor. */ + if (ioctl (sockfd, SIOGIFINDEX, &ifr) < 0) + clib_unix_warning ("get ifindex"); + + ifr6.ifr6_ifindex = ifr.ifr_ifindex; + ifr6.ifr6_prefixlen = address_length; + memcpy (&ifr6.ifr6_addr, address, 16); + + if (ioctl (sockfd, SIOCDIFADDR, &ifr6) < 0) + clib_unix_warning ("del address"); + + close (sockfd); + } + + /* Bookkeeping is released even when the kernel side could not be updated, as before. */ + mhash_unset (&tm->subif_mhash, &subif_addr, 0 /* old value ptr */); + pool_put (tm->subifs, ap); + } +} + /* Punt handler used when no normal interface is configured: hands the frame to tuntap_tx, then frees the frame. */ +static void +tuntap_punt_frame (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + tuntap_tx (vm, node, frame); + vlib_frame_free (vm, node, frame); +} + +static void +tuntap_nopunt_frame (vlib_main_t * vm, + vlib_node_runtime_t * node, + 
vlib_frame_t * frame) +{ /* Punt handler installed when a normal interface is configured: frees the frame's buffers and the frame itself without transmitting anything. */ + u32 * buffers = vlib_frame_args (frame); + uword n_packets = frame->n_vectors; + vlib_buffer_free (vm, buffers, n_packets); + vlib_frame_free (vm, node, frame); +} + +VNET_HW_INTERFACE_CLASS (tuntap_interface_class,static) = { + .name = "tuntap", +}; + /* Device-name formatter: appends "tuntap-<i>" to s, with i read from the va_list as a u32, and returns the extended s. */ +static u8 * format_tuntap_interface_name (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + + s = format (s, "tuntap-%d", i); + return s; +} + /* Device-class tx function. With have_normal_interface set it forwards the frame to tuntap_tx and returns that result; otherwise it frees the frame's buffers and returns their count. */ +static uword +tuntap_intfc_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + tuntap_main_t * tm = &tuntap_main; + u32 * buffers = vlib_frame_args (frame); + uword n_buffers = frame->n_vectors; + + /* Normal interface transmit happens only on the normal interface... */ + if (tm->have_normal_interface) + return tuntap_tx (vm, node, frame); + + vlib_buffer_free (vm, buffers, n_buffers); + return n_buffers; +} + +VNET_DEVICE_CLASS (tuntap_dev_class,static) = { + .name = "tuntap", + .tx_function = tuntap_intfc_tx, + .format_device_name = format_tuntap_interface_name, +}; + /* Init function: runs ip4_init first, initialises subif_mhash, then appends the ip4 and ip6 address add/del callbacks to their respective callback vectors. Returns the ip4_init error if there is one, otherwise 0. */ +static clib_error_t * +tuntap_init (vlib_main_t * vm) +{ + clib_error_t * error; + ip4_main_t * im4 = &ip4_main; + ip6_main_t * im6 = &ip6_main; + ip4_add_del_interface_address_callback_t cb4; + ip6_add_del_interface_address_callback_t cb6; + tuntap_main_t * tm = &tuntap_main; + + /* NOTE(review): only ip4_init is requested explicitly although im6 is modified below. Confirm the ip6 ordering is guaranteed elsewhere. */ error = vlib_call_init_function (vm, ip4_init); + if (error) + return error; + + /* NOTE(review): presumably the arguments are (value bytes, key bytes), i.e. a u32 value keyed by a whole subif_address_t. Confirm against mhash.h. */ mhash_init (&tm->subif_mhash, sizeof (u32), sizeof(subif_address_t)); + + cb4.function = tuntap_ip4_add_del_interface_address; + cb4.function_opaque = 0; + vec_add1 (im4->add_del_interface_address_callbacks, cb4); + + cb6.function = tuntap_ip6_add_del_interface_address; + cb6.function_opaque = 0; + vec_add1 (im6->add_del_interface_address_callbacks, cb6); + + return 0; +} + +VLIB_INIT_FUNCTION (tuntap_init); diff --git a/vnet/vnet/unix/tuntap.h b/vnet/vnet/unix/tuntap.h new file mode 100644 index 00000000000..ba0b77938e8 --- /dev/null +++ b/vnet/vnet/unix/tuntap.h @@ -0,0 +1,37 @@ +/* + 
*------------------------------------------------------------------ + * tuntap.h - kernel stack (reverse) punt/inject path + * + * Copyright (c) 2009 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + /* NOTE(review): as shown, this header has no include guard and includes nothing itself, yet it uses vlib_main_t, u8 and u32, so includers must provide those types first. */ +/* + * Call from some VLIB_INIT_FUNCTION to set the Linux kernel + * inject node name. + */ +void register_tuntap_inject_node_name (char *name); + /* Tap interface management entry points. Their definitions are not part of this header. NOTE(review): the sw_if_indexp parameters look like outputs and the int results look like status codes, but neither is established here - confirm against the definitions. */ +int vnet_tap_connect (vlib_main_t * vm, u8 * intfc_name, + u8 *hwaddr_arg, u32 * sw_if_indexp); +int vnet_tap_connect_renumber (vlib_main_t * vm, u8 * intfc_name, + u8 *hwaddr_arg, u32 * sw_if_indexp, + u8 renumber, u32 custom_dev_instance); + +int vnet_tap_delete(vlib_main_t *vm, u32 sw_if_index); + +int vnet_tap_modify (vlib_main_t * vm, u32 orig_sw_if_index, + u8 * intfc_name, u8 *hwaddr_arg, + u32 * sw_if_indexp, + u8 renumber, u32 custom_dev_instance); |