diff options
author | Mohsin KAZMI <sykazmi@cisco.com> | 2016-08-10 15:51:17 +0200 |
---|---|---|
committer | Mohsin KAZMI <sykazmi@cisco.com> | 2016-08-10 17:33:56 +0200 |
commit | adff8bfb431798fc1d4e4571d53074dc3438d14f (patch) | |
tree | ac73cd352be4eba5d1dac60596a39f4bc72b711d /turbotap/turbotap/node.c | |
parent | 5d7ae6ad224dab352219e3a60d40f0ed5af2be22 (diff) |
turbotap: A plugin for turbo tap interfaces
This patch implements a plugin for tap interfaces using
socket API. It has advantage of reduced context switching
over traditional tap interfaces using "sendmmsg" or
"recvmmsg" system calls to send/receive multiple packets
using single system call.
Change-Id: I5b98e403692ac47d119c03174a21fbd9ff24de82
Signed-off-by: Mohsin KAZMI <sykazmi@cisco.com>
Diffstat (limited to 'turbotap/turbotap/node.c')
-rw-r--r-- | turbotap/turbotap/node.c | 362 |
1 files changed, 362 insertions, 0 deletions
diff --git a/turbotap/turbotap/node.c b/turbotap/turbotap/node.c new file mode 100644 index 0000000..3616d46 --- /dev/null +++ b/turbotap/turbotap/node.c @@ -0,0 +1,362 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#define _GNU_SOURCE + +#include <stdint.h> +#include <net/if.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/uio.h> /* for iovec */ +#include <netinet/in.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/vnet.h> + +#include <vnet/ip/ip.h> + +#if DPDK == 1 +#include <vnet/devices/dpdk/dpdk.h> +#endif + +#include <turbotap/turbotap.h> + +vlib_node_registration_t turbotap_rx_node; + +enum { + TURBOTAP_RX_NEXT_IP4_INPUT, + TURBOTAP_RX_NEXT_IP6_INPUT, + TURBOTAP_RX_NEXT_ETHERNET_INPUT, + TURBOTAP_RX_NEXT_DROP, + TURBOTAP_RX_N_NEXT, +}; + +typedef struct { + u16 sw_if_index; +} turbotap_rx_trace_t; + +u8 * format_turbotap_rx_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + vnet_main_t * vnm = vnet_get_main(); + turbotap_rx_trace_t * t = va_arg (*va, turbotap_rx_trace_t *); + s = format (s, "%U", format_vnet_sw_if_index_name, + vnm, t->sw_if_index); + return s; +} + +always_inline void +buffer_add_to_chain(vlib_main_t *vm, u32 bi, u32 first_bi, u32 prev_bi) +{ + vlib_buffer_t * b = vlib_get_buffer (vm, bi); + vlib_buffer_t * first_b = vlib_get_buffer (vm, first_bi); + vlib_buffer_t * prev_b = vlib_get_buffer (vm, prev_bi); + + /* update first buffer */ + first_b->total_length_not_including_first_buffer += b->current_length; + + /* update previous buffer */ + prev_b->next_buffer = bi; + prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT; + + /* update current buffer */ + b->next_buffer = 0; + +#if DPDK > 0 + struct rte_mbuf * mbuf = rte_mbuf_from_vlib_buffer(b); + struct rte_mbuf * first_mbuf = rte_mbuf_from_vlib_buffer(first_b); + struct rte_mbuf * prev_mbuf = rte_mbuf_from_vlib_buffer(prev_b); + first_mbuf->nb_segs++; + prev_mbuf->next = mbuf; + mbuf->data_len = b->current_length; + mbuf->data_off = RTE_PKTMBUF_HEADROOM + b->current_data; + mbuf->next = 0; +#endif +} + +static uword +turbotap_rx_iface(vlib_main_t * vm, + vlib_node_runtime_t * node, + turbotap_interface_t * ti) +{ + turbotap_main_t * tr = &turbotap_main; + const uword buffer_size = vlib_buffer_free_list_buffer_size ( vm, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + u32 n_trace = vlib_get_trace_count (vm, node); + u8 set_trace = 0; + vnet_main_t *vnm; + vnet_sw_interface_t * si; + u8 admin_down; + uword len = 0; + u32 next_index = TURBOTAP_RX_NEXT_ETHERNET_INPUT; + u32 *to_next; + + vnm = vnet_get_main(); + si = vnet_get_sw_interface (vnm, ti->sw_if_index); + admin_down = !(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP); + + if (ti->per_interface_next_index != ~0) + next_index = ti->per_interface_next_index; + + /* Buffer Allocation */ + u32 desired_allocation = ti->rx_ready * ti->mtu_buffers + 32; + if (PREDICT_TRUE(vec_len(tr->rx_buffers) < ti->rx_ready * ti->mtu_buffers)) + { + len = vec_len(tr->rx_buffers); + vec_validate(tr->rx_buffers, desired_allocation - 1); + vec_validate(tr->unused_buffer_list, desired_allocation - 1); + _vec_len(tr->unused_buffer_list) = 0; + _vec_len(tr->rx_buffers) = len + + vlib_buffer_alloc(vm, &tr->rx_buffers[len], desired_allocation - len); + if (PREDICT_FALSE(vec_len(tr->rx_buffers) < ti->rx_ready * ti->mtu_buffers)) + { + vlib_node_increment_counter(vm, turbotap_rx_node.index, TURBOTAP_ERROR_BUFFER_ALLOCATION, 1); + } + } + + /* Filling msgs */ + u32 i = 0; + len = vec_len(tr->rx_buffers); + while (i < ti->rx_ready && len > ti->mtu_buffers) + { + u32 j = 0; + vec_validate(ti->rx_msg[i].msg_hdr.msg_iov, ti->mtu_buffers - 1); + while (j < ti->mtu_buffers) + { + vlib_buffer_t *b = vlib_get_buffer(vm, tr->rx_buffers[len - 1]); + ti->rx_msg[i].msg_hdr.msg_iov[j].iov_base = b->data; + ti->rx_msg[i].msg_hdr.msg_iov[j].iov_len = buffer_size; + len--; + j++; + } + + ti->rx_msg[i].msg_hdr.msg_iovlen = ti->mtu_buffers; + ti->rx_msg[i].msg_hdr.msg_flags = MSG_DONTWAIT; + ti->rx_msg[i].msg_hdr.msg_name = NULL; + ti->rx_msg[i].msg_hdr.msg_namelen = 0; + ti->rx_msg[i].msg_hdr.msg_control = NULL; + ti->rx_msg[i].msg_hdr.msg_controllen = 0; + ti->rx_msg[i].msg_len = 0; + i++; + } + + /* + * Be careful here + * + * Experiments show that we need to set the time according + * to the number of msgs receive from kernel even if the call + * is NON-BLOCKING. If timeout is so small, then recvmmsg + * call gets as many packets as it can in that time period. + */ + struct timespec timeout = {.tv_sec = 0, .tv_nsec = 500000}; + int num_rx_msgs = recvmmsg(ti->sock_fd, ti->rx_msg, i, MSG_DONTWAIT, &timeout); + if (num_rx_msgs <= 0) { + if (errno != EAGAIN) { + vlib_node_increment_counter(vm, turbotap_rx_node.index, + TURBOTAP_ERROR_READ, 1); + } + return 0; + } + + u32 next = next_index; + u32 n_left_to_next = 0; + + i = 0; + len = vec_len(tr->rx_buffers); + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (i != num_rx_msgs && n_left_to_next) + { + vlib_buffer_t *b0, *first_b0; + u32 bi0 = 0, first_bi0 = 0, prev_bi0, j = 0; + u32 bytes_to_put = 0, bytes_already_put = 0; + u32 remain_len = ti->rx_msg[i].msg_len; + + while (remain_len && len) + { + /* grab free buffer */ + prev_bi0 = bi0; + bi0 = tr->rx_buffers[len - 1]; + b0 = vlib_get_buffer(vm, bi0); + + bytes_to_put = remain_len > buffer_size ? buffer_size : remain_len; + b0->current_length = bytes_to_put; + + if (bytes_already_put == 0) + { +#if DPDK > 0 + struct rte_mbuf * mb = rte_mbuf_from_vlib_buffer(b0); + rte_pktmbuf_data_len (mb) = b0->current_length; + rte_pktmbuf_pkt_len (mb) = b0->current_length; +#endif + b0->total_length_not_including_first_buffer = 0; + b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID; + vnet_buffer(b0)->sw_if_index[VLIB_RX] = ti->sw_if_index; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0; + first_bi0 = bi0; + first_b0 = vlib_get_buffer(vm, first_bi0); + } + else + buffer_add_to_chain(vm, bi0, first_bi0, prev_bi0); + + + bytes_already_put += bytes_to_put; + remain_len -= bytes_to_put; + j++; + len--; + } + + /* record unused buffers */ + while (j < ti->mtu_buffers) + { + u32 vec_len_unused = vec_len(tr->unused_buffer_list); + tr->unused_buffer_list[vec_len_unused] = tr->rx_buffers[len - 1]; + len--; + j++; + _vec_len(tr->unused_buffer_list) = vec_len_unused + 1; + } + + /* trace */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT(first_b0); + + first_b0->error = node->errors[TURBOTAP_ERROR_NONE]; + + /* Interface counters and tracing. */ + if (PREDICT_TRUE(!admin_down)) + { + vlib_increment_combined_counter ( + vnet_main.interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + os_get_cpu_number(), ti->sw_if_index, + 1, ti->rx_msg[i].msg_len); + + if (PREDICT_FALSE(n_trace > 0)) + { + vlib_trace_buffer (vm, node, next_index, + first_b0, /* follow_chain */ 1); + n_trace--; + set_trace = 1; + turbotap_rx_trace_t *t0 = vlib_add_trace (vm, node, first_b0, sizeof (*t0)); + t0->sw_if_index = si->sw_if_index; + } + } else { + next = TURBOTAP_RX_NEXT_DROP; + } + + /* next packet */ + to_next[0] = first_bi0; + n_left_to_next -= 1; + to_next +=1; + + /* enque and take next packet */ + vlib_validate_buffer_enqueue_x1(vm, node, next_index , to_next, + n_left_to_next, first_bi0, next); + + i++; + } + + _vec_len(tr->rx_buffers) = len; + vlib_put_next_frame(vm, node, next_index, n_left_to_next); + + /* put unused buffers back */ + while (vec_len(tr->unused_buffer_list) > 0) + { + u32 vec_len_unused = vec_len(tr->unused_buffer_list); + u32 vec_len_rx = vec_len(tr->rx_buffers); + tr->rx_buffers[vec_len_rx] = tr->unused_buffer_list[vec_len_unused - 1]; + _vec_len(tr->unused_buffer_list) -= 1; + _vec_len(tr->rx_buffers) += 1; + } + + if (ti->rx_ready - i > 0 ) + { + ti->rx_ready -= i; + if (ti->rx_ready < i) + ti->rx_ready = i; + } + else if (ti->rx_ready + i > MAX_RECV) + ti->rx_ready = MAX_RECV; + else + ti->rx_ready += i; + + if (set_trace) + vlib_set_trace_count (vm, node, n_trace); + return i; +} + +static uword +turbotap_rx (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + turbotap_main_t * tr = &turbotap_main; + static u32 * ready_interface_indices; + turbotap_interface_t * ti; + int i; + u32 total_count = 0; + + vec_reset_length (ready_interface_indices); + clib_bitmap_foreach (i, tr->pending_read_bitmap, + ({ + vec_add1 (ready_interface_indices, i); + })); + + if (vec_len (ready_interface_indices) == 0) + return 0; + + for (i = 0; i < vec_len(ready_interface_indices); i++) + { + tr->pending_read_bitmap = + clib_bitmap_set (tr->pending_read_bitmap, + ready_interface_indices[i], 0); + + ti = vec_elt_at_index (tr->turbotap_interfaces, ready_interface_indices[i]); + total_count += turbotap_rx_iface(vm, node, ti); + } + return total_count; //This might return more than 256. +} + +static char * turbotap_rx_error_strings[] = { +#define _(sym,string) string, + foreach_turbotap_error +#undef _ +}; + +VLIB_REGISTER_NODE (turbotap_rx_node) = { + .function = turbotap_rx, + .name = "turbotap-rx", + .type = VLIB_NODE_TYPE_INPUT, + .state = VLIB_NODE_STATE_INTERRUPT, + .vector_size = 4, + .n_errors = TURBOTAP_N_ERROR, + .error_strings = turbotap_rx_error_strings, + .format_trace = format_turbotap_rx_trace, + + .n_next_nodes = TURBOTAP_RX_N_NEXT, + .next_nodes = { + [TURBOTAP_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum", + [TURBOTAP_RX_NEXT_IP6_INPUT] = "ip6-input", + [TURBOTAP_RX_NEXT_DROP] = "error-drop", + [TURBOTAP_RX_NEXT_ETHERNET_INPUT] = "ethernet-input", + }, +}; + |