From 4a76d6f6da035220917097bc047b08bc58254803 Mon Sep 17 00:00:00 2001
From: Benoît Ganne
Date: Fri, 12 Jun 2020 08:47:34 +0200
Subject: af_xdp: AF_XDP input plugin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Type: feature

Change-Id: I85aa4ad6b68c1aa0e51938002dc691a4b11c545c
Signed-off-by: Damjan Marion
Signed-off-by: Benoît Ganne
---
 src/plugins/af_xdp/input.c | 367 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 367 insertions(+)
 create mode 100644 src/plugins/af_xdp/input.c

diff --git a/src/plugins/af_xdp/input.c b/src/plugins/af_xdp/input.c
new file mode 100644
index 00000000000..c5b3488d438
--- /dev/null
+++ b/src/plugins/af_xdp/input.c
@@ -0,0 +1,367 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2018 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <poll.h>
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/pci/pci.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/devices.h>
+#include "af_xdp.h"
+
+#define foreach_af_xdp_input_error \
+  _(POLL_REQUIRED, "poll required") \
+  _(POLL_FAILURES, "poll failures")
+
+typedef enum
+{
+#define _(f,s) AF_XDP_INPUT_ERROR_##f,
+  foreach_af_xdp_input_error
+#undef _
+    AF_XDP_INPUT_N_ERROR,
+} af_xdp_input_error_t;
+
+static __clib_unused char *af_xdp_input_error_strings[] = {
+#define _(n,s) s,
+  foreach_af_xdp_input_error
+#undef _
+};
+
+static_always_inline void
+af_xdp_device_input_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
+			   u32 n_left, const u32 * bi, u32 next_index,
+			   u32 hw_if_index)
+{
+  u32 n_trace;
+
+  if (PREDICT_TRUE (0 == (n_trace = vlib_get_trace_count (vm, node))))
+    return;
+
+  while (n_trace && n_left)
+    {
+      vlib_buffer_t *b;
+      af_xdp_input_trace_t *tr;
+      b = vlib_get_buffer (vm, bi[0]);
+      vlib_trace_buffer (vm, node, next_index, b,
+			 /* follow_chain */ 0);
+      tr = vlib_add_trace (vm, node, b, sizeof (*tr));
+      tr->next_index = next_index;
+      tr->hw_if_index = hw_if_index;
+
+      n_trace--;
+      n_left--;
+      bi++;
+    }
+
+  vlib_set_trace_count (vm, node, n_trace);
+}
+
+static_always_inline void
+af_xdp_device_input_refill_db (vlib_main_t * vm,
+			       const vlib_node_runtime_t * node,
+			       af_xdp_device_t * ad, af_xdp_rxq_t * rxq,
+			       const u32 n_alloc)
+{
+  int ret;
+
+  xsk_ring_prod__submit (&rxq->fq, n_alloc);
+
+  if (!xsk_ring_prod__needs_wakeup (&rxq->fq))
+    return;
+
+  vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_POLL_REQUIRED,
+		    1);
+
+  struct pollfd fd = {.fd = rxq->xsk_fd,.events = POLLIN };
+  ret = poll (&fd, 1, 0);
+  if (PREDICT_TRUE (ret >= 0))
+    return;
+
+  /* something bad is happening */
+  vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_POLL_FAILURES,
+		    1);
+  af_xdp_device_error (ad, "poll() failed");
+}
+
+static_always_inline void
+af_xdp_device_input_refill (vlib_main_t * vm,
+			    const vlib_node_runtime_t * node,
+			    af_xdp_device_t * ad, af_xdp_rxq_t * rxq,
+			    const int copy)
+{
+  __u64 *fill;
+  const u32 size = rxq->fq.size;
+  const u32 mask = size - 1;
+  u32 bis[VLIB_FRAME_SIZE], *bi = bis;
+  u32 n_alloc, n, n_wrap;
+  u32 idx;
+
+  ASSERT (mask == rxq->fq.mask);
+
+  /* do not enqueue more packets than ring space */
+  n_alloc = xsk_prod_nb_free (&rxq->fq, 16);
+  /* do not bother to allocate if too small */
+  if (n_alloc < 16)
+    return;
+
+  n_alloc = clib_min (n_alloc, ARRAY_LEN (bis));
+  n_alloc = vlib_buffer_alloc_from_pool (vm, bis, n_alloc, ad->pool);
+  n = xsk_ring_prod__reserve (&rxq->fq, n_alloc, &idx);
+  ASSERT (n == n_alloc);
+
+  fill = xsk_ring_prod__fill_addr (&rxq->fq, idx);
+  n = clib_min (n_alloc, size - (idx & mask));
+  n_wrap = n_alloc - n;
+
+  /*
+   * Note about headroom: for some reason, there seems to be a discrepancy
+   * between zero-copy and copy mode:
+   *   - zero-copy: XDP_PACKET_HEADROOM will be added to the user headroom
+   *   - copy: nothing is added to the user headroom
+   * We favor zero-copy and set the headroom to 0. As XDP_PACKET_HEADROOM ==
+   * sizeof(vlib_buffer_t), data will correctly point to vlib_buffer_t->data.
+   * In copy mode, we have to add sizeof(vlib_buffer_t) to the descriptor
+   * offset during refill.
+   */
+  STATIC_ASSERT (sizeof (vlib_buffer_t) == XDP_PACKET_HEADROOM, "wrong size");
+#define bi2addr(bi) \
+  (((bi) << CLIB_LOG2_CACHE_LINE_BYTES) + (copy ? sizeof(vlib_buffer_t) : 0))
+
+wrap_around:
+
+  while (n >= 8)
+    {
+#ifdef CLIB_HAVE_VEC256
+      u64x4 b0 = u32x4_extend_to_u64x4 (*(u32x4u *) (bi + 0));
+      u64x4 b1 = u32x4_extend_to_u64x4 (*(u32x4u *) (bi + 4));
+      *(u64x4u *) (fill + 0) = bi2addr (b0);
+      *(u64x4u *) (fill + 4) = bi2addr (b1);
+#else
+      fill[0] = bi2addr (bi[0]);
+      fill[1] = bi2addr (bi[1]);
+      fill[2] = bi2addr (bi[2]);
+      fill[3] = bi2addr (bi[3]);
+      fill[4] = bi2addr (bi[4]);
+      fill[5] = bi2addr (bi[5]);
+      fill[6] = bi2addr (bi[6]);
+      fill[7] = bi2addr (bi[7]);
+#endif
+      fill += 8;
+      bi += 8;
+      n -= 8;
+    }
+
+  while (n >= 1)
+    {
+      fill[0] = bi2addr (bi[0]);
+      fill += 1;
+      bi += 1;
+      n -= 1;
+    }
+
+  if (n_wrap)
+    {
+      fill = xsk_ring_prod__fill_addr (&rxq->fq, 0);
+      n = n_wrap;
+      n_wrap = 0;
+      goto wrap_around;
+    }
+
+  af_xdp_device_input_refill_db (vm, node, ad, rxq, n_alloc);
+}
+
+static_always_inline void
+af_xdp_device_input_ethernet (vlib_main_t * vm, vlib_node_runtime_t * node,
+			      const u32 next_index, const u32 sw_if_index,
+			      const u32 hw_if_index)
+{
+  vlib_next_frame_t *nf;
+  vlib_frame_t *f;
+  ethernet_input_frame_t *ef;
+
+  if (PREDICT_FALSE (VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT != next_index))
+    return;
+
+  nf =
+    vlib_node_runtime_get_next_frame (vm, node,
+				      VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT);
+  f = vlib_get_frame (vm, nf->frame);
+  f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX;
+
+  ef = vlib_frame_scalar_args (f);
+  ef->sw_if_index = sw_if_index;
+  ef->hw_if_index = hw_if_index;
+}
+
+static_always_inline u32
+af_xdp_device_input_bufs (vlib_main_t * vm, const af_xdp_device_t * ad,
+			  af_xdp_rxq_t * rxq, u32 * bis, const u32 n_rx,
+			  vlib_buffer_t * bt, u32 idx, const int copy)
+{
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
+  u16 lens[VLIB_FRAME_SIZE], *len = lens;
+  const u32 mask = rxq->rx.mask;
+  u32 n = n_rx, *bi = bis, bytes = 0;
+
+#define addr2bi(addr) \
+  (((addr) - (copy ? sizeof(vlib_buffer_t) : 0)) >> CLIB_LOG2_CACHE_LINE_BYTES)
+
+  while (n >= 1)
+    {
+      const struct xdp_desc *desc = xsk_ring_cons__rx_desc (&rxq->rx, idx);
+      bi[0] = addr2bi (xsk_umem__extract_addr (desc->addr));
+      ASSERT (vlib_buffer_is_known (vm, bi[0]) ==
+	      VLIB_BUFFER_KNOWN_ALLOCATED);
+      len[0] = desc->len;
+      idx = (idx + 1) & mask;
+      bi += 1;
+      len += 1;
+      n -= 1;
+    }
+
+  vlib_get_buffers (vm, bis, bufs, n_rx);
+
+  n = n_rx;
+  len = lens;
+
+  while (n >= 8)
+    {
+      vlib_prefetch_buffer_header (b[4], LOAD);
+      vlib_buffer_copy_template (b[0], bt);
+      bytes += b[0]->current_length = len[0];
+
+      vlib_prefetch_buffer_header (b[5], LOAD);
+      vlib_buffer_copy_template (b[1], bt);
+      bytes += b[1]->current_length = len[1];
+
+      vlib_prefetch_buffer_header (b[6], LOAD);
+      vlib_buffer_copy_template (b[2], bt);
+      bytes += b[2]->current_length = len[2];
+
+      vlib_prefetch_buffer_header (b[7], LOAD);
+      vlib_buffer_copy_template (b[3], bt);
+      bytes += b[3]->current_length = len[3];
+
+      b += 4;
+      len += 4;
+      n -= 4;
+    }
+
+  while (n >= 1)
+    {
+      vlib_buffer_copy_template (b[0], bt);
+      bytes += b[0]->current_length = len[0];
+      b += 1;
+      len += 1;
+      n -= 1;
+    }
+
+  xsk_ring_cons__release (&rxq->rx, n_rx);
+  return bytes;
+}
+
+static_always_inline uword
+af_xdp_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+			    vlib_frame_t * frame, af_xdp_device_t * ad,
+			    u16 qid, const int copy)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, qid);
+  vlib_buffer_t bt;
+  u32 next_index, *to_next, n_left_to_next;
+  u32 n_rx_packets, n_rx_bytes;
+  u32 idx;
+
+  n_rx_packets = xsk_ring_cons__peek (&rxq->rx, VLIB_FRAME_SIZE, &idx);
+
+  if (PREDICT_FALSE (0 == n_rx_packets))
+    goto refill;
+
+  vlib_buffer_copy_template (&bt, ad->buffer_template);
+  next_index = ad->per_interface_next_index;
+  if (PREDICT_FALSE (vnet_device_input_have_features (ad->sw_if_index)))
+    vnet_feature_start_device_input_x1 (ad->sw_if_index, &next_index, &bt);
+
+  vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+  n_rx_bytes =
+    af_xdp_device_input_bufs (vm, ad, rxq, to_next, n_rx_packets, &bt, idx,
+			      copy);
+  af_xdp_device_input_ethernet (vm, node, next_index, ad->sw_if_index,
+				ad->hw_if_index);
+
+  vlib_put_next_frame (vm, node, next_index, n_left_to_next - n_rx_packets);
+
+  af_xdp_device_input_trace (vm, node, n_rx_packets, to_next, next_index,
+			     ad->hw_if_index);
+
+  vlib_increment_combined_counter
+    (vnm->interface_main.combined_sw_if_counters +
+     VNET_INTERFACE_COUNTER_RX, vm->thread_index,
+     ad->hw_if_index, n_rx_packets, n_rx_bytes);
+
+refill:
+  af_xdp_device_input_refill (vm, node, ad, rxq, copy);
+
+  return n_rx_packets;
+}
+
+VLIB_NODE_FN (af_xdp_input_node) (vlib_main_t * vm,
+				  vlib_node_runtime_t * node,
+				  vlib_frame_t * frame)
+{
+  u32 n_rx = 0;
+  af_xdp_main_t *am = &af_xdp_main;
+  vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
+  vnet_device_and_queue_t *dq;
+
+  foreach_device_and_queue (dq, rt->devices_and_queues)
+  {
+    af_xdp_device_t *ad;
+    ad = vec_elt_at_index (am->devices, dq->dev_instance);
+    if ((ad->flags & AF_XDP_DEVICE_F_ADMIN_UP) == 0)
+      continue;
+    if (PREDICT_TRUE (ad->flags & AF_XDP_DEVICE_F_ZEROCOPY))
+      n_rx += af_xdp_device_input_inline (vm, node, frame, ad, dq->queue_id,
+					  /* copy */ 0);
+    else
+      n_rx += af_xdp_device_input_inline (vm, node, frame, ad, dq->queue_id,
+					  /* copy */ 1);
+  }
+  return n_rx;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (af_xdp_input_node) = {
+  .name = "af_xdp-input",
+  .sibling_of = "device-input",
+  .format_trace = format_af_xdp_input_trace,
+  .type = VLIB_NODE_TYPE_INPUT,
+  .state = VLIB_NODE_STATE_DISABLED,
+  .n_errors = AF_XDP_INPUT_N_ERROR,
+  .error_strings = af_xdp_input_error_strings,
+  .flags = VLIB_NODE_FLAG_TRACE_SUPPORTED,
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
-- 
cgit 1.2.3-korg