author     Damjan Marion <damarion@cisco.com>   2023-11-13 12:18:24 +0000
committer  Damjan Marion <damarion@cisco.com>   2023-11-13 19:05:17 +0000
commit     2d725c61286ccb8625ffad5c678cee337f88bceb (patch)
tree       c3b70710a4f4e5b30c68a6791af151b9de787c62 /src/plugins/dev_ena/rx_node.c
parent     e596ca1b51d25c24bd06132f9824d4bd0c6a5ecc (diff)
ena: Amazon Elastic Network Adapter (ENA) native driver
Type: feature
Change-Id: Icd9de05f2cbac0e5a6dfb1f1414f21dc4b893104
Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'src/plugins/dev_ena/rx_node.c')
-rw-r--r-- | src/plugins/dev_ena/rx_node.c | 457
1 file changed, 457 insertions, 0 deletions
diff --git a/src/plugins/dev_ena/rx_node.c b/src/plugins/dev_ena/rx_node.c
new file mode 100644
index 00000000000..41fc5b8c943
--- /dev/null
+++ b/src/plugins/dev_ena/rx_node.c
@@ -0,0 +1,457 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright (c) 2023 Cisco Systems, Inc.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/dev/dev.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vppinfra/vector/mask_compare.h>
+#include <vppinfra/vector/compress.h>
+
+#include <dev_ena/ena.h>
+#include <dev_ena/ena_inlines.h>
+
+#define ENA_RX_REFILL_BATCH 32
+
+typedef struct
+{
+  u16 phase_bit;
+  u16 size;
+  u32 mask;
+  ena_rx_cdesc_status_t st_or;
+  ena_rx_cdesc_status_t st_and;
+  u16 *comp_sqe_indices;
+  u32 *sq_buffer_indices;
+} ena_rx_ctx_t;
+
+static_always_inline void
+ena_device_input_status_to_flags (ena_rx_cdesc_status_t *statuses, u32 *flags,
+                                  u32 n_desc, vlib_frame_bitmap_t first_bmp,
+                                  int maybe_chained)
+{
+  const ena_rx_cdesc_status_t mask_first = { .first = 1 },
+                              match_first1 = { .first = 1 };
+
+  const ena_rx_cdesc_status_t mask_last = { .last = 1 },
+                              match_last0 = { .last = 0 };
+
+  const ena_rx_cdesc_status_t mask_l4_csum = { .ipv4_frag = 1,
+                                               .l4_csum_checked = 1,
+                                               .l4_csum_err = 1 },
+                              match_l4_csum_ok = { .l4_csum_checked = 1 };
+
+  clib_memset_u32 (statuses + n_desc, 0, 8);
+#if defined(CLIB_HAVE_VEC128)
+
+#if defined(CxLIB_HAVE_VEC512)
+#define N 16
+#define u32xN u32x16
+#define u32xNu u32x16u
+#define u32xN_splat u32x16_splat
+#elif defined(CxLIB_HAVE_VEC256)
+#define N 8
+#define u32xN u32x8
+#define u32xNu u32x8u
+#define u32xN_splat u32x8_splat
+#else
+#define N 4
+#define u32xN u32x4
+#define u32xNu u32x4u
+#define u32xN_splat u32x4_splat
+#endif
+
+  const u32xN st_mask_first = u32xN_splat (mask_first.as_u32);
+  const u32xN st_match_first1 = u32xN_splat (match_first1.as_u32);
+  const u32xN st_mask_last = u32xN_splat (mask_last.as_u32);
+  const u32xN st_match_last0 = u32xN_splat (match_last0.as_u32);
+  const u32xN st_mask_l4_csum = u32xN_splat (mask_l4_csum.as_u32);
+  const u32xN st_match_l4_csum_ok = u32xN_splat (match_l4_csum_ok.as_u32);
+  const u32xN f_total_len_valid = u32xN_splat (VLIB_BUFFER_TOTAL_LENGTH_VALID);
+  const u32xN f_next_preset = u32xN_splat (VLIB_BUFFER_NEXT_PRESENT);
+  const u32xN f_l4_csum = u32xN_splat (VNET_BUFFER_F_L4_CHECKSUM_CORRECT |
+                                       VNET_BUFFER_F_L4_CHECKSUM_COMPUTED);
+
+  for (u32 i = 0; i < round_pow2 (n_desc, 2 * N); i += 2 * N)
+    {
+      uword msk = 0;
+      u32xN f0, f1, r0, r1;
+      u32xN s0 = ((u32xNu *) (statuses + i))[0];
+      u32xN s1 = ((u32xNu *) (statuses + i))[1];
+
+      r0 = (s0 & st_mask_first) == st_match_first1;
+      r1 = (s1 & st_mask_first) == st_match_first1;
+      f0 = r0 & f_total_len_valid;
+      f1 = r1 & f_total_len_valid;
+
+      if (maybe_chained)
+        {
+#if defined(CxLIB_HAVE_VEC512)
+          u64 msb_mask = 0x1111111111111111;
+          msk = bit_extract_u64 (u8x64_msb_mask ((u8x64) r0), msb_mask);
+          msk |= bit_extract_u64 (u8x64_msb_mask ((u8x64) r1), msb_mask) << 16;
+#elif defined(CxLIB_HAVE_VEC256)
+          msk = u8x32_msb_mask ((u8x32) r0);
+          msk |= (u64) u8x32_msb_mask ((u8x32) r1) << 32;
+          msk = bit_extract_u64 (msk, 0x1111111111111111);
+#else
+          msk = u8x16_msb_mask ((u8x16) r0);
+          msk |= (u32) u8x16_msb_mask ((u8x16) r1) << 16;
+          msk = bit_extract_u32 (msk, 0x11111111);
+#endif
+          first_bmp[i / uword_bits] |= msk << (i % uword_bits);
+        }
+
+      f0 |= ((s0 & st_mask_last) == st_match_last0) & f_next_preset;
+      f1 |= ((s1 & st_mask_last) == st_match_last0) & f_next_preset;
+
+      f0 |= ((s0 & st_mask_l4_csum) == st_match_l4_csum_ok) & f_l4_csum;
+      f1 |= ((s1 & st_mask_l4_csum) == st_match_l4_csum_ok) & f_l4_csum;
+
+      ((u32xNu *) (flags + i))[0] = f0;
+      ((u32xNu *) (flags + i))[1] = f1;
+    }
+#else
+  u32 n_left = n_desc;
+  while (n_left)
+    {
+      u32 f = 0;
+      ena_rx_cdesc_status_t st = statuses++[0];
+
+      if ((st.as_u32 & mask_first.as_u32) == match_first1.as_u32)
+        f |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+      if ((st.as_u32 & mask_last.as_u32) == match_last0.as_u32)
+        f |= VLIB_BUFFER_NEXT_PRESENT;
+
+      if ((st.as_u32 & mask_l4_csum.as_u32) == match_l4_csum_ok.as_u32)
+        f |= VNET_BUFFER_F_L4_CHECKSUM_COMPUTED |
+             VNET_BUFFER_F_L4_CHECKSUM_CORRECT;
+
+      flags++[0] = f;
+      n_left--;
+    }
+#endif
+}
+
+static_always_inline u16
+ena_device_input_cq_dequeue_no_wrap (ena_rx_ctx_t *ctx, ena_rxq_t *q,
+                                     ena_rx_cdesc_status_t *statuses,
+                                     u16 *lengths, u16 *csi)
+{
+  u32 next = q->cq_next;
+  ena_rx_cdesc_t *cqes = q->cqes;
+  u32 phase = (next & ctx->size << 1) != 0;
+  u16 index = next & ctx->mask;
+  ena_rx_cdesc_t *cd = cqes + index;
+  ena_rx_cdesc_status_t st;
+  u32 n_to_check, i = 0;
+
+  st = cd->status;
+  if (st.phase == phase)
+    return 0;
+
+  n_to_check = clib_min (VLIB_FRAME_SIZE, ctx->size - index);
+
+  ctx->st_or.as_u32 |= st.as_u32;
+  ctx->st_and.as_u32 &= st.as_u32;
+  statuses[i] = st;
+  lengths[i] = cd->length;
+  csi[i] = cd->req_id;
+  i++;
+  cd++;
+
+more:
+  for (st = cd->status; i < n_to_check && st.phase != phase;
+       i++, st = (++cd)->status)
+    {
+      ctx->st_or.as_u32 |= st.as_u32;
+      ctx->st_and.as_u32 &= st.as_u32;
+      statuses[i] = st;
+      lengths[i] = cd->length;
+      csi[i] = cd->req_id;
+    }
+
+  if (i == n_to_check)
+    {
+      n_to_check = VLIB_FRAME_SIZE - n_to_check;
+      if (n_to_check)
+        {
+          phase ^= 1;
+          cd = cqes;
+          goto more;
+        }
+    }
+
+  /* revert incomplete */
+  if (PREDICT_FALSE (statuses[i - 1].last == 0))
+    {
+      i--;
+      while (i && statuses[i - 1].last == 0)
+        i--;
+    }
+
+  return i;
+}
+
+static_always_inline void
+ena_device_input_refill (vlib_main_t *vm, ena_rx_ctx_t *ctx,
+                         vnet_dev_rx_queue_t *rxq, int use_va)
+{
+  ena_rxq_t *q = vnet_dev_get_rx_queue_data (rxq);
+  const u64x2 flip_phase = (ena_rx_desc_t){ .lo.phase = 1 }.as_u64x2;
+  u32 buffer_indices[ENA_RX_REFILL_BATCH];
+  uword dma_addr[ENA_RX_REFILL_BATCH];
+  u32 n_alloc, n_compl_sqes = q->n_compl_sqes;
+  u16 *csi = ctx->comp_sqe_indices;
+  ena_rx_desc_t *sqes = q->sqes;
+
+  while (n_compl_sqes > 0)
+    {
+      n_alloc = vlib_buffer_alloc_from_pool (
+        vm, buffer_indices, clib_min (ENA_RX_REFILL_BATCH, n_compl_sqes),
+        vnet_dev_get_rx_queue_buffer_pool_index (rxq));
+
+      if (PREDICT_FALSE (n_alloc == 0))
+        break;
+
+      vlib_get_buffers_with_offset (vm, buffer_indices, (void **) dma_addr,
+                                    ENA_RX_REFILL_BATCH,
+                                    STRUCT_OFFSET_OF (vlib_buffer_t, data));
+
+      if (!use_va)
+        for (u32 i = 0; i < n_alloc; i++)
+          dma_addr[i] = vlib_physmem_get_pa (vm, (void *) dma_addr[i]);
+
+      for (u32 i = 0; i < n_alloc; i++)
+        {
+          u16 slot = csi[i];
+          u64x2 r = sqes[slot].as_u64x2 ^ flip_phase;
+          ctx->sq_buffer_indices[slot] = buffer_indices[i];
+          r[1] = dma_addr[i];
+          sqes[slot].as_u64x2 = r; /* write SQE as single 16-byte store */
+        }
+
+      csi += n_alloc;
+      n_compl_sqes -= n_alloc;
+    }
+
+  if (n_compl_sqes == q->n_compl_sqes)
+    return;
+
+  q->sq_next += q->n_compl_sqes - n_compl_sqes;
+  __atomic_store_n (q->sq_db, q->sq_next, __ATOMIC_RELEASE);
+
+  if (PREDICT_FALSE (n_compl_sqes))
+    clib_memmove (ctx->comp_sqe_indices, csi, n_compl_sqes * sizeof (csi[0]));
+
+  q->n_compl_sqes = n_compl_sqes;
+}
+
+static_always_inline uword
+ena_device_input_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
+                         vnet_dev_rx_queue_t *rxq)
+{
+  ena_rxq_t *q = vnet_dev_get_rx_queue_data (rxq);
+  vnet_dev_port_t *port = rxq->port;
+  vnet_main_t *vnm = vnet_get_main ();
+  vlib_buffer_t *buffers[VLIB_FRAME_SIZE], **b;
+  ena_rx_cdesc_status_t statuses[VLIB_FRAME_SIZE + 8];
+  u16 lengths[VLIB_FRAME_SIZE + 8], *l;
+  u32 flags[VLIB_FRAME_SIZE + 8], *f;
+  u16 *csi;
+  uword n_rx_packets = 0, n_rx_bytes = 0;
+  vlib_frame_bitmap_t head_bmp = {};
+  u32 sw_if_index = port->intf.sw_if_index;
+  u32 hw_if_index = port->intf.hw_if_index;
+  u32 n_trace, n_deq, n_left;
+  u32 cq_next = q->cq_next;
+  u32 next_index = rxq->next_index;
+  vlib_frame_t *next_frame;
+  vlib_buffer_template_t bt = rxq->buffer_template;
+  u32 *bi;
+  int maybe_chained;
+
+  ASSERT (count_set_bits (rxq->size) == 1);
+  ena_rx_ctx_t ctx = {
+    .size = rxq->size,
+    .mask = rxq->size - 1,
+    .st_and.as_u32 = ~0,
+    .comp_sqe_indices = q->compl_sqe_indices,
+    .sq_buffer_indices = q->buffer_indices,
+  };
+
+  /* we may have completed SQE indices from previous run */
+  csi = ctx.comp_sqe_indices + q->n_compl_sqes;
+
+  n_deq =
+    ena_device_input_cq_dequeue_no_wrap (&ctx, q, statuses, lengths, csi);
+
+  if (n_deq == 0)
+    goto refill;
+
+  q->n_compl_sqes += n_deq;
+
+  maybe_chained = ctx.st_and.first && ctx.st_and.last ? 0 : 1;
+
+  next_frame =
+    vlib_get_next_frame_internal (vm, node, next_index, /* new frame */ 1);
+  bi = vlib_frame_vector_args (next_frame);
+
+  /* move buffer indices from the ring */
+  for (u32 i = 0; i < n_deq; i++)
+    {
+      u32 slot = csi[i];
+      bi[i] = ctx.sq_buffer_indices[slot];
+      ctx.sq_buffer_indices[slot] = VLIB_BUFFER_INVALID_INDEX;
+    }
+
+  vlib_get_buffers (vm, bi, buffers, n_deq);
+
+  if (PREDICT_FALSE (maybe_chained))
+    ena_device_input_status_to_flags (statuses, flags, n_deq, head_bmp, 1);
+  else
+    ena_device_input_status_to_flags (statuses, flags, n_deq, head_bmp, 0);
+
+  for (b = buffers, l = lengths, f = flags, n_left = n_deq; n_left >= 8;
+       b += 4, f += 4, l += 4, n_left -= 4)
+    {
+      clib_prefetch_store (b[4]);
+      clib_prefetch_store (b[5]);
+      clib_prefetch_store (b[6]);
+      clib_prefetch_store (b[7]);
+      b[0]->template = bt;
+      n_rx_bytes += b[0]->current_length = l[0];
+      b[0]->flags = f[0];
+      b[1]->template = bt;
+      n_rx_bytes += b[1]->current_length = l[1];
+      b[1]->flags = f[1];
+      b[2]->template = bt;
+      n_rx_bytes += b[2]->current_length = l[2];
+      b[2]->flags = f[2];
+      b[3]->template = bt;
+      n_rx_bytes += b[3]->current_length = l[3];
+      b[3]->flags = f[3];
+    }
+
+  for (; n_left > 0; b += 1, f += 1, l += 1, n_left -= 1)
+    {
+      b[0]->template = bt;
+      n_rx_bytes += b[0]->current_length = l[0];
+      b[0]->flags = f[0];
+    }
+
+  if (maybe_chained)
+    {
+      vlib_buffer_t *hb = 0;
+      vlib_frame_bitmap_t tail_buf_bmp = {};
+      u32 i, total_len = 0, head_flags = 0, tail_flags = 0;
+      n_rx_packets = vlib_frame_bitmap_count_set_bits (head_bmp);
+
+      vlib_frame_bitmap_init (tail_buf_bmp, n_deq);
+      vlib_frame_bitmap_xor (tail_buf_bmp, head_bmp);
+
+      foreach_vlib_frame_bitmap_set_bit_index (i, tail_buf_bmp)
+        {
+          vlib_buffer_t *pb = buffers[i - 1];
+          /* only store operations here */
+          pb->next_buffer = bi[i];
+          if (vlib_frame_bitmap_is_bit_set (tail_buf_bmp, i - 1) == 0)
+            {
+              if (hb)
+                {
+                  hb->total_length_not_including_first_buffer = total_len;
+                  /* tail descriptor contains protocol info so we need to
+                   * combine head and tail buffer flags */
+                  hb->flags = head_flags | tail_flags;
+                }
+              head_flags = flags[i - 1];
+              total_len = 0;
+              hb = pb;
+            }
+          total_len += lengths[i];
+          tail_flags = flags[i];
+        }
+
+      hb->total_length_not_including_first_buffer = total_len;
+      hb->flags = head_flags | tail_flags;
+    }
+  else
+    n_rx_packets = n_deq;
+
+  /* packet tracing */
+  if (PREDICT_FALSE ((n_trace = vlib_get_trace_count (vm, node))))
+    {
+      u32 i;
+      if (!maybe_chained)
+        vlib_frame_bitmap_init (head_bmp, n_deq);
+      foreach_vlib_frame_bitmap_set_bit_index (i, head_bmp)
+        {
+          vlib_buffer_t *b = buffers[i];
+          if (vlib_trace_buffer (vm, node, next_index, b, 0))
+            {
+              u32 j = i;
+              ena_rx_trace_t *tr = vlib_add_trace (vm, node, b, sizeof (*tr));
+              tr->next_index = next_index;
+              tr->qid = rxq->queue_id;
+              tr->hw_if_index = hw_if_index;
+              tr->n_desc = 1;
+              tr->length = lengths[i];
+              tr->req_id = csi[i];
+              tr->status = statuses[i];
+              while (statuses[j].last == 0)
+                {
+                  j++;
+                  tr->n_desc++;
+                  tr->length += lengths[j];
+                }
+              tr->status = statuses[j];
+
+              if (--n_trace == 0)
+                goto trace_done;
+            }
+        }
+    trace_done:
+      vlib_set_trace_count (vm, node, n_trace);
+    }
+
+  if (PREDICT_FALSE (maybe_chained))
+    clib_compress_u32 (bi, bi, head_bmp, n_deq);
+
+  if (PREDICT_TRUE (next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT))
+    {
+      ethernet_input_frame_t *ef;
+      next_frame->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX;
+
+      ef = vlib_frame_scalar_args (next_frame);
+      ef->sw_if_index = sw_if_index;
+      ef->hw_if_index = hw_if_index;
+
+      if (ctx.st_or.l3_csum_err == 0)
+        next_frame->flags |= ETH_INPUT_FRAME_F_IP4_CKSUM_OK;
+      vlib_frame_no_append (next_frame);
+    }
+
+  vlib_put_next_frame (vm, node, next_index, VLIB_FRAME_SIZE - n_rx_packets);
+
+  vlib_increment_combined_counter (
+    vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX,
+    vm->thread_index, hw_if_index, n_rx_packets, n_rx_bytes);
+
+  q->cq_next = cq_next + n_deq;
+
+refill:
+  if (rxq->port->dev->va_dma)
+    ena_device_input_refill (vm, &ctx, rxq, 1);
+  else
+    ena_device_input_refill (vm, &ctx, rxq, 0);
+
+  return n_rx_packets;
+}
+
+VNET_DEV_NODE_FN (ena_rx_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
+{
+  u32 n_rx = 0;
+  foreach_vnet_dev_rx_queue_runtime (rxq, node)
+    n_rx += ena_device_input_inline (vm, node, rxq);
+  return n_rx;
+}
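
For readers new to this style of driver, the dequeue loop above (ena_device_input_cq_dequeue_no_wrap) detects fresh completions by comparing each descriptor's phase bit against the value expected for the current pass over the ring, and flips the expected value when the ring wraps, instead of reading a head pointer from the device. The stand-alone sketch below illustrates only that generation-bit idea; the names and types (entry_t, produce, consume, RING_SIZE) are simplified stand-ins invented for the example, and the phase convention shown (1 on the first pass) is an assumption for illustration, not a statement of the exact ENA descriptor format.

/* Minimal sketch of completion-ring polling with a phase (generation) bit.
 * Simplified stand-in types; NOT the driver's real structures. */
#include <stdio.h>
#include <stdint.h>

#define RING_SIZE 8 /* power of two, like the real completion queue */

typedef struct
{
  uint32_t phase : 1;  /* flipped by the producer on every pass */
  uint32_t data : 31;
} entry_t;

static entry_t ring[RING_SIZE]; /* zero-initialized: phase 0 everywhere */

/* producer: write entry for a free-running sequence number */
static void
produce (uint32_t seq, uint32_t data)
{
  entry_t e = { .data = data };
  /* assumed convention: phase is 1 on the first pass, 0 on the second, ... */
  e.phase = ((seq / RING_SIZE) & 1) ^ 1;
  ring[seq % RING_SIZE] = e;
}

/* consumer: drain everything the producer has published, starting at *next */
static uint32_t
consume (uint32_t *next)
{
  uint32_t n = 0;
  for (;;)
    {
      uint32_t seq = *next;
      /* what a stale (not yet rewritten) entry looks like on this pass */
      uint32_t stale_phase = (seq / RING_SIZE) & 1;
      entry_t e = ring[seq % RING_SIZE];
      if (e.phase == stale_phase)
        break; /* slot not written for this pass yet */
      printf ("consumed seq %u data %u\n", seq, (unsigned) e.data);
      (*next)++;
      n++;
    }
  return n;
}

int
main (void)
{
  uint32_t cq_next = 0;

  for (uint32_t seq = 0; seq < 20; seq++)
    {
      produce (seq, 100 + seq);
      if (seq % 5 == 4) /* poll every few productions */
        consume (&cq_next);
    }
  consume (&cq_next);
  return 0;
}

The benefit, visible in the real code as well, is that a single read of the descriptor both delivers the completion data and tells the consumer whether the slot is valid; no separate doorbell or head-pointer read is needed on the RX completion path.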