author     Mohammed Hawari <mohammed@hawari.fr>      2020-10-21 14:48:38 +0200
committer  Damjan Marion <dmarion@me.com>            2020-10-24 09:25:19 +0000
commit     4df9f737a24be94c2988f18337a4ad845b1b0186 (patch)
tree       292a032012d443205db2988667946bb77aeb3008 /src/plugins/rdma/input.c
parent     91603958d1d4fc3114739f9b264808940942e5c8 (diff)
rdma: implement striding rq for multiseg rx
This change leverages the striding RQ feature of
ConnectX-5 adapters to support chained buffers on
the RX path. In striding RQ mode, WQEs are SG
lists of data segments, each mapped to a
vlib_buffer. When a packet is received, it can
consume one or more data segments belonging to
the WQE, without wasting the whole WQE.
Change-Id: I74eba5b2c2c66538e75e046335058ba011cb27fd
Type: improvement
Signed-off-by: Mohammed Hawari <mohammed@hawari.fr>
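
Editor's note: the core of the refill change in the diff below is bookkeeping. Buffers are still allocated into a flat ring indexed by rxq->tail, but in striding mode every 2^log_stride_per_wqe consecutive buffers share one WQE, the first 16B word of each WQE is the SRQ header, and the doorbell counts WQEs rather than buffers. The following standalone C sketch illustrates that arithmetic only; the constants and the ring variables are illustrative stand-ins (the real state lives in rdma_rxq_t and VLIB_FRAME_SIZE), not the plugin's code.

#include <stdint.h>
#include <stdio.h>

/* Illustrative values only; the real ones are chosen at RX queue
   creation time by the rdma plugin. */
#define LOG_STRIDE_PER_WQE 6   /* assumed: 64 buffers (strides) per WQE */
#define LOG_WQE_SZ         7   /* assumed: WQE size in 16B words (header + strides) */

int
main (void)
{
  uint32_t tail = 0, head = 0, size = 4096;     /* power-of-two buffer ring */
  uint32_t striding_wqe_tail = 0;
  uint32_t frame_size = 256;                    /* stand-in for VLIB_FRAME_SIZE */

  /* refill batches must be a multiple of 8 and of strides per WQE */
  uint32_t alloc_multiple =
    1u << (LOG_STRIDE_PER_WQE > 3 ? LOG_STRIDE_PER_WQE : 3);
  uint32_t ring_space = size - (tail - head);
  uint32_t n_alloc = ring_space < frame_size ? ring_space : frame_size;
  n_alloc &= ~(alloc_multiple - 1);             /* round down to alloc_multiple */

  /* a buffer slot maps to a 16B-word offset in the WQE ring: each WQE
     holds 2^LOG_WQE_SZ 16B words, of which 2^LOG_STRIDE_PER_WQE are
     data segments pointing at buffers */
  uint32_t slot = tail & (size - 1);
  uint32_t wqes_slot = slot << (LOG_WQE_SZ - LOG_STRIDE_PER_WQE);

  /* the doorbell is rung with a WQE count, not a buffer count */
  tail += n_alloc;
  striding_wqe_tail += n_alloc >> LOG_STRIDE_PER_WQE;

  printf ("alloc %u buffers -> wqes_slot %u, doorbell %u\n",
          n_alloc, wqes_slot, striding_wqe_tail);
  return 0;
}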
Diffstat (limited to 'src/plugins/rdma/input.c')
-rw-r--r--   src/plugins/rdma/input.c   437
1 file changed, 319 insertions(+), 118 deletions(-)
diff --git a/src/plugins/rdma/input.c b/src/plugins/rdma/input.c
index 3842a58a4ab..1d267ad6cc0 100644
--- a/src/plugins/rdma/input.c
+++ b/src/plugins/rdma/input.c
@@ -55,28 +55,35 @@ ibv_set_recv_wr_and_sge (struct ibv_recv_wr *w, struct ibv_sge *s, u64 va,
 
 static_always_inline void
 rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
-                          rdma_rxq_t * rxq, int is_mlx5dv)
+                          rdma_rxq_t * rxq, int is_mlx5dv, int is_striding)
 {
   u32 n_alloc, n;
+  u16 ring_space;
   struct ibv_recv_wr wr[VLIB_FRAME_SIZE], *w = wr;
   struct ibv_sge sge[VLIB_FRAME_SIZE], *s = sge;
   u32 mask = rxq->size - 1;
   u32 slot = rxq->tail & mask;
   u32 *bufs = rxq->bufs + slot;
-  u32 data_size = vlib_buffer_get_default_data_size (vm);
+  u32 data_size = rxq->buf_sz;
   u32 lkey = rd->lkey;
+  int log_stride_per_wqe = rxq->log_stride_per_wqe;
+  int log_wqe_sz = rxq->log_wqe_sz;
 
-  /* do not enqueue more packet than ring space */
-  n_alloc = clib_min (VLIB_FRAME_SIZE, rxq->size - (rxq->tail - rxq->head));
+  /* refilled buffers must be a multiple of 8 and of strides per WQE */
+  u32 alloc_multiple = 1 << (clib_max (3, log_stride_per_wqe));
+
+  ring_space = rxq->size - (rxq->tail - rxq->head);
+
+  n_alloc = clib_min (VLIB_FRAME_SIZE, ring_space);
 
   /* do not bother to allocate if too small */
-  if (n_alloc < 16)
+  if (n_alloc < 2 * alloc_multiple)
     return;
 
   /* avoid wrap-around logic in core loop */
   n_alloc = clib_min (n_alloc, rxq->size - slot);
-  n_alloc &= ~7;	/* round to 8 */
+  n_alloc &= ~(alloc_multiple - 1);	/* round to alloc_multiple */
 
   n = vlib_buffer_alloc_to_ring_from_pool (vm, rxq->bufs, slot, rxq->size,
                                            n_alloc, rd->pool);
@@ -84,7 +91,7 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
   if (PREDICT_FALSE (n != n_alloc))
     {
       u32 n_free;
-      if (n < 8)
+      if (n < alloc_multiple)
        {
          if (n)
            vlib_buffer_free_from_ring (vm, rxq->bufs, slot, rxq->size, n);
@@ -92,7 +99,7 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
        }
 
       /* partial allocation, round and return rest */
-      n_free = n & 7;
+      n_free = n & (alloc_multiple - 1);
       n -= n_free;
       if (n_free)
        vlib_buffer_free_from_ring (vm, rxq->bufs, (slot + n) & mask,
@@ -104,7 +111,15 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
   if (is_mlx5dv)
     {
       u64 __clib_aligned (32) va[8];
-      mlx5dv_rwq_t *wqe = rxq->wqes + slot;
+
+      /* slot does not necessarily correspond to the slot
+         in the wqes ring (in 16B words) */
+      u32 wqes_slot = slot << (log_wqe_sz - log_stride_per_wqe);
+      u32 wqe_cnt = rxq->wqe_cnt;
+      mlx5dv_wqe_ds_t *wqe = rxq->wqes + wqes_slot;
+      int wqe_sz = 1 << log_wqe_sz;
+      int stride_per_wqe = 1 << log_stride_per_wqe;
+      int current_data_seg = 0;
 
       while (n >= 1)
        {
@@ -117,22 +132,52 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
          for (int i = 0; i < 8; i++)
            va[i] = clib_host_to_net_u64 (va[i]);
 #endif
-         wqe[0].addr = va[0];
-         wqe[1].addr = va[1];
-         wqe[2].addr = va[2];
-         wqe[3].addr = va[3];
-         wqe[4].addr = va[4];
-         wqe[5].addr = va[5];
-         wqe[6].addr = va[6];
-         wqe[7].addr = va[7];
+
+         /*In striding RQ mode, the first 16B-word of the WQE is the SRQ header.
+            It is initialised as if it were a LINKED_LIST, as we have no guarantee
+            about what RDMA core does (CYCLIC_RQ or LINKED_LIST_RQ). In cyclic
+            mode, the SRQ header is ignored anyways... */
+
+/* *INDENT-OFF* */
+         if (is_striding && !(current_data_seg & (wqe_sz - 1)))
+           *(mlx5dv_wqe_srq_next_t *) wqe = (mlx5dv_wqe_srq_next_t)
+             {
+               .rsvd0 = {0},
+               .next_wqe_index = clib_host_to_net_u16 (((wqes_slot >> log_wqe_sz) + 1) & (wqe_cnt - 1)),
+               .signature = 0,
+               .rsvd1 = {0}
+             };
+/* *INDENT-ON* */
+
+         if (!is_striding || !(current_data_seg & ~(stride_per_wqe - 1)))
+           {
+             wqe[0 + is_striding].addr = va[0];
+             wqe[1 + is_striding].addr = va[1];
+             wqe[2 + is_striding].addr = va[2];
+             wqe[3 + is_striding].addr = va[3];
+             wqe[4 + is_striding].addr = va[4];
+             wqe[5 + is_striding].addr = va[5];
+             wqe[6 + is_striding].addr = va[6];
+             wqe[7 + is_striding].addr = va[7];
+             slot += 8;
+             n -= 8;
+           }
          wqe += 8;
-         slot += 8;
-         n -= 8;
+         wqes_slot += 8;
+         current_data_seg += 8;
+         current_data_seg &= wqe_sz - 1;
        }
 
       CLIB_MEMORY_STORE_BARRIER ();
       rxq->tail += n_alloc;
-      rxq->wq_db[MLX5_RCV_DBR] = clib_host_to_net_u32 (rxq->tail);
+      if (is_striding)
+       {
+         rxq->striding_wqe_tail += n_alloc >> log_stride_per_wqe;
+         rxq->wq_db[MLX5_RCV_DBR] =
+           clib_host_to_net_u32 (rxq->striding_wqe_tail);
+       }
+      else
+       rxq->wq_db[MLX5_RCV_DBR] = clib_host_to_net_u32 (rxq->tail);
       return;
     }
 
@@ -176,8 +221,9 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
 
 static_always_inline void
 rdma_device_input_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
-                        const rdma_device_t * rd, u32 n_left, const u32 * bi,
-                        u32 next_index, u16 * cqe_flags, int is_mlx5dv)
+                        const rdma_device_t * rd, u32 n_left,
+                        const u32 * bi, u32 next_index, u16 * cqe_flags,
+                        int is_mlx5dv)
 {
   u32 n_trace, i;
 
@@ -424,9 +470,9 @@ rdma_device_poll_cq_mlx5dv (rdma_device_t * rd, rdma_rxq_t * rxq,
       if ((cqe_last_byte & 0x1) != owner)
        break;
 
-      cqe_last_byte &= 0xfe;	/* remove owner bit */
+      cqe_last_byte &= 0xfc;	/* remove owner and solicited bits */
 
-      if (cqe_last_byte == 0x2c)
+      if (cqe_last_byte == 0x2c)	/* OPCODE = 0x2 (Responder Send), Format = 0x3 (Compressed CQE) */
        {
          u32 n_mini_cqes = clib_net_to_host_u32 (cqe->mini_cqe_num);
          u32 n_left = VLIB_FRAME_SIZE - n_rx_packets;
@@ -456,7 +502,7 @@ rdma_device_poll_cq_mlx5dv (rdma_device_t * rd, rdma_rxq_t * rxq,
          continue;
        }
 
-      if (cqe_last_byte == 0x20)
+      if (cqe_last_byte == 0x20)	/* OPCODE = 0x2 (Responder Send), Format = 0x0 (no inline data) */
        {
          byte_cnt[0] = cqe->byte_cnt;
          cqe_flags[0] = cqe->flags;
@@ -476,17 +522,223 @@ done:
   return n_rx_packets;
 }
 
+static_always_inline int
+rdma_device_mlx5dv_striding_rq_parse_bc (int n_rx_packets, int *n_rx_segs,
+                                         u32 * bc)
+{
+/* Determine if slow path is needed */
+  int filler = 0;
+  for (int i = 0; i < n_rx_packets; i++)
+    {
+      *n_rx_segs +=
+       (bc[i] & CQE_BC_CONSUMED_STRIDES_MASK) >>
+       CQE_BC_CONSUMED_STRIDES_SHIFT;
+      filler |= ! !(bc[i] & CQE_BC_FILLER_MASK);
+    }
+  return n_rx_packets != *n_rx_segs || filler;
+}
+
+static_always_inline int
+rdma_device_mlx5dv_l3_validate_and_swap_bc (rdma_per_thread_data_t
+                                            * ptd, int n_rx_packets, u32 * bc)
+{
+  u16 mask = CQE_FLAG_L3_HDR_TYPE_MASK | CQE_FLAG_L3_OK;
+  u16 match = CQE_FLAG_L3_HDR_TYPE_IP4 << CQE_FLAG_L3_HDR_TYPE_SHIFT;
+
+  /* verify that all ip4 packets have l3_ok flag set and convert packet
+     length from network to host byte order */
+  int skip_ip4_cksum = 1;
+
+#if defined CLIB_HAVE_VEC256
+  u16x16 mask16 = u16x16_splat (mask);
+  u16x16 match16 = u16x16_splat (match);
+  u16x16 r = { };
+
+  for (int i = 0; i * 16 < n_rx_packets; i++)
+    r |= (ptd->cqe_flags16[i] & mask16) != match16;
+
+  if (!u16x16_is_all_zero (r))
+    skip_ip4_cksum = 0;
+
+  for (int i = 0; i < n_rx_packets; i += 8)
+    *(u32x8 *) (bc + i) = u32x8_byte_swap (*(u32x8 *) (bc + i));
+#elif defined CLIB_HAVE_VEC128
+  u16x8 mask8 = u16x8_splat (mask);
+  u16x8 match8 = u16x8_splat (match);
+  u16x8 r = { };
+
+  for (int i = 0; i * 8 < n_rx_packets; i++)
+    r |= (ptd->cqe_flags8[i] & mask8) != match8;
+
+  if (!u16x8_is_all_zero (r))
+    skip_ip4_cksum = 0;
+
+  for (int i = 0; i < n_rx_packets; i += 4)
+    *(u32x4 *) (bc + i) = u32x4_byte_swap (*(u32x4 *) (bc + i));
+#else
+  for (int i = 0; i < n_rx_packets; i++)
+    if ((ptd->cqe_flags[i] & mask) == match)
+      skip_ip4_cksum = 0;
+
+  for (int i = 0; i < n_rx_packets; i++)
+    bc[i] = clib_net_to_host_u32 (bc[i]);
+#endif
+  return skip_ip4_cksum;
+}
+
+static_always_inline u32
+rdma_device_mlx5dv_fast_input (vlib_main_t * vm, rdma_rxq_t * rxq,
+                               u32 qs_mask, vlib_buffer_t * bt,
+                               u32 * to_next, u32 n_rx_segs, u32 * bc,
+                               u32 bc_mask)
+{
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
+  vlib_buffer_t **b = bufs;
+  u32 n_left = n_rx_segs;
+  u32 n_rx_bytes = 0;
+  vlib_buffer_copy_indices_from_ring (to_next, rxq->bufs,
+                                      rxq->head & qs_mask, rxq->size,
+                                      n_rx_segs);
+  rxq->head += n_rx_segs;
+  vlib_get_buffers (vm, to_next, bufs, n_rx_segs);
+  while (n_left >= 8)
+    {
+      clib_prefetch_store (b[4]);
+      vlib_buffer_copy_template (b[0], bt);
+      n_rx_bytes += b[0]->current_length = bc[0] & bc_mask;
+      clib_prefetch_store (b[5]);
+      vlib_buffer_copy_template (b[1], bt);
+      n_rx_bytes += b[1]->current_length = bc[1] & bc_mask;
+      clib_prefetch_store (b[6]);
+      vlib_buffer_copy_template (b[2], bt);
+      n_rx_bytes += b[2]->current_length = bc[2] & bc_mask;
+      clib_prefetch_store (b[7]);
+      vlib_buffer_copy_template (b[3], bt);
+      n_rx_bytes += b[3]->current_length = bc[3] & bc_mask;
+      /* next */
+      bc += 4;
+      b += 4;
+      n_left -= 4;
+    }
+  while (n_left)
+    {
+      vlib_buffer_copy_template (b[0], bt);
+      n_rx_bytes += b[0]->current_length = bc[0] & bc_mask;
+      /* next */
+      bc++;
+      b++;
+      n_left--;
+    }
+  return n_rx_bytes;
+}
+
+static_always_inline u32
+rdma_device_mlx5dv_striding_rq_input (vlib_main_t * vm,
+                                      rdma_per_thread_data_t * ptd,
+                                      rdma_rxq_t * rxq,
+                                      vlib_buffer_t * bt, u32 * to_next,
+                                      int n_rx_segs, int *n_rx_packets,
+                                      u32 * bc, int slow_path_needed)
+{
+  u32 mask = rxq->size - 1;
+  u32 n_rx_bytes = 0;
+  if (PREDICT_TRUE (!slow_path_needed))
+    {
+      n_rx_bytes +=
+       rdma_device_mlx5dv_fast_input (vm, rxq, mask, bt, to_next,
+                                      n_rx_segs, bc, CQE_BC_BYTE_COUNT_MASK);
+    }
+  else				/* Slow path with multiseg */
+    {
+      vlib_buffer_t *pkt_head;	/*Current head buffer */
+      vlib_buffer_t *pkt_prev;	/* Buffer processed at the previous iteration */
+      u32 pkt_head_idx;
+      vlib_buffer_t **pkt;
+      uword n_segs_remaining = 0;	/*Remaining strides in current buffer */
+      u32 n_bytes_remaining = 0;	/*Remaining bytes in current buffer */
+      u32 *next_in_frame = to_next;
+      u32 *next_to_free = ptd->to_free_buffers;
+      bt->current_length = vlib_buffer_get_default_data_size (vm);
+      do
+       {
+         vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
+         u32 n_left = clib_min (n_rx_segs, VLIB_FRAME_SIZE);
+         n_rx_segs -= n_left;
+         vlib_buffer_copy_indices_from_ring (ptd->current_segs,
                                              rxq->bufs, rxq->head & mask,
                                              rxq->size, n_left);
+         rxq->head += n_left;
+         vlib_get_buffers (vm, ptd->current_segs, bufs, n_left);
+         pkt = bufs;
+         while (n_left > 0)
+           {
+             /* Initialize the current buffer as full size */
+             vlib_buffer_copy_template (pkt[0], bt);
+             if (!n_segs_remaining)	/* No pending chain */
+               {
+                 n_segs_remaining =
+                   (bc[0] & CQE_BC_CONSUMED_STRIDES_MASK) >>
+                   CQE_BC_CONSUMED_STRIDES_SHIFT;
+                 pkt_head = pkt[0];
+                 pkt_head_idx = ptd->current_segs[pkt - bufs];
+                 n_bytes_remaining = bc[0] & CQE_BC_BYTE_COUNT_MASK;
+                 pkt_head->total_length_not_including_first_buffer =
+                   n_segs_remaining >
+                   1 ? n_bytes_remaining - pkt[0]->current_length : 0;
+               }
+             else		/* Perform chaining if it's a continuation buffer */
+               {
+                 pkt_prev->next_buffer = ptd->current_segs[pkt - bufs];
+                 pkt_prev->flags |= VLIB_BUFFER_NEXT_PRESENT;
+                 pkt[0]->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID;
+               }
+             if (n_segs_remaining == 1)	/* Last buffer of the chain */
+               {
+                 pkt[0]->current_length = n_bytes_remaining;
+                 if (bc[0] & CQE_BC_FILLER_MASK)
+                   {
+                     (next_to_free++)[0] = pkt_head_idx;
+                     (*n_rx_packets)--;
+                   }
+
+                 else
+                   {
+                     (next_in_frame++)[0] = pkt_head_idx;
+                     n_rx_bytes +=
+                       pkt_head->current_length +
+                       pkt_head->total_length_not_including_first_buffer;
+                   }
+                 /*Go to next CQE */
+                 bc++;
+               }
+             else
+               {
+                 n_bytes_remaining -= pkt[0]->current_length;
+                 pkt_prev = pkt[0];
+               }
+             n_segs_remaining--;
+             n_left--;
+             pkt++;
+           }
+
+       }
+      while (n_rx_segs > 0);
+      vlib_buffer_free (vm, ptd->to_free_buffers,
                        next_to_free - ptd->to_free_buffers);
+    }
+  return n_rx_bytes;
+}
+
 static_always_inline uword
 rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
-                          vlib_frame_t * frame, rdma_device_t * rd, u16 qid,
-                          int use_mlx5dv)
+                          vlib_frame_t * frame, rdma_device_t * rd,
+                          u16 qid, int use_mlx5dv)
 {
   rdma_main_t *rm = &rdma_main;
   vnet_main_t *vnm = vnet_get_main ();
   rdma_per_thread_data_t *ptd = vec_elt_at_index (rm->per_thread_data,
                                                  vm->thread_index);
   rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, qid);
-  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
   struct ibv_wc wc[VLIB_FRAME_SIZE];
   u32 __clib_aligned (32) byte_cnts[VLIB_FRAME_SIZE];
   vlib_buffer_t bt;
@@ -515,112 +767,61 @@ rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
   vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next);
 
-  vlib_buffer_copy_indices_from_ring (to_next, rxq->bufs, rxq->head & mask,
-                                      rxq->size, n_rx_packets);
-
-  vlib_get_buffers (vm, to_next, bufs, n_rx_packets);
-
   if (use_mlx5dv)
     {
-      u16 mask = CQE_FLAG_L3_HDR_TYPE_MASK | CQE_FLAG_L3_OK;
-      u16 match = CQE_FLAG_L3_HDR_TYPE_IP4 << CQE_FLAG_L3_HDR_TYPE_SHIFT;
-      u32 n_left = n_rx_packets;
       u32 *bc = byte_cnts;
-
-      /* verify that all ip4 packets have l3_ok flag set and convert packet
-         length from network to host byte order */
-      skip_ip4_cksum = 1;
-
-#if defined CLIB_HAVE_VEC256
-      u16x16 mask16 = u16x16_splat (mask);
-      u16x16 match16 = u16x16_splat (match);
-      u16x16 r = { };
-
-      for (int i = 0; i * 16 < n_rx_packets; i++)
-       r |= (ptd->cqe_flags16[i] & mask16) != match16;
-
-      if (!u16x16_is_all_zero (r))
-       skip_ip4_cksum = 0;
-
-      for (int i = 0; i < n_rx_packets; i += 8)
-       *(u32x8 *) (bc + i) = u32x8_byte_swap (*(u32x8 *) (bc + i));
-#elif defined CLIB_HAVE_VEC128
-      u16x8 mask8 = u16x8_splat (mask);
-      u16x8 match8 = u16x8_splat (match);
-      u16x8 r = { };
-
-      for (int i = 0; i * 8 < n_rx_packets; i++)
-       r |= (ptd->cqe_flags8[i] & mask8) != match8;
-
-      if (!u16x8_is_all_zero (r))
-       skip_ip4_cksum = 0;
-
-      for (int i = 0; i < n_rx_packets; i += 4)
-       *(u32x4 *) (bc + i) = u32x4_byte_swap (*(u32x4 *) (bc + i));
-#else
-      for (int i = 0; i < n_rx_packets; i++)
-       if ((ptd->cqe_flags[i] & mask) == match)
-         skip_ip4_cksum = 0;
-
-      for (int i = 0; i < n_rx_packets; i++)
-       bc[i] = clib_net_to_host_u32 (bc[i]);
-#endif
-
-      while (n_left >= 8)
+      int slow_path_needed;
+      skip_ip4_cksum =
+       rdma_device_mlx5dv_l3_validate_and_swap_bc (ptd, n_rx_packets, bc);
+      if (rd->flags & RDMA_DEVICE_F_STRIDING_RQ)
        {
-         clib_prefetch_store (b[4]);
-         vlib_buffer_copy_template (b[0], &bt);
-         n_rx_bytes += b[0]->current_length = bc[0];
-         clib_prefetch_store (b[5]);
-         vlib_buffer_copy_template (b[1], &bt);
-         n_rx_bytes += b[1]->current_length = bc[1];
-         clib_prefetch_store (b[6]);
-         vlib_buffer_copy_template (b[2], &bt);
-         n_rx_bytes += b[2]->current_length = bc[2];
-         clib_prefetch_store (b[7]);
-         vlib_buffer_copy_template (b[3], &bt);
-         n_rx_bytes += b[3]->current_length = bc[3];
-
-         /* next */
-         bc += 4;
-         b += 4;
-         n_left -= 4;
+         int n_rx_segs = 0;
+         slow_path_needed =
+           rdma_device_mlx5dv_striding_rq_parse_bc (n_rx_packets,
                                                     &n_rx_segs, bc);
+         n_rx_bytes =
+           rdma_device_mlx5dv_striding_rq_input (vm, ptd, rxq, &bt,
                                                  to_next, n_rx_segs,
                                                  &n_rx_packets, bc,
                                                  slow_path_needed);
        }
-      while (n_left)
+      else
        {
-         vlib_buffer_copy_template (b[0], &bt);
-         n_rx_bytes += b[0]->current_length = bc[0];
-
-         /* next */
-         bc++;
-         b++;
-         n_left--;
+         /*For now, legacy path doesn't support multiseg */
+         n_rx_bytes =
+           rdma_device_mlx5dv_fast_input (vm, rxq, mask, &bt, to_next,
                                           n_rx_packets, bc, ~1);
        }
+
     }
   else
-    n_rx_bytes = rdma_device_input_bufs (vm, rd, bufs, wc, n_rx_packets, &bt);
+    {
+      vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
+      vlib_buffer_copy_indices_from_ring (to_next, rxq->bufs,
                                          rxq->head & mask,
                                          rxq->size, n_rx_packets);
+      vlib_get_buffers (vm, to_next, bufs, n_rx_packets);
+      rxq->head += n_rx_packets;
+      n_rx_bytes =
+       rdma_device_input_bufs (vm, rd, bufs, wc, n_rx_packets, &bt);
 
-  rdma_device_input_ethernet (vm, node, rd, next_index, skip_ip4_cksum);
+    }
+  rdma_device_input_ethernet (vm, node, rd, next_index, skip_ip4_cksum);
   vlib_put_next_frame (vm, node, next_index, n_left_to_next - n_rx_packets);
-
-  rxq->head += n_rx_packets;
-
-  rdma_device_input_trace (vm, node, rd, n_rx_packets, to_next, next_index,
-                          ptd->cqe_flags, use_mlx5dv);
-
+  rdma_device_input_trace (vm, node, rd, n_rx_packets, to_next,
                          next_index, ptd->cqe_flags, use_mlx5dv);
   /* reset flags to zero for the next run */
   if (use_mlx5dv)
     clib_memset_u16 (ptd->cqe_flags, 0, VLIB_FRAME_SIZE);
-
-  vlib_increment_combined_counter
-    (vnm->interface_main.combined_sw_if_counters +
-     VNET_INTERFACE_COUNTER_RX, vm->thread_index,
-     rd->hw_if_index, n_rx_packets, n_rx_bytes);
-
+  vlib_increment_combined_counter (vnm->interface_main.
                                   combined_sw_if_counters +
                                   VNET_INTERFACE_COUNTER_RX,
                                   vm->thread_index, rd->hw_if_index,
                                   n_rx_packets, n_rx_bytes);
 refill:
-  rdma_device_input_refill (vm, rd, rxq, use_mlx5dv);
-
+  rdma_device_input_refill (vm, rd, rxq, use_mlx5dv,
                            ! !(rd->flags & RDMA_DEVICE_F_STRIDING_RQ));
   return n_rx_packets;
 }
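
Editor's note: the fast/slow path split above hinges on the CQE byte_count word, which in striding RQ mode packs the number of consumed strides, a filler flag, and the packet byte count. The sketch below mirrors rdma_device_mlx5dv_striding_rq_parse_bc in standalone C; the numeric mask values are assumed for illustration only (the real CQE_BC_* definitions live in the plugin's rdma.h and may differ).

#include <stdint.h>
#include <stdio.h>

/* Assumed bit layout of the striding-RQ CQE byte_count word, for
   illustration only; not taken from the patch itself. */
#define CQE_BC_BYTE_COUNT_MASK        0x0000ffffu /* bytes in this packet */
#define CQE_BC_CONSUMED_STRIDES_SHIFT 16
#define CQE_BC_CONSUMED_STRIDES_MASK  0x3fff0000u /* strides consumed by it */
#define CQE_BC_FILLER_MASK            0x80000000u /* filler CQE, no packet */

/* Mirrors the patch's parse_bc helper: returns non-zero when the multiseg
   slow path is needed, i.e. at least one packet spans more than one stride
   or a filler CQE is present. */
static int
striding_rq_parse_bc (int n_rx_packets, int *n_rx_segs, const uint32_t *bc)
{
  int filler = 0;
  for (int i = 0; i < n_rx_packets; i++)
    {
      *n_rx_segs += (bc[i] & CQE_BC_CONSUMED_STRIDES_MASK)
                    >> CQE_BC_CONSUMED_STRIDES_SHIFT;
      filler |= !!(bc[i] & CQE_BC_FILLER_MASK);
    }
  return n_rx_packets != *n_rx_segs || filler;
}

int
main (void)
{
  /* two single-stride packets and one packet spanning 3 strides */
  uint32_t bc[] = { (1u << 16) | 64, (1u << 16) | 128, (3u << 16) | 5000 };
  int n_rx_segs = 0;
  int slow = striding_rq_parse_bc (3, &n_rx_segs, bc);
  printf ("segments %d, slow path %d\n", n_rx_segs, slow); /* prints 5, 1 */
  return 0;
}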