From dc195d68456cd931260cfb5100f2ead46b63b080 Mon Sep 17 00:00:00 2001 From: Benoît Ganne Date: Wed, 3 Apr 2019 16:03:37 +0200 Subject: rdma: more batching, compile rdma-core in release mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rx: add batching for WC processing and release tx: improve batching for WC submission and processing rdma-core: compile in release mode to remove assert() Change-Id: I5fb8736db36b50f8b758cd688100477b67e72d80 Signed-off-by: Benoît Ganne --- src/plugins/rdma/input.c | 333 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 254 insertions(+), 79 deletions(-) (limited to 'src/plugins/rdma/input.c') diff --git a/src/plugins/rdma/input.c b/src/plugins/rdma/input.c index 001d1c5d493..2aa98f07e8c 100644 --- a/src/plugins/rdma/input.c +++ b/src/plugins/rdma/input.c @@ -45,108 +45,283 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd, rdma_rxq_t * rxq) { u32 n_alloc, n; - struct ibv_sge sg_entry; - struct ibv_recv_wr wr, *bad_wr; - u32 buffers[VLIB_FRAME_SIZE]; + u32 buffers[VLIB_FRAME_SIZE], *bi = buffers; + vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs; + struct ibv_recv_wr wr[VLIB_FRAME_SIZE], *w = wr; + struct ibv_sge sge[VLIB_FRAME_SIZE], *s = sge; - if (rxq->n_enq >= rxq->size) + if (PREDICT_FALSE (rxq->n_enq >= rxq->size)) return; n_alloc = clib_min (VLIB_FRAME_SIZE, rxq->size - rxq->n_enq); - n_alloc = vlib_buffer_alloc (vm, buffers, n_alloc); - - sg_entry.length = vlib_buffer_get_default_data_size (vm); - sg_entry.lkey = rd->mr->lkey; - wr.num_sge = 1; - wr.sg_list = &sg_entry; - wr.next = NULL; - for (n = 0; n < n_alloc; n++) + n_alloc = n = vlib_buffer_alloc (vm, buffers, n_alloc); + vlib_get_buffers (vm, buffers, bufs, n_alloc); + + while (n >= 4) { - vlib_buffer_t *b = vlib_get_buffer (vm, buffers[n]); - sg_entry.addr = vlib_buffer_get_va (b); - wr.wr_id = buffers[n]; - if (ibv_post_recv (rxq->qp, &wr, &bad_wr) != 0) - vlib_buffer_free (vm, buffers + n, 1); - else - rxq->n_enq++; + if (PREDICT_TRUE (n >= 8)) + { + CLIB_PREFETCH (&s[4 + 0], 4 * sizeof (s[0]), STORE); + CLIB_PREFETCH (&w[4 + 0], 4 * sizeof (w[0]), STORE); + } + + s[0].addr = vlib_buffer_get_va (b[0]); + s[0].length = vlib_buffer_get_default_data_size (vm); + s[0].lkey = rd->mr->lkey; + + s[1].addr = vlib_buffer_get_va (b[1]); + s[1].length = vlib_buffer_get_default_data_size (vm); + s[1].lkey = rd->mr->lkey; + + s[2].addr = vlib_buffer_get_va (b[2]); + s[2].length = vlib_buffer_get_default_data_size (vm); + s[2].lkey = rd->mr->lkey; + + s[3].addr = vlib_buffer_get_va (b[3]); + s[3].length = vlib_buffer_get_default_data_size (vm); + s[3].lkey = rd->mr->lkey; + + w[0].wr_id = bi[0]; + w[0].next = &w[0] + 1; + w[0].sg_list = &s[0]; + w[0].num_sge = 1; + + w[1].wr_id = bi[1]; + w[1].next = &w[1] + 1; + w[1].sg_list = &s[1]; + w[1].num_sge = 1; + + w[2].wr_id = bi[2]; + w[2].next = &w[2] + 1; + w[2].sg_list = &s[2]; + w[2].num_sge = 1; + + w[3].wr_id = bi[3]; + w[3].next = &w[3] + 1; + w[3].sg_list = &s[3]; + w[3].num_sge = 1; + + s += 4; + bi += 4; + w += 4; + b += 4; + n -= 4; } + + while (n >= 1) + { + s[0].addr = vlib_buffer_get_va (b[0]); + s[0].length = vlib_buffer_get_default_data_size (vm); + s[0].lkey = rd->mr->lkey; + + w[0].wr_id = bi[0]; + w[0].next = &w[0] + 1; + w[0].sg_list = &s[0]; + w[0].num_sge = 1; + + s += 1; + bi += 1; + w += 1; + b += 1; + n -= 1; + } + + w[-1].next = 0; /* fix next pointer in WR linked-list last item */ + + w = wr; + ibv_post_recv (rxq->qp, wr, &w); + n = wr == w ? n_alloc : (uintptr_t) (w - wr); + + if (PREDICT_FALSE (n != n_alloc)) + vlib_buffer_free (vm, buffers + n, n_alloc - n); + + rxq->n_enq += n; } -static_always_inline uword -rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, - vlib_frame_t * frame, rdma_device_t * rd, u16 qid) +static_always_inline void +rdma_device_input_trace (vlib_main_t * vm, vlib_node_runtime_t * node, + const rdma_device_t * rd, u32 n_left, const u32 * bi) { - vnet_main_t *vnm = vnet_get_main (); - rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, qid); - u32 n_trace; - struct ibv_wc wc[VLIB_FRAME_SIZE]; - u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; - u32 *bi, *to_next, n_left_to_next; - int i; - u32 n_rx_packets = 0, n_rx_bytes = 0; + u32 n_trace, i; - n_rx_packets = ibv_poll_cq (rxq->cq, VLIB_FRAME_SIZE, wc); + if (PREDICT_TRUE (0 == (n_trace = vlib_get_trace_count (vm, node)))) + return; + + i = 0; + while (n_trace && n_left) + { + vlib_buffer_t *b; + rdma_input_trace_t *tr; + b = vlib_get_buffer (vm, bi[0]); + vlib_trace_buffer (vm, node, rd->per_interface_next_index, b, + /* follow_chain */ 0); + tr = vlib_add_trace (vm, node, b, sizeof (*tr)); + tr->next_index = rd->per_interface_next_index; + tr->hw_if_index = rd->hw_if_index; + + /* next */ + n_trace--; + n_left--; + bi++; + i++; + } + vlib_set_trace_count (vm, node, n_trace); +} + +static_always_inline void +rdma_device_input_ethernet (vlib_main_t * vm, vlib_node_runtime_t * node, + const rdma_device_t * rd) +{ + vlib_next_frame_t *nf; + vlib_frame_t *f; + ethernet_input_frame_t *ef; - if (n_rx_packets <= 0) - rdma_device_input_refill (vm, rd, rxq); + if (PREDICT_FALSE + (VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT != rd->per_interface_next_index)) + return; - if (PREDICT_FALSE (rd->per_interface_next_index != ~0)) - next_index = rd->per_interface_next_index; + nf = + vlib_node_runtime_get_next_frame (vm, node, rd->per_interface_next_index); + f = vlib_get_frame (vm, nf->frame_index); + f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX; + /* FIXME: f->flags |= ETH_INPUT_FRAME_F_IP4_CKSUM_OK; */ - vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next); + ef = vlib_frame_scalar_args (f); + ef->sw_if_index = rd->sw_if_index; + ef->hw_if_index = rd->hw_if_index; +} - for (i = 0; i < n_rx_packets; i++) +static_always_inline u32 +rdma_device_input_load_wc (u32 n_left_from, struct ibv_wc * wc, u32 * to_next, + u32 * bufsz) +{ + u32 n_rx_bytes[4] = { 0 }; + + while (n_left_from >= 4) { - u32 bi = wc[i].wr_id; - vlib_buffer_t *b = vlib_get_buffer (vm, bi); - b->current_length = wc[i].byte_len; - vnet_buffer (b)->sw_if_index[VLIB_RX] = rd->sw_if_index; - vnet_buffer (b)->sw_if_index[VLIB_TX] = ~0; - to_next[i] = bi; - n_rx_bytes += wc[i].byte_len; + if (PREDICT_TRUE (n_left_from >= 8)) + { + CLIB_PREFETCH (&wc[4 + 0], CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (&wc[4 + 1], CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (&wc[4 + 2], CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (&wc[4 + 3], CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (&bufsz[4 + 0], 4 * sizeof (bufsz[0]), STORE); + CLIB_PREFETCH (&to_next[4 + 0], 4 * sizeof (to_next[0]), STORE); + } + + to_next[0] = wc[0].wr_id; + to_next[1] = wc[1].wr_id; + to_next[2] = wc[2].wr_id; + to_next[3] = wc[3].wr_id; + + bufsz[0] = wc[0].byte_len; + bufsz[1] = wc[1].byte_len; + bufsz[2] = wc[2].byte_len; + bufsz[3] = wc[3].byte_len; + + n_rx_bytes[0] += wc[0].byte_len; + n_rx_bytes[1] += wc[1].byte_len; + n_rx_bytes[2] += wc[2].byte_len; + n_rx_bytes[3] += wc[3].byte_len; + + wc += 4; + to_next += 4; + bufsz += 4; + n_left_from -= 4; } - if (PREDICT_FALSE ((n_trace = vlib_get_trace_count (vm, node)))) + while (n_left_from >= 1) { - u32 n_left = n_rx_packets, i = 0; - bi = to_next; + to_next[0] = wc[0].wr_id; + bufsz[0] = wc[0].byte_len; + n_rx_bytes[0] += wc[0].byte_len; - while (n_trace && n_left) + wc += 1; + to_next += 1; + bufsz += 1; + n_left_from -= 1; + } + + return n_rx_bytes[0] + n_rx_bytes[1] + n_rx_bytes[2] + n_rx_bytes[3]; +} + +static_always_inline void +rdma_device_input_bufs_init (u32 n_left_from, vlib_buffer_t ** bufs, + u32 * bufsz, u32 sw_if_index) +{ + while (n_left_from >= 4) + { + if (PREDICT_TRUE (n_left_from >= 8)) { - vlib_buffer_t *b; - rdma_input_trace_t *tr; - b = vlib_get_buffer (vm, bi[0]); - vlib_trace_buffer (vm, node, next_index, b, /* follow_chain */ 0); - tr = vlib_add_trace (vm, node, b, sizeof (*tr)); - tr->next_index = next_index; - tr->hw_if_index = rd->hw_if_index; - - /* next */ - n_trace--; - n_left--; - bi++; - i++; + vlib_prefetch_buffer_header (bufs[4 + 0], STORE); + vlib_prefetch_buffer_header (bufs[4 + 1], STORE); + vlib_prefetch_buffer_header (bufs[4 + 2], STORE); + vlib_prefetch_buffer_header (bufs[4 + 3], STORE); + CLIB_PREFETCH (&bufsz[4 + 0], 4 * sizeof (bufsz[0]), LOAD); } - vlib_set_trace_count (vm, node, n_trace); + + bufs[0]->current_length = bufsz[0]; + bufs[1]->current_length = bufsz[1]; + bufs[2]->current_length = bufsz[2]; + bufs[3]->current_length = bufsz[3]; + + vnet_buffer (bufs[0])->sw_if_index[VLIB_RX] = sw_if_index; + vnet_buffer (bufs[1])->sw_if_index[VLIB_RX] = sw_if_index; + vnet_buffer (bufs[2])->sw_if_index[VLIB_RX] = sw_if_index; + vnet_buffer (bufs[3])->sw_if_index[VLIB_RX] = sw_if_index; + + vnet_buffer (bufs[0])->sw_if_index[VLIB_TX] = ~0; + vnet_buffer (bufs[1])->sw_if_index[VLIB_TX] = ~0; + vnet_buffer (bufs[2])->sw_if_index[VLIB_TX] = ~0; + vnet_buffer (bufs[3])->sw_if_index[VLIB_TX] = ~0; + + bufs += 4; + bufsz += 4; + n_left_from -= 4; } - if (PREDICT_TRUE (next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT)) + while (n_left_from >= 1) { - vlib_next_frame_t *nf; - vlib_frame_t *f; - ethernet_input_frame_t *ef; - nf = vlib_node_runtime_get_next_frame (vm, node, next_index); - f = vlib_get_frame (vm, nf->frame_index); - f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX; - - ef = vlib_frame_scalar_args (f); - ef->sw_if_index = rd->sw_if_index; - ef->hw_if_index = rd->hw_if_index; - //f->flags |= ETH_INPUT_FRAME_F_IP4_CKSUM_OK; + bufs[0]->current_length = bufsz[0]; + vnet_buffer (bufs[0])->sw_if_index[VLIB_RX] = sw_if_index; + vnet_buffer (bufs[0])->sw_if_index[VLIB_TX] = ~0; + + bufs += 1; + bufsz += 1; + n_left_from -= 1; } +} - n_left_to_next -= n_rx_packets; - vlib_put_next_frame (vm, node, next_index, n_left_to_next); +static_always_inline uword +rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame, rdma_device_t * rd, u16 qid) +{ + vnet_main_t *vnm = vnet_get_main (); + rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, qid); + struct ibv_wc wc[VLIB_FRAME_SIZE]; + u32 bufsz[VLIB_FRAME_SIZE]; + vlib_buffer_t *bufs[VLIB_FRAME_SIZE]; + u32 *to_next, n_left_to_next; + u32 n_rx_packets, n_rx_bytes; + + n_rx_packets = ibv_poll_cq (rxq->cq, VLIB_FRAME_SIZE, wc); + + if (PREDICT_FALSE (n_rx_packets <= 0)) + { + rdma_device_input_refill (vm, rd, rxq); + return 0; + } + + vlib_get_new_next_frame (vm, node, rd->per_interface_next_index, to_next, + n_left_to_next); + n_rx_bytes = rdma_device_input_load_wc (n_rx_packets, wc, to_next, bufsz); + vlib_get_buffers (vm, to_next, bufs, n_rx_packets); + rdma_device_input_bufs_init (n_rx_packets, bufs, bufsz, rd->sw_if_index); + rdma_device_input_trace (vm, node, rd, n_rx_packets, to_next); + rdma_device_input_ethernet (vm, node, rd); + + vlib_put_next_frame (vm, node, rd->per_interface_next_index, + n_left_to_next - n_rx_packets); vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + @@ -154,6 +329,7 @@ rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, rd->hw_if_index, n_rx_packets, n_rx_bytes); rxq->n_enq -= n_rx_packets; + rdma_device_input_refill (vm, rd, rxq); return n_rx_packets; @@ -172,9 +348,8 @@ VLIB_NODE_FN (rdma_input_node) (vlib_main_t * vm, { rdma_device_t *rd; rd = vec_elt_at_index (rm->devices, dq->dev_instance); - if ((rd->flags & RDMA_DEVICE_F_ADMIN_UP) == 0) - continue; - n_rx += rdma_device_input_inline (vm, node, frame, rd, dq->queue_id); + if (PREDICT_TRUE (rd->flags & RDMA_DEVICE_F_ADMIN_UP)) + n_rx += rdma_device_input_inline (vm, node, frame, rd, dq->queue_id); } return n_rx; } -- cgit 1.2.3-korg