diff options
author | Mohammed Hawari <mohammed@hawari.fr> | 2020-10-27 17:42:32 +0100 |
---|---|---|
committer | Benoît Ganne <bganne@cisco.com> | 2020-11-13 16:58:18 +0000 |
commit | 798267aaa218b99daab4860fd630b9d4bb744c0d (patch) | |
tree | b7a628146d4858b9ae4a232ecd9f501085eed976 | |
parent | b8e129314aaf43a40e9bf15947578181cc53d675 (diff) |
rdma: implement multiseg rx without striding rq
Change-Id: I623617ad3c80610805dd3cf2a5f371e6677f4844
Signed-off-by: Mohammed Hawari <mohammed@hawari.fr>
Type: improvement
-rw-r--r-- | src/plugins/rdma/api.c | 34 | ||||
-rw-r--r-- | src/plugins/rdma/cli.c | 4 | ||||
-rw-r--r-- | src/plugins/rdma/device.c | 107 | ||||
-rw-r--r-- | src/plugins/rdma/input.c | 247 | ||||
-rw-r--r-- | src/plugins/rdma/rdma.api | 46 | ||||
-rw-r--r-- | src/plugins/rdma/rdma.h | 41 | ||||
-rw-r--r-- | src/plugins/rdma/test_api.c | 48 | ||||
-rw-r--r-- | src/plugins/rdma/unformat.c | 8 |
8 files changed, 470 insertions, 65 deletions
diff --git a/src/plugins/rdma/api.c b/src/plugins/rdma/api.c index fe1103f99fe..06e7385136f 100644 --- a/src/plugins/rdma/api.c +++ b/src/plugins/rdma/api.c @@ -45,6 +45,37 @@ rdma_api_mode (vl_api_rdma_mode_t mode) } static void +vl_api_rdma_create_v2_t_handler (vl_api_rdma_create_v2_t * mp) +{ + vlib_main_t *vm = vlib_get_main (); + rdma_main_t *rm = &rdma_main; + vl_api_rdma_create_v2_reply_t *rmp; + rdma_create_if_args_t args; + int rv; + + clib_memset (&args, 0, sizeof (rdma_create_if_args_t)); + + args.ifname = mp->host_if; + args.name = mp->name; + args.rxq_num = ntohs (mp->rxq_num); + args.rxq_size = ntohs (mp->rxq_size); + args.txq_size = ntohs (mp->txq_size); + args.mode = rdma_api_mode (mp->mode); + args.disable_striding_rq = 0; + args.no_multi_seg = mp->no_multi_seg; + args.max_pktlen = ntohs (mp->max_pktlen); + rdma_create_if (vm, &args); + rv = args.rv; + + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_RDMA_CREATE_V2_REPLY + rm->msg_id_base, + ({ + rmp->sw_if_index = ntohl (args.sw_if_index); + })); + /* *INDENT-ON* */ +} + +static void vl_api_rdma_create_t_handler (vl_api_rdma_create_t * mp) { vlib_main_t *vm = vlib_get_main (); @@ -61,6 +92,9 @@ vl_api_rdma_create_t_handler (vl_api_rdma_create_t * mp) args.rxq_size = ntohs (mp->rxq_size); args.txq_size = ntohs (mp->txq_size); args.mode = rdma_api_mode (mp->mode); + args.disable_striding_rq = 0; + args.no_multi_seg = 1; + args.max_pktlen = 0; rdma_create_if (vm, &args); rv = args.rv; diff --git a/src/plugins/rdma/cli.c b/src/plugins/rdma/cli.c index 918cd77387b..8f191e34b63 100644 --- a/src/plugins/rdma/cli.c +++ b/src/plugins/rdma/cli.c @@ -49,7 +49,9 @@ VLIB_CLI_COMMAND (rdma_create_command, static) = { .path = "create interface rdma", .short_help = "create interface rdma <host-if ifname> [name <name>]" " [rx-queue-size <size>] [tx-queue-size <size>]" - " [num-rx-queues <size>] [mode <auto|ibv|dv]", + " [num-rx-queues <size>] [mode <auto|ibv|dv]" + " [no-multi-seg] [no-striding]" + " [max-pktlen 
<size>]", .function = rdma_create_command_fn, }; /* *INDENT-ON* */ diff --git a/src/plugins/rdma/device.c b/src/plugins/rdma/device.c index 9b6fda982ca..043232d0118 100644 --- a/src/plugins/rdma/device.c +++ b/src/plugins/rdma/device.c @@ -419,7 +419,8 @@ rdma_dev_cleanup (rdma_device_t * rd) } static clib_error_t * -rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) +rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc, + u8 no_multi_seg, u16 max_pktlen) { rdma_rxq_t *rxq; struct ibv_wq_init_attr wqia; @@ -427,17 +428,18 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) struct ibv_wq_attr wqa; struct ibv_cq_ex *cqex; struct mlx5dv_wq_init_attr dv_wqia = { }; + int is_mlx5dv = ! !(rd->flags & RDMA_DEVICE_F_MLX5DV); + int is_striding = ! !(rd->flags & RDMA_DEVICE_F_STRIDING_RQ); vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES); rxq = vec_elt_at_index (rd->rxqs, qid); rxq->size = n_desc; rxq->log_wqe_sz = 0; - rxq->log_stride_per_wqe = 0; rxq->buf_sz = vlib_buffer_get_default_data_size (vm); vec_validate_aligned (rxq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES); cqa.cqe = n_desc; - if (rd->flags & RDMA_DEVICE_F_MLX5DV) + if (is_mlx5dv) { struct mlx5dv_cq_init_attr dvcq = { }; dvcq.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE; @@ -460,14 +462,14 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) wqia.max_sge = 1; wqia.pd = rd->pd; wqia.cq = rxq->cq; - if (rd->flags & RDMA_DEVICE_F_MLX5DV) + if (is_mlx5dv) { - if (rd->flags & RDMA_DEVICE_F_STRIDING_RQ) + if (is_striding) { /* In STRIDING_RQ mode, map a descriptor to a stride, not a full WQE buffer */ uword data_seg_log2_sz = min_log2 (vlib_buffer_get_default_data_size (vm)); - + rxq->buf_sz = 1 << data_seg_log2_sz; /* The trick is also to map a descriptor to a data segment in the WQE SG list The number of strides per WQE and the size of a WQE (in 16-bytes words) both must be powers of two. 
@@ -478,24 +480,44 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) a stride, and a vlib_buffer) - RDMA_RXQ_MAX_CHAIN_SZ-1 null data segments */ - - wqia.max_sge = RDMA_RXQ_MAX_CHAIN_SZ; + int max_chain_log_sz = + max_pktlen ? max_log2 ((max_pktlen / + (rxq->buf_sz)) + + 1) : RDMA_RXQ_MAX_CHAIN_LOG_SZ; + max_chain_log_sz = clib_max (max_chain_log_sz, 3); + wqia.max_sge = 1 << max_chain_log_sz; dv_wqia.comp_mask = MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ; dv_wqia.striding_rq_attrs.two_byte_shift_en = 0; dv_wqia.striding_rq_attrs.single_wqe_log_num_of_strides = - RDMA_RXQ_MAX_CHAIN_LOG_SZ; + max_chain_log_sz; dv_wqia.striding_rq_attrs.single_stride_log_num_of_bytes = data_seg_log2_sz; - wqia.max_wr >>= RDMA_RXQ_MAX_CHAIN_LOG_SZ; - rxq->log_wqe_sz = RDMA_RXQ_MAX_CHAIN_LOG_SZ + 1; - rxq->log_stride_per_wqe = RDMA_RXQ_MAX_CHAIN_LOG_SZ; - rxq->buf_sz = 1 << data_seg_log2_sz; + wqia.max_wr >>= max_chain_log_sz; + rxq->log_wqe_sz = max_chain_log_sz + 1; + rxq->log_stride_per_wqe = max_chain_log_sz; } else { - /* For now, in non STRIDING_RQ mode, SG operations/chained buffers - are not supported */ - wqia.max_sge = 1; + /* In non STRIDING_RQ mode and if multiseg is not disabled, each WQE is a SG list of data + segments, each pointing to a vlib_buffer. */ + if (no_multi_seg) + { + wqia.max_sge = 1; + rxq->log_wqe_sz = 0; + rxq->n_ds_per_wqe = 1; + } + else + { + int max_chain_sz = + max_pktlen ? 
(max_pktlen / + (rxq->buf_sz)) + + 1 : RDMA_RXQ_LEGACY_MODE_MAX_CHAIN_SZ; + int max_chain_log_sz = max_log2 (max_chain_sz); + wqia.max_sge = 1 << max_chain_log_sz; + rxq->log_wqe_sz = max_chain_log_sz; + rxq->n_ds_per_wqe = max_chain_sz; + } + } if ((rxq->wq = mlx5dv_create_wq (rd->ctx, &wqia, &dv_wqia))) @@ -516,13 +538,14 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) if (ibv_modify_wq (rxq->wq, &wqa) != 0) return clib_error_return_unix (0, "Modify WQ (RDY) Failed"); - if (rd->flags & RDMA_DEVICE_F_MLX5DV) + if (is_mlx5dv) { struct mlx5dv_obj obj = { }; struct mlx5dv_cq dv_cq; struct mlx5dv_rwq dv_rwq; u64 qw0; u64 qw0_nullseg; + u32 wqe_sz_mask = (1 << rxq->log_wqe_sz) - 1; obj.cq.in = rxq->cq; obj.cq.out = &dv_cq; @@ -550,19 +573,36 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) qw0 |= (u64) clib_host_to_net_u32 (rd->lkey) << 32; qw0_nullseg |= (u64) clib_host_to_net_u32 (rd->lkey) << 32; -/* Prefill the different 16 bytes words of the WQ. If not in striding RQ mode, - init with qw0 only with segments of rxq->buf_sz. Otherwise, for each WQE, the - RDMA_RXQ_MAX_CHAIN_SZ + 1 first 16-bytes words are initialised with qw0, the rest - are null segments */ +/* Prefill the different 16 bytes words of the WQ. + - If not in striding RQ mode, for each WQE, init with qw0 the first + RDMA_RXQ_LEGACY_MODE_MAX_CHAIN_SZ, and init the rest of the WQE + with null segments. 
+ - If in striding RQ mode, for each WQE, the RDMA_RXQ_MAX_CHAIN_SZ + 1 + first 16-bytes words are initialised with qw0, the rest are null segments */ + for (int i = 0; i < rxq->wqe_cnt << rxq->log_wqe_sz; i++) - if (!(rd->flags & RDMA_DEVICE_F_STRIDING_RQ) - || (i == 0) || !(((i - 1) >> rxq->log_stride_per_wqe) & 0x1)) + if ((!is_striding + && ((i & wqe_sz_mask) < rxq->n_ds_per_wqe)) + || (is_striding + && ((i == 0) + || !(((i - 1) >> rxq->log_stride_per_wqe) & 0x1)))) rxq->wqes[i].dsz_and_lkey = qw0; else rxq->wqes[i].dsz_and_lkey = qw0_nullseg; for (int i = 0; i < (1 << rxq->log2_cq_size); i++) rxq->cqes[i].opcode_cqefmt_se_owner = 0xff; + + if (!is_striding) + { + vec_validate_aligned (rxq->second_bufs, n_desc - 1, + CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (rxq->n_used_per_chain, n_desc - 1, + CLIB_CACHE_LINE_BYTES); + rxq->n_total_additional_segs = n_desc * (rxq->n_ds_per_wqe - 1); + for (int i = 0; i < n_desc; i++) + rxq->n_used_per_chain[i] = rxq->n_ds_per_wqe - 1; + } } return 0; @@ -719,12 +759,15 @@ rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) } static clib_error_t * -rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size, - u32 txq_size, u32 rxq_num) +rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, + rdma_create_if_args_t * args) { clib_error_t *err; vlib_buffer_main_t *bm = vm->buffer_main; vlib_thread_main_t *tm = vlib_get_thread_main (); + u32 rxq_num = args->rxq_num; + u32 rxq_size = args->rxq_size; + u32 txq_size = args->txq_size; u32 i; if (rd->ctx == 0) @@ -758,7 +801,9 @@ rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size, return err; for (i = 0; i < rxq_num; i++) - if ((err = rdma_rxq_init (vm, rd, i, rxq_size))) + if ((err = + rdma_rxq_init (vm, rd, i, rxq_size, + args->no_multi_seg, args->max_pktlen))) return err; if ((err = rdma_rxq_finalize (vm, rd))) return err; @@ -799,7 +844,7 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) args->rxq_size = 
args->rxq_size ? args->rxq_size : 1024; args->txq_size = args->txq_size ? args->txq_size : 1024; - args->rxq_num = args->rxq_num ? args->rxq_num : 1; + args->rxq_num = args->rxq_num ? args->rxq_num : 2; if (!is_pow2 (args->rxq_num)) { @@ -896,7 +941,10 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) if ((mlx5dv_attrs.flags & MLX5DV_CONTEXT_FLAGS_CQE_V1)) rd->flags |= RDMA_DEVICE_F_MLX5DV; - if (data_seg_log2_sz <= +/* Enable striding RQ if neither multiseg nor striding rq +are explicitly disabled, and if the interface supports it.*/ + if (!args->no_multi_seg && !args->disable_striding_rq + && data_seg_log2_sz <= mlx5dv_attrs.striding_rq_caps.max_single_stride_log_num_of_bytes && data_seg_log2_sz >= mlx5dv_attrs.striding_rq_caps.min_single_stride_log_num_of_bytes @@ -917,8 +965,7 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args) } } - if ((args->error = rdma_dev_init (vm, rd, args->rxq_size, args->txq_size, - args->rxq_num))) + if ((args->error = rdma_dev_init (vm, rd, args))) goto err2; if ((args->error = rdma_register_interface (vnm, rd))) diff --git a/src/plugins/rdma/input.c b/src/plugins/rdma/input.c index 52e921dd172..d7fbe96bc85 100644 --- a/src/plugins/rdma/input.c +++ b/src/plugins/rdma/input.c @@ -53,21 +53,106 @@ ibv_set_recv_wr_and_sge (struct ibv_recv_wr *w, struct ibv_sge *s, u64 va, w[0].num_sge = 1; } +static_always_inline u32 +rdma_device_legacy_input_refill_additional (vlib_main_t * vm, + rdma_device_t * rd, + rdma_rxq_t * rxq, + rdma_per_thread_data_t * ptd, + vlib_buffer_t * bt, + u32 first_slot, u32 n_alloc) +{ + int i; + u8 log_wqe_sz = rxq->log_wqe_sz; + u32 *bi = ptd->tmp_bi; + vlib_buffer_t **bufs = ptd->tmp_bufs; + + for (i = 0; i < n_alloc; i++) + { + u8 chain_sz = rxq->n_used_per_chain[first_slot + i]; + u8 chain_sz_alloc; + mlx5dv_wqe_ds_t *current_wqe = + rxq->wqes + ((first_slot + i) << log_wqe_sz); + if (chain_sz == 0) + continue; + if (PREDICT_FALSE ((chain_sz_alloc = + vlib_buffer_alloc_from_pool 
(vm, bi, chain_sz, + rd->pool)) != + chain_sz)) + { + vlib_buffer_free (vm, bi, chain_sz_alloc); + break; + } + /*Build the chain */ + vlib_get_buffers (vm, bi, bufs, chain_sz); + for (int j = 0; j < chain_sz - 1; j++) + { + vlib_buffer_copy_template (bufs[j], bt); + bufs[j]->next_buffer = bi[j + 1]; + bufs[j]->flags |= VLIB_BUFFER_NEXT_PRESENT; + } + /* The chain starting at the second buffer is pre-initialised */ + vlib_buffer_copy_template (bufs[chain_sz - 1], bt); + /* Stick with the already existing chain */ + if (chain_sz < rxq->n_ds_per_wqe - 1) + { + bufs[chain_sz - 1]->next_buffer = rxq->second_bufs[first_slot + i]; + bufs[chain_sz - 1]->flags |= VLIB_BUFFER_NEXT_PRESENT; + } + else + { + bufs[chain_sz - 1]->flags &= ~VLIB_BUFFER_NEXT_PRESENT; + } + + /* Update the wqes */ + for (int j = 0; j < chain_sz; j++) + { + u64 addr; + vlib_get_buffers_with_offset (vm, bi + j, + (void *) &addr, 1, + sizeof (vlib_buffer_t)); + current_wqe[j + 1].addr = clib_host_to_net_u64 (addr); + } + rxq->n_used_per_chain[first_slot + i] = 0; + rxq->n_total_additional_segs -= chain_sz; + rxq->second_bufs[first_slot + i] = bi[0]; + } + return i; +} + static_always_inline void rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd, - rdma_rxq_t * rxq, int is_mlx5dv, int is_striding) + rdma_rxq_t * rxq, vlib_buffer_t * bt, + const int is_mlx5dv, const int is_striding) { u32 n_alloc, n; u16 ring_space; struct ibv_recv_wr wr[VLIB_FRAME_SIZE], *w = wr; struct ibv_sge sge[VLIB_FRAME_SIZE], *s = sge; + rdma_per_thread_data_t *ptd = + &rdma_main.per_thread_data[vlib_get_thread_index ()]; u32 mask = rxq->size - 1; u32 slot = rxq->tail & mask; u32 *bufs = rxq->bufs + slot; u32 data_size = rxq->buf_sz; u32 lkey = rd->lkey; - int log_stride_per_wqe = rxq->log_stride_per_wqe; - int log_wqe_sz = rxq->log_wqe_sz; + const int log_stride_per_wqe = is_striding ? 
rxq->log_stride_per_wqe : 0; + const int log_wqe_sz = rxq->log_wqe_sz; + + /*In legacy mode, maybe some buffers chains are incomplete? */ + if (PREDICT_FALSE + (is_mlx5dv && !is_striding && (rxq->incomplete_tail != rxq->tail))) + { + int n_incomplete = rxq->incomplete_tail - rxq->tail; + int n_completed = + rdma_device_legacy_input_refill_additional (vm, rd, rxq, ptd, bt, + slot, + n_incomplete); + rxq->tail += n_completed; + slot = rxq->tail & mask; + /* Don't start recycling head buffers if there are incomplete chains */ + if (n_completed != n_incomplete) + return; + } /* refilled buffers must be a multiple of 8 and of strides per WQE */ u32 alloc_multiple = 1 << (clib_max (3, log_stride_per_wqe)); @@ -115,12 +200,16 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd, /* slot does not necessarily correspond to the slot in the wqes ring (in 16B words) */ u32 wqes_slot = slot << (log_wqe_sz - log_stride_per_wqe); - u32 wqe_cnt = rxq->wqe_cnt; + const u32 wqe_cnt = rxq->wqe_cnt; mlx5dv_wqe_ds_t *wqe = rxq->wqes + wqes_slot; - int wqe_sz = 1 << log_wqe_sz; - int stride_per_wqe = 1 << log_stride_per_wqe; + const int wqe_sz = 1 << log_wqe_sz; + const int stride_per_wqe = 1 << log_stride_per_wqe; int current_data_seg = 0; + /* In legacy mode, this function only refills head descriptors for each + WQE, so RDMA_RXQ_MAX_CHAIN_SZ-1 data segments are skipped per WQE */ + const int log_skip_wqe = is_striding ? 0 : log_wqe_sz; + while (n >= 1) { vlib_get_buffers_with_offset (vm, rxq->bufs + slot, (void **) va, 8, @@ -149,25 +238,38 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd, }; /* *INDENT-ON* */ + /* TODO: when log_skip_wqe > 2, hw_prefetcher doesn't work, lots of LLC store + misses occur for wqes, to be fixed... 
*/ if (!is_striding || !(current_data_seg & ~(stride_per_wqe - 1))) { - wqe[0 + is_striding].addr = va[0]; - wqe[1 + is_striding].addr = va[1]; - wqe[2 + is_striding].addr = va[2]; - wqe[3 + is_striding].addr = va[3]; - wqe[4 + is_striding].addr = va[4]; - wqe[5 + is_striding].addr = va[5]; - wqe[6 + is_striding].addr = va[6]; - wqe[7 + is_striding].addr = va[7]; + wqe[(0 << log_skip_wqe) + is_striding].addr = va[0]; + wqe[(1 << log_skip_wqe) + is_striding].addr = va[1]; + wqe[(2 << log_skip_wqe) + is_striding].addr = va[2]; + wqe[(3 << log_skip_wqe) + is_striding].addr = va[3]; + wqe[(4 << log_skip_wqe) + is_striding].addr = va[4]; + wqe[(5 << log_skip_wqe) + is_striding].addr = va[5]; + wqe[(6 << log_skip_wqe) + is_striding].addr = va[6]; + wqe[(7 << log_skip_wqe) + is_striding].addr = va[7]; slot += 8; n -= 8; } - wqe += 8; - wqes_slot += 8; + wqe += 8 << log_skip_wqe; + wqes_slot += 8 << log_skip_wqe; current_data_seg += 8; current_data_seg &= wqe_sz - 1; } + /* In legacy mode, there is some work required to finish building the SG lists */ + if (!is_striding) + { + int first_slot = slot - n_alloc; + rxq->incomplete_tail += n_alloc; + if (PREDICT_FALSE (rxq->n_total_additional_segs)) + n_alloc = + rdma_device_legacy_input_refill_additional (vm, rd, rxq, ptd, + bt, first_slot, + n_alloc); + } CLIB_MEMORY_STORE_BARRIER (); rxq->tail += n_alloc; if (is_striding) @@ -536,6 +638,33 @@ rdma_device_mlx5dv_striding_rq_parse_bc (int n_rx_packets, int *n_rx_segs, } static_always_inline int +rdma_device_mlx5dv_legacy_rq_slow_path_needed (u32 buf_sz, int n_rx_packets, + u32 * bc) +{ +#if defined CLIB_HAVE_VEC256 + u32x8 thresh8 = u32x8_splat (buf_sz); + for (int i = 0; i < n_rx_packets; i += 8) + if (!u32x8_is_all_zero (*(u32x8 *) (bc + i) > thresh8)) + return 1; +#elif defined CLIB_HAVE_VEC128 + u32x4 thresh4 = u32x4_splat (buf_sz); + for (int i = 0; i < n_rx_packets; i += 4) + if (!u32x4_is_all_zero (*(u32x4 *) (bc + i) > thresh4)) + return 1; +#else + while 
(n_rx_packets) + { + if (*bc > buf_sz) + return 1; + bc++; + n_rx_packets--; + } +#endif + + return 0; +} + +static_always_inline int rdma_device_mlx5dv_l3_validate_and_swap_bc (rdma_per_thread_data_t * ptd, int n_rx_packets, u32 * bc) { @@ -585,11 +714,11 @@ rdma_device_mlx5dv_l3_validate_and_swap_bc (rdma_per_thread_data_t static_always_inline u32 rdma_device_mlx5dv_fast_input (vlib_main_t * vm, rdma_rxq_t * rxq, + vlib_buffer_t ** bufs, u32 qs_mask, vlib_buffer_t * bt, u32 * to_next, u32 n_rx_segs, u32 * bc, u32 bc_mask) { - vlib_buffer_t *bufs[VLIB_FRAME_SIZE]; vlib_buffer_t **b = bufs; u32 n_left = n_rx_segs; u32 n_rx_bytes = 0; @@ -629,6 +758,59 @@ rdma_device_mlx5dv_fast_input (vlib_main_t * vm, rdma_rxq_t * rxq, return n_rx_bytes; } +static_always_inline void +rdma_device_mlx5dv_legacy_rq_fix_chains (vlib_main_t * vm, rdma_rxq_t * rxq, + vlib_buffer_t ** bufs, u32 qs_mask, + u32 n) +{ + u32 buf_sz = rxq->buf_sz; + uword slot = (rxq->head - n) & qs_mask; + u32 *second = &rxq->second_bufs[slot]; + u32 n_wrap_around = (slot + n) & (qs_mask + 1) ? 
(slot + n) & qs_mask : 0; + u8 *n_used_per_chain = &rxq->n_used_per_chain[slot]; + n -= n_wrap_around; +wrap_around: + while (n > 0) + { + u16 total_length = bufs[0]->current_length; + if (total_length > buf_sz) + { + vlib_buffer_t *current_buf = bufs[0]; + u8 current_chain_sz = 0; + current_buf->current_length = buf_sz; + total_length -= buf_sz; + current_buf->total_length_not_including_first_buffer = total_length; + current_buf->flags |= VLIB_BUFFER_NEXT_PRESENT; + current_buf->next_buffer = second[0]; + do + { + current_buf = vlib_get_buffer (vm, current_buf->next_buffer); + current_buf->current_length = clib_min (buf_sz, total_length); + total_length -= current_buf->current_length; + current_chain_sz++; + } + while (total_length > 0); + current_buf->flags &= ~VLIB_BUFFER_NEXT_PRESENT; + second[0] = current_buf->next_buffer; + current_buf->next_buffer = 0; + rxq->n_total_additional_segs += current_chain_sz; + n_used_per_chain[0] = current_chain_sz; + } + bufs++; + second++; + n_used_per_chain++; + n--; + } + if (PREDICT_FALSE (n_wrap_around)) + { + n = n_wrap_around; + n_wrap_around = 0; + second = rxq->second_bufs; + n_used_per_chain = rxq->n_used_per_chain; + goto wrap_around; + } +} + static_always_inline u32 rdma_device_mlx5dv_striding_rq_input (vlib_main_t * vm, rdma_per_thread_data_t * ptd, @@ -641,8 +823,9 @@ rdma_device_mlx5dv_striding_rq_input (vlib_main_t * vm, u32 n_rx_bytes = 0; if (PREDICT_TRUE (!slow_path_needed)) { + vlib_buffer_t *bufs[VLIB_FRAME_SIZE]; n_rx_bytes += - rdma_device_mlx5dv_fast_input (vm, rxq, mask, bt, to_next, + rdma_device_mlx5dv_fast_input (vm, rxq, bufs, mask, bt, to_next, n_rx_segs, bc, CQE_BC_BYTE_COUNT_MASK); } else /* Slow path with multiseg */ @@ -729,7 +912,7 @@ rdma_device_mlx5dv_striding_rq_input (vlib_main_t * vm, static_always_inline uword rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, rdma_device_t * rd, - u16 qid, int use_mlx5dv) + u16 qid, const int use_mlx5dv) { 
rdma_main_t *rm = &rdma_main; vnet_main_t *vnm = vnet_get_main (); @@ -742,6 +925,7 @@ rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 next_index, *to_next, n_left_to_next, n_rx_bytes = 0; int n_rx_packets, skip_ip4_cksum = 0; u32 mask = rxq->size - 1; + const int is_striding = ! !(rd->flags & RDMA_DEVICE_F_STRIDING_RQ); if (use_mlx5dv) n_rx_packets = rdma_device_poll_cq_mlx5dv (rd, rxq, byte_cnts, @@ -749,14 +933,14 @@ rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, else n_rx_packets = ibv_poll_cq (rxq->cq, VLIB_FRAME_SIZE, wc); - if (PREDICT_FALSE (n_rx_packets <= 0)) - goto refill; - /* init buffer template */ vlib_buffer_copy_template (&bt, &ptd->buffer_template); vnet_buffer (&bt)->sw_if_index[VLIB_RX] = rd->sw_if_index; bt.buffer_pool_index = rd->pool; + if (PREDICT_FALSE (n_rx_packets <= 0)) + goto refill; + /* update buffer template for input feature arcs if any */ next_index = rd->per_interface_next_index; if (PREDICT_FALSE (vnet_device_input_have_features (rd->sw_if_index))) @@ -770,7 +954,7 @@ rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, int slow_path_needed; skip_ip4_cksum = rdma_device_mlx5dv_l3_validate_and_swap_bc (ptd, n_rx_packets, bc); - if (rd->flags & RDMA_DEVICE_F_STRIDING_RQ) + if (is_striding) { int n_rx_segs = 0; slow_path_needed = @@ -784,12 +968,20 @@ rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } else { - /*For now, legacy path doesn't support multiseg */ + vlib_buffer_t *bufs[VLIB_FRAME_SIZE]; + slow_path_needed = + rdma_device_mlx5dv_legacy_rq_slow_path_needed (rxq->buf_sz, + n_rx_packets, bc); n_rx_bytes = - rdma_device_mlx5dv_fast_input (vm, rxq, mask, &bt, to_next, + rdma_device_mlx5dv_fast_input (vm, rxq, bufs, mask, &bt, to_next, n_rx_packets, bc, ~1); - } + /* If there are chained buffers, some of the head buffers have a current length + higher than buf_sz: it needs to be fixed */ + if (PREDICT_FALSE (slow_path_needed)) + 
rdma_device_mlx5dv_legacy_rq_fix_chains (vm, rxq, bufs, mask, + n_rx_packets); + } } else { @@ -817,8 +1009,7 @@ rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vm->thread_index, rd->hw_if_index, n_rx_packets, n_rx_bytes); refill: - rdma_device_input_refill (vm, rd, rxq, use_mlx5dv, - ! !(rd->flags & RDMA_DEVICE_F_STRIDING_RQ)); + rdma_device_input_refill (vm, rd, rxq, &bt, use_mlx5dv, is_striding); return n_rx_packets; } diff --git a/src/plugins/rdma/rdma.api b/src/plugins/rdma/rdma.api index 668f2380e6f..4519e2316d3 100644 --- a/src/plugins/rdma/rdma.api +++ b/src/plugins/rdma/rdma.api @@ -15,7 +15,7 @@ *------------------------------------------------------------------ */ -option version = "1.0.0"; +option version = "2.0.0"; import "vnet/interface_types.api"; enum rdma_mode @@ -39,6 +39,7 @@ enum rdma_mode define rdma_create { + option deprecated="21.01"; u32 client_index; u32 context; @@ -52,6 +53,36 @@ define rdma_create }; /** \brief + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param host_if - Linux netdev interface name + @param name - new rdma interface name + @param rxq_num - number of receive queues (optional) + @param rxq_size - receive queue size (optional) + @param txq_size - transmit queue size (optional) + @param mode - operation mode (optional) + @param no_multi_seg (optional) - disable chained buffer RX + @param max_pktlen (optional) - maximal RX packet size. 
+*/ + +define rdma_create_v2 +{ + u32 client_index; + u32 context; + + string host_if[64]; + string name[64]; + u16 rxq_num [default=1]; + u16 rxq_size [default=1024]; + u16 txq_size [default=1024]; + vl_api_rdma_mode_t mode [default=0]; + bool no_multi_seg [default=0]; + u16 max_pktlen [default=0]; + option vat_help = "<host-if ifname> [name <name>] [rx-queue-size <size>] [tx-queue-size <size>] [num-rx-queues <size>] [mode <auto|ibv|dv>] [no-multi-seg] [max-pktlen <size>]"; +}; + + +/** \brief @param context - sender context, to match reply w/ request @param retval - return value for request @param sw_if_index - software index for the new rdma interface @@ -65,6 +96,19 @@ define rdma_create_reply }; /** \brief + @param context - sender context, to match reply w/ request + @param retval - return value for request + @param sw_if_index - software index for the new rdma interface +*/ + +define rdma_create_v2_reply +{ + u32 context; + i32 retval; + vl_api_interface_index_t sw_if_index; +}; + +/** \brief @param client_index - opaque cookie to identify the sender @param context - sender context, to match reply w/ request @param sw_if_index - interface index diff --git a/src/plugins/rdma/rdma.h b/src/plugins/rdma/rdma.h index e6d0abc0d41..a72765d117d 100644 --- a/src/plugins/rdma/rdma.h +++ b/src/plugins/rdma/rdma.h @@ -90,9 +90,27 @@ typedef struct u32 wqe_cnt; u32 wq_stride; u32 buf_sz; - u32 striding_wqe_tail; + union + { + struct + { + u32 striding_wqe_tail; /* Striding RQ: number of released whole WQE */ + u8 log_stride_per_wqe; /* Striding RQ: number of strides in a single WQE */ + }; + + struct + { + u8 *n_used_per_chain; /* Legacy RQ: for each buffer chain, how many additional segments are needed */ + + u32 *second_bufs; /* Legacy RQ: ring of second buffers of each chain */ + u32 incomplete_tail; /* Legacy RQ: tail index in bufs, + corresponds to buffer chains with recycled valid head buffer, + but whose other buffers are not yet recycled (due to pool exhaustion). 
*/ + u16 n_total_additional_segs; + u8 n_ds_per_wqe; /* Legacy RQ: number of nonnull data segs per WQE */ + }; + }; u8 log_wqe_sz; /* log-size of a single WQE (in data segments) */ - u8 log_stride_per_wqe; /* Striding RQ: number of strides in a single WQE */ } rdma_rxq_t; typedef struct @@ -200,8 +218,20 @@ typedef struct u16x8 cqe_flags8[VLIB_FRAME_SIZE / 8]; u16x16 cqe_flags16[VLIB_FRAME_SIZE / 16]; }; - u32 current_segs[VLIB_FRAME_SIZE]; - u32 to_free_buffers[VLIB_FRAME_SIZE]; + union + { + struct + { + u32 current_segs[VLIB_FRAME_SIZE]; + u32 to_free_buffers[VLIB_FRAME_SIZE]; + }; /* Specific to STRIDING RQ mode */ + struct + { + u32 tmp_bi[VLIB_FRAME_SIZE]; + vlib_buffer_t *tmp_bufs[VLIB_FRAME_SIZE]; + }; /* Specific to LEGACY RQ mode */ + }; + vlib_buffer_t buffer_template; } rdma_per_thread_data_t; @@ -230,6 +260,9 @@ typedef struct u32 txq_size; u32 rxq_num; rdma_mode_t mode; + u8 no_multi_seg; + u8 disable_striding_rq; + u16 max_pktlen; /* return */ int rv; diff --git a/src/plugins/rdma/test_api.c b/src/plugins/rdma/test_api.c index 19c35920b7b..ff5dec1ad95 100644 --- a/src/plugins/rdma/test_api.c +++ b/src/plugins/rdma/test_api.c @@ -87,6 +87,36 @@ api_rdma_create (vat_main_t * vam) return ret; } +static int +api_rdma_create_v2 (vat_main_t * vam) +{ + vl_api_rdma_create_v2_t *mp; + rdma_create_if_args_t args; + int ret; + + if (!unformat_user (vam->input, unformat_rdma_create_if_args, &args)) + { + clib_warning ("unknown input `%U'", format_unformat_error, vam->input); + return -99; + } + + M (RDMA_CREATE_V2, mp); + + snprintf ((char *) mp->host_if, sizeof (mp->host_if), "%s", args.ifname); + snprintf ((char *) mp->name, sizeof (mp->name), "%s", args.name); + mp->rxq_num = clib_host_to_net_u16 (args.rxq_num); + mp->rxq_size = clib_host_to_net_u16 (args.rxq_size); + mp->txq_size = clib_host_to_net_u16 (args.txq_size); + mp->mode = api_rdma_mode (args.mode); + mp->no_multi_seg = args.no_multi_seg; + mp->max_pktlen = clib_host_to_net_u16 (args.max_pktlen); + 
+ S (mp); + W (ret); + + return ret; +} + /* rdma-create reply handler */ static void vl_api_rdma_create_reply_t_handler (vl_api_rdma_create_reply_t * mp) @@ -105,6 +135,24 @@ vl_api_rdma_create_reply_t_handler (vl_api_rdma_create_reply_t * mp) vam->regenerate_interface_table = 1; } +/* rdma-create reply handler */ +static void +vl_api_rdma_create_v2_reply_t_handler (vl_api_rdma_create_v2_reply_t * mp) +{ + vat_main_t *vam = rdma_test_main.vat_main; + i32 retval = ntohl (mp->retval); + + if (retval == 0) + { + fformat (vam->ofp, "created rdma with sw_if_index %d\n", + ntohl (mp->sw_if_index)); + } + + vam->retval = retval; + vam->result_ready = 1; + vam->regenerate_interface_table = 1; +} + /* rdma delete API */ static int api_rdma_delete (vat_main_t * vam) diff --git a/src/plugins/rdma/unformat.c b/src/plugins/rdma/unformat.c index 2bbb266f1bf..26c184793bd 100644 --- a/src/plugins/rdma/unformat.c +++ b/src/plugins/rdma/unformat.c @@ -24,7 +24,7 @@ unformat_rdma_create_if_args (unformat_input_t * input, va_list * vargs) rdma_create_if_args_t *args = va_arg (*vargs, rdma_create_if_args_t *); unformat_input_t _line_input, *line_input = &_line_input; uword ret = 1; - + u32 tmp; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -48,6 +48,12 @@ unformat_rdma_create_if_args (unformat_input_t * input, va_list * vargs) args->mode = RDMA_MODE_IBV; else if (unformat (line_input, "mode dv")) args->mode = RDMA_MODE_DV; + else if (unformat (line_input, "no-striding")) + args->disable_striding_rq = 1; + else if (unformat (line_input, "no-multi-seg")) + args->no_multi_seg = 1; + else if (unformat (line_input, "max-pktlen %u", &tmp)) + args->max_pktlen = tmp; else { /* return failure on unknown input */ |