aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDamjan Marion <damarion@cisco.com>2020-03-12 11:56:00 +0100
committerDamjan Marion <dmarion@me.com>2020-03-16 19:09:39 +0000
commitdd648aac0615c416507de9097b6f50db16ad319c (patch)
tree37a19fb2db728567f49aa1928f947b64edf55142
parentd35887297d6320efb36c24ef123480f27a736b16 (diff)
rdma: add Mellanox mlx5 Direct Verbs receive support
Type: feature Change-Id: I3f287ab536a482c366ad7df47e1c04e640992ebc Signed-off-by: Damjan Marion <damarion@cisco.com>
-rw-r--r--src/plugins/rdma/cli.c62
-rw-r--r--src/plugins/rdma/device.c91
-rw-r--r--src/plugins/rdma/format.c168
-rw-r--r--src/plugins/rdma/input.c379
-rw-r--r--src/plugins/rdma/rdma.h24
-rw-r--r--src/plugins/rdma/rdma_mlx5dv.h156
-rw-r--r--src/vppinfra/vector_avx2.h10
-rw-r--r--src/vppinfra/vector_neon.h6
8 files changed, 859 insertions, 37 deletions
diff --git a/src/plugins/rdma/cli.c b/src/plugins/rdma/cli.c
index 73e91c298b1..918cd77387b 100644
--- a/src/plugins/rdma/cli.c
+++ b/src/plugins/rdma/cli.c
@@ -106,6 +106,68 @@ VLIB_CLI_COMMAND (rdma_delete_command, static) = {
};
/* *INDENT-ON* */
+static clib_error_t *
+test_rdma_dump_command_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ u32 sw_if_index = ~0;
+ vnet_hw_interface_t *hw;
+ rdma_main_t *rm = &rdma_main;
+ rdma_device_t *rd;
+ vnet_main_t *vnm = vnet_get_main ();
+ int i;
+
+ /* Get a line of input. */
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "sw_if_index %d", &sw_if_index))
+ ;
+ else if (unformat (line_input, "%U", unformat_vnet_sw_interface,
+ vnm, &sw_if_index))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ if (sw_if_index == ~0)
+ return clib_error_return (0,
+ "please specify interface name or sw_if_index");
+
+ hw = vnet_get_sup_hw_interface_api_visible_or_null (vnm, sw_if_index);
+ if (hw == NULL || rdma_device_class.index != hw->dev_class_index)
+ return clib_error_return (0, "not a RDMA interface");
+
+ rd = pool_elt_at_index (rm->devices, hw->dev_instance);
+
+ if ((rd->flags & RDMA_DEVICE_F_MLX5DV) == 0)
+ return clib_error_return (0, "not a mlx5 interface");
+
+ vlib_cli_output (vm, "netdev %s pci-addr %U lkey 0x%x",
+ rd->linux_ifname, format_vlib_pci_addr, &rd->pci->addr,
+ &rd->lkey);
+
+ vec_foreach_index (i, rd->rxqs)
+ {
+ vlib_cli_output (vm, "RX queue %u\n %U\n", i, format_rdma_rxq, rd, i);
+ }
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (test_rdma_mlx5dv_dump_command, static) = {
+ .path = "test rdma dump",
+ .short_help = "test rdma dump {<interface> | sw_if_index <sw_idx>}",
+ .function = test_rdma_dump_command_fn,
+};
+/* *INDENT-ON* */
+
clib_error_t *
rdma_cli_init (vlib_main_t * vm)
{
diff --git a/src/plugins/rdma/device.c b/src/plugins/rdma/device.c
index 29d9842e2e2..f33d55c85a0 100644
--- a/src/plugins/rdma/device.c
+++ b/src/plugins/rdma/device.c
@@ -399,15 +399,32 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
{
rdma_rxq_t *rxq;
struct ibv_wq_init_attr wqia;
+ struct ibv_cq_init_attr_ex cqa = { };
struct ibv_wq_attr wqa;
+ struct ibv_cq_ex *cqex;
vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES);
rxq = vec_elt_at_index (rd->rxqs, qid);
rxq->size = n_desc;
vec_validate_aligned (rxq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);
- if ((rxq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
- return clib_error_return_unix (0, "Create CQ Failed");
+ cqa.cqe = n_desc;
+ if (rd->flags & RDMA_DEVICE_F_MLX5DV)
+ {
+ struct mlx5dv_cq_init_attr dvcq = { };
+ dvcq.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
+ dvcq.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;
+
+ if ((cqex = mlx5dv_create_cq (rd->ctx, &cqa, &dvcq)) == 0)
+ return clib_error_return_unix (0, "Create mlx5dv rx CQ Failed");
+ }
+ else
+ {
+ if ((cqex = ibv_create_cq_ex (rd->ctx, &cqa)) == 0)
+ return clib_error_return_unix (0, "Create CQ Failed");
+ }
+
+ rxq->cq = ibv_cq_ex_to_cq (cqex);
memset (&wqia, 0, sizeof (wqia));
wqia.wq_type = IBV_WQT_RQ;
@@ -424,6 +441,44 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
if (ibv_modify_wq (rxq->wq, &wqa) != 0)
return clib_error_return_unix (0, "Modify WQ (RDY) Failed");
+ if (rd->flags & RDMA_DEVICE_F_MLX5DV)
+ {
+ struct mlx5dv_obj obj = { };
+ struct mlx5dv_cq dv_cq;
+ struct mlx5dv_rwq dv_rwq;
+ u64 qw0;
+
+ obj.cq.in = rxq->cq;
+ obj.cq.out = &dv_cq;
+ obj.rwq.in = rxq->wq;
+ obj.rwq.out = &dv_rwq;
+
+ if ((mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ)))
+ return clib_error_return_unix (0, "mlx5dv: failed to init rx obj");
+
+ if (dv_cq.cqe_size != sizeof (mlx5dv_cqe_t))
+ return clib_error_return_unix (0, "mlx5dv: incompatible rx CQE size");
+
+ rxq->log2_cq_size = max_log2 (dv_cq.cqe_cnt);
+ rxq->cqes = (mlx5dv_cqe_t *) dv_cq.buf;
+ rxq->cq_db = (volatile u32 *) dv_cq.dbrec;
+ rxq->cqn = dv_cq.cqn;
+
+ rxq->wqes = (mlx5dv_rwq_t *) dv_rwq.buf;
+ rxq->wq_db = (volatile u32 *) dv_rwq.dbrec;
+ rxq->wq_stride = dv_rwq.stride;
+ rxq->wqe_cnt = dv_rwq.wqe_cnt;
+
+ qw0 = clib_host_to_net_u32 (vlib_buffer_get_default_data_size (vm));
+ qw0 |= (u64) clib_host_to_net_u32 (rd->lkey) << 32;
+
+ for (int i = 0; i < rxq->size; i++)
+ rxq->wqes[i].dsz_and_lkey = qw0;
+
+ for (int i = 0; i < (1 << rxq->log2_cq_size); i++)
+ rxq->cqes[i].opcode_cqefmt_se_owner = 0xff;
+ }
+
return 0;
}
@@ -534,6 +589,12 @@ rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size,
ethernet_mac_address_generate (rd->hwaddr.bytes);
+ if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
+ bm->buffer_mem_size,
+ IBV_ACCESS_LOCAL_WRITE)) == 0)
+ return clib_error_return_unix (0, "Register MR Failed");
+ rd->lkey = rd->mr->lkey; /* avoid indirection in datapath */
+
/*
* /!\ WARNING /!\ creation order is important
* We *must* create TX queues *before* RX queues, otherwise we will receive
@@ -549,12 +610,6 @@ rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size,
if ((err = rdma_rxq_finalize (vm, rd)))
return err;
- if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
- bm->buffer_mem_size,
- IBV_ACCESS_LOCAL_WRITE)) == 0)
- return clib_error_return_unix (0, "Register MR Failed");
- rd->lkey = rd->mr->lkey; /* avoid indirection in datapath */
-
return 0;
}
@@ -687,6 +742,26 @@ rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
break;
}
+ if (args->mode != RDMA_MODE_IBV)
+ {
+ struct mlx5dv_context mlx5dv_attrs = { };
+
+ if (mlx5dv_query_device (rd->ctx, &mlx5dv_attrs) == 0)
+ {
+ if ((mlx5dv_attrs.flags & MLX5DV_CONTEXT_FLAGS_CQE_V1))
+ rd->flags |= RDMA_DEVICE_F_MLX5DV;
+ }
+ else
+ {
+ if (args->mode == RDMA_MODE_DV)
+ {
+ args->error = clib_error_return (0, "Direct Verbs mode not "
+ "supported on this interface");
+ goto err2;
+ }
+ }
+ }
+
if ((args->error =
rdma_dev_init (vm, rd, args->rxq_size, args->txq_size, args->rxq_num)))
goto err2;
diff --git a/src/plugins/rdma/format.c b/src/plugins/rdma/format.c
index c9553b21e41..89a2a7ca0a5 100644
--- a/src/plugins/rdma/format.c
+++ b/src/plugins/rdma/format.c
@@ -49,6 +49,30 @@ t = format (t, "%s%s", t ? " ":"", c);
}
u8 *
+format_rdma_bit_flag (u8 * s, va_list * args)
+{
+ u64 flags = va_arg (*args, u64);
+ char **strs = va_arg (*args, char **);
+ u32 n_strs = va_arg (*args, u32);
+ int i = 0;
+
+ while (flags)
+ {
+ if ((flags & (1 << i)))
+ {
+ if (i < n_strs && strs[i] != 0)
+ s = format (s, " %s", strs[i]);
+ else
+ s = format (s, " unknown(%u)", i);
+ flags ^= 1 << i;
+ }
+ i++;
+ }
+
+ return s;
+}
+
+u8 *
format_rdma_device (u8 * s, va_list * args)
{
u32 i = va_arg (*args, u32);
@@ -56,13 +80,33 @@ format_rdma_device (u8 * s, va_list * args)
rdma_device_t *rd = vec_elt_at_index (rm->devices, i);
u32 indent = format_get_indent (s);
- s = format (s, "netdev: %v\n", rd->linux_ifname);
+ s = format (s, "netdev %v pci-addr %U\n", rd->linux_ifname,
+ format_vlib_pci_addr, &rd->pci->addr);
s = format (s, "%Uflags: %U", format_white_space, indent,
format_rdma_device_flags, rd);
if (rd->error)
s = format (s, "\n%Uerror %U", format_white_space, indent,
format_clib_error, rd->error);
+ if (rd->flags & RDMA_DEVICE_F_MLX5DV)
+ {
+ struct mlx5dv_context c = { };
+ const char *str_flags[7] = { "cqe-v1", "obsolete", "mpw-allowed",
+ "enhanced-mpw", "cqe-128b-comp", "cqe-128b-pad",
+ "packet-based-credit-mode"
+ };
+
+ if (mlx5dv_query_device (rd->ctx, &c) != 0)
+ return s;
+
+ s = format (s, "\n%Umlx5: version %u", format_white_space, indent,
+ c.version);
+ s = format (s, "\n%Udevice flags: %U",
+ format_white_space, indent + 2,
+ format_rdma_bit_flag, c.flags, str_flags,
+ ARRAY_LEN (str_flags));
+ }
+
return s;
}
@@ -74,11 +118,133 @@ format_rdma_input_trace (u8 * s, va_list * args)
rdma_input_trace_t *t = va_arg (*args, rdma_input_trace_t *);
vnet_main_t *vnm = vnet_get_main ();
vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, t->hw_if_index);
+ char *l4_hdr_types[8] =
+ { 0, "tcp", "udp", "tcp-empty-ack", "tcp-with-acl" };
+ char *l3_hdr_types[4] = { 0, "ip6", "ip4" };
+ u8 l3_hdr_type = CQE_FLAG_L3_HDR_TYPE (t->cqe_flags);
+ u8 l4_hdr_type = CQE_FLAG_L4_HDR_TYPE (t->cqe_flags);
s = format (s, "rdma: %v (%d) next-node %U",
hi->name, t->hw_if_index, format_vlib_next_node_name, vm,
node->index, t->next_index);
+ if (t->cqe_flags & CQE_FLAG_L2_OK)
+ s = format (s, " l2-ok");
+
+ if (t->cqe_flags & CQE_FLAG_L3_OK)
+ s = format (s, " l3-ok");
+
+ if (t->cqe_flags & CQE_FLAG_L4_OK)
+ s = format (s, " l4-ok");
+
+ if (t->cqe_flags & CQE_FLAG_IP_FRAG)
+ s = format (s, " ip-frag");
+
+ if (l3_hdr_type)
+ s = format (s, " %s", l3_hdr_types[l3_hdr_type]);
+
+ if (l4_hdr_type)
+ s = format (s, " %s", l4_hdr_types[l4_hdr_type]);
+
+ if ((t->cqe_flags & CQE_FLAG_IP_EXT_OPTS))
+ {
+ if (l3_hdr_type == CQE_FLAG_L3_HDR_TYPE_IP6)
+ s = format (s, " ip4-ext-hdr");
+ if (l3_hdr_type == CQE_FLAG_L3_HDR_TYPE_IP4)
+ s = format (s, " ip4-opt");
+ }
+
+ return s;
+}
+
+static u8 *
+format_mlx5_bits (u8 * s, va_list * args)
+{
+ void *ptr = va_arg (*args, void *);
+ u32 offset = va_arg (*args, u32);
+ u32 sb = va_arg (*args, u32);
+ u32 eb = va_arg (*args, u32);
+
+ if (sb == 63 && eb == 0)
+ {
+ u64 x = mlx5_get_u64 (ptr, offset);
+ return format (s, "0x%lx", x);
+ }
+
+ u32 x = mlx5_get_bits (ptr, offset, sb, eb);
+ s = format (s, "%d", x);
+ if (x > 9)
+ s = format (s, " (0x%x)", x);
+ return s;
+}
+
+static u8 *
+format_mlx5_field (u8 * s, va_list * args)
+{
+ void *ptr = va_arg (*args, void *);
+ u32 offset = va_arg (*args, u32);
+ u32 sb = va_arg (*args, u32);
+ u32 eb = va_arg (*args, u32);
+ char *name = va_arg (*args, char *);
+
+ u8 *tmp = 0;
+
+ tmp = format (0, "0x%02x %s ", offset, name);
+ if (sb == eb)
+ tmp = format (tmp, "[%u]", sb);
+ else
+ tmp = format (tmp, "[%u:%u]", sb, eb);
+ s = format (s, "%-45v = %U", tmp, format_mlx5_bits, ptr, offset, sb, eb);
+ vec_free (tmp);
+
+ return s;
+}
+
+u8 *
+format_mlx5_cqe_rx (u8 * s, va_list * args)
+{
+ void *cqe = va_arg (*args, void *);
+ uword indent = format_get_indent (s);
+ int line = 0;
+
+#define _(a, b, c, d) \
+ if (mlx5_get_bits (cqe, a, b, c)) \
+ s = format (s, "%U%U\n", \
+ format_white_space, line++ ? indent : 0, \
+ format_mlx5_field, cqe, a, b, c, #d);
+ foreach_cqe_rx_field;
+#undef _
+ return s;
+}
+
+u8 *
+format_rdma_rxq (u8 * s, va_list * args)
+{
+ rdma_device_t *rd = va_arg (*args, rdma_device_t *);
+ u32 queue_index = va_arg (*args, u32);
+ rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, queue_index);
+ u32 indent = format_get_indent (s);
+
+ s = format (s, "size %u head %u tail %u", rxq->size, rxq->head, rxq->tail);
+
+ if (rd->flags & RDMA_DEVICE_F_MLX5DV)
+ {
+ u32 next_cqe_index = rxq->cq_ci & (rxq->size - 1);
+ s = format (s, "\n%Uwq: stride %u wqe-cnt %u",
+ format_white_space, indent + 2, rxq->wq_stride,
+ rxq->wqe_cnt);
+ s = format (s, "\n%Ucq: cqn %u cqe-cnt %u ci %u",
+ format_white_space, indent + 2, rxq->cqn,
+ 1 << rxq->log2_cq_size, rxq->cq_ci);
+ s = format (s, "\n%Unext-cqe(%u):", format_white_space, indent + 4,
+ next_cqe_index);
+ s = format (s, "\n%U%U", format_white_space, indent + 6,
+ format_mlx5_cqe_rx, rxq->cqes + next_cqe_index);
+ s = format (s, "\n%U%U", format_white_space, indent + 6,
+ format_hexdump, rxq->cqes + next_cqe_index,
+ sizeof (mlx5dv_cqe_t));
+ }
+
return s;
}
diff --git a/src/plugins/rdma/input.c b/src/plugins/rdma/input.c
index f4ef6505a0f..cf0b6bffe7d 100644
--- a/src/plugins/rdma/input.c
+++ b/src/plugins/rdma/input.c
@@ -55,7 +55,7 @@ ibv_set_recv_wr_and_sge (struct ibv_recv_wr *w, struct ibv_sge *s, u64 va,
static_always_inline void
rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
- rdma_rxq_t * rxq)
+ rdma_rxq_t * rxq, int is_mlx5dv)
{
u32 n_alloc, n;
struct ibv_recv_wr wr[VLIB_FRAME_SIZE], *w = wr;
@@ -101,6 +101,41 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
n_alloc = n;
+ if (is_mlx5dv)
+ {
+ u64 va[8];
+ mlx5dv_rwq_t *wqe = rxq->wqes + slot;
+
+ while (n >= 1)
+ {
+ vlib_get_buffers_with_offset (vm, rxq->bufs + slot, (void **) va, 8,
+ sizeof (vlib_buffer_t));
+#ifdef CLIB_HAVE_VEC256
+ *(u64x4 *) va = u64x4_byte_swap (*(u64x4 *) va);
+ *(u64x4 *) (va + 4) = u64x4_byte_swap (*(u64x4 *) (va + 4));
+#else
+ for (int i = 0; i < 8; i++)
+ va[i] = clib_host_to_net_u64 (va[i]);
+#endif
+ wqe[0].addr = va[0];
+ wqe[1].addr = va[1];
+ wqe[2].addr = va[2];
+ wqe[3].addr = va[3];
+ wqe[4].addr = va[4];
+ wqe[5].addr = va[5];
+ wqe[6].addr = va[6];
+ wqe[7].addr = va[7];
+ wqe += 8;
+ slot += 8;
+ n -= 8;
+ }
+
+ CLIB_MEMORY_STORE_BARRIER ();
+ rxq->tail += n_alloc;
+ rxq->wq_db[MLX5_RCV_DBR] = clib_host_to_net_u32 (rxq->tail);
+ return;
+ }
+
while (n >= 8)
{
u64 va[8];
@@ -142,7 +177,7 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
static_always_inline void
rdma_device_input_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
const rdma_device_t * rd, u32 n_left, const u32 * bi,
- u32 next_index)
+ u32 next_index, u16 * cqe_flags, int is_mlx5dv)
{
u32 n_trace, i;
@@ -160,10 +195,12 @@ rdma_device_input_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
tr = vlib_add_trace (vm, node, b, sizeof (*tr));
tr->next_index = next_index;
tr->hw_if_index = rd->hw_if_index;
+ tr->cqe_flags = is_mlx5dv ? clib_net_to_host_u16 (cqe_flags[0]) : 0;
/* next */
n_trace--;
n_left--;
+ cqe_flags++;
bi++;
i++;
}
@@ -172,7 +209,8 @@ rdma_device_input_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
static_always_inline void
rdma_device_input_ethernet (vlib_main_t * vm, vlib_node_runtime_t * node,
- const rdma_device_t * rd, u32 next_index)
+ const rdma_device_t * rd, u32 next_index,
+ int skip_ip4_cksum)
{
vlib_next_frame_t *nf;
vlib_frame_t *f;
@@ -186,7 +224,8 @@ rdma_device_input_ethernet (vlib_main_t * vm, vlib_node_runtime_t * node,
VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT);
f = vlib_get_frame (vm, nf->frame);
f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX;
- /* FIXME: f->flags |= ETH_INPUT_FRAME_F_IP4_CKSUM_OK; */
+ if (skip_ip4_cksum)
+ f->flags |= ETH_INPUT_FRAME_F_IP4_CKSUM_OK;
ef = vlib_frame_scalar_args (f);
ef->sw_if_index = rd->sw_if_index;
@@ -194,16 +233,12 @@ rdma_device_input_ethernet (vlib_main_t * vm, vlib_node_runtime_t * node,
}
static_always_inline u32
-rdma_device_input_bufs (vlib_main_t * vm, const rdma_device_t * rd, u32 * bi,
- struct ibv_wc * wc, u32 n_left_from,
- vlib_buffer_t * bt)
+rdma_device_input_bufs (vlib_main_t * vm, const rdma_device_t * rd,
+ vlib_buffer_t ** b, struct ibv_wc *wc,
+ u32 n_left_from, vlib_buffer_t * bt)
{
- vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
u32 n_rx_bytes = 0;
- vlib_get_buffers (vm, bi, bufs, n_left_from);
- ASSERT (bt->buffer_pool_index == bufs[0]->buffer_pool_index);
-
while (n_left_from >= 4)
{
if (PREDICT_TRUE (n_left_from >= 8))
@@ -246,26 +281,224 @@ rdma_device_input_bufs (vlib_main_t * vm, const rdma_device_t * rd, u32 * bi,
return n_rx_bytes;
}
+static_always_inline void
+process_mini_cqes (rdma_rxq_t * rxq, u32 skip, u32 n_left, u32 cq_ci,
+ u32 mask, u32 * byte_cnt)
+{
+ mlx5dv_mini_cqe_t *mcqe;
+ u32 mcqe_array_index = (cq_ci + 1) & mask;
+ mcqe = (mlx5dv_mini_cqe_t *) (rxq->cqes + mcqe_array_index);
+
+ mcqe_array_index = cq_ci;
+
+ if (skip)
+ {
+ u32 n = skip & ~7;
+
+ if (n)
+ {
+ mcqe_array_index = (mcqe_array_index + n) & mask;
+ mcqe = (mlx5dv_mini_cqe_t *) (rxq->cqes + mcqe_array_index);
+ skip -= n;
+ }
+
+ if (skip)
+ {
+ n = clib_min (8 - skip, n_left);
+ for (int i = 0; i < n; i++)
+ byte_cnt[i] = mcqe[skip + i].byte_count;
+ mcqe_array_index = (mcqe_array_index + 8) & mask;
+ mcqe = (mlx5dv_mini_cqe_t *) (rxq->cqes + mcqe_array_index);
+ n_left -= n;
+ byte_cnt += n;
+ }
+
+ }
+
+ while (n_left >= 8)
+ {
+ for (int i = 0; i < 8; i++)
+ byte_cnt[i] = mcqe[i].byte_count;
+
+ n_left -= 8;
+ byte_cnt += 8;
+ mcqe_array_index = (mcqe_array_index + 8) & mask;
+ mcqe = (mlx5dv_mini_cqe_t *) (rxq->cqes + mcqe_array_index);
+ }
+
+ if (n_left)
+ {
+ for (int i = 0; i < n_left; i++)
+ byte_cnt[i] = mcqe[i].byte_count;
+ }
+}
+
+static_always_inline void
+cqe_set_owner (mlx5dv_cqe_t * cqe, u32 n_left, u8 owner)
+{
+ while (n_left >= 8)
+ {
+ cqe[0].opcode_cqefmt_se_owner = owner;
+ cqe[1].opcode_cqefmt_se_owner = owner;
+ cqe[2].opcode_cqefmt_se_owner = owner;
+ cqe[3].opcode_cqefmt_se_owner = owner;
+ cqe[4].opcode_cqefmt_se_owner = owner;
+ cqe[5].opcode_cqefmt_se_owner = owner;
+ cqe[6].opcode_cqefmt_se_owner = owner;
+ cqe[7].opcode_cqefmt_se_owner = owner;
+ n_left -= 8;
+ cqe += 8;
+ }
+ while (n_left)
+ {
+ cqe[0].opcode_cqefmt_se_owner = owner;
+ n_left--;
+ cqe++;
+ }
+}
+
+static_always_inline void
+compressed_cqe_reset_owner (rdma_rxq_t * rxq, u32 n_mini_cqes, u32 cq_ci,
+ u32 mask, u32 log2_cq_size)
+{
+ u8 owner;
+ u32 offset, cq_size = 1 << log2_cq_size;
+
+
+ /* first CQE is reset by hardware */
+ cq_ci++;
+ n_mini_cqes--;
+
+ offset = cq_ci & mask;
+ owner = 0xf0 | ((cq_ci >> log2_cq_size) & 1);
+
+ if (offset + n_mini_cqes < cq_size)
+ {
+ cqe_set_owner (rxq->cqes + offset, n_mini_cqes, owner);
+ }
+ else
+ {
+ u32 n = cq_size - offset;
+ cqe_set_owner (rxq->cqes + offset, n, owner);
+ cqe_set_owner (rxq->cqes, n_mini_cqes - n, owner ^ 1);
+ }
+
+}
+
+static_always_inline uword
+rdma_device_poll_cq_mlx5dv (rdma_device_t * rd, rdma_rxq_t * rxq,
+ u32 * byte_cnt, u16 * cqe_flags)
+{
+ u32 n_rx_packets = 0;
+ u32 log2_cq_size = rxq->log2_cq_size;
+ u32 mask = pow2_mask (log2_cq_size);
+ u32 cq_ci = rxq->cq_ci;
+
+ if (rxq->n_mini_cqes_left)
+ {
+ /* partially processed mini-cqe array */
+ u32 n_mini_cqes = rxq->n_mini_cqes;
+ u32 n_mini_cqes_left = rxq->n_mini_cqes_left;
+ process_mini_cqes (rxq, n_mini_cqes - n_mini_cqes_left,
+ n_mini_cqes_left, cq_ci, mask, byte_cnt);
+ compressed_cqe_reset_owner (rxq, n_mini_cqes, cq_ci, mask,
+ log2_cq_size);
+ clib_memset_u16 (cqe_flags, rxq->last_cqe_flags, n_mini_cqes_left);
+ n_rx_packets = n_mini_cqes_left;
+ byte_cnt += n_mini_cqes_left;
+ cqe_flags += n_mini_cqes_left;
+ rxq->n_mini_cqes_left = 0;
+ rxq->cq_ci = cq_ci = cq_ci + n_mini_cqes;
+ }
+
+ while (n_rx_packets < VLIB_FRAME_SIZE)
+ {
+ u8 cqe_last_byte, owner;
+ mlx5dv_cqe_t *cqe = rxq->cqes + (cq_ci & mask);
+
+ clib_prefetch_load (rxq->cqes + ((cq_ci + 8) & mask));
+
+ owner = (cq_ci >> log2_cq_size) & 1;
+ cqe_last_byte = cqe->opcode_cqefmt_se_owner;
+
+ if ((cqe_last_byte & 0x1) != owner)
+ break;
+
+ cqe_last_byte &= 0xfe; /* remove owner bit */
+
+ if (cqe_last_byte == 0x2c)
+ {
+ u32 n_mini_cqes = clib_net_to_host_u32 (cqe->mini_cqe_num);
+ u32 n_left = VLIB_FRAME_SIZE - n_rx_packets;
+ u16 flags = cqe->flags;
+
+ if (n_left >= n_mini_cqes)
+ {
+ process_mini_cqes (rxq, 0, n_mini_cqes, cq_ci, mask, byte_cnt);
+ clib_memset_u16 (cqe_flags, flags, n_mini_cqes);
+ compressed_cqe_reset_owner (rxq, n_mini_cqes, cq_ci, mask,
+ log2_cq_size);
+ n_rx_packets += n_mini_cqes;
+ byte_cnt += n_mini_cqes;
+ cqe_flags += n_mini_cqes;
+ cq_ci += n_mini_cqes;
+ }
+ else
+ {
+ process_mini_cqes (rxq, 0, n_left, cq_ci, mask, byte_cnt);
+ clib_memset_u16 (cqe_flags, flags, n_left);
+ n_rx_packets = VLIB_FRAME_SIZE;
+ rxq->n_mini_cqes = n_mini_cqes;
+ rxq->n_mini_cqes_left = n_mini_cqes - n_left;
+ rxq->last_cqe_flags = flags;
+ goto done;
+ }
+ continue;
+ }
+
+ if (cqe_last_byte == 0x20)
+ {
+ byte_cnt[0] = cqe->byte_cnt;
+ cqe_flags[0] = cqe->flags;
+ n_rx_packets++;
+ cq_ci++;
+ byte_cnt++;
+ continue;
+ }
+
+ rd->flags |= RDMA_DEVICE_F_ERROR;
+ break;
+ }
+
+done:
+ if (n_rx_packets)
+ rxq->cq_db[0] = rxq->cq_ci = cq_ci;
+ return n_rx_packets;
+}
+
static_always_inline uword
rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
- vlib_frame_t * frame, rdma_device_t * rd, u16 qid)
+ vlib_frame_t * frame, rdma_device_t * rd, u16 qid,
+ int use_mlx5dv)
{
rdma_main_t *rm = &rdma_main;
vnet_main_t *vnm = vnet_get_main ();
rdma_per_thread_data_t *ptd = vec_elt_at_index (rm->per_thread_data,
vm->thread_index);
rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, qid);
+ vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
struct ibv_wc wc[VLIB_FRAME_SIZE];
+ u32 byte_cnts[VLIB_FRAME_SIZE];
vlib_buffer_t bt;
- u32 next_index, *to_next, n_left_to_next;
- u32 n_rx_packets, n_rx_bytes;
+ u32 next_index, *to_next, n_left_to_next, n_rx_bytes = 0;
+ int n_rx_packets, skip_ip4_cksum = 0;
u32 mask = rxq->size - 1;
- ASSERT (rxq->size >= VLIB_FRAME_SIZE && is_pow2 (rxq->size));
- ASSERT (rxq->tail - rxq->head <= rxq->size);
-
- n_rx_packets = ibv_poll_cq (rxq->cq, VLIB_FRAME_SIZE, wc);
- ASSERT (n_rx_packets <= rxq->tail - rxq->head);
+ if (use_mlx5dv)
+ n_rx_packets = rdma_device_poll_cq_mlx5dv (rd, rxq, byte_cnts,
+ ptd->cqe_flags);
+ else
+ n_rx_packets = ibv_poll_cq (rxq->cq, VLIB_FRAME_SIZE, wc);
if (PREDICT_FALSE (n_rx_packets <= 0))
goto refill;
@@ -281,20 +514,104 @@ rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
vnet_feature_start_device_input_x1 (rd->sw_if_index, &next_index, &bt);
vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next);
- ASSERT (n_rx_packets <= n_left_to_next);
vlib_buffer_copy_indices_from_ring (to_next, rxq->bufs, rxq->head & mask,
rxq->size, n_rx_packets);
- n_rx_bytes = rdma_device_input_bufs (vm, rd, to_next, wc, n_rx_packets,
- &bt);
- rdma_device_input_ethernet (vm, node, rd, next_index);
+ vlib_get_buffers (vm, to_next, bufs, n_rx_packets);
+
+ if (use_mlx5dv)
+ {
+ u16 mask = CQE_FLAG_L3_HDR_TYPE_MASK | CQE_FLAG_L3_OK;
+ u16 match = CQE_FLAG_L3_HDR_TYPE_IP4 << CQE_FLAG_L3_HDR_TYPE_SHIFT;
+ u32 n_left = n_rx_packets;
+ u32 *bc = byte_cnts;
+
+ /* verify that all ip4 packets have l3_ok flag set and convert packet
+ length from network to host byte order */
+ skip_ip4_cksum = 1;
+
+#if defined CLIB_HAVE_VEC256
+ u16x16 mask16 = u16x16_splat (mask);
+ u16x16 match16 = u16x16_splat (match);
+ u16x16 r = { };
+
+ for (int i = 0; i * 16 < n_rx_packets; i++)
+ r |= (ptd->cqe_flags16[i] & mask16) != match16;
+
+ if (!u16x16_is_all_zero (r))
+ skip_ip4_cksum = 0;
+
+ for (int i = 0; i < n_rx_packets; i += 8)
+ *(u32x8 *) (bc + i) = u32x8_byte_swap (*(u32x8 *) (bc + i));
+#elif defined CLIB_HAVE_VEC128
+ u16x8 mask8 = u16x8_splat (mask);
+ u16x8 match8 = u16x8_splat (match);
+ u16x8 r = { };
+
+ for (int i = 0; i * 8 < n_rx_packets; i++)
+ r |= (ptd->cqe_flags8[i] & mask8) != match8;
+
+ if (!u16x8_is_all_zero (r))
+ skip_ip4_cksum = 0;
+
+ for (int i = 0; i < n_rx_packets; i += 4)
+ *(u32x4 *) (bc + i) = u32x4_byte_swap (*(u32x4 *) (bc + i));
+#else
+ for (int i = 0; i < n_rx_packets; i++)
+ if ((ptd->cqe_flags[i] & mask) == match)
+ skip_ip4_cksum = 0;
+
+ for (int i = 0; i < n_rx_packets; i++)
+ bc[i] = clib_net_to_host_u32 (bc[i]);
+#endif
+
+ while (n_left >= 8)
+ {
+ clib_prefetch_store (b[4]);
+ vlib_buffer_copy_template (b[0], &bt);
+ n_rx_bytes += b[0]->current_length = bc[0];
+ clib_prefetch_store (b[5]);
+ vlib_buffer_copy_template (b[1], &bt);
+ n_rx_bytes += b[1]->current_length = bc[1];
+ clib_prefetch_store (b[6]);
+ vlib_buffer_copy_template (b[2], &bt);
+ n_rx_bytes += b[2]->current_length = bc[2];
+ clib_prefetch_store (b[7]);
+ vlib_buffer_copy_template (b[3], &bt);
+ n_rx_bytes += b[3]->current_length = bc[3];
+
+ /* next */
+ bc += 4;
+ b += 4;
+ n_left -= 4;
+ }
+ while (n_left)
+ {
+ vlib_buffer_copy_template (b[0], &bt);
+ n_rx_bytes += b[0]->current_length = bc[0];
+
+ /* next */
+ bc++;
+ b++;
+ n_left--;
+ }
+ }
+ else
+ n_rx_bytes = rdma_device_input_bufs (vm, rd, bufs, wc, n_rx_packets, &bt);
+
+ rdma_device_input_ethernet (vm, node, rd, next_index, skip_ip4_cksum);
vlib_put_next_frame (vm, node, next_index, n_left_to_next - n_rx_packets);
rxq->head += n_rx_packets;
- rdma_device_input_trace (vm, node, rd, n_rx_packets, to_next, next_index);
+ rdma_device_input_trace (vm, node, rd, n_rx_packets, to_next, next_index,
+ ptd->cqe_flags, use_mlx5dv);
+
+ /* reset flags to zero for the next run */
+ if (use_mlx5dv)
+ clib_memset_u16 (ptd->cqe_flags, 0, VLIB_FRAME_SIZE);
vlib_increment_combined_counter
(vnm->interface_main.combined_sw_if_counters +
@@ -302,7 +619,7 @@ rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
rd->hw_if_index, n_rx_packets, n_rx_bytes);
refill:
- rdma_device_input_refill (vm, rd, rxq);
+ rdma_device_input_refill (vm, rd, rxq, use_mlx5dv);
return n_rx_packets;
}
@@ -320,8 +637,16 @@ VLIB_NODE_FN (rdma_input_node) (vlib_main_t * vm,
{
rdma_device_t *rd;
rd = vec_elt_at_index (rm->devices, dq->dev_instance);
- if (PREDICT_TRUE (rd->flags & RDMA_DEVICE_F_ADMIN_UP))
- n_rx += rdma_device_input_inline (vm, node, frame, rd, dq->queue_id);
+ if (PREDICT_TRUE (rd->flags & RDMA_DEVICE_F_ADMIN_UP) == 0)
+ continue;
+
+ if (PREDICT_TRUE (rd->flags & RDMA_DEVICE_F_ERROR))
+ continue;
+
+ if (PREDICT_TRUE (rd->flags & RDMA_DEVICE_F_MLX5DV))
+ n_rx += rdma_device_input_inline (vm, node, frame, rd, dq->queue_id, 1);
+ else
+ n_rx += rdma_device_input_inline (vm, node, frame, rd, dq->queue_id, 0);
}
return n_rx;
}
diff --git a/src/plugins/rdma/rdma.h b/src/plugins/rdma/rdma.h
index 302d2383cfe..1e2f3d9ea0e 100644
--- a/src/plugins/rdma/rdma.h
+++ b/src/plugins/rdma/rdma.h
@@ -23,12 +23,14 @@
#include <vlib/pci/pci.h>
#include <vnet/interface.h>
#include <vnet/ethernet/mac_address.h>
+#include <rdma/rdma_mlx5dv.h>
#define foreach_rdma_device_flags \
_(0, ERROR, "error") \
_(1, ADMIN_UP, "admin-up") \
_(2, LINK_UP, "link-up") \
- _(3, PROMISC, "promiscuous")
+ _(3, PROMISC, "promiscuous") \
+ _(4, MLX5DV, "mlx5dv")
enum
{
@@ -46,6 +48,18 @@ typedef struct
u32 size;
u32 head;
u32 tail;
+ u32 cq_ci;
+ u16 log2_cq_size;
+ u16 n_mini_cqes;
+ u16 n_mini_cqes_left;
+ u16 last_cqe_flags;
+ mlx5dv_cqe_t *cqes;
+ mlx5dv_rwq_t *wqes;
+ volatile u32 *wq_db;
+ volatile u32 *cq_db;
+ u32 cqn;
+ u32 wqe_cnt;
+ u32 wq_stride;
} rdma_rxq_t;
typedef struct
@@ -96,6 +110,12 @@ typedef struct
typedef struct
{
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ union
+ {
+ u16 cqe_flags[VLIB_FRAME_SIZE];
+ u16x8 cqe_flags8[VLIB_FRAME_SIZE / 8];
+ u16x16 cqe_flags16[VLIB_FRAME_SIZE / 16];
+ };
vlib_buffer_t buffer_template;
} rdma_per_thread_data_t;
@@ -140,12 +160,14 @@ extern vnet_device_class_t rdma_device_class;
format_function_t format_rdma_device;
format_function_t format_rdma_device_name;
format_function_t format_rdma_input_trace;
+format_function_t format_rdma_rxq;
unformat_function_t unformat_rdma_create_if_args;
typedef struct
{
u32 next_index;
u32 hw_if_index;
+ u16 cqe_flags;
} rdma_input_trace_t;
#define foreach_rdma_tx_func_error \
diff --git a/src/plugins/rdma/rdma_mlx5dv.h b/src/plugins/rdma/rdma_mlx5dv.h
new file mode 100644
index 00000000000..43d9002d050
--- /dev/null
+++ b/src/plugins/rdma/rdma_mlx5dv.h
@@ -0,0 +1,156 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2020 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef _RDMA_MLX5DV_H_
+#define _RDMA_MLX5DV_H_
+
+#undef always_inline
+#include <infiniband/mlx5dv.h>
+#define always_inline static_always_inline
+
+/* CQE flags - bits 16-31 of qword at offset 0x1c */
+#define CQE_FLAG_L4_OK 10
+#define CQE_FLAG_L3_OK 9
+#define CQE_FLAG_L2_OK 8
+#define CQE_FLAG_IP_FRAG 7
+#define CQE_FLAG_L4_HDR_TYPE(f) (((f) >> 4) & 7)
+#define CQE_FLAG_L3_HDR_TYPE_SHIFT (2)
+#define CQE_FLAG_L3_HDR_TYPE_MASK (3 << CQE_FLAG_L3_HDR_TYPE_SHIFT)
+#define CQE_FLAG_L3_HDR_TYPE(f) (((f) & CQE_FLAG_L3_HDR_TYPE_MASK) >> CQE_FLAG_L3_HDR_TYPE_SHIFT)
+#define CQE_FLAG_L3_HDR_TYPE_IP4 1
+#define CQE_FLAG_L3_HDR_TYPE_IP6 2
+#define CQE_FLAG_IP_EXT_OPTS 1
+
+typedef struct
+{
+ struct
+ {
+ u8 pad1[28];
+ u16 flags;
+ u8 pad2[14];
+ union
+ {
+ u32 byte_cnt;
+ u32 mini_cqe_num;
+ };
+ u8 pad3[15];
+ u8 opcode_cqefmt_se_owner;
+ };
+} mlx5dv_cqe_t;
+
+STATIC_ASSERT_SIZEOF (mlx5dv_cqe_t, 64);
+
+typedef struct
+{
+ union
+ {
+ u32 checksum;
+ u32 rx_hash_result;
+ };
+ u32 byte_count;
+} mlx5dv_mini_cqe_t;
+
+typedef struct
+{
+ u64 dsz_and_lkey;
+ u64 addr;
+} mlx5dv_rwq_t;
+
+#define foreach_cqe_rx_field \
+ _(0x1c, 26, 26, l4_ok) \
+ _(0x1c, 25, 25, l3_ok) \
+ _(0x1c, 24, 24, l2_ok) \
+ _(0x1c, 23, 23, ip_frag) \
+ _(0x1c, 22, 20, l4_hdr_type) \
+ _(0x1c, 19, 18, l3_hdr_type) \
+ _(0x1c, 17, 17, ip_ext_opts) \
+ _(0x1c, 16, 16, cv) \
+ _(0x2c, 31, 0, byte_cnt) \
+ _(0x30, 63, 0, timestamp) \
+ _(0x38, 31, 24, rx_drop_counter) \
+ _(0x38, 23, 0, flow_tag) \
+ _(0x3c, 31, 16, wqe_counter) \
+ _(0x3c, 15, 8, signature) \
+ _(0x3c, 7, 4, opcode) \
+ _(0x3c, 3, 2, cqe_format) \
+ _(0x3c, 1, 1, sc) \
+ _(0x3c, 0, 0, owner)
+
+
+/* inline functions */
+
+static inline u32
+mlx5_get_u32 (void *start, int offset)
+{
+ return clib_net_to_host_u32 (*(u32 *) (((u8 *) start) + offset));
+}
+
+static inline u64
+mlx5_get_u64 (void *start, int offset)
+{
+ return clib_net_to_host_u64 (*(u64 *) (((u8 *) start) + offset));
+}
+
+static inline void
+mlx5_set_u32 (void *start, int offset, u32 value)
+{
+ (*(u32 *) (((u8 *) start) + offset)) = clib_host_to_net_u32 (value);
+}
+
+static inline void
+mlx5_set_u64 (void *start, int offset, u64 value)
+{
+ (*(u64 *) (((u8 *) start) + offset)) = clib_host_to_net_u64 (value);
+}
+
+static inline void
+mlx5_set_bits (void *start, int offset, int first, int last, u32 value)
+{
+ u32 mask = (1 << (first - last + 1)) - 1;
+ u32 old = mlx5_get_u32 (start, offset);
+ if ((last == 0) && (first == 31))
+ {
+ mlx5_set_u32 (start, offset, value);
+ return;
+ }
+ ASSERT (value == (value & mask));
+ value &= mask;
+ old &= ~(mask << last);
+ mlx5_set_u32 (start, offset, old | value << last);
+}
+
+static inline u32
+mlx5_get_bits (void *start, int offset, int first, int last)
+{
+ u32 value = mlx5_get_u32 (start, offset);
+ if ((last == 0) && (first == 31))
+ return value;
+ value >>= last;
+ value &= (1 << (first - last + 1)) - 1;
+ return value;
+}
+
+
+#endif /* RDMA_MLX5DV_H */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vppinfra/vector_avx2.h b/src/vppinfra/vector_avx2.h
index 482bdd515c9..8cc1d77d63c 100644
--- a/src/vppinfra/vector_avx2.h
+++ b/src/vppinfra/vector_avx2.h
@@ -132,6 +132,16 @@ _(i8x16, i64x4, epi8_epi64)
#undef _
/* *INDENT-ON* */
+static_always_inline u64x4
+u64x4_byte_swap (u64x4 v)
+{
+ u8x32 swap = {
+ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
+ };
+ return (u64x4) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
+}
+
static_always_inline u32x8
u32x8_byte_swap (u32x8 v)
{
diff --git a/src/vppinfra/vector_neon.h b/src/vppinfra/vector_neon.h
index 14930d664d5..d80c691e3d9 100644
--- a/src/vppinfra/vector_neon.h
+++ b/src/vppinfra/vector_neon.h
@@ -103,6 +103,12 @@ u16x8_byte_swap (u16x8 v)
return (u16x8) vrev16q_u8 ((u8x16) v);
}
+static_always_inline u32x4
+u32x4_byte_swap (u32x4 v)
+{
+ return vrev64q_u32 (v);
+}
+
static_always_inline u8x16
u8x16_shuffle (u8x16 v, u8x16 m)
{