diff options
Diffstat (limited to 'src/plugins/rdma')
-rw-r--r-- | src/plugins/rdma/CMakeLists.txt | 8 | ||||
-rw-r--r-- | src/plugins/rdma/api.c | 52 | ||||
-rw-r--r-- | src/plugins/rdma/cli.c | 14 | ||||
-rw-r--r-- | src/plugins/rdma/device.c | 71 | ||||
-rw-r--r-- | src/plugins/rdma/format.c | 15 | ||||
-rw-r--r-- | src/plugins/rdma/input.c | 90 | ||||
-rw-r--r-- | src/plugins/rdma/plugin.c | 2 | ||||
-rw-r--r-- | src/plugins/rdma/rdma.api | 54 | ||||
-rw-r--r-- | src/plugins/rdma/rdma_doc.md | 75 | ||||
-rw-r--r-- | src/plugins/rdma/rdma_doc.rst | 102 | ||||
-rw-r--r-- | src/plugins/rdma/rdma_mlx5dv.h | 12 | ||||
-rw-r--r-- | src/plugins/rdma/test_api.c | 53 |
12 files changed, 364 insertions, 184 deletions
diff --git a/src/plugins/rdma/CMakeLists.txt b/src/plugins/rdma/CMakeLists.txt index f598ff8c701..ef8bc90c6dd 100644 --- a/src/plugins/rdma/CMakeLists.txt +++ b/src/plugins/rdma/CMakeLists.txt @@ -19,17 +19,16 @@ if (NOT IBVERBS_INCLUDE_DIR) endif() vpp_plugin_find_library(rdma IBVERBS_LIB libibverbs.a) -vpp_plugin_find_library(rdma RDMA_UTIL_LIB librdma_util.a) vpp_plugin_find_library(rdma MLX5_LIB libmlx5.a) -if (NOT IBVERBS_LIB OR NOT RDMA_UTIL_LIB OR NOT MLX5_LIB) +if (NOT IBVERBS_LIB OR NOT MLX5_LIB) message(WARNING "rdma plugin - ibverbs not found - rdma plugin disabled") return() endif() -string_append(RDMA_LINK_FLAGS "-Wl,--whole-archive,${MLX5_LIB},--no-whole-archive") +string_append(RDMA_LINK_FLAGS "-Wl,--whole-archive,${MLX5_LIB},--no-whole-archive -Wl,--exclude-libs,ALL") -set(CMAKE_REQUIRED_FLAGS "-fPIC -shared -pthread -Wno-unused-command-line-argument ${RDMA_LINK_FLAGS} ${IBVERBS_LIB} ${RDMA_UTIL_LIB}") +set(CMAKE_REQUIRED_FLAGS "-fPIC -shared -pthread -Wno-unused-command-line-argument ${RDMA_LINK_FLAGS} ${IBVERBS_LIB}") set(CMAKE_REQUIRED_INCLUDES "${IBVERBS_INCLUDE_DIR}") set(CMAKE_REQUIRED_LIBRARIES "c") # force linkage by including libc explicitely CHECK_C_SOURCE_COMPILES(" @@ -73,5 +72,4 @@ add_vpp_plugin(rdma LINK_LIBRARIES ${IBVERBS_LIB} - ${RDMA_UTIL_LIB} ) diff --git a/src/plugins/rdma/api.c b/src/plugins/rdma/api.c index 7fe77105596..3fb17ff6ee0 100644 --- a/src/plugins/rdma/api.c +++ b/src/plugins/rdma/api.c @@ -27,6 +27,7 @@ #include <rdma/rdma.api_enum.h> #include <rdma/rdma.api_types.h> +#define REPLY_MSG_ID_BASE (rm->msg_id_base) #include <vlibapi/api_helper_macros.h> static rdma_mode_t @@ -41,6 +42,8 @@ rdma_api_mode (vl_api_rdma_mode_t mode) case RDMA_API_MODE_DV: return RDMA_MODE_DV; } + /* Fail the debug build. Useful for investigating endian issues. 
*/ + ASSERT (0); return RDMA_MODE_AUTO; } @@ -79,6 +82,35 @@ rdma_api_rss6 (const vl_api_rdma_rss6_t rss6) } static void +vl_api_rdma_create_v4_t_handler (vl_api_rdma_create_v4_t *mp) +{ + vlib_main_t *vm = vlib_get_main (); + rdma_main_t *rm = &rdma_main; + vl_api_rdma_create_v4_reply_t *rmp; + rdma_create_if_args_t args; + int rv; + + clib_memset (&args, 0, sizeof (rdma_create_if_args_t)); + + args.ifname = mp->host_if; + args.name = mp->name; + args.rxq_num = mp->rxq_num; + args.rxq_size = mp->rxq_size; + args.txq_size = mp->txq_size; + args.mode = rdma_api_mode (mp->mode); + args.disable_striding_rq = 0; + args.no_multi_seg = mp->no_multi_seg; + args.max_pktlen = mp->max_pktlen; + args.rss4 = rdma_api_rss4 (mp->rss4); + args.rss6 = rdma_api_rss6 (mp->rss6); + rdma_create_if (vm, &args); + rv = args.rv; + + REPLY_MACRO2_END (VL_API_RDMA_CREATE_V4_REPLY, + ({ rmp->sw_if_index = args.sw_if_index; })); +} + +static void vl_api_rdma_create_v3_t_handler (vl_api_rdma_create_v3_t *mp) { vlib_main_t *vm = vlib_get_main (); @@ -103,7 +135,7 @@ vl_api_rdma_create_v3_t_handler (vl_api_rdma_create_v3_t *mp) rdma_create_if (vm, &args); rv = args.rv; - REPLY_MACRO2 (VL_API_RDMA_CREATE_V3_REPLY + rm->msg_id_base, + REPLY_MACRO2 (VL_API_RDMA_CREATE_V3_REPLY, ({ rmp->sw_if_index = ntohl (args.sw_if_index); })); } @@ -130,12 +162,8 @@ vl_api_rdma_create_v2_t_handler (vl_api_rdma_create_v2_t * mp) rdma_create_if (vm, &args); rv = args.rv; - /* *INDENT-OFF* */ - REPLY_MACRO2 (VL_API_RDMA_CREATE_V2_REPLY + rm->msg_id_base, - ({ - rmp->sw_if_index = ntohl (args.sw_if_index); - })); - /* *INDENT-ON* */ + REPLY_MACRO2 (VL_API_RDMA_CREATE_V2_REPLY, + ({ rmp->sw_if_index = ntohl (args.sw_if_index); })); } static void @@ -162,12 +190,8 @@ vl_api_rdma_create_t_handler (vl_api_rdma_create_t * mp) rdma_create_if (vm, &args); rv = args.rv; - /* *INDENT-OFF* */ - REPLY_MACRO2 (VL_API_RDMA_CREATE_REPLY + rm->msg_id_base, - ({ - rmp->sw_if_index = ntohl (args.sw_if_index); - })); - /* 
*INDENT-ON* */ + REPLY_MACRO2 (VL_API_RDMA_CREATE_REPLY, + ({ rmp->sw_if_index = ntohl (args.sw_if_index); })); } static void @@ -195,7 +219,7 @@ vl_api_rdma_delete_t_handler (vl_api_rdma_delete_t * mp) rdma_delete_if (vm, rd); reply: - REPLY_MACRO (VL_API_RDMA_DELETE_REPLY + rm->msg_id_base); + REPLY_MACRO (VL_API_RDMA_DELETE_REPLY); } /* set tup the API message handling tables */ diff --git a/src/plugins/rdma/cli.c b/src/plugins/rdma/cli.c index 8f191e34b63..bcedd625220 100644 --- a/src/plugins/rdma/cli.c +++ b/src/plugins/rdma/cli.c @@ -44,17 +44,15 @@ rdma_create_command_fn (vlib_main_t * vm, unformat_input_t * input, return args.error; } -/* *INDENT-OFF* */ VLIB_CLI_COMMAND (rdma_create_command, static) = { .path = "create interface rdma", .short_help = "create interface rdma <host-if ifname> [name <name>]" - " [rx-queue-size <size>] [tx-queue-size <size>]" - " [num-rx-queues <size>] [mode <auto|ibv|dv]" - " [no-multi-seg] [no-striding]" - " [max-pktlen <size>]", + " [rx-queue-size <size>] [tx-queue-size <size>]" + " [num-rx-queues <size>] [mode <auto|ibv|dv>]" + " [no-multi-seg] [no-striding]" + " [max-pktlen <size>]", .function = rdma_create_command_fn, }; -/* *INDENT-ON* */ static clib_error_t * rdma_delete_command_fn (vlib_main_t * vm, unformat_input_t * input, @@ -99,14 +97,12 @@ rdma_delete_command_fn (vlib_main_t * vm, unformat_input_t * input, return 0; } -/* *INDENT-OFF* */ VLIB_CLI_COMMAND (rdma_delete_command, static) = { .path = "delete interface rdma", .short_help = "delete interface rdma " "{<interface> | sw_if_index <sw_idx>}", .function = rdma_delete_command_fn, }; -/* *INDENT-ON* */ static clib_error_t * test_rdma_dump_command_fn (vlib_main_t * vm, unformat_input_t * input, @@ -162,13 +158,11 @@ test_rdma_dump_command_fn (vlib_main_t * vm, unformat_input_t * input, return 0; } -/* *INDENT-OFF* */ VLIB_CLI_COMMAND (test_rdma_mlx5dv_dump_command, static) = { .path = "test rdma dump", .short_help = "test rdma dump {<interface> | sw_if_index 
<sw_idx>}", .function = test_rdma_dump_command_fn, }; -/* *INDENT-ON* */ clib_error_t * rdma_cli_init (vlib_main_t * vm) diff --git a/src/plugins/rdma/device.c b/src/plugins/rdma/device.c index 1198d99b14e..8aeb586a42d 100644 --- a/src/plugins/rdma/device.c +++ b/src/plugins/rdma/device.c @@ -183,11 +183,11 @@ rdma_mac_change (vnet_hw_interface_t * hw, const u8 * old, const u8 * new) return 0; } -static u32 -rdma_dev_change_mtu (rdma_device_t * rd) +static clib_error_t * +rdma_set_max_frame_size (vnet_main_t *vnm, vnet_hw_interface_t *hw, + u32 frame_size) { - rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "MTU change not supported"); - return ~0; + return vnet_error (VNET_ERR_UNSUPPORTED, 0); } static u32 @@ -202,8 +202,6 @@ rdma_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags) return rdma_dev_set_ucast (rd); case ETHERNET_INTERFACE_FLAG_ACCEPT_ALL: return rdma_dev_set_promisc (rd); - case ETHERNET_INTERFACE_FLAG_MTU: - return rdma_dev_change_mtu (rd); } rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "unknown flag %x requested", flags); @@ -355,18 +353,20 @@ rdma_async_event_cleanup (rdma_device_t * rd) static clib_error_t * rdma_register_interface (vnet_main_t * vnm, rdma_device_t * rd) { - clib_error_t *err = - ethernet_register_interface (vnm, rdma_device_class.index, - rd->dev_instance, rd->hwaddr.bytes, - &rd->hw_if_index, rdma_flag_change); - + vnet_eth_interface_registration_t eir = {}; + + eir.dev_class_index = rdma_device_class.index; + eir.dev_instance = rd->dev_instance; + eir.address = rd->hwaddr.bytes; + eir.cb.flag_change = rdma_flag_change; + eir.cb.set_max_frame_size = rdma_set_max_frame_size; + rd->hw_if_index = vnet_eth_register_interface (vnm, &eir); /* Indicate ability to support L3 DMAC filtering and * initialize interface to L3 non-promisc mode */ - vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, rd->hw_if_index); - hi->caps |= VNET_HW_INTERFACE_CAP_SUPPORTS_MAC_FILTER; + vnet_hw_if_set_caps (vnm, rd->hw_if_index, 
VNET_HW_IF_CAP_MAC_FILTER); ethernet_set_flags (vnm, rd->hw_if_index, ETHERNET_INTERFACE_FLAG_DEFAULT_L3); - return err; + return 0; } static void @@ -445,9 +445,10 @@ rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc, if (is_mlx5dv) { struct mlx5dv_cq_init_attr dvcq = { }; - dvcq.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE; + dvcq.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE | + MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE; dvcq.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH; - + dvcq.cqe_size = 64; if ((cqex = mlx5dv_create_cq (rd->ctx, &cqa, &dvcq)) == 0) return clib_error_return_unix (0, "Create mlx5dv rx CQ Failed"); } @@ -717,15 +718,30 @@ rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc) struct ibv_qp_init_attr qpia; struct ibv_qp_attr qpa; int qp_flags; + int is_mlx5dv = !!(rd->flags & RDMA_DEVICE_F_MLX5DV); vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES); txq = vec_elt_at_index (rd->txqs, qid); ASSERT (is_pow2 (n_desc)); txq->bufs_log2sz = min_log2 (n_desc); vec_validate_aligned (txq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES); - - if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0) - return clib_error_return_unix (0, "Create CQ Failed"); + if (is_mlx5dv) + { + struct ibv_cq_init_attr_ex cqa = {}; + struct ibv_cq_ex *cqex; + struct mlx5dv_cq_init_attr dvcq = {}; + dvcq.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE; + dvcq.cqe_size = 64; + cqa.cqe = n_desc; + if ((cqex = mlx5dv_create_cq (rd->ctx, &cqa, &dvcq)) == 0) + return clib_error_return_unix (0, "Create mlx5dv tx CQ Failed"); + txq->cq = ibv_cq_ex_to_cq (cqex); + } + else + { + if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0) + return clib_error_return_unix (0, "Create CQ Failed"); + } memset (&qpia, 0, sizeof (qpia)); qpia.send_cq = txq->cq; @@ -866,7 +882,7 @@ sysfs_path_to_pci_addr (char *path, vlib_pci_addr_t * addr) unformat_input_t in; u8 *s; - s = clib_sysfs_link_to_name (path); + s = 
clib_file_get_resolved_basename (path); if (!s) return 0; @@ -1022,7 +1038,7 @@ are explicitly disabled, and if the interface supports it.*/ /* * FIXME: add support for interrupt mode * vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, rd->hw_if_index); - * hw->caps |= VNET_HW_INTERFACE_CAP_SUPPORTS_INT_MODE; + * hw->caps |= VNET_HW_IF_CAP_INT_MODE; */ vnet_hw_if_set_input_node (vnm, rd->hw_if_index, rdma_input_node.index); @@ -1136,15 +1152,4 @@ rdma_init (vlib_main_t * vm) return 0; } -VLIB_INIT_FUNCTION (rdma_init) = -{ - .runs_after = VLIB_INITS ("pci_bus_init"), -}; - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ +VLIB_INIT_FUNCTION (rdma_init); diff --git a/src/plugins/rdma/format.c b/src/plugins/rdma/format.c index aada52a1ec3..a999460bd55 100644 --- a/src/plugins/rdma/format.c +++ b/src/plugins/rdma/format.c @@ -58,13 +58,13 @@ format_rdma_bit_flag (u8 * s, va_list * args) while (flags) { - if ((flags & (1 << i))) + if ((flags & ((u64) 1 << i))) { if (i < n_strs && strs[i] != 0) s = format (s, " %s", strs[i]); else s = format (s, " unknown(%u)", i); - flags ^= 1 << i; + flags ^= (u64) 1 << i; } i++; } @@ -122,8 +122,8 @@ format_rdma_device (u8 * s, va_list * args) format_vlib_pci_addr, &rd->pci->addr); if ((d = vlib_pci_get_device_info (vm, &rd->pci->addr, 0))) { - s = format (s, "%Uproduct name: %s\n", format_white_space, indent, - d->product_name ? 
(char *) d->product_name : ""); + s = format (s, "%Uproduct name: %v\n", format_white_space, indent, + d->product_name); s = format (s, "%Upart number: %U\n", format_white_space, indent, format_vlib_pci_vpd, d->vpd_r, "PN"); s = format (s, "%Urevision: %U\n", format_white_space, indent, @@ -281,7 +281,7 @@ format_rdma_rxq (u8 * s, va_list * args) if (rd->flags & RDMA_DEVICE_F_MLX5DV) { - u32 next_cqe_index = rxq->cq_ci & (rxq->size - 1); + u32 next_cqe_index = rxq->cq_ci & ((1 << rxq->log2_cq_size) - 1); s = format (s, "\n%Uwq: stride %u wqe-cnt %u", format_white_space, indent + 2, rxq->wq_stride, rxq->wqe_cnt); @@ -292,9 +292,8 @@ format_rdma_rxq (u8 * s, va_list * args) next_cqe_index); s = format (s, "\n%U%U", format_white_space, indent + 6, format_mlx5_cqe_rx, rxq->cqes + next_cqe_index); - s = format (s, "\n%U%U", format_white_space, indent + 6, - format_hexdump, rxq->cqes + next_cqe_index, - sizeof (mlx5dv_cqe_t)); + s = format (s, "\n%U%U", format_white_space, indent + 6, format_hexdump, + rxq->cqes + next_cqe_index, (u32) sizeof (mlx5dv_cqe_t)); } return s; diff --git a/src/plugins/rdma/input.c b/src/plugins/rdma/input.c index f1c508affa2..a7d41a1684d 100644 --- a/src/plugins/rdma/input.c +++ b/src/plugins/rdma/input.c @@ -228,7 +228,6 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd, about what RDMA core does (CYCLIC_RQ or LINKED_LIST_RQ). In cyclic mode, the SRQ header is ignored anyways... */ -/* *INDENT-OFF* */ if (is_striding && !(current_data_seg & (wqe_sz - 1))) *(mlx5dv_wqe_srq_next_t *) wqe = (mlx5dv_wqe_srq_next_t) { @@ -237,7 +236,6 @@ rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd, .signature = 0, .rsvd1 = {0} }; -/* *INDENT-ON* */ /* TODO: when log_skip_wqe > 2, hw_prefetcher doesn't work, lots of LLC store misses occur for wqes, to be fixed... 
*/ @@ -609,6 +607,7 @@ rdma_device_poll_cq_mlx5dv (rdma_device_t * rd, rdma_rxq_t * rxq, n_rx_packets++; cq_ci++; byte_cnt++; + cqe_flags++; continue; } @@ -670,46 +669,77 @@ rdma_device_mlx5dv_l3_validate_and_swap_bc (rdma_per_thread_data_t * ptd, int n_rx_packets, u32 * bc) { u16 mask = CQE_FLAG_L3_HDR_TYPE_MASK | CQE_FLAG_L3_OK; - u16 match = CQE_FLAG_L3_HDR_TYPE_IP4 << CQE_FLAG_L3_HDR_TYPE_SHIFT; + u16 match = + CQE_FLAG_L3_HDR_TYPE_IP4 << CQE_FLAG_L3_HDR_TYPE_SHIFT | CQE_FLAG_L3_OK; + + /* convert mask/match to big endian for subsequent comparison */ + mask = clib_host_to_net_u16 (mask); + match = clib_host_to_net_u16 (match); /* verify that all ip4 packets have l3_ok flag set and convert packet length from network to host byte order */ int skip_ip4_cksum = 1; + int n_left = n_rx_packets; + u16 *cqe_flags = ptd->cqe_flags; #if defined CLIB_HAVE_VEC256 - u16x16 mask16 = u16x16_splat (mask); - u16x16 match16 = u16x16_splat (match); - u16x16 r = { }; + if (n_left >= 16) + { + u16x16 mask16 = u16x16_splat (mask); + u16x16 match16 = u16x16_splat (match); + u16x16 r16 = {}; + + while (n_left >= 16) + { + r16 |= (*(u16x16 *) cqe_flags & mask16) != match16; - for (int i = 0; i * 16 < n_rx_packets; i++) - r |= (ptd->cqe_flags16[i] & mask16) != match16; + *(u32x8 *) bc = u32x8_byte_swap (*(u32x8 *) bc); + *(u32x8 *) (bc + 8) = u32x8_byte_swap (*(u32x8 *) (bc + 8)); - if (!u16x16_is_all_zero (r)) - skip_ip4_cksum = 0; + cqe_flags += 16; + bc += 16; + n_left -= 16; + } - for (int i = 0; i < n_rx_packets; i += 8) - *(u32x8 *) (bc + i) = u32x8_byte_swap (*(u32x8 *) (bc + i)); + if (!u16x16_is_all_zero (r16)) + skip_ip4_cksum = 0; + } #elif defined CLIB_HAVE_VEC128 - u16x8 mask8 = u16x8_splat (mask); - u16x8 match8 = u16x8_splat (match); - u16x8 r = { }; + if (n_left >= 8) + { + u16x8 mask8 = u16x8_splat (mask); + u16x8 match8 = u16x8_splat (match); + u16x8 r8 = {}; - for (int i = 0; i * 8 < n_rx_packets; i++) - r |= (ptd->cqe_flags8[i] & mask8) != match8; + while (n_left >= 
8) + { + r8 |= (*(u16x8 *) cqe_flags & mask8) != match8; - if (!u16x8_is_all_zero (r)) - skip_ip4_cksum = 0; + *(u32x4 *) bc = u32x4_byte_swap (*(u32x4 *) bc); + *(u32x4 *) (bc + 4) = u32x4_byte_swap (*(u32x4 *) (bc + 4)); - for (int i = 0; i < n_rx_packets; i += 4) - *(u32x4 *) (bc + i) = u32x4_byte_swap (*(u32x4 *) (bc + i)); -#else - for (int i = 0; i < n_rx_packets; i++) - if ((ptd->cqe_flags[i] & mask) != match) - skip_ip4_cksum = 0; + cqe_flags += 8; + bc += 8; + n_left -= 8; + } - for (int i = 0; i < n_rx_packets; i++) - bc[i] = clib_net_to_host_u32 (bc[i]); + if (!u16x8_is_all_zero (r8)) + skip_ip4_cksum = 0; + } #endif + + while (n_left >= 1) + { + if ((cqe_flags[0] & mask) != match) + skip_ip4_cksum = 0; + + bc[0] = clib_net_to_host_u32 (bc[0]); + + cqe_flags += 1; + bc += 1; + n_left -= 1; + } + return skip_ip4_cksum; } @@ -945,7 +975,7 @@ rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* update buffer template for input feature arcs if any */ next_index = rd->per_interface_next_index; if (PREDICT_FALSE (vnet_device_input_have_features (rd->sw_if_index))) - vnet_feature_start_device_input_x1 (rd->sw_if_index, &next_index, &bt); + vnet_feature_start_device_input (rd->sw_if_index, &next_index, &bt); vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next); @@ -1028,7 +1058,7 @@ VLIB_NODE_FN (rdma_input_node) (vlib_main_t * vm, if (PREDICT_TRUE (rd->flags & RDMA_DEVICE_F_ADMIN_UP) == 0) continue; - if (PREDICT_TRUE (rd->flags & RDMA_DEVICE_F_ERROR)) + if (PREDICT_FALSE (rd->flags & RDMA_DEVICE_F_ERROR)) continue; if (PREDICT_TRUE (rd->flags & RDMA_DEVICE_F_MLX5DV)) @@ -1041,7 +1071,6 @@ VLIB_NODE_FN (rdma_input_node) (vlib_main_t * vm, return n_rx; } -/* *INDENT-OFF* */ VLIB_REGISTER_NODE (rdma_input_node) = { .name = "rdma-input", .flags = VLIB_NODE_FLAG_TRACE_SUPPORTED, @@ -1053,7 +1082,6 @@ VLIB_REGISTER_NODE (rdma_input_node) = { .error_strings = rdma_input_error_strings, }; -/* *INDENT-ON* */ /* diff --git 
a/src/plugins/rdma/plugin.c b/src/plugins/rdma/plugin.c index b0dddee42b6..0d2cccc96f8 100644 --- a/src/plugins/rdma/plugin.c +++ b/src/plugins/rdma/plugin.c @@ -19,12 +19,10 @@ #include <vnet/plugin/plugin.h> #include <vpp/app/version.h> -/* *INDENT-OFF* */ VLIB_PLUGIN_REGISTER () = { .version = VPP_BUILD_VER, .description = "RDMA IBverbs Device Driver", }; -/* *INDENT-ON* */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/plugins/rdma/rdma.api b/src/plugins/rdma/rdma.api index f2c70c7e514..4c06d8c6658 100644 --- a/src/plugins/rdma/rdma.api +++ b/src/plugins/rdma/rdma.api @@ -98,6 +98,8 @@ enum rdma_rss6 }; /** \brief + Same as v4, just not an autoendian (expect buggy handling of flag values). + @param client_index - opaque cookie to identify the sender @param context - sender context, to match reply w/ request @param host_if - Linux netdev interface name @@ -114,6 +116,9 @@ enum rdma_rss6 define rdma_create_v3 { + option deprecated; + option replaced_by="rdma_create_v4"; + u32 client_index; u32 context; @@ -130,6 +135,38 @@ define rdma_create_v3 option vat_help = "<host-if ifname> [name <name>] [rx-queue-size <size>] [tx-queue-size <size>] [num-rx-queues <size>] [mode <auto|ibv|dv>] [no-multi-seg] [max-pktlen <size>] [rss <ipv4|ipv4-udp|ipv4-tcp>] [rss <ipv6|ipv6-udp|ipv6-tcp>]"; }; +/** \brief + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param host_if - Linux netdev interface name + @param name - new rdma interface name + @param rxq_num - number of receive queues (optional) + @param rxq_size - receive queue size (optional) + @param txq_size - transmit queue size (optional) + @param mode - operation mode (optional) + @param no_multi_seg (optional) - disable chained buffer RX + @param max_pktlen (optional) - maximal RX packet size. 
+ @param rss4 (optional) - IPv4 RSS + @param rss6 (optional) - IPv6 RSS +*/ + +autoendian define rdma_create_v4 +{ + u32 client_index; + u32 context; + + string host_if[64]; + string name[64]; + u16 rxq_num [default=1]; + u16 rxq_size [default=1024]; + u16 txq_size [default=1024]; + vl_api_rdma_mode_t mode [default=0]; + bool no_multi_seg [default=0]; + u16 max_pktlen [default=0]; + vl_api_rdma_rss4_t rss4 [default=0]; + vl_api_rdma_rss6_t rss6 [default=0]; + option vat_help = "<host-if ifname> [name <name>] [rx-queue-size <size>] [tx-queue-size <size>] [num-rx-queues <size>] [mode <auto|ibv|dv>] [no-multi-seg] [max-pktlen <size>] [rss <ipv4|ipv4-udp|ipv4-tcp>] [rss <ipv6|ipv6-udp|ipv6-tcp>]"; +}; /** \brief @param context - sender context, to match reply w/ request @@ -139,6 +176,8 @@ define rdma_create_v3 define rdma_create_reply { + option deprecated; + u32 context; i32 retval; vl_api_interface_index_t sw_if_index; @@ -152,6 +191,8 @@ define rdma_create_reply define rdma_create_v2_reply { + option deprecated; + u32 context; i32 retval; vl_api_interface_index_t sw_if_index; @@ -176,6 +217,19 @@ define rdma_create_v3_reply @param sw_if_index - interface index */ +autoendian define rdma_create_v4_reply +{ + u32 context; + i32 retval; + vl_api_interface_index_t sw_if_index; +}; + +/** \brief + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param sw_if_index - interface index +*/ + autoreply define rdma_delete { u32 client_index; diff --git a/src/plugins/rdma/rdma_doc.md b/src/plugins/rdma/rdma_doc.md deleted file mode 100644 index 3fed5b6fc49..00000000000 --- a/src/plugins/rdma/rdma_doc.md +++ /dev/null @@ -1,75 +0,0 @@ -# RDMA (ibverb) Ethernet driver {#rdma_doc} - -This driver relies on Linux rdma-core (libibverb) userspace poll-mode driver -to rx/tx Ethernet packets. 
Despite using the RDMA APIs, this is **not** about -RDMA (no Infiniband, no RoCE, no iWARP), just pure traditional Ethernet -packets. - -## Maturity level -Under development: it should work, but has not been thoroughly tested. - -## Supported Hardware - - Mellanox ConnectX-4 - - Mellanox ConnectX-5 - -## Features - - bifurcation: MAC based flow steering for transparent sharing of a single -physical port between multiple virtual interfaces including Linux netdev - - multiqueue - -## Security considerations -When creating a rdma interface, it will receive all packets to the MAC address -attributed to the interface plus a copy of all broadcast and multicast -traffic. -The MAC address is under the control of VPP: **the user controlling VPP can -divert all traffic of any MAC address to the VPP process, including the Linux -netdev MAC address as long as it can create a rdma interface**. -The rights to create a rdma interface are controlled by the access rights of -the `/dev/infiniband/uverbs[0-9]+`device nodes. - -## Quickstart -1. Make sure the `ib_uverbs` module is loaded: -``` -~# modprobe ib_uverbs -``` -2. In VPP, create a new rdma virtual interface tied to the Linux netdev of the -physical port you want to use (`enp94s0f0` in this example): -``` -vpp# create int rdma host-if enp94s0f0 name rdma-0 -``` -3. Use the interface as usual, eg.: -``` -vpp# set int ip addr rdma-0 1.1.1.1/24 -vpp# set int st rdma-0 up -vpp# ping 1.1.1.100` -``` - -## Containers support -It should work in containers as long as: - - the `ib_uverbs` module is loaded - - the device nodes `/dev/infiniband/uverbs[0-9]+` are usable from the - container (but see [security considerations](#Security considerations)) - -## SR-IOV VFs support -It should work on SR-IOV VFs the same way it does with PFs. Because of VFs -security containment features, make sure the MAC address of the rdma VPP -interface matches the MAC address assigned to the underlying VF. 
-For example: -``` -host# echo 1 > /sys/class/infiniband/mlx5_0/device/sriov_numvfs -host# ip l set dev enp94s0f0 vf 0 mac 92:5d:f5:df:b1:6f spoof on trust off -host# ip l set dev enp94s0f2 up -vpp# create int rdma host-if enp94s0f2 name rdma-0 -vpp# set int mac address rdma-0 92:5d:f5:df:b1:6f -``` -If you plan to use L2 features such as switching, make sure the underlying -VF is configured in trusted mode and spoof-checking is disabled (of course, be -aware of the [security considerations](#Security considerations)): -``` -host# ip l set dev enp94s0f0 vf 0 spoof off trust on -``` - -## Direct Verb mode -Direct Verb allows the driver to access the NIC HW RX/TX rings directly -instead of having to go through libibverb and suffering associated overhead. -It will be automatically selected if the adapter supports it. diff --git a/src/plugins/rdma/rdma_doc.rst b/src/plugins/rdma/rdma_doc.rst new file mode 100644 index 00000000000..c22ea550a75 --- /dev/null +++ b/src/plugins/rdma/rdma_doc.rst @@ -0,0 +1,102 @@ +RDMA (ibverb) device driver +=========================== + +This driver relies on Linux rdma-core (libibverb) userspace poll-mode +driver to rx/tx Ethernet packets. Despite using the RDMA APIs, this is +**not** about RDMA (no Infiniband, no RoCE, no iWARP), just pure +traditional Ethernet packets. + +Maturity level +-------------- + +Under development: it should work, but has not been thoroughly tested. + +Supported Hardware +------------------ + +- Mellanox ConnectX-4 +- Mellanox ConnectX-5 + +Features +-------- + +- bifurcation: MAC based flow steering for transparent sharing of a + single physical port between multiple virtual interfaces including + Linux netdev +- multiqueue + +Security considerations +----------------------- + +When creating a rdma interface, it will receive all packets to the MAC +address attributed to the interface plus a copy of all broadcast and +multicast traffic. 
The MAC address is under the control of VPP: **the +user controlling VPP can divert all traffic of any MAC address to the +VPP process, including the Linux netdev MAC address as long as it can +create a rdma interface**. The rights to create a rdma interface are +controlled by the access rights of the +``/dev/infiniband/uverbs[0-9]+`` device nodes. + +Quickstart +---------- + +1. Make sure the ``ib_uverbs`` module is loaded: + +:: + + ~# modprobe ib_uverbs + +2. In VPP, create a new rdma virtual interface tied to the Linux netdev + of the physical port you want to use (``enp94s0f0`` in this example): + +:: + + vpp# create int rdma host-if enp94s0f0 name rdma-0 + +3. Use the interface as usual, e.g.: + +:: + + vpp# set int ip addr rdma-0 1.1.1.1/24 + vpp# set int st rdma-0 up + vpp# ping 1.1.1.100 + +Containers support +------------------ + +It should work in containers as long as: - the ``ib_uverbs`` module is +loaded - the device nodes ``/dev/infiniband/uverbs[0-9]+`` are usable +from the container (but see `security +considerations <#Security%20considerations>`__) + +SR-IOV VFs support +------------------ + +It should work on SR-IOV VFs the same way it does with PFs. Because of +VFs security containment features, make sure the MAC address of the rdma +VPP interface matches the MAC address assigned to the underlying VF. 
For +example: + +:: + + host# echo 1 > /sys/class/infiniband/mlx5_0/device/sriov_numvfs + host# ip l set dev enp94s0f0 vf 0 mac 92:5d:f5:df:b1:6f spoof on trust off + host# ip l set dev enp94s0f2 up + vpp# create int rdma host-if enp94s0f2 name rdma-0 + vpp# set int mac address rdma-0 92:5d:f5:df:b1:6f + +If you plan to use L2 features such as switching, make sure the +underlying VF is configured in trusted mode and spoof-checking is +disabled (of course, be aware of the `security +considerations <#Security%20considerations>`__): + +:: + + host# ip l set dev enp94s0f0 vf 0 spoof off trust on + +Direct Verb mode +---------------- + +Direct Verb allows the driver to access the NIC HW RX/TX rings directly +instead of having to go through libibverb and suffering associated +overhead. It will be automatically selected if the adapter supports it. diff --git a/src/plugins/rdma/rdma_mlx5dv.h b/src/plugins/rdma/rdma_mlx5dv.h index efcefe7fbf7..bf01a3a37d6 100644 --- a/src/plugins/rdma/rdma_mlx5dv.h +++ b/src/plugins/rdma/rdma_mlx5dv.h @@ -24,16 +24,16 @@ #include <vppinfra/types.h> #include <vppinfra/error.h> /* CQE flags - bits 16-31 of qword at offset 0x1c */ -#define CQE_FLAG_L4_OK 10 -#define CQE_FLAG_L3_OK 9 -#define CQE_FLAG_L2_OK 8 -#define CQE_FLAG_IP_FRAG 7 +#define CQE_FLAG_L4_OK (1 << 10) +#define CQE_FLAG_L3_OK (1 << 9) +#define CQE_FLAG_L2_OK (1 << 8) +#define CQE_FLAG_IP_FRAG (1 << 7) #define CQE_FLAG_L4_HDR_TYPE(f) (((f) >> 4) & 7) #define CQE_FLAG_L3_HDR_TYPE_SHIFT (2) #define CQE_FLAG_L3_HDR_TYPE_MASK (3 << CQE_FLAG_L3_HDR_TYPE_SHIFT) #define CQE_FLAG_L3_HDR_TYPE(f) (((f) & CQE_FLAG_L3_HDR_TYPE_MASK) >> CQE_FLAG_L3_HDR_TYPE_SHIFT) -#define CQE_FLAG_L3_HDR_TYPE_IP4 1 -#define CQE_FLAG_L3_HDR_TYPE_IP6 2 +#define CQE_FLAG_L3_HDR_TYPE_IP4 2 +#define CQE_FLAG_L3_HDR_TYPE_IP6 1 #define CQE_FLAG_IP_EXT_OPTS 1 /* CQE byte count (Striding RQ) */ diff --git a/src/plugins/rdma/test_api.c b/src/plugins/rdma/test_api.c index e9d5fcaad98..4ec4d3bf345 100644 --- 
a/src/plugins/rdma/test_api.c +++ b/src/plugins/rdma/test_api.c @@ -189,6 +189,41 @@ api_rdma_create_v3 (vat_main_t *vam) return ret; } +static int +api_rdma_create_v4 (vat_main_t *vam) +{ + vl_api_rdma_create_v4_t *mp; + rdma_create_if_args_t args; + int ret; + + if (!unformat_user (vam->input, unformat_rdma_create_if_args, &args)) + { + clib_warning ("unknown input `%U'", format_unformat_error, vam->input); + return -99; + } + + M (RDMA_CREATE_V4, mp); + + snprintf ((char *) mp->host_if, sizeof (mp->host_if), "%s", args.ifname); + if (args.name) + snprintf ((char *) mp->name, sizeof (mp->name), "%s", args.name); + else + mp->name[0] = 0; + mp->rxq_num = args.rxq_num; + mp->rxq_size = args.rxq_size; + mp->txq_size = args.txq_size; + mp->mode = api_rdma_mode (args.mode); + mp->no_multi_seg = args.no_multi_seg; + mp->max_pktlen = args.max_pktlen; + mp->rss4 = api_rdma_rss4 (args.rss4); + mp->rss6 = api_rdma_rss6 (args.rss6); + + S (mp); + W (ret); + + return ret; +} + /* rdma-create reply handler */ static void vl_api_rdma_create_reply_t_handler (vl_api_rdma_create_reply_t * mp) @@ -243,6 +278,24 @@ vl_api_rdma_create_v3_reply_t_handler (vl_api_rdma_create_v3_reply_t *mp) vam->regenerate_interface_table = 1; } +/* rdma-create reply handler v4 */ +static void +vl_api_rdma_create_v4_reply_t_handler (vl_api_rdma_create_v4_reply_t *mp) +{ + vat_main_t *vam = rdma_test_main.vat_main; + i32 retval = mp->retval; + + if (retval == 0) + { + fformat (vam->ofp, "created rdma with sw_if_index %d\n", + ntohl (mp->sw_if_index)); + } + + vam->retval = retval; + vam->result_ready = 1; + vam->regenerate_interface_table = 1; +} + /* rdma delete API */ static int api_rdma_delete (vat_main_t * vam) |