/*
 *------------------------------------------------------------------
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

#ifndef _RDMA_H_
#define _RDMA_H_

#include <infiniband/verbs.h>
#include <vlib/log.h>
#include <vlib/pci/pci.h>
#include <vnet/interface.h>
#include <vnet/ethernet/mac_address.h>
#include <rdma/rdma_mlx5dv.h>

#define foreach_rdma_device_flags \
  _(0, ERROR, "error") \
  _(1, ADMIN_UP, "admin-up") \
  _(2, LINK_UP, "link-up") \
  _(3, PROMISC, "promiscuous") \
  _(4, MLX5DV, "mlx5dv") \
  _(5, STRIDING_RQ, "striding-rq")

enum
{
#define _(a, b, c) RDMA_DEVICE_F_##b = (1 << a),
  foreach_rdma_device_flags
#undef _
};

#ifndef MLX5_ETH_L2_INLINE_HEADER_SIZE
#define MLX5_ETH_L2_INLINE_HEADER_SIZE 18
#endif

typedef struct
{
  CLIB_ALIGN_MARK (align0, MLX5_SEND_WQE_BB);
  union
  {
    struct mlx5_wqe_ctrl_seg ctrl;
    struct
    {
      u8 opc_mod;
      u8 wqe_index_hi;
      u8 wqe_index_lo;
      u8 opcode;
    };
  };
  struct mlx5_wqe_eth_seg eseg;
  struct mlx5_wqe_data_seg dseg;
} rdma_mlx5_wqe_t;

#define RDMA_MLX5_WQE_SZ sizeof (rdma_mlx5_wqe_t)
#define RDMA_MLX5_WQE_DS (RDMA_MLX5_WQE_SZ / sizeof (struct mlx5_wqe_data_seg))
STATIC_ASSERT (RDMA_MLX5_WQE_SZ == MLX5_SEND_WQE_BB &&
	       RDMA_MLX5_WQE_SZ % sizeof (struct mlx5_wqe_data_seg) == 0,
	       "bad size");

typedef struct
{
  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
  struct ibv_cq *cq;
  struct ibv_wq *wq;
  u32 *bufs;
  u32 size;
  u32 head;
  u32 tail;
  u32 cq_ci;
  u16 log2_cq_size;
  u16 n_mini_cqes;
  u16 n_mini_cqes_left;
  u16 last_cqe_flags;
  mlx5dv_cqe_t *cqes;
  mlx5dv_wqe_ds_t *wqes;

  CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
  volatile u32 *wq_db;
  volatile u32 *cq_db;
  u32 cqn;
  u32 wqe_cnt;
  u32 wq_stride;
  u32 buf_sz;
  u32 queue_index;
  union
  {
    struct
    {
      u32 striding_wqe_tail;	/* Striding RQ: number of released whole WQE */
      u8 log_stride_per_wqe;	/* Striding RQ: number of strides in a single WQE */
    };
    struct
    {
      u8 *n_used_per_chain;	/* Legacy RQ: for each buffer chain, how many
				   additional segments are needed */
      u32 *second_bufs;		/* Legacy RQ: ring of second buffers of each chain */
      u32 incomplete_tail;	/* Legacy RQ: tail index in bufs, corresponds to
				   buffer chains with recycled valid head buffer,
				   but whose other buffers are not yet recycled
				   (due to pool exhaustion). */
      u16 n_total_additional_segs;
      u8 n_ds_per_wqe;		/* Legacy RQ: number of nonnull data segs per WQE */
    };
  };
  u8 log_wqe_sz;		/* log-size of a single WQE (in data segments) */
} rdma_rxq_t;

typedef struct
{
  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);

  /* following fields are accessed in datapath */
  clib_spinlock_t lock;

  union
  {
    struct
    {
      /* ibverb datapath. Cache of cq, sq below */
      struct ibv_cq *ibv_cq;
      struct ibv_qp *ibv_qp;
    };
    struct
    {
      /* direct verbs datapath */
      rdma_mlx5_wqe_t *dv_sq_wqes;
      volatile u32 *dv_sq_dbrec;
      volatile u64 *dv_sq_db;
      struct mlx5_cqe64 *dv_cq_cqes;
      volatile u32 *dv_cq_dbrec;
    };
  };

  u32 *bufs;			/* vlib_buffer ring buffer */
  u16 head;
  u16 tail;
  u16 dv_cq_idx;		/* monotonic CQE index (valid only for direct verbs) */
  u8 bufs_log2sz;		/* log2 vlib_buffer entries */
  u8 dv_sq_log2sz:4;		/* log2 SQ WQE entries (valid only for direct verbs) */
  u8 dv_cq_log2sz:4;		/* log2 CQ CQE entries (valid only for direct verbs) */

  STRUCT_MARK (cacheline1);

  /* WQE template (valid only for direct verbs) */
  u8 dv_wqe_tmpl[64];

  /* end of 2nd 64-bytes cacheline (or 1st 128-bytes cacheline) */
  STRUCT_MARK (cacheline2);

  /* fields below are not accessed in datapath */
  struct ibv_cq *cq;
  struct ibv_qp *qp;
} rdma_txq_t;

STATIC_ASSERT_OFFSET_OF (rdma_txq_t, cacheline1, 64);
STATIC_ASSERT_OFFSET_OF (rdma_txq_t, cacheline2, 128);

#define RDMA_TXQ_DV_INVALID_ID 0xffffffff

#define RDMA_TXQ_BUF_SZ(txq)   (1U << (txq)->bufs_log2sz)
#define RDMA_TXQ_DV_SQ_SZ(txq) (1U << (txq)->dv_sq_log2sz)
#define RDMA_TXQ_DV_CQ_SZ(txq) (1U << (txq)->dv_cq_log2sz)

#define RDMA_TXQ_USED_SZ(head, tail) ((u16) ((u16) (tail) - (u16) (head)))
#define RDMA_TXQ_AVAIL_SZ(txq, head, tail) \
  ((u16) (RDMA_TXQ_BUF_SZ (txq) - RDMA_TXQ_USED_SZ (head, tail)))
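/*
 * Worked example (illustrative only, not used by the code): head and tail
 * are free-running u16 counters, so RDMA_TXQ_USED_SZ relies on u16
 * wrap-around arithmetic rather than a modulo on the ring size. Assuming a
 * ring with bufs_log2sz = 8 (256 slots), head = 0xfff0 and tail = 0x0010:
 *
 *   RDMA_TXQ_USED_SZ (0xfff0, 0x0010)   = (u16) (0x0010 - 0xfff0) = 32
 *   RDMA_TXQ_AVAIL_SZ (txq, head, tail) = 256 - 32 = 224
 */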
#define RDMA_RXQ_MAX_CHAIN_LOG_SZ 3	/* This should NOT be lower than 3! */
#define RDMA_RXQ_MAX_CHAIN_SZ (1U << RDMA_RXQ_MAX_CHAIN_LOG_SZ)
#define RDMA_RXQ_LEGACY_MODE_MAX_CHAIN_SZ 5

typedef enum
{
  RDMA_RSS4_AUTO = 0,
  RDMA_RSS4_IP,
  RDMA_RSS4_IP_UDP,
  RDMA_RSS4_IP_TCP,
} rdma_rss4_t;

typedef enum
{
  RDMA_RSS6_AUTO = 0,
  RDMA_RSS6_IP,
  RDMA_RSS6_IP_UDP,
  RDMA_RSS6_IP_TCP,
} rdma_rss6_t;

typedef struct
{
  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);

  /* following fields are accessed in datapath */
  rdma_rxq_t *rxqs;
  rdma_txq_t *txqs;
  u32 flags;
  u32 per_interface_next_index;
  u32 sw_if_index;
  u32 hw_if_index;
  u32 lkey;			/* cache of mr->lkey */
  u8 pool;			/* buffer pool index */

  /* fields below are not accessed in datapath */
  vlib_pci_device_info_t *pci;
  u8 *name;
  u8 *linux_ifname;
  mac_address_t hwaddr;
  u32 async_event_clib_file_index;
  u32 dev_instance;
  rdma_rss4_t rss4;
  rdma_rss6_t rss6;

  struct ibv_context *ctx;
  struct ibv_pd *pd;
  struct ibv_mr *mr;
  struct ibv_qp *rx_qp4;
  struct ibv_qp *rx_qp6;
  struct ibv_rwq_ind_table *rx_rwq_ind_tbl;
  struct ibv_flow *flow_ucast4;
  struct ibv_flow *flow_mcast4;
  struct ibv_flow *flow_ucast6;
  struct ibv_flow *flow_mcast6;

  clib_error_t *error;
} rdma_device_t;

typedef struct
{
  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
  union
  {
    u16 cqe_flags[VLIB_FRAME_SIZE];
    u16x8 cqe_flags8[VLIB_FRAME_SIZE / 8];
    u16x16 cqe_flags16[VLIB_FRAME_SIZE / 16];
  };
  union
  {
    struct
    {
      u32 current_segs[VLIB_FRAME_SIZE];
      u32 to_free_buffers[VLIB_FRAME_SIZE];
    };				/* Specific to STRIDING RQ mode */
    struct
    {
      u32 tmp_bi[VLIB_FRAME_SIZE];
      vlib_buffer_t *tmp_bufs[VLIB_FRAME_SIZE];
    };				/* Specific to LEGACY RQ mode */
  };
  vlib_buffer_t buffer_template;
} rdma_per_thread_data_t;

typedef struct
{
  rdma_per_thread_data_t *per_thread_data;
  rdma_device_t *devices;
  vlib_log_class_t log_class;
  u16 msg_id_base;
} rdma_main_t;

extern rdma_main_t rdma_main;

typedef enum
{
  RDMA_MODE_AUTO = 0,
  RDMA_MODE_IBV,
  RDMA_MODE_DV,
} rdma_mode_t;

typedef struct
{
  u8 *ifname;
  u8 *name;
  u32 rxq_size;
  u32 txq_size;
  u32 rxq_num;
  rdma_mode_t mode;
  u8 no_multi_seg;
  u8 disable_striding_rq;
  u16 max_pktlen;
  rdma_rss4_t rss4;
  rdma_rss6_t rss6;

  /* return */
  int rv;
  u32 sw_if_index;
  clib_error_t *error;
} rdma_create_if_args_t;

void rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args);
void rdma_delete_if (vlib_main_t * vm, rdma_device_t * rd);
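/*
 * Minimal usage sketch (illustrative only, not code from this plugin):
 * a caller holding a vlib_main_t pointer -- e.g. a CLI or binary API
 * handler -- would typically fill rdma_create_if_args_t and then check the
 * returned rv/error fields. The interface name and sizes below are purely
 * illustrative; in-tree callers normally build these arguments via
 * unformat_rdma_create_if_args.
 *
 *   rdma_create_if_args_t args = {
 *     .ifname = (u8 *) "enp1s0f0",
 *     .rxq_size = 1024,
 *     .txq_size = 1024,
 *     .rxq_num = 2,
 *     .mode = RDMA_MODE_AUTO,
 *   };
 *   rdma_create_if (vm, &args);
 *   if (args.rv != 0)
 *     clib_error_report (args.error);
 *   // on success, args.sw_if_index refers to the new interface
 */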
extern vlib_node_registration_t rdma_input_node;
extern vnet_device_class_t rdma_device_class;

format_function_t format_rdma_device;
format_function_t format_rdma_device_name;
format_function_t format_rdma_input_trace;
format_function_t format_rdma_rxq;
unformat_function_t unformat_rdma_create_if_args;

typedef struct
{
  u32 next_index;
  u32 hw_if_index;
  u16 cqe_flags;
} rdma_input_trace_t;

#define foreach_rdma_tx_func_error \
  _(SEGMENT_SIZE_EXCEEDED, "segment size exceeded") \
  _(NO_FREE_SLOTS, "no free tx slots") \
  _(SUBMISSION, "tx submission errors") \
  _(COMPLETION, "tx completion errors")

typedef enum
{
#define _(f,s) RDMA_TX_ERROR_##f,
  foreach_rdma_tx_func_error
#undef _
  RDMA_TX_N_ERROR,
} rdma_tx_func_error_t;

#endif /* _RDMA_H_ */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */