/*
 *------------------------------------------------------------------
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
#include <vlib/pci/pci.h>
#include <vnet/ethernet/ethernet.h>
#include <vnet/devices/devices.h>
#include <vnet/interface/rx_queue_funcs.h>

#include <rdma/rdma.h>

#define foreach_rdma_input_error \
  _(BUFFER_ALLOC, "buffer alloc error")

typedef enum
{
#define _(f,s) RDMA_INPUT_ERROR_##f,
  foreach_rdma_input_error
#undef _
    RDMA_INPUT_N_ERROR,
} rdma_input_error_t;

static __clib_unused char *rdma_input_error_strings[] = {
#define _(n,s) s,
  foreach_rdma_input_error
#undef _
};

static_always_inline void
ibv_set_recv_wr_and_sge (struct ibv_recv_wr *w, struct ibv_sge *s, u64 va,
                         u32 data_size, u32 lkey)
{
  s[0].addr = va;
  s[0].length = data_size;
  s[0].lkey = lkey;
  w[0].next = w + 1;
  w[0].sg_list = s;
  w[0].num_sge = 1;
}

static_always_inline u32
rdma_device_legacy_input_refill_additional (vlib_main_t * vm,
                                            rdma_device_t * rd,
                                            rdma_rxq_t * rxq,
                                            rdma_per_thread_data_t * ptd,
                                            vlib_buffer_t * bt,
                                            u32 first_slot, u32 n_alloc)
{
  int i;
  u8 log_wqe_sz = rxq->log_wqe_sz;
  u32 *bi = ptd->tmp_bi;
  vlib_buffer_t **bufs = ptd->tmp_bufs;

  for (i = 0; i < n_alloc; i++)
    {
      u8 chain_sz = rxq->n_used_per_chain[first_slot + i];
      u8 chain_sz_alloc;
      mlx5dv_wqe_ds_t *current_wqe =
        rxq->wqes + ((first_slot + i) << log_wqe_sz);
      if (chain_sz == 0)
        continue;
      if (PREDICT_FALSE ((chain_sz_alloc =
                          vlib_buffer_alloc_from_pool (vm, bi, chain_sz,
                                                       rd->pool)) !=
                         chain_sz))
        {
          vlib_buffer_free (vm, bi, chain_sz_alloc);
          break;
        }
      /* Build the chain */
      vlib_get_buffers (vm, bi, bufs, chain_sz);
      for (int j = 0; j < chain_sz - 1; j++)
        {
          vlib_buffer_copy_template (bufs[j], bt);
          bufs[j]->next_buffer = bi[j + 1];
          bufs[j]->flags |= VLIB_BUFFER_NEXT_PRESENT;
        }
      /* The chain starting at the second buffer is pre-initialised */
      vlib_buffer_copy_template (bufs[chain_sz - 1], bt);
      /* Stick with the already existing chain */
      if (chain_sz < rxq->n_ds_per_wqe - 1)
        {
          bufs[chain_sz - 1]->next_buffer = rxq->second_bufs[first_slot + i];
          bufs[chain_sz - 1]->flags |= VLIB_BUFFER_NEXT_PRESENT;
        }
      else
        {
          bufs[chain_sz - 1]->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
        }
      /* Update the wqes */
      for (int j = 0; j < chain_sz; j++)
        {
          u64 addr;
          vlib_get_buffers_with_offset (vm, bi + j, (void *) &addr, 1,
                                        sizeof (vlib_buffer_t));
          current_wqe[j + 1].addr = clib_host_to_net_u64 (addr);
        }
      rxq->n_used_per_chain[first_slot + i] = 0;
      rxq->n_total_additional_segs -= chain_sz;
      rxq->second_bufs[first_slot + i] = bi[0];
    }
  return i;
}

static_always_inline void
rdma_device_input_refill (vlib_main_t * vm, rdma_device_t * rd,
                          rdma_rxq_t * rxq, vlib_buffer_t * bt,
                          const int is_mlx5dv, const int is_striding)
{
  u32 n_alloc, n;
  u16 ring_space;
  struct ibv_recv_wr wr[VLIB_FRAME_SIZE], *w = wr;
  struct ibv_sge sge[VLIB_FRAME_SIZE], *s = sge;
  rdma_per_thread_data_t *ptd =
    &rdma_main.per_thread_data[vlib_get_thread_index ()];
  u32 mask = rxq->size - 1;
  u32 slot = rxq->tail & mask;
  u32 *bufs = rxq->bufs + slot;
  u32 data_size = rxq->buf_sz;
  u32 lkey = rd->lkey;
  const int log_stride_per_wqe = is_striding ? rxq->log_stride_per_wqe : 0;
  const int log_wqe_sz = rxq->log_wqe_sz;

  /* In legacy mode, some buffer chains may still be incomplete */
  if (PREDICT_FALSE
      (is_mlx5dv && !is_striding && (rxq->incomplete_tail != rxq->tail)))
    {
      int n_incomplete = rxq->incomplete_tail - rxq->tail;
      int n_completed =
        rdma_device_legacy_input_refill_additional (vm, rd, rxq, ptd, bt,
                                                    slot, n_incomplete);
      rxq->tail += n_completed;
      slot = rxq->tail & mask;
      /* Don't start recycling head buffers if there are incomplete chains */
      if (n_completed != n_incomplete)
        return;
    }

  /* refilled buffers must be a multiple of 8 and of strides per WQE */
  u32 alloc_multiple = 1 << (clib_max (3, log_stride_per_wqe));

  ring_space = rxq->size - (rxq->tail - rxq->head);

  n_alloc = clib_min (VLIB_FRAME_SIZE, ring_space);

  /* do not bother to allocate if too small */
  if (n_alloc < 2 * alloc_multiple)
    return;

  /* avoid wrap-around logic in core loop */
  n_alloc = clib_min (n_alloc, rxq->size - slot);

  n_alloc &= ~(alloc_multiple - 1);     /* round to alloc_multiple */

  n = vlib_buffer_alloc_to_ring_from_pool (vm, rxq->bufs, slot, rxq->size,
                                           n_alloc, rd->pool);

  if (PREDICT_FALSE (n != n_alloc))
    {
      u32 n_free;
      if (n < alloc_multiple)
        {
          if (n)
            vlib_buffer_free_from_ring (vm, rxq->bufs, slot, rxq->size, n);
          return;
        }

      /* partial allocation, round and return rest */
      n_free = n & (alloc_multiple - 1);
      n -= n_free;
      if (n_free)
        vlib_buffer_free_from_ring (vm, rxq->bufs, (slot + n) & mask,
                                    rxq->size, n_free);
    }

  n_alloc = n;

  if (is_mlx5dv)
    {
      u64 __clib_aligned (32) va[8];

      /* slot does not necessarily correspond to the slot
         in the wqes ring (in 16B words) */
      u32 wqes_slot = slot << (log_wqe_sz - log_stride_per_wqe);
      const u32 wqe_cnt = rxq->wqe_cnt;
      mlx5dv_wqe_ds_t *wqe = rxq->wqes + wqes_slot;
      const int wqe_sz = 1 << log_wqe_sz;
      const int stride_per_wqe = 1 << log_stride_per_wqe;
      int current_data_seg = 0;

      /* In legacy mode, this function only refills head descriptors for each
         WQE, so RDMA_RXQ_MAX_CHAIN_SZ-1 data segments are skipped per WQE */
      const int log_skip_wqe = is_striding ? 0 : log_wqe_sz;

      while (n >= 1)
        {
          vlib_get_buffers_with_offset (vm, rxq->bufs + slot, (void **) va, 8,
                                        sizeof (vlib_buffer_t));
#ifdef CLIB_HAVE_VEC256
          *(u64x4 *) va = u64x4_byte_swap (*(u64x4 *) va);
          *(u64x4 *) (va + 4) = u64x4_byte_swap (*(u64x4 *) (va + 4));
#else
          for (int i = 0; i < 8; i++)
            va[i] = clib_host_to_net_u64 (va[i]);
#endif

          /* In striding RQ mode, the first 16B-word of the WQE is the SRQ
             header. It is initialised as if it were a LINKED_LIST, as we have
             no guarantee about what RDMA core does (CYCLIC_RQ or
             LINKED_LIST_RQ). In cyclic mode, the SRQ header is ignored
             anyways... */

/* *INDENT-OFF* */
          if (is_striding && !(current_data_seg & (wqe_sz - 1)))
            *(mlx5dv_wqe_srq_next_t *) wqe = (mlx5dv_wqe_srq_next_t)
            {
              .rsvd0 = {0},
              .next_wqe_index =
                clib_host_to_net_u16 (((wqes_slot >> log_wqe_sz) + 1) &
                                      (wqe_cnt - 1)),
              .signature = 0,
              .rsvd1 = {0}
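/* Illustrative sketch (not part of the original file): the refill path above
 * byte-swaps buffer virtual addresses before writing them into WQE data
 * segments, because the device expects big-endian addresses. A scalar
 * equivalent of the vectorised u64x4_byte_swap() step, for a single
 * hypothetical buffer index bi, would be:
 *
 *   u64 addr;
 *   vlib_get_buffers_with_offset (vm, &bi, (void *) &addr, 1,
 *                                 sizeof (vlib_buffer_t));
 *   wqe->addr = clib_host_to_net_u64 (addr);
 *
 * which is the same pattern used one buffer at a time in
 * rdma_device_legacy_input_refill_additional().
 */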
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Original license for the code used to construct
  clib_xxhash(...).

  xxHash - Fast Hash algorithm
  Copyright (C) 2012-2014, Yann Collet.
  BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:

  * Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
  * Redistributions in binary form must reproduce the above
  copyright notice, this list of conditions and the following disclaimer
  in the documentation and/or other materials provided with the
  distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef __included_xxhash_h__
#define __included_xxhash_h__

#define PRIME64_1 11400714785074694791ULL
#define PRIME64_2 14029467366897019727ULL
#define PRIME64_3  1609587929392839161ULL
#define PRIME64_4  9650029242287828579ULL
#define PRIME64_5  2870177450012600261ULL
#define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))

static inline u64
clib_xxhash (u64 key)
{
  u64 k1, h64;

  k1 = key;
  h64 = 0x9e3779b97f4a7c13LL + PRIME64_5 + 8;
  k1 *= PRIME64_2;
  k1 = XXH_rotl64 (k1, 31);
  k1 *= PRIME64_1;
  h64 ^= k1;
  h64 = XXH_rotl64 (h64, 27) * PRIME64_1 + PRIME64_4;

  h64 ^= h64 >> 33;
  h64 *= PRIME64_2;
  h64 ^= h64 >> 29;
  h64 *= PRIME64_3;
  h64 ^= h64 >> 32;
  return h64;
}

#endif /* __included_xxhash_h__ */
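/* Usage sketch (illustrative, not part of the original header): clib_xxhash
 * mixes a single 64-bit key into a well-distributed 64-bit hash, which is
 * typically masked down to index a power-of-two sized table. The table size
 * below is a hypothetical example value:
 *
 *   u64 key = 0x0123456789abcdefULL;
 *   u32 n_buckets = 1 << 10;               // assumed to be a power of two
 *   u32 bucket = clib_xxhash (key) & (n_buckets - 1);
 */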

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */
                                      vlib_buffer_t * bt, u32 * to_next,
                                      int n_rx_segs, int *n_rx_packets,
                                      u32 * bc, int slow_path_needed)
{
  u32 mask = rxq->size - 1;
  u32 n_rx_bytes = 0;

  if (PREDICT_TRUE (!slow_path_needed))
    {
      vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
      n_rx_bytes +=
        rdma_device_mlx5dv_fast_input (vm, rxq, bufs, mask, bt, to_next,
                                       n_rx_segs, bc, CQE_BC_BYTE_COUNT_MASK);
    }
  else                          /* Slow path with multiseg */
    {
      vlib_buffer_t *pkt_head;  /* Current head buffer */
      vlib_buffer_t *pkt_prev;  /* Buffer processed at the previous iteration */
      u32 pkt_head_idx;
      vlib_buffer_t **pkt;
      uword n_segs_remaining = 0;       /* Remaining strides in current buffer */
      u32 n_bytes_remaining = 0;        /* Remaining bytes in current buffer */
      u32 *next_in_frame = to_next;
      u32 *next_to_free = ptd->to_free_buffers;

      bt->current_length = vlib_buffer_get_default_data_size (vm);
      do
        {
          vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
          u32 n_left = clib_min (n_rx_segs, VLIB_FRAME_SIZE);
          n_rx_segs -= n_left;
          vlib_buffer_copy_indices_from_ring (ptd->current_segs, rxq->bufs,
                                              rxq->head & mask, rxq->size,
                                              n_left);
          rxq->head += n_left;
          vlib_get_buffers (vm, ptd->current_segs, bufs, n_left);
          pkt = bufs;
          while (n_left > 0)
            {
              /* Initialize the current buffer as full size */
              vlib_buffer_copy_template (pkt[0], bt);
              if (!n_segs_remaining)    /* No pending chain */
                {
                  n_segs_remaining =
                    (bc[0] & CQE_BC_CONSUMED_STRIDES_MASK) >>
                    CQE_BC_CONSUMED_STRIDES_SHIFT;
                  pkt_head = pkt[0];
                  pkt_head_idx = ptd->current_segs[pkt - bufs];
                  n_bytes_remaining = bc[0] & CQE_BC_BYTE_COUNT_MASK;
                  pkt_head->total_length_not_including_first_buffer =
                    n_segs_remaining > 1 ?
                    n_bytes_remaining - pkt[0]->current_length : 0;
                }
              else              /* Perform chaining if it's a continuation buffer */
                {
                  pkt_prev->next_buffer = ptd->current_segs[pkt - bufs];
                  pkt_prev->flags |= VLIB_BUFFER_NEXT_PRESENT;
                  pkt[0]->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID;
                }
              if (n_segs_remaining == 1)        /* Last buffer of the chain */
                {
                  pkt[0]->current_length = n_bytes_remaining;
                  if (bc[0] & CQE_BC_FILLER_MASK)
                    {
                      (next_to_free++)[0] = pkt_head_idx;
                      (*n_rx_packets)--;
                    }
                  else
                    {
                      (next_in_frame++)[0] = pkt_head_idx;
                      n_rx_bytes +=
                        pkt_head->current_length +
                        pkt_head->total_length_not_including_first_buffer;
                    }
                  /* Go to next CQE */
                  bc++;
                }
              else
                {
                  n_bytes_remaining -= pkt[0]->current_length;
                  pkt_prev = pkt[0];
                }
              n_segs_remaining--;
              n_left--;
              pkt++;
            }
        }
      while (n_rx_segs > 0);
      vlib_buffer_free (vm, ptd->to_free_buffers,
                        next_to_free - ptd->to_free_buffers);
    }
  return n_rx_bytes;
}

static_always_inline uword
rdma_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                          vlib_frame_t * frame, rdma_device_t * rd, u16 qid,
                          const int use_mlx5dv)
{
  rdma_main_t *rm = &rdma_main;
  vnet_main_t *vnm = vnet_get_main ();
  rdma_per_thread_data_t *ptd = vec_elt_at_index (rm->per_thread_data,
                                                  vm->thread_index);
  rdma_rxq_t *rxq = vec_elt_at_index (rd->rxqs, qid);
  struct ibv_wc wc[VLIB_FRAME_SIZE];
  u32 __clib_aligned (32) byte_cnts[VLIB_FRAME_SIZE];
  vlib_buffer_t bt;
  u32 next_index, *to_next, n_left_to_next, n_rx_bytes = 0;
  int n_rx_packets, skip_ip4_cksum = 0;
  u32 mask = rxq->size - 1;
  const int is_striding = ! !(rd->flags & RDMA_DEVICE_F_STRIDING_RQ);

  if (use_mlx5dv)
    n_rx_packets = rdma_device_poll_cq_mlx5dv (rd, rxq, byte_cnts,
                                               ptd->cqe_flags);
  else
    n_rx_packets = ibv_poll_cq (rxq->cq, VLIB_FRAME_SIZE, wc);

  /* init buffer template */
  vlib_buffer_copy_template (&bt, &ptd->buffer_template);
  vnet_buffer (&bt)->sw_if_index[VLIB_RX] = rd->sw_if_index;
  bt.buffer_pool_index = rd->pool;

  if (PREDICT_FALSE (n_rx_packets <= 0))
    goto refill;

  /* update buffer template for input feature arcs if any */
  next_index = rd->per_interface_next_index;
  if (PREDICT_FALSE (vnet_device_input_have_features (rd->sw_if_index)))
    vnet_feature_start_device_input_x1 (rd->sw_if_index, &next_index, &bt);

  vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next);

  if (use_mlx5dv)
    {
      u32 *bc = byte_cnts;
      int slow_path_needed;
      skip_ip4_cksum =
        rdma_device_mlx5dv_l3_validate_and_swap_bc (ptd, n_rx_packets, bc);
      if (is_striding)
        {
          int n_rx_segs = 0;
          slow_path_needed =
            rdma_device_mlx5dv_striding_rq_parse_bc (n_rx_packets,
                                                     &n_rx_segs, bc);
          n_rx_bytes =
            rdma_device_mlx5dv_striding_rq_input (vm, ptd, rxq, &bt, to_next,
                                                  n_rx_segs, &n_rx_packets,
                                                  bc, slow_path_needed);
        }
      else
        {
          vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
          slow_path_needed =
            rdma_device_mlx5dv_legacy_rq_slow_path_needed (rxq->buf_sz,
                                                           n_rx_packets, bc);
          n_rx_bytes = rdma_device_mlx5dv_fast_input (vm, rxq, bufs, mask,
                                                      &bt, to_next,
                                                      n_rx_packets, bc, ~0);

          /* If there are chained buffers, some of the head buffers have a
             current length higher than buf_sz: it needs to be fixed */
          if (PREDICT_FALSE (slow_path_needed))
            rdma_device_mlx5dv_legacy_rq_fix_chains (vm, rxq, bufs, mask,
                                                     n_rx_packets);
        }
    }
  else
    {
      vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
      vlib_buffer_copy_indices_from_ring (to_next, rxq->bufs,
                                          rxq->head & mask,
                                          rxq->size, n_rx_packets);
      vlib_get_buffers (vm, to_next, bufs, n_rx_packets);
      rxq->head += n_rx_packets;
      n_rx_bytes =
        rdma_device_input_bufs (vm, rd, bufs, wc, n_rx_packets, &bt);
    }

  rdma_device_input_ethernet (vm, node, rd, next_index, skip_ip4_cksum);
  vlib_put_next_frame (vm, node, next_index, n_left_to_next - n_rx_packets);
  rdma_device_input_trace (vm, node, rd, n_rx_packets, to_next, next_index,
                           ptd->cqe_flags, use_mlx5dv);

  /* reset flags to zero for the next run */
  if (use_mlx5dv)
    clib_memset_u16 (ptd->cqe_flags, 0, VLIB_FRAME_SIZE);

  vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters
                                   + VNET_INTERFACE_COUNTER_RX,
                                   vm->thread_index, rd->hw_if_index,
                                   n_rx_packets, n_rx_bytes);

refill:
  rdma_device_input_refill (vm, rd, rxq, &bt, use_mlx5dv, is_striding);
  return n_rx_packets;
}

VLIB_NODE_FN (rdma_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
                                vlib_frame_t * frame)
{
  u32 n_rx = 0;
  rdma_main_t *rm = &rdma_main;
  vnet_hw_if_rxq_poll_vector_t *pv;

  pv = vnet_hw_if_get_rxq_poll_vector (vm, node);
  for (int i = 0; i < vec_len (pv); i++)
    {
      rdma_device_t *rd;
      rd = vec_elt_at_index (rm->devices, pv[i].dev_instance);

      if (PREDICT_TRUE (rd->flags & RDMA_DEVICE_F_ADMIN_UP) == 0)
        continue;

      if (PREDICT_FALSE (rd->flags & RDMA_DEVICE_F_ERROR))
        continue;

      if (PREDICT_TRUE (rd->flags & RDMA_DEVICE_F_MLX5DV))
        n_rx +=
          rdma_device_input_inline (vm, node, frame, rd, pv[i].queue_id, 1);
      else
        n_rx +=
          rdma_device_input_inline (vm, node, frame, rd, pv[i].queue_id, 0);
    }
  return n_rx;
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (rdma_input_node) = {
  .name = "rdma-input",
  .flags = VLIB_NODE_FLAG_TRACE_SUPPORTED,
  .sibling_of = "device-input",
  .format_trace = format_rdma_input_trace,
  .type = VLIB_NODE_TYPE_INPUT,
  .state = VLIB_NODE_STATE_DISABLED,
  .n_errors = RDMA_INPUT_N_ERROR,
  .error_strings = rdma_input_error_strings,
};
/* *INDENT-ON* */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */
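/* Illustrative note (not part of the original file): the BUFFER_ALLOC error
 * declared at the top of this file is reported through the standard vlib
 * error counter mechanism; a node would typically account for a failed
 * refill along these lines, assuming a node runtime pointer `node`:
 *
 *   vlib_error_count (vm, node->node_index,
 *                     RDMA_INPUT_ERROR_BUFFER_ALLOC, 1);
 */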