Diffstat (limited to 'drivers/net/mlx5/mlx5_rxtx_vec_neon.h')
-rw-r--r--  drivers/net/mlx5/mlx5_rxtx_vec_neon.h  135
1 file changed, 57 insertions(+), 78 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index c721d80e..bbe1818e 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -1,34 +1,6 @@
-/*-
- * BSD LICENSE
- *
- * Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of 6WIND S.A. nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox.
*/
#ifndef RTE_PMD_MLX5_RXTX_VEC_NEON_H_
@@ -135,6 +107,8 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
assert(elts_n > pkts_n);
mlx5_tx_complete(txq);
+ /* A CQE slot must always be available. */
+ assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
if (unlikely(!pkts_n))
return 0;
for (n = 0; n < pkts_n; ++n) {
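
The added assertion guards the new completion accounting: cq_pi counts completions requested and cq_ci completions consumed, so their unsigned difference is the number of outstanding CQEs, which must stay below the ring size. Since cq_pi only feeds this check, it is updated under #ifndef NDEBUG further down, making the whole thing free in release builds. A standalone sketch of the pattern, with a toy structure standing in for struct mlx5_txq_data:

    /* Toy sketch of the debug-only CQE accounting; toy_txq stands in
     * for struct mlx5_txq_data. */
    #include <assert.h>
    #include <stdint.h>

    struct toy_txq {
        uint16_t cq_pi; /* completions requested (debug-only) */
        uint16_t cq_ci; /* completions consumed */
        uint16_t cqe_n; /* log2 of CQ ring size */
    };

    static void
    toy_request_completion(struct toy_txq *txq)
    {
    #ifndef NDEBUG
        ++txq->cq_pi;
    #endif
        /* Unsigned subtraction handles counter wrap-around; the
         * difference from the ring size is the number of free CQE
         * slots and must never reach zero. */
        assert((uint16_t)((1u << txq->cqe_n) -
                          (uint16_t)(txq->cq_pi - txq->cq_ci)));
    }
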
@@ -149,7 +123,7 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
11, 10, 9, 8, /* bswap32 */
12, 13, 14, 15
};
- uint8_t cs_flags = 0;
+ uint8_t cs_flags;
uint16_t max_elts;
uint16_t max_wqe;
uint8x16_t *t_wqe;
@@ -168,22 +142,7 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
break;
wqe = &((volatile struct mlx5_wqe64 *)
txq->wqes)[wqe_ci & wq_mask].hdr;
- if (buf->ol_flags &
- (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
- const uint64_t is_tunneled =
- buf->ol_flags & (PKT_TX_TUNNEL_GRE |
- PKT_TX_TUNNEL_VXLAN);
-
- if (is_tunneled && txq->tunnel_en) {
- cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
- MLX5_ETH_WQE_L4_INNER_CSUM;
- if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
- cs_flags |= MLX5_ETH_WQE_L3_CSUM;
- } else {
- cs_flags = MLX5_ETH_WQE_L3_CSUM |
- MLX5_ETH_WQE_L4_CSUM;
- }
- }
+ cs_flags = txq_ol_cksum_to_cs(txq, buf);
/* Title WQEBB pointer. */
t_wqe = (uint8x16_t *)wqe;
dseg = (uint8_t *)(wqe + 1);
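
The open-coded checksum translation is replaced by a call to txq_ol_cksum_to_cs(), whose body the diff does not show. A sketch reconstructed from the removed lines above (the actual helper lives in mlx5_rxtx.h, and the types and MLX5_ETH_WQE_* flags come from the PMD headers):

    /* Sketch of txq_ol_cksum_to_cs(), reconstructed from the removed
     * inline logic; requires the mlx5 PMD headers. */
    static inline uint8_t
    txq_ol_cksum_to_cs(struct mlx5_txq_data *txq, struct rte_mbuf *buf)
    {
        uint8_t cs_flags = 0;

        if (buf->ol_flags &
            (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
            const uint64_t is_tunneled = buf->ol_flags &
                (PKT_TX_TUNNEL_GRE | PKT_TX_TUNNEL_VXLAN);

            if (is_tunneled && txq->tunnel_en) {
                cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
                           MLX5_ETH_WQE_L4_INNER_CSUM;
                if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
                    cs_flags |= MLX5_ETH_WQE_L3_CSUM;
            } else {
                cs_flags = MLX5_ETH_WQE_L3_CSUM |
                           MLX5_ETH_WQE_L4_CSUM;
            }
        }
        return cs_flags;
    }
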
@@ -220,7 +179,9 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
wqe->ctrl[2] = rte_cpu_to_be_32(8);
wqe->ctrl[3] = txq->elts_head;
txq->elts_comp = 0;
+#ifndef NDEBUG
++txq->cq_pi;
+#endif
}
#ifdef MLX5_PMD_SOFT_COUNTERS
txq->stats.opackets += n;
@@ -233,7 +194,7 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
* Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
* it returns early so that the packet can be processed by txq_scatter_v(). All
* the packets in the pkts list should be single-segment packets having the same
* offload flags.
- * This must be checked by txq_check_multiseg() and txq_calc_offload().
+ * This must be checked by txq_count_contig_single_seg() and txq_calc_offload().
*
* @param txq
* Pointer to TX queue structure.
@@ -284,6 +245,8 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
assert(elts_n > pkts_n);
mlx5_tx_complete(txq);
max_elts = (elts_n - (elts_head - txq->elts_tail));
+ /* A CQE slot must always be available. */
+ assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
if (unlikely(!pkts_n))
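
Before building any WQE, the burst is clamped to both the free element slots and the free WQEBBs, using the same unsigned wrap-around arithmetic as the assert above. A minimal sketch of the clamping, with parameters mirroring the mlx5_txq_data fields used here:

    /* Sketch: clamp a TX burst to ring capacity (assumes uint16_t
     * wrap-around indices as in struct mlx5_txq_data). */
    #include <stdint.h>
    #include <rte_common.h> /* RTE_MIN */

    static inline unsigned int
    clamp_burst(unsigned int pkts_n, uint16_t elts_n,
                uint16_t elts_head, uint16_t elts_tail,
                uint16_t wqe_n, uint16_t wqe_ci, uint16_t wqe_pi)
    {
        uint16_t max_elts = elts_n - (uint16_t)(elts_head - elts_tail);
        uint16_t max_wqe = (1u << wqe_n) - (uint16_t)(wqe_ci - wqe_pi);

        return RTE_MIN(RTE_MIN(pkts_n, (unsigned int)max_wqe),
                       (unsigned int)max_elts);
    }
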
@@ -321,7 +284,9 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
} else {
/* Request a completion. */
txq->elts_comp = 0;
+#ifndef NDEBUG
++txq->cq_pi;
+#endif
comp_req = 8;
}
/* Fill CTRL in the header. */
@@ -590,11 +555,15 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
if (rxq->mark) {
const uint32x4_t ft_def = vdupq_n_u32(MLX5_FLOW_MARK_DEFAULT);
const uint32x4_t fdir_flags = vdupq_n_u32(PKT_RX_FDIR);
- const uint32x4_t fdir_id_flags = vdupq_n_u32(PKT_RX_FDIR_ID);
+ uint32x4_t fdir_id_flags = vdupq_n_u32(PKT_RX_FDIR_ID);
+ uint32x4_t invalid_mask;
/* Check if flow tag is non-zero then set PKT_RX_FDIR. */
- ol_flags = vorrq_u32(ol_flags, vbicq_u32(fdir_flags,
- vceqzq_u32(flow_tag)));
+ invalid_mask = vceqzq_u32(flow_tag);
+ ol_flags = vorrq_u32(ol_flags,
+ vbicq_u32(fdir_flags, invalid_mask));
+ /* Mask out invalid entries. */
+ fdir_id_flags = vbicq_u32(fdir_id_flags, invalid_mask);
/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
ol_flags = vorrq_u32(ol_flags,
vbicq_u32(fdir_id_flags,
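
The bug being fixed: fdir_id_flags was previously OR-ed in without regard to the validity of the flow tag, so a zero tag could still set PKT_RX_FDIR_ID. Masking it with the same invalid_mask closes that hole. A scalar sketch of the lane-wise logic, one CQE at a time (vbicq_u32(a, b) computes a & ~b; PKT_RX_* come from rte_mbuf.h, MLX5_FLOW_MARK_DEFAULT from the PMD headers):

    /* Scalar equivalent of the NEON lane-wise flow-mark logic. */
    #include <stdint.h>
    #include <rte_mbuf.h> /* PKT_RX_FDIR, PKT_RX_FDIR_ID */

    static inline uint32_t
    fdir_ol_flags(uint32_t flow_tag, uint32_t ol_flags)
    {
        /* vceqzq_u32: all-ones lane when the flow tag is zero. */
        uint32_t invalid_mask = (flow_tag == 0) ? UINT32_MAX : 0;

        ol_flags |= PKT_RX_FDIR & ~invalid_mask;
        /* Only a valid, non-default mark carries a flow ID. */
        if (!invalid_mask && flow_tag != MLX5_FLOW_MARK_DEFAULT)
            ol_flags |= PKT_RX_FDIR_ID;
        return ol_flags;
    }
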
@@ -665,12 +634,16 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
* Array to store received packets.
* @param pkts_n
* Maximum number of packets in array.
+ * @param[out] err
+ * Pointer to a flag. Set to a non-zero value if the pkts array has at least
+ * one error packet to handle.
*
* @return
* Number of packets received including errors (<= pkts_n).
*/
static inline uint16_t
-rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
+ uint64_t *err)
{
const uint16_t q_n = 1 << rxq->cqe_n;
const uint16_t q_mask = q_n - 1;
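
With the out-parameter, error handling moves out of the per-queue pending_err field and into the caller, which runs the recovery path only when a burst actually saw an error. A hedged sketch of the assumed calling pattern in mlx5_rxtx_vec.c (function names per this patch series; treat the exact shape as an assumption):

    /* Assumed shape of the caller (mlx5_rxtx_vec.c); sketch only.
     * unlikely() comes from rte_branch_prediction.h. */
    uint16_t
    mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts,
                      uint16_t pkts_n)
    {
        struct mlx5_rxq_data *rxq = dpdk_rxq;
        uint64_t err = 0;
        uint16_t nb_rx;

        nb_rx = rxq_burst_v(rxq, pkts, pkts_n, &err);
        if (unlikely(err))
            nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);
        return nb_rx;
    }
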
@@ -813,6 +786,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint16x4_t mask;
uint16x4_t byte_cnt;
uint32x4_t ptype_info, flow_tag;
+ register uint64x2_t c0, c1, c2, c3;
uint8_t *p0, *p1, *p2, *p3;
uint8_t *e0 = (void *)&elts[pos]->pkt_len;
uint8_t *e1 = (void *)&elts[pos + 1]->pkt_len;
@@ -829,6 +803,16 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
p1 = p0 + (pkts_n - pos > 1) * sizeof(struct mlx5_cqe);
p2 = p1 + (pkts_n - pos > 2) * sizeof(struct mlx5_cqe);
p3 = p2 + (pkts_n - pos > 3) * sizeof(struct mlx5_cqe);
+ /* B.0 (CQE 3) load a block having op_own. */
+ c3 = vld1q_u64((uint64_t *)(p3 + 48));
+ /* B.0 (CQE 2) load a block having op_own. */
+ c2 = vld1q_u64((uint64_t *)(p2 + 48));
+ /* B.0 (CQE 1) load a block having op_own. */
+ c1 = vld1q_u64((uint64_t *)(p1 + 48));
+ /* B.0 (CQE 0) load a block having op_own. */
+ c0 = vld1q_u64((uint64_t *)(p0 + 48));
+ /* Synchronize for loading the rest of blocks. */
+ rte_cio_rmb();
/* Prefetch next 4 CQEs. */
if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
unsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP;
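
This is the core of the change: the op_own blocks of all four CQEs are loaded first with vld1q_u64(), then rte_cio_rmb() orders those loads against the loads of the remaining blocks inside the asm. On a weakly-ordered ARM core, reading the payload before the ownership byte could observe a half-written CQE. The generic pattern, sketched for a single hypothetical descriptor:

    /* Ownership-check-then-read pattern on a weakly-ordered CPU
     * (hypothetical descriptor layout, not the mlx5 CQE). */
    #include <stdint.h>
    #include <rte_atomic.h> /* rte_cio_rmb */

    struct toy_desc {
        uint64_t payload[7];
        uint8_t op_own; /* hardware sets the valid bit last */
    };

    static inline int
    toy_desc_read(volatile struct toy_desc *d, uint64_t *out)
    {
        int i;

        if (d->op_own & 0x1) /* still owned by hardware */
            return -1;
        rte_cio_rmb(); /* order: op_own load before payload loads */
        for (i = 0; i < 7; ++i)
            out[i] = d->payload[i];
        return 0;
    }
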
@@ -838,50 +822,46 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
rte_prefetch_non_temporal(&cq[next + 3]);
}
__asm__ volatile (
- /* B.1 (CQE 3) load a block having op_own. */
- "ld1 {v19.16b}, [%[p3]] \n\t"
- "sub %[p3], %[p3], #48 \n\t"
- /* B.2 (CQE 3) load the rest blocks. */
+ /* B.1 (CQE 3) load the rest of blocks. */
"ld1 {v16.16b - v18.16b}, [%[p3]] \n\t"
+ /* B.2 (CQE 3) move the block having op_own. */
+ "mov v19.16b, %[c3].16b \n\t"
/* B.3 (CQE 3) extract 16B fields. */
"tbl v23.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.1 (CQE 2) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p2]] \n\t"
/* B.4 (CQE 3) adjust CRC length. */
"sub v23.8h, v23.8h, %[crc_adj].8h \n\t"
- /* B.1 (CQE 2) load a block having op_own. */
- "ld1 {v19.16b}, [%[p2]] \n\t"
- "sub %[p2], %[p2], #48 \n\t"
/* C.1 (CQE 3) generate final structure for mbuf. */
"tbl v15.16b, {v23.16b}, %[mb_shuf_m].16b \n\t"
- /* B.2 (CQE 2) load the rest blocks. */
- "ld1 {v16.16b - v18.16b}, [%[p2]] \n\t"
+ /* B.2 (CQE 2) move the block having op_own. */
+ "mov v19.16b, %[c2].16b \n\t"
/* B.3 (CQE 2) extract 16B fields. */
"tbl v22.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.1 (CQE 1) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p1]] \n\t"
/* B.4 (CQE 2) adjust CRC length. */
"sub v22.8h, v22.8h, %[crc_adj].8h \n\t"
- /* B.1 (CQE 1) load a block having op_own. */
- "ld1 {v19.16b}, [%[p1]] \n\t"
- "sub %[p1], %[p1], #48 \n\t"
/* C.1 (CQE 2) generate final structure for mbuf. */
"tbl v14.16b, {v22.16b}, %[mb_shuf_m].16b \n\t"
- /* B.2 (CQE 1) load the rest blocks. */
- "ld1 {v16.16b - v18.16b}, [%[p1]] \n\t"
+ /* B.2 (CQE 1) move the block having op_own. */
+ "mov v19.16b, %[c1].16b \n\t"
/* B.3 (CQE 1) extract 16B fields. */
"tbl v21.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.1 (CQE 0) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p0]] \n\t"
/* B.4 (CQE 1) adjust CRC length. */
"sub v21.8h, v21.8h, %[crc_adj].8h \n\t"
- /* B.1 (CQE 0) load a block having op_own. */
- "ld1 {v19.16b}, [%[p0]] \n\t"
- "sub %[p0], %[p0], #48 \n\t"
/* C.1 (CQE 1) generate final structure for mbuf. */
"tbl v13.16b, {v21.16b}, %[mb_shuf_m].16b \n\t"
- /* B.2 (CQE 0) load the rest blocks. */
- "ld1 {v16.16b - v18.16b}, [%[p0]] \n\t"
+ /* B.2 (CQE 0) move the block having op_own. */
+ "mov v19.16b, %[c0].16b \n\t"
+ /* A.1 load mbuf pointers. */
+ "ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t"
/* B.3 (CQE 0) extract 16B fields. */
"tbl v20.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
/* B.4 (CQE 0) adjust CRC length. */
"sub v20.8h, v20.8h, %[crc_adj].8h \n\t"
- /* A.1 load mbuf pointers. */
- "ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t"
/* D.1 extract op_own byte. */
"tbl %[op_own].8b, {v20.16b - v23.16b}, %[owner_shuf_m].8b \n\t"
/* C.2 (CQE 3) adjust flow mark. */
@@ -916,9 +896,9 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
[byte_cnt]"=&w"(byte_cnt),
[ptype_info]"=&w"(ptype_info),
[flow_tag]"=&w"(flow_tag)
- :[p3]"r"(p3 + 48), [p2]"r"(p2 + 48),
- [p1]"r"(p1 + 48), [p0]"r"(p0 + 48),
+ :[p3]"r"(p3), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0),
[e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0),
+ [c3]"w"(c3), [c2]"w"(c2), [c1]"w"(c1), [c0]"w"(c0),
[elts_p]"r"(elts_p),
[pkts_p]"r"(pkts_p),
[cqe_shuf_m]"w"(cqe_shuf_m),
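
Because the op_own vectors are now produced by intrinsics outside the asm, they enter the block as "w" (NEON/FP register) inputs, and the pointer inputs lose their +48 bias since the asm no longer rewinds them with sub. A toy example of the "w" constraint in GCC extended asm on AArch64:

    /* Toy example: passing NEON vectors through extended asm with
     * "w" register constraints on AArch64 (not the driver code). */
    #include <arm_neon.h>

    static inline uint8x16_t
    asm_mov(uint8x16_t in)
    {
        uint8x16_t out;

        __asm__ ("mov %[dst].16b, %[src].16b"
                 : [dst]"=w"(out) /* "=&w" would add early-clobber */
                 : [src]"w"(in));
        return out;
    }
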
@@ -970,8 +950,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
opcode = vceq_u16(resp_err_check, opcode);
opcode = vbic_u16(opcode, invalid_mask);
/* D.4 mark if any error is set */
- rxq->pending_err |=
- !!vget_lane_u64(vreinterpret_u64_u16(opcode), 0);
+ *err |= vget_lane_u64(vreinterpret_u64_u16(opcode), 0);
/* C.4 fill in mbuf - rearm_data and packet_type. */
rxq_cq_to_ptype_oflags_v(rxq, ptype_info, flow_tag,
opcode, &elts[pos]);
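
Finally, instead of collapsing the error lanes into a boolean stored in rxq->pending_err, the raw 64-bit lane mask is OR-ed into *err and handling is deferred to the caller. A scalar sketch of the check feeding it, one 16-bit lane per CQE (the real code compares four lanes at once against a replicated error opcode):

    /* Scalar sketch of the per-lane error check above (one CQE). */
    #include <stdint.h>

    static inline void
    check_err_lane(uint16_t opcode, uint16_t resp_err,
                   uint16_t invalid, uint64_t *err)
    {
        /* vceq_u16: all-ones lane when the opcode signals an error. */
        uint16_t lane = (opcode == resp_err) ? UINT16_MAX : 0;

        lane &= (uint16_t)~invalid; /* vbic: drop invalid CQEs */
        *err |= lane; /* caller handles errors after the burst */
    }
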