From df47a0812ab9365b2de10a7aabcf4b29a255c088 Mon Sep 17 00:00:00 2001
From: Damjan Marion <damarion@cisco.com>
Date: Fri, 29 Mar 2024 21:33:03 +0100
Subject: octeon: fix memory ordering issue in tx batch free

Type: fix
Fixes: 01fe7ab
Change-Id: I4425e809f0977521ddecf91b58b26fe4519dd6e0
Signed-off-by: Damjan Marion <damarion@cisco.com>
---
 src/plugins/dev_octeon/octeon.h  | 14 +++++++++-----
 src/plugins/dev_octeon/port.c    |  7 +++++--
 src/plugins/dev_octeon/queue.c   | 14 +++++++++++---
 src/plugins/dev_octeon/tx_node.c | 26 +++++++++++++++++---------
 4 files changed, 42 insertions(+), 19 deletions(-)

(limited to 'src')

diff --git a/src/plugins/dev_octeon/octeon.h b/src/plugins/dev_octeon/octeon.h
index fd8a92c7b3d..72d2d56a437 100644
--- a/src/plugins/dev_octeon/octeon.h
+++ b/src/plugins/dev_octeon/octeon.h
@@ -15,6 +15,8 @@
 #include <base/roc_api.h>
 #include <dev_octeon/hw_defs.h>
 
+#define OCT_BATCH_ALLOC_IOVA0_MASK 0xFFFFFFFFFFFFFF80
+
 typedef enum
 {
   OCT_DEVICE_TYPE_UNKNOWN = 0,
@@ -72,13 +74,15 @@ typedef struct
 typedef struct
 {
   CLIB_ALIGN_MARK (cl, 128);
-  union
-  {
-    struct npa_batch_alloc_status_s status;
-    u64 iova[16];
-  };
+  u64 iova[16];
 } oct_npa_batch_alloc_cl128_t;
 
+typedef union
+{
+  struct npa_batch_alloc_status_s status;
+  u64 as_u64;
+} oct_npa_batch_alloc_status_t;
+
 STATIC_ASSERT_SIZEOF (oct_npa_batch_alloc_cl128_t, 128);
 
 typedef struct
diff --git a/src/plugins/dev_octeon/port.c b/src/plugins/dev_octeon/port.c
index 00ad8b9c477..a82e48004b5 100644
--- a/src/plugins/dev_octeon/port.c
+++ b/src/plugins/dev_octeon/port.c
@@ -284,8 +284,11 @@ oct_txq_stop (vlib_main_t *vm, vnet_dev_tx_queue_t *txq)
     for (n = ctq->ba_num_cl, cl = ctq->ba_buffer + ctq->ba_first_cl; n;
 	 cl++, n--)
       {
-	if (cl->status.ccode != 0)
-	  for (u32 i = 0; i < cl->status.count; i++)
+	oct_npa_batch_alloc_status_t st;
+
+	st.as_u64 = __atomic_load_n (cl->iova, __ATOMIC_ACQUIRE);
+	if (st.status.ccode != ALLOC_CCODE_INVAL)
+	  for (u32 i = 0; i < st.status.count; i++)
 	    {
 	      vlib_buffer_t *b = (vlib_buffer_t *) (cl->iova[i] + off);
 	      vlib_buffer_free_one (vm, vlib_get_buffer_index (vm, b));
diff --git a/src/plugins/dev_octeon/queue.c b/src/plugins/dev_octeon/queue.c
index 9378fc3b7c7..d6ae794fb8d 100644
--- a/src/plugins/dev_octeon/queue.c
+++ b/src/plugins/dev_octeon/queue.c
@@ -57,12 +57,20 @@ oct_tx_queue_alloc (vlib_main_t *vm, vnet_dev_tx_queue_t *txq)
   oct_txq_t *ctq = vnet_dev_get_tx_queue_data (txq);
   vnet_dev_port_t *port = txq->port;
   vnet_dev_t *dev = port->dev;
+  u32 sz = sizeof (void *) * ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS;
+  vnet_dev_rv_t rv;
 
   log_debug (dev, "tx_queue_alloc: queue %u alocated", txq->queue_id);
 
-  return vnet_dev_dma_mem_alloc (
-    vm, dev, sizeof (void *) * ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS, 128,
-    (void **) &ctq->ba_buffer);
+  rv = vnet_dev_dma_mem_alloc (vm, dev, sz, 128, (void **) &ctq->ba_buffer);
+
+  if (rv != VNET_DEV_OK)
+    return rv;
+
+  clib_memset_u64 (ctq->ba_buffer, OCT_BATCH_ALLOC_IOVA0_MASK,
+		   ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS);
+
+  return rv;
 }
 
 void
diff --git a/src/plugins/dev_octeon/tx_node.c b/src/plugins/dev_octeon/tx_node.c
index 28e8f25adb1..0dbf8759d35 100644
--- a/src/plugins/dev_octeon/tx_node.c
+++ b/src/plugins/dev_octeon/tx_node.c
@@ -46,9 +46,12 @@ oct_batch_free (vlib_main_t *vm, oct_tx_ctx_t *ctx, vnet_dev_tx_queue_t *txq)
 
       for (cl = ctq->ba_buffer + ctq->ba_first_cl; num_cl > 0; num_cl--, cl++)
 	{
-	  u8 count;
-	  if (cl->status.ccode == ALLOC_CCODE_INVAL)
+	  oct_npa_batch_alloc_status_t st;
+
+	  if ((st.as_u64 = __atomic_load_n (cl->iova, __ATOMIC_RELAXED)) ==
+	      OCT_BATCH_ALLOC_IOVA0_MASK + ALLOC_CCODE_INVAL)
 	    {
+	    cl_not_ready:
 	      ctx->batch_alloc_not_ready++;
 	      n_freed = bi - (u32 *) ctq->ba_buffer;
 	      if (n_freed > 0)
@@ -63,11 +66,15 @@ oct_batch_free (vlib_main_t *vm, oct_tx_ctx_t *ctx, vnet_dev_tx_queue_t *txq)
 	      return 0;
 	    }
 
-	  count = cl->status.count;
+	  if (st.status.count > 8 &&
+	      __atomic_load_n (cl->iova + 8, __ATOMIC_RELAXED) ==
+		OCT_BATCH_ALLOC_IOVA0_MASK)
+	    goto cl_not_ready;
+
 #if (CLIB_DEBUG > 0)
-	  cl->status.count = cl->status.ccode = 0;
+	  cl->iova[0] &= OCT_BATCH_ALLOC_IOVA0_MASK;
 #endif
-	  if (PREDICT_TRUE (count == 16))
+	  if (PREDICT_TRUE (st.status.count == 16))
 	    {
 	      /* optimize for likely case where cacheline is full */
 	      vlib_get_buffer_indices_with_offset (vm, (void **) cl, bi, 16,
@@ -76,9 +83,9 @@ oct_batch_free (vlib_main_t *vm, oct_tx_ctx_t *ctx, vnet_dev_tx_queue_t *txq)
 	    }
 	  else
 	    {
-	      vlib_get_buffer_indices_with_offset (vm, (void **) cl, bi, count,
-						   off);
-	      bi += count;
+	      vlib_get_buffer_indices_with_offset (vm, (void **) cl, bi,
+						   st.status.count, off);
+	      bi += st.status.count;
 	    }
 	}
 
@@ -89,7 +96,8 @@ oct_batch_free (vlib_main_t *vm, oct_tx_ctx_t *ctx, vnet_dev_tx_queue_t *txq)
       /* clear status bits in each cacheline */
       n = cl - ctq->ba_buffer;
       for (u32 i = 0; i < n; i++)
-	ctq->ba_buffer[i].iova[0] = 0;
+	ctq->ba_buffer[i].iova[0] = ctq->ba_buffer[i].iova[8] =
+	  OCT_BATCH_ALLOC_IOVA0_MASK;
 
       ctq->ba_num_cl = ctq->ba_first_cl = 0;
     }
-- 
cgit 1.2.3-korg