tun/tap: Bad packets sent to kernel via tun/tap interface

It was observed that under heavy traffic, VPP accidentally sent traffic with the wrong source and destination to the tun/tap interface. Traffic appears to be sent to the wrong direction. This problem is only seen when worker thread is configured. When worker thread is used, TX and RX may reside in different core. Yet both TX and RX threads are sharing the same global variable, namely iovecs without any mutex or memory barrier protection. This creates a race condition when heavy traffic is blasted to VPP, like 1000 pps. We could create a mutex or memory barrier to ensure atomic memory access. But why bother? It is a lot cheaper to just decouple the iovecs such that TX and RX have their own iovecs. Change-Id: I86a5a19bd8de54d54f32e1f0845bae6a81bbf686 Signed-off-by: Steven <sluong@cisco.com> (cherry picked from commit 4ff586d1c6fc5c40e1548cd6f221a8a7f3ad033b)
author: Steven <sluong@cisco.com> 2017-09-26 15:58:24 -0700
committer: Florin Coras <florin.coras@gmail.com> 2017-09-28 02:40:49 +0000
commit: 3fd57e67532bd55701bef7365adc17da229a44dc (patch)
tree: 265f1588284394b0cd60c60447435e134c485138
parent: d84f2ef54a457918f21bb0ee82392274b62611fe (diff)
2 files changed, 29 insertions, 22 deletions
diff --git a/src/vnet/unix/tapcli.c b/src/vnet/unix/tapcli.c
index 13154b3b..ce386094 100644
--- a/src/vnet/unix/tapcli.c
+++ b/src/vnet/unix/tapcli.c
@@ -99,8 +99,11 @@ u8 * format_tapcli_rx_trace (u8 * s, va_list * va)
  * @brief TAPCLI main state struct
  */
 typedef struct {
-  /** Vector of iovecs for readv/writev calls. */
-  struct iovec * iovecs;
+  /** Vector of iovecs for readv calls. */
+  struct iovec * rd_iovecs;
+
+  /** Vector of iovecs for writev calls. */
+  struct iovec * wr_iovecs;
 
   /** Vector of VLIB rx buffers to use.  We allocate them in blocks
      of VLIB_FRAME_SIZE (256). */
@@ -199,11 +202,11 @@ tapcli_tx (vlib_main_t * vm,
         ti = vec_elt_at_index (tm->tapcli_interfaces, p[0]);
 
       /* Re-set iovecs if present. */
-      if (tm->iovecs)
-	_vec_len (tm->iovecs) = 0;
+      if (tm->wr_iovecs)
+	_vec_len (tm->wr_iovecs) = 0;
 
       /* VLIB buffer chain -> Unix iovec(s). */
-      vec_add2 (tm->iovecs, iov, 1);
+      vec_add2 (tm->wr_iovecs, iov, 1);
       iov->iov_base = b->data + b->current_data;
       iov->iov_len = l = b->current_length;
 
@@ -212,7 +215,7 @@ tapcli_tx (vlib_main_t * vm,
 	  do {
 	    b = vlib_get_buffer (vm, b->next_buffer);
 
-	    vec_add2 (tm->iovecs, iov, 1);
+	    vec_add2 (tm->wr_iovecs, iov, 1);
 
 	    iov->iov_base = b->data + b->current_data;
 	    iov->iov_len = b->current_length;
@@ -220,7 +223,7 @@ tapcli_tx (vlib_main_t * vm,
 	  } while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
 	}
 
-      if (writev (ti->unix_fd, tm->iovecs, vec_len (tm->iovecs)) < l)
+      if (writev (ti->unix_fd, tm->wr_iovecs, vec_len (tm->wr_iovecs)) < l)
 	clib_unix_warning ("writev");
     }
 
@@ -292,14 +295,14 @@ static uword tapcli_rx_iface(vlib_main_t * vm,
 
     /* Allocate RX buffers from end of rx_buffers.
            Turn them into iovecs to pass to readv. */
-    vec_validate (tm->iovecs, tm->mtu_buffers - 1);
+    vec_validate (tm->rd_iovecs, tm->mtu_buffers - 1);
     for (j = 0; j < tm->mtu_buffers; j++) {
       b = vlib_get_buffer (vm, tm->rx_buffers[i_rx - j]);
-      tm->iovecs[j].iov_base = b->data;
-      tm->iovecs[j].iov_len = buffer_size;
+      tm->rd_iovecs[j].iov_base = b->data;
+      tm->rd_iovecs[j].iov_len = buffer_size;
     }
 
-    n_bytes_left = readv (ti->unix_fd, tm->iovecs, tm->mtu_buffers);
+    n_bytes_left = readv (ti->unix_fd, tm->rd_iovecs, tm->mtu_buffers);
     n_bytes_in_packet = n_bytes_left;
     if (n_bytes_left <= 0) {
       if (errno != EAGAIN) {
diff --git a/src/vnet/unix/tuntap.c b/src/vnet/unix/tuntap.c
index 9616feb2..dc5c2a89 100644
--- a/src/vnet/unix/tuntap.c
+++ b/src/vnet/unix/tuntap.c
@@ -71,8 +71,11 @@ typedef struct {
  * @brief TUNTAP node main state
  */
 typedef struct {
-  /** Vector of iovecs for readv/writev calls. */
-  struct iovec * iovecs;
+  /** Vector of iovecs for readv calls. */
+  struct iovec * rd_iovecs;
+
+  /** Vector of iovecs for writev calls. */
+  struct iovec * wr_iovecs;
 
   /** Vector of VLIB rx buffers to use.  We allocate them in blocks
      of VLIB_FRAME_SIZE (256). */
@@ -160,11 +163,11 @@ tuntap_tx (vlib_main_t * vm,
         }
 
       /* Re-set iovecs if present. */
-      if (tm->iovecs)
-	_vec_len (tm->iovecs) = 0;
+      if (tm->wr_iovecs)
+	_vec_len (tm->wr_iovecs) = 0;
 
       /** VLIB buffer chain -> Unix iovec(s). */
-      vec_add2 (tm->iovecs, iov, 1);
+      vec_add2 (tm->wr_iovecs, iov, 1);
       iov->iov_base = b->data + b->current_data;
       iov->iov_len = l = b->current_length;
 
@@ -173,7 +176,7 @@ tuntap_tx (vlib_main_t * vm,
 	  do {
 	    b = vlib_get_buffer (vm, b->next_buffer);
 
-	    vec_add2 (tm->iovecs, iov, 1);
+	    vec_add2 (tm->wr_iovecs, iov, 1);
 
 	    iov->iov_base = b->data + b->current_data;
 	    iov->iov_len = b->current_length;
@@ -181,7 +184,8 @@ tuntap_tx (vlib_main_t * vm,
 	  } while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
 	}
 
-      if (writev (tm->dev_net_tun_fd, tm->iovecs, vec_len (tm->iovecs)) < l)
+      if (writev (tm->dev_net_tun_fd, tm->wr_iovecs,
+		  vec_len (tm->wr_iovecs)) < l)
 	clib_unix_warning ("writev");
 
       n_bytes += l;
@@ -256,15 +260,15 @@ tuntap_rx (vlib_main_t * vm,
     /** We should have enough buffers left for an MTU sized packet. */
     ASSERT (vec_len (tm->rx_buffers) >= tm->mtu_buffers);
 
-    vec_validate (tm->iovecs, tm->mtu_buffers - 1);
+    vec_validate (tm->rd_iovecs, tm->mtu_buffers - 1);
     for (i = 0; i < tm->mtu_buffers; i++)
       {
 	b = vlib_get_buffer (vm, tm->rx_buffers[i_rx - i]);
-	tm->iovecs[i].iov_base = b->data;
-	tm->iovecs[i].iov_len = buffer_size;
+	tm->rd_iovecs[i].iov_base = b->data;
+	tm->rd_iovecs[i].iov_len = buffer_size;
       }
 
-    n_bytes_left = readv (tm->dev_net_tun_fd, tm->iovecs, tm->mtu_buffers);
+    n_bytes_left = readv (tm->dev_net_tun_fd, tm->rd_iovecs, tm->mtu_buffers);
     n_bytes_in_packet = n_bytes_left;
     if (n_bytes_left <= 0)
       {
author	Steven <sluong@cisco.com>	2017-09-26 15:58:24 -0700
committer	Florin Coras <florin.coras@gmail.com>	2017-09-28 02:40:49 +0000
commit	3fd57e67532bd55701bef7365adc17da229a44dc (patch)
tree	265f1588284394b0cd60c60447435e134c485138
parent	d84f2ef54a457918f21bb0ee82392274b62611fe (diff)