aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorBenoît Ganne <bganne@cisco.com>2021-04-29 18:24:24 +0200
committerDamjan Marion <dmarion@me.com>2021-05-21 19:50:14 +0000
commita42c41be4eed3e1ce2a42038b07ce1d3420891cd (patch)
treefc95c7c24cbef993cc2bef8742b3360123d70b66 /src
parent92a8d761c412590f5112239be4c511091b2b2d5a (diff)
af_xdp: workaround kernel race between poll() and sendmsg()
Prior to Linux 5.6 there is a race condition between poll() and sendmsg() in the kernel. This patch protects the syscalls with a lock to prevent it, unless the NO_SYSCALL_LOCK flag is set at create time. See https://lore.kernel.org/bpf/BYAPR11MB365382C5DB1E5FCC53242609C1549@BYAPR11MB3653.namprd11.prod.outlook.com/ Type: fix Change-Id: Ie7d4f5cb41f697b11a09b6046e54d190430d76df Signed-off-by: Benoît Ganne <bganne@cisco.com>
Diffstat (limited to 'src')
-rw-r--r--src/plugins/af_xdp/af_xdp.api10
-rw-r--r--src/plugins/af_xdp/af_xdp.h30
-rw-r--r--src/plugins/af_xdp/af_xdp_doc.md28
-rw-r--r--src/plugins/af_xdp/api.c12
-rw-r--r--src/plugins/af_xdp/cli.c5
-rw-r--r--src/plugins/af_xdp/device.c142
-rw-r--r--src/plugins/af_xdp/input.c35
-rw-r--r--src/plugins/af_xdp/output.c33
-rw-r--r--src/plugins/af_xdp/test_api.c2
-rw-r--r--src/plugins/af_xdp/unformat.c2
10 files changed, 213 insertions, 86 deletions
diff --git a/src/plugins/af_xdp/af_xdp.api b/src/plugins/af_xdp/af_xdp.api
index 14f51d87d6a..c6716123703 100644
--- a/src/plugins/af_xdp/af_xdp.api
+++ b/src/plugins/af_xdp/af_xdp.api
@@ -15,7 +15,7 @@
*------------------------------------------------------------------
*/
-option version = "0.1.0";
+option version = "0.2.0";
import "vnet/interface_types.api";
enum af_xdp_mode
@@ -25,6 +25,10 @@ enum af_xdp_mode
AF_XDP_API_MODE_ZERO_COPY = 2,
};
+enumflag af_xdp_flag : u8
+{
+ AF_XDP_API_FLAGS_NO_SYSCALL_LOCK = 1,
+};
/** \brief
@param client_index - opaque cookie to identify the sender
@@ -35,6 +39,7 @@ enum af_xdp_mode
@param rxq_size - receive queue size (optional)
@param txq_size - transmit queue size (optional)
@param mode - operation mode (optional)
+ @param flags - flags (optional)
@param prog - eBPF program path (optional)
*/
@@ -49,8 +54,9 @@ define af_xdp_create
u16 rxq_size [default=0];
u16 txq_size [default=0];
vl_api_af_xdp_mode_t mode [default=0];
+ vl_api_af_xdp_flag_t flags [default=0];
string prog[256];
- option vat_help = "<host-if linux-ifname> [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues <num|all>] [prog pathname] [zero-copy|no-zero-copy]";
+ option vat_help = "<host-if linux-ifname> [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues <num|all>] [prog pathname] [zero-copy|no-zero-copy] [no-syscall-lock]";
option status="in_progress";
};
diff --git a/src/plugins/af_xdp/af_xdp.h b/src/plugins/af_xdp/af_xdp.h
index 568380baa56..91895ced23b 100644
--- a/src/plugins/af_xdp/af_xdp.h
+++ b/src/plugins/af_xdp/af_xdp.h
@@ -32,7 +32,8 @@
_ (1, ERROR, "error") \
_ (2, ADMIN_UP, "admin-up") \
_ (3, LINK_UP, "link-up") \
- _ (4, ZEROCOPY, "zero-copy")
+ _ (4, ZEROCOPY, "zero-copy") \
+ _ (5, SYSCALL_LOCK, "syscall-lock")
enum
{
@@ -49,12 +50,20 @@ enum
clib_error_free(err_); \
}
+typedef enum
+{
+ AF_XDP_RXQ_MODE_UNKNOWN,
+ AF_XDP_RXQ_MODE_POLLING,
+ AF_XDP_RXQ_MODE_INTERRUPT,
+} __clib_packed af_xdp_rxq_mode_t;
+
typedef struct
{
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
/* fields below are accessed in data-plane (hot) */
+ clib_spinlock_t syscall_lock;
struct xsk_ring_cons rx;
struct xsk_ring_prod fq;
int xsk_fd;
@@ -63,7 +72,7 @@ typedef struct
uword file_index;
u32 queue_index;
- u8 is_polling;
+ af_xdp_rxq_mode_t mode;
} af_xdp_rxq_t;
typedef struct
@@ -73,6 +82,7 @@ typedef struct
/* fields below are accessed in data-plane (hot) */
clib_spinlock_t lock;
+ clib_spinlock_t syscall_lock;
struct xsk_ring_prod tx;
struct xsk_ring_cons cq;
int xsk_fd;
@@ -101,6 +111,8 @@ typedef struct
u32 dev_instance;
u8 hwaddr[6];
+ u8 rxq_num;
+
struct xsk_umem **umem;
struct xsk_socket **xsk;
@@ -127,12 +139,18 @@ typedef enum
AF_XDP_MODE_ZERO_COPY = 2,
} af_xdp_mode_t;
+typedef enum
+{
+ AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK = 1,
+} af_xdp_create_flag_t;
+
typedef struct
{
char *linux_ifname;
char *name;
char *prog;
af_xdp_mode_t mode;
+ af_xdp_create_flag_t flags;
u32 rxq_size;
u32 txq_size;
u32 rxq_num;
@@ -163,10 +181,10 @@ typedef struct
u32 hw_if_index;
} af_xdp_input_trace_t;
-#define foreach_af_xdp_tx_func_error \
-_(NO_FREE_SLOTS, "no free tx slots") \
-_(SENDTO_REQUIRED, "sendto required") \
-_(SENDTO_FAILURES, "sendto failures")
+#define foreach_af_xdp_tx_func_error \
+ _ (NO_FREE_SLOTS, "no free tx slots") \
+ _ (SYSCALL_REQUIRED, "syscall required") \
+ _ (SYSCALL_FAILURES, "syscall failures")
typedef enum
{
diff --git a/src/plugins/af_xdp/af_xdp_doc.md b/src/plugins/af_xdp/af_xdp_doc.md
index 7d83d712918..f5859dbb901 100644
--- a/src/plugins/af_xdp/af_xdp_doc.md
+++ b/src/plugins/af_xdp/af_xdp_doc.md
@@ -12,11 +12,15 @@ Under development: it should work, but has not been thoroughly tested.
- custom eBPF program
- polling, interrupt and adaptive mode
-## Limitations
+## Known limitations
+
+### MTU
Because of AF_XDP restrictions, the MTU is limited to below PAGE_SIZE
(4096-bytes on most systems) minus 256-bytes, and they are additional
limitations depending upon specific Linux device drivers.
As a rule of thumb, a MTU of 3000-bytes or less should be safe.
+
+### Number of buffers
Furthermore, upon UMEM creation, the kernel allocates a
physically-contiguous structure, whose size is proportional to the number
of 4KB pages contained in the UMEM. That allocation might fail when
@@ -25,7 +29,29 @@ controlled with the `buffers { buffers-per-numa }` configuration option.
Finally, note that because of this limitation, this plugin is unlikely
to be compatible with the use of 1GB hugepages.
+### Interrupt mode
+Interrupt and adaptive mode are supported but is limited by default to single
+threaded (no worker) configurations because of a kernel limitation prior to
+5.6. You can bypass the limitation at interface creation time by adding the
+`no-syscall-lock` parameter, but you must be sure that your kernel can
+support it, otherwise you will experience double-frees.
+See
+https://lore.kernel.org/bpf/BYAPR11MB365382C5DB1E5FCC53242609C1549@BYAPR11MB3653.namprd11.prod.outlook.com/
+for more details.
+
+### Mellanox
+When setting the number of queues on Mellanox NIC with `ethtool -L`, you must
+use twice the amount of configured queues: it looks like the Linux driver will
+create separate RX queues and TX queues (but all queues can be used for both
+RX and TX, the NIC will just not sent any packet on "pure" TX queues.
+Confused? So I am.). For example if you set `combined 2` you will effectively
+have to create 4 rx queues in AF_XDP if you want to be sure to receive all
+packets.
+
## Requirements
+This drivers supports Linux kernel 5.4 and later. Kernels older than 5.4 are
+missing unaligned buffers support.
+
The Linux kernel interface must be up and have enough queues before
creating the VPP AF_XDP interface, otherwise Linux will deny creating
the AF_XDP socket.
diff --git a/src/plugins/af_xdp/api.c b/src/plugins/af_xdp/api.c
index 7592aa4ffda..1864c4c2ee9 100644
--- a/src/plugins/af_xdp/api.c
+++ b/src/plugins/af_xdp/api.c
@@ -44,6 +44,17 @@ af_xdp_api_mode (vl_api_af_xdp_mode_t mode)
return AF_XDP_MODE_AUTO;
}
+static af_xdp_create_flag_t
+af_xdp_api_flags (vl_api_af_xdp_flag_t flags)
+{
+ af_xdp_create_flag_t cflags = 0;
+
+ if (flags & AF_XDP_API_FLAGS_NO_SYSCALL_LOCK)
+ cflags |= AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK;
+
+ return cflags;
+}
+
static void
vl_api_af_xdp_create_t_handler (vl_api_af_xdp_create_t * mp)
{
@@ -59,6 +70,7 @@ vl_api_af_xdp_create_t_handler (vl_api_af_xdp_create_t * mp)
args.name = mp->name[0] ? (char *) mp->name : 0;
args.prog = mp->prog[0] ? (char *) mp->prog : 0;
args.mode = af_xdp_api_mode (mp->mode);
+ args.flags = af_xdp_api_flags (mp->flags);
args.rxq_size = ntohs (mp->rxq_size);
args.txq_size = ntohs (mp->txq_size);
args.rxq_num = ntohs (mp->rxq_num);
diff --git a/src/plugins/af_xdp/cli.c b/src/plugins/af_xdp/cli.c
index d5f21d4c391..2f3deffaaee 100644
--- a/src/plugins/af_xdp/cli.c
+++ b/src/plugins/af_xdp/cli.c
@@ -47,7 +47,10 @@ af_xdp_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (af_xdp_create_command, static) = {
.path = "create interface af_xdp",
- .short_help = "create interface af_xdp <host-if linux-ifname> [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues <num|all>] [prog pathname] [zero-copy|no-zero-copy]",
+ .short_help =
+ "create interface af_xdp <host-if linux-ifname> [name ifname] "
+ "[rx-queue-size size] [tx-queue-size size] [num-rx-queues <num|all>] "
+ "[prog pathname] [zero-copy|no-zero-copy] [no-syscall-lock]",
.function = af_xdp_create_command_fn,
};
/* *INDENT-ON* */
diff --git a/src/plugins/af_xdp/device.c b/src/plugins/af_xdp/device.c
index 35ba617d6e3..5bc7e308173 100644
--- a/src/plugins/af_xdp/device.c
+++ b/src/plugins/af_xdp/device.c
@@ -93,8 +93,7 @@ af_xdp_delete_if (vlib_main_t * vm, af_xdp_device_t * ad)
af_xdp_main_t *axm = &af_xdp_main;
struct xsk_socket **xsk;
struct xsk_umem **umem;
- af_xdp_rxq_t *rxq;
- af_xdp_txq_t *txq;
+ int i;
if (ad->hw_if_index)
{
@@ -102,11 +101,17 @@ af_xdp_delete_if (vlib_main_t * vm, af_xdp_device_t * ad)
ethernet_delete_interface (vnm, ad->hw_if_index);
}
- vec_foreach (rxq, ad->rxqs) clib_file_del_by_index (&file_main,
- rxq->file_index);
- vec_foreach (txq, ad->txqs) clib_spinlock_free (&txq->lock);
- vec_foreach (xsk, ad->xsk) xsk_socket__delete (*xsk);
- vec_foreach (umem, ad->umem) xsk_umem__delete (*umem);
+ for (i = 0; i < ad->rxq_num; i++)
+ clib_file_del_by_index (&file_main, vec_elt (ad->rxqs, i).file_index);
+
+ for (i = 0; i < ad->txq_num; i++)
+ clib_spinlock_free (&vec_elt (ad->txqs, i).lock);
+
+ vec_foreach (xsk, ad->xsk)
+ xsk_socket__delete (*xsk);
+
+ vec_foreach (umem, ad->umem)
+ xsk_umem__delete (*umem);
if (ad->bpf_obj)
{
@@ -169,8 +174,8 @@ err0:
}
static int
-af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args,
- af_xdp_device_t * ad, int qid, int rxq_num, int txq_num)
+af_xdp_create_queue (vlib_main_t *vm, af_xdp_create_if_args_t *args,
+ af_xdp_device_t *ad, int qid)
{
struct xsk_umem **umem;
struct xsk_socket **xsk;
@@ -180,6 +185,8 @@ af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args,
struct xsk_socket_config sock_config;
struct xdp_options opt;
socklen_t optlen;
+ const int is_rx = qid < ad->rxq_num;
+ const int is_tx = qid < ad->txq_num;
vec_validate_aligned (ad->umem, qid, CLIB_CACHE_LINE_BYTES);
umem = vec_elt_at_index (ad->umem, qid);
@@ -197,9 +204,9 @@ af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args,
* fq and cq must always be allocated even if unused
* whereas rx and tx indicates whether we want rxq, txq, or both
*/
- struct xsk_ring_cons *rx = qid < rxq_num ? &rxq->rx : 0;
+ struct xsk_ring_cons *rx = is_rx ? &rxq->rx : 0;
struct xsk_ring_prod *fq = &rxq->fq;
- struct xsk_ring_prod *tx = qid < txq_num ? &txq->tx : 0;
+ struct xsk_ring_prod *tx = is_tx ? &txq->tx : 0;
struct xsk_ring_cons *cq = &txq->cq;
int fd;
@@ -268,8 +275,32 @@ af_xdp_create_queue (vlib_main_t * vm, af_xdp_create_if_args_t * args,
if (opt.flags & XDP_OPTIONS_ZEROCOPY)
ad->flags |= AF_XDP_DEVICE_F_ZEROCOPY;
- rxq->xsk_fd = qid < rxq_num ? fd : -1;
- txq->xsk_fd = qid < txq_num ? fd : -1;
+ rxq->xsk_fd = is_rx ? fd : -1;
+
+ if (is_tx)
+ {
+ txq->xsk_fd = fd;
+ if (is_rx && (ad->flags & AF_XDP_DEVICE_F_SYSCALL_LOCK))
+ {
+ /* This is a shared rx+tx queue and we need to lock before syscalls.
+ * Prior to Linux 5.6 there is a race condition preventing to call
+ * poll() and sendto() concurrently on AF_XDP sockets. This was
+ * fixed with commit 11cc2d21499cabe7e7964389634ed1de3ee91d33
+ * to workaround this issue, we protect the syscalls with a
+ * spinlock. Note that it also prevents to use interrupt mode in
+ * multi workers setup, because in this case the poll() is done in
+ * the framework w/o any possibility to protect it.
+ * See
+ * https://lore.kernel.org/bpf/BYAPR11MB365382C5DB1E5FCC53242609C1549@BYAPR11MB3653.namprd11.prod.outlook.com/
+ */
+ clib_spinlock_init (&rxq->syscall_lock);
+ txq->syscall_lock = rxq->syscall_lock;
+ }
+ }
+ else
+ {
+ txq->xsk_fd = -1;
+ }
return 0;
@@ -308,19 +339,37 @@ af_xdp_device_rxq_read_ready (clib_file_t * f)
return 0;
}
-static void
-af_xdp_device_set_rxq_mode (af_xdp_rxq_t *rxq, int is_polling)
+static clib_error_t *
+af_xdp_device_set_rxq_mode (const af_xdp_device_t *ad, af_xdp_rxq_t *rxq,
+ const af_xdp_rxq_mode_t mode)
{
clib_file_main_t *fm = &file_main;
+ clib_file_update_type_t update;
clib_file_t *f;
- if (rxq->is_polling == is_polling)
- return;
+ if (rxq->mode == mode)
+ return 0;
+
+ switch (mode)
+ {
+ case AF_XDP_RXQ_MODE_POLLING:
+ update = UNIX_FILE_UPDATE_DELETE;
+ break;
+ case AF_XDP_RXQ_MODE_INTERRUPT:
+ if (ad->flags & AF_XDP_DEVICE_F_SYSCALL_LOCK)
+ return clib_error_create (
+ "kernel workaround incompatible with interrupt mode");
+ update = UNIX_FILE_UPDATE_ADD;
+ break;
+ default:
+ ASSERT (0);
+ return clib_error_create ("unknown rxq mode %i", mode);
+ }
f = clib_file_get (fm, rxq->file_index);
- fm->file_update (f, is_polling ? UNIX_FILE_UPDATE_DELETE :
- UNIX_FILE_UPDATE_ADD);
- rxq->is_polling = !!is_polling;
+ fm->file_update (f, update);
+ rxq->mode = mode;
+ return 0;
}
void
@@ -361,6 +410,10 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
pool_get_zero (am->devices, ad);
+ if (tm->n_vlib_mains > 1 &&
+ 0 == (args->flags & AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK))
+ ad->flags |= AF_XDP_DEVICE_F_SYSCALL_LOCK;
+
ad->linux_ifname = (char *) format (0, "%s", args->linux_ifname);
vec_validate (ad->linux_ifname, IFNAMSIZ - 1); /* libbpf expects ifname to be at least IFNAMSIZ */
@@ -368,10 +421,11 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
goto err1;
q_num = clib_max (rxq_num, txq_num);
+ ad->rxq_num = rxq_num;
ad->txq_num = txq_num;
for (i = 0; i < q_num; i++)
{
- if (af_xdp_create_queue (vm, args, ad, i, rxq_num, txq_num))
+ if (af_xdp_create_queue (vm, args, ad, i))
{
/*
* queue creation failed
@@ -387,15 +441,21 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
vec_set_len (ad->rxqs, i);
vec_set_len (ad->txqs, i);
+ ad->rxq_num = clib_min (i, rxq_num);
+ ad->txq_num = clib_min (i, txq_num);
+
if (i < rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != rxq_num)
- goto err1; /* failed creating requested rxq: fatal error, bailing out */
+ {
+ ad->rxq_num = ad->txq_num = 0;
+ goto err1; /* failed creating requested rxq: fatal error, bailing
+ out */
+ }
if (i < txq_num)
{
/* we created less txq than threads not an error but initialize lock for shared txq */
- af_xdp_txq_t *txq;
- ad->txq_num = i;
- vec_foreach (txq, ad->txqs) clib_spinlock_init (&txq->lock);
+ for (i = 0; i < ad->txq_num; i++)
+ clib_spinlock_init (&vec_elt (ad->txqs, i).lock);
}
args->rv = 0;
@@ -436,7 +496,7 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
vnet_hw_if_set_input_node (vnm, ad->hw_if_index, af_xdp_input_node.index);
- for (i = 0; i < vec_len (ad->rxqs); i++)
+ for (i = 0; i < ad->rxq_num; i++)
{
af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, i);
rxq->queue_index = vnet_hw_if_register_rx_queue (
@@ -445,7 +505,6 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
ad->dev_instance, i);
clib_file_t f = {
.file_descriptor = rxq->xsk_fd,
- .flags = UNIX_FILE_EVENT_EDGE_TRIGGERED,
.private_data = rxq->queue_index,
.read_function = af_xdp_device_rxq_read_ready,
.description = desc,
@@ -453,7 +512,8 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args)
rxq->file_index = clib_file_add (&file_main, &f);
vnet_hw_if_set_rx_queue_file_index (vnm, rxq->queue_index,
rxq->file_index);
- af_xdp_device_set_rxq_mode (rxq, 1 /* polling */);
+ if (af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING))
+ goto err1;
}
vnet_hw_if_update_runtime_data (vnm, ad->hw_if_index);
@@ -510,24 +570,20 @@ af_xdp_interface_rx_mode_change (vnet_main_t *vnm, u32 hw_if_index, u32 qid,
switch (mode)
{
- case VNET_HW_IF_RX_MODE_UNKNOWN:
- case VNET_HW_IF_NUM_RX_MODES: /* fallthrough */
+ default: /* fallthrough */
+ case VNET_HW_IF_RX_MODE_UNKNOWN: /* fallthrough */
+ case VNET_HW_IF_NUM_RX_MODES:
return clib_error_create ("uknown rx mode - doing nothing");
- case VNET_HW_IF_RX_MODE_DEFAULT:
- case VNET_HW_IF_RX_MODE_POLLING: /* fallthrough */
- if (rxq->is_polling)
- break;
- af_xdp_device_set_rxq_mode (rxq, 1 /* polling */);
- break;
- case VNET_HW_IF_RX_MODE_INTERRUPT:
- case VNET_HW_IF_RX_MODE_ADAPTIVE: /* fallthrough */
- if (0 == rxq->is_polling)
- break;
- af_xdp_device_set_rxq_mode (rxq, 0 /* interrupt */);
- break;
+ case VNET_HW_IF_RX_MODE_DEFAULT: /* fallthrough */
+ case VNET_HW_IF_RX_MODE_POLLING:
+ return af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING);
+ case VNET_HW_IF_RX_MODE_INTERRUPT: /* fallthrough */
+ case VNET_HW_IF_RX_MODE_ADAPTIVE:
+ return af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_INTERRUPT);
}
- return 0;
+ ASSERT (0 && "unreachable");
+ return clib_error_create ("unreachable");
}
static void
diff --git a/src/plugins/af_xdp/input.c b/src/plugins/af_xdp/input.c
index da422bdccaf..dcbf5a4b587 100644
--- a/src/plugins/af_xdp/input.c
+++ b/src/plugins/af_xdp/input.c
@@ -24,9 +24,9 @@
#include <vnet/interface/rx_queue_funcs.h>
#include "af_xdp.h"
-#define foreach_af_xdp_input_error \
- _(POLL_REQUIRED, "poll required") \
- _(POLL_FAILURES, "poll failures")
+#define foreach_af_xdp_input_error \
+ _ (SYSCALL_REQUIRED, "syscall required") \
+ _ (SYSCALL_FAILURES, "syscall failures")
typedef enum
{
@@ -77,25 +77,28 @@ af_xdp_device_input_refill_db (vlib_main_t * vm,
af_xdp_device_t * ad, af_xdp_rxq_t * rxq,
const u32 n_alloc)
{
- int ret;
-
xsk_ring_prod__submit (&rxq->fq, n_alloc);
- if (!xsk_ring_prod__needs_wakeup (&rxq->fq))
+ if (AF_XDP_RXQ_MODE_INTERRUPT == rxq->mode ||
+ !xsk_ring_prod__needs_wakeup (&rxq->fq))
return;
- vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_POLL_REQUIRED,
+ vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_SYSCALL_REQUIRED,
1);
- struct pollfd fd = {.fd = rxq->xsk_fd,.events = POLLIN };
- ret = poll (&fd, 1, 0);
- if (PREDICT_TRUE (ret >= 0))
- return;
-
- /* something bad is happening */
- vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_POLL_FAILURES,
- 1);
- af_xdp_device_error (ad, "poll() failed");
+ if (clib_spinlock_trylock_if_init (&rxq->syscall_lock))
+ {
+ struct pollfd fd = { .fd = rxq->xsk_fd, .events = POLLIN | POLLOUT };
+ int ret = poll (&fd, 1, 0);
+ clib_spinlock_unlock_if_init (&rxq->syscall_lock);
+ if (PREDICT_FALSE (ret < 0))
+ {
+ /* something bad is happening */
+ vlib_error_count (vm, node->node_index,
+ AF_XDP_INPUT_ERROR_SYSCALL_FAILURES, 1);
+ af_xdp_device_error (ad, "rx poll() failed");
+ }
+ }
}
static_always_inline void
diff --git a/src/plugins/af_xdp/output.c b/src/plugins/af_xdp/output.c
index 52c34e00d95..51a56ed866d 100644
--- a/src/plugins/af_xdp/output.c
+++ b/src/plugins/af_xdp/output.c
@@ -1,4 +1,4 @@
-#include <errno.h>
+#include <poll.h>
#include <string.h>
#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
@@ -90,31 +90,29 @@ af_xdp_device_output_tx_db (vlib_main_t * vm,
af_xdp_device_t * ad,
af_xdp_txq_t * txq, const u32 n_tx)
{
- int ret;
-
xsk_ring_prod__submit (&txq->tx, n_tx);
if (!xsk_ring_prod__needs_wakeup (&txq->tx))
return;
- vlib_error_count (vm, node->node_index, AF_XDP_TX_ERROR_SENDTO_REQUIRED, 1);
+ vlib_error_count (vm, node->node_index, AF_XDP_TX_ERROR_SYSCALL_REQUIRED, 1);
- ret = sendto (txq->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
- if (PREDICT_TRUE (ret >= 0))
- return;
+ clib_spinlock_lock_if_init (&txq->syscall_lock);
- /* those errors are fine */
- switch (errno)
+ if (xsk_ring_prod__needs_wakeup (&txq->tx))
{
- case ENOBUFS:
- case EAGAIN:
- case EBUSY:
- return;
+ struct pollfd fd = { .fd = txq->xsk_fd, .events = POLLIN | POLLOUT };
+ int ret = poll (&fd, 1, 0);
+ if (PREDICT_FALSE (ret < 0))
+ {
+ /* something bad is happening */
+ vlib_error_count (vm, node->node_index,
+ AF_XDP_TX_ERROR_SYSCALL_FAILURES, 1);
+ af_xdp_device_error (ad, "tx poll() failed");
+ }
}
- /* something bad is happening */
- vlib_error_count (vm, node->node_index, AF_XDP_TX_ERROR_SENDTO_FAILURES, 1);
- af_xdp_device_error (ad, "sendto() failed");
+ clib_spinlock_unlock_if_init (&txq->syscall_lock);
}
static_always_inline u32
@@ -218,7 +216,8 @@ VNET_DEVICE_CLASS_TX_FN (af_xdp_device_class) (vlib_main_t * vm,
vnet_interface_output_runtime_t *ord = (void *) node->runtime_data;
af_xdp_device_t *ad = pool_elt_at_index (rm->devices, ord->dev_instance);
u32 thread_index = vm->thread_index;
- af_xdp_txq_t *txq = vec_elt_at_index (ad->txqs, thread_index % ad->txq_num);
+ af_xdp_txq_t *txq =
+ vec_elt_at_index (ad->txqs, (thread_index - 1) % ad->txq_num);
u32 *from;
u32 n, n_tx;
int i;
diff --git a/src/plugins/af_xdp/test_api.c b/src/plugins/af_xdp/test_api.c
index 270db4e2973..6dffa29bdd1 100644
--- a/src/plugins/af_xdp/test_api.c
+++ b/src/plugins/af_xdp/test_api.c
@@ -81,6 +81,8 @@ api_af_xdp_create (vat_main_t * vam)
mp->rxq_size = clib_host_to_net_u16 (args.rxq_size);
mp->txq_size = clib_host_to_net_u16 (args.txq_size);
mp->mode = api_af_xdp_mode (args.mode);
+ if (args.flags & AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK)
+ mp->flags |= AF_XDP_API_FLAGS_NO_SYSCALL_LOCK;
snprintf ((char *) mp->prog, sizeof (mp->prog), "%s", args.prog ? : "");
S (mp);
diff --git a/src/plugins/af_xdp/unformat.c b/src/plugins/af_xdp/unformat.c
index b2292464112..bb4c3048d23 100644
--- a/src/plugins/af_xdp/unformat.c
+++ b/src/plugins/af_xdp/unformat.c
@@ -50,6 +50,8 @@ unformat_af_xdp_create_if_args (unformat_input_t * input, va_list * vargs)
args->mode = AF_XDP_MODE_COPY;
else if (unformat (line_input, "zero-copy"))
args->mode = AF_XDP_MODE_ZERO_COPY;
+ else if (unformat (line_input, "no-syscall-lock"))
+ args->flags |= AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK;
else
{
/* return failure on unknown input */