diff options
Diffstat (limited to 'src/plugins/af_xdp')
-rw-r--r-- | src/plugins/af_xdp/CMakeLists.txt | 34 | ||||
-rw-r--r-- | src/plugins/af_xdp/af_xdp.api | 97 | ||||
-rw-r--r-- | src/plugins/af_xdp/af_xdp.h | 9 | ||||
-rw-r--r-- | src/plugins/af_xdp/af_xdp_doc.md | 129 | ||||
-rw-r--r-- | src/plugins/af_xdp/af_xdp_doc.rst | 164 | ||||
-rw-r--r-- | src/plugins/af_xdp/api.c | 69 | ||||
-rw-r--r-- | src/plugins/af_xdp/cli.c | 8 | ||||
-rw-r--r-- | src/plugins/af_xdp/device.c | 425 | ||||
-rw-r--r-- | src/plugins/af_xdp/input.c | 9 | ||||
-rw-r--r-- | src/plugins/af_xdp/output.c | 51 | ||||
-rw-r--r-- | src/plugins/af_xdp/plugin.c | 2 | ||||
-rw-r--r-- | src/plugins/af_xdp/test_api.c | 105 | ||||
-rw-r--r-- | src/plugins/af_xdp/unformat.c | 2 |
13 files changed, 837 insertions, 267 deletions
diff --git a/src/plugins/af_xdp/CMakeLists.txt b/src/plugins/af_xdp/CMakeLists.txt index cbe96aa59dd..fd7ee4e835b 100644 --- a/src/plugins/af_xdp/CMakeLists.txt +++ b/src/plugins/af_xdp/CMakeLists.txt @@ -11,36 +11,37 @@ # See the License for the specific language governing permissions and # limitations under the License. -vpp_find_path(BPF_INCLUDE_DIR NAMES bpf/xsk.h) -if (NOT BPF_INCLUDE_DIR) - message(WARNING "libbpf headers not found - af_xdp plugin disabled") +vpp_find_path(XDP_INCLUDE_DIR NAMES xdp/xsk.h) +if (NOT XDP_INCLUDE_DIR) + message(WARNING "libxdp headers not found - af_xdp plugin disabled") return() endif() set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS TRUE) +vpp_plugin_find_library(af_xdp XDP_LIB libxdp.a) vpp_plugin_find_library(af_xdp BPF_LIB libbpf.a) -vpp_plugin_find_library(af_xdp BPF_ELF_LIB elf) -vpp_plugin_find_library(af_xdp BPF_Z_LIB z) -if (NOT BPF_LIB OR NOT BPF_ELF_LIB OR NOT BPF_Z_LIB) +vpp_plugin_find_library(af_xdp ELF_LIB elf) +vpp_plugin_find_library(af_xdp Z_LIB z) +if (NOT XDP_LIB OR NOT BPF_LIB OR NOT ELF_LIB OR NOT Z_LIB) message(WARNING "af_xdp plugin - missing libraries - af_xdp plugin disabled") return() endif() set(CMAKE_REQUIRED_FLAGS "-fPIC") -set(CMAKE_REQUIRED_INCLUDES "${BPF_INCLUDE_DIR}") -set(CMAKE_REQUIRED_LIBRARIES "${BPF_LIB}" "${BPF_ELF_LIB}" "${BPF_Z_LIB}") +set(CMAKE_REQUIRED_INCLUDES "${XDP_INCLUDE_DIR}") +set(CMAKE_REQUIRED_LIBRARIES "${XDP_LIB}" "${BPF_LIB}" "${ELF_LIB}" "${Z_LIB}") CHECK_C_SOURCE_COMPILES(" -#include <bpf/xsk.h> +#include <xdp/xsk.h> int main(void) { return xsk_socket__create (0, 0, 0, 0, 0, 0, 0); -}" BPF_COMPILES_CHECK) -if (NOT BPF_COMPILES_CHECK) - message(WARNING "af_xdp plugins - no working libbpf found - af_xdp plugin disabled") +}" XDP_COMPILES_CHECK) +if (NOT XDP_COMPILES_CHECK) +message(WARNING "af_xdp plugins - no working libxdp found - af_xdp plugin disabled") return() endif() -include_directories(${BPF_INCLUDE_DIR}) +include_directories(${XDP_INCLUDE_DIR}) 
add_vpp_plugin(af_xdp SOURCES @@ -65,7 +66,10 @@ add_vpp_plugin(af_xdp test_api.c LINK_LIBRARIES + ${XDP_LIB} ${BPF_LIB} - ${BPF_ELF_LIB} - ${BPF_Z_LIB} + ${ELF_LIB} + ${Z_LIB} + + SUPPORTED_OS_LIST Linux ) diff --git a/src/plugins/af_xdp/af_xdp.api b/src/plugins/af_xdp/af_xdp.api index c6716123703..4c2908e2037 100644 --- a/src/plugins/af_xdp/af_xdp.api +++ b/src/plugins/af_xdp/af_xdp.api @@ -15,7 +15,7 @@ *------------------------------------------------------------------ */ -option version = "0.2.0"; +option version = "1.0.0"; import "vnet/interface_types.api"; enum af_xdp_mode @@ -57,7 +57,39 @@ define af_xdp_create vl_api_af_xdp_flag_t flags [default=0]; string prog[256]; option vat_help = "<host-if linux-ifname> [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues <num|all>] [prog pathname] [zero-copy|no-zero-copy] [no-syscall-lock]"; - option status="in_progress"; + option deprecated; +}; + +/** \brief + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param host_if - Linux netdev interface name + @param name - new af_xdp interface name (optional) + @param rxq_num - number of receive queues. 
65535 can be used as special value to request all available queues (optional) + @param rxq_size - receive queue size (optional) + @param txq_size - transmit queue size (optional) + @param mode - operation mode (optional) + @param flags - flags (optional) + @param prog - eBPF program path (optional) + @param namespace - netns of nic (optional) +*/ + +define af_xdp_create_v2 +{ + u32 client_index; + u32 context; + + string host_if[64]; + string name[64]; + u16 rxq_num [default=1]; + u16 rxq_size [default=0]; + u16 txq_size [default=0]; + vl_api_af_xdp_mode_t mode [default=0]; + vl_api_af_xdp_flag_t flags [default=0]; + string prog[256]; + string namespace[64]; + option vat_help = "<host-if linux-ifname> [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues <num|all>] [prog pathname] [netns ns] [zero-copy|no-zero-copy] [no-syscall-lock]"; + option deprecated; }; /** \brief @@ -71,7 +103,21 @@ define af_xdp_create_reply u32 context; i32 retval; vl_api_interface_index_t sw_if_index; - option status="in_progress"; + option deprecated; +}; + +/** \brief + @param context - sender context, to match reply w/ request + @param retval - return value for request + @param sw_if_index - software index for the new af_xdp interface +*/ + +define af_xdp_create_v2_reply +{ + u32 context; + i32 retval; + vl_api_interface_index_t sw_if_index; + option deprecated; }; /** \brief @@ -80,6 +126,50 @@ define af_xdp_create_reply @param sw_if_index - interface index */ +/** \brief + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param host_if - Linux netdev interface name + @param name - new af_xdp interface name (optional) + @param rxq_num - number of receive queues. 
65535 can be used as special value to request all available queues (optional) + @param rxq_size - receive queue size (optional) + @param txq_size - transmit queue size (optional) + @param mode - operation mode (optional) + @param flags - flags (optional) + @param prog - eBPF program path (optional) + @param netns - netns of nic (optional) +*/ + +autoendian define af_xdp_create_v3 +{ + u32 client_index; + u32 context; + + string host_if[64]; + string name[64]; + u16 rxq_num [default=1]; + u16 rxq_size [default=0]; + u16 txq_size [default=0]; + vl_api_af_xdp_mode_t mode [default=0]; + vl_api_af_xdp_flag_t flags [default=0]; + string prog[256]; + string netns[64]; + option vat_help = "<host-if linux-ifname> [name ifname] [rx-queue-size size] [tx-queue-size size] [num-rx-queues <num|all>] [prog pathname] [netns ns] [zero-copy|no-zero-copy] [no-syscall-lock]"; +}; + +/** \brief + @param context - sender context, to match reply w/ request + @param retval - return value for request + @param sw_if_index - software index for the new af_xdp interface +*/ + +autoendian define af_xdp_create_v3_reply +{ + u32 context; + i32 retval; + vl_api_interface_index_t sw_if_index; +}; + autoreply define af_xdp_delete { u32 client_index; @@ -87,7 +177,6 @@ autoreply define af_xdp_delete vl_api_interface_index_t sw_if_index; option vat_help = "<sw_if_index index>"; - option status="in_progress"; }; /* diff --git a/src/plugins/af_xdp/af_xdp.h b/src/plugins/af_xdp/af_xdp.h index 825a3fb29fd..cf364fc86a8 100644 --- a/src/plugins/af_xdp/af_xdp.h +++ b/src/plugins/af_xdp/af_xdp.h @@ -20,7 +20,7 @@ #include <vlib/log.h> #include <vnet/interface.h> -#include <bpf/xsk.h> +#include <xdp/xsk.h> #define AF_XDP_NUM_RX_QUEUES_ALL ((u16)-1) @@ -86,6 +86,10 @@ typedef struct struct xsk_ring_prod tx; struct xsk_ring_cons cq; int xsk_fd; + + /* fields below are accessed in control-plane only (cold) */ + + u32 queue_index; } af_xdp_txq_t; typedef struct @@ -113,6 +117,8 @@ typedef struct u8 rxq_num; + char 
*netns; + struct xsk_umem **umem; struct xsk_socket **xsk; @@ -149,6 +155,7 @@ typedef struct char *linux_ifname; char *name; char *prog; + char *netns; af_xdp_mode_t mode; af_xdp_create_flag_t flags; u32 rxq_size; diff --git a/src/plugins/af_xdp/af_xdp_doc.md b/src/plugins/af_xdp/af_xdp_doc.md deleted file mode 100644 index f5859dbb901..00000000000 --- a/src/plugins/af_xdp/af_xdp_doc.md +++ /dev/null @@ -1,129 +0,0 @@ -# AF_XDP Ethernet driver {#af_xdp_doc} - -This driver relies on Linux AF_XDP socket to rx/tx Ethernet packets. - -## Maturity level -Under development: it should work, but has not been thoroughly tested. - -## Features - - copy and zero-copy mode - - multiqueue - - API - - custom eBPF program - - polling, interrupt and adaptive mode - -## Known limitations - -### MTU -Because of AF_XDP restrictions, the MTU is limited to below PAGE_SIZE -(4096-bytes on most systems) minus 256-bytes, and they are additional -limitations depending upon specific Linux device drivers. -As a rule of thumb, a MTU of 3000-bytes or less should be safe. - -### Number of buffers -Furthermore, upon UMEM creation, the kernel allocates a -physically-contiguous structure, whose size is proportional to the number -of 4KB pages contained in the UMEM. That allocation might fail when -the number of buffers allocated by VPP is too high. That number can be -controlled with the `buffers { buffers-per-numa }` configuration option. -Finally, note that because of this limitation, this plugin is unlikely -to be compatible with the use of 1GB hugepages. - -### Interrupt mode -Interrupt and adaptive mode are supported but is limited by default to single -threaded (no worker) configurations because of a kernel limitation prior to -5.6. You can bypass the limitation at interface creation time by adding the -`no-syscall-lock` parameter, but you must be sure that your kernel can -support it, otherwise you will experience double-frees. 
-See -https://lore.kernel.org/bpf/BYAPR11MB365382C5DB1E5FCC53242609C1549@BYAPR11MB3653.namprd11.prod.outlook.com/ -for more details. - -### Mellanox -When setting the number of queues on Mellanox NIC with `ethtool -L`, you must -use twice the amount of configured queues: it looks like the Linux driver will -create separate RX queues and TX queues (but all queues can be used for both -RX and TX, the NIC will just not sent any packet on "pure" TX queues. -Confused? So I am.). For example if you set `combined 2` you will effectively -have to create 4 rx queues in AF_XDP if you want to be sure to receive all -packets. - -## Requirements -This drivers supports Linux kernel 5.4 and later. Kernels older than 5.4 are -missing unaligned buffers support. - -The Linux kernel interface must be up and have enough queues before -creating the VPP AF_XDP interface, otherwise Linux will deny creating -the AF_XDP socket. -The AF_XDP interface will claim NIC RX queue starting from 0, up to the -requested number of RX queues (only 1 by default). It means all packets -destined to NIC RX queue `[0, num_rx_queues[` will be received by the -AF_XDP interface, and only them. Depending on your configuration, there -will usually be several RX queues (typically 1 per core) and packets are -spread accross queues by RSS. In order to receive consistent traffic, -you **must** program the NIC dispatching accordingly. The simplest way -to get all the packets is to specify `num-rx-queues all` to grab all -available queues or to reconfigure the Linux kernel driver to use only -`num_rx_queues` RX queues (ie all NIC queues will be associated with -the AF_XDP socket): -``` -~# ethtool -L <iface> combined <num_rx_queues> -``` -Additionally, the VPP AF_XDP interface will use a MAC address generated at -creation time instead of the Linux kernel interface MAC. 
As Linux kernel -interface are not in promiscuous mode by default (see below) this will -results in a useless configuration where the VPP AF_XDP interface only -receives packets destined to the Linux kernel interface MAC just to drop -them because the destination MAC does not match VPP AF_XDP interface MAC. -If you want to use the Linux interface MAC for the VPP AF_XDP interface, -you can change it afterwards in VPP: -``` -~# vppctl set int mac address <iface> <mac> -``` -Finally, if you wish to receive all packets and not only the packets -destined to the Linux kernel interface MAC you need to set the Linux -kernel interface in promiscuous mode: -``` -~# ip link set dev <iface> promisc on -``` - -## Security considerations -When creating an AF_XDP interface, it will receive all packets arriving -to the NIC RX queue `[0, num_rx_queues[`. You need to configure the Linux -kernel NIC driver properly to ensure that only intented packets will -arrive in this queue. There is no way to filter the packets after-the-fact -using eg. netfilter or eBPF. - -## Quickstart -1. Put the Linux kernel interface up and in promiscuous mode: -``` -~# ip l set dev enp216s0f0 promisc on up -``` -2. Create the AF_XDP interface: -``` -~# vppctl create int af_xdp host-if enp216s0f0 num-rx-queues all -``` -3. Use the interface as usual, eg.: -``` -~# vppctl set int ip addr enp216s0f0/0 1.1.1.1/24 -~# vppctl set int st enp216s0f0/0 up -~# vppctl ping 1.1.1.100` -``` - -## Custom eBPF XDP program -This driver relies on libbpf and as such relies on the `xsks_map` eBPF -map. The default behavior is to use the XDP program already attached -to the interface if any, otherwise load the default one. -You can request to load a custom XDP program with the `prog` option when -creating the interface in VPP: -``` -~# vppctl create int af_xdp host-if enp216s0f0 num-rx-queues 4 prog extras/bpf/af_xdp.bpf.o -``` -In that case it will replace any previously attached program. 
A custom -XDP program example is provided in `extras/bpf/`. - -## Performance consideration -AF_XDP relies on the Linux kernel NIC driver to rx/tx packets. To reach -high-performance (10's MPPS), the Linux kernel NIC driver must support -zero-copy mode and its RX path must run on a dedicated core in the NUMA -where the NIC is physically connected. diff --git a/src/plugins/af_xdp/af_xdp_doc.rst b/src/plugins/af_xdp/af_xdp_doc.rst new file mode 100644 index 00000000000..de951340a2d --- /dev/null +++ b/src/plugins/af_xdp/af_xdp_doc.rst @@ -0,0 +1,164 @@ +AF_XDP device driver +==================== + +This driver relies on Linux AF_XDP socket to rx/tx Ethernet packets. + +Maturity level +-------------- + +Under development: it should work, but has not been thoroughly tested. + +Features +-------- + +- copy and zero-copy mode +- multiqueue +- API +- custom eBPF program +- polling, interrupt and adaptive mode + +Known limitations +----------------- + +MTU +~~~ + +Because of AF_XDP restrictions, the MTU is limited to below PAGE_SIZE +(4096-bytes on most systems) minus 256-bytes, and there are additional +limitations depending upon specific Linux device drivers. As a rule of +thumb, a MTU of 3000-bytes or less should be safe. + +Number of buffers +~~~~~~~~~~~~~~~~~ + +Furthermore, upon UMEM creation, the kernel allocates a +physically-contiguous structure, whose size is proportional to the +number of 4KB pages contained in the UMEM. That allocation might fail +when the number of buffers allocated by VPP is too high. That number can +be controlled with the ``buffers { buffers-per-numa }`` configuration +option. Finally, note that because of this limitation, this plugin is +unlikely to be compatible with the use of 1GB hugepages. + +Interrupt mode +~~~~~~~~~~~~~~ + +Interrupt and adaptive mode are supported but are limited by default to +single threaded (no worker) configurations because of a kernel +limitation prior to 5.6. 
You can bypass the limitation at interface +creation time by adding the ``no-syscall-lock`` parameter, but you must +be sure that your kernel can support it, otherwise you will experience +double-frees. See +https://lore.kernel.org/bpf/BYAPR11MB365382C5DB1E5FCC53242609C1549@BYAPR11MB3653.namprd11.prod.outlook.com/ +for more details. + +Mellanox +~~~~~~~~ + +When setting the number of queues on Mellanox NIC with ``ethtool -L``, +you must use twice the amount of configured queues: it looks like the +Linux driver will create separate RX queues and TX queues (but all +queues can be used for both RX and TX, the NIC will just not send any +packet on “pure” TX queues. Confused? So I am.). For example if you set +``combined 2`` you will effectively have to create 4 rx queues in AF_XDP +if you want to be sure to receive all packets. + +Requirements +------------ + +This driver supports Linux kernel 5.4 and later. Kernels older than 5.4 +are missing unaligned buffers support. + +The Linux kernel interface must be up and have enough queues before +creating the VPP AF_XDP interface, otherwise Linux will deny creating +the AF_XDP socket. The AF_XDP interface will claim NIC RX queue starting +from 0, up to the requested number of RX queues (only 1 by default). It +means all packets destined to NIC RX queue ``[0, num_rx_queues[`` will +be received by the AF_XDP interface, and only them. Depending on your +configuration, there will usually be several RX queues (typically 1 per +core) and packets are spread across queues by RSS. In order to receive +consistent traffic, you **must** program the NIC dispatching +accordingly. The simplest way to get all the packets is to specify +``num-rx-queues all`` to grab all available queues or to reconfigure the +Linux kernel driver to use only ``num_rx_queues`` RX queues (i.e. 
all NIC +queues will be associated with the AF_XDP socket): + +:: + + ~# ethtool -L <iface> combined <num_rx_queues> + +Additionally, the VPP AF_XDP interface will use a MAC address generated +at creation time instead of the Linux kernel interface MAC. As Linux +kernel interfaces are not in promiscuous mode by default (see below) this +will result in a useless configuration where the VPP AF_XDP interface +only receives packets destined to the Linux kernel interface MAC just to +drop them because the destination MAC does not match VPP AF_XDP +interface MAC. If you want to use the Linux interface MAC for the VPP +AF_XDP interface, you can change it afterwards in VPP: + +:: + + ~# vppctl set int mac address <iface> <mac> + +Finally, if you wish to receive all packets and not only the packets +destined to the Linux kernel interface MAC you need to set the Linux +kernel interface in promiscuous mode: + +:: + + ~# ip link set dev <iface> promisc on + +Security considerations +----------------------- + +When creating an AF_XDP interface, it will receive all packets arriving +to the NIC RX queue ``[0, num_rx_queues[``. You need to configure the +Linux kernel NIC driver properly to ensure that only intended packets +will arrive in this queue. There is no way to filter the packets +after-the-fact using e.g. netfilter or eBPF. + +Quickstart +---------- + +1. Put the Linux kernel interface up and in promiscuous mode: + +:: + + ~# ip l set dev enp216s0f0 promisc on up + +2. Create the AF_XDP interface: + +:: + + ~# vppctl create int af_xdp host-if enp216s0f0 num-rx-queues all + +3. Use the interface as usual, e.g.: + +:: + + ~# vppctl set int ip addr enp216s0f0/0 1.1.1.1/24 + ~# vppctl set int st enp216s0f0/0 up + ~# vppctl ping 1.1.1.100 + +Custom eBPF XDP program +----------------------- + +This driver relies on libbpf and as such relies on the ``xsks_map`` eBPF +map. 
The default behavior is to use the XDP program already attached to +the interface if any, otherwise load the default one. You can request to +load a custom XDP program with the ``prog`` option when creating the +interface in VPP: + +:: + + ~# vppctl create int af_xdp host-if enp216s0f0 num-rx-queues 4 prog extras/bpf/af_xdp.bpf.o + +In that case it will replace any previously attached program. A custom +XDP program example is provided in ``extras/bpf/``. + +Performance consideration +------------------------- + +AF_XDP relies on the Linux kernel NIC driver to rx/tx packets. To reach +high-performance (10’s MPPS), the Linux kernel NIC driver must support +zero-copy mode and its RX path must run on a dedicated core in the NUMA +where the NIC is physically connected. diff --git a/src/plugins/af_xdp/api.c b/src/plugins/af_xdp/api.c index 1864c4c2ee9..3e9a3fe2578 100644 --- a/src/plugins/af_xdp/api.c +++ b/src/plugins/af_xdp/api.c @@ -27,6 +27,7 @@ #include <af_xdp/af_xdp.api_enum.h> #include <af_xdp/af_xdp.api_types.h> +#define REPLY_MSG_ID_BASE (rm->msg_id_base) #include <vlibapi/api_helper_macros.h> static af_xdp_mode_t @@ -78,12 +79,72 @@ vl_api_af_xdp_create_t_handler (vl_api_af_xdp_create_t * mp) af_xdp_create_if (vm, &args); rv = args.rv; - /* *INDENT-OFF* */ - REPLY_MACRO2 (VL_API_AF_XDP_CREATE_REPLY + rm->msg_id_base, + REPLY_MACRO2 (VL_API_AF_XDP_CREATE_REPLY, + ({ rmp->sw_if_index = ntohl (args.sw_if_index); })); +} + +static void +vl_api_af_xdp_create_v2_t_handler (vl_api_af_xdp_create_v2_t *mp) +{ + vlib_main_t *vm = vlib_get_main (); + af_xdp_main_t *rm = &af_xdp_main; + vl_api_af_xdp_create_v2_reply_t *rmp; + af_xdp_create_if_args_t args; + int rv; + + clib_memset (&args, 0, sizeof (af_xdp_create_if_args_t)); + + args.linux_ifname = mp->host_if[0] ? (char *) mp->host_if : 0; + args.name = mp->name[0] ? (char *) mp->name : 0; + args.prog = mp->prog[0] ? (char *) mp->prog : 0; + args.netns = mp->namespace[0] ? 
(char *) mp->namespace : 0; + args.mode = af_xdp_api_mode (mp->mode); + args.flags = af_xdp_api_flags (mp->flags); + args.rxq_size = ntohs (mp->rxq_size); + args.txq_size = ntohs (mp->txq_size); + args.rxq_num = ntohs (mp->rxq_num); + + af_xdp_create_if (vm, &args); + rv = args.rv; + + /* clang-format off */ + REPLY_MACRO2 (VL_API_AF_XDP_CREATE_V2_REPLY, ({ rmp->sw_if_index = ntohl (args.sw_if_index); })); - /* *INDENT-ON* */ + /* clang-format on */ +} + +static void +vl_api_af_xdp_create_v3_t_handler (vl_api_af_xdp_create_v3_t *mp) +{ + vlib_main_t *vm = vlib_get_main (); + af_xdp_main_t *rm = &af_xdp_main; + vl_api_af_xdp_create_v3_reply_t *rmp; + af_xdp_create_if_args_t args; + int rv; + + clib_memset (&args, 0, sizeof (af_xdp_create_if_args_t)); + + args.linux_ifname = mp->host_if[0] ? (char *) mp->host_if : 0; + args.name = mp->name[0] ? (char *) mp->name : 0; + args.prog = mp->prog[0] ? (char *) mp->prog : 0; + args.netns = mp->netns[0] ? (char *) mp->netns : 0; + args.mode = af_xdp_api_mode (mp->mode); + args.flags = af_xdp_api_flags (mp->flags); + args.rxq_size = mp->rxq_size; + args.txq_size = mp->txq_size; + args.rxq_num = mp->rxq_num; + + af_xdp_create_if (vm, &args); + rv = args.rv; + + /* clang-format off */ + REPLY_MACRO2_END (VL_API_AF_XDP_CREATE_V3_REPLY, + ({ + rmp->sw_if_index = args.sw_if_index; + })); + /* clang-format on */ } static void @@ -111,7 +172,7 @@ vl_api_af_xdp_delete_t_handler (vl_api_af_xdp_delete_t * mp) af_xdp_delete_if (vm, rd); reply: - REPLY_MACRO (VL_API_AF_XDP_DELETE_REPLY + rm->msg_id_base); + REPLY_MACRO (VL_API_AF_XDP_DELETE_REPLY); } /* set tup the API message handling tables */ diff --git a/src/plugins/af_xdp/cli.c b/src/plugins/af_xdp/cli.c index 2f3deffaaee..12d3b875a71 100644 --- a/src/plugins/af_xdp/cli.c +++ b/src/plugins/af_xdp/cli.c @@ -40,20 +40,20 @@ af_xdp_create_command_fn (vlib_main_t * vm, unformat_input_t * input, vec_free (args.linux_ifname); vec_free (args.name); + vec_free (args.prog); + vec_free 
(args.netns); return args.error; } -/* *INDENT-OFF* */ VLIB_CLI_COMMAND (af_xdp_create_command, static) = { .path = "create interface af_xdp", .short_help = "create interface af_xdp <host-if linux-ifname> [name ifname] " "[rx-queue-size size] [tx-queue-size size] [num-rx-queues <num|all>] " - "[prog pathname] [zero-copy|no-zero-copy] [no-syscall-lock]", + "[prog pathname] [netns ns] [zero-copy|no-zero-copy] [no-syscall-lock]", .function = af_xdp_create_command_fn, }; -/* *INDENT-ON* */ static clib_error_t * af_xdp_delete_command_fn (vlib_main_t * vm, unformat_input_t * input, @@ -98,14 +98,12 @@ af_xdp_delete_command_fn (vlib_main_t * vm, unformat_input_t * input, return 0; } -/* *INDENT-OFF* */ VLIB_CLI_COMMAND (af_xdp_delete_command, static) = { .path = "delete interface af_xdp", .short_help = "delete interface af_xdp " "{<interface> | sw_if_index <sw_idx>}", .function = af_xdp_delete_command_fn, }; -/* *INDENT-ON* */ clib_error_t * af_xdp_cli_init (vlib_main_t * vm) diff --git a/src/plugins/af_xdp/device.c b/src/plugins/af_xdp/device.c index 7a10bce4290..63a276ce51e 100644 --- a/src/plugins/af_xdp/device.c +++ b/src/plugins/af_xdp/device.c @@ -17,17 +17,27 @@ #include <stdio.h> #include <net/if.h> +#include <sys/ioctl.h> +#include <linux/ethtool.h> #include <linux/if_link.h> -#include <bpf/libbpf.h> +#include <linux/sockios.h> +#include <linux/limits.h> +#include <bpf/bpf.h> #include <vlib/vlib.h> #include <vlib/unix/unix.h> #include <vlib/pci/pci.h> +#include <vppinfra/linux/netns.h> #include <vppinfra/linux/sysfs.h> #include <vppinfra/unix.h> #include <vnet/ethernet/ethernet.h> #include <vnet/interface/rx_queue_funcs.h> +#include <vnet/interface/tx_queue_funcs.h> #include "af_xdp.h" +#ifndef XDP_UMEM_MIN_CHUNK_SIZE +#define XDP_UMEM_MIN_CHUNK_SIZE 2048 +#endif + af_xdp_main_t af_xdp_main; typedef struct @@ -62,6 +72,16 @@ af_xdp_mac_change (vnet_hw_interface_t * hw, const u8 * old, const u8 * new) return 0; } +static clib_error_t * +af_xdp_set_max_frame_size 
(vnet_main_t *vnm, vnet_hw_interface_t *hw, + u32 frame_size) +{ + af_xdp_main_t *am = &af_xdp_main; + af_xdp_device_t *ad = vec_elt_at_index (am->devices, hw->dev_instance); + af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "set mtu not supported yet"); + return vnet_error (VNET_ERR_UNSUPPORTED, 0); +} + static u32 af_xdp_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags) { @@ -77,15 +97,87 @@ af_xdp_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags) af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "set promiscuous not supported yet"); return ~0; - case ETHERNET_INTERFACE_FLAG_MTU: - af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "set mtu not supported yet"); - return ~0; } af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "unknown flag %x requested", flags); return ~0; } +int +af_xdp_enter_netns (char *netns, int *fds) +{ + *fds = *(fds + 1) = -1; + if (netns != NULL) + { + *fds = clib_netns_open (NULL /* self */); + if ((*(fds + 1) = clib_netns_open ((u8 *) netns)) == -1) + return VNET_API_ERROR_SYSCALL_ERROR_8; + if (clib_setns (*(fds + 1)) == -1) + return VNET_API_ERROR_SYSCALL_ERROR_9; + } + return 0; +} + +void +af_xdp_cleanup_netns (int *fds) +{ + if (*fds != -1) + close (*fds); + + if (*(fds + 1) != -1) + close (*(fds + 1)); + + *fds = *(fds + 1) = -1; +} + +int +af_xdp_exit_netns (char *netns, int *fds) +{ + int ret = 0; + if (netns != NULL) + { + if (*fds != -1) + ret = clib_setns (*fds); + + af_xdp_cleanup_netns (fds); + } + + return ret; +} + +static int +af_xdp_remove_program (af_xdp_device_t *ad) +{ + u32 curr_prog_id = 0; + int ret; + int ns_fds[2]; + + af_xdp_enter_netns (ad->netns, ns_fds); + ret = bpf_xdp_query_id (ad->linux_ifindex, XDP_FLAGS_UPDATE_IF_NOEXIST, + &curr_prog_id); + if (ret != 0) + { + af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "bpf_xdp_query_id failed\n"); + goto err0; + } + + ret = bpf_xdp_detach (ad->linux_ifindex, XDP_FLAGS_UPDATE_IF_NOEXIST, NULL); + if (ret != 0) + { + af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "bpf_xdp_detach failed\n"); + goto err0; 
+ } + af_xdp_exit_netns (ad->netns, ns_fds); + if (ad->bpf_obj) + bpf_object__close (ad->bpf_obj); + + return 0; + +err0: + af_xdp_exit_netns (ad->netns, ns_fds); + return ret; +} + void af_xdp_delete_if (vlib_main_t * vm, af_xdp_device_t * ad) { @@ -101,9 +193,6 @@ af_xdp_delete_if (vlib_main_t * vm, af_xdp_device_t * ad) ethernet_delete_interface (vnm, ad->hw_if_index); } - for (i = 0; i < ad->rxq_num; i++) - clib_file_del_by_index (&file_main, vec_elt (ad->rxqs, i).file_index); - for (i = 0; i < ad->txq_num; i++) clib_spinlock_free (&vec_elt (ad->txqs, i).lock); @@ -113,17 +202,20 @@ af_xdp_delete_if (vlib_main_t * vm, af_xdp_device_t * ad) vec_foreach (umem, ad->umem) xsk_umem__delete (*umem); - if (ad->bpf_obj) - { - bpf_set_link_xdp_fd (ad->linux_ifindex, -1, 0); - bpf_object__unload (ad->bpf_obj); - } + for (i = 0; i < ad->rxq_num; i++) + clib_file_del_by_index (&file_main, vec_elt (ad->rxqs, i).file_index); + + if (af_xdp_remove_program (ad) != 0) + af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, "Error while removing XDP program.\n"); vec_free (ad->xsk); vec_free (ad->umem); vec_free (ad->buffer_template); vec_free (ad->rxqs); vec_free (ad->txqs); + vec_free (ad->name); + vec_free (ad->linux_ifname); + vec_free (ad->netns); clib_error_free (ad->error); pool_put (axm->devices, ad); } @@ -132,44 +224,49 @@ static int af_xdp_load_program (af_xdp_create_if_args_t * args, af_xdp_device_t * ad) { int fd; + struct bpf_program *bpf_prog; + struct rlimit r = { RLIM_INFINITY, RLIM_INFINITY }; - ad->linux_ifindex = if_nametoindex (ad->linux_ifname); - if (!ad->linux_ifindex) - { - args->rv = VNET_API_ERROR_INVALID_VALUE; - args->error = - clib_error_return_unix (0, "if_nametoindex(%s) failed", - ad->linux_ifname); - goto err0; - } + if (setrlimit (RLIMIT_MEMLOCK, &r)) + af_xdp_log (VLIB_LOG_LEVEL_WARNING, ad, + "setrlimit(%s) failed: %s (errno %d)", ad->linux_ifname, + strerror (errno), errno); - if (bpf_prog_load (args->prog, BPF_PROG_TYPE_XDP, &ad->bpf_obj, &fd)) + ad->bpf_obj 
= bpf_object__open_file (args->prog, NULL); + if (libbpf_get_error (ad->bpf_obj)) { args->rv = VNET_API_ERROR_SYSCALL_ERROR_5; - args->error = - clib_error_return_unix (0, "bpf_prog_load(%s) failed", args->prog); + args->error = clib_error_return_unix ( + 0, "bpf_object__open_file(%s) failed", args->prog); goto err0; } -#ifndef XDP_FLAGS_REPLACE -#define XDP_FLAGS_REPLACE 0 -#endif - if (bpf_set_link_xdp_fd (ad->linux_ifindex, fd, XDP_FLAGS_REPLACE)) + bpf_prog = bpf_object__next_program (ad->bpf_obj, NULL); + if (!bpf_prog) + goto err1; + + bpf_program__set_type (bpf_prog, BPF_PROG_TYPE_XDP); + + if (bpf_object__load (ad->bpf_obj)) + goto err1; + + fd = bpf_program__fd (bpf_prog); + + if (bpf_xdp_attach (ad->linux_ifindex, fd, XDP_FLAGS_UPDATE_IF_NOEXIST, + NULL)) { args->rv = VNET_API_ERROR_SYSCALL_ERROR_6; - args->error = - clib_error_return_unix (0, "bpf_set_link_xdp_fd(%s) failed", - ad->linux_ifname); + args->error = clib_error_return_unix (0, "bpf_xdp_attach(%s) failed", + ad->linux_ifname); goto err1; } return 0; err1: - bpf_object__unload (ad->bpf_obj); + bpf_object__close (ad->bpf_obj); ad->bpf_obj = 0; err0: - ad->linux_ifindex = ~0; return -1; } @@ -188,16 +285,9 @@ af_xdp_create_queue (vlib_main_t *vm, af_xdp_create_if_args_t *args, const int is_rx = qid < ad->rxq_num; const int is_tx = qid < ad->txq_num; - vec_validate_aligned (ad->umem, qid, CLIB_CACHE_LINE_BYTES); umem = vec_elt_at_index (ad->umem, qid); - - vec_validate_aligned (ad->xsk, qid, CLIB_CACHE_LINE_BYTES); xsk = vec_elt_at_index (ad->xsk, qid); - - vec_validate_aligned (ad->rxqs, qid, CLIB_CACHE_LINE_BYTES); rxq = vec_elt_at_index (ad->rxqs, qid); - - vec_validate_aligned (ad->txqs, qid, CLIB_CACHE_LINE_BYTES); txq = vec_elt_at_index (ad->txqs, qid); /* @@ -221,8 +311,18 @@ af_xdp_create_queue (vlib_main_t *vm, af_xdp_create_if_args_t *args, (umem, uword_to_pointer (vm->buffer_main->buffer_mem_start, void *), vm->buffer_main->buffer_mem_size, fq, cq, &umem_config)) { + uword sys_page_size 
= clib_mem_get_page_size (); args->rv = VNET_API_ERROR_SYSCALL_ERROR_1; args->error = clib_error_return_unix (0, "xsk_umem__create() failed"); + /* this should mimic the Linux kernel net/xdp/xdp_umem.c:xdp_umem_reg() + * check */ + if (umem_config.frame_size < XDP_UMEM_MIN_CHUNK_SIZE || + umem_config.frame_size > sys_page_size) + args->error = clib_error_return ( + args->error, + "(unsupported data-size? (should be between %d and %d))", + XDP_UMEM_MIN_CHUNK_SIZE - sizeof (vlib_buffer_t), + sys_page_size - sizeof (vlib_buffer_t)); goto err0; } @@ -241,6 +341,8 @@ af_xdp_create_queue (vlib_main_t *vm, af_xdp_create_if_args_t *args, sock_config.bind_flags |= XDP_ZEROCOPY; break; } + if (args->prog) + sock_config.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD; if (xsk_socket__create (xsk, ad->linux_ifname, qid, *umem, rx, tx, &sock_config)) { @@ -253,10 +355,27 @@ af_xdp_create_queue (vlib_main_t *vm, af_xdp_create_if_args_t *args, } fd = xsk_socket__fd (*xsk); + if (args->prog) + { + struct bpf_map *map = + bpf_object__find_map_by_name (ad->bpf_obj, "xsks_map"); + int ret = xsk_socket__update_xskmap (*xsk, bpf_map__fd (map)); + if (ret) + { + args->rv = VNET_API_ERROR_SYSCALL_ERROR_3; + args->error = clib_error_return_unix ( + 0, "xsk_socket__update_xskmap %s qid %d return %d", + ad->linux_ifname, qid, ret); + goto err2; + } + } optlen = sizeof (opt); +#ifndef SOL_XDP +#define SOL_XDP 283 +#endif if (getsockopt (fd, SOL_XDP, XDP_OPTIONS, &opt, &optlen)) { - args->rv = VNET_API_ERROR_SYSCALL_ERROR_3; + args->rv = VNET_API_ERROR_SYSCALL_ERROR_4; args->error = clib_error_return_unix (0, "getsockopt(XDP_OPTIONS) failed"); goto err2; @@ -269,6 +388,7 @@ af_xdp_create_queue (vlib_main_t *vm, af_xdp_create_if_args_t *args, if (is_tx) { txq->xsk_fd = fd; + clib_spinlock_init (&txq->lock); if (is_rx && (ad->flags & AF_XDP_DEVICE_F_SYSCALL_LOCK)) { /* This is a shared rx+tx queue and we need to lock before syscalls. 
@@ -321,6 +441,31 @@ af_xdp_get_numa (const char *ifname) return numa; } +static void +af_xdp_get_q_count (const char *ifname, int *rxq_num, int *txq_num) +{ + struct ethtool_channels ec = { .cmd = ETHTOOL_GCHANNELS }; + struct ifreq ifr = { .ifr_data = (void *) &ec }; + int fd, err; + + *rxq_num = *txq_num = 1; + + fd = socket (AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return; + + snprintf (ifr.ifr_name, sizeof (ifr.ifr_name), "%s", ifname); + err = ioctl (fd, SIOCETHTOOL, &ifr); + + close (fd); + + if (err) + return; + + *rxq_num = clib_max (ec.combined_count, ec.rx_count); + *txq_num = clib_max (ec.combined_count, ec.tx_count); +} + static clib_error_t * af_xdp_device_rxq_read_ready (clib_file_t * f) { @@ -361,22 +506,88 @@ af_xdp_device_set_rxq_mode (const af_xdp_device_t *ad, af_xdp_rxq_t *rxq, return 0; } +static u32 +af_xdp_find_rxq_for_thread (vnet_main_t *vnm, const af_xdp_device_t *ad, + const u32 thread) +{ + u32 i; + for (i = 0; i < ad->rxq_num; i++) + { + const u32 qid = vec_elt (ad->rxqs, i).queue_index; + const u32 tid = vnet_hw_if_get_rx_queue (vnm, qid)->thread_index; + if (tid == thread) + return i; + } + return ~0; +} + +static clib_error_t * +af_xdp_finalize_queues (vnet_main_t *vnm, af_xdp_device_t *ad, + const int n_vlib_mains) +{ + clib_error_t *err = 0; + int i; + + for (i = 0; i < ad->rxq_num; i++) + { + af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, i); + rxq->queue_index = vnet_hw_if_register_rx_queue ( + vnm, ad->hw_if_index, i, VNET_HW_IF_RXQ_THREAD_ANY); + u8 *desc = format (0, "%U rxq %d", format_af_xdp_device_name, + ad->dev_instance, i); + clib_file_t f = { + .file_descriptor = rxq->xsk_fd, + .private_data = rxq->queue_index, + .read_function = af_xdp_device_rxq_read_ready, + .description = desc, + }; + rxq->file_index = clib_file_add (&file_main, &f); + vnet_hw_if_set_rx_queue_file_index (vnm, rxq->queue_index, + rxq->file_index); + err = af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING); + if (err) + return err; + } + 
+ for (i = 0; i < ad->txq_num; i++) + vec_elt (ad->txqs, i).queue_index = + vnet_hw_if_register_tx_queue (vnm, ad->hw_if_index, i); + + /* We set the rxq and txq of the same queue pair on the same thread + * by default to avoid locking because of the syscall lock. */ + int last_qid = clib_min (ad->rxq_num, ad->txq_num - 1); + for (i = 0; i < n_vlib_mains; i++) + { + /* search for the 1st rxq assigned on this thread, if any */ + u32 qid = af_xdp_find_rxq_for_thread (vnm, ad, i); + /* if this rxq is combined with a txq, use it. Otherwise, we'll + * assign txq in a round-robin fashion. We start from the 1st txq + * not shared with a rxq if possible... */ + qid = qid < ad->txq_num ? qid : (last_qid++ % ad->txq_num); + vnet_hw_if_tx_queue_assign_thread ( + vnm, vec_elt (ad->txqs, qid).queue_index, i); + } + + vnet_hw_if_update_runtime_data (vnm, ad->hw_if_index); + return 0; +} + void af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) { vnet_main_t *vnm = vnet_get_main (); vlib_thread_main_t *tm = vlib_get_thread_main (); + vnet_eth_interface_registration_t eir = {}; af_xdp_main_t *am = &af_xdp_main; af_xdp_device_t *ad; vnet_sw_interface_t *sw; - vnet_hw_interface_t *hw; int rxq_num, txq_num, q_num; - int i; + int ns_fds[2]; + int i, ret; args->rxq_size = args->rxq_size ? args->rxq_size : 2 * VLIB_FRAME_SIZE; args->txq_size = args->txq_size ? args->txq_size : 2 * VLIB_FRAME_SIZE; - rxq_num = args->rxq_num ? args->rxq_num : 1; - txq_num = tm->n_vlib_mains; + args->rxq_num = args->rxq_num ? 
args->rxq_num : 1; if (!args->linux_ifname) { @@ -397,6 +608,26 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) goto err0; } + ret = af_xdp_enter_netns (args->netns, ns_fds); + if (ret) + { + args->rv = ret; + args->error = clib_error_return (0, "enter netns %s failed, ret %d", + args->netns, args->rv); + goto err0; + } + + af_xdp_get_q_count (args->linux_ifname, &rxq_num, &txq_num); + if (args->rxq_num > rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != args->rxq_num) + { + args->rv = VNET_API_ERROR_INVALID_VALUE; + args->error = clib_error_create ("too many rxq requested (%d > %d)", + args->rxq_num, rxq_num); + goto err1; + } + rxq_num = clib_min (rxq_num, args->rxq_num); + txq_num = clib_min (txq_num, tm->n_vlib_mains); + pool_get_zero (am->devices, ad); if (tm->n_vlib_mains > 1 && @@ -406,12 +637,32 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) ad->linux_ifname = (char *) format (0, "%s", args->linux_ifname); vec_validate (ad->linux_ifname, IFNAMSIZ - 1); /* libbpf expects ifname to be at least IFNAMSIZ */ - if (args->prog && af_xdp_load_program (args, ad)) - goto err1; + if (args->netns) + ad->netns = (char *) format (0, "%s%c", args->netns, 0); + + ad->linux_ifindex = if_nametoindex (ad->linux_ifname); + if (!ad->linux_ifindex) + { + args->rv = VNET_API_ERROR_INVALID_VALUE; + args->error = clib_error_return_unix (0, "if_nametoindex(%s) failed", + ad->linux_ifname); + ad->linux_ifindex = ~0; + goto err1; + } + + if (args->prog && + (af_xdp_remove_program (ad) || af_xdp_load_program (args, ad))) + goto err2; q_num = clib_max (rxq_num, txq_num); ad->rxq_num = rxq_num; ad->txq_num = txq_num; + + vec_validate_aligned (ad->umem, q_num - 1, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (ad->xsk, q_num - 1, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (ad->rxqs, q_num - 1, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (ad->txqs, q_num - 1, CLIB_CACHE_LINE_BYTES); + for (i = 0; i < q_num; i++) { if (af_xdp_create_queue (vm, 
args, ad, i)) @@ -423,6 +674,8 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) * requested 'max' * we might create less tx queues than workers but this is ok */ + af_xdp_log (VLIB_LOG_LEVEL_DEBUG, ad, + "create interface failed to create queue qid=%d", i); /* fixup vectors length */ vec_set_len (ad->umem, i); @@ -433,19 +686,14 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) ad->rxq_num = clib_min (i, rxq_num); ad->txq_num = clib_min (i, txq_num); - if (i < rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != rxq_num) + if (i == 0 || + (i < rxq_num && AF_XDP_NUM_RX_QUEUES_ALL != args->rxq_num)) { ad->rxq_num = ad->txq_num = 0; - goto err1; /* failed creating requested rxq: fatal error, bailing + goto err2; /* failed creating requested rxq: fatal error, bailing out */ } - if (i < txq_num) - { - /* we created less txq than threads not an error but initialize lock for shared txq */ - for (i = 0; i < ad->txq_num; i++) - clib_spinlock_init (&vec_elt (ad->txqs, i).lock); - } args->rv = 0; clib_error_free (args->error); @@ -453,6 +701,13 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) } } + if (af_xdp_exit_netns (args->netns, ns_fds)) + { + args->rv = VNET_API_ERROR_SYSCALL_ERROR_10; + args->error = clib_error_return (0, "exit netns failed"); + goto err2; + } + ad->dev_instance = ad - am->devices; ad->per_interface_next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; ad->pool = @@ -460,53 +715,43 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) af_xdp_get_numa (ad->linux_ifname)); if (!args->name) - ad->name = - (char *) format (0, "%s/%d", ad->linux_ifname, ad->dev_instance); + { + char *ifname = ad->linux_ifname; + if (args->netns != NULL && strncmp (args->netns, "pid:", 4) == 0) + { + ad->name = + (char *) format (0, "%s/%u", ifname, atoi (args->netns + 4)); + } + else + ad->name = (char *) format (0, "%s/%d", ifname, ad->dev_instance); + } else ad->name = (char *) format (0, "%s", args->name); 
ethernet_mac_address_generate (ad->hwaddr); /* create interface */ - if (ethernet_register_interface (vnm, af_xdp_device_class.index, - ad->dev_instance, ad->hwaddr, - &ad->hw_if_index, af_xdp_flag_change)) - { - args->rv = VNET_API_ERROR_INVALID_INTERFACE; - args->error = - clib_error_return (0, "ethernet_register_interface() failed"); - goto err1; - } + eir.dev_class_index = af_xdp_device_class.index; + eir.dev_instance = ad->dev_instance; + eir.address = ad->hwaddr; + eir.cb.flag_change = af_xdp_flag_change; + eir.cb.set_max_frame_size = af_xdp_set_max_frame_size; + ad->hw_if_index = vnet_eth_register_interface (vnm, &eir); sw = vnet_get_hw_sw_interface (vnm, ad->hw_if_index); - hw = vnet_get_hw_interface (vnm, ad->hw_if_index); args->sw_if_index = ad->sw_if_index = sw->sw_if_index; - hw->caps |= VNET_HW_INTERFACE_CAP_SUPPORTS_INT_MODE; + + vnet_hw_if_set_caps (vnm, ad->hw_if_index, VNET_HW_IF_CAP_INT_MODE); vnet_hw_if_set_input_node (vnm, ad->hw_if_index, af_xdp_input_node.index); - for (i = 0; i < ad->rxq_num; i++) + args->error = af_xdp_finalize_queues (vnm, ad, tm->n_vlib_mains); + if (args->error) { - af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, i); - rxq->queue_index = vnet_hw_if_register_rx_queue ( - vnm, ad->hw_if_index, i, VNET_HW_IF_RXQ_THREAD_ANY); - u8 *desc = format (0, "%U rxq %d", format_af_xdp_device_name, - ad->dev_instance, i); - clib_file_t f = { - .file_descriptor = rxq->xsk_fd, - .private_data = rxq->queue_index, - .read_function = af_xdp_device_rxq_read_ready, - .description = desc, - }; - rxq->file_index = clib_file_add (&file_main, &f); - vnet_hw_if_set_rx_queue_file_index (vnm, rxq->queue_index, - rxq->file_index); - if (af_xdp_device_set_rxq_mode (ad, rxq, AF_XDP_RXQ_MODE_POLLING)) - goto err1; + args->rv = VNET_API_ERROR_SYSCALL_ERROR_7; + goto err2; } - vnet_hw_if_update_runtime_data (vnm, ad->hw_if_index); - /* buffer template */ vec_validate_aligned (ad->buffer_template, 1, CLIB_CACHE_LINE_BYTES); ad->buffer_template->flags = 
VLIB_BUFFER_TOTAL_LENGTH_VALID; @@ -517,8 +762,10 @@ af_xdp_create_if (vlib_main_t * vm, af_xdp_create_if_args_t * args) return; -err1: +err2: af_xdp_delete_if (vm, ad); +err1: + af_xdp_cleanup_netns (ns_fds); err0: vlib_log_err (am->log_class, "%U", format_clib_error, args->error); } @@ -610,7 +857,6 @@ af_xdp_clear (u32 dev_instance) clib_error_free (ad->error); } -/* *INDENT-OFF* */ VNET_DEVICE_CLASS (af_xdp_device_class) = { .name = "AF_XDP interface", .format_device = format_af_xdp_device, @@ -623,7 +869,6 @@ VNET_DEVICE_CLASS (af_xdp_device_class) = { .mac_addr_change_function = af_xdp_mac_change, .clear_counters = af_xdp_clear, }; -/* *INDENT-ON* */ clib_error_t * af_xdp_init (vlib_main_t * vm) diff --git a/src/plugins/af_xdp/input.c b/src/plugins/af_xdp/input.c index 4f3ac5725a4..9177b3ffc5b 100644 --- a/src/plugins/af_xdp/input.c +++ b/src/plugins/af_xdp/input.c @@ -15,7 +15,6 @@ *------------------------------------------------------------------ */ -#include <poll.h> #include <vlib/vlib.h> #include <vlib/unix/unix.h> #include <vlib/pci/pci.h> @@ -89,8 +88,7 @@ af_xdp_device_input_refill_db (vlib_main_t * vm, if (clib_spinlock_trylock_if_init (&rxq->syscall_lock)) { - struct pollfd fd = { .fd = rxq->xsk_fd, .events = POLLIN | POLLOUT }; - int ret = poll (&fd, 1, 0); + int ret = recvmsg (rxq->xsk_fd, 0, MSG_DONTWAIT); clib_spinlock_unlock_if_init (&rxq->syscall_lock); if (PREDICT_FALSE (ret < 0)) { @@ -198,6 +196,7 @@ af_xdp_device_input_ethernet (vlib_main_t * vm, vlib_node_runtime_t * node, ef = vlib_frame_scalar_args (f); ef->sw_if_index = sw_if_index; ef->hw_if_index = hw_if_index; + vlib_frame_no_append (f); } static_always_inline u32 @@ -297,7 +296,7 @@ af_xdp_device_input_inline (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_copy_template (&bt, ad->buffer_template); next_index = ad->per_interface_next_index; if (PREDICT_FALSE (vnet_device_input_have_features (ad->sw_if_index))) - vnet_feature_start_device_input_x1 (ad->sw_if_index, 
&next_index, &bt); + vnet_feature_start_device_input (ad->sw_if_index, &next_index, &bt); vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next); @@ -353,7 +352,6 @@ af_xdp_device_input_refill (af_xdp_device_t *ad) } #endif /* CLIB_MARCH_VARIANT */ -/* *INDENT-OFF* */ VLIB_REGISTER_NODE (af_xdp_input_node) = { .name = "af_xdp-input", .sibling_of = "device-input", @@ -364,7 +362,6 @@ VLIB_REGISTER_NODE (af_xdp_input_node) = { .error_strings = af_xdp_input_error_strings, .flags = VLIB_NODE_FLAG_TRACE_SUPPORTED, }; -/* *INDENT-ON* */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/plugins/af_xdp/output.c b/src/plugins/af_xdp/output.c index 51a56ed866d..a59c01ca6e0 100644 --- a/src/plugins/af_xdp/output.c +++ b/src/plugins/af_xdp/output.c @@ -1,5 +1,5 @@ -#include <poll.h> #include <string.h> +#include <vppinfra/clib.h> #include <vlib/vlib.h> #include <vlib/unix/unix.h> #include <vnet/ethernet/ethernet.h> @@ -101,11 +101,19 @@ af_xdp_device_output_tx_db (vlib_main_t * vm, if (xsk_ring_prod__needs_wakeup (&txq->tx)) { - struct pollfd fd = { .fd = txq->xsk_fd, .events = POLLIN | POLLOUT }; - int ret = poll (&fd, 1, 0); + const struct msghdr msg = {}; + int ret; + /* On tx, xsk socket will only tx up to TX_BATCH_SIZE, as defined in + * kernel net/xdp/xsk.c. Unfortunately we do not know how much this is, + * our only option is to retry until everything is sent... 
*/ + do + { + ret = sendmsg (txq->xsk_fd, &msg, MSG_DONTWAIT); + } + while (ret < 0 && EAGAIN == errno); if (PREDICT_FALSE (ret < 0)) { - /* something bad is happening */ + /* not EAGAIN: something bad is happening */ vlib_error_count (vm, node->node_index, AF_XDP_TX_ERROR_SYSCALL_FAILURES, 1); af_xdp_device_error (ad, "tx poll() failed"); @@ -147,6 +155,14 @@ wrap_around: while (n >= 8) { + if (PREDICT_FALSE (b[0]->flags & VLIB_BUFFER_NEXT_PRESENT || + b[1]->flags & VLIB_BUFFER_NEXT_PRESENT || + b[2]->flags & VLIB_BUFFER_NEXT_PRESENT || + b[3]->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + break; + } + vlib_prefetch_buffer_header (b[4], LOAD); offset = (sizeof (vlib_buffer_t) + @@ -186,6 +202,17 @@ wrap_around: while (n >= 1) { + if (PREDICT_FALSE (b[0]->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + if (vlib_buffer_chain_linearize (vm, b[0]) != 1) + { + af_xdp_log (VLIB_LOG_LEVEL_ERR, ad, + "vlib_buffer_chain_linearize failed"); + vlib_buffer_free_one (vm, vlib_get_buffer_index (vm, b[0])); + continue; + } + } + offset = (sizeof (vlib_buffer_t) + b[0]->current_data) << XSK_UNALIGNED_BUF_OFFSET_SHIFT; @@ -215,9 +242,9 @@ VNET_DEVICE_CLASS_TX_FN (af_xdp_device_class) (vlib_main_t * vm, af_xdp_main_t *rm = &af_xdp_main; vnet_interface_output_runtime_t *ord = (void *) node->runtime_data; af_xdp_device_t *ad = pool_elt_at_index (rm->devices, ord->dev_instance); - u32 thread_index = vm->thread_index; - af_xdp_txq_t *txq = - vec_elt_at_index (ad->txqs, (thread_index - 1) % ad->txq_num); + const vnet_hw_if_tx_frame_t *tf = vlib_frame_scalar_args (frame); + const int shared_queue = tf->shared_queue; + af_xdp_txq_t *txq = vec_elt_at_index (ad->txqs, tf->queue_id); u32 *from; u32 n, n_tx; int i; @@ -225,20 +252,22 @@ VNET_DEVICE_CLASS_TX_FN (af_xdp_device_class) (vlib_main_t * vm, from = vlib_frame_vector_args (frame); n_tx = frame->n_vectors; - clib_spinlock_lock_if_init (&txq->lock); + if (shared_queue) + clib_spinlock_lock (&txq->lock); for (i = 0, n = 0; i < AF_XDP_TX_RETRIES && n 
< n_tx; i++) { u32 n_enq; af_xdp_device_output_free (vm, node, txq); - n_enq = af_xdp_device_output_tx_try (vm, node, ad, txq, n_tx - n, from); + n_enq = + af_xdp_device_output_tx_try (vm, node, ad, txq, n_tx - n, from + n); n += n_enq; - from += n_enq; } af_xdp_device_output_tx_db (vm, node, ad, txq, n); - clib_spinlock_unlock_if_init (&txq->lock); + if (shared_queue) + clib_spinlock_unlock (&txq->lock); if (PREDICT_FALSE (n != n_tx)) { diff --git a/src/plugins/af_xdp/plugin.c b/src/plugins/af_xdp/plugin.c index 444ee553cbf..7be7afeac83 100644 --- a/src/plugins/af_xdp/plugin.c +++ b/src/plugins/af_xdp/plugin.c @@ -19,12 +19,10 @@ #include <vnet/plugin/plugin.h> #include <vpp/app/version.h> -/* *INDENT-OFF* */ VLIB_PLUGIN_REGISTER () = { .version = VPP_BUILD_VER, .description = "AF_XDP Device Plugin", }; -/* *INDENT-ON* */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/plugins/af_xdp/test_api.c b/src/plugins/af_xdp/test_api.c index 6dffa29bdd1..581697e341d 100644 --- a/src/plugins/af_xdp/test_api.c +++ b/src/plugins/af_xdp/test_api.c @@ -91,6 +91,75 @@ api_af_xdp_create (vat_main_t * vam) return ret; } +/* af_xdp create v2 API */ +static int +api_af_xdp_create_v2 (vat_main_t *vam) +{ + vl_api_af_xdp_create_v2_t *mp; + af_xdp_create_if_args_t args; + int ret; + + if (!unformat_user (vam->input, unformat_af_xdp_create_if_args, &args)) + { + clib_warning ("unknown input `%U'", format_unformat_error, vam->input); + return -99; + } + + M (AF_XDP_CREATE, mp); + + snprintf ((char *) mp->host_if, sizeof (mp->host_if), "%s", + args.linux_ifname ?: ""); + snprintf ((char *) mp->name, sizeof (mp->name), "%s", args.name ?: ""); + snprintf ((char *) mp->namespace, sizeof (mp->namespace), "%s", + args.netns ?: ""); + mp->rxq_num = clib_host_to_net_u16 (args.rxq_num); + mp->rxq_size = clib_host_to_net_u16 (args.rxq_size); + mp->txq_size = clib_host_to_net_u16 (args.txq_size); + mp->mode = api_af_xdp_mode (args.mode); + if (args.flags & 
AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK) + mp->flags |= AF_XDP_API_FLAGS_NO_SYSCALL_LOCK; + snprintf ((char *) mp->prog, sizeof (mp->prog), "%s", args.prog ?: ""); + + S (mp); + W (ret); + + return ret; +} + +/* af_xdp create v2 API */ +static int +api_af_xdp_create_v3 (vat_main_t *vam) +{ + vl_api_af_xdp_create_v3_t *mp; + af_xdp_create_if_args_t args; + int ret; + + if (!unformat_user (vam->input, unformat_af_xdp_create_if_args, &args)) + { + clib_warning ("unknown input `%U'", format_unformat_error, vam->input); + return -99; + } + + M (AF_XDP_CREATE, mp); + + snprintf ((char *) mp->host_if, sizeof (mp->host_if), "%s", + args.linux_ifname ?: ""); + snprintf ((char *) mp->name, sizeof (mp->name), "%s", args.name ?: ""); + snprintf ((char *) mp->netns, sizeof (mp->netns), "%s", args.netns ?: ""); + mp->rxq_num = args.rxq_num; + mp->rxq_size = args.rxq_size; + mp->txq_size = args.txq_size; + mp->mode = api_af_xdp_mode (args.mode); + if (args.flags & AF_XDP_CREATE_FLAGS_NO_SYSCALL_LOCK) + mp->flags |= AF_XDP_API_FLAGS_NO_SYSCALL_LOCK; + snprintf ((char *) mp->prog, sizeof (mp->prog), "%s", args.prog ?: ""); + + S (mp); + W (ret); + + return ret; +} + /* af_xdp-create reply handler */ static void vl_api_af_xdp_create_reply_t_handler (vl_api_af_xdp_create_reply_t * mp) @@ -109,6 +178,42 @@ vl_api_af_xdp_create_reply_t_handler (vl_api_af_xdp_create_reply_t * mp) vam->regenerate_interface_table = 1; } +/* af_xdp-create v2 reply handler */ +static void +vl_api_af_xdp_create_v2_reply_t_handler (vl_api_af_xdp_create_v2_reply_t *mp) +{ + vat_main_t *vam = af_xdp_test_main.vat_main; + i32 retval = ntohl (mp->retval); + + if (retval == 0) + { + fformat (vam->ofp, "created af_xdp with sw_if_index %d\n", + ntohl (mp->sw_if_index)); + } + + vam->retval = retval; + vam->result_ready = 1; + vam->regenerate_interface_table = 1; +} + +/* af_xdp-create v3 reply handler */ +static void +vl_api_af_xdp_create_v3_reply_t_handler (vl_api_af_xdp_create_v2_reply_t *mp) +{ + vat_main_t *vam = 
af_xdp_test_main.vat_main; + i32 retval = mp->retval; + + if (retval == 0) + { + fformat (vam->ofp, "created af_xdp with sw_if_index %d\n", + mp->sw_if_index); + } + + vam->retval = retval; + vam->result_ready = 1; + vam->regenerate_interface_table = 1; +} + /* af_xdp delete API */ static int api_af_xdp_delete (vat_main_t * vam) diff --git a/src/plugins/af_xdp/unformat.c b/src/plugins/af_xdp/unformat.c index bb4c3048d23..8c0482d83ff 100644 --- a/src/plugins/af_xdp/unformat.c +++ b/src/plugins/af_xdp/unformat.c @@ -46,6 +46,8 @@ unformat_af_xdp_create_if_args (unformat_input_t * input, va_list * vargs) ; else if (unformat (line_input, "prog %s", &args->prog)) ; + else if (unformat (line_input, "netns %s", &args->netns)) + ; else if (unformat (line_input, "no-zero-copy")) args->mode = AF_XDP_MODE_COPY; else if (unformat (line_input, "zero-copy")) |