From d84f2ef54a457918f21bb0ee82392274b62611fe Mon Sep 17 00:00:00 2001 From: Chris Luke Date: Wed, 27 Sep 2017 15:09:48 -0400 Subject: General documentation updates - We now have several developer-focused docs, so create an index page for them. - Rework several docs to fit into the index structure. - Experiment with code highlighting; tweak the CSS slightly to make it slightly nicer to look at. Change-Id: I4185a18f84fa0764745ca7a3148276064a3155c6 Signed-off-by: Chris Luke (cherry picked from commit 64ebb5ff1338140d94c7f9ee72138fe84d89de2e) --- README.md | 15 +- doxygen/Makefile | 4 +- doxygen/assets/doxy-vpp.css | 6 + doxygen/dev_doc.md | 12 + doxygen/doxygen.cfg | 3 +- doxygen/test_framework_doc.md | 1 + doxygen/user_doc.md | 2 - extras/libmemif/README.md | 72 ------ extras/libmemif/docs/BuildInstructions.md | 54 ---- extras/libmemif/docs/GettingStarted.md | 215 ---------------- extras/libmemif/docs/buildinstructions_doc.md | 55 ++++ extras/libmemif/docs/gettingstarted_doc.md | 219 ++++++++++++++++ extras/libmemif/examples/ExampleSetup.md | 206 --------------- extras/libmemif/examples/README.md | 16 -- extras/libmemif/examples/example_setup_doc.md | 207 +++++++++++++++ extras/libmemif/examples/examples_doc.md | 18 ++ extras/libmemif/libmemif_doc.md | 74 ++++++ src/plugins/acl/acl-plugin.md | 347 ------------------------- src/plugins/acl/acl_hash_lookup_doc.md | 241 ++++++++++++++++++ src/plugins/acl/acl_multicore_doc.md | 349 ++++++++++++++++++++++++++ src/plugins/acl/hash_lookup.md | 241 ------------------ src/vlibapi/api_doc.md | 77 +++--- 22 files changed, 1234 insertions(+), 1200 deletions(-) create mode 100644 doxygen/dev_doc.md delete mode 100644 extras/libmemif/README.md delete mode 100644 extras/libmemif/docs/BuildInstructions.md delete mode 100644 extras/libmemif/docs/GettingStarted.md create mode 100644 extras/libmemif/docs/buildinstructions_doc.md create mode 100644 extras/libmemif/docs/gettingstarted_doc.md delete mode 100644 extras/libmemif/examples/ExampleSetup.md delete mode 100644 extras/libmemif/examples/README.md create mode 100644 extras/libmemif/examples/example_setup_doc.md create mode 100644 extras/libmemif/examples/examples_doc.md create mode 100644 extras/libmemif/libmemif_doc.md delete mode 100644 src/plugins/acl/acl-plugin.md create mode 100644 src/plugins/acl/acl_hash_lookup_doc.md create mode 100644 src/plugins/acl/acl_multicore_doc.md delete mode 100644 src/plugins/acl/hash_lookup.md diff --git a/README.md b/README.md index 596494b9..090dd35f 100644 --- a/README.md +++ b/README.md @@ -30,10 +30,11 @@ Directory name | Description build-root | Build output directory doxygen | Documentation generator configuration dpdk | DPDK patches and build infrastructure -@ref src | VPP source code +@ref extras/libmemif | Client library for memif +@ref src/examples | VPP example code @ref src/plugins | VPP bundled plugins directory @ref src/svm | Shared virtual memory allocation library - src/tests | Unit tests + src/tests | Standalone tests (not part of test harness) src/vat | VPP API test program @ref src/vlib | VPP application library @ref src/vlibapi | VPP API library @@ -43,9 +44,8 @@ Directory name | Description @ref src/vpp | VPP application @ref src/vpp-api | VPP application API bindings @ref src/vppinfra | VPP core library - test | Unit tests @ref src/vpp/api | Not-yet-relocated API bindings -@ref src/examples | VPP example code + test | Unit tests and Python test harness ## Getting started @@ -95,12 +95,13 @@ for this can be found [on the Setting up Vagrant wiki 
page] ## More information Several modules provide documentation, see @subpage user_doc for more -information. +end-user-oriented information. Also see @subpage dev_doc for developer notes. Visit the [VPP wiki](https://wiki.fd.io/view/VPP) for details on more -advanced building strategies and development notes. +advanced building strategies and other development notes. ## Test Framework -There is PyDoc generated documentation available for the VPP test framework. See @subpage test_framework_doc for details. +There is PyDoc generated documentation available for the VPP test framework. +See @ref test_framework_doc for details. diff --git a/doxygen/Makefile b/doxygen/Makefile index b6ba5887..5870530f 100644 --- a/doxygen/Makefile +++ b/doxygen/Makefile @@ -62,7 +62,8 @@ DOXY_INPUT ?= \ $(wildcard $(WS_ROOT)/*.md) \ $(wildcard $(DOXY_DIR)/*.md) \ $(DOXY_SRC_DIRECTORIES) \ - $(DOXY_SRC)/plugins + $(DOXY_SRC)/plugins \ + extras # Strip leading workspace path from input names DOXY_INPUT := $(subst $(WS_ROOT)/,,$(DOXY_INPUT)) @@ -73,7 +74,6 @@ DOXY_INPUT := $(subst $(WS_ROOT)/,,$(DOXY_INPUT)) # there's a DPDK equivalent that conflicts. # These must be left-anchored paths for the regexp below to work. DOXY_EXCLUDE ?= \ - $(DOXY_SRC)/vlib/buffer.c \ $(DOXY_SRC)/vpp-api/lua # Generate a regexp for filenames to exclude diff --git a/doxygen/assets/doxy-vpp.css b/doxygen/assets/doxy-vpp.css index 3b74eb6f..108600ac 100644 --- a/doxygen/assets/doxy-vpp.css +++ b/doxygen/assets/doxy-vpp.css @@ -45,3 +45,9 @@ pre { a.el { font-family: Consolas, Courier, monospace; } + +div.fragment { + padding: 2px; + margin-left: 8px; +} + diff --git a/doxygen/dev_doc.md b/doxygen/dev_doc.md new file mode 100644 index 00000000..d1d22f76 --- /dev/null +++ b/doxygen/dev_doc.md @@ -0,0 +1,12 @@ +Developer Documentation {#dev_doc} +======================= + +Programming notes for developers. + +- @subpage test_framework_doc +- @subpage sample_plugin_doc +- @subpage api_doc +- @subpage vapi_doc +- @subpage acl_hash_lookup +- @subpage acl_multicore +- @subpage libmemif_doc diff --git a/doxygen/doxygen.cfg b/doxygen/doxygen.cfg index 7675b693..f2d39112 100644 --- a/doxygen/doxygen.cfg +++ b/doxygen/doxygen.cfg @@ -882,7 +882,8 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = $(ROOT)/doxygen/assets +IMAGE_PATH = $(ROOT)/doxygen/assets \ + $(ROOT)/extras/libmemif/docs # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program diff --git a/doxygen/test_framework_doc.md b/doxygen/test_framework_doc.md index b2873c5f..755a29e4 100644 --- a/doxygen/test_framework_doc.md +++ b/doxygen/test_framework_doc.md @@ -3,5 +3,6 @@ Test Framework Documentation {#test_framework_doc} PyDoc generated documentation for the "make test" framework is available for the following releases +- [Test framework documentation for VPP 17.10](https://docs.fd.io/vpp/17.10/vpp_make_test/html) - [Test framework documentation for VPP 17.04](https://docs.fd.io/vpp/17.04/vpp_make_test/html) - [Test framework documentation for VPP 17.01](https://docs.fd.io/vpp/17.01/vpp_make_test/html) diff --git a/doxygen/user_doc.md b/doxygen/user_doc.md index 65b09f98..f7d7936b 100644 --- a/doxygen/user_doc.md +++ b/doxygen/user_doc.md @@ -16,6 +16,4 @@ Several modules provide operational, dataplane-user focused documentation. 
- @subpage span_doc - @subpage srv6_doc - @subpage srmpls_doc -- @subpage sample_plugin_doc - @subpage nat64_doc -- @subpage api_doc diff --git a/extras/libmemif/README.md b/extras/libmemif/README.md deleted file mode 100644 index 0cccfb37..00000000 --- a/extras/libmemif/README.md +++ /dev/null @@ -1,72 +0,0 @@ -Shared Memory Packet Interface (memif) Library -============================================== -## Introduction - -Shared memory packet interface (memif) provides high performance packet transmit and receive between user application and Vector Packet Processing (VPP) or multiple user applications. Using libmemif, user application can create shared memory interface in master or slave mode and connect to VPP or another application using libmemif. Once the connection is established, user application can receive or transmit packets using libmemif API. - -![Architecture](docs/architecture.png) - -## Features - -- [x] Slave mode - - [x] Connect to VPP over memif - - [x] ICMP responder example app -- [x] Transmit/receive packets -- [x] Interrupt mode support -- [x] File descriptor event polling in libmemif (optional) - - [x] Simplify file descriptor event polling (one handler for control and interrupt channel) -- [x] Multiple connections -- [x] Multiple queues - - [x] Multi-thread support -- [x] Master mode - - [ ] Multiple regions (TODO) -- [ ] Performance testing (TODO) - -## Quickstart - -This setup will run libmemif ICMP responder example app in container. Install [docker](https://docs.docker.com/engine/installation) engine. -Useful link: [Docker documentation](https://docs.docker.com/get-started). - -Pull image: -``` -# docker pull ligato/libmemif-sample-service -``` - -Now you should be able to see ligato/libmemif-sample-service image on your local machine (IMAGE ID in this README may be outdated): -``` -# docker images -REPOSITORY TAG IMAGE ID CREATED SIZE -ligato/libmemif-sample-service latest 32ecc2f9d013 About a minute ago 468MB -... -``` - -Run container: -``` -# docker run -it --rm --name icmp-responder --hostname icmp-responder --privileged -v "/run/vpp/:/run/vpp/" ligato/libmemif-sample-service -``` -Example application will start in debug mode. Output should look like this: -``` -ICMP_Responder:add_epoll_fd:204: fd 0 added to epoll -MEMIF_DEBUG:src/main.c:memif_init:383: app name: ICMP_Responder -ICMP_Responder:add_epoll_fd:204: fd 4 added to epoll -LIBMEMIF EXAMPLE APP: ICMP_Responder (debug) -============================== -libmemif version: 1.0 (debug) -memif version: 256 -commands: - help - prints this help - exit - exit app - conn - create memif (slave-mode) - del - delete memif - show - show connection details - ip-set - set interface ip address - rx-mode - set queue rx mode -``` - -Continue with [Example setup](examples/ExampleSetup.md) which contains instructions on how to set up conenction between icmpr-epoll example app and VPP-memif. - -#### Next steps - -- [Build instructions](docs/BuildInstructions.md) Instructions on how to build/install libmemif. -- [Examples](examples/README.md) More example apps presenting different features. -- [Getting started](docs/GettingStarted.md) Introduction to libmemif API. Explaining library usage in custom app. 
diff --git a/extras/libmemif/docs/BuildInstructions.md b/extras/libmemif/docs/BuildInstructions.md deleted file mode 100644 index 9bb59b94..00000000 --- a/extras/libmemif/docs/BuildInstructions.md +++ /dev/null @@ -1,54 +0,0 @@ -## Build Instructions - -Install dependencies -``` -# sudo apt-get install -y git autoconf pkg_config libtool check -``` - -Clone repository to your local machine. -``` -# git clone https://github.com/JakubGrajciar/libmemif.git -``` - -From root directory execute: -For debug build: -``` -# ./bootstrap -# ./configure -# make -# make install -``` - -For release build: -``` -# ./bootstrap -# ./configure -# make release -# make install -``` -Verify installation: -``` -# ./.libs/icmpr-epoll -``` -> Make sure to run the binary file from ./.libs. File ./icmp\_responder in libmemif root directory is script that links the library, so it only verifies successful build. Default install path is /usr/lib. -Use _help_ command to display build information and commands: -``` -ICMP_Responder:add_epoll_fd:204: fd 0 added to epoll -MEMIF_DEBUG:src/main.c:memif_init:383: app name: ICMP_Responder -ICMP_Responder:add_epoll_fd:204: fd 4 added to epoll -LIBMEMIF EXAMPLE APP: ICMP_Responder (debug) -============================== -libmemif version: 1.0 (debug) -memif version: 256 -commands: - help - prints this help - exit - exit app - conn - create memif (slave-mode) - del - delete memif - show - show connection details - ip-set - set interface ip address - rx-mode - set queue rx mode -``` -#### Examples - -Once the library is build/installed, refer to [Examples](../examples/README.md) and [Getting started](GettingStarted.md) for additional information on basic use cases and API usage. diff --git a/extras/libmemif/docs/GettingStarted.md b/extras/libmemif/docs/GettingStarted.md deleted file mode 100644 index 3cc0ceb0..00000000 --- a/extras/libmemif/docs/GettingStarted.md +++ /dev/null @@ -1,215 +0,0 @@ -## Getting started - -#### Concept (Connecting to VPP) - -For detailed information on api calls and structures please refer to [libmemif.h](../src/libmemif.h) - -1. Initialize memif - - Declare callback function handling file descriptor event polling. -```C -int -control_fd_update (int fd, uint8_t events) -{ -... -} -``` - - Call memif initialization function. memif\_init -```C -err = memif_init (control_fd_update, APP_NAME); -``` - -> If event occurres on any file descriptor returned by this callback, call memif\_control\_fd\_handler function. -```C -memif_err = memif_control_fd_handler (evt.data.fd, events); -``` -> If callback function parameter for memif\_init function is set to NULL, libmemif will handle file descriptor event polling. - Api call memif\_poll\_event will call epoll\_pwait with user defined timeout to poll event on file descriptors opened by libmemif. -```C -/* main loop */ - while (1) - { - if (memif_poll_event (-1) < 0) - { - DBG ("poll_event error!"); - } - } -``` - -> Memif initialization function will initialize internal structures and create timer file descriptor, which will be used for sending periodic connection requests. Timer is disarmed if no memif interface is created. - -2. Creating interface - - Declare memif connction handle. -```C -memif_conn_handle_t c; -``` -> example app uses struct that contains connection handle, rx/tx buffers and other connection specific information. - - - Specify connection arguments. 
-```C -memif_conn_args_t args; -memset (&args, 0, sizeof (args)); -args.is_master = is_master; -args.log2_ring_size = 10; -args.buffer_size = 2048; -args.num_s2m_rings = 2; -args.num_m2s_rings = 2; -strncpy ((char *) args.interface_name, IF_NAME, strlen (IF_NAME)); -strncpy ((char *) args.instance_name, APP_NAME, strlen (APP_NAME)); -args.mode = 0; -args.interface_id = 0; -``` - - Declare callback functions called on connected/disconnected/interrupted status changed. -```C -int -on_connect (memif_conn_handle_t conn, void *private_ctx) -{ -... -} - -int -on_disconnect (memif_conn_handle_t conn, void *private_ctx) -{ - INFO ("memif connected!"); - return 0; -} -``` - - Call memif interface create function. memif\_create -```C -err = memif_create (&c->conn, - &args, on_connect, on_disconnect, on_interrupt, &ctx[index]); -``` -> If connection is in slave mode, arms timer file descriptor. -> If on interrupt callback is set to NULL, user will not be notified about interrupt. Use memif\_get\_queue\_efd call to get interrupt file descriptor for specific queue. -```C -int fd = -1; -err = memif_get_queue_efd (c->conn, data->qid, &fd); -``` - -3. Connection establishment - - User application will poll events on all file descriptors returned in memif\_control\_fd\_update\_t callback. - - On event call memif\_control\_fd\_handler. - - Everything else regarding connection establishment will be done internally. - - Once connection has been established, a callback will inform the user about connection status change. - -4. Interrupt packet receive - - If event is polled on interrupt file descriptor, libmemif will call memif\_interrupt\_t callback specified for every connection instance. -```C -int -on_interrupt (memif_conn_handle_t conn, void *private_ctx, uint16_t qid) -{ -... -} -``` - -6. Memif buffers - - Packet data are stored in memif\_buffer\_t. Pointer _data_ points to shared memory buffer, and unsigned integer *data\_len* contains packet data length. -```C -typedef struct -{ - uint16_t desc_index; - uint32_t buffer_len; - uint32_t data_len; - void *data; -} memif_buffer_t; -``` - -5. Packet receive - - Api call memif\_rx\_burst will set all required fields in memif buffers provided by user application. -```C -err = memif_rx_burst (c->conn, qid, c->rx_bufs, MAX_MEMIF_BUFS, &rx); -``` - - User application can then process packets. - - Api call memif\_buffer\_free will make supplied memif buffers ready for next receive and mark shared memory buffers as free. -```C -err = memif_buffer_free (c->conn, qid, c->rx_bufs, rx, &fb); -``` - -6. Packet transmit - - Api call memif\_buffer\_alloc will set all required fields in memif buffers provided by user application. -```C -err = memif_buffer_alloc (c->conn, qid, c->tx_bufs, n, &r); -``` - - User application can populate shared memory buffers with packets. - - Api call memif\_tx\_burst will inform peer interface (master memif on VPP) that there are packets ready to receive and mark memif buffers as free. -```C -err = memif_tx_burst (c->conn, qid, c->tx_bufs, c->tx_buf_num, &r); -``` - -7. Helper functions - - Memif details - - Api call memif\_get\_details will return details about connection. -```C -err = memif_get_details (c->conn, &md, buf, buflen); -``` - - Memif error messages - - Every api call returns error code (integer value) mapped to error string. - - Call memif\_strerror will return error message assigned to specific error code. 
-```C -if (err != MEMIF_ERR_SUCCESS) - INFO ("memif_get_details: %s", memif_strerror (err)); -``` - - Not all syscall errors are translated to memif error codes. If error code 1 (MEMIF\_ERR\_SYSCALL) is returned then libmemif needs to be compiled with -DMEMIF_DBG flag to print error message. Use _make -B_ to rebuild libmemif in debug mode. - -#### Example app (libmemif fd event polling): - -- [ICMP Responder](../examples/icmp_responder/main.c) -> Optional argument: transmit queue id. -``` -icmpr 1 -``` -> Set transmit queue id to 1. Default is 0. -> Application will create memif interface in slave mode and try to connect to VPP. Exit using Ctrl+C. Application will handle SIGINT signal, free allocated memory and exit with EXIT_SUCCESS. - -#### Example app: - -- [ICMP Responder custom fd event polling](../examples/icmp_responder-epoll/main.c) - -#### Example app (multi-thread queue polling) - -- [ICMP Responder multi-thread](../examples/icmp_responder-mt/main.c) - -> Simple example of libmemif multi-thread usage. Connection establishment is handled by main thread. There are two rx queues in this example. One in polling mode and second in interrupt mode. - -VPP config: -``` -# create memif id 0 master -# set int state memif0 up -# set int ip address memif0 192.168.1.1/24 -# ping 192.168.1.2 -``` -For multiple rings (queues) support run VPP with worker threads: -example startup.conf: -``` -unix { - interactive - nodaemon - full-coredump -} - -cpu { - workers 2 -} -``` -VPP config: -``` -# create memif id 0 master -# set int state memif0 up -# set int ip address memif0 192.168.1.1/24 -# ping 192.168.1.2 -``` -> Master mode queue number is limited by worker threads. Slave mode interface needs to specify number of queues. -``` -# create memif id 0 slave rx-queues 2 tx-queues 2 -``` -> Example applications use VPP default socket file for memif: /run/vpp/memif.sock -> For master mode, socket directory must exist prior to memif\_create call. - -#### Unit tests - -Unit tests use [Check](https://libcheck.github.io/check/index.html) framework. This framework must be installed in order to build *unit\_test* binary. -Ubuntu/Debian: -``` -sudo apt-get install check -``` -[More platforms](https://libcheck.github.io/check/web/install.html) - diff --git a/extras/libmemif/docs/buildinstructions_doc.md b/extras/libmemif/docs/buildinstructions_doc.md new file mode 100644 index 00000000..838e835a --- /dev/null +++ b/extras/libmemif/docs/buildinstructions_doc.md @@ -0,0 +1,55 @@ +## Build Instructions {#libmemif_build_doc} + +Install dependencies +``` +# sudo apt-get install -y git autoconf pkg_config libtool check +``` + +Clone repository to your local machine. +``` +# git clone https://github.com/JakubGrajciar/libmemif.git +``` + +From root directory execute: +For debug build: +``` +# ./bootstrap +# ./configure +# make +# make install +``` + +For release build: +``` +# ./bootstrap +# ./configure +# make release +# make install +``` +Verify installation: +``` +# ./.libs/icmpr-epoll +``` +> Make sure to run the binary file from ./.libs. File ./icmp\_responder in libmemif root directory is script that links the library, so it only verifies successful build. Default install path is /usr/lib. 
+Use _help_ command to display build information and commands: +``` +ICMP_Responder:add_epoll_fd:204: fd 0 added to epoll +MEMIF_DEBUG:src/main.c:memif_init:383: app name: ICMP_Responder +ICMP_Responder:add_epoll_fd:204: fd 4 added to epoll +LIBMEMIF EXAMPLE APP: ICMP_Responder (debug) +============================== +libmemif version: 1.0 (debug) +memif version: 256 +commands: + help - prints this help + exit - exit app + conn - create memif (slave-mode) + del - delete memif + show - show connection details + ip-set - set interface ip address + rx-mode - set queue rx mode +``` + +#### Examples + +Once the library is built/installed, refer to @ref libmemif_examples_doc and @ref libmemif_gettingstarted_doc for additional information on basic use cases and API usage. diff --git a/extras/libmemif/docs/gettingstarted_doc.md b/extras/libmemif/docs/gettingstarted_doc.md new file mode 100644 index 00000000..e3ae6e56 --- /dev/null +++ b/extras/libmemif/docs/gettingstarted_doc.md @@ -0,0 +1,219 @@ +## Getting started {#libmemif_gettingstarted_doc} + +#### Concept (Connecting to VPP) + +For detailed information on api calls and structures please refer to @ref libmemif.h. + +1. Initialize memif + - Declare callback function handling file descriptor event polling. +```C +int +control_fd_update (int fd, uint8_t events) +{ +... +} +``` + - Call memif initialization function. memif\_init +```C +err = memif_init (control_fd_update, APP_NAME); +``` + +> If event occurres on any file descriptor returned by this callback, call memif\_control\_fd\_handler function. +```C +memif_err = memif_control_fd_handler (evt.data.fd, events); +``` +> If callback function parameter for memif\_init function is set to NULL, libmemif will handle file descriptor event polling. + Api call memif\_poll\_event will call epoll\_pwait with user defined timeout to poll event on file descriptors opened by libmemif. +```C +/* main loop */ + while (1) + { + if (memif_poll_event (-1) < 0) + { + DBG ("poll_event error!"); + } + } +``` + +> Memif initialization function will initialize internal structures and create timer file descriptor, which will be used for sending periodic connection requests. Timer is disarmed if no memif interface is created. + +2. Creating interface + - Declare memif connction handle. +```C +memif_conn_handle_t c; +``` +> example app uses struct that contains connection handle, rx/tx buffers and other connection specific information. + + - Specify connection arguments. +```C +memif_conn_args_t args; +memset (&args, 0, sizeof (args)); +args.is_master = is_master; +args.log2_ring_size = 10; +args.buffer_size = 2048; +args.num_s2m_rings = 2; +args.num_m2s_rings = 2; +strncpy ((char *) args.interface_name, IF_NAME, strlen (IF_NAME)); +strncpy ((char *) args.instance_name, APP_NAME, strlen (APP_NAME)); +args.mode = 0; +args.interface_id = 0; +``` + - Declare callback functions called on connected/disconnected/interrupted status changed. +```C +int +on_connect (memif_conn_handle_t conn, void *private_ctx) +{ +... +} + +int +on_disconnect (memif_conn_handle_t conn, void *private_ctx) +{ + INFO ("memif connected!"); + return 0; +} +``` + - Call memif interface create function. memif\_create +```C +err = memif_create (&c->conn, + &args, on_connect, on_disconnect, on_interrupt, &ctx[index]); +``` +> If connection is in slave mode, arms timer file descriptor. +> If on interrupt callback is set to NULL, user will not be notified about interrupt. 
Use memif\_get\_queue\_efd call to get interrupt file descriptor for specific queue. +```C +int fd = -1; +err = memif_get_queue_efd (c->conn, data->qid, &fd); +``` + +3. Connection establishment + - User application will poll events on all file descriptors returned in memif\_control\_fd\_update\_t callback. + - On event call memif\_control\_fd\_handler. + - Everything else regarding connection establishment will be done internally. + - Once connection has been established, a callback will inform the user about connection status change. + +4. Interrupt packet receive + - If event is polled on interrupt file descriptor, libmemif will call memif\_interrupt\_t callback specified for every connection instance. +```C +int +on_interrupt (memif_conn_handle_t conn, void *private_ctx, uint16_t qid) +{ +... +} +``` + +6. Memif buffers + - Packet data are stored in memif\_buffer\_t. Pointer _data_ points to shared memory buffer, and unsigned integer *data\_len* contains packet data length. +```C +typedef struct +{ + uint16_t desc_index; + uint32_t buffer_len; + uint32_t data_len; + void *data; +} memif_buffer_t; +``` + +5. Packet receive + - Api call memif\_rx\_burst will set all required fields in memif buffers provided by user application. +```C +err = memif_rx_burst (c->conn, qid, c->rx_bufs, MAX_MEMIF_BUFS, &rx); +``` + - User application can then process packets. + - Api call memif\_buffer\_free will make supplied memif buffers ready for next receive and mark shared memory buffers as free. +```C +err = memif_buffer_free (c->conn, qid, c->rx_bufs, rx, &fb); +``` + +6. Packet transmit + - Api call memif\_buffer\_alloc will set all required fields in memif buffers provided by user application. +```C +err = memif_buffer_alloc (c->conn, qid, c->tx_bufs, n, &r); +``` + - User application can populate shared memory buffers with packets. + - Api call memif\_tx\_burst will inform peer interface (master memif on VPP) that there are packets ready to receive and mark memif buffers as free. +```C +err = memif_tx_burst (c->conn, qid, c->tx_bufs, c->tx_buf_num, &r); +``` + +7. Helper functions + - Memif details + - Api call memif\_get\_details will return details about connection. +```C +err = memif_get_details (c->conn, &md, buf, buflen); +``` + - Memif error messages + - Every api call returns error code (integer value) mapped to error string. + - Call memif\_strerror will return error message assigned to specific error code. +```C +if (err != MEMIF_ERR_SUCCESS) + INFO ("memif_get_details: %s", memif_strerror (err)); +``` + - Not all syscall errors are translated to memif error codes. If error code 1 (MEMIF\_ERR\_SYSCALL) is returned then libmemif needs to be compiled with -DMEMIF_DBG flag to print error message. Use _make -B_ to rebuild libmemif in debug mode. + +#### Example app (libmemif fd event polling): + +- @ref extras/libmemif/examples/icmp_responder + +> Optional argument: transmit queue id. +``` +icmpr 1 +``` +> Set transmit queue id to 1. Default is 0. +> Application will create memif interface in slave mode and try to connect to VPP. Exit using Ctrl+C. Application will handle SIGINT signal, free allocated memory and exit with EXIT_SUCCESS. + +#### Example app: + +ICMP Responder custom fd event polling. + +- @ref extras/libmemif/examples/icmp_responder-epoll + +#### Example app (multi-thread queue polling) + +ICMP Responder multi-thread. +- @ref extras/libmemif/examples/icmp_responder-mt + +> Simple example of libmemif multi-thread usage. Connection establishment is handled by main thread. 
There are two rx queues in this example. One in polling mode and second in interrupt mode. + +VPP config: +``` +# create memif id 0 master +# set int state memif0 up +# set int ip address memif0 192.168.1.1/24 +# ping 192.168.1.2 +``` +For multiple rings (queues) support run VPP with worker threads: +example startup.conf: +``` +unix { + interactive + nodaemon + full-coredump +} + +cpu { + workers 2 +} +``` +VPP config: +``` +# create memif id 0 master +# set int state memif0 up +# set int ip address memif0 192.168.1.1/24 +# ping 192.168.1.2 +``` +> Master mode queue number is limited by worker threads. Slave mode interface needs to specify number of queues. +``` +# create memif id 0 slave rx-queues 2 tx-queues 2 +``` +> Example applications use VPP default socket file for memif: /run/vpp/memif.sock +> For master mode, socket directory must exist prior to memif\_create call. + +#### Unit tests + +Unit tests use [Check](https://libcheck.github.io/check/index.html) framework. This framework must be installed in order to build *unit\_test* binary. +Ubuntu/Debian: +``` +sudo apt-get install check +``` +[More platforms](https://libcheck.github.io/check/web/install.html) + diff --git a/extras/libmemif/examples/ExampleSetup.md b/extras/libmemif/examples/ExampleSetup.md deleted file mode 100644 index a02506dd..00000000 --- a/extras/libmemif/examples/ExampleSetup.md +++ /dev/null @@ -1,206 +0,0 @@ -## Example setup - -#### VPP-memif master icmp_responder slave - -> Libmemif example app(s) use memif default socket file: /run/vpp/memif.sock. - -Run VPP and icmpr-epoll example (default example when running in container). -> Other examples work similar to icmpr-epoll. Brief explanation can be found in [Examples readme](README.md) file. - -VPP-side config: -``` -DBGvpp# create memif id 0 master -DBGvpp# set int state memif0/0 up -DBGvpp# set int ip address memif0/0 192.168.1.1/24 -``` -icmpr-epoll: -``` -conn 0 0 -``` -Memif in slave mode will try to connect every 2 seconds. If connection establishment is successfull, a message will show. -``` -INFO: memif connected! -``` -> Error messages like "unmatched interface id" are printed only in debug mode. - -Check connected status. 
-Use show command in icmpr-epoll: -``` -show -MEMIF DETAILS -============================== -interface index: 0 - interface ip: 192.168.1.2 - interface name: memif_connection - app name: ICMP_Responder - remote interface name: memif0/0 - remote app name: VPP 17.10-rc0~132-g62f9cdd - id: 0 - secret: - role: slave - mode: ethernet - socket filename: /run/vpp/memif.sock - rx queues: - queue id: 0 - ring size: 1024 - buffer size: 2048 - tx queues: - queue id: 0 - ring size: 1024 - buffer size: 2048 - link: up -interface index: 1 - no connection - -``` -Use sh memif command in VPP: -``` -DBGvpp# sh memif -interface memif0/0 - remote-name "ICMP_Responder" - remote-interface "memif_connection" - id 0 mode ethernet file /run/vpp/memif.sock - flags admin-up connected - listener-fd 12 conn-fd 13 - num-s2m-rings 1 num-m2s-rings 1 buffer-size 0 - master-to-slave ring 0: - region 0 offset 32896 ring-size 1024 int-fd 16 - head 0 tail 0 flags 0x0000 interrupts 0 - master-to-slave ring 0: - region 0 offset 0 ring-size 1024 int-fd 15 - head 0 tail 0 flags 0x0001 interrupts 0 -``` - -Send ping from VPP to icmpr-epoll: -``` -DBGvpp# ping 192.168.1.2 -64 bytes from 192.168.1.2: icmp_seq=2 ttl=64 time=.1888 ms -64 bytes from 192.168.1.2: icmp_seq=3 ttl=64 time=.1985 ms -64 bytes from 192.168.1.2: icmp_seq=4 ttl=64 time=.1813 ms -64 bytes from 192.168.1.2: icmp_seq=5 ttl=64 time=.1929 ms - -Statistics: 5 sent, 4 received, 20% packet loss -``` -#### multiple queues VPP-memif slave icmp_responder master - -Run icmpr-epoll as in previous example setup. -Run VPP with startup conf, enabling 2 worker threads. -Example startup.conf: -``` -unix { - interactive - nodaemon - full-coredump -} - -cpu { - workers 2 -} -``` -VPP-side config: -``` -DBGvpp# create memif id 0 slave rx-queues 2 tx-queues 2 -DBGvpp# set int state memif0/0 up -DBGvpp# set int ip address memif0/0 192.168.1.1/24 -``` -icmpr-epoll: -``` -conn 0 1 -``` -When connection is established a message will print: -``` -INFO: memif connected! -``` -> Error messages like "unmatched interface id" are printed only in debug mode. - -Check connected status. 
-Use show command in icmpr-epoll: -``` -show -MEMIF DETAILS -============================== -interface index: 0 - interface ip: 192.168.1.2 - interface name: memif_connection - app name: ICMP_Responder - remote interface name: memif0/0 - remote app name: VPP 17.10-rc0~132-g62f9cdd - id: 0 - secret: - role: master - mode: ethernet - socket filename: /run/vpp/memif.sock - rx queues: - queue id: 0 - ring size: 1024 - buffer size: 2048 - queue id: 1 - ring size: 1024 - buffer size: 2048 - tx queues: - queue id: 0 - ring size: 1024 - buffer size: 2048 - queue id: 1 - ring size: 1024 - buffer size: 2048 - link: up -interface index: 1 - no connection - -``` -Use sh memif command in VPP: -``` -DBGvpp# sh memif -interface memif0/0 - remote-name "ICMP_Responder" - remote-interface "memif_connection" - id 0 mode ethernet file /run/vpp/memif.sock - flags admin-up slave connected - listener-fd -1 conn-fd 12 - num-s2m-rings 2 num-m2s-rings 2 buffer-size 2048 - slave-to-master ring 0: - region 0 offset 0 ring-size 1024 int-fd 14 - head 0 tail 0 flags 0x0000 interrupts 0 - slave-to-master ring 1: - region 0 offset 32896 ring-size 1024 int-fd 15 - head 0 tail 0 flags 0x0000 interrupts 0 - slave-to-master ring 0: - region 0 offset 65792 ring-size 1024 int-fd 16 - head 0 tail 0 flags 0x0001 interrupts 0 - slave-to-master ring 1: - region 0 offset 98688 ring-size 1024 int-fd 17 - head 0 tail 0 flags 0x0001 interrupts 0 - -``` -Send ping from VPP to icmpr-epoll: -``` -DBGvpp# ping 192.168.1.2 -64 bytes from 192.168.1.2: icmp_seq=2 ttl=64 time=.1439 ms -64 bytes from 192.168.1.2: icmp_seq=3 ttl=64 time=.2184 ms -64 bytes from 192.168.1.2: icmp_seq=4 ttl=64 time=.1458 ms -64 bytes from 192.168.1.2: icmp_seq=5 ttl=64 time=.1687 ms - -Statistics: 5 sent, 4 received, 20% packet loss -``` - -#### icmp_responder master icmp_responder slave - -> Example apps can only repond to ping. This setup creates connection between two applications using libmemif. Traffic functionality is the same as when connection to VPP. App can receive ARP/ICMP request and transmit response, but can not send ARP/ICMP request. - -Run two instances of icmpr-epoll example. -> If not running in container, make sure folder /run/vpp/ exists before creating memif master. -Instance 1 will be in master mode, instance 2 in slave mode. -instance 1: -``` -conn 0 1 -``` -instance 2: -``` -conn 0 0 -``` -In 2 seconds, both instances should print connected! message: -``` -INFO: memif connected! -``` -Check peer interface names using show command. diff --git a/extras/libmemif/examples/README.md b/extras/libmemif/examples/README.md deleted file mode 100644 index 1375d27a..00000000 --- a/extras/libmemif/examples/README.md +++ /dev/null @@ -1,16 +0,0 @@ -## Examples - -After build, root folder will contain scripts linking binary examples with library (same name as example apps). These scripts can be executed to run example apps without installing the library. Example apps binaries can be found in _libs_ filder. To run binaries directly, make sure that libmemif library is installed. - -#### Run in container -ligato/libmemif-sample-service image contains built and installed libmemf. To run different examples, override docker CMD to start container in bash: -``` -# docker run -it --entrypoint=/bin/bash -i --rm --name icmp-responder --hostname icmp-responder --privileged -v "/run/vpp/:/run/vpp/" ligato/libmemif-sample-service -``` -Current WORKDIR is set to root repository directory. 
Example apps can be run from this directory (a script linking binary with library), or browse to ./.libs folder and execute binary directly. - -Example app | Description -------------|------------ -[icmpr](../examples/icmp_responder/main.c) | Simplest implementaion. Event polling is handled by libmemif. Single memif conenction in slave mode is created (id 0). Use Ctrl + C to exit app. Memif receive mode: interrupt. -[icmpr-epoll](../examples/icmp_responder-epoll/main.c) (run in container by default) | Supports multiple connections and master mode. User can create/delete connections, set ip addresses, print connection information. [Example setup](ExampleSetup.md) contains instructions on basic connection use cases setups. Memif receive mode: interrupt. App provides functionality to disable interrupts for specified queue/s for testing purposes. Polling mode is not implemented in this example. -[icmpr-mt](../examples/icmp_responder-mt/main.c) | Multi-thread example, very similar to icmpr-epoll. Packets are handled in threads assigned to specific queues. Slave mode only. Memif receive mode: polling (memif_rx_poll function), interrupt (memif_rx_interrupt function). Receive modes differ per queue. diff --git a/extras/libmemif/examples/example_setup_doc.md b/extras/libmemif/examples/example_setup_doc.md new file mode 100644 index 00000000..89a0b10e --- /dev/null +++ b/extras/libmemif/examples/example_setup_doc.md @@ -0,0 +1,207 @@ +## Example setup {#libmemif_example_setup_doc} + +#### VPP-memif master icmp_responder slave + +> Libmemif example app(s) use memif default socket file: `/run/vpp/memif.sock`. + +Run VPP and icmpr-epoll example (default example when running in container). + +> Other examples work similar to icmpr-epoll. Brief explanation can be found in @ref libmemif_examples_doc . + +VPP-side config: +``` +DBGvpp# create memif id 0 master +DBGvpp# set int state memif0/0 up +DBGvpp# set int ip address memif0/0 192.168.1.1/24 +``` +icmpr-epoll: +``` +conn 0 0 +``` +Memif in slave mode will try to connect every 2 seconds. If connection establishment is successfull, a message will show. +``` +INFO: memif connected! +``` +> Error messages like "unmatched interface id" are printed only in debug mode. + +Check connected status. 
+Use show command in icmpr-epoll: +``` +show +MEMIF DETAILS +============================== +interface index: 0 + interface ip: 192.168.1.2 + interface name: memif_connection + app name: ICMP_Responder + remote interface name: memif0/0 + remote app name: VPP 17.10-rc0~132-g62f9cdd + id: 0 + secret: + role: slave + mode: ethernet + socket filename: /run/vpp/memif.sock + rx queues: + queue id: 0 + ring size: 1024 + buffer size: 2048 + tx queues: + queue id: 0 + ring size: 1024 + buffer size: 2048 + link: up +interface index: 1 + no connection + +``` +Use sh memif command in VPP: +``` +DBGvpp# sh memif +interface memif0/0 + remote-name "ICMP_Responder" + remote-interface "memif_connection" + id 0 mode ethernet file /run/vpp/memif.sock + flags admin-up connected + listener-fd 12 conn-fd 13 + num-s2m-rings 1 num-m2s-rings 1 buffer-size 0 + master-to-slave ring 0: + region 0 offset 32896 ring-size 1024 int-fd 16 + head 0 tail 0 flags 0x0000 interrupts 0 + master-to-slave ring 0: + region 0 offset 0 ring-size 1024 int-fd 15 + head 0 tail 0 flags 0x0001 interrupts 0 +``` + +Send ping from VPP to icmpr-epoll: +``` +DBGvpp# ping 192.168.1.2 +64 bytes from 192.168.1.2: icmp_seq=2 ttl=64 time=.1888 ms +64 bytes from 192.168.1.2: icmp_seq=3 ttl=64 time=.1985 ms +64 bytes from 192.168.1.2: icmp_seq=4 ttl=64 time=.1813 ms +64 bytes from 192.168.1.2: icmp_seq=5 ttl=64 time=.1929 ms + +Statistics: 5 sent, 4 received, 20% packet loss +``` +#### multiple queues VPP-memif slave icmp_responder master + +Run icmpr-epoll as in previous example setup. +Run VPP with startup conf, enabling 2 worker threads. +Example startup.conf: +``` +unix { + interactive + nodaemon + full-coredump +} + +cpu { + workers 2 +} +``` +VPP-side config: +``` +DBGvpp# create memif id 0 slave rx-queues 2 tx-queues 2 +DBGvpp# set int state memif0/0 up +DBGvpp# set int ip address memif0/0 192.168.1.1/24 +``` +icmpr-epoll: +``` +conn 0 1 +``` +When connection is established a message will print: +``` +INFO: memif connected! +``` +> Error messages like "unmatched interface id" are printed only in debug mode. + +Check connected status. 
+Use show command in icmpr-epoll: +``` +show +MEMIF DETAILS +============================== +interface index: 0 + interface ip: 192.168.1.2 + interface name: memif_connection + app name: ICMP_Responder + remote interface name: memif0/0 + remote app name: VPP 17.10-rc0~132-g62f9cdd + id: 0 + secret: + role: master + mode: ethernet + socket filename: /run/vpp/memif.sock + rx queues: + queue id: 0 + ring size: 1024 + buffer size: 2048 + queue id: 1 + ring size: 1024 + buffer size: 2048 + tx queues: + queue id: 0 + ring size: 1024 + buffer size: 2048 + queue id: 1 + ring size: 1024 + buffer size: 2048 + link: up +interface index: 1 + no connection + +``` +Use sh memif command in VPP: +``` +DBGvpp# sh memif +interface memif0/0 + remote-name "ICMP_Responder" + remote-interface "memif_connection" + id 0 mode ethernet file /run/vpp/memif.sock + flags admin-up slave connected + listener-fd -1 conn-fd 12 + num-s2m-rings 2 num-m2s-rings 2 buffer-size 2048 + slave-to-master ring 0: + region 0 offset 0 ring-size 1024 int-fd 14 + head 0 tail 0 flags 0x0000 interrupts 0 + slave-to-master ring 1: + region 0 offset 32896 ring-size 1024 int-fd 15 + head 0 tail 0 flags 0x0000 interrupts 0 + slave-to-master ring 0: + region 0 offset 65792 ring-size 1024 int-fd 16 + head 0 tail 0 flags 0x0001 interrupts 0 + slave-to-master ring 1: + region 0 offset 98688 ring-size 1024 int-fd 17 + head 0 tail 0 flags 0x0001 interrupts 0 + +``` +Send ping from VPP to icmpr-epoll: +``` +DBGvpp# ping 192.168.1.2 +64 bytes from 192.168.1.2: icmp_seq=2 ttl=64 time=.1439 ms +64 bytes from 192.168.1.2: icmp_seq=3 ttl=64 time=.2184 ms +64 bytes from 192.168.1.2: icmp_seq=4 ttl=64 time=.1458 ms +64 bytes from 192.168.1.2: icmp_seq=5 ttl=64 time=.1687 ms + +Statistics: 5 sent, 4 received, 20% packet loss +``` + +#### icmp_responder master icmp_responder slave + +> Example apps can only repond to ping. This setup creates connection between two applications using libmemif. Traffic functionality is the same as when connection to VPP. App can receive ARP/ICMP request and transmit response, but can not send ARP/ICMP request. + +Run two instances of icmpr-epoll example. +> If not running in container, make sure folder /run/vpp/ exists before creating memif master. +Instance 1 will be in master mode, instance 2 in slave mode. +instance 1: +``` +conn 0 1 +``` +instance 2: +``` +conn 0 0 +``` +In 2 seconds, both instances should print connected! message: +``` +INFO: memif connected! +``` +Check peer interface names using show command. diff --git a/extras/libmemif/examples/examples_doc.md b/extras/libmemif/examples/examples_doc.md new file mode 100644 index 00000000..11937c57 --- /dev/null +++ b/extras/libmemif/examples/examples_doc.md @@ -0,0 +1,18 @@ +## Examples {#libmemif_examples_doc} + +After build, root folder will contain scripts linking binary examples with library (same name as example apps). These scripts can be executed to run example apps without installing the library. Example apps binaries can be found in _libs_ filder. To run binaries directly, make sure that libmemif library is installed. + +#### Run in container + +`ligato/libmemif-sample-service` image contains built and installed libmemf. To run different examples, override docker CMD to start container in bash: + +``` +# docker run -it --entrypoint=/bin/bash -i --rm --name icmp-responder --hostname icmp-responder --privileged -v "/run/vpp/:/run/vpp/" ligato/libmemif-sample-service +``` +Current WORKDIR is set to root repository directory. 
Example apps can be run from this directory (a script linking binary with library), or browse to `./.libs` folder and execute binary directly. + +Example app | Description +------------|------------ +@ref extras/libmemif/examples/icmp_responder | Simplest implementaion. Event polling is handled by libmemif. Single memif conenction in slave mode is created (id 0). Use Ctrl + C to exit app. Memif receive mode: interrupt. +@ref extras/libmemif/examples/icmp_responder-epoll (run in container by default) | Supports multiple connections and master mode. User can create/delete connections, set ip addresses, print connection information. @ref libmemif_example_setup_doc contains instructions on basic connection use cases setups. Memif receive mode: interrupt. App provides functionality to disable interrupts for specified queue/s for testing purposes. Polling mode is not implemented in this example. +@ref extras/libmemif/examples/icmp_responder-mt) | Multi-thread example, very similar to icmpr-epoll. Packets are handled in threads assigned to specific queues. Slave mode only. Memif receive mode: polling (memif_rx_poll function), interrupt (memif_rx_interrupt function). Receive modes differ per queue. diff --git a/extras/libmemif/libmemif_doc.md b/extras/libmemif/libmemif_doc.md new file mode 100644 index 00000000..fb4a18af --- /dev/null +++ b/extras/libmemif/libmemif_doc.md @@ -0,0 +1,74 @@ +Shared Memory Packet Interface (memif) Library {#libmemif_doc} +============================================== + +## Introduction + +Shared memory packet interface (memif) provides high performance packet transmit and receive between user application and Vector Packet Processing (VPP) or multiple user applications. Using libmemif, user application can create shared memory interface in master or slave mode and connect to VPP or another application using libmemif. Once the connection is established, user application can receive or transmit packets using libmemif API. + +![Architecture](docs/architecture.png) + +## Features + +- [x] Slave mode + - [x] Connect to VPP over memif + - [x] ICMP responder example app +- [x] Transmit/receive packets +- [x] Interrupt mode support +- [x] File descriptor event polling in libmemif (optional) + - [x] Simplify file descriptor event polling (one handler for control and interrupt channel) +- [x] Multiple connections +- [x] Multiple queues + - [x] Multi-thread support +- [x] Master mode + - [ ] Multiple regions (TODO) +- [ ] Performance testing (TODO) + +## Quickstart + +This setup will run libmemif ICMP responder example app in container. Install [docker](https://docs.docker.com/engine/installation) engine. +Useful link: [Docker documentation](https://docs.docker.com/get-started). + +Pull image: +``` +# docker pull ligato/libmemif-sample-service +``` + +Now you should be able to see ligato/libmemif-sample-service image on your local machine (IMAGE ID in this README may be outdated): +``` +# docker images +REPOSITORY TAG IMAGE ID CREATED SIZE +ligato/libmemif-sample-service latest 32ecc2f9d013 About a minute ago 468MB +... +``` + +Run container: +``` +# docker run -it --rm --name icmp-responder --hostname icmp-responder --privileged -v "/run/vpp/:/run/vpp/" ligato/libmemif-sample-service +``` +Example application will start in debug mode. 
Output should look like this: +``` +ICMP_Responder:add_epoll_fd:204: fd 0 added to epoll +MEMIF_DEBUG:src/main.c:memif_init:383: app name: ICMP_Responder +ICMP_Responder:add_epoll_fd:204: fd 4 added to epoll +LIBMEMIF EXAMPLE APP: ICMP_Responder (debug) +============================== +libmemif version: 1.0 (debug) +memif version: 256 +commands: + help - prints this help + exit - exit app + conn - create memif (slave-mode) + del - delete memif + show - show connection details + ip-set - set interface ip address + rx-mode - set queue rx mode +``` + +Continue with @ref libmemif_example_setup which contains instructions on how to set up conenction between icmpr-epoll example app and VPP-memif. + +#### Next steps + +- @subpage libmemif_build_doc +- @subpage libmemif_examples_doc +- @subpage libmemif_example_setup_doc +- @subpage libmemif_gettingstarted_doc diff --git a/src/plugins/acl/acl-plugin.md b/src/plugins/acl/acl-plugin.md deleted file mode 100644 index 1b44bca9..00000000 --- a/src/plugins/acl/acl-plugin.md +++ /dev/null @@ -1,347 +0,0 @@ -Multicore support for ACL plugin -================================ - -This captures some considerations and design decisions that I have made, -both for my own memory later on ("what the hell was I thinking?!?"), -and for anyone interested to criticize/improve/hack on this code. - -One of the factors taken into account while making these decisions, -was the relative emphasis on the multi-thread vs. single-thread -use cases: the latter is the vastly more prevalent. But, -one can not optimize the single-thread performance without -having a functioning code for multi-thread. - -stateless ACLs -============== - -The stateless trivially parallelizes, and the only potential for the -race between the different threads is during the reconfiguration, -at the time of replacing the old ACL being checked, with -the new ACL. - -In case an acl_add_replace is being used to replace the rules -within the existing entry, a reallocation of am->acls[X].rules -vector will happen and potentially a change in count. - -acl_match_5tuple() has the following code: - - a = am->acls + acl_index; - for (i = 0; i < a->count; i++) - { - r = a->rules + i; - . . . - -Ideally we should be immune from a->rules changing, -but the problem arises if the count changes in flight, -and the new ruleset is smaller - then we will attempt -to "match" against the free memory. - -This can(?) be solved by replacing the for() with while(), -so the comparison happens at each iteration. - -full_acl_match_5tuple(), which iterates over the list -of ACLs, is a bit less immune, since it takes the pointer -to the vector to iterate and keeps a local copy of -that pointer. - -This race can be solved by checking the -current pointer to the vector with the source pointer, -and seeing if there is an (unlikely) change, and if -there is, return the "deny" action, or, better, -restart the check. - -Since the check reloads the ACL list on a per-packet basis, -there is only a window of opportunity of one packet to -"match" packet against an incorrect rule set. -The workers also do not change anything, only read. -Therefore, it looks like building special structures -to ensure that it does not happen at all might be not -worth it. - -At least not until we have a unit-test able to -reliably catch this condition and test that -the measures applied are effective. Adding the code -which is not possible to exercise is worse than -not adding any code at all. - -So, I opt for "do-nothing" here for the moment. 
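For reference, the defensive iteration suggested above (re-reading the bound on every pass and detecting a swapped rule vector) could look roughly like the sketch below. The types, the `rule_matches()` helper and the function name are placeholders for illustration only, not the plugin's actual code:

```C
/* Illustrative sketch only: placeholder types, not the ACL plugin's real ones. */
typedef struct { int dummy; } acl_rule_t;
typedef struct { int dummy; } fa_5tuple_t;
typedef struct { int count; acl_rule_t *rules; } acl_t;

/* stub standing in for the per-rule comparison */
static int rule_matches (acl_rule_t *r, fa_5tuple_t *p) { (void) r; (void) p; return 0; }

/* while() re-reads a->count on every pass, so a concurrent shrink of the
 * ruleset cannot walk past the new end of the vector; if the vector pointer
 * itself changes, fall back to "deny" (or restart the check). */
static int
match_acl_defensively (acl_t *acls, int acl_index, fa_5tuple_t *pkt)
{
  acl_t *a = acls + acl_index;
  acl_rule_t *rules = a->rules;   /* local snapshot of the vector pointer */
  int i = 0;

  while (i < a->count)
    {
      if (a->rules != rules)
        return 0;                 /* ruleset replaced mid-walk */
      if (rule_matches (rules + i, pkt))
        return 1;
      i++;
    }
  return 0;
}
```

As the text notes, the window of exposure is a single packet per reconfiguration, which is why the simpler do-nothing option was chosen for now.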
- -reflexive ACLs: single-thread -============================= - -Before we talk multi-thread, is worth revisiting the -design of the reflexive ACLs in the plugin, and -the history of their evolution. - -The very first version of the ACL plugin, shipped in -1701, mostly did the job using the existing components -and gluing them together. Because it needed to work -in bridged forwarding path only, using L2 classifier -as an insertion point appeared natural, also L2 classifier, -being a table with sessions, seemed like a good place -to hold the sessions. - -So, the original design had two conceptual nodes: -one, pointed by the next_miss from the L2 classifier table, -was checking the actual ACL, and inserting session into -the L2 classifier table, and the other one, pointed -to by the next_match within the specific session rule, -was checking the existing session. The timing out -of the existing connections was done in the datapath, -by periodically calling the aging function. - -This decision to use the existing components, -with its attrativeness, did bring a few limitations as well: - -* L2 classifier is a simple mask-and-value match, with -a fixed mask across the table. So, sanely supporting IPv6 -packets with extension headers in that framework was impossible. - -* There is no way to get a backpressure from L2 classifier -depending on memory usage. When it runs out of memory, -it simply crashes the box. When it runs out of memory ? -We don't really know. Depends on how it allocates it. - -* Since we need to match the *reflected* traffic, -we had to create *two* full session entries -in two different directions, which is quite wasteful memory-wise. - -* (showstopper): the L2 classifier runs only in -the bridged data path, so supporting routed data path -would require creating something else entirely different, -which would mean much more headaches support-wise going forward. - -Because of that, I have moved to a different model of -creating a session-5-tuple from the packet data - once, -and then doing all the matching just on that 5-tuple. - -This has allowed to add support for skipping IPv6 extension headers. - -Also, this new version started to store the sessions in a dedicated -bihash-per-interface, with the session key data being -aligned for the ingress packets, and being mirrored for the -egress packets. This allows of significant savings in memory, -because now we need to keep only one copy of the session table per -interface instead of two, and also to only have ONE node for all the lookups, -(L2/L3 path, in/out, IPv4/IPv6) - significantly reducing the code complexity. - -Unfortunately, bihash still has the "lack of backpressure" problem, -in a sense that if you try to insert too many entries and run out -of memory in the heap you supplied, you get a crash. - -To somewhat workaround against that, there is a "maximum tested number of sessions" -value, which tracks the currently inserted sessions in the bihash, -and if this number is being approached, a more aggressive cleanup -can happen. If this number is reached, two behaviors are possible: - -* attempt to do the stateless ACL matching and permit the packet - if it succeeds - -* deny the packet - -Currently I have opted for a second one, since it allows for -a better defined behavior, and if you have to permit -the traffic in both directions, why using stateful anyway ? - -In order to be able to do the cleanup, we need to discriminate between -the session types, with each session type having its own idle timeout. 
-In order to do that, we keep three lists, defined in enum acl_timeout_e: -ACL_TIMEOUT_UDP_IDLE, ACL_TIMEOUT_TCP_IDLE, ACL_TIMEOUT_TCP_TRANSIENT. - -The first one is hopefully obvious - it is just all UDP connections. -They have an idle timeout of 600 seconds. - -The second and third is a bit more subtle. TCP is a complicated protocol, -and we need to tread the fine line between doing too little and doing -too much, and triggering the potential compatibility issues because of -being a "middlebox". - -I decided to split the TCP connections into two classes: -established, and everything else. "Established", means we have seen -the SYN and ACK from both sides (with PUSH obviously masked out). -This is the "active" state of any TCP connection and we would like -to ensure we do not screw it up. So, the connections in this state -have the default idle timer of 24 hours. - -All the rest of the connections have the idle timeout of 2 minutes, -(inspired by an old value of MSL) and based on the observation -that the states this class represent are usually very short lived. - -Once we have these three baskets of connections, it is trivial to -imagine a simple cleanup mechanism to deal with this: take a -TCP transient connection that has been hanging around. - -It is debatable whether we want to do discrimination between the -different TCP transient connections. Assuming we do FIFO (and -the lists allow us to do just that), it means a given connection -on the head of the list has been hanging around for longest. -Thus, if we are short on resources, we might just go ahead and -reuse it within the datapath. - -This is where we are slowly approaching the question -"Why in the world have not you used timer wheel or such ?" - -The answer is simple: within the above constraints, it does -not buy me much. - -Also, timer wheel creates a leaky abstraction with a difficult -to manage corner case. Which corner case ? - -We have a set of objects (sessions) with an event that may -or may not happen (idle timeout timer firing), and a -necessity to reset the idle timeout when there is -activity on the session. - -In the worst case, where we had a 10000 of one-packet -UDP sessions just created 10 minutes ago, we would need -to deal with a spike of 10000 expired timers. - -Of course, if we have the active traffic on all -of these 10000 connections, then we will not have -to deal with that ? Right, but we will still have to deal -with canceling and requeueing the timers. - -In the best possible case, requeueing a timer is -going to be something along the lines of a linked-list -removal and reinsertion. - -However, keep in mind we already need to classify the -connections for reuse, so therefore we already have -the linked lists! - -And if we just check these linked lists periodically in -a FIFO fashion, we can get away with a very simple per-packet operation: -writing back the timestamp of "now" into the connection structure. - -Then rather than requeueing the list on a per-packet or per-frame -basis, we can defer this action until the time this session -appears on the head of the FIFO list, and the cleaning -routine makes the decision about whether to discard -the session (because the interval since last activity is bigger -than the idle timeout), or to requeue the session back to -the end of the list (because the last activity was less -than idle timeout ago). 
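The discard-or-requeue decision at the head of each list can be sketched as below. The structures and helpers are invented for illustration; the plugin keeps this state in its own per-interface/per-worker structures:

```C
/* Illustrative sketch; made-up types, not the plugin's real session structures. */
#include <stddef.h>

typedef struct conn_
{
  struct conn_ *next;
  double last_active;            /* written back on every packet in the datapath */
} conn_t;

typedef struct
{
  conn_t *head, *tail;
  double idle_timeout;           /* e.g. 600 s UDP, 86400 s TCP established, 120 s transient */
} conn_fifo_t;

/* Only the head of the FIFO is ever examined: an expired head is freed, a
 * still-active head is moved back to the tail, and the sweep stops there. */
static void
fifo_sweep (conn_fifo_t *f, double now, void (*free_conn) (conn_t *))
{
  while (f->head != NULL)
    {
      conn_t *c = f->head;

      if (now - c->last_active > f->idle_timeout)
        {
          f->head = c->next;              /* idle for too long: discard */
          if (f->head == NULL)
            f->tail = NULL;
          free_conn (c);
          continue;
        }

      /* Recent activity: requeue to the tail and stop; the cleaner will
       * come back to this list on a later pass. */
      f->head = c->next;
      if (f->head == NULL)
        f->tail = NULL;
      c->next = NULL;
      if (f->tail)
        f->tail->next = c;
      else
        f->head = c;
      f->tail = c;
      break;
    }
}
```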
- -So, rather than using the timers, we can simply reuse our classification -FIFOs, with the following heuristic: do not look at the session that was -enqueued at time X until X+session_timeout. If we enqueue the sessions -in the order of their initial activity, then we can simply use enqueue -timestamp of the head session as a decision criterion for when we need -to get back at looking at it for the timeout purposes. - -Since the number of FIFOs is small, we get a slightly worse check -performance than with timers, but still O(1). - -We seemingly do quite a few "useless" operations of requeueing the items -back to the tail of the list - but, these are the operations we do not -have to do in the active data path, so overall it is a win. - -(Diversion: I believe this problem is congruent to poll vs. epoll or -events vs. threads, some reading on this subject: -http://web.archive.org/web/20120225022154/http://sheddingbikes.com/posts/1280829388.html) - -We can also can run a TCP-like scheme for adaptively changing -the wait period in the routine that deals with the connection timeouts: -we can attempt to check the connections a couple of times per second -(same as we would advance the timer wheel), and then if we have requeued -close to a max-per-quantum number of connections, we can half the waiting -interval, and if we did not requeue any, we can slowly increment the waiting -interval - which at a steady state should stabilize similar to what the TCP rate -does. - -reflexive ACLs: multi-thread -============================= - -The single-threaded implementation in 1704 used a separate "cleaner" process -to deal with the timing out of the connections. -It is all good and great when you know that there is only a single core -to run everything on, but the existence of the lists proves to be -a massive difficulty when it comes to operating from multiple threads. - -Initial study shows that with a few assumptions (e.g. that the cleaner running in main thread -and the worker have a demarcation point in time where either one or the other one touches -the session in the list) it might be possible to make it work, but the resulting -trickiness of doing it neatly with all the corner cases is quite large. - -So, for the multi-threaded scenario, we need to move the connection -aging back to the same CPU as its creation. - -Luckily we can do this with the help of the interrupts. - -So, the design is as follows: the aging thread (acl_fa_session_cleaner_process) -periodically fires the interrupts to the workers interrupt nodes (acl_fa_worker_session_cleaner_process_node.index), -using vlib_node_set_interrupt_pending(), and -the interrupt node acl_fa_worker_conn_cleaner_process() calls acl_fa_check_idle_sessions() -which does the actual job of advancing the lists. And within the actual datapath the only thing we will be -doing is putting the items onto FIFO, and updating the last active time on the existing connection. - -The one "delicate" part is that the worker for one leg of the connection might be different from -the worker of another leg of the connection - but, even if the "owner" tries to free the connection, -nothing terrible can happen - worst case the element of the pool (which is nominally free for a short period) -will get the timestamp updated - same thing about the TCP flags seen. 
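The TCP-like adaptation of the cleaner's wait interval described a few paragraphs above (halve it when a quantum fills up, relax it when nothing needed requeueing) amounts to only a few lines; the constants and names below are illustrative, not the plugin's actual tuning:

```C
/* Illustrative multiplicative-decrease / additive-increase sketch; all
 * constants and names are invented for the example. */
static double
adjust_cleaner_wait (double interval, int requeued_this_quantum, int max_per_quantum)
{
  const double min_interval = 0.01;   /* seconds */
  const double max_interval = 1.0;

  if (requeued_this_quantum + max_per_quantum / 8 >= max_per_quantum)
    interval /= 2.0;                  /* close to the per-quantum limit: check more often */
  else if (requeued_this_quantum == 0)
    interval += min_interval;         /* nothing needed requeueing: slowly back off */

  if (interval < min_interval)
    interval = min_interval;
  if (interval > max_interval)
    interval = max_interval;
  return interval;
}
```

At a steady state this settles on a checking rate close to the actual connection churn, much as the text suggests by analogy with TCP's rate behaviour.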
- -A slightly trickier issue arises when the packet initially seen by one worker (thus owned by that worker), -and the return packet processed by another worker, and as a result changes the -the class of the connection (e.g. becomes TCP_ESTABLISHED from TCP_TRANSIENT or vice versa). -If the class changes from one with the shorter idle time to the one with the longer idle time, -then unless we are in the starvation mode where the transient connections are recycled, -we can simply do nothing and let the normal requeue mechanism kick in. If the class changes from the longer idle -timer to the shorter idle timer, then we risk keeping the connection around for longer than needed, which -will affect the resource usage. - -One solution to that is to have NxN ring buffers (where N is the number of workers), such that the non-owner -can signal to the owner the connection# that needs to be requeued out of order. - -A simpler solution though, is to ensure that each FIFO's period is equal to that of a shortest timer. -This way the resource starvation problem is taken care of, at an expense of some additional work. - -This all looks sufficiently nice and simple until a skeleton falls out of the closet: -sometimes we want to clean the connections en masse before they expire. - -There few potential scenarios: -1) removal of an ACL from the interface -2) removal of an interface -3) manual action of an operator (in the future). - -In order to tackle this, we need to modify the logic which decides whether to requeue the -connection on the end of the list, or to delete it due to idle timeout: - -We define a point in time, and have each worker thread fast-forward through its FIFO, -in the process looking for sessions that satisfy the criteria, and either keeping them or requeueing them. - -To keep the ease of appearance to the outside world, we still process this as an event -within the connection cleaner thread, but this event handler does as follows: -1) it creates the bitmap of the sw_if_index values requested to be cleared -2) for each worker, it waits to ensure there is no cleanup operation in progress (and if there is one, -it waits), and then makes a copy of the bitmap, sets the per-worker flag of a cleanup operation, and sends an interrupt. -3) wait until all cleanup operations have completed. - -Within the worker interrupt node, we check if the "cleanup in progress" is set, -and if it is, we check the "fast forward time" value. If unset, we initialize it to value now, and compare the -requested bitmap of sw_if_index values (pending_clear_sw_if_index_bitmap) with the bitmap of sw_if_index that this worker deals with. - -(we set the bit in the bitmap every time we enqueue the packet onto a FIFO - serviced_sw_if_index_bitmap in acl_fa_conn_list_add_session). - -If the result of this AND operation is zero - then we can clear the flag of cleanup in progress and return. -Else we kick off the quantum of cleanup, and make sure we get another interrupt ASAP if that cleanup operation returns non-zero, -meaning there is more work to do. -When that operation returns zero, everything has been processed, we can clear the "cleanup-in-progress" flag, and -zeroize the bitmap of sw_if_index-es requested to be cleaned. - -The interrupt node signals its wish to receive an interrupt ASAP by setting interrupt_is_needed -flag within the per-worker structure. The main thread, while waiting for the -cleanup operation to complete, checks if there is a request for interrupt, -and if there is - it sends one. 
- 
-This approach gives us a way to mass-clean the connections which is reusing the code of the regular idle 
-connection cleanup. 
- 
-One potential inefficiency is the bitmap values set by the session insertion 
-in the data path - there is nothing to clear them. 
- 
-So, if one rearranges the interface placement with the workers, then the cleanups will cause some unnecessary work. 
-For now, we consider it an acceptable limitation. It can be resolved by having another per-worker bitmap, which, when set, 
-would trigger the cleanup of the bits in the serviced_sw_if_index_bitmap). 
- 
-=== the end === 
- 
diff --git a/src/plugins/acl/acl_hash_lookup_doc.md b/src/plugins/acl/acl_hash_lookup_doc.md 
new file mode 100644 
index 00000000..cb93df04 
--- /dev/null 
+++ b/src/plugins/acl/acl_hash_lookup_doc.md 
@@ -0,0 +1,241 @@ 
+ACL plugin constant-time lookup design {#acl_hash_lookup} 
+====================================== 
+ 
+The initial implementation of the ACL plugin performs a trivial for() cycle, 
+going through the assigned ACLs on a per-packet basis. This is not very 
+efficient, even though for very short ACLs its simplicity can beat 
+more advanced methods. 
+ 
+However, to cover the case of longer ACLs with acceptable performance, 
+we need to have a better way of matching. This write-up proposes 
+a mechanism to reduce the lookup from O(M), where M is the number of entries, 
+to O(N), where N is the number of different mask combinations. 
+ 
+Preparation of ACL(s) 
+--------------------- 
+ 
+The ACL plugin will maintain a global list of "mask types", i.e. the specific 
+configurations of "do not care" bits within the ACEs. 
+Upon the creation of a new ACL, a pass will be made through all the 
+ACEs, to assign and possibly allocate the "mask type number". 
+ 
+Each ACL has a structure *hash_acl_info_t* representing the "hash-based" 
+parts of information related to that ACL, primarily the array of 
+*hash_ace_info_t* structures - each of the members of that array 
+corresponding to one of the rules (ACEs) in the original ACL; 
+for this they carry a pair of *(acl_index, ace_index)* to keep track, 
+predominantly for debugging. 
+ 
+Why do we need a whole separate structure rather than adding new fields 
+to the existing rule structure? First, encapsulation, to minimize 
+the pollution of the main ACL code with the hash-based lookup artifacts. 
+ 
+Second, one rule may correspond to more than one "hash-based" ACE. 
+In fact, most of the rules do correspond to two of those. Why? 
+ 
+Consider that the current ACL lookup logic is that if a packet 
+is not the initial fragment, and there is an L4 entry acting on the packet, 
+the comparison will be made only on the L4 protocol field value rather 
+than on the protocol and port values. This behaviour is governed by 
+the *l4_match_nonfirst_fragment* flag in the *acl_main*, and was needed to 
+maintain compatibility with the existing software switch implementation. 
+ 
+While for the sequential check in *single_acl_match_5tuple()* 
+it is very easy to implement by just breaking out at the right moment, 
+in the case of hash-based matching this costs us two checks: 
+one on the full 5-tuple with the flag *pkt.is_nonfirst_fragment* being zero, 
+the second on the 3-tuple with the flag *pkt.is_nonfirst_fragment* being one, 
+with the second check triggered by the *acl_main.l4_match_nonfirst_fragment* 
+setting being the default 1. This dictates the necessity of having a "match" 
+field in a given *hash_ace_info_t* element, which would reflect the value 
+we are supposed to match after applying the mask. 
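+ 
+As an illustration only (the notation below is simplified and does not show 
+the actual structure layout), a single L4 rule could thus expand into two 
+hash-ready entries along these lines: 
+ 
+``` 
+permit tcp 2001:db8::/64 any dport 80 
+ 
+  entry 1: 5-tuple mask {src/64, dst/0, proto, dport}, is_nonfirst_fragment == 0 
+  entry 2: 3-tuple mask {src/64, dst/0, proto},        is_nonfirst_fragment == 1 
+``` 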
+ +There can be other circumstances when it might be beneficial to expand +the given rule in the original ACL into multiple - for example, as an +optimization within the port range handling for small port ranges +(this is not done as of the time of writing). + +Assigning ACLs to an interface +------------------------------ + +Once the ACL list is assigned to an interface, or, rather, a new ACL +is added to the list of the existing ACLs applied to the interface, +we need to update the bihash accelerating the lookup. + +All the entries for the lookups are stored within a single *48_8* bihash, +which captures the 5-tuple from the packet as well as the miscellaneous +per-packet information flags, e.g. *l4_valid*, *is_non_first_fragment*, +and so on. To facilitate the use of the single bihash by all the interfaces, +the *is_ip6*, *is_input*, *sw_if_index* are part of the key, +as well as *mask_type_index* - the latter being necessary because +there can be entries with the same value but different masks, e.g.: +`permit ::/0, permit::/128`. + +At the moment of an ACL being applied to an interface, we need to +walk the list of *hash_ace_info_t* entries corresponding to that ACL, +and update the bihash with the keys corresponding to the match +values in these entries. + +The value of the hash match contains the index into a per-*sw_if_index* vector +of *applied_ace_hash_entry_t* elements, as well as a couple of flags: +*shadowed* (optimization: if this flag on a matched entry is zero, means +we can stop the lookup early and declare a match - see below), +and *need_portrange_check* - meaning that what matched was a superset +of the actual match, and we need to perform an extra check. + +Also, upon insertion, we must keep in mind there can be +multiple *applied_ace_hash_entry_t* for the same key and must keep +a list of those. This is necessary to incrementally apply/unapply +the ACLs as part of the ACL vector: say, two ACLs have +"permit 2001:db8::1/128 any" - we should be able to retain the entry +for the second ACL even if we have deleted the first one. +Also, in case there are two entries with the same key but +different port ranges, say 0..42 and 142..65535 - we need +to be able to sequentially match on those if we decide not +to expand them into individual port-specific entries. + +Per-packet lookup +----------------- + +The simple single-packet lookup is defined in +*multi_acl_match_get_applied_ace_index*, which returns the index +of the applied hash ACE if there was a match, or ~0 if there wasn't. + +The future optimized per-packet lookup may be batched in three phases: + +1. Prepare the keys in the per-worker vector by doing logical AND of + original 5-tuple record with the elements of the mask vector. +2. Lookup the keys in the bihash in a batch manner, collecting the + result with lowest u64 (acl index within vector, ACE index) from + the hash lookup value, and performing the list walk if necessary + (for portranges) +3. Take the action from the ACL record as defined by (ACL#, ACE#) from the + resulting lookup winner, or, if no match found, then perform default deny. + +Shadowed/independent/redundant ACEs +------------------------------------ + +During the phase of combining multiple ACLs into one rulebase, when they +are applied to interface, we also can perform several optimizations. + +If a given ACE is a strict subset of another ACE located up in the linear +search order, we can ignore this ACE completely - because by definition +it will never match. 
We will call such an ACE *redundant*. Here is an example: 
+ 
+``` 
+permit 2001:db8:1::/48 2001:db8:2::/48 (B) 
+deny 2001:db8:1:1::/64 2001:db8:2:1::/64 (A) 
+``` 
+ 
+A bit more formally, we can define this relationship of an ACE A to ACE B as: 
+ 
+``` 
+redundant(aceA, aceB) := (contains(protoB, protoA) && contains(srcB, srcA) 
+                          && contains(dstB, dstA) && is_after(A, B)) 
+``` 
+ 
+Here "contains" denotes an operation on the sets defined by 
+the protocol, (srcIP, srcPortDefinition) and (dstIP, dstPortDefinition) 
+respectively, returning true if all the elements represented by 
+the second argument are also represented by the first argument. The "is_after" 
+is true if A is located below B in the ruleset. 
+ 
+If a given ACE does not intersect at all with any other ACE 
+in front of it, we can mark it as such. 
+ 
+Then during the sequence of lookups, a successful hit on this ACE means 
+we do not need to look up other mask combinations - thus potentially 
+significantly speeding up the match process. Here is an example, 
+assuming we have the following ACL: 
+ 
+``` 
+permit 2001:db8:1::/48 2001:db8:2::/48 (B) 
+deny 2001:db8:3::/48 2001:db8:2:1::/64 (A) 
+``` 
+ 
+In this case if we match the second entry, we do not need to check whether 
+we have matched the first one - the source addresses are completely 
+different. We call such an ACE *independent* from another. 
+ 
+We can define this as 
+ 
+``` 
+independent(aceA, aceB) := (!intersect(protoA, protoB) || 
+                            !intersect(srcA, srcB) || 
+                            !intersect(dstA, dstB)) 
+``` 
+ 
+where intersect is defined as an operation returning true if there are 
+elements belonging to the sets of both arguments. 
+ 
+If the entry A is neither redundant nor independent from B, and is below 
+B in the ruleset, we call such an entry *shadowed* by B; here is an example: 
+ 
+``` 
+deny tcp 2001:db8:1::/48 2001:db8:2::/48 (B) 
+permit 2001:db8:1:1::/64 2001:db8:2:1::/64 (A) 
+``` 
+ 
+This means the earlier rule "carves out" a subset of A, thus leaving 
+a "shadow". (Evidently, the action needs to be different for the shadow 
+to have an effect, but for the sake of terminology we do not care.) 
+ 
+The more formal definition: 
+ 
+``` 
+shadowed(aceA, aceB) := !redundant(aceA, aceB) && 
+                        !independent(aceA, aceB) && 
+                        is_after(aceA, aceB) 
+``` 
+ 
+Using this terminology, any ruleset can be represented as 
+a DAG (Directed Acyclic Graph), with the bottom being the implicit 
+"deny any", pointing to the set of rules shadowing it or the ones 
+it is redundant for. 
+ 
+These rules may in turn be shadowing each other. There are no cycles in 
+this graph because of the natural order of the rules - the rule located 
+closer to the end of the ruleset can never shadow or make redundant a rule 
+higher up. 
+ 
+The optimization this enables is to skip matching certain 
+masks on a per-lookup basis - if a given rule has matched, 
+the only adjustment that can still happen is a match with one of 
+the shadowing rules. 
+ 
+Also, another avenue for optimization is starting the lookup process 
+with the mask type that maximizes the chances of an independent ACE match, 
+thus resulting in an ACE lookup being a single hash table hit. 
+ 
+ 
+Plumbing 
+-------- 
+ 
+All the new routines are located in a separate file, 
+so we can cleanly experiment with a different approach if this 
+does not fit all of the use cases. 
+ +The constant-time lookup within the data path has the API with +the same signature as: + +``` +u8 +multi_acl_match_5tuple (u32 sw_if_index, fa_5tuple_t * pkt_5tuple, int is_l2, + int is_ip6, int is_input, u32 * acl_match_p, + u32 * rule_match_p, u32 * trace_bitmap) +``` + +There should be a new upper-level function with the same signature, which +will make a decision whether to use a linear lookup, or to use the +constant-time lookup implemented by this work, or to add some other +optimizations (e.g. by keeping the cache of the last N lookups). + +The calls to the routine doing preparatory work should happen +in `acl_add_list()` after creating the linear-lookup structures, +and the routine doing the preparatory work populating the hashtable +should be called from `acl_interface_add_del_inout_acl()` or its callees. + +The initial implementation will be geared towards looking up a single +match at a time, with the subsequent optimizations possible to make +the lookup for more than one packet. + diff --git a/src/plugins/acl/acl_multicore_doc.md b/src/plugins/acl/acl_multicore_doc.md new file mode 100644 index 00000000..b2cf7b9c --- /dev/null +++ b/src/plugins/acl/acl_multicore_doc.md @@ -0,0 +1,349 @@ +Multicore support for ACL plugin {#acl_multicore} +================================ + +This captures some considerations and design decisions that I have made, +both for my own memory later on ("what the hell was I thinking?!?"), +and for anyone interested to criticize/improve/hack on this code. + +One of the factors taken into account while making these decisions, +was the relative emphasis on the multi-thread vs. single-thread +use cases: the latter is the vastly more prevalent. But, +one can not optimize the single-thread performance without +having a functioning code for multi-thread. + +stateless ACLs +============== + +The stateless trivially parallelizes, and the only potential for the +race between the different threads is during the reconfiguration, +at the time of replacing the old ACL being checked, with +the new ACL. + +In case an acl_add_replace is being used to replace the rules +within the existing entry, a reallocation of `am->acls[X].rules` +vector will happen and potentially a change in count. + +acl_match_5tuple() has the following code: + +```{.c} + a = am->acls + acl_index; + for (i = 0; i < a->count; i++) + { + r = a->rules + i; + . . . +``` + +Ideally we should be immune from a->rules changing, +but the problem arises if the count changes in flight, +and the new ruleset is smaller - then we will attempt +to "match" against the free memory. + +This can(?) be solved by replacing the for() with while(), +so the comparison happens at each iteration. + +full_acl_match_5tuple(), which iterates over the list +of ACLs, is a bit less immune, since it takes the pointer +to the vector to iterate and keeps a local copy of +that pointer. + +This race can be solved by checking the +current pointer to the vector with the source pointer, +and seeing if there is an (unlikely) change, and if +there is, return the "deny" action, or, better, +restart the check. + +Since the check reloads the ACL list on a per-packet basis, +there is only a window of opportunity of one packet to +"match" packet against an incorrect rule set. +The workers also do not change anything, only read. +Therefore, it looks like building special structures +to ensure that it does not happen at all might be not +worth it. 
+ +At least not until we have a unit-test able to +reliably catch this condition and test that +the measures applied are effective. Adding the code +which is not possible to exercise is worse than +not adding any code at all. + +So, I opt for "do-nothing" here for the moment. + +reflexive ACLs: single-thread +============================= + +Before we talk multi-thread, is worth revisiting the +design of the reflexive ACLs in the plugin, and +the history of their evolution. + +The very first version of the ACL plugin, shipped in +1701, mostly did the job using the existing components +and gluing them together. Because it needed to work +in bridged forwarding path only, using L2 classifier +as an insertion point appeared natural, also L2 classifier, +being a table with sessions, seemed like a good place +to hold the sessions. + +So, the original design had two conceptual nodes: +one, pointed by the next_miss from the L2 classifier table, +was checking the actual ACL, and inserting session into +the L2 classifier table, and the other one, pointed +to by the next_match within the specific session rule, +was checking the existing session. The timing out +of the existing connections was done in the datapath, +by periodically calling the aging function. + +This decision to use the existing components, +with its attrativeness, did bring a few limitations as well: + +* L2 classifier is a simple mask-and-value match, with +a fixed mask across the table. So, sanely supporting IPv6 +packets with extension headers in that framework was impossible. + +* There is no way to get a backpressure from L2 classifier +depending on memory usage. When it runs out of memory, +it simply crashes the box. When it runs out of memory ? +We don't really know. Depends on how it allocates it. + +* Since we need to match the *reflected* traffic, +we had to create *two* full session entries +in two different directions, which is quite wasteful memory-wise. + +* (showstopper): the L2 classifier runs only in +the bridged data path, so supporting routed data path +would require creating something else entirely different, +which would mean much more headaches support-wise going forward. + +Because of that, I have moved to a different model of +creating a session-5-tuple from the packet data - once, +and then doing all the matching just on that 5-tuple. + +This has allowed to add support for skipping IPv6 extension headers. + +Also, this new version started to store the sessions in a dedicated +bihash-per-interface, with the session key data being +aligned for the ingress packets, and being mirrored for the +egress packets. This allows of significant savings in memory, +because now we need to keep only one copy of the session table per +interface instead of two, and also to only have ONE node for all the lookups, +(L2/L3 path, in/out, IPv4/IPv6) - significantly reducing the code complexity. + +Unfortunately, bihash still has the "lack of backpressure" problem, +in a sense that if you try to insert too many entries and run out +of memory in the heap you supplied, you get a crash. + +To somewhat workaround against that, there is a "maximum tested number of sessions" +value, which tracks the currently inserted sessions in the bihash, +and if this number is being approached, a more aggressive cleanup +can happen. 
If this number is reached, two behaviors are possible: 
+ 
+* attempt to do the stateless ACL matching and permit the packet 
+  if it succeeds 
+ 
+* deny the packet 
+ 
+Currently I have opted for the second one, since it allows for 
+a better-defined behavior, and if you have to permit 
+the traffic in both directions anyway, why use stateful at all? 
+ 
+In order to be able to do the cleanup, we need to discriminate between 
+the session types, with each session type having its own idle timeout. 
+In order to do that, we keep three lists, defined in enum acl_timeout_e: 
+ACL_TIMEOUT_UDP_IDLE, ACL_TIMEOUT_TCP_IDLE, ACL_TIMEOUT_TCP_TRANSIENT. 
+ 
+The first one is hopefully obvious - it is just all UDP connections. 
+They have an idle timeout of 600 seconds. 
+ 
+The second and third are a bit more subtle. TCP is a complicated protocol, 
+and we need to tread the fine line between doing too little and doing 
+too much, and triggering the potential compatibility issues because of 
+being a "middlebox". 
+ 
+I decided to split the TCP connections into two classes: 
+established, and everything else. "Established" means we have seen 
+the SYN and ACK from both sides (with PUSH obviously masked out). 
+This is the "active" state of any TCP connection and we would like 
+to ensure we do not screw it up. So, the connections in this state 
+have the default idle timer of 24 hours. 
+ 
+All the rest of the connections have an idle timeout of 2 minutes 
+(inspired by an old value of MSL), based on the observation 
+that the states this class represents are usually very short lived. 
+ 
+Once we have these three baskets of connections, it is trivial to 
+imagine a simple cleanup mechanism to deal with this: take a 
+TCP transient connection that has been hanging around, and reuse it. 
+ 
+It is debatable whether we want to discriminate between the 
+different TCP transient connections. Assuming we do FIFO (and 
+the lists allow us to do just that), it means a given connection 
+on the head of the list has been hanging around for longest. 
+Thus, if we are short on resources, we might just go ahead and 
+reuse it within the datapath. 
+ 
+This is where we are slowly approaching the question 
+"Why in the world have you not used a timer wheel or some such?" 
+ 
+The answer is simple: within the above constraints, it does 
+not buy me much. 
+ 
+Also, a timer wheel creates a leaky abstraction with a difficult 
+to manage corner case. Which corner case? 
+ 
+We have a set of objects (sessions) with an event that may 
+or may not happen (the idle timeout timer firing), and a 
+necessity to reset the idle timeout when there is 
+activity on the session. 
+ 
+In the worst case, where we had 10000 one-packet 
+UDP sessions all created 10 minutes ago, we would need 
+to deal with a spike of 10000 expired timers. 
+ 
+Of course, if we have active traffic on all 
+of these 10000 connections, then we will not have 
+to deal with that spike - right? True, but we will still have to deal 
+with canceling and requeueing the timers. 
+ 
+In the best possible case, requeueing a timer is 
+going to be something along the lines of a linked-list 
+removal and reinsertion. 
+ 
+However, keep in mind we already need to classify the 
+connections for reuse, so we already have 
+the linked lists! 
+ 
+And if we just check these linked lists periodically in 
+a FIFO fashion, we can get away with a very simple per-packet operation: 
+writing back the timestamp of "now" into the connection structure. 
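+ 
+For illustration, a minimal sketch of that per-packet operation might look as 
+follows (the structure and field names here are made up for the example and 
+are not the plugin's actual definitions): 
+ 
+```{.c} 
+#include <vppinfra/types.h> 
+ 
+/* Illustrative only - not the actual session structure of the plugin. */ 
+typedef struct 
+{ 
+  u64 last_active_time;   /* refreshed on every packet of the session */ 
+  u32 link_next_idx;      /* doubly-linked FIFO, one list per timeout class */ 
+  u32 link_prev_idx; 
+} conn_sketch_t; 
+ 
+static inline void 
+connection_seen (conn_sketch_t * c, u64 now) 
+{ 
+  /* The only per-packet bookkeeping: remember when we last saw traffic. 
+     Requeueing to the FIFO tail is deferred to the cleaner routine. */ 
+  c->last_active_time = now; 
+} 
+``` 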
+ 
+Then rather than requeueing the list on a per-packet or per-frame 
+basis, we can defer this action until the time this session 
+appears on the head of the FIFO list, and the cleaning 
+routine makes the decision about whether to discard 
+the session (because the interval since last activity is bigger 
+than the idle timeout), or to requeue the session back to 
+the end of the list (because the last activity was less 
+than the idle timeout ago). 
+ 
+So, rather than using the timers, we can simply reuse our classification 
+FIFOs, with the following heuristic: do not look at the session that was 
+enqueued at time X until X+session_timeout. If we enqueue the sessions 
+in the order of their initial activity, then we can simply use the enqueue 
+timestamp of the head session as a decision criterion for when we need 
+to look at it again for timeout purposes. 
+ 
+Since the number of FIFOs is small, we get a slightly worse check 
+performance than with timers, but still O(1). 
+ 
+We seemingly do quite a few "useless" operations of requeueing the items 
+back to the tail of the list - but these are operations we do not 
+have to do in the active data path, so overall it is a win. 
+ 
+(Diversion: I believe this problem is congruent to poll vs. epoll or 
+events vs. threads; some reading on this subject: 
+http://web.archive.org/web/20120225022154/http://sheddingbikes.com/posts/1280829388.html) 
+ 
+We can also run a TCP-like scheme for adaptively changing 
+the wait period in the routine that deals with the connection timeouts: 
+we can attempt to check the connections a couple of times per second 
+(same as we would advance the timer wheel), and then if we have requeued 
+close to a max-per-quantum number of connections, we can halve the waiting 
+interval, and if we did not requeue any, we can slowly increase the waiting 
+interval - which at a steady state should stabilize, similar to what the TCP 
+rate does. 
+ 
+reflexive ACLs: multi-thread 
+============================= 
+ 
+The single-threaded implementation in 1704 used a separate "cleaner" process 
+to deal with the timing out of the connections. 
+That is all well and good when you know that there is only a single core 
+to run everything on, but the existence of the lists proves to be 
+a massive difficulty when it comes to operating from multiple threads. 
+ 
+Initial study shows that with a few assumptions (e.g. that the cleaner running in the main thread 
+and the worker have a demarcation point in time where either one or the other touches 
+the session in the list) it might be possible to make it work, but the resulting 
+trickiness of doing it neatly with all the corner cases is quite large. 
+ 
+So, for the multi-threaded scenario, we need to move the connection 
+aging back to the same CPU as its creation. 
+ 
+Luckily we can do this with the help of interrupts. 
+ 
+So, the design is as follows: the aging thread (acl_fa_session_cleaner_process) 
+periodically fires interrupts to the workers' interrupt nodes (acl_fa_worker_session_cleaner_process_node.index), 
+using vlib_node_set_interrupt_pending(), and 
+the interrupt node acl_fa_worker_conn_cleaner_process() calls acl_fa_check_idle_sessions(), 
+which does the actual job of advancing the lists. Within the actual datapath the only thing we 
+do is put the items onto the FIFO and update the last active time on the existing connection. 
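+ 
+For illustration, the signalling loop in the main-thread cleaner could look 
+roughly like the sketch below (hedged pseudo-code rather than the literal 
+plugin source; it leans on the node and function names mentioned above): 
+ 
+```{.c} 
+#include <vlib/vlib.h> 
+#include <vlib/threads.h> 
+ 
+extern vlib_node_registration_t acl_fa_worker_session_cleaner_process_node; 
+ 
+/* Sketch: ask every worker to run its connection-cleaner interrupt node. */ 
+static void 
+poke_worker_cleaners (void) 
+{ 
+  int thread_index; 
+  /* thread 0 is the main thread; the workers start at index 1 */ 
+  for (thread_index = 1; thread_index < vec_len (vlib_mains); thread_index++) 
+    { 
+      if (vlib_mains[thread_index] == 0) 
+        continue; 
+      /* wakes up acl_fa_worker_conn_cleaner_process() on that worker */ 
+      vlib_node_set_interrupt_pending 
+        (vlib_mains[thread_index], 
+         acl_fa_worker_session_cleaner_process_node.index); 
+    } 
+} 
+``` 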
+ 
+The one "delicate" part is that the worker for one leg of the connection might be different from 
+the worker for the other leg of the connection - but even if the "owner" tries to free the connection, 
+nothing terrible can happen - worst case the element of the pool (which is nominally free for a short period) 
+will get the timestamp updated - same thing about the TCP flags seen. 
+ 
+A slightly trickier issue arises when the packet is initially seen by one worker (which thus owns the connection), 
+while the return packet is processed by another worker and, as a result, changes the 
+class of the connection (e.g. it becomes TCP_ESTABLISHED from TCP_TRANSIENT or vice versa). 
+If the class changes from one with the shorter idle time to the one with the longer idle time, 
+then unless we are in the starvation mode where the transient connections are recycled, 
+we can simply do nothing and let the normal requeue mechanism kick in. If the class changes from the longer idle 
+timer to the shorter idle timer, then we risk keeping the connection around for longer than needed, which 
+will affect the resource usage. 
+ 
+One solution to that is to have NxN ring buffers (where N is the number of workers), such that the non-owner 
+can signal to the owner the connection# that needs to be requeued out of order. 
+ 
+A simpler solution, though, is to ensure that each FIFO's period is equal to that of the shortest timer. 
+This way the resource starvation problem is taken care of, at the expense of some additional work. 
+ 
+This all looks sufficiently nice and simple until a skeleton falls out of the closet: 
+sometimes we want to clean the connections en masse before they expire. 
+ 
+There are a few potential scenarios: 
+1) removal of an ACL from the interface 
+2) removal of an interface 
+3) manual action of an operator (in the future). 
+ 
+In order to tackle this, we need to modify the logic which decides whether to requeue the 
+connection at the end of the list, or to delete it due to idle timeout: 
+ 
+We define a point in time, and have each worker thread fast-forward through its FIFO, 
+in the process looking for sessions that satisfy the criteria, and either keeping them or requeueing them. 
+ 
+To keep things simple from the outside world's point of view, we still process this as an event 
+within the connection cleaner thread, but this event handler does the following: 
+1) it creates the bitmap of the sw_if_index values requested to be cleared 
+2) for each worker, it waits until there is no cleanup operation already in progress, 
+then makes a copy of the bitmap, sets the per-worker flag of a cleanup operation, and sends an interrupt 
+3) it waits until all cleanup operations have completed. 
+ 
+Within the worker interrupt node, we check if the "cleanup in progress" flag is set, 
+and if it is, we check the "fast forward time" value. If unset, we initialize it to "now", and compare the 
+requested bitmap of sw_if_index values (pending_clear_sw_if_index_bitmap) with the bitmap of sw_if_index values that this worker deals with. 
+ 
+(We set the bit in the bitmap every time we enqueue a packet onto a FIFO - serviced_sw_if_index_bitmap in acl_fa_conn_list_add_session.) 
+ 
+If the result of this AND operation is zero, then we can clear the cleanup-in-progress flag and return. 
+Else we kick off the quantum of cleanup, and make sure we get another interrupt ASAP if that cleanup operation returns non-zero, 
+meaning there is more work to do. 
+When that operation returns zero, everything has been processed, we can clear the "cleanup-in-progress" flag, and +zeroize the bitmap of sw_if_index-es requested to be cleaned. + +The interrupt node signals its wish to receive an interrupt ASAP by setting interrupt_is_needed +flag within the per-worker structure. The main thread, while waiting for the +cleanup operation to complete, checks if there is a request for interrupt, +and if there is - it sends one. + +This approach gives us a way to mass-clean the connections which is reusing the code of the regular idle +connection cleanup. + +One potential inefficiency is the bitmap values set by the session insertion +in the data path - there is nothing to clear them. + +So, if one rearranges the interface placement with the workers, then the cleanups will cause some unnecessary work. +For now, we consider it an acceptable limitation. It can be resolved by having another per-worker bitmap, which, when set, +would trigger the cleanup of the bits in the serviced_sw_if_index_bitmap). + +=== the end === + diff --git a/src/plugins/acl/hash_lookup.md b/src/plugins/acl/hash_lookup.md deleted file mode 100644 index 95524643..00000000 --- a/src/plugins/acl/hash_lookup.md +++ /dev/null @@ -1,241 +0,0 @@ -ACL plugin constant-time lookup design -====================================== - -The initial implementation of ACL plugin performs a trivial for() cycle, -going through the assigned ACLs on a per-packet basis. This is not very -efficient, even if for very short ACLs due to its simplicity it can beat -more advanced methods. - -However, to cover the case of longer ACLs with acceptable performance, -we need to have a better way of matching. This write-up proposes -a mechanism to make a lookup from O(M) where M is number of entries -to O(N) where N is number of different mask combinations. - -Preparation of ACL(s) ---------------------- - -The ACL plugin will maintain a global list of "mask types", i.e. the specific -configurations of "do not care" bits within the ACEs. -Upon the creation of a new ACL, a pass will be made through all the -ACEs, to assign and possibly allocate the "mask type number". - -Each ACL has a structure *hash_acl_info_t* representing the "hash-based" -parts of information related to that ACL, primarily the array of -*hash_ace_info_t* structures - each of the members of that array -corresponding to one of the rules (ACEs) in the original ACL, -for this they have a pair of *(acl_index, ace_index)* to keep track, -predominantly for the debugging. - -Why do we need a whole separate structure, and are not adding new fields -to the existing rile structure ? First, encapsulation, to minimize -the pollution of the main ACL code with the hash-based lookup artifacts. - -Second, one rule may correspond to more than one "hash-based" ACE. -In fact, most of the rules do correspond to two of those. Why ? - -Consider that the current ACL lookup logic is that if a packet -is not the initial fragment, and there is an L4 entry acting on the packet, -the comparison will be made only on the L4 protocol field value rather -than on the protocol and port values. This beaviour is governed by -*l4_match_nonfirst_fragment* flag in the *acl_main*, and was needed to -maintain the compatibility with the existing software switch implementation. 
- -While for the sequential check in *single_acl_match_5tuple()* -it is very easy to implement by just breaking out at the right moment, -in case of hash-based matching this cost us two checks: -one on full 5-tuple and the flag *pkt.is_nonfirst_fragment* being zero, -the second on 3-tuple and the flag *pkt.is_nonfirst_fragment* being one, -with the second check triggered by the *acl_main.l4_match_nonfirst_fragment* -setting being the default 1. This dictates the necessity of having a "match" -field in a given *hash_ace_info_t* element, which would reflect the value -we are supposed to match after applying the mask. - -There can be other circumstances when it might be beneficial to expand -the given rule in the original ACL into multiple - for example, as an -optimization within the port range handling for small port ranges -(this is not done as of the time of writing). - -Assigning ACLs to an interface ------------------------------- - -Once the ACL list is assigned to an interface, or, rather, a new ACL -is added to the list of the existing ACLs applied to the interface, -we need to update the bihash accelerating the lookup. - -All the entries for the lookups are stored within a single *48_8* bihash, -which captures the 5-tuple from the packet as well as the miscellaneous -per-packet information flags, e.g. *l4_valid*, *is_non_first_fragment*, -and so on. To facilitate the use of the single bihash by all the interfaces, -the *is_ip6*, *is_input*, *sw_if_index* are part of the key, -as well as *mask_type_index* - the latter being necessary because -there can be entries with the same value but different masks, e.g.: -`permit ::/0, permit::/128`. - -At the moment of an ACL being applied to an interface, we need to -walk the list of *hash_ace_info_t* entries corresponding to that ACL, -and update the bihash with the keys corresponding to the match -values in these entries. - -The value of the hash match contains the index into a per-*sw_if_index* vector -of *applied_ace_hash_entry_t* elements, as well as a couple of flags: -*shadowed* (optimization: if this flag on a matched entry is zero, means -we can stop the lookup early and declare a match - see below), -and *need_portrange_check* - meaning that what matched was a superset -of the actual match, and we need to perform an extra check. - -Also, upon insertion, we must keep in mind there can be -multiple *applied_ace_hash_entry_t* for the same key and must keep -a list of those. This is necessary to incrementally apply/unapply -the ACLs as part of the ACL vector: say, two ACLs have -"permit 2001:db8::1/128 any" - we should be able to retain the entry -for the second ACL even if we have deleted the first one. -Also, in case there are two entries with the same key but -different port ranges, say 0..42 and 142..65535 - we need -to be able to sequentially match on those if we decide not -to expand them into individual port-specific entries. - -Per-packet lookup ------------------ - -The simple single-packet lookup is defined in -*multi_acl_match_get_applied_ace_index*, which returns the index -of the applied hash ACE if there was a match, or ~0 if there wasn't. - -The future optimized per-packet lookup may be batched in three phases: - -1. Prepare the keys in the per-worker vector by doing logical AND of - original 5-tuple record with the elements of the mask vector. -2. 
Lookup the keys in the bihash in a batch manner, collecting the - result with lowest u64 (acl index within vector, ACE index) from - the hash lookup value, and performing the list walk if necessary - (for portranges) -3. Take the action from the ACL record as defined by (ACL#, ACE#) from the - resulting lookup winner, or, if no match found, then perform default deny. - -Shadowed/independent/redundant ACEs ------------------------------------- - -During the phase of combining multiple ACLs into one rulebase, when they -are applied to interface, we also can perform several optimizations. - -If a given ACE is a strict subset of another ACE located up in the linear -search order, we can ignore this ACE completely - because by definition -it will never match. We will call such an ACE *redundant*. Here is an example: - -``` -permit 2001:db8:1::/48 2001:db8:2::/48 (B) -deny 2001:d8b:1:1::/64 2001:db8:2:1::/64 (A) -``` - -A bit more formally, we can define this relationship of an ACE A to ACE B as: - -``` -redundant(aceA, aceB) := (contains(protoB, protoA) && contains(srcB, srcA) - && contains(dstB, dstA) && is_after(A, B)) -``` - -Here as "contains" we define an operation operating on the sets defined by -the protocol, (srcIP, srcPortDefinition) and (dstIP, dstPortDefinition) -respectively, and returning true if all the elements represented by -the second argument are represented by the first argument. The "is_after" -is true if A is located below B in the ruleset. - -If a given ACE does not intersect at all with any other ACE -in front of it, we can mark it as such. - -Then during the sequence of the lookups the successful hit on this ACE means -we do not need to look up other mask combinations - thus potentially -significantly speeding up the match process. Here is an example, -assuming we have the following ACL: - -``` -permit 2001:db8:1::/48 2001:db8:2::/48 (B) -deny 2001:db8:3::/48 2001:db8:2:1::/64 (A) -``` - -In this case if we match the second entry, we do not need to check whether -we have matched the first one - the source addresses are completely -different. We call such an ACE *independent* from another. - -We can define this as - -``` -independent(aceA, aceB) := (!intersect(protoA, protoB) || - !intersect(srcA, srcB) || - !intersect(dstA, dstB)) -``` - -where intersect is defined as operation returning true if there are -elements belonging to the sets of both arguments. - -If the entry A is neither redundant nor independent from B, and is below -B in the ruleset, we call such an entry *shadowed* by B, here is an example: - -``` -deny tcp 2001:db8:1::/48 2001:db8:2::/48 (B) -permit 2001:d8b:1:1::/64 2001:db8:2:1::/64 (A) -``` - -This means the earlier rule "carves out" a subset of A, thus leaving -a "shadow". (Evidently, the action needs to be different for the shadow -to have an effect, but for for the terminology sake we do not care). - -The more formal definition: - -``` -shadowed(aceA, aceB) := !redundante(aceA, aceB) && - !independent(aceA, aceB) && - is_after(aceA, aceB) -``` - -Using this terminology, any ruleset can be represented as -a DAG (Directed Acyclic Graph), with the bottom being the implicit -"deny any", pointing to the set of rules shadowing it or the ones -it is redundant for. - -These rules may in turn be shadowing each other. There is no cycles in -this graph because of the natural order of the rules - the rule located -closer to the end of the ruleset can never shadow or make redundant a rule -higher up. 
- -The optimization that enables can allow for is to skip matching certain -masks on a per-lookup basis - if a given rule has matched, -the only adjustments that can happen is the match with one of -the shadowing rules. - -Also, another avenue for the optimization can be starting the lookup process -with the mask type that maximizes the chances of the independent ACE match, -thus resulting in an ACE lookup being a single hash table hit. - - -Plumbing --------- - -All the new routines are located in a separate file, -so we can cleanly experiment with a different approach if this -does not fit all of the use cases. - -The constant-time lookup within the data path has the API with -the same signature as: - -``` -u8 -multi_acl_match_5tuple (u32 sw_if_index, fa_5tuple_t * pkt_5tuple, int is_l2, - int is_ip6, int is_input, u32 * acl_match_p, - u32 * rule_match_p, u32 * trace_bitmap) -``` - -There should be a new upper-level function with the same signature, which -will make a decision whether to use a linear lookup, or to use the -constant-time lookup implemented by this work, or to add some other -optimizations (e.g. by keeping the cache of the last N lookups). - -The calls to the routine doing preparatory work should happen -in `acl_add_list()` after creating the linear-lookup structures, -and the routine doing the preparatory work populating the hashtable -should be called from `acl_interface_add_del_inout_acl()` or its callees. - -The initial implementation will be geared towards looking up a single -match at a time, with the subsequent optimizations possible to make -the lookup for more than one packet. - diff --git a/src/vlibapi/api_doc.md b/src/vlibapi/api_doc.md index e620ee12..2e7ae09a 100644 --- a/src/vlibapi/api_doc.md +++ b/src/vlibapi/api_doc.md @@ -6,12 +6,12 @@ APIs. Messages are defined in `*.api` files. Today, there are about 50 api files, with more arriving as folks add programmable features. The API file compiler -sources reside in @ref src/tools/vppapigen . +sources reside in @ref src/tools/vppapigen. -Here's a typical request/response message definition, from -@ref src/vnet/interface.api : +From @ref src/vnet/interface.api, here's a typical request/response message +definition: -``` +```{.c} autoreply define sw_interface_set_flags { u32 client_index; @@ -22,10 +22,10 @@ Here's a typical request/response message definition, from }; ``` -To a first approximation, the API compiler renders this definition as -follows: +To a first approximation, the API compiler renders this definition into +`build-root/.../vpp/include/vnet/interface.api.h` as follows: -``` +```{.c} /****** Message ID / handler enum ******/ #ifdef vl_msg_id vl_msg_id(VL_API_SW_INTERFACE_SET_FLAGS, vl_api_sw_interface_set_flags_t_handler) @@ -60,10 +60,13 @@ follows: u32 context; i32 retval; }) vl_api_sw_interface_set_flags_reply_t; + + ... + #endif /* vl_typedefs */ ``` To change the admin state of an interface, a binary api client sends a -@ref vl_api_sw_interface_set_flags_t to vpp, which will respond with a +@ref vl_api_sw_interface_set_flags_t to VPP, which will respond with a @ref vl_api_sw_interface_set_flags_reply_t message. Multiple layers of software, transport types, and shared libraries @@ -76,7 +79,7 @@ implement a variety of features: message handlers. Correctly-coded message handlers know nothing about the transport used to -deliver messages to/from vpp. It's reasonably straighforward to use multiple +deliver messages to/from VPP. 
It's reasonably straighforward to use multiple API message transport types simultaneously. For historical reasons, binary api messages are (putatively) sent in network @@ -90,12 +93,12 @@ Since binary API messages are always processed in order, we allocate messages using a ring allocator whenever possible. This scheme is extremely fast when compared with a traditional memory allocator, and doesn't cause heap fragmentation. See -@ref src/vlibmemory/memory_shared.c @ref vl_msg_api_alloc_internal() . +@ref src/vlibmemory/memory_shared.c @ref vl_msg_api_alloc_internal(). Regardless of transport, binary api messages always follow a @ref msgbuf_t header: -``` +```{.c} typedef struct msgbuf_ { unix_shared_memory_queue_t *q; @@ -109,7 +112,7 @@ This structure makes it easy to trace messages without having to decode them - simply save data_len bytes - and allows @ref vl_msg_api_free() to rapidly dispose of message buffers: -``` +```{.c} void vl_msg_api_free (void *a) { @@ -130,34 +133,34 @@ decode them - simply save data_len bytes - and allows return; } - } + } ``` ## Message Tracing and Replay -It's extremely important that vpp can capture and replay sizeable binary API +It's extremely important that VPP can capture and replay sizeable binary API traces. System-level issues involving hundreds of thousands of API transactions can be re-run in a second or less. Partial replay allows one to binary-search for the point where the wheels fall off. One can add scaffolding to the data plane, to trigger when complex conditions obtain. With binary API trace, print, and replay, system-level bug reports of the form -"after 300,000 API transactions, the vpp data-plane stopped forwarding +"after 300,000 API transactions, the VPP data-plane stopped forwarding traffic, FIX IT!" can be solved offline. More often than not, one discovers that a control-plane client misprograms the data plane after a long time or under complex circumstances. Without direct evidence, "it's a data-plane problem!" -See @ref src/vlibmemory/memory_vlib.c @ref vl_msg_api_process_file() , -and @ref src/vlibapi/api_shared.c . See also the debug CLI command "api trace" +See @ref src/vlibmemory/memory_vlib.c @ref vl_msg_api_process_file(), +and @ref src/vlibapi/api_shared.c. See also the debug CLI command "api trace" ## Client connection details -Establishing a binary API connection to vpp from a C-language client +Establishing a binary API connection to VPP from a C-language client is easy: -``` +```{.c} int connect_to_vpe (char *client_name, int client_message_queue_length) { @@ -176,9 +179,9 @@ is easy: } ``` -32 is a typical value for client_message_queue_length. Vpp cannot +32 is a typical value for client_message_queue_length. VPP cannot block when it needs to send an API message to a binary API client, and -the vpp-side binary API message handlers are very fast. When sending +the VPP-side binary API message handlers are very fast. When sending asynchronous messages, make sure to scrape the binary API rx ring with some enthusiasm. @@ -187,7 +190,7 @@ some enthusiasm. Calling @ref vl_client_connect_to_vlib spins up a binary API message RX pthread: -``` +```{.c} static void * rx_thread_fn (void *arg) { @@ -214,31 +217,31 @@ To handle the binary API message queue yourself, use @ref vl_client_connect_to_vlib_no_rx_pthread. In turn, vl_msg_api_queue_handler(...) uses mutex/condvar signalling -to wake up, process vpp -> client traffic, then sleep. 
Vpp supplies a -condvar broadcast when the vpp -> client API message queue transitions +to wake up, process VPP -> client traffic, then sleep. VPP supplies a +condvar broadcast when the VPP -> client API message queue transitions from empty to nonempty. -Vpp checks its own binary API input queue at a very high rate. Vpp +VPP checks its own binary API input queue at a very high rate. VPP invokes message handlers in "process" context [aka cooperative multitasking thread context] at a variable rate, depending on data-plane packet processing requirements. ## Client disconnection details -To disconnect from vpp, call @ref vl_client_disconnect_from_vlib -. Please arrange to call this function if the client application -terminates abnormally. Vpp makes every effort to hold a decent funeral -for dead clients, but vpp can't guarantee to free leaked memory in the +To disconnect from VPP, call @ref vl_client_disconnect_from_vlib. +Please arrange to call this function if the client application +terminates abnormally. VPP makes every effort to hold a decent funeral +for dead clients, but VPP can't guarantee to free leaked memory in the shared binary API segment. -## Sending binary API messages to vpp +## Sending binary API messages to VPP -The point of the exercise is to send binary API messages to vpp, and -to receive replies from vpp. Many vpp binary APIs comprise a client +The point of the exercise is to send binary API messages to VPP, and +to receive replies from VPP. Many VPP binary APIs comprise a client request message, and a simple status reply. For example, to set the admin status of an interface, one codes: -``` +```{.c} vl_api_sw_interface_set_flags_t *mp; mp = vl_msg_api_alloc (sizeof (*mp)); @@ -262,9 +265,9 @@ Key points: network byte order * The client-library global data structure @ref api_main keeps track - of sufficient pointers and handles used to communicate with vpp + of sufficient pointers and handles used to communicate with VPP -## Receiving binary API messages from vpp +## Receiving binary API messages from VPP Unless you've made other arrangements (see @ref vl_client_connect_to_vlib_no_rx_pthread), *messages are received on a @@ -273,7 +276,7 @@ thread is the responsibility of the application! Set up message handlers about as follows: -``` +```{.c} #define vl_typedefs /* define message structures */ #include #undef vl_typedefs @@ -319,7 +322,7 @@ vectors in the @ref api_main_t structure. As of this writing: not all vector element values can be set through the API. You'll see sporadic API message registrations followed by minor adjustments of this form: -``` +```{.c} /* * Thread-safe API messages */ -- cgit 1.2.3-korg