aboutsummaryrefslogtreecommitdiffstats
path: root/docs/developer
diff options
context:
space:
mode:
authorNathan Skrzypczak <nathan.skrzypczak@gmail.com>2021-08-19 11:38:06 +0200
committerDave Wallace <dwallacelf@gmail.com>2021-10-13 23:22:32 +0000
commit9ad39c026c8a3c945a7003c4aa4f5cb1d4c80160 (patch)
tree3cca19635417e28ae381d67ae31c75df2925032d /docs/developer
parentf47122e07e1ecd0151902a3cabe46c60a99bee8e (diff)
docs: better docs, mv doxygen to sphinx
This patch refactors the VPP sphinx docs in order to make it easier to consume for external readers as well as VPP developers. It also makes sphinx the single source of documentation, which simplifies maintenance and operation. Most important updates are: - reformat the existing documentation as rst - split RELEASE.md and move it into separate rst files - remove section 'events' - remove section 'archive' - remove section 'related projects' - remove section 'feature by release' - remove section 'Various links' - make (Configuration reference, CLI docs, developer docs) top level items in the list - move 'Use Cases' as part of 'About VPP' - move 'Troubleshooting' as part of 'Getting Started' - move test framework docs into 'Developer Documentation' - add a 'Contributing' section for gerrit, docs and other contributer related infos - deprecate doxygen and test-docs targets - redirect the "make doxygen" target to "make docs" Type: refactor Change-Id: I552a5645d5b7964d547f99b1336e2ac24e7c209f Signed-off-by: Nathan Skrzypczak <nathan.skrzypczak@gmail.com> Signed-off-by: Andrew Yourtchenko <ayourtch@gmail.com>
Diffstat (limited to 'docs/developer')
-rw-r--r--docs/developer/build-run-debug/building.rst181
-rw-r--r--docs/developer/build-run-debug/cross_compile_macos.rst70
-rw-r--r--docs/developer/build-run-debug/gdb_examples.rst141
-rw-r--r--docs/developer/build-run-debug/index.rst14
-rw-r--r--docs/developer/build-run-debug/running_vpp.rst48
-rw-r--r--docs/developer/build-run-debug/testing_vpp.rst140
-rw-r--r--docs/developer/corearchitecture/bihash.rst313
-rw-r--r--docs/developer/corearchitecture/buffer_metadata.rst237
-rw-r--r--docs/developer/corearchitecture/buildsystem/buildrootmakefile.rst353
-rw-r--r--docs/developer/corearchitecture/buildsystem/cmakeandninja.rst186
-rw-r--r--docs/developer/corearchitecture/buildsystem/index.rst14
-rw-r--r--docs/developer/corearchitecture/buildsystem/mainmakefile.rst2
-rw-r--r--docs/developer/corearchitecture/featurearcs.rst225
-rw-r--r--docs/developer/corearchitecture/index.rst21
-rw-r--r--docs/developer/corearchitecture/infrastructure.rst612
l---------docs/developer/corearchitecture/mem.rst1
-rw-r--r--docs/developer/corearchitecture/multi_thread.rst169
-rw-r--r--docs/developer/corearchitecture/multiarch/arbfns.rst87
-rw-r--r--docs/developer/corearchitecture/multiarch/index.rst12
-rw-r--r--docs/developer/corearchitecture/multiarch/nodefns.rst138
-rw-r--r--docs/developer/corearchitecture/softwarearchitecture.rst47
-rw-r--r--docs/developer/corearchitecture/vlib.rst888
-rw-r--r--docs/developer/corearchitecture/vnet.rst807
l---------docs/developer/corefeatures/bfd_doc.rst1
-rw-r--r--docs/developer/corefeatures/eventviewer.rst286
-rw-r--r--docs/developer/corefeatures/fib/attachedexport.rst50
-rw-r--r--docs/developer/corefeatures/fib/barnacles.rst78
-rw-r--r--docs/developer/corefeatures/fib/controlplane.rst23
-rw-r--r--docs/developer/corefeatures/fib/dataplane.rst100
-rw-r--r--docs/developer/corefeatures/fib/debugging.rst106
-rw-r--r--docs/developer/corefeatures/fib/fastconvergence.rst576
-rw-r--r--docs/developer/corefeatures/fib/graphs.rst34
-rw-r--r--docs/developer/corefeatures/fib/graphwalks.rst80
-rw-r--r--docs/developer/corefeatures/fib/hacking.rst68
-rw-r--r--docs/developer/corefeatures/fib/index.rst21
-rw-r--r--docs/developer/corefeatures/fib/marknsweep.rst68
-rw-r--r--docs/developer/corefeatures/fib/missing.rst110
-rw-r--r--docs/developer/corefeatures/fib/mplsfib.rst220
-rw-r--r--docs/developer/corefeatures/fib/multicast.rst106
-rw-r--r--docs/developer/corefeatures/fib/neighbors.rst88
-rw-r--r--docs/developer/corefeatures/fib/prefixes.rst17
-rw-r--r--docs/developer/corefeatures/fib/prerequisites.rst12
-rw-r--r--docs/developer/corefeatures/fib/routes.rst353
-rw-r--r--docs/developer/corefeatures/fib/scale.rst247
-rw-r--r--docs/developer/corefeatures/fib/thedatamodel.rst15
-rw-r--r--docs/developer/corefeatures/fib/tunnels.rst62
-rw-r--r--docs/developer/corefeatures/index.rst21
l---------docs/developer/corefeatures/ipfix_doc.rst1
l---------docs/developer/corefeatures/ipsec.rst1
l---------docs/developer/corefeatures/mtu.rst1
l---------docs/developer/corefeatures/punt.rst1
l---------docs/developer/corefeatures/selinux_doc.rst1
l---------docs/developer/corefeatures/span_doc.rst1
-rw-r--r--docs/developer/corefeatures/sr/index.rst14
l---------docs/developer/corefeatures/sr/sr_doc.rst1
l---------docs/developer/corefeatures/sr/sr_localsid.rst1
l---------docs/developer/corefeatures/sr/sr_mpls.rst1
l---------docs/developer/corefeatures/sr/sr_policy.rst1
l---------docs/developer/corefeatures/sr/sr_steering.rst1
l---------docs/developer/corefeatures/stats.rst1
l---------docs/developer/corefeatures/sylog_doc.rst1
l---------docs/developer/devicedrivers/af_xdp.rst1
l---------docs/developer/devicedrivers/avf.rst1
-rw-r--r--docs/developer/devicedrivers/index.rst15
l---------docs/developer/devicedrivers/rdma.rst1
l---------docs/developer/devicedrivers/vmxnet3.rst1
-rw-r--r--docs/developer/extras/index.rst17
l---------docs/developer/extras/lcov.rst1
l---------docs/developer/extras/snap.rst1
l---------docs/developer/extras/strongswan.rst1
l---------docs/developer/extras/vcl_ldpreload.rst1
l---------docs/developer/extras/vpp_config.rst1
l---------docs/developer/extras/vpp_if_stats.rst1
l---------docs/developer/extras/vpp_stats_fs.rst1
l---------docs/developer/extras/vpptop.rst1
-rw-r--r--docs/developer/plugindoc/add_plugin.rst362
l---------docs/developer/plugindoc/handoffdemo.rst1
-rw-r--r--docs/developer/plugindoc/index.rst13
l---------docs/developer/plugindoc/sample_plugin_doc.rst1
l---------docs/developer/plugins/acl_hash_lookup.rst1
l---------docs/developer/plugins/acl_lookup_context.rst1
l---------docs/developer/plugins/acl_multicore.rst1
l---------docs/developer/plugins/bufmon_doc.rst1
l---------docs/developer/plugins/cnat.rst1
l---------docs/developer/plugins/dhcp6_pd.rst1
l---------docs/developer/plugins/flowprobe.rst1
-rw-r--r--docs/developer/plugins/index.rst41
l---------docs/developer/plugins/ioam.rst1
l---------docs/developer/plugins/lacp.rst1
l---------docs/developer/plugins/lb.rst1
l---------docs/developer/plugins/lcp.rst1
l---------docs/developer/plugins/lldp.rst1
l---------docs/developer/plugins/map_lw4o6.rst1
l---------docs/developer/plugins/marvell.rst1
l---------docs/developer/plugins/mdata.rst1
l---------docs/developer/plugins/nat44_ei_ha.rst1
l---------docs/developer/plugins/nat64.rst1
l---------docs/developer/plugins/pnat.rst1
l---------docs/developer/plugins/quic.rst1
l---------docs/developer/plugins/srtp.rst1
l---------docs/developer/plugins/srv6/ad_flow_plugin_doc.rst1
l---------docs/developer/plugins/srv6/ad_plugin_doc.rst1
l---------docs/developer/plugins/srv6/am_plugin_doc.rst1
l---------docs/developer/plugins/srv6/as_plugin_doc.rst1
-rw-r--r--docs/developer/plugins/srv6/index.rst16
l---------docs/developer/plugins/srv6/mobile_plugin_doc.rst1
l---------docs/developer/plugins/srv6/runner_doc.rst1
l---------docs/developer/plugins/srv6/srv6_sample_localsid_doc.rst1
l---------docs/developer/plugins/wireguard.rst1
-rw-r--r--docs/developer/tests/overview.rst450
110 files changed, 8431 insertions, 0 deletions
diff --git a/docs/developer/build-run-debug/building.rst b/docs/developer/build-run-debug/building.rst
new file mode 100644
index 00000000000..1df838abf84
--- /dev/null
+++ b/docs/developer/build-run-debug/building.rst
@@ -0,0 +1,181 @@
+.. _building:
+
+.. toctree::
+
+Building VPP
+============
+
+To get started developing with VPP, you need to get the required VPP sources and then build the packages.
+For more detailed information on the build system please refer to :ref:`buildsystem`.
+
+.. _setupproxies:
+
+Set up Proxies
+--------------------------
+
+Depending on the environment you are operating in, proxies may need to be set.
+Run these proxy commands to specify the *proxy-server-name* and corresponding *port-number*:
+
+.. code-block:: console
+
+ $ export http_proxy=http://<proxy-server-name>.com:<port-number>
+ $ export https_proxy=https://<proxy-server-name>.com:<port-number>
+
+
+Get the VPP Sources
+-----------------------------------
+
+To get the VPP sources that are used to create the build, run the following commands:
+
+.. code-block:: console
+
+ $ git clone https://gerrit.fd.io/r/vpp
+ $ cd vpp
+
+Build VPP Dependencies
+--------------------------------------
+
+Before building a VPP image, make sure there are no FD.io VPP or DPDK packages
+installed, by entering the following commands:
+
+.. code-block:: console
+
+ $ dpkg -l | grep vpp
+ $ dpkg -l | grep -i dpdk
+
+There should be no output, or no packages shown after the above commands are run.
+
+Run the following **make** command to install the dependencies for FD.io VPP.
+
+If the download hangs at any point, then you may need to
+:ref:`set up proxies <setupproxies>` for the download to work.
+
+.. code-block:: console
+
+ $ make install-dep
+ Hit:1 http://us.archive.ubuntu.com/ubuntu xenial InRelease
+ Get:2 http://us.archive.ubuntu.com/ubuntu xenial-updates InRelease [109 kB]
+ Get:3 http://security.ubuntu.com/ubuntu xenial-security InRelease [107 kB]
+ Get:4 http://us.archive.ubuntu.com/ubuntu xenial-backports InRelease [107 kB]
+ Get:5 http://us.archive.ubuntu.com/ubuntu xenial-updates/main amd64 Packages [803 kB]
+ Get:6 http://us.archive.ubuntu.com/ubuntu xenial-updates/main i386 Packages [732 kB]
+ ...
+ ...
+ Update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/jmap to provide /usr/bin/jmap (jmap) in auto mode
+ Setting up default-jdk-headless (2:1.8-56ubuntu2) ...
+ Processing triggers for libc-bin (2.23-0ubuntu3) ...
+ Processing triggers for systemd (229-4ubuntu6) ...
+ Processing triggers for ureadahead (0.100.0-19) ...
+ Processing triggers for ca-certificates (20160104ubuntu1) ...
+ Updating certificates in /etc/ssl/certs...
+ 0 added, 0 removed; done.
+ Running hooks in /etc/ca-certificates/update.d...
+
+ done.
+ done.
+
+Build VPP (Debug)
+----------------------------
+
+This build version contains debug symbols which are useful for modifying VPP. The
+**make** command below builds a debug version of VPP. The binaries, when building the
+debug images, can be found in /build-root/vpp_debug-native.
+
+The Debug build version contains debug symbols, which are useful for troubleshooting
+or modifying VPP. The **make** command below builds a debug version of VPP. The
+binaries used for building the debug image can be found in */build-root/vpp_debug-native*.
+
+.. code-block:: console
+
+ $ make build
+ make[1]: Entering directory '/home/vagrant/vpp-master/build-root'
+ @@@@ Arch for platform 'vpp' is native @@@@
+ @@@@ Finding source for dpdk @@@@
+ @@@@ Makefile fragment found in /home/vagrant/vpp-master/build-data/packages/dpdk.mk @@@@
+ @@@@ Source found in /home/vagrant/vpp-master/dpdk @@@@
+ @@@@ Arch for platform 'vpp' is native @@@@
+ @@@@ Finding source for vpp @@@@
+ @@@@ Makefile fragment found in /home/vagrant/vpp-master/build-data/packages/vpp.mk @@@@
+ @@@@ Source found in /home/vagrant/vpp-master/src @@@@
+ ...
+ ...
+ make[5]: Leaving directory '/home/vagrant/vpp-master/build-root/build-vpp_debug-native/vpp/vpp-api/java'
+ make[4]: Leaving directory '/home/vagrant/vpp-master/build-root/build-vpp_debug-native/vpp/vpp-api/java'
+ make[3]: Leaving directory '/home/vagrant/vpp-master/build-root/build-vpp_debug-native/vpp'
+ make[2]: Leaving directory '/home/vagrant/vpp-master/build-root/build-vpp_debug-native/vpp'
+ @@@@ Installing vpp: nothing to do @@@@
+ make[1]: Leaving directory '/home/vagrant/vpp-master/build-root'
+
+Build VPP (Release Version)
+-----------------------------------------
+
+This section describes how to build the regular release version of FD.io VPP. The
+release build is optimized and does not create any debug symbols.
+The binaries used in building the release images are found in */build-root/vpp-native*.
+
+Use the **make** command below to build the release version of FD.io VPP.
+
+.. code-block:: console
+
+ $ make build-release
+
+
+Building Necessary Packages
+--------------------------------------------
+
+The package that needs to be built depends on the type of system VPP will be running on:
+
+* The :ref:`Debian package <debianpackages>` is built if VPP is going to run on Ubuntu
+* The :ref:`RPM package <rpmpackages>` is built if VPP is going to run on Centos or Redhat
+
+.. _debianpackages:
+
+Building Debian Packages
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To build the debian packages, use the following command:
+
+.. code-block:: console
+
+ $ make pkg-deb
+
+.. _rpmpackages:
+
+Building RPM Packages
+^^^^^^^^^^^^^^^^^^^^^^^
+
+To build the rpm packages, use the following command:
+
+.. code-block:: console
+
+ $ make pkg-rpm
+
+Once the packages are built they can be found in the build-root directory.
+
+.. code-block:: console
+
+ $ ls *.deb
+
+ If the packages are built correctly, then this should be the corresponding output:
+
+ vpp_18.07-rc0~456-gb361076_amd64.deb vpp-dbg_18.07-rc0~456-gb361076_amd64.deb
+ vpp-dev_18.07-rc0~456-gb361076_amd64.deb vpp-api-lua_18.07-rc0~456-gb361076_amd64.deb
+ vpp-lib_18.07-rc0~456-gb361076_amd64.deb vpp-api-python_18.07-rc0~456-gb361076_amd64.deb
+ vpp-plugins_18.07-rc0~456-gb361076_amd64.deb
+
+Finally, the created packages can be installed using the following commands. Install
+the package that corresponds to the OS that VPP will be running on:
+
+For Ubuntu:
+
+.. code-block:: console
+
+ $ sudo bash
+ # dpkg -i *.deb
+
+For Centos or Redhat:
+
+.. code-block:: console
+
+ $ sudo bash
+ # rpm -ivh *.rpm
diff --git a/docs/developer/build-run-debug/cross_compile_macos.rst b/docs/developer/build-run-debug/cross_compile_macos.rst
new file mode 100644
index 00000000000..5eec5569a8b
--- /dev/null
+++ b/docs/developer/build-run-debug/cross_compile_macos.rst
@@ -0,0 +1,70 @@
+.. _cross_compile_macos :
+
+Cross compilation on MacOS
+==========================
+
+This is a first attempt to support cross compilation of VPP on MacOS for development (linting, completion, compile_commands.json).
+
+
+**Prerequisites**
+
+* You'll need to install the following packages
+
+.. code-block:: console
+
+ $ pip3 install ply pyyaml jsonschema
+ $ brew install gnu-sed pkg-config ninja crosstool-ng
+
+* You'll also need to install ``clang-format 10.0.0`` to be able to ``make checkstyle``. This can be done with :ref:`this doc<install_clang_format_10_0_0>`
+* You should link the binaries to make them available in your path with their original names e.g. :
+
+.. code-block:: console
+
+ $ ln -s $(which gsed) /usr/local/bin/sed
+
+**Setup**
+
+* Create a `cross compile toolchain <https://crosstool-ng.github.io/>`_
+* Create a case sensitive volume and mount the toolchain in it e.g. in ``/Volumes/xchain``
+* Create a xchain.toolchain file with ``$VPP_DIR/extras/scripts/cross_compile_macos.sh conf /Volumes/xchain``
+
+For now we don't support e-build, so dpdk, rdma and quicly won't be compiled as part of ``make build``.
+
+To build with the toolchain do:
+
+.. code-block:: console
+
+ $ $VPP_DIR/extras/scripts/cross_compile_macos.sh build
+
+
+To get the compile_commands.json do:
+
+.. code-block:: console
+
+ $ $VPP_DIR/extras/scripts/cross_compile_macos.sh cc
+ $ >> ./build-root/build-vpp[_debug]-native/vpp/compile_commands.json
+
+
+
+This should build vpp on MacOS
+
+
+Good luck :)
+
+.. _install_clang_format_10_0_0 :
+
+Installing clang-format 10.0.0
+------------------------------
+
+In order to install clang-format on macOS:
+
+.. code-block:: bash
+
+ $ wget https://github.com/llvm/llvm-project/releases/download/llvmorg-10.0.0/clang+llvm-10.0.0-x86_64-apple-darwin.tar.xz
+ $ tar -xvf clang+llvm-10.0.0-x86_64-apple-darwin.tar.xz
+ $ mv clang+llvm-10.0.0-x86_64-apple-darwin /usr/local/Cellar/
+ $ sudo ln -s ../Cellar/clang+llvm-10.0.0-x86_64-apple-darwin/bin/clang-format /usr/local/bin/clang-format
+ $ sudo ln -s ../Cellar/clang+llvm-10.0.0-x86_64-apple-darwin/bin/clang-format /usr/local/bin/clang-format-10
+ $ sudo ln -s ../Cellar/clang+llvm-10.0.0-x86_64-apple-darwin/share/clang/clang-format-diff.py /usr/local/bin/clang-format-diff-10
+
+Source `Clang website <https://releases.llvm.org/download.html#git>`_
diff --git a/docs/developer/build-run-debug/gdb_examples.rst b/docs/developer/build-run-debug/gdb_examples.rst
new file mode 100644
index 00000000000..2a33f17f4da
--- /dev/null
+++ b/docs/developer/build-run-debug/gdb_examples.rst
@@ -0,0 +1,141 @@
+.. _gdb_examples:
+
+.. toctree::
+
+GDB Examples
+===============
+
+In this section we have a few useful gdb commands.
+
+Starting GDB
+----------------------------
+
+Once at the gdb prompt, VPP can be started by running the following commands:
+
+.. code-block:: console
+
+ (gdb) run -c /etc/vpp/startup.conf
+ Starting program: /scratch/vpp-master/build-root/install-vpp_debug-native/vpp/bin/vpp -c /etc/vpp/startup.conf
+ [Thread debugging using libthread_db enabled]
+ Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
+ vlib_plugin_early_init:361: plugin path /scratch/vpp-master/build-root/install-vpp_debug-native/vpp/lib/vpp_plugins:/scratch/vpp-master/build-root/install-vpp_debug-native/vpp/lib/vpp_plugins
+ ....
+
+Backtrace
+----------------------------
+
+If you encounter errors when running VPP, such as VPP terminating due to a segfault
+or abort signal, then you can run the VPP debug binary and then execute **backtrace** or **bt**.
+
+.. code-block:: console
+
+ (gdb) bt
+ #0 ip4_icmp_input (vm=0x7ffff7b89a40 <vlib_global_main>, node=0x7fffb6bb6900, frame=0x7fffb6725ac0) at /scratch/vpp-master/build-data/../src/vnet/ip/icmp4.c:187
+ #1 0x00007ffff78da4be in dispatch_node (vm=0x7ffff7b89a40 <vlib_global_main>, node=0x7fffb6bb6900, type=VLIB_NODE_TYPE_INTERNAL, dispatch_state=VLIB_NODE_STATE_POLLING, frame=0x7fffb6725ac0, last_time_stamp=1058123652965565) at /scratch/vpp-master/build-data/../src/vlib/main.c:988
+ #2 0x00007ffff78daa77 in dispatch_pending_node (vm=0x7ffff7b89a40 <vlib_global_main>, pending_frame_index=6, last_time_stamp=1058123652965565) at /scratch/vpp-master/build-data/../src/vlib/main.c:1138
+ ....
+
+Get to the GDB prompt
+---------------------------------------
+
+When VPP is running, you can get to the command prompt by pressing **CTRL+C**.
+
+Breakpoints
+---------------------------------------
+
+When at the GDB prompt, set a breakpoint by running the commands below:
+
+.. code-block:: console
+
+ (gdb) break ip4_icmp_input
+ Breakpoint 4 at 0x7ffff6b9c00b: file /scratch/vpp-master/build-data/../src/vnet/ip/icmp4.c, line 142.
+
+List the breakpoints already set:
+
+.. code-block:: console
+
+ (gdb) i b
+ Num Type Disp Enb Address What
+ 1 breakpoint keep y 0x00007ffff6b9c00b in ip4_icmp_input at /scratch/vpp-master/build-data/../src/vnet/ip/icmp4.c:142
+ breakpoint already hit 3 times
+ 2 breakpoint keep y 0x00007ffff6b9c00b in ip4_icmp_input at /scratch/vpp-master/build-data/../src/vnet/ip/icmp4.c:142
+ 3 breakpoint keep y 0x00007ffff640f646 in tw_timer_expire_timers_internal_1t_3w_1024sl_ov
+ at /scratch/vpp-master/build-data/../src/vppinfra/tw_timer_template.c:775
+
+Delete a breakpoint:
+
+.. code-block:: console
+
+ (gdb) del 2
+ (gdb) i b
+ Num Type Disp Enb Address What
+ 1 breakpoint keep y 0x00007ffff6b9c00b in ip4_icmp_input at /scratch/vpp-master/build-data/../src/vnet/ip/icmp4.c:142
+ breakpoint already hit 3 times
+ 3 breakpoint keep y 0x00007ffff640f646 in tw_timer_expire_timers_internal_1t_3w_1024sl_ov
+ at /scratch/vpp-master/build-data/../src/vppinfra/tw_timer_template.c:775
+
+Step/Next/List
+---------------------------------------
+
+Step through the code using (s)tep into, (n)ext, and list some lines before and after where you are with list.
+
+.. code-block:: console
+
+ Thread 1 "vpp_main" hit Breakpoint 1, ip4_icmp_input (vm=0x7ffff7b89a40 <vlib_global_main>, node=0x7fffb6bb6900, frame=0x7fffb6709480)
+ at /scratch/jdenisco/vpp-master/build-data/../src/vnet/ip/icmp4.c:142
+ 142 {
+ (gdb) n
+ 143 icmp4_main_t *im = &icmp4_main;
+ (
+ (gdb) list
+ 202 vlib_put_next_frame (vm, node, next, n_left_to_next);
+ 203 }
+ 204
+ 205 return frame->n_vectors;
+ 206 }
+ 207
+ 208 /* *INDENT-OFF* */
+ 209 VLIB_REGISTER_NODE (ip4_icmp_input_node,static) = {
+ 210 .function = ip4_icmp_input,
+ 211 .name = "ip4-icmp-input",
+
+Examining Data and packets
+-----------------------------------------------
+
+To look at data and packets use e(x)amine or (p)rint.
+
+
+For example in this code look at the ip packet:
+
+.. code-block:: console
+
+ (gdb) p/x *ip0
+ $3 = {{ip_version_and_header_length = 0x45, tos = 0x0, length = 0x5400,
+ fragment_id = 0x7049, flags_and_fragment_offset = 0x40, ttl = 0x40, protocol = 0x1,
+ checksum = 0x2ddd, {{src_address = {data = {0xa, 0x0, 0x0, 0x2},
+ data_u32 = 0x200000a, as_u8 = {0xa, 0x0, 0x0, 0x2}, as_u16 = {0xa, 0x200},
+ as_u32 = 0x200000a}, dst_address = {data = {0xa, 0x0, 0x0, 0xa}, data_u32 = 0xa00000a,
+ as_u8 = {0xa, 0x0, 0x0, 0xa}, as_u16 = {0xa, 0xa00}, as_u32 = 0xa00000a}},
+ address_pair = {src = {data = {0xa, 0x0, 0x0, 0x2}, data_u32 = 0x200000a,
+ as_u8 = {0xa, 0x0, 0x0, 0x2}, as_u16 = {0xa, 0x200}, as_u32 = 0x200000a},
+ dst = {data = {0xa, 0x0, 0x0, 0xa}, data_u32 = 0xa00000a, as_u8 = {0xa, 0x0, 0x0, 0xa},
+ as_u16 = {0xa, 0xa00}, as_u32 = 0xa00000a}}}}, {checksum_data_64 =
+ {0x40704954000045, 0x200000a2ddd0140}, checksum_data_64_32 = {0xa00000a}},
+ {checksum_data_32 = {0x54000045, 0x407049, 0x2ddd0140, 0x200000a, 0xa00000a}}}
+
+Then the icmp header:
+
+.. code-block:: console
+
+ (gdb) p/x *icmp0
+ $4 = {type = 0x8, code = 0x0, checksum = 0xf148}
+
+Then look at the actual bytes:
+
+.. code-block:: console
+
+ (gdb) x/50w ip0
+ 0x7fde9953510e: 0x54000045 0x00407049 0x2ddd0140 0x0200000a
+ 0x7fde9953511e: 0x0a00000a 0xf1480008 0x03000554 0x5b6b2e8a
+ 0x7fde9953512e: 0x00000000 0x000ca99a 0x00000000 0x13121110
+ 0x7fde9953513e: 0x17161514 0x1b1a1918 0x1f1e1d1c 0x23222120
diff --git a/docs/developer/build-run-debug/index.rst b/docs/developer/build-run-debug/index.rst
new file mode 100644
index 00000000000..f8bfeab0bf8
--- /dev/null
+++ b/docs/developer/build-run-debug/index.rst
@@ -0,0 +1,14 @@
+.. _build_run_debug:
+
+=======================
+Build, Run & Debug
+=======================
+
+.. toctree::
+ :maxdepth: 1
+
+ building
+ running_vpp
+ testing_vpp
+ gdb_examples
+ cross_compile_macos
diff --git a/docs/developer/build-run-debug/running_vpp.rst b/docs/developer/build-run-debug/running_vpp.rst
new file mode 100644
index 00000000000..9b33e53ec60
--- /dev/null
+++ b/docs/developer/build-run-debug/running_vpp.rst
@@ -0,0 +1,48 @@
+.. _running_vpp:
+
+.. toctree::
+
+Running VPP
+===========
+
+After building the VPP binaries, you now have several images built.
+These images are useful when you need to run VPP without installing the packages,
+for instance if you want to run VPP with GDB.
+
+Running Without GDB
+_________________________
+
+To run the VPP images that you've built without GDB, run the following commands:
+
+Running the release image:
+
+.. code-block:: console
+
+ # make run-release
+ #
+
+Running the debug image:
+
+.. code-block:: console
+
+ # make run
+ #
+
+Running With GDB
+_________________________
+
+With the following commands you can run VPP and then be dropped into the GDB prompt.
+
+Running the release image:
+
+.. code-block:: console
+
+ # make debug-release
+ (gdb)
+
+Running the debug image:
+
+.. code-block:: console
+
+ # make debug
+ (gdb)
diff --git a/docs/developer/build-run-debug/testing_vpp.rst b/docs/developer/build-run-debug/testing_vpp.rst
new file mode 100644
index 00000000000..ca9a09efb71
--- /dev/null
+++ b/docs/developer/build-run-debug/testing_vpp.rst
@@ -0,0 +1,140 @@
+Testing VPP
+===========
+
+As of this writing, the vpp source tree includes over 1,000 unit test
+vectors. Best practices prior to pushing patches for code review: make
+sure that all of the “make test” test vectors pass.
+
+We attempt to maintain the top-level “make test-help” command so that it
+accurately describes all of the “make test” options.
+
+Examples
+--------
+
+Basic test run, all test vectors, single-vpp instance, optimized image:
+
+::
+
+ $ make test
+
+10-way parallel basic test run:
+
+::
+
+ $ make TEST_JOBS=10 test
+
+Run a specific test suite (mpls, in this case):
+
+::
+
+ $ make TEST=test_mpls test
+
+Run a specific test suite, debug image, pause prior to running the test
+suite; attach to the vpp image in gdb:
+
+::
+
+ $ make TEST=xxx DEBUG=gdb test-debug
+
+Detailed Documentation
+----------------------
+
+Current “make test-help” output:
+
+::
+
+ $ make test-help
+ test - build and run (basic) functional tests
+ test-debug - build and run (basic) functional tests (debug build)
+ test-all - build and run functional and extended tests
+ test-all-debug - build and run functional and extended tests (debug build)
+ retest - run functional tests
+ retest-debug - run functional tests (debug build)
+ retest-all - run functional and extended tests
+ retest-all-debug - run functional and extended tests (debug build)
+ test-cov - generate code coverage report for test framework
+ test-gcov - build and run functional tests (gcov build)
+ test-wipe - wipe (temporary) files generated by unit tests
+ test-wipe-cov - wipe code coverage report for test framework
+ test-wipe-doc - wipe documentation for test framework
+ test-wipe-papi - rebuild vpp_papi sources
+ test-wipe-all - wipe (temporary) files generated by unit tests, docs, and coverage
+ test-shell - enter shell with test environment
+ test-shell-debug - enter shell with test environment (debug build)
+ test-checkstyle - check PEP8 compliance for test framework
+ test-refresh-deps - refresh the Python dependencies for the tests
+
+ Arguments controlling test runs:
+ V=[0|1|2] - set test verbosity level
+ 0=ERROR, 1=INFO, 2=DEBUG
+ TEST_JOBS=[<n>|auto] - use at most <n> parallel python processes for test execution, if auto, set to number of available cpus (default: 1)
+ MAX_VPP_CPUS=[<n>|auto]- use at most <n> cpus for running vpp main and worker threads, if auto, set to number of available cpus (default: auto)
+ CACHE_OUTPUT=[0|1] - cache VPP stdout/stderr and log as one block after test finishes (default: 1)
+ FAILFAST=[0|1] - fail fast if 1, complete all tests if 0
+ TIMEOUT=<timeout> - fail test suite if any single test takes longer than <timeout> (in seconds) to finish (default: 600)
+ RETRIES=<n> - retry failed tests <n> times
+ DEBUG=<type> - set VPP debugging kind
+ DEBUG=core - detect coredump and load it in gdb on crash
+ DEBUG=gdb - allow easy debugging by printing VPP PID
+ and waiting for user input before running
+ and tearing down a testcase
+ DEBUG=gdbserver - run gdb inside a gdb server, otherwise
+ same as above
+ DEBUG=attach - attach test case to already running vpp in gdb (see test-start-vpp-in-gdb)
+
+ STEP=[yes|no] - ease debugging by stepping through a testcase
+ SANITY=[yes|no] - perform sanity import of vpp-api/sanity vpp run before running tests (default: yes)
+ EXTENDED_TESTS=[1|y] - used by '[re]test-all' & '[re]test-all-debug' to run extended tests
+ TEST=<filter> - filter the set of tests:
+ by file-name - only run tests from specified file, e.g. TEST=test_bfd selects all tests from test_bfd.py
+ by file-suffix - same as file-name, but 'test_' is omitted e.g. TEST=bfd selects all tests from test_bfd.py
+ by wildcard - wildcard filter is <file>.<class>.<test function>, each can be replaced by '*'
+ e.g. TEST='test_bfd.*.*' is equivalent to above example of filter by file-name
+ TEST='bfd.*.*' is equivalent to above example of filter by file-suffix
+ TEST='bfd.BFDAPITestCase.*' selects all tests from test_bfd.py which are part of BFDAPITestCase class
+ TEST='bfd.BFDAPITestCase.test_add_bfd' selects a single test named test_add_bfd from test_bfd.py/BFDAPITestCase
+ TEST='*.*.test_add_bfd' selects all test functions named test_add_bfd from all files/classes
+
+ VARIANT=<variant> - specify which march node variant to unit test
+ e.g. VARIANT=skx test the skx march variants
+ e.g. VARIANT=icl test the icl march variants
+
+ COREDUMP_SIZE=<size> - pass <size> as unix { coredump-size <size> } argument to vpp
+ e.g. COREDUMP_SIZE=4g
+ COREDUMP_SIZE=unlimited
+ COREDUMP_COMPRESS=1 - compress core files if not debugging them
+ EXTERN_TESTS=<path> - path to out-of-tree test_<name>.py files containing test cases
+ EXTERN_PLUGINS=<path> - path to out-of-tree plugins to be loaded by vpp under test
+ EXTERN_COV_DIR=<path> - path to out-of-tree prefix, where source, object and .gcda files can be found for coverage report
+
+ PROFILE=1 - enable profiling of test framework via cProfile module
+ PROFILE_SORT_BY=opt - sort profiling report by opt - consult cProfile documentation for possible values (default: cumtime)
+ PROFILE_OUTPUT=file - output profiling info to file - use absolute path (default: stdout)
+
+ TEST_DEBUG=1 - turn on debugging of the test framework itself (expert)
+
+ SKIP_AARCH64=1 - skip tests that are failing on the ARM platform in FD.io CI
+
+ RND_SEED=seed - Seed RND with given seed
+
+ Starting VPP in GDB for use with DEBUG=attach:
+
+ test-start-vpp-in-gdb - start VPP in gdb (release)
+ test-start-vpp-debug-in-gdb - start VPP in gdb (debug)
+
+ Arguments controlling VPP in GDB runs:
+
+ VPP_IN_GDB_TMP_DIR - specify directory to run VPP IN (default: /tmp/unittest-attach-gdb)
+ VPP_IN_GDB_NO_RMDIR=0 - don't remove existing tmp dir but fail instead
+ VPP_IN_GDB_CMDLINE=1 - add 'interactive' to VPP arguments to run with command line
+
+ Creating test documentation
+ test-doc - generate documentation for test framework
+ test-wipe-doc - wipe documentation for test framework
+
+ Creating test code coverage report
+ test-cov - generate code coverage report for test framework
+ test-wipe-cov - wipe code coverage report for test framework
+
+ Verifying code-style
+ test-checkstyle - check PEP8 compliance
diff --git a/docs/developer/corearchitecture/bihash.rst b/docs/developer/corearchitecture/bihash.rst
new file mode 100644
index 00000000000..9b62baaf9cf
--- /dev/null
+++ b/docs/developer/corearchitecture/bihash.rst
@@ -0,0 +1,313 @@
+Bounded-index Extensible Hashing (bihash)
+=========================================
+
+Vpp uses bounded-index extensible hashing to solve a variety of
+exact-match (key, value) lookup problems. Benefits of the current
+implementation:
+
+- Very high record count scaling, tested to 100,000,000 records.
+- Lookup performance degrades gracefully as the number of records
+ increases
+- No reader locking required
+- Template implementation, it’s easy to support arbitrary (key,value)
+ types
+
+Bounded-index extensible hashing has been widely used in databases for
+decades.
+
+Bihash uses a two-level data structure:
+
+::
+
+ +-----------------+
+ | bucket-0 |
+ | log2_size |
+ | backing store |
+ +-----------------+
+ | bucket-1 |
+ | log2_size | +--------------------------------+
+ | backing store | --------> | KVP_PER_PAGE * key-value-pairs |
+ +-----------------+ | page 0 |
+ ... +--------------------------------+
+ +-----------------+ | KVP_PER_PAGE * key-value-pairs |
+ | bucket-2**N-1 | | page 1 |
+ | log2_size | +--------------------------------+
+ | backing store | ---
+ +-----------------+ +--------------------------------+
+ | KVP_PER_PAGE * key-value-pairs |
+ | page 2**(log2(size)) - 1 |
+ +--------------------------------+
+
+Discussion of the algorithm
+---------------------------
+
+This structure has a couple of major advantages. In practice, each
+bucket entry fits into a 64-bit integer. Coincidentally, vpp’s target
+CPU architectures support 64-bit atomic operations. When modifying the
+contents of a specific bucket, we do the following:
+
+- Make a working copy of the bucket’s backing storage
+- Atomically swap a pointer to the working copy into the bucket array
+- Change the original backing store data
+- Atomically swap back to the original
+
+So, no reader locking is required to search a bihash table.
+
+At lookup time, the implementation computes a key hash code. We use the
+least-significant N bits of the hash to select the bucket.
+
+With the bucket in hand, we learn log2 (nBackingPages) for the selected
+bucket. At this point, we use the next log2_size bits from the hash code
+to select the specific backing page in which the (key,value) pair will
+be found.
+
+Net result: we search **one** backing page, not 2**log2_size pages. This
+is a key property of the algorithm.
+
+When sufficient collisions occur to fill the backing pages for a given
+bucket, we double the bucket size, rehash, and deal the bucket contents
+into a double-sized set of backing pages. In the future, we may
+represent the size as a linear combination of two powers-of-two, to
+increase space efficiency.
+
+To solve the “jackpot case” where a set of records collide under hashing
+in a bad way, the implementation will fall back to linear search across
+2**log2_size backing pages on a per-bucket basis.
+
+To maintain *space* efficiency, we should configure the bucket array so
+that backing pages are effectively utilized. Lookup performance tends to
+change *very little* if the bucket array is too small or too large.
+
+Bihash depends on selecting an effective hash function. If one were to
+use a truly broken hash function such as “return 1ULL”, bihash would
+still work, but it would be equivalent to a poorly-programmed linear
+search.
+
+We often use cpu intrinsic functions - think crc32 - to rapidly compute
+a hash code which has decent statistics.
+
+Bihash Cookbook
+---------------
+
+Using current (key,value) template instance types
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It’s quite easy to use one of the template instance types. As of this
+writing, …/src/vppinfra provides pre-built templates for 8, 16, 20, 24,
+40, and 48 byte keys, u8 \* vector keys, and 8 byte values.
+
+See …/src/vppinfra/bihash\_<key-size>\_8.h
+
+To define the data types, #include a specific template instance, most
+often in a subsystem header file:
+
+.. code:: c
+
+ #include <vppinfra/bihash_8_8.h>
+
+If you’re building a standalone application, you’ll need to define the
+various functions by #including the method implementation file in a C
+source file.
+
+The core vpp engine currently uses most if not all of the known bihash
+types, so you probably won’t need to #include the method implementation
+file.
+
+.. code:: c
+
+ #include <vppinfra/bihash_template.c>
+
+Add an instance of the selected bihash data structure to e.g. a “main_t”
+structure:
+
+.. code:: c
+
+ typedef struct
+ {
+ ...
+ BVT (clib_bihash) hash_table;
+ or
+ clib_bihash_8_8_t hash_table;
+ ...
+ } my_main_t;
+
+The BV macro concatenates its argument with the value of the preprocessor
+symbol BIHASH_TYPE. The BVT macro concatenates its argument with the
+value of BIHASH_TYPE and the fixed-string “_t”. So in the above example,
+BVT (clib_bihash) generates “clib_bihash_8_8_t”.
+
+If you’re sure you won’t decide to change the template / type name
+later, it’s perfectly OK to code “clib_bihash_8_8_t” and so forth.
+
+In fact, if you #include multiple template instances in a single source
+file, you **must** use fully-enumerated type names. The macros stand no
+chance of working.
+
+Initializing a bihash table
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Call the init function as shown. As a rough guide, pick a number of
+buckets which is approximately
+number_of_expected_records/BIHASH_KVP_PER_PAGE from the relevant
+template instance header-file. See previous discussion.
+
+The amount of memory selected should easily contain all of the records,
+with a generous allowance for hash collisions. Bihash memory is
+allocated separately from the main heap, and won’t cost anything except
+kernel PTE’s until touched, so it’s OK to be reasonably generous.
+
+For example:
+
+.. code:: c
+
+ my_main_t *mm = &my_main;
+ clib_bihash_8_8_t *h;
+
+ h = &mm->hash_table;
+
+ clib_bihash_init_8_8 (h, "test", (u32) number_of_buckets,
+ (uword) memory_size);
+
+Add or delete a key/value pair
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use BV(clib_bihash_add_del), or the explicit type variant:
+
+.. code:: c
+
+ clib_bihash_kv_8_8_t kv;
+ my_main_t *mm = &my_main;
+ clib_bihash_8_8_t *h;
+
+ h = &mm->hash_table;
+ kv.key = key_to_add_or_delete;
+ kv.value = value_to_add_or_delete;
+
+ clib_bihash_add_del_8_8 (h, &kv, is_add /* 1=add, 0=delete */);
+
+In the delete case, kv.value is irrelevant. To change the value
+associated with an existing (key,value) pair, simply re-add the [new]
+pair.
+
+Simple search
+~~~~~~~~~~~~~
+
+The simplest possible (key, value) search goes like so:
+
+.. code:: c
+
+ clib_bihash_kv_8_8_t search_kv, return_kv;
+ my_main_t *mm = &my_main;
+ clib_bihash_8_8_t *h;
+
+ h = &mm->hash_table;
+ search_kv.key = key_to_add_or_delete;
+
+ if (clib_bihash_search_8_8 (h, &search_kv, &return_kv) < 0)
+ key_not_found();
+ else
+ key_found();
+
+Note that it’s perfectly fine to collect the lookup result in the same
+key/value structure that holds the search key:
+
+.. code:: c
+
+ if (clib_bihash_search_8_8 (h, &search_kv, &search_kv))
+ key_not_found();
+ etc.
+
+Bihash vector processing
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+When processing a vector of packets which need a certain lookup
+performed, it’s worth the trouble to compute the key hash, and prefetch
+the correct bucket ahead of time.
+
+Here’s a sketch of one way to write the required code:
+
+Dual-loop:
+
+- 6 packets ahead, prefetch 2x vlib_buffer_t’s and 2x packet data
+  required to form the record keys
+- 4 packets ahead, form 2x record keys and call BV(clib_bihash_hash)
+  or the explicit hash function to calculate the record hashes. Call
+  2x BV(clib_bihash_prefetch_bucket) to prefetch the buckets
+- 2 packets ahead, call 2x BV(clib_bihash_prefetch_data) to prefetch
+  2x (key,value) data pages.
+- In the processing section, call 2x
+  BV(clib_bihash_search_inline_with_hash) to perform the search
+
+Programmer’s choice whether to stash the hash code somewhere in
+vnet_buffer(b) metadata, or to use local variables.
+
+Single-loop:
+
+- Use simple search as shown above.
+
+Walking a bihash table
+~~~~~~~~~~~~~~~~~~~~~~
+
+A fairly common scenario to build “show” commands involves walking a
+bihash table. It’s simple enough:
+
+.. code:: c
+
+ my_main_t *mm = &my_main;
+ clib_bihash_8_8_t *h;
+ void callback_fn (clib_bihash_kv_8_8_t *, void *);
+
+ h = &mm->hash_table;
+
+ BV(clib_bihash_foreach_key_value_pair) (h, callback_fn, (void *) arg);
+
+To nobody’s great surprise: clib_bihash_foreach_key_value_pair iterates
+across the entire table, calling callback_fn with active entries.
+
+Bihash table iteration safety
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The iterator template “clib_bihash_foreach_key_value_pair” must be used
+with a certain amount of care. For one thing, the iterator template does
+*not* take the bihash hash table writer lock. If your use-case requires
+it, lock the table.
+
+For another, the iterator template is not safe under all conditions:
+
+- It’s **OK to delete** bihash table entries during a table-walk. The
+ iterator checks whether the current bucket has been freed after each
+ *callback_fn(…)* invocation.
+
+- It is **not OK to add** entries during a table-walk.
+
+The add-during-walk case involves a jackpot: while processing a
+key-value-pair in a particular bucket, add a certain number of entries.
+By luck, assume that one or more of the added entries causes the
+**current bucket** to split-and-rehash.
+
+Since we rehash KVP’s to different pages based on what amounts to a
+different hash function, either of these things can go wrong:
+
+- We may revisit previously-visited entries. Depending on how one coded
+ the use-case, we could end up in a recursive-add situation.
+
+- We may skip entries that have not been visited
+
+One could build an add-safe iterator, at a significant cost in
+performance: copy the entire bucket, and walk the copy.
+
+It’s hard to imagine a worthwhile add-during-walk use-case in the first
+place, let alone one which couldn’t be implemented by walking the table
+without modifying it, then adding a set of records.
+
+Creating a new template instance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Creating a new template is easy. Use one of the existing templates as a
+model, and make the obvious changes. The hash and key_compare methods
+are performance-critical in multiple senses.
+
+If the key compare method is slow, every lookup will be slow. If the
+hash function is slow, same story. If the hash function has poor
+statistical properties, space efficiency will suffer. In the limit, a
+bad enough hash function will cause large portions of the table to
+revert to linear search.
+
+Use of the best available vector unit is well worth the trouble in the
+hash and key_compare functions.
diff --git a/docs/developer/corearchitecture/buffer_metadata.rst b/docs/developer/corearchitecture/buffer_metadata.rst
new file mode 100644
index 00000000000..545c31f3041
--- /dev/null
+++ b/docs/developer/corearchitecture/buffer_metadata.rst
@@ -0,0 +1,237 @@
+Buffer Metadata
+===============
+
+Each vlib_buffer_t (packet buffer) carries buffer metadata which
+describes the current packet-processing state. The underlying techniques
+have been used for decades, across multiple packet processing
+environments.
+
+We will examine vpp buffer metadata in some detail, but folks who need
+to manipulate and/or extend the scheme should expect to do a certain
+level of code inspection.
+
+Vlib (Vector library) primary buffer metadata
+---------------------------------------------
+
+The first 64 octets of each vlib_buffer_t carry the primary buffer
+metadata. See …/src/vlib/buffer.h for full details.
+
+Important fields:
+
+- i16 current_data: the signed offset in data[], pre_data[] that we are
+ currently processing. If negative, the current header points into the
+ pre-data (rewrite space) area.
+- u16 current_length: nBytes between current_data and the end of this
+ buffer.
+- u32 flags: Buffer flag bits. Heavily used, not many bits left
+
+ - src/vlib/buffer.h flag bits
+
+ - VLIB_BUFFER_IS_TRACED: buffer is traced
+ - VLIB_BUFFER_NEXT_PRESENT: buffer has multiple chunks
+ - VLIB_BUFFER_TOTAL_LENGTH_VALID:
+ total_length_not_including_first_buffer is valid (see below)
+
+ - src/vnet/buffer.h flag bits
+
+ - VNET_BUFFER_F_L4_CHECKSUM_COMPUTED: tcp/udp checksum has been
+ computed
+ - VNET_BUFFER_F_L4_CHECKSUM_CORRECT: tcp/udp checksum is correct
+ - VNET_BUFFER_F_VLAN_2_DEEP: two vlan tags present
+ - VNET_BUFFER_F_VLAN_1_DEEP: one vlan tag present
+ - VNET_BUFFER_F_SPAN_CLONE: packet has already been cloned (span
+ feature)
+ - VNET_BUFFER_F_LOOP_COUNTER_VALID: packet look-up loop count
+ valid
+ - VNET_BUFFER_F_LOCALLY_ORIGINATED: packet built by vpp
+ - VNET_BUFFER_F_IS_IP4: packet is ipv4, for checksum offload
+ - VNET_BUFFER_F_IS_IP6: packet is ipv6, for checksum offload
+ - VNET_BUFFER_F_OFFLOAD_IP_CKSUM: hardware ip checksum offload
+ requested
+ - VNET_BUFFER_F_OFFLOAD_TCP_CKSUM: hardware tcp checksum offload
+ requested
+ - VNET_BUFFER_F_OFFLOAD_UDP_CKSUM: hardware udp checksum offload
+ requested
+ - VNET_BUFFER_F_IS_NATED: natted packet, skip input checks
+ - VNET_BUFFER_F_L2_HDR_OFFSET_VALID: L2 header offset valid
+ - VNET_BUFFER_F_L3_HDR_OFFSET_VALID: L3 header offset valid
+ - VNET_BUFFER_F_L4_HDR_OFFSET_VALID: L4 header offset valid
+ - VNET_BUFFER_F_FLOW_REPORT: packet is an ipfix packet
+ - VNET_BUFFER_F_IS_DVR: packet to be reinjected into the l2
+ output path
+ - VNET_BUFFER_F_QOS_DATA_VALID: QoS data valid in
+ vnet_buffer_opaque2
+ - VNET_BUFFER_F_GSO: generic segmentation offload requested
+ - VNET_BUFFER_F_AVAIL1: available bit
+ - VNET_BUFFER_F_AVAIL2: available bit
+ - VNET_BUFFER_F_AVAIL3: available bit
+ - VNET_BUFFER_F_AVAIL4: available bit
+ - VNET_BUFFER_F_AVAIL5: available bit
+ - VNET_BUFFER_F_AVAIL6: available bit
+ - VNET_BUFFER_F_AVAIL7: available bit
+
+- u32 flow_id: generic flow identifier
+- u8 ref_count: buffer reference / clone count (e.g. for span
+ replication)
+- u8 buffer_pool_index: buffer pool index which owns this buffer
+- vlib_error_t (u16) error: error code for buffers enqueued to error
+ handler
+- u32 next_buffer: buffer index of next buffer in chain. Only valid if
+ VLIB_BUFFER_NEXT_PRESENT is set
+- union
+
+ - u32 current_config_index: current index on feature arc
+ - u32 punt_reason: reason code once packet punted. Mutually
+ exclusive with current_config_index
+
+- u32 opaque[10]: primary vnet-layer opaque data (see below)
+- END of first cache line / data initialized by the buffer allocator
+- u32 trace_index: buffer’s index in the packet trace subsystem
+- u32 total_length_not_including_first_buffer: see
+ VLIB_BUFFER_TOTAL_LENGTH_VALID above
+- u32 opaque2[14]: secondary vnet-layer opaque data (see below)
+- u8 pre_data[VLIB_BUFFER_PRE_DATA_SIZE]: rewrite space, often used to
+ prepend tunnel encapsulations
+- u8 data[0]: buffer data received from the wire. Ordinarily, hardware
+ devices use b->data[0] as the DMA target but there are exceptions. Do
+ not write code which blindly assumes that packet data starts in
+ b->data[0]. Use vlib_buffer_get_current(…).
+
+Vnet (network stack) primary buffer metadata
+--------------------------------------------
+
+Vnet primary buffer metadata occupies space reserved in the vlib opaque
+field shown above, and has the type name vnet_buffer_opaque_t.
+Ordinarily accessed using the vnet_buffer(b) macro. See
+../src/vnet/buffer.h for full details.
+
+Important fields:
+
+- u32 sw_if_index[2]: RX and TX interface handles. At the ip lookup
+ stage, vnet_buffer(b)->sw_if_index[VLIB_TX] is interpreted as a FIB
+ index.
+- i16 l2_hdr_offset: offset from b->data[0] of the packet L2 header.
+ Valid only if b->flags & VNET_BUFFER_F_L2_HDR_OFFSET_VALID is set
+- i16 l3_hdr_offset: offset from b->data[0] of the packet L3 header.
+ Valid only if b->flags & VNET_BUFFER_F_L3_HDR_OFFSET_VALID is set
+- i16 l4_hdr_offset: offset from b->data[0] of the packet L4 header.
+ Valid only if b->flags & VNET_BUFFER_F_L4_HDR_OFFSET_VALID is set
+- u8 feature_arc_index: feature arc that the packet is currently
+ traversing
+- union
+
+ - ip
+
+ - u32 adj_index[2]: adjacency from dest IP lookup in [VLIB_TX],
+ adjacency from source ip lookup in [VLIB_RX], set to ~0 until
+ source lookup done
+ - union
+
+ - generic fields
+ - ICMP fields
+ - reassembly fields
+
+ - mpls fields
+ - l2 bridging fields, only valid in the L2 path
+ - l2tpv3 fields
+ - l2 classify fields
+ - vnet policer fields
+ - MAP fields
+ - MAP-T fields
+ - ip fragmentation fields
+ - COP (whitelist/blacklist filter) fields
+ - LISP fields
+ - TCP fields
+
+ - connection index
+ - sequence numbers
+ - header and data offsets
+ - data length
+ - flags
+
+ - SCTP fields
+ - NAT fields
+ - u32 unused[6]
+
+Vnet (network stack) secondary buffer metadata
+----------------------------------------------
+
+Vnet secondary buffer metadata occupies space reserved in the vlib opaque2
+field shown above, and has the type name vnet_buffer_opaque2_t.
+Ordinarily accessed using the vnet_buffer2(b) macro. See
+../src/vnet/buffer.h for full details.
+
+Important fields:
+
+- qos fields
+
+ - u8 bits
+ - u8 source
+
+- u8 loop_counter: used to detect and report internal forwarding loops
+- group-based policy fields
+
+ - u8 flags
+ - u16 sclass: the packet’s source class
+
+- u16 gso_size: L4 payload size, persists all the way to
+ interface-output in case GSO is not enabled
+- u16 gso_l4_hdr_sz: size of the L4 protocol header
+- union
+
+ - packet trajectory tracer (largely deprecated)
+
+ - u16 \*trajectory_trace; only #if VLIB_BUFFER_TRACE_TRAJECTORY >
+ 0
+
+ - packet generator
+
+ - u64 pg_replay_timestamp: timestamp for replayed pcap trace
+ packets
+
+ - u32 unused[8]
+
+Buffer Metadata Extensions
+--------------------------
+
+Plugin developers may wish to extend either the primary or secondary
+vnet buffer opaque unions. Please perform a manual live variable
+analysis, otherwise nodes which use shared buffer metadata space may
+break things.
+
+It’s not OK to add plugin or proprietary metadata to the core vpp engine
+header files named above. Instead, proceed as follows. The example
+concerns the vnet primary buffer opaque union vnet_buffer_opaque_t. It’s
+a very simple variation to use the vnet secondary buffer opaque union
+vnet_buffer_opaque2_t.
+
+In a plugin header file:
+
+::
+
+ /* Add arbitrary buffer metadata */
+ #include <vnet/buffer.h>
+
+ typedef struct
+ {
+ u32 my_stuff[6];
+ } my_buffer_opaque_t;
+
+ STATIC_ASSERT (sizeof (my_buffer_opaque_t) <=
+ STRUCT_SIZE_OF (vnet_buffer_opaque_t, unused),
+ "Custom meta-data too large for vnet_buffer_opaque_t");
+
+ #define my_buffer_opaque(b) \
+ ((my_buffer_opaque_t *)((u8 *)((b)->opaque) + STRUCT_OFFSET_OF (vnet_buffer_opaque_t, unused)))
+
+To set data in the custom buffer opaque type given a vlib_buffer_t \*b:
+
+::
+
+ my_buffer_opaque (b)->my_stuff[2] = 123;
+
+To read data from the custom buffer opaque type:
+
+::
+
+ stuff0 = my_buffer_opaque (b)->my_stuff[2];
diff --git a/docs/developer/corearchitecture/buildsystem/buildrootmakefile.rst b/docs/developer/corearchitecture/buildsystem/buildrootmakefile.rst
new file mode 100644
index 00000000000..1eb4e6b5301
--- /dev/null
+++ b/docs/developer/corearchitecture/buildsystem/buildrootmakefile.rst
@@ -0,0 +1,353 @@
+Introduction to build-root/Makefile
+===================================
+
+The vpp build system consists of a top-level Makefile, a data-driven
+build-root/Makefile, and a set of makefile fragments. The various parts
+come together as the result of a set of well-thought-out conventions.
+
+This section describes build-root/Makefile in some detail.
+
+Repository Groups and Source Paths
+----------------------------------
+
+Current vpp workspaces comprise a single repository group. The file
+.../build-root/build-config.mk defines a key variable called
+SOURCE\_PATH. The SOURCE\_PATH variable names the set of repository
+groups. At the moment, there is only one repository group.
+
+Single pass build system, dependencies and components
+-----------------------------------------------------
+
+The vpp build system caters to components built with GNU autoconf /
+automake. Adding such components is a simple process. Dealing with
+components which use BSD-style raw Makefiles is more difficult.
+Dealing with toolchain components such as gcc, glibc, and binutils can
+be considerably more complicated.
+
+The vpp build system is a **single-pass** build system. A partial order
+must exist for any set of components: the set of (a before b) tuples
+must resolve to an ordered list. If you create a circular dependency of
+the form: (a,b) (b,c) (c,a), gmake will try to build the target list,
+but there’s a 0.0% chance that the results will be pleasant. Cut-n-paste
+mistakes in .../build-data/packages/\*.mk can produce confusing failures.
+
+In a single-pass build system, it’s best to separate libraries and
+applications which instantiate them. For example, if vpp depends on
+libfoo.a, and myapp depends on both vpp and libfoo.a, it's best to place
+libfoo.a and myapp in separate components. The build system will build
+libfoo.a, vpp, and then (as a separate component) myapp. If you try to
+build libfoo.a and myapp from the same component, it won’t work.
+
+If you absolutely, positively insist on having myapp and libfoo.a in the
+same source tree, you can create a pseudo-component in a separate .mk
+file in the .../build-data/packages/ directory. Define the code
+phoneycomponent\_source = realcomponent, and provide manual
+configure/build/install targets.
+
+Separate components for myapp, libfoo.a, and vpp is the best and easiest
+solution. However, the “mumble\_source = realsource” degree of freedom
+exists to solve intractable circular dependencies, such as: to build
+gcc-bootstrap, followed by glibc, followed by “real” gcc/g++ [which
+depends on glibc too].
+
+.../build-root
+--------------
+
+The .../build-root directory contains the repository group specification
+build-config.mk, the main Makefile, and the system-wide set of
+autoconf/automake variable overrides in config.site. We'll describe
+these files in some detail. To be clear about expectations: the main
+Makefile and config.site file are subtle and complex. It's unlikely that
+you'll need or want to modify them. Poorly planned changes in either
+place typically cause bugs that are difficult to solve.
+
+.../build-root/build-config.mk
+------------------------------
+
+As described above, the build-config.mk file is straightforward: it sets
+the make variable SOURCE\_PATH to a list of repository group absolute
+paths.
+
+If you choose to move a workspace, make sure
+to modify the paths defined by the SOURCE\_PATH variable. Those paths
+need to match changes you make in the workspace paths. For example, if
+you place the vpp directory in the workspace of a user named jsmith, you
+might change the SOURCE\_PATH to:
+
+SOURCE\_PATH = /home/jsmith/workspace/vpp
+
+The "out of the box" setting should work 99.5% of the time:
+
+::
+
+ SOURCE_PATH = $(CURDIR)/..
+
+.../vpp/build-root/Makefile
+---------------------------
+
+The main Makefile is complex in a number of dimensions. If you think you
+need to modify it, it's a good idea to do some research, or ask for
+advice before you change it.
+
+The main Makefile was organized and designed to provide the following
+characteristics: excellent performance, accurate dependency processing,
+cache enablement, timestamp optimizations, git integration,
+extensibility, builds with cross-compilation tool chains, and builds
+with embedded Linux distributions.
+
+If you really need to do so, you can build double-cross tools with it,
+with a minimum amount of fuss. For example, you could: compile gdb on
+x86\_64, to run on PowerPC, to debug the Xtensa instruction set.
+
+The PLATFORM variable
+---------------------
+
+The PLATFORM make/environment variable controls a number of important
+characteristics, primarily:
+
+- CPU architecture
+- The list of images to build.
+
+With respect to .../build-root/Makefile, the list of images to build is
+specified by the target. For example:
+
+::
+
+ make PLATFORM=vpp TAG=vpp_debug install-deb
+
+builds vpp debug Debian packages.
+
+The main Makefile interprets $PLATFORM by attempting to "-include" the
+file .../build-data/platforms.mk:
+
+::
+
+ $(foreach d,$(FULL_SOURCE_PATH), \
+ $(eval -include $(d)/platforms.mk))
+
+By convention, we don't define **platforms** in the
+.../build-data/platforms.mk file.
+
+In the vpp case, we search for platform definition makefile fragments in
+.../vpp/build-data/platforms.mk, as follows:
+
+::
+
+ $(foreach d,$(SOURCE_PATH_BUILD_DATA_DIRS), \
+ $(eval -include $(d)/platforms/*.mk))
+
+With vpp, which uses the "vpp" platform as discussed above, we end up
+"-include"-ing .../vpp/build-data/platforms/vpp.mk.
+
+The platform-specific .mk fragment
+----------------------------------
+
+Here are the contents of .../build-data/platforms/vpp.mk:
+
+::
+
+ MACHINE=$(shell uname -m)
+
+ vpp_arch = native
+ ifeq ($(TARGET_PLATFORM),thunderx)
+ vpp_dpdk_target = arm64-thunderx-linuxapp-gcc
+ endif
+ vpp_native_tools = vppapigen
+
+ vpp_uses_dpdk = yes
+
+ # Uncomment to enable building unit tests
+ # vpp_enable_tests = yes
+
+ vpp_root_packages = vpp
+
+ # DPDK configuration parameters
+ # vpp_uses_dpdk_mlx4_pmd = yes
+ # vpp_uses_dpdk_mlx5_pmd = yes
+ # vpp_uses_external_dpdk = yes
+ # vpp_dpdk_inc_dir = /usr/include/dpdk
+ # vpp_dpdk_lib_dir = /usr/lib
+ # vpp_dpdk_shared_lib = yes
+
+ # Use '--without-libnuma' for non-numa aware architecture
+ # Use '--enable-dlmalloc' to use dlmalloc instead of mheap
+ vpp_configure_args_vpp = --enable-dlmalloc
+ sample-plugin_configure_args_vpp = --enable-dlmalloc
+
+ # load balancer plugin is not portable on 32 bit platform
+ ifeq ($(MACHINE),i686)
+ vpp_configure_args_vpp += --disable-lb-plugin
+ endif
+
+ vpp_debug_TAG_CFLAGS = -g -O0 -DCLIB_DEBUG \
+ -fstack-protector-all -fPIC -Werror
+ vpp_debug_TAG_CXXFLAGS = -g -O0 -DCLIB_DEBUG \
+ -fstack-protector-all -fPIC -Werror
+ vpp_debug_TAG_LDFLAGS = -g -O0 -DCLIB_DEBUG \
+ -fstack-protector-all -fPIC -Werror
+
+ vpp_TAG_CFLAGS = -g -O2 -D_FORTIFY_SOURCE=2 -fstack-protector -fPIC -Werror
+ vpp_TAG_CXXFLAGS = -g -O2 -D_FORTIFY_SOURCE=2 -fstack-protector -fPIC -Werror
+ vpp_TAG_LDFLAGS = -g -O2 -D_FORTIFY_SOURCE=2 -fstack-protector -fPIC -Werror -pie -Wl,-z,now
+
+ vpp_clang_TAG_CFLAGS = -g -O2 -D_FORTIFY_SOURCE=2 -fstack-protector -fPIC -Werror
+ vpp_clang_TAG_LDFLAGS = -g -O2 -D_FORTIFY_SOURCE=2 -fstack-protector -fPIC -Werror
+
+ vpp_gcov_TAG_CFLAGS = -g -O0 -DCLIB_DEBUG -fPIC -Werror -fprofile-arcs -ftest-coverage
+ vpp_gcov_TAG_LDFLAGS = -g -O0 -DCLIB_DEBUG -fPIC -Werror -coverage
+
+ vpp_coverity_TAG_CFLAGS = -g -O2 -fPIC -Werror -D__COVERITY__
+ vpp_coverity_TAG_LDFLAGS = -g -O2 -fPIC -Werror -D__COVERITY__
+
+Note the following variable settings:
+
+- The variable <platform>\_arch sets the CPU architecture used to build the
+ per-platform cross-compilation toolchain. With the exception of the
+ "native" architecture - used in our example - the vpp build system
+ produces cross-compiled binaries.
+
+- The variable <platform>\_native\_tools lists the required set of self-compiled
+ build tools.
+
+- The variable <platform>\_root\_packages lists the set of images to build when
+ specifying the target: make PLATFORM=<platform> TAG=<tag> [install-deb \|
+ install-rpm].
+
+The TAG variable
+----------------
+
+The TAG variable indirectly sets CFLAGS and LDFLAGS, as well as the
+build and install directory names in the .../vpp/build-root directory.
+See definitions above.
+
+Important targets build-root/Makefile
+-------------------------------------
+
+The main Makefile and the various makefile fragments implement the
+following user-visible targets:
+
++------------------+----------------------+--------------------------------------------------------------------------------------+
+| Target | ENV Variable Settings| Notes |
+| | | |
++==================+======================+======================================================================================+
+| foo | bar | mumble |
++------------------+----------------------+--------------------------------------------------------------------------------------+
+| bootstrap-tools | none | Builds the set of native tools needed by the vpp build system to |
+| | | build images. Example: vppapigen. In a full cross compilation case this might |
+| | | include "make", "git", "find", and "tar". |
++------------------+----------------------+--------------------------------------------------------------------------------------+
+| install-tools | PLATFORM | Builds the tool chain for the indicated <platform>. Not used in vpp builds |
++------------------+----------------------+--------------------------------------------------------------------------------------+
+| distclean | none | Roto-rooters everything in sight: toolchains, images, and so forth. |
++------------------+----------------------+--------------------------------------------------------------------------------------+
+| install-deb | PLATFORM and TAG | Build Debian packages comprising components listed in <platform>_root_packages, |
+| | | using compile / link options defined by TAG. |
++------------------+----------------------+--------------------------------------------------------------------------------------+
+| install-rpm | PLATFORM and TAG | Build RPMs comprising components listed in <platform>_root_packages, |
+| | | using compile / link options defined by TAG. |
++------------------+----------------------+--------------------------------------------------------------------------------------+
+
+Additional build-root/Makefile environment variable settings
+------------------------------------------------------------
+
+These variable settings may be of use:
+
++----------------------+------------------------------------------------------------------------------------------------------------+
+| ENV Variable | Notes |
++======================+============================================================================================================+
+| BUILD_DEBUG=vx | Directs Makefile et al. to make a good-faith effort to show what's going on in excruciating detail. |
+| | Use it as follows: "make ... BUILD_DEBUG=vx". Fairly effective in Makefile debug situations. |
++----------------------+------------------------------------------------------------------------------------------------------------+
+| V=1 | print detailed cc / ld command lines. Useful for discovering if -DFOO=11 is in the command line or not |
++----------------------+------------------------------------------------------------------------------------------------------------+
+| CC=mygcc | Override the configured C-compiler |
++----------------------+------------------------------------------------------------------------------------------------------------+
+
+.../build-root/config.site
+--------------------------
+
+The contents of .../build-root/config.site override individual autoconf /
+automake default variable settings. Here are a few sample settings related to
+building a full toolchain:
+
+::
+
+ # glibc needs these setting for cross compiling
+ libc_cv_forced_unwind=yes
+ libc_cv_c_cleanup=yes
+ libc_cv_ssp=no
+
+Determining the set of variables which need to be overridden, and the
+override values is a matter of trial and error. It should be
+unnecessary to modify this file for use with fd.io vpp.
+
+.../build-data/platforms.mk
+---------------------------
+
+Each repo group includes the platforms.mk file, which is included by
+the main Makefile. The vpp/build-data/platforms.mk file is not terribly
+complex. As of this writing, the .../build-data/platforms.mk file accomplishes two
+tasks.
+
+First, it includes vpp/build-data/platforms/\*.mk:
+
+::
+
+ # Pick up per-platform makefile fragments
+ $(foreach d,$(SOURCE_PATH_BUILD_DATA_DIRS), \
+ $(eval -include $(d)/platforms/*.mk))
+
+This collects the set of platform definition makefile fragments, as discussed above.
+
+Second, platforms.mk implements the user-visible "install-deb" target.
+
+.../build-data/packages/\*.mk
+-----------------------------
+
+Each component needs a makefile fragment in order for the build system
+to recognize it. The per-component makefile fragments vary
+considerably in complexity. For a component built with GNU autoconf /
+automake which does not depend on other components, the make fragment
+can be empty. See .../build-data/packages/vpp.mk for an uncomplicated
+but fully realistic example.
+
+Here are some of the important variable settings in per-component makefile fragments:
+
++----------------------+------------------------------------------------------------------------------------------------------------+
+| Variable | Notes |
++======================+============================================================================================================+
+| xxx_configure_depend | Lists the set of component build dependencies for the xxx component. In plain English: don't try to |
+| | configure this component until you've successfully built the indicated targets. Almost always, |
+| | xxx_configure_depend will list a set of "yyy-install" targets. Note the pattern: |
+| | "variable names contain underscores, make target names contain hyphens" |
++----------------------+------------------------------------------------------------------------------------------------------------+
+| xxx_configure_args | (optional) Lists any additional arguments to pass to the xxx component "configure" script. |
+| | The main Makefile %-configure rule adds the required settings for --libdir, --prefix, and |
+| | --host (when cross-compiling) |
++----------------------+------------------------------------------------------------------------------------------------------------+
+| xxx_CPPFLAGS | Adds -I stanzas to CPPFLAGS for components upon which xxx depends. |
+| | Almost invariably "xxx_CPPFLAGS = $(call installed_includes_fn, dep1 dep2 dep3)", where dep1, dep2, and |
+| | dep3 are listed in xxx_configure_depend. It is bad practice to set "-g -O3" here. Those settings |
+| | belong in a TAG. |
++----------------------+------------------------------------------------------------------------------------------------------------+
+| xxx_LDFLAGS | Adds -Wl,-rpath -Wl,depN stanzas to LDFLAGS for components upon which xxx depends. |
+| | Almost invariably "xxx_LDFLAGS = $(call installed_lib_fn, dep1 dep2 dep3)", where dep1, dep2, and |
+| | dep3 are listed in xxx_configure_depend. It is bad manners to set "-liberty-or-death" here. |
+| | Those settings belong in Makefile.am. |
++----------------------+------------------------------------------------------------------------------------------------------------+
+
+When dealing with "irritating" components built with raw Makefiles
+which only work when building in the source tree, we use a specific
+strategy in the xxx.mk file.
+
+The strategy is simple for those components: We copy the source tree
+into .../vpp/build-root/build-xxx. This works, but completely defeats
+dependency processing. This strategy is acceptable only for 3rd party
+software which won't need extensive (or preferably any) modifications.
+
+Take a look at .../vpp/build-data/packages/dpdk.mk. When invoked, the
+dpdk_configure variable copies source code into $(PACKAGE_BUILD_DIR),
+and performs the BSD equivalent of "autoreconf -i -f" to configure the
+build area. The rest of the file is similar: a bunch of hand-rolled
+glue code which manages to make the dpdk act like a good vpp build
+citizen even though it is not.
diff --git a/docs/developer/corearchitecture/buildsystem/cmakeandninja.rst b/docs/developer/corearchitecture/buildsystem/cmakeandninja.rst
new file mode 100644
index 00000000000..580d261bdac
--- /dev/null
+++ b/docs/developer/corearchitecture/buildsystem/cmakeandninja.rst
@@ -0,0 +1,186 @@
+Introduction to cmake and ninja
+===============================
+
+Cmake plus ninja is approximately equal to GNU autotools plus GNU
+make, respectively. Both cmake and GNU autotools support self and
+cross-compilation, checking for required components and versions.
+
+- For a decent-sized project - such as vpp - build performance is drastically better with (cmake, ninja).
+
+- The cmake input language looks like an actual language, rather than a shell scripting scheme on steroids.
+
+- Ninja doesn't pretend to support manually-generated input files. Think of it as a fast, dumb robot which eats mildly legible byte-code.
+
+See the `cmake website <http://cmake.org>`_, and the `ninja website
+<https://ninja-build.org>`_ for additional information.
+
+vpp cmake configuration files
+-----------------------------
+
+The top of the vpp project cmake hierarchy lives in .../src/CMakeLists.txt.
+This file defines the vpp project, and (recursively) includes two kinds
+of files: rule/function definitions, and target lists.
+
+- Rule/function definitions live in .../src/cmake/{\*.cmake}. Although the contents of these files are simple enough to read, it shouldn't be necessary to modify them very often
+
+- Build target lists come from CMakeLists.txt files found in subdirectories, which are named in the SUBDIRS list in .../src/CMakeLists.txt
+
+::
+
+ ##############################################################################
+ # subdirs - order matters
+ ##############################################################################
+ if("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux")
+ find_package(OpenSSL REQUIRED)
+ set(SUBDIRS
+ vppinfra svm vlib vlibmemory vlibapi vnet vpp vat vcl plugins
+ vpp-api tools/vppapigen tools/g2 tools/perftool)
+ elseif("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
+ set(SUBDIRS vppinfra)
+ else()
+ message(FATAL_ERROR "Unsupported system: ${CMAKE_SYSTEM_NAME}")
+ endif()
+
+ foreach(DIR ${SUBDIRS})
+ add_subdirectory(${DIR})
+ endforeach()
+
+- The vpp cmake configuration hierarchy discovers the list of plugins to be built by searching for subdirectories in .../src/plugins which contain CMakeLists.txt files
+
+
+::
+
+ ##############################################################################
+ # find and add all plugin subdirs
+ ##############################################################################
+ FILE(GLOB files RELATIVE
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${CMAKE_CURRENT_SOURCE_DIR}/*/CMakeLists.txt
+ )
+ foreach (f ${files})
+ get_filename_component(dir ${f} DIRECTORY)
+ add_subdirectory(${dir})
+ endforeach()
+
+How to write a plugin CMakeLists.txt file
+-----------------------------------------
+
+It's really quite simple. Follow the pattern:
+
+::
+
+ add_vpp_plugin(mactime
+ SOURCES
+ mactime.c
+ node.c
+
+ API_FILES
+ mactime.api
+
+ INSTALL_HEADERS
+ mactime_all_api_h.h
+ mactime_msg_enum.h
+
+ API_TEST_SOURCES
+ mactime_test.c
+ )
+
+Adding a target elsewhere in the source tree
+--------------------------------------------
+
+Within reason, adding a subdirectory to the SUBDIRS list in
+.../src/CMakeLists.txt is perfectly OK. The indicated directory will
+need a CMakeLists.txt file.
+
+.. _building-g2:
+
+Here's how we build the g2 event data visualization tool:
+
+::
+
+ option(VPP_BUILD_G2 "Build g2 tool." OFF)
+ if(VPP_BUILD_G2)
+ find_package(GTK2 COMPONENTS gtk)
+ if(GTK2_FOUND)
+ include_directories(${GTK2_INCLUDE_DIRS})
+ add_vpp_executable(g2
+ SOURCES
+ clib.c
+ cpel.c
+ events.c
+ main.c
+ menu1.c
+ pointsel.c
+ props.c
+ g2version.c
+ view1.c
+
+ LINK_LIBRARIES vppinfra Threads::Threads m ${GTK2_LIBRARIES}
+ NO_INSTALL
+ )
+ endif()
+ endif()
+
+The g2 component is optional, and is not built by default. There are
+a couple of ways to tell cmake to include it in build.ninja [or in Makefile.]
+
+When invoking cmake manually [rarely done and not very easy], specify
+-DVPP_BUILD_G2=ON:
+
+::
+
+ $ cmake ... -DVPP_BUILD_G2=ON
+
+Take a good look at .../build-data/packages/vpp.mk to see where and
+how the top-level Makefile and .../build-root/Makefile set all of the
+cmake arguments. One strategy to enable an optional component is fairly
+obvious. Add -DVPP_BUILD_G2=ON to vpp_cmake_args.
+
+That would work, of course, but it's not a particularly elegant solution.
+
+Tinkering with build options: ccmake
+------------------------------------
+
+The easy way to set VPP_BUILD_G2 - or frankly **any** cmake
+parameter - is to install the "cmake-curses-gui" package and use
+it.
+
+- Do a straightforward vpp build using the top level Makefile, "make build" or "make build-release"
+- Adjourn to .../build-root/build-vpp-native/vpp or .../build-root/build-vpp_debug-native/vpp
+- Invoke "ccmake ." to reconfigure the project as desired
+
+Here's approximately what you'll see:
+
+::
+
+ CCACHE_FOUND /usr/bin/ccache
+ CMAKE_BUILD_TYPE
+ CMAKE_INSTALL_PREFIX /scratch/vpp-gate/build-root/install-vpp-nati
+ DPDK_INCLUDE_DIR /scratch/vpp-gate/build-root/install-vpp-nati
+ DPDK_LIB /scratch/vpp-gate/build-root/install-vpp-nati
+ MBEDTLS_INCLUDE_DIR /usr/include
+ MBEDTLS_LIB1 /usr/lib/x86_64-linux-gnu/libmbedtls.so
+ MBEDTLS_LIB2 /usr/lib/x86_64-linux-gnu/libmbedx509.so
+ MBEDTLS_LIB3 /usr/lib/x86_64-linux-gnu/libmbedcrypto.so
+ MUSDK_INCLUDE_DIR MUSDK_INCLUDE_DIR-NOTFOUND
+ MUSDK_LIB MUSDK_LIB-NOTFOUND
+ PRE_DATA_SIZE 128
+ VPP_API_TEST_BUILTIN ON
+ VPP_BUILD_G2 OFF
+ VPP_BUILD_PERFTOOL OFF
+ VPP_BUILD_VCL_TESTS ON
+ VPP_BUILD_VPPINFRA_TESTS OFF
+
+ CCACHE_FOUND: Path to a program.
+ Press [enter] to edit option Press [d] to delete an entry CMake Version 3.10.2
+ Press [c] to configure
+ Press [h] for help Press [q] to quit without generating
+ Press [t] to toggle advanced mode (Currently Off)
+
+Use the cursor to point at the VPP_BUILD_G2 line. Press the return key
+to change OFF to ON. Press "c" to regenerate build.ninja, etc.
+
+At that point "make build" or "make build-release" will build g2. And so on.
+
+Note that toggling advanced mode ["t"] gives access to substantially
+all of the cmake options, discovered directories and paths.
diff --git a/docs/developer/corearchitecture/buildsystem/index.rst b/docs/developer/corearchitecture/buildsystem/index.rst
new file mode 100644
index 00000000000..908e91e1fc1
--- /dev/null
+++ b/docs/developer/corearchitecture/buildsystem/index.rst
@@ -0,0 +1,14 @@
+.. _buildsystem:
+
+Build System
+============
+
+This guide describes the vpp build system in detail. As of this writing,
+the build system uses a mix of make / Makefiles, cmake, and ninja to
+achieve excellent build performance.
+
+.. toctree::
+
+ mainmakefile
+ cmakeandninja
+ buildrootmakefile
diff --git a/docs/developer/corearchitecture/buildsystem/mainmakefile.rst b/docs/developer/corearchitecture/buildsystem/mainmakefile.rst
new file mode 100644
index 00000000000..96b97496350
--- /dev/null
+++ b/docs/developer/corearchitecture/buildsystem/mainmakefile.rst
@@ -0,0 +1,2 @@
+Introduction to the top-level Makefile
+======================================
diff --git a/docs/developer/corearchitecture/featurearcs.rst b/docs/developer/corearchitecture/featurearcs.rst
new file mode 100644
index 00000000000..89c50e38dce
--- /dev/null
+++ b/docs/developer/corearchitecture/featurearcs.rst
@@ -0,0 +1,225 @@
+Feature Arcs
+============
+
+A significant number of vpp features are configurable on a per-interface
+or per-system basis. Rather than ask feature coders to manually
+construct the required graph arcs, we built a general mechanism to
+manage these mechanics.
+
+Specifically, feature arcs comprise ordered sets of graph nodes. Each
+feature node in an arc is independently controlled. Feature arc nodes
+are generally unaware of each other. Handing a packet to “the next
+feature node” is quite inexpensive.
+
+The feature arc implementation solves the problem of creating graph arcs
+used for steering.
+
+At the beginning of a feature arc, a bit of setup work is needed, but
+only if at least one feature is enabled on the arc.
+
+On a per-arc basis, individual feature definitions create a set of
+ordering dependencies. Feature infrastructure performs a topological
+sort of the ordering dependencies, to determine the actual feature
+order. Missing dependencies **will** lead to runtime disorder. See
+https://gerrit.fd.io/r/#/c/12753 for an example.
+
+If no partial order exists, vpp will refuse to run. Circular dependency
+loops of the form “a then b, b then c, c then a” are impossible to
+satisfy.
+
+Adding a feature to an existing feature arc
+-------------------------------------------
+
+To nobody’s great surprise, we set up feature arcs using the typical
+“macro -> constructor function -> list of declarations” pattern:
+
+.. code:: c
+
+ VNET_FEATURE_INIT (mactime, static) =
+ {
+ .arc_name = "device-input",
+ .node_name = "mactime",
+ .runs_before = VNET_FEATURES ("ethernet-input"),
+ };
+
+This creates a “mactime” feature on the “device-input” arc.
+
+Once per frame, dig up the vnet_feature_config_main_t corresponding to
+the “device-input” feature arc:
+
+.. code:: c
+
+ vnet_main_t *vnm = vnet_get_main ();
+ vnet_interface_main_t *im = &vnm->interface_main;
+ u8 arc = im->output_feature_arc_index;
+ vnet_feature_config_main_t *fcm;
+
+ fcm = vnet_feature_get_config_main (arc);
+
+Note that in this case, we’ve stored the required arc index - assigned
+by the feature infrastructure - in the vnet_interface_main_t. Where to
+put the arc index is a programmer’s decision when creating a feature
+arc.
+
+Per packet, set next0 to steer packets to the next node they should
+visit:
+
+.. code:: c
+
+ vnet_get_config_data (&fcm->config_main,
+ &b0->current_config_index /* value-result */,
+ &next0, 0 /* # bytes of config data */);
+
+Configuration data is per-feature arc, and is often unused. Note that
+it’s normal to reset next0 to divert packets elsewhere; often, to drop
+them for cause:
+
+.. code:: c
+
+ next0 = MACTIME_NEXT_DROP;
+ b0->error = node->errors[DROP_CAUSE];
+
+Creating a feature arc
+----------------------
+
+Once again, we create feature arcs using constructor macros:
+
+.. code:: c
+
+ VNET_FEATURE_ARC_INIT (ip4_unicast, static) =
+ {
+ .arc_name = "ip4-unicast",
+ .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
+ .arc_index_ptr = &ip4_main.lookup_main.ucast_feature_arc_index,
+ };
+
+In this case, we configure two arc start nodes to handle the
+“hardware-verified ip checksum or not” cases. During initialization, the
+feature infrastructure stores the arc index as shown.
+
+In the head-of-arc node, do the following to send packets along the
+feature arc:
+
+.. code:: c
+
+ ip_lookup_main_t *lm = &im->lookup_main;
+ arc = lm->ucast_feature_arc_index;
+
+Once per packet, initialize packet metadata to walk the feature arc:
+
+.. code:: c
+
+ vnet_feature_arc_start (arc, sw_if_index0, &next, b0);
+
+Enabling / Disabling features
+-----------------------------
+
+Simply call vnet_feature_enable_disable to enable or disable a specific
+feature:
+
+.. code:: c
+
+ vnet_feature_enable_disable ("device-input", /* arc name */
+ "mactime", /* feature name */
+ sw_if_index, /* Interface sw_if_index */
+ enable_disable, /* 1 => enable */
+ 0 /* (void *) feature_configuration */,
+ 0 /* feature_configuration_nbytes */);
+
+The feature_configuration opaque is seldom used.
+
+If you wish to make a feature a *de facto* system-level concept, pass
+sw_if_index=0 at all times. Sw_if_index 0 is always valid, and
+corresponds to the “local” interface.
+
+Related “show” commands
+-----------------------
+
+To display the entire set of features, use “show features [verbose]”.
+The verbose form displays arc indices, and feature indices within the
+arcs:
+
+::
+
+ $ vppctl show features verbose
+ Available feature paths
+ <snip>
+ [14] ip4-unicast:
+ [ 0]: nat64-out2in-handoff
+ [ 1]: nat64-out2in
+ [ 2]: nat44-ed-hairpin-dst
+ [ 3]: nat44-hairpin-dst
+ [ 4]: ip4-dhcp-client-detect
+ [ 5]: nat44-out2in-fast
+ [ 6]: nat44-in2out-fast
+ [ 7]: nat44-handoff-classify
+ [ 8]: nat44-out2in-worker-handoff
+ [ 9]: nat44-in2out-worker-handoff
+ [10]: nat44-ed-classify
+ [11]: nat44-ed-out2in
+ [12]: nat44-ed-in2out
+ [13]: nat44-det-classify
+ [14]: nat44-det-out2in
+ [15]: nat44-det-in2out
+ [16]: nat44-classify
+ [17]: nat44-out2in
+ [18]: nat44-in2out
+ [19]: ip4-qos-record
+ [20]: ip4-vxlan-gpe-bypass
+ [21]: ip4-reassembly-feature
+ [22]: ip4-not-enabled
+ [23]: ip4-source-and-port-range-check-rx
+ [24]: ip4-flow-classify
+ [25]: ip4-inacl
+ [26]: ip4-source-check-via-rx
+ [27]: ip4-source-check-via-any
+ [28]: ip4-policer-classify
+ [29]: ipsec-input-ip4
+ [30]: vpath-input-ip4
+ [31]: ip4-vxlan-bypass
+ [32]: ip4-lookup
+ <snip>
+
+Here, we learn that the ip4-unicast feature arc has index 14, and that
+e.g. ip4-inacl is the 25th feature in the generated partial order.
+
+To display the features currently active on a specific interface, use
+“show interface features”:
+
+::
+
+ $ vppctl show interface GigabitEthernet3/0/0 features
+ Feature paths configured on GigabitEthernet3/0/0...
+ <snip>
+ ip4-unicast:
+ nat44-out2in
+ <snip>
+
+Table of Feature Arcs
+---------------------
+
+Simply search for name-strings to track down the arc definition,
+location of the arc index, etc.
+
+::
+
+ | Arc Name |
+ |------------------|
+ | device-input |
+ | ethernet-output |
+ | interface-output |
+ | ip4-drop |
+ | ip4-local |
+ | ip4-multicast |
+ | ip4-output |
+ | ip4-punt |
+ | ip4-unicast |
+ | ip6-drop |
+ | ip6-local |
+ | ip6-multicast |
+ | ip6-output |
+ | ip6-punt |
+ | ip6-unicast |
+ | mpls-input |
+ | mpls-output |
+ | nsh-output |
diff --git a/docs/developer/corearchitecture/index.rst b/docs/developer/corearchitecture/index.rst
new file mode 100644
index 00000000000..ecd5a3cdb08
--- /dev/null
+++ b/docs/developer/corearchitecture/index.rst
@@ -0,0 +1,21 @@
+.. _corearchitecture:
+
+=================
+Core Architecture
+=================
+
+.. toctree::
+ :maxdepth: 1
+
+ softwarearchitecture
+ infrastructure
+ vlib
+ vnet
+ featurearcs
+ buffer_metadata
+ multiarch/index
+ bihash
+ buildsystem/index
+ mem
+ multi_thread
+
diff --git a/docs/developer/corearchitecture/infrastructure.rst b/docs/developer/corearchitecture/infrastructure.rst
new file mode 100644
index 00000000000..b4e1065f81e
--- /dev/null
+++ b/docs/developer/corearchitecture/infrastructure.rst
@@ -0,0 +1,612 @@
+VPPINFRA (Infrastructure)
+=========================
+
+The files associated with the VPP Infrastructure layer are located in
+the ``./src/vppinfra`` folder.
+
+VPPinfra is a collection of basic c-library services, quite sufficient
+to build standalone programs to run directly on bare metal. It also
+provides high-performance dynamic arrays, hashes, bitmaps,
+high-precision real-time clock support, fine-grained event-logging, and
+data structure serialization.
+
+One fair comment / fair warning about vppinfra: you can't always tell a
+macro from an inline function from an ordinary function simply by name.
+Macros are used to avoid function calls in the typical case, and to
+cause (intentional) side-effects.
+
+Vppinfra has been around for almost 20 years and tends not to change
+frequently. The VPP Infrastructure layer contains the following
+functions:
+
+Vectors
+-------
+
+Vppinfra vectors are ubiquitous dynamically resized arrays with
+user-defined "headers". Many vppinfra data structures (e.g. hash, heap,
+pool) are vectors with various different headers.
+
+The memory layout looks like this:
+
+::
+
+ User header (optional, uword aligned)
+ Alignment padding (if needed)
+ Vector length in elements
+ User's pointer -> Vector element 0
+ Vector element 1
+ ...
+ Vector element N-1
+
+As shown above, the vector APIs deal with pointers to the 0th element of
+a vector. Null pointers are valid vectors of length zero.
+
+To avoid thrashing the memory allocator, one often resets the length of
+a vector to zero while retaining the memory allocation. Set the vector
+length field to zero via the vec_reset_length(v) macro. [Use the macro!
+It’s smart about NULL pointers.]
+
+Typically, the user header is not present. User headers allow for other
+data structures to be built atop vppinfra vectors. Users may specify the
+alignment for the first data element of a vector via the ``vec_*_aligned``
+macros.
+
+Vector elements can be any C type e.g. (int, double, struct bar). This
+is also true for data types built atop vectors (e.g. heap, pool, etc.).
+Many macros have \_a variants supporting alignment of vector elements
+and \_h variants supporting non-zero-length vector headers. The \_ha
+variants support both. Additionally cacheline alignment within a vector
+element structure can be specified using the
+``CLIB_CACHE_LINE_ALIGN_MARK`` macro.
+
+Inconsistent usage of header and/or alignment related macro variants
+will cause delayed, confusing failures.
+
+Standard programming error: memorize a pointer to the ith element of a
+vector, and then expand the vector. Vectors expand by 3/2, so such code
+may appear to work for a period of time. Correct code almost always
+memorizes vector **indices** which are invariant across reallocations.
+
+In typical application images, one supplies a set of global functions
+designed to be called from gdb. Here are a few examples:
+
+- vl(v) - prints vec_len(v)
+- pe(p) - prints pool_elts(p)
+- pifi(p, index) - prints pool_is_free_index(p, index)
+- debug_hex_bytes (p, nbytes) - hex memory dump nbytes starting at p
+
+Use the “show gdb” debug CLI command to print the current set.
+
+Bitmaps
+-------
+
+Vppinfra bitmaps are dynamic, built using the vppinfra vector APIs.
+Quite handy for a variety of jobs.
+
+Pools
+-----
+
+Vppinfra pools combine vectors and bitmaps to rapidly allocate and free
+fixed-size data structures with independent lifetimes. Pools are perfect
+for allocating per-session structures.
+
+Hashes
+------
+
+Vppinfra provides several hash flavors. Data plane problems involving
+packet classification / session lookup often use
+./src/vppinfra/bihash_template.[ch] bounded-index extensible hashes.
+These templates are instantiated multiple times, to efficiently service
+different fixed-key sizes.
+
+Bihashes are thread-safe. Read-locking is not required. A simple
+spin-lock ensures that only one thread writes an entry at a time.
+
+The original vppinfra hash implementation in ./src/vppinfra/hash.[ch]
+is simple to use, and is often used in control-plane code which needs
+exact-string-matching.
+
+In either case, one almost always looks up a key in a hash table to
+obtain an index in a related vector or pool. The APIs are simple enough,
+but one must take care when using the unmanaged arbitrary-sized key
+variant. Hash_set_mem (hash_table, key_pointer, value) memorizes
+key_pointer. It is usually a bad mistake to pass the address of a vector
+element as the second argument to hash_set_mem. It is perfectly fine to
+memorize constant string addresses in the text segment.
+
+Timekeeping
+-----------
+
+Vppinfra includes high-precision, low-cost timing services. The datatype
+clib_time_t and associated functions reside in ./src/vppinfra/time.[ch].
+Call clib_time_init (clib_time_t \*cp) to initialize the clib_time_t
+object.
+
+Clib_time_init(…) can use a variety of different ways to establish the
+hardware clock frequency. At the end of the day, vppinfra timekeeping
+takes the attitude that the operating system’s clock is the closest
+thing to a gold standard it has handy.
+
+When properly configured, NTP maintains kernel clock synchronization
+with a highly accurate off-premises reference clock. Notwithstanding
+network propagation delays, a synchronized NTP client will keep the
+kernel clock accurate to within 50ms or so.
+
+Why should one care? Simply put, oscillators used to generate CPU ticks
+aren’t super accurate. They work pretty well, but a 0.1% error wouldn’t
+be out of the question. That’s a minute and a half’s worth of error in 1
+day. The error changes constantly, due to temperature variation, and a
+host of other physical factors.
+
+It’s far too expensive to use system calls for timing, so we’re left
+with the problem of continuously adjusting our view of the CPU tick
+register’s clocks_per_second parameter.
+
+The clock rate adjustment algorithm measures the number of cpu ticks and
+the “gold standard” reference time across an interval of approximately
+16 seconds. We calculate clocks_per_second for the interval: use rdtsc
+(on x86_64) and a system call to get the latest cpu tick count and the
+kernel’s latest nanosecond timestamp. We subtract the previous interval
+end values, and use exponential smoothing to merge the new clock rate
+sample into the clocks_per_second parameter.
+
+As of this writing, we maintain the clock rate by way of the following
+first-order differential equation:
+
+.. code:: c
+
+ clocks_per_second(t) = clocks_per_second(t-1) * K + sample_cps(t)*(1-K)
+ where K = e**(-1.0/3.75);
+
+This yields a per observation “half-life” of 1 minute. Empirically, the
+clock rate converges within 5 minutes, and appears to maintain
+near-perfect agreement with the kernel clock in the face of ongoing NTP
+time adjustments.
+
+See ./src/vppinfra/time.c:clib_time_verify_frequency(…) to look at the
+rate adjustment algorithm. The code rejects frequency samples
+corresponding to the sort of adjustment which might occur if someone
+changes the gold standard kernel clock by several seconds.
+
+Monotonic timebase support
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Particularly during system initialization, the “gold standard” system
+reference clock can change by a large amount, in an instant. It’s not a
+best practice to yank the reference clock - in either direction - by
+hours or days. In fact, some poorly-constructed use-cases do so.
+
+To deal with this reality, clib_time_now(…) returns the number of
+seconds since vpp started, *guaranteed to be monotonically increasing,
+no matter what happens to the system reference clock*.
+
+This is first-order important, to avoid breaking every active timer in
+the system. The vpp host stack alone may account for tens of millions of
+active timers. It’s utterly impractical to track down and fix timers, so
+we must deal with the issue at the timebase level.
+
+Here’s how it works. Prior to adjusting the clock rate, we collect the
+kernel reference clock and the cpu clock:
+
+.. code:: c
+
+ /* Ask the kernel and the CPU what time it is... */
+ now_reference = unix_time_now ();
+ now_clock = clib_cpu_time_now ();
+
+Compute changes for both clocks since the last rate adjustment, roughly
+15 seconds ago:
+
+.. code:: c
+
+ /* Compute change in the reference clock */
+ delta_reference = now_reference - c->last_verify_reference_time;
+
+ /* And change in the CPU clock */
+ delta_clock_in_seconds = (f64) (now_clock - c->last_verify_cpu_time) *
+ c->seconds_per_clock;
+
+Delta_reference is key. Almost 100% of the time, delta_reference and
+delta_clock_in_seconds are identical modulo one system-call time.
+However, NTP or a privileged user can yank the system reference time -
+in either direction - by an hour, a day, or a decade.
+
+As described above, clib_time_now(…) must return monotonically
+increasing answers to the question “how long has it been since vpp
+started, in seconds.” To do that, the clock rate adjustment algorithm
+begins by recomputing the initial reference time:
+
+.. code:: c
+
+ c->init_reference_time += (delta_reference - delta_clock_in_seconds);
+
+It’s easy to convince yourself that if the reference clock changes by
+15.000000 seconds and the cpu clock tick time changes by 15.000000
+seconds, the initial reference time won’t change.
+
+If, on the other hand, delta_reference is -86400.0 and delta clock is
+15.0 - reference time jumped backwards by exactly one day in a 15-second
+rate update interval - we add -86415.0 to the initial reference time.
+
+Given the corrected initial reference time, we recompute the total
+number of cpu ticks which have occurred since the corrected initial
+reference time, at the current clock tick rate:
+
+.. code:: c
+
+ c->total_cpu_time = (now_reference - c->init_reference_time)
+ * c->clocks_per_second;
+
+Timebase precision
+~~~~~~~~~~~~~~~~~~
+
+Cognoscenti may notice that vlib/clib_time_now(…) return a 64-bit
+floating-point value; the number of seconds since vpp started.
+
+Please see `this Wikipedia
+article <https://en.wikipedia.org/wiki/Double-precision_floating-point_format>`__
+for more information. C double-precision floating point numbers (called
+f64 in the vpp code base) have a 53-bit effective mantissa, and can
+accurately represent 15 decimal digits’ worth of precision.
+
+There are 315,360,000.000001 seconds in ten years plus one microsecond.
+That string has exactly 15 decimal digits. The vpp time base retains 1us
+precision for roughly 30 years.
+
+vlib/clib_time_now do *not* provide precision in excess of 1e-6 seconds.
+If necessary, please use clib_cpu_time_now(…) for direct access to the
+CPU clock-cycle counter. Note that the number of CPU clock cycles per
+second varies significantly across CPU architectures.
+
+Timer Wheels
+------------
+
+Vppinfra includes configurable timer wheel support. See the source code
+in …/src/vppinfra/tw_timer_template.[ch], as well as a considerable
+number of template instances defined in …/src/vppinfra/tw_timer\_\*.[ch].
+
+Instantiation of tw_timer_template.h generates named structures to
+implement specific timer wheel geometries. Choices include: number of
+timer wheels (currently, 1 or 2), number of slots per ring (a power of
+two), and the number of timers per “object handle”.
+
+Internally, user object/timer handles are 32-bit integers, so if one
+selects 16 timers/object (4 bits), the resulting timer wheel handle is
+limited to 2**28 objects.
+
+Here are the specific settings required to generate a single 2048 slot
+wheel which supports 2 timers per object:
+
+.. code:: c
+
+ #define TW_TIMER_WHEELS 1
+ #define TW_SLOTS_PER_RING 2048
+ #define TW_RING_SHIFT 11
+ #define TW_RING_MASK (TW_SLOTS_PER_RING -1)
+ #define TW_TIMERS_PER_OBJECT 2
+ #define LOG2_TW_TIMERS_PER_OBJECT 1
+ #define TW_SUFFIX _2t_1w_2048sl
+ #define TW_FAST_WHEEL_BITMAP 0
+ #define TW_TIMER_ALLOW_DUPLICATE_STOP 0
+
+See tw_timer_2t_1w_2048sl.h for a complete example.
+
+tw_timer_template.h is not intended to be #included directly. Client
+codes can include multiple timer geometry header files, although extreme
+caution would be required to use the TW and TWT macros in such a case.
+
+API usage examples
+~~~~~~~~~~~~~~~~~~
+
+The unit test code in …/src/vppinfra/test_tw_timer.c provides a concrete
+API usage example. It uses a synthetic clock to rapidly exercise the
+underlying tw_timer_expire_timers(…) template.
+
+There are not many API routines to call.
+
+Initialize a two-timer, single 2048-slot wheel w/ a 1-second timer granularity
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: c
+
+ tw_timer_wheel_init_2t_1w_2048sl (&tm->single_wheel,
+ expired_timer_single_callback,
+ 1.0 /* timer interval */);
+
+Start a timer
+^^^^^^^^^^^^^
+
+.. code:: c
+
+ handle = tw_timer_start_2t_1w_2048sl (&tm->single_wheel, elt_index,
+ [0 | 1] /* timer id */,
+ expiration_time_in_u32_ticks);
+
+Stop a timer
+^^^^^^^^^^^^
+
+.. code:: c
+
+ tw_timer_stop_2t_1w_2048sl (&tm->single_wheel, handle);
+
+An expired timer callback
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: c
+
+ static void
+ expired_timer_single_callback (u32 * expired_timers)
+ {
+ int i;
+ u32 pool_index, timer_id;
+ tw_timer_test_elt_t *e;
+ tw_timer_test_main_t *tm = &tw_timer_test_main;
+
+ for (i = 0; i < vec_len (expired_timers); i++)
+ {
+ pool_index = expired_timers[i] & 0x7FFFFFFF;
+ timer_id = expired_timers[i] >> 31;
+
+ ASSERT (timer_id == 1);
+
+ e = pool_elt_at_index (tm->test_elts, pool_index);
+
+ if (e->expected_to_expire != tm->single_wheel.current_tick)
+ {
+ fformat (stdout, "[%d] expired at %d not %d\n",
+ e - tm->test_elts, tm->single_wheel.current_tick,
+ e->expected_to_expire);
+ }
+ pool_put (tm->test_elts, e);
+ }
+ }
+
+We use wheel timers extensively in the vpp host stack. Each TCP session
+needs 5 timers, so supporting 10 million flows requires up to 50 million
+concurrent timers.
+
+Timers rarely expire, so it’s of utmost importance that stopping and
+restarting a timer costs as few clock cycles as possible.
+
+Stopping a timer costs a doubly-linked list dequeue. Starting a timer
+involves modular arithmetic to determine the correct timer wheel and
+slot, and a list head enqueue.
+
+Expired timer processing generally involves bulk linked-list retirement
+with user callback presentation. There is some additional complexity at
+wheel wrap time, to relocate timers from slower-turning timer wheels into
+faster-turning wheels.
+
+Format
+------
+
+Vppinfra format is roughly equivalent to printf.
+
+Format has a few properties worth mentioning. Format’s first argument is
+a (u8 \*) vector to which it appends the result of the current format
+operation. Chaining calls is very easy:
+
+.. code:: c
+
+ u8 * result;
+
+ result = format (0, "junk = %d, ", junk);
+ result = format (result, "more junk = %d\n", more_junk);
+
+As previously noted, NULL pointers are perfectly proper 0-length
+vectors. Format returns a (u8 \*) vector, **not** a C-string. If you
+wish to print a (u8 \*) vector, use the “%v” format string. If you need
+a (u8 \*) vector which is also a proper C-string, either of these
+schemes may be used:
+
+.. code:: c
+
+ vec_add1 (result, 0)
+ or
+ result = format (result, "<whatever>%c", 0);
+
+Remember to vec_free() the result if appropriate. Be careful not to pass
+format an uninitialized (u8 \*).
+
+Format implements a particularly handy user-format scheme via the “%U”
+format specification. For example:
+
+.. code:: c
+
+ u8 * format_junk (u8 * s, va_list *va)
+ {
+ junk = va_arg (va, u32);
+ s = format (s, "%s", junk);
+ return s;
+ }
+
+ result = format (0, "junk = %U", format_junk, "This is some junk");
+
+format_junk() can invoke other user-format functions if desired. The
+programmer shoulders responsibility for argument type-checking. It is
+typical for user format functions to blow up spectacularly if the
+va_arg(va, type) macros don’t match the caller’s idea of reality.
+
+Unformat
+--------
+
+Vppinfra unformat is vaguely related to scanf, but considerably more
+general.
+
+A typical use case involves initializing an unformat_input_t from either
+a C-string or a (u8 \*) vector, then parsing via unformat() as follows:
+
+.. code:: c
+
+ unformat_input_t input;
+ u8 *s = "<some-C-string>";
+
+ unformat_init_string (&input, (char *) s, strlen((char *) s));
+ /* or */
+ unformat_init_vector (&input, <u8-vector>);
+
+Then loop parsing individual elements:
+
+.. code:: c
+
+ while (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (&input, "value1 %d", &value1))
+ ;/* unformat sets value1 */
+ else if (unformat (&input, "value2 %d", &value2))
+ ;/* unformat sets value2 */
+ else
+ return clib_error_return (0, "unknown input '%U'",
+ format_unformat_error, input);
+ }
+
+As with format, unformat implements a user-unformat function capability
+via a “%U” user unformat function scheme. Generally, one can trivially
+transform ``format (s, "foo %d", foo)`` into
+``unformat (input, "foo %d", &foo)``.
+
+Unformat implements a couple of handy non-scanf-like format specifiers:
+
+.. code:: c
+
+ unformat (input, "enable %=", &enable, 1 /* defaults to 1 */);
+ unformat (input, "bitzero %|", &mask, (1<<0));
+ unformat (input, "bitone %|", &mask, (1<<1));
+ <etc>
+
+The phrase “enable %=” means “set the supplied variable to the default
+value” if unformat parses the “enable” keyword all by itself. If
+unformat parses “enable 123” set the supplied variable to 123.
+
+We could clean up a number of hand-rolled “verbose” + “verbose %d”
+argument parsing codes using “%=”.
+
+The phrase “bitzero %\|” means “set the specified bit in the supplied
+bitmask” if unformat parses “bitzero”. Although it looks like it could
+be fairly handy, it’s very lightly used in the code base.
+
+``%_`` toggles whether or not to skip input white space.
+
+For transition from skip to no-skip in middle of format string, skip
+input white space. For example, the following:
+
+.. code:: c
+
+ fmt = "%_%d.%d%_->%_%d.%d%_"
+ unformat (input, fmt, &one, &two, &three, &four);
+
+matches input “1.2 -> 3.4”. Without this, the space after -> does not
+get skipped.
+
+
+How to parse a single input line
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Debug CLI command functions MUST NOT accidentally consume input
+belonging to other debug CLI commands. Otherwise, it's impossible to
+script a set of debug CLI commands which "work fine" when issued one
+at a time.
+
+This bit of code is NOT correct:
+
+.. code:: c
+
+ /* Eats script input NOT belonging to it, and chokes! */
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, ...))
+ ;
+ else if (unformat (input, ...))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, input);
+ }
+ }
+
+When executed as part of a script, such a function will return “parse
+error: ‘’” every time, unless it happens to be the last command in the
+script.
+
+Instead, use “unformat_line_input” to consume the rest of a line’s worth
+of input - everything past the path specified in the VLIB_CLI_COMMAND
+declaration.
+
+For example, unformat_line_input with “my_command” set up as shown below
+and user input “my path is clear” will produce an unformat_input_t that
+contains “is clear”.
+
+.. code:: c
+
+ VLIB_CLI_COMMAND (...) = {
+ .path = "my path",
+ };
+
+Here’s a bit of code which shows the required mechanics, in full:
+
+.. code:: c
+
+ static clib_error_t *
+ my_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+ {
+ unformat_input_t _line_input, *line_input = &_line_input;
+ u32 this, that;
+ clib_error_t *error = 0;
+
+ if (!unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ /*
+ * Here, UNFORMAT_END_OF_INPUT is at the end of the line we consumed,
+ * not at the end of the script...
+ */
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "this %u", &this))
+ ;
+ else if (unformat (line_input, "that %u", &that))
+ ;
+ else
+ {
+ error = clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ goto done;
+ }
+ }
+
+ <do something based on "this" and "that", etc>
+
+ done:
+ unformat_free (line_input);
+ return error;
+ }
+ VLIB_CLI_COMMAND (my_command, static) = {
+ .path = "my path",
+ .function = my_command_fn,
+ };
+
+Vppinfra errors and warnings
+----------------------------
+
+Many functions within the vpp dataplane have return-values of type
+clib_error_t \*. Clib_error_t’s are arbitrary strings with a bit of
+metadata [fatal, warning] and are easy to announce. Returning a NULL
+clib_error_t \* indicates “A-OK, no error.”
+
+Clib_warning(format-args) is a handy way to add debugging output; clib
+warnings prepend function:line info to unambiguously locate the message
+source. Clib_unix_warning() adds perror()-style Linux system-call
+information. In production images, clib_warnings result in syslog
+entries.
+
+Serialization
+-------------
+
+Vppinfra serialization support allows the programmer to easily serialize
+and unserialize complex data structures.
+
+The underlying primitive serialize/unserialize functions use network
+byte-order, so there are no structural issues serializing on a
+little-endian host and unserializing on a big-endian host.
diff --git a/docs/developer/corearchitecture/mem.rst b/docs/developer/corearchitecture/mem.rst
new file mode 120000
index 00000000000..0fc53eab68c
--- /dev/null
+++ b/docs/developer/corearchitecture/mem.rst
@@ -0,0 +1 @@
+../../../src/vpp/mem/mem.rst \ No newline at end of file
diff --git a/docs/developer/corearchitecture/multi_thread.rst b/docs/developer/corearchitecture/multi_thread.rst
new file mode 100644
index 00000000000..195a9b791fd
--- /dev/null
+++ b/docs/developer/corearchitecture/multi_thread.rst
@@ -0,0 +1,169 @@
+.. _vpp_multi_thread:
+
+Multi-threading in VPP
+======================
+
+Modes
+-----
+
+VPP can work in 2 different modes:
+
+- single-thread
+- multi-thread with worker threads
+
+Single-thread
+~~~~~~~~~~~~~
+
+In a single-thread mode there is one main thread which handles both
+packet processing and other management functions (Command-Line Interface
+(CLI), API, stats). This is the default setup. There is no special
+startup config needed.
+
+Multi-thread with Worker Threads
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In this mode, the main thread handles management functions (debug CLI,
+API, stats collection) and one or more worker threads handle packet
+processing from input to output of the packet.
+
+Each worker thread polls input queues on a subset of interfaces.
+
+With RSS (Receive Side Scaling) enabled multiple threads can service one
+physical interface (RSS function on NIC distributes traffic between
+different queues which are serviced by different worker threads).
+
+Thread placement
+----------------
+
+Thread placement is defined in the startup config under the cpu { … }
+section.
+
+The VPP platform can place threads automatically or manually. Automatic
+placement works in the following way:
+
+- if “skip-cores X” is defined, the first X cores will not be used
+- if “main-core X” is defined, the VPP main thread will be placed on core
+ X, otherwise the 1st available one will be used
+- if “workers N” is defined, vpp will allocate the first N available cores
+ and it will run threads on them
+- if “corelist-workers A,B1-Bn,C1-Cn” is defined, vpp will automatically
+ assign those CPU cores to worker threads
+
+User can see active placement of cores by using the VPP debug CLI
+command show threads:
+
+.. code-block:: console
+
+ vpd# show threads
+ ID Name Type LWP lcore Core Socket State
+ 0 vpe_main 59723 2 2 0 wait
+ 1 vpe_wk_0 workers 59755 4 4 0 running
+ 2 vpe_wk_1 workers 59756 5 5 0 running
+ 3 vpe_wk_2 workers 59757 6 0 1 running
+ 4 vpe_wk_3 workers 59758 7 1 1 running
+ 5 stats 59775
+ vpd#
+
+The sample output above shows the main thread running on core 2 (2nd
+core on the CPU socket 0), worker threads running on cores 4-7.
+
+Sample Configurations
+---------------------
+
+By default, at start-up VPP uses
+configuration values from: ``/etc/vpp/startup.conf``
+
+The following sections describe some of the additional changes that can be made to this file.
+This file is initially populated from the files located in the following directory ``/vpp/vpp/conf/``
+
+Manual Placement
+~~~~~~~~~~~~~~~~
+
+Manual placement places the main thread on core 1, workers on cores
+4,5,20,21.
+
+.. code-block:: console
+
+ cpu {
+ main-core 1
+ corelist-workers 4-5,20-21
+ }
+
+Auto placement
+~~~~~~~~~~~~~~
+
+Auto placement is likely to place the main thread on core 1 and workers
+on cores 2,3,4.
+
+.. code-block:: console
+
+ cpu {
+ skip-cores 1
+ workers 3
+ }
+
+Buffer Memory Allocation
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The VPP platform is NUMA aware. It can allocate memory for buffers on
+different CPU sockets (NUMA nodes). The amount of memory allocated can
+be defined in the startup config for each CPU socket by using the
+socket-mem A[[,B],C] statement inside the dpdk { … } section.
+
+For example:
+
+.. code-block:: console
+
+ dpdk {
+ socket-mem 1024,1024
+ }
+
+The above configuration allocates 1GB of memory on NUMA#0 and 1GB on
+NUMA#1. Each worker thread uses buffers which are local to itself.
+
+Buffer memory is allocated from hugepages. VPP prefers 1G pages if they
+are available. If not, 2MB pages will be used.
+
+VPP takes care of mounting/unmounting hugepages file-system
+automatically so there is no need to do that manually.
+
+**NOTE**: If you are running the latest VPP release, there is no need for
+specifying socket-mem manually. VPP will discover all NUMA nodes and it
+will allocate 512M on each by default. socket-mem is only needed if a
+bigger number of mbufs is required (default is 16384 per socket and can
+be changed with num-mbufs startup config command).
+
+Interface Placement in Multi-thread Setup
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+On startup, the VPP platform assigns interfaces (or interface, queue
+pairs if RSS is used) to different worker threads in round robin
+fashion.
+
+The following example shows debug CLI commands to show and change
+interface placement:
+
+.. code-block:: console
+
+ vpd# sh dpdk interface placement
+ Thread 1 (vpp_wk_0 at lcore 5):
+ TenGigabitEthernet2/0/0 queue 0
+ TenGigabitEthernet2/0/1 queue 0
+ Thread 2 (vpp_wk_1 at lcore 6):
+ TenGigabitEthernet2/0/0 queue 1
+ TenGigabitEthernet2/0/1 queue 1
+
+The following shows an example of moving TenGigabitEthernet2/0/1 queue 1
+processing to 1st worker thread:
+
+.. code-block:: console
+
+ vpd# set interface placement TenGigabitEthernet2/0/1 queue 1 thread 1
+
+ vpp# sh dpdk interface placement
+ Thread 1 (vpp_wk_0 at lcore 5):
+ TenGigabitEthernet2/0/0 queue 0
+ TenGigabitEthernet2/0/1 queue 0
+ TenGigabitEthernet2/0/1 queue 1
+ Thread 2 (vpp_wk_1 at lcore 6):
+ TenGigabitEthernet2/0/0 queue 1
diff --git a/docs/developer/corearchitecture/multiarch/arbfns.rst b/docs/developer/corearchitecture/multiarch/arbfns.rst
new file mode 100644
index 00000000000..d469bd8a140
--- /dev/null
+++ b/docs/developer/corearchitecture/multiarch/arbfns.rst
@@ -0,0 +1,87 @@
+Multi-Architecture Arbitrary Function Cookbook
+==============================================
+
+Optimizing arbitrary functions for multiple architectures is simple
+enough, and very similar to the process used to produce multi-architecture
+graph node dispatch functions.
+
+As with multi-architecture graph nodes, we compile source files
+multiple times, generating multiple implementations of the original
+function, and a public selector function.
+
+Details
+-------
+
+Decorate function definitions with CLIB_MARCH_FN macros. For example:
+
+Change the original function prototype...
+
+::
+
+ u32 vlib_frame_alloc_to_node (vlib_main_t * vm, u32 to_node_index,
+ u32 frame_flags)
+
+...by recasting the function name and return type as the first two
+arguments to the CLIB_MARCH_FN macro:
+
+::
+
+ CLIB_MARCH_FN (vlib_frame_alloc_to_node, u32, vlib_main_t * vm,
+ u32 to_node_index, u32 frame_flags)
+
+In the actual vpp image, several versions of vlib_frame_alloc_to_node
+will appear: vlib_frame_alloc_to_node_avx2,
+vlib_frame_alloc_to_node_avx512, and so forth.
+
+
+For each multi-architecture function, use the CLIB_MARCH_FN_SELECT
+macro to help generate the one-and-only multi-architecture selector
+function:
+
+::
+
+ #ifndef CLIB_MARCH_VARIANT
+ u32
+ vlib_frame_alloc_to_node (vlib_main_t * vm, u32 to_node_index,
+ u32 frame_flags)
+ {
+ return CLIB_MARCH_FN_SELECT (vlib_frame_alloc_to_node)
+ (vm, to_node_index, frame_flags);
+ }
+ #endif /* CLIB_MARCH_VARIANT */
+
+Once bound, the multi-architecture selector function is about as
+expensive as an indirect function call; which is to say: not very
+expensive.
+
+Modify CMakeLists.txt
+---------------------
+
+If the component in question already lists "MULTIARCH_SOURCES", simply
+add the indicated .c file to the list. Otherwise, add as shown
+below. Note that the added file "multiarch_code.c" should appear in
+*both* SOURCES and MULTIARCH_SOURCES:
+
+::
+
+ add_vpp_plugin(myplugin
+ SOURCES
+ multiarch_code.c
+ ...
+
+ MULTIARCH_SOURCES
+ multiarch_code.c
+ ...
+ )
+
+A Word to the Wise
+------------------
+
+A file which liberally mixes functions worth compiling for multiple
+architectures and functions which are not will end up full of
+#ifndef CLIB_MARCH_VARIANT conditionals. This won't do a thing to make
+the code look any better.
+
+Depending on requirements, it may make sense to move functions to
+(new) files to reduce complexity and/or improve legibility of the
+resulting code.
diff --git a/docs/developer/corearchitecture/multiarch/index.rst b/docs/developer/corearchitecture/multiarch/index.rst
new file mode 100644
index 00000000000..824a8e68438
--- /dev/null
+++ b/docs/developer/corearchitecture/multiarch/index.rst
@@ -0,0 +1,12 @@
+.. _multiarch:
+
+Multi-architecture support
+==========================
+
+This reference guide describes how to use the vpp multi-architecture support scheme.
+
+.. toctree::
+ :maxdepth: 1
+
+ nodefns
+ arbfns
diff --git a/docs/developer/corearchitecture/multiarch/nodefns.rst b/docs/developer/corearchitecture/multiarch/nodefns.rst
new file mode 100644
index 00000000000..9647e64f08c
--- /dev/null
+++ b/docs/developer/corearchitecture/multiarch/nodefns.rst
@@ -0,0 +1,138 @@
+Multi-Architecture Graph Node Cookbook
+======================================
+
+In the context of graph node dispatch functions, it's easy enough to
+use the vpp multi-architecture support setup. The point of the scheme
+is simple: for performance-critical nodes, generate multiple CPU
+hardware-dependent versions of the node dispatch functions, and pick
+the best one at runtime.
+
+The vpp scheme is simple enough to use, but details matter.
+
+100,000 foot view
+-----------------
+
+We compile entire graph node dispatch function implementation files
+multiple times. These compilations give rise to multiple versions of
+the graph node dispatch functions. Per-node constructor-functions
+interrogate CPU hardware, select the node dispatch function variant to
+use, and set the vlib_node_registration_t ".function" member to the
+address of the selected variant.
+
+Details
+-------
+
+Declare the node dispatch function as shown, using the VLIB\_NODE\_FN macro. The
+name of the node function **MUST** match the name of the graph node.
+
+::
+
+ VLIB_NODE_FN (ip4_sdp_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+ {
+ if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
+ return ip46_sdp_inline (vm, node, frame, 1 /* is_ip4 */ ,
+ 1 /* is_trace */ );
+ else
+ return ip46_sdp_inline (vm, node, frame, 1 /* is_ip4 */ ,
+ 0 /* is_trace */ );
+ }
+
+We need to generate *precisely one copy* of the
+vlib_node_registration_t, error strings, and packet trace decode function.
+
+Simply bracket these items with "#ifndef CLIB_MARCH_VARIANT...#endif":
+
+::
+
+ #ifndef CLIB_MARCH_VARIANT
+ static u8 *
+ format_sdp_trace (u8 * s, va_list * args)
+ {
+ <snip>
+ }
+ #endif
+
+ ...
+
+ #ifndef CLIB_MARCH_VARIANT
+ static char *sdp_error_strings[] = {
+ #define _(sym,string) string,
+ foreach_sdp_error
+ #undef _
+ };
+ #endif
+
+ ...
+
+ #ifndef CLIB_MARCH_VARIANT
+ VLIB_REGISTER_NODE (ip4_sdp_node) =
+ {
+ // DO NOT set the .function structure member.
+ // The multiarch selection __attribute__((constructor)) function
+ // takes care of it at runtime
+ .name = "ip4-sdp",
+ .vector_size = sizeof (u32),
+ .format_trace = format_sdp_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(sdp_error_strings),
+ .error_strings = sdp_error_strings,
+
+ .n_next_nodes = SDP_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes =
+ {
+ [SDP_NEXT_DROP] = "ip4-drop",
+ },
+ };
+ #endif
+
+To belabor the point: *do not* set the ".function" member! That's the job of the multi-arch
+selection \_\_attribute\_\_((constructor)) function.
+
+Always inline node dispatch functions
+-------------------------------------
+
+It's typical for a graph dispatch function to contain one or more
+calls to an inline function. See above. If your node dispatch function
+is structured that way, make *ABSOLUTELY CERTAIN* to use the
+"always_inline" macro:
+
+::
+
+ always_inline uword
+ ip46_sdp_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ int is_ip4, int is_trace)
+ { ... }
+
+Otherwise, the compiler is highly likely NOT to build multiple
+versions of the guts of your dispatch function.
+
+It's fairly easy to spot this mistake in "perf top." If you see, for
+example, a bunch of functions with names of the form
+"xxx_node_fn_avx2" in the profile, *BUT* your brand-new node function
+shows up with a name of the form "xxx_inline.isra.1", it's quite likely
+that the inline was declared "static inline" instead of "always_inline".
+
+Modify CMakeLists.txt
+---------------------
+
+If the component in question already lists "MULTIARCH_SOURCES", simply
+add the indicated .c file to the list. Otherwise, add as shown
+below. Note that the added file "new_multiarch_node.c" should appear in
+*both* SOURCES and MULTIARCH_SOURCES:
+
+::
+
+ add_vpp_plugin(myplugin
+ SOURCES
+ new_multiarch_node.c
+ ...
+
+ MULTIARCH_SOURCES
+ new_multiarch_node.c
+ ...
+ )
diff --git a/docs/developer/corearchitecture/softwarearchitecture.rst b/docs/developer/corearchitecture/softwarearchitecture.rst
new file mode 100644
index 00000000000..7f8a0e04645
--- /dev/null
+++ b/docs/developer/corearchitecture/softwarearchitecture.rst
@@ -0,0 +1,47 @@
+Software Architecture
+=====================
+
+The fd.io vpp implementation is a third-generation vector packet
+processing implementation specifically related to US Patent 7,961,636,
+as well as earlier work. Note that the Apache-2 license specifically
+grants non-exclusive patent licenses; we mention this patent as a point
+of historical interest.
+
+For performance, the vpp dataplane consists of a directed graph of
+forwarding nodes which process multiple packets per invocation. This
+schema enables a variety of micro-processor optimizations: pipelining
+and prefetching to cover dependent read latency, inherent I-cache phase
+behavior, vector instructions. Aside from hardware input and hardware
+output nodes, the entire forwarding graph is portable code.
+
+Depending on the scenario at hand, we often spin up multiple worker
+threads which process ingress-hashed packets from multiple queues using
+identical forwarding graph replicas.
+
+VPP Layers - Implementation Taxonomy
+------------------------------------
+
+.. figure:: /_images/VPP_Layering.png
+ :alt: image
+
+ image
+
+- VPP Infra - the VPP infrastructure layer, which contains the core
+ library source code. This layer performs memory functions, works with
+ vectors and rings, performs key lookups in hash tables, and works
+ with timers for dispatching graph nodes.
+- VLIB - the vector processing library. The vlib layer also handles
+ various application management functions: buffer, memory and graph
+ node management, maintaining and exporting counters, thread
+ management, packet tracing. Vlib implements the debug CLI (command
+ line interface).
+- VNET - works with VPP's networking interface (layers 2, 3, and 4),
+ performs session and traffic management, and works with devices and
+ the data control plane.
+- Plugins - Contains an increasingly rich set of data-plane plugins, as
+ noted in the above diagram.
+- VPP - the container application linked against all of the above.
+
+It’s important to understand each of these layers in a certain amount of
+detail. Much of the implementation is best dealt with at the API level
+and otherwise left alone.
diff --git a/docs/developer/corearchitecture/vlib.rst b/docs/developer/corearchitecture/vlib.rst
new file mode 100644
index 00000000000..f542d33ebb8
--- /dev/null
+++ b/docs/developer/corearchitecture/vlib.rst
@@ -0,0 +1,888 @@
+VLIB (Vector Processing Library)
+================================
+
+The files associated with vlib are located in the ./src/{vlib, vlibapi,
+vlibmemory} folders. These libraries provide vector processing support
+including graph-node scheduling, reliable multicast support,
+ultra-lightweight cooperative multi-tasking threads, a CLI, plug in .DLL
+support, physical memory and Linux epoll support. Parts of this library
+embody US Patent 7,961,636.
+
+Init function discovery
+-----------------------
+
+vlib applications register for various [initialization] events by
+placing structures and \__attribute__((constructor)) functions into the
+image. At appropriate times, the vlib framework walks
+constructor-generated singly-linked structure lists, performs a
+topological sort based on specified constraints, and calls the indicated
+functions. Vlib applications create graph nodes, add CLI functions,
+start cooperative multi-tasking threads, etc. etc. using this mechanism.
+
+vlib applications invariably include a number of VLIB_INIT_FUNCTION
+(my_init_function) macros.
+
+Each init / configure / etc. function has the return type clib_error_t
+\*. Make sure that the function returns 0 if all is well, otherwise the
+framework will announce an error and exit.
+
+vlib applications must link against vppinfra, and often link against
+other libraries such as VNET. In the latter case, it may be necessary to
+explicitly reference symbol(s) otherwise large portions of the library
+may be AWOL at runtime.
+
+Init function construction and constraint specification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It’s easy to add an init function:
+
+.. code:: c
+
+ static clib_error_t *my_init_function (vlib_main_t *vm)
+ {
+ /* ... initialize things ... */
+
+ return 0; // or return clib_error_return (0, "BROKEN!");
+ }
+ VLIB_INIT_FUNCTION(my_init_function);
+
+As given, my_init_function will be executed “at some point,” but with no
+ordering guarantees.
+
+Specifying ordering constraints is easy:
+
+.. code:: c
+
+ VLIB_INIT_FUNCTION(my_init_function) =
+ {
+ .runs_before = VLIB_INITS("we_run_before_function_1",
+ "we_run_before_function_2"),
+ .runs_after = VLIB_INITS("we_run_after_function_1",
+ "we_run_after_function_2"),
+ };
+
+It’s also easy to specify bulk ordering constraints of the form “a then
+b then c then d”:
+
+.. code:: c
+
+ VLIB_INIT_FUNCTION(my_init_function) =
+ {
+ .init_order = VLIB_INITS("a", "b", "c", "d"),
+ };
+
+It’s OK to specify all three sorts of ordering constraints for a single
+init function, although it’s hard to imagine why it would be necessary.
+
+Node Graph Initialization
+-------------------------
+
+vlib packet-processing applications invariably define a set of graph
+nodes to process packets.
+
+One constructs a vlib_node_registration_t, most often via the
+VLIB_REGISTER_NODE macro. At runtime, the framework processes the set of
+such registrations into a directed graph. It is easy enough to add nodes
+to the graph at runtime. The framework does not support removing nodes.
+
+vlib provides several types of vector-processing graph nodes, primarily
+to control framework dispatch behaviors. The type member of the
+vlib_node_registration_t functions as follows:
+
+- VLIB_NODE_TYPE_PRE_INPUT - run before all other node types
+- VLIB_NODE_TYPE_INPUT - run as often as possible, after pre_input
+ nodes
+- VLIB_NODE_TYPE_INTERNAL - only when explicitly made runnable by
+ adding pending frames for processing
+- VLIB_NODE_TYPE_PROCESS - only when explicitly made runnable.
+ “Process” nodes are actually cooperative multi-tasking threads. They
+ **must** explicitly suspend after a reasonably short period of time.
+
+For a precise understanding of the graph node dispatcher, please read
+./src/vlib/main.c:vlib_main_loop.
+
+Graph node dispatcher
+---------------------
+
+Vlib_main_loop() dispatches graph nodes. The basic vector processing
+algorithm is diabolically simple, but may not be obvious from even a
+long stare at the code. Here’s how it works: some input node, or set of
+input nodes, produce a vector of work to process. The graph node
+dispatcher pushes the work vector through the directed graph,
+subdividing it as needed, until the original work vector has been
+completely processed. At that point, the process recurs.
+
+This scheme yields a stable equilibrium in frame size, by construction.
+Here’s why: as the frame size increases, the per-frame-element
+processing time decreases. There are several related forces at work; the
+simplest to describe is the effect of vector processing on the CPU L1
+I-cache. The first frame element [packet] processed by a given node
+warms up the node dispatch function in the L1 I-cache. All subsequent
+frame elements profit. As we increase the number of frame elements, the
+cost per element goes down.
+
+Under light load, it is a crazy waste of CPU cycles to run the graph
+node dispatcher flat-out. So, the graph node dispatcher arranges to wait
+for work by sitting in a timed epoll wait if the prevailing frame size
+is low. The scheme has a certain amount of hysteresis to avoid
+constantly toggling back and forth between interrupt and polling mode.
+Although the graph dispatcher supports interrupt and polling modes, our
+current default device drivers do not.
+
+The graph node scheduler uses a hierarchical timer wheel to reschedule
+process nodes upon timer expiration.
+
+Graph dispatcher internals
+--------------------------
+
+This section may be safely skipped. It’s not necessary to understand
+graph dispatcher internals to create graph nodes.
+
+Vector Data Structure
+---------------------
+
+In vpp / vlib, we represent vectors as instances of the vlib_frame_t
+type:
+
+.. code:: c
+
+ typedef struct vlib_frame_t
+ {
+ /* Frame flags. */
+ u16 flags;
+
+ /* Number of scalar bytes in arguments. */
+ u8 scalar_size;
+
+ /* Number of bytes per vector argument. */
+ u8 vector_size;
+
+ /* Number of vector elements currently in frame. */
+ u16 n_vectors;
+
+ /* Scalar and vector arguments to next node. */
+ u8 arguments[0];
+ } vlib_frame_t;
+
+Note that one *could* construct all kinds of vectors - including vectors
+with some associated scalar data - using this structure. In the vpp
+application, vectors typically use a 4-byte vector element size, and
+zero bytes’ worth of associated per-frame scalar data.
+
+Frames are always allocated on CLIB_CACHE_LINE_BYTES boundaries. Frames
+have u32 indices which make use of the alignment property, so the
+maximum feasible main heap offset of a frame is CLIB_CACHE_LINE_BYTES \*
+0xFFFFFFFF: 64*4 = 256 Gbytes.
+
+Scheduling Vectors
+------------------
+
+As you can see, vectors are not directly associated with graph nodes. We
+represent that association in a couple of ways. The simplest is the
+vlib_pending_frame_t:
+
+.. code:: c
+
+ /* A frame pending dispatch by main loop. */
+ typedef struct
+ {
+ /* Node and runtime for this frame. */
+ u32 node_runtime_index;
+
+ /* Frame index (in the heap). */
+ u32 frame_index;
+
+ /* Start of next frames for this node. */
+ u32 next_frame_index;
+
+ /* Special value for next_frame_index when there is no next frame. */
+ #define VLIB_PENDING_FRAME_NO_NEXT_FRAME ((u32) ~0)
+ } vlib_pending_frame_t;
+
+Here is the code in …/src/vlib/main.c:vlib_main_or_worker_loop() which
+processes frames:
+
+.. code:: c
+
+ /*
+ * Input nodes may have added work to the pending vector.
+ * Process pending vector until there is nothing left.
+ * All pending vectors will be processed from input -> output.
+ */
+ for (i = 0; i < _vec_len (nm->pending_frames); i++)
+ cpu_time_now = dispatch_pending_node (vm, i, cpu_time_now);
+ /* Reset pending vector for next iteration. */
+
+The pending frame node_runtime_index associates the frame with the node
+which will process it.
+
+Complications
+-------------
+
+Fasten your seatbelt. Here’s where the story - and the data structures -
+become quite complicated…
+
+At 100,000 feet: vpp uses a directed graph, not a directed *acyclic*
+graph. It’s really quite normal for a packet to visit ip[46]-lookup
+multiple times. The worst-case: a graph node which enqueues packets to
+itself.
+
+To deal with this issue, the graph dispatcher must force allocation of a
+new frame if the current graph node’s dispatch function happens to
+enqueue a packet back to itself.
+
+There are no guarantees that a pending frame will be processed
+immediately, which means that more packets may be added to the
+underlying vlib_frame_t after it has been attached to a
+vlib_pending_frame_t. Care must be taken to allocate new frames and
+pending frames if a (pending_frame, frame) pair fills.
+
+Next frames, next frame ownership
+---------------------------------
+
+The vlib_next_frame_t is the last key graph dispatcher data structure:
+
+.. code:: c
+
+ typedef struct
+ {
+ /* Frame index. */
+ u32 frame_index;
+
+ /* Node runtime for this next. */
+ u32 node_runtime_index;
+
+ /* Next frame flags. */
+ u32 flags;
+
+ /* Reflects node frame-used flag for this next. */
+ #define VLIB_FRAME_NO_FREE_AFTER_DISPATCH \
+ VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH
+
+ /* This next frame owns enqueue to node
+ corresponding to node_runtime_index. */
+ #define VLIB_FRAME_OWNER (1 << 15)
+
+ /* Set when frame has been allocated for this next. */
+ #define VLIB_FRAME_IS_ALLOCATED VLIB_NODE_FLAG_IS_OUTPUT
+
+ /* Set when frame has been added to pending vector. */
+ #define VLIB_FRAME_PENDING VLIB_NODE_FLAG_IS_DROP
+
+ /* Set when frame is to be freed after dispatch. */
+ #define VLIB_FRAME_FREE_AFTER_DISPATCH VLIB_NODE_FLAG_IS_PUNT
+
+ /* Set when frame has traced packets. */
+ #define VLIB_FRAME_TRACE VLIB_NODE_FLAG_TRACE
+
+ /* Number of vectors enqueue to this next since last overflow. */
+ u32 vectors_since_last_overflow;
+ } vlib_next_frame_t;
+
+Graph node dispatch functions call vlib_get_next_frame (…) to set “(u32
+\*)to_next” to the right place in the vlib_frame_t corresponding to the
+ith arc (aka next0) from the current node to the indicated next node.
+
+After some scuffling around - two levels of macros - processing reaches
+vlib_get_next_frame_internal (…). Get-next-frame-internal digs up the
+vlib_next_frame_t corresponding to the desired graph arc.
+
+The next frame data structure amounts to a graph-arc-centric frame
+cache. Once a node finishes adding elements to a frame, it will acquire a
+vlib_pending_frame_t and end up on the graph dispatcher’s run-queue. But
+there’s no guarantee that more vector elements won’t be added to the
+underlying frame from the same (source_node, next_index) arc or from a
+different (source_node, next_index) arc.
+
+Maintaining consistency of the arc-to-frame cache is necessary. The
+first step in maintaining consistency is to make sure that only one
+graph node at a time thinks it “owns” the target vlib_frame_t.
+
+Back to the graph node dispatch function. In the usual case, a certain
+number of packets will be added to the vlib_frame_t acquired by calling
+vlib_get_next_frame (…).
+
+Before a dispatch function returns, it’s required to call
+vlib_put_next_frame (…) for all of the graph arcs it actually used. This
+action adds a vlib_pending_frame_t to the graph dispatcher’s pending
+frame vector.
+
+Vlib_put_next_frame makes a note in the pending frame of the frame
+index, and also of the vlib_next_frame_t index.
+
+dispatch_pending_node actions
+-----------------------------
+
+The main graph dispatch loop calls dispatch_pending_node as shown above.
+
+Dispatch_pending_node recovers the pending frame, and the graph node
+runtime / dispatch function. Further, it recovers the next_frame
+currently associated with the vlib_frame_t, and detaches the
+vlib_frame_t from the next_frame.
+
+In …/src/vlib/main.c:dispatch_pending_node(…), note this stanza:
+
+.. code:: c
+
+ /* Force allocation of new frame while current frame is being
+ dispatched. */
+ restore_frame_index = ~0;
+ if (nf->frame_index == p->frame_index)
+ {
+ nf->frame_index = ~0;
+ nf->flags &= ~VLIB_FRAME_IS_ALLOCATED;
+ if (!(n->flags & VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH))
+ restore_frame_index = p->frame_index;
+ }
+
+dispatch_pending_node is worth a hard stare due to the several
+second-order optimizations it implements. Almost as an afterthought, it
+calls dispatch_node which actually calls the graph node dispatch
+function.
+
+Process / thread model
+----------------------
+
+vlib provides an ultra-lightweight cooperative multi-tasking thread
+model. The graph node scheduler invokes these processes in much the same
+way as traditional vector-processing run-to-completion graph nodes;
+plus-or-minus a setjmp/longjmp pair required to switch stacks. Simply
+set the vlib_node_registration_t type field to VLIB_NODE_TYPE_PROCESS.
+Yes, process is a misnomer. These are cooperative multi-tasking threads.
+
+As of this writing, the default stack size is 1<<15 = 32kb. Initialize
+the node registration’s process_log2_n_stack_bytes member as needed. The
+graph node dispatcher makes some effort to detect stack overrun, e.g. by
+mapping a no-access page below each thread stack.
+
+Process node dispatch functions are expected to be “while(1) { }” loops
+which suspend when not otherwise occupied, and which must not run for
+unreasonably long periods of time.
+
+“Unreasonably long” is an application-dependent concept. Over the years,
+we have constructed frame-size sensitive control-plane nodes which will
+use a much higher fraction of the available CPU bandwidth when the frame
+size is low. The classic example: modifying forwarding tables. So long
+as the table-builder leaves the forwarding tables in a valid state, one
+can suspend the table builder to avoid dropping packets as a result of
+control-plane activity.
+
+Process nodes can suspend for fixed amounts of time, or until another
+entity signals an event, or both. See the next section for a description
+of the vlib process event mechanism.
+
+When running in vlib process context, one must pay strict attention to
+loop invariant issues. If one walks a data structure and calls a
+function which may suspend, one had best know by construction that it
+cannot change. Often, it’s best to simply make a snapshot copy of a data
+structure, walk the copy at leisure, then free the copy.
+
+Process events
+--------------
+
+The vlib process event mechanism API is extremely lightweight and easy
+to use. Here is a typical example:
+
+.. code:: c
+
+ vlib_main_t *vm = &vlib_global_main;
+ uword event_type, * event_data = 0;
+
+ while (1)
+ {
+ vlib_process_wait_for_event_or_clock (vm, 5.0 /* seconds */);
+
+ event_type = vlib_process_get_events (vm, &event_data);
+
+ switch (event_type) {
+ case EVENT1:
+ handle_event1s (event_data);
+ break;
+
+ case EVENT2:
+ handle_event2s (event_data);
+ break;
+
+ case ~0: /* 5-second idle/periodic */
+ handle_idle ();
+ break;
+
+ default: /* bug! */
+ ASSERT (0);
+ }
+
+ vec_reset_length(event_data);
+ }
+
+In this example, the VLIB process node waits for an event to occur, or
+for 5 seconds to elapse. The code demuxes on the event type, calling the
+appropriate handler function. Each call to vlib_process_get_events
+returns a vector of per-event-type data passed to successive
+vlib_process_signal_event calls; it is a serious error to process only
+event_data[0].
+
+Resetting the event_data vector-length to 0 [instead of calling
+vec_free] means that the event scheme doesn’t burn cycles continuously
+allocating and freeing the event data vector. This is a common vppinfra
+/ vlib coding pattern, well worth using when appropriate.
+
+Signaling an event is easy, for example:
+
+.. code:: c
+
+ vlib_process_signal_event (vm, process_node_index, EVENT1,
+ (uword)arbitrary_event1_data); /* and so forth */
+
+One can either know the process node index by construction - dig it out
+of the appropriate vlib_node_registration_t - or by finding the
+vlib_node_t with vlib_get_node_by_name(…).
+
+Buffers
+-------
+
+vlib buffering solves the usual set of packet-processing problems,
+albeit at high performance. Key in terms of performance: one ordinarily
+allocates / frees N buffers at a time rather than one at a time. Except
+when operating directly on a specific buffer, one deals with buffers by
+index, not by pointer.
+
+Packet-processing frames are u32[] arrays, not vlib_buffer_t[] arrays.
+
+Packets comprise one or more vlib buffers, chained together as required.
+Multiple particle sizes are supported; hardware input nodes simply ask
+for the required size(s). Coalescing support is available. For obvious
+reasons one is discouraged from writing one’s own wild and wacky buffer
+chain traversal code.
+
+vlib buffer headers are allocated immediately prior to the buffer data
+area. In typical packet processing this saves a dependent read wait:
+given a buffer’s address, one can prefetch the buffer header [metadata]
+at the same time as the first cache line of buffer data.
+
+Buffer header metadata (vlib_buffer_t) includes the usual rewrite
+expansion space, a current_data offset, RX and TX interface indices,
+packet trace information, and opaque areas.
+
+The opaque data is intended to control packet processing in arbitrary
+subgraph-dependent ways. The programmer shoulders responsibility for
+data lifetime analysis, type-checking, etc.
+
+Buffers have reference-counts in support of e.g. multicast replication.
+
+Shared-memory message API
+-------------------------
+
+Local control-plane and application processes interact with the vpp
+dataplane via asynchronous message-passing in shared memory over
+unidirectional queues. The same application APIs are available via
+sockets.
+
+Capturing API traces and replaying them in a simulation environment
+requires a disciplined approach to the problem. This seems like a
+make-work task, but it is not. When something goes wrong in the
+control-plane after 300,000 or 3,000,000 operations, high-speed replay
+of the events leading up to the accident is a huge win.
+
+The shared-memory message API message allocator vl_api_msg_alloc uses a
+particularly cute trick. Since messages are processed in order, we try
+to allocate message buffering from a set of fixed-size, preallocated
+rings. Each ring item has a “busy” bit. Freeing one of the preallocated
+message buffers merely requires the message consumer to clear the busy
+bit. No locking required.
+
+Debug CLI
+---------
+
+Adding debug CLI commands to VLIB applications is very simple.
+
+Here is a complete example:
+
+.. code:: c
+
+ static clib_error_t *
+ show_ip_tuple_match (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+ {
+ vlib_cli_output (vm, "%U\n", format_ip_tuple_match_tables, &routing_main);
+ return 0;
+ }
+
+ static VLIB_CLI_COMMAND (show_ip_tuple_command) =
+ {
+ .path = "show ip tuple match",
+ .short_help = "Show ip 5-tuple match-and-broadcast tables",
+ .function = show_ip_tuple_match,
+ };
+
+This example implements the “show ip tuple match” debug cli command. In
+ordinary usage, the vlib cli is available via the “vppctl” application,
+which sends traffic to a named pipe. One can configure debug CLI telnet
+access on a configurable port.
+
+The cli implementation has an output redirection facility which makes it
+simple to deliver cli output via shared-memory API messaging.
+
+Particularly for debug or “show tech support” type commands, it would be
+wasteful to write vlib application code to pack binary data, write more
+code elsewhere to unpack the data and finally print the answer. If a
+certain cli command has the potential to hurt packet processing
+performance by running for too long, do the work incrementally in a
+process node. The client can wait.
+
+Macro expansion
+~~~~~~~~~~~~~~~
+
+The vpp debug CLI engine includes a recursive macro expander. This is
+quite useful for factoring out address and/or interface name specifics:
+
+::
+
+ define ip1 192.168.1.1/24
+ define ip2 192.168.2.1/24
+ define iface1 GigabitEthernet3/0/0
+ define iface2 loop1
+
+ set int ip address $iface1 $ip1
+ set int ip address $iface2 $(ip2)
+
+ undefine ip1
+ undefine ip2
+ undefine iface1
+ undefine iface2
+
+Each socket (or telnet) debug CLI session has its own macro tables. All
+debug CLI sessions which use CLI_INBAND binary API messages share a
+single table.
+
+The macro expander recognizes circular definitions:
+
+::
+
+ define foo \$(bar)
+ define bar \$(mumble)
+ define mumble \$(foo)
+
+At 8 levels of recursion, the macro expander throws up its hands and
+replies “CIRCULAR.”
+
+Macro-related debug CLI commands
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In addition to the “define” and “undefine” debug CLI commands, use “show
+macro [noevaluate]” to dump the macro table. The “echo” debug CLI
+command will evaluate and print its argument:
+
+::
+
+ vpp# define foo This\ Is\ Foo
+ vpp# echo $foo
+ This Is Foo
+
+Handing off buffers between threads
+-----------------------------------
+
+Vlib includes an easy-to-use mechanism for handing off buffers between
+worker threads. A typical use-case: software ingress flow hashing. At a
+high level, one creates a per-worker-thread queue which sends packets to
+a specific graph node in the indicated worker thread. With the queue in
+hand, enqueue packets to the worker thread of your choice.
+
+Initialize a handoff queue
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Simple enough, call vlib_frame_queue_main_init:
+
+.. code:: c
+
+ main_ptr->frame_queue_index
+ = vlib_frame_queue_main_init (dest_node.index, frame_queue_size);
+
+Frame_queue_size means what it says: the number of frames which may be
+queued. Since frames contain 1…256 packets, frame_queue_size should be a
+reasonably small number (32…64). If the frame queue producer(s) are
+faster than the frame queue consumer(s), congestion will occur. Suggest
+letting the enqueue operator deal with queue congestion, as shown in the
+enqueue example below.
+
+Under the floorboards, vlib_frame_queue_main_init creates an input queue
+for each worker thread.
+
+Please do NOT create frame queues until it’s clear that they will be
+used. Although the main dispatch loop is reasonably smart about how
+often it polls the (entire set of) frame queues, polling unused frame
+queues is a waste of clock cycles.
+
+Hand off packets
+~~~~~~~~~~~~~~~~
+
+The actual handoff mechanics are simple, and integrate nicely with a
+typical graph-node dispatch function:
+
+.. code:: c
+
+ always_inline uword
+ do_handoff_inline (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * frame,
+ int is_ip4, int is_trace)
+ {
+ u32 n_left_from, *from;
+ vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
+ u16 thread_indices [VLIB_FRAME_SIZE];
+ u16 nexts[VLIB_FRAME_SIZE], *next;
+ u32 n_enq;
+ htest_main_t *hmp = &htest_main;
+ int i;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+
+ vlib_get_buffers (vm, from, bufs, n_left_from);
+ next = nexts;
+ b = bufs;
+
+ /*
+ * Typical frame traversal loop, details vary with
+ * use case. Make sure to set thread_indices[i] with
+ * the desired destination thread index. You may
+ * or may not bother to set next[i].
+ */
+
+ for (i = 0; i < frame->n_vectors; i++)
+ {
+ <snip>
+ /* Pick a thread to handle this packet */
+ thread_indices[i] = f (packet_data_or_whatever);
+ <snip>
+
+ b += 1;
+ next += 1;
+ n_left_from -= 1;
+ }
+
+ /* Enqueue buffers to threads */
+ n_enq =
+ vlib_buffer_enqueue_to_thread (vm, node, hmp->frame_queue_index,
+ from, thread_indices, frame->n_vectors,
+ 1 /* drop on congestion */);
+ /* Typical counters */
+ if (n_enq < frame->n_vectors)
+ vlib_node_increment_counter (vm, node->node_index,
+ XXX_ERROR_CONGESTION_DROP,
+ frame->n_vectors - n_enq);
+ vlib_node_increment_counter (vm, node->node_index,
+ XXX_ERROR_HANDED_OFF, n_enq);
+ return frame->n_vectors;
+ }
+
+Notes about calling vlib_buffer_enqueue_to_thread(…):
+
+- If you pass “drop on congestion” non-zero, all packets in the inbound
+ frame will be consumed one way or the other. This is the recommended
+ setting.
+
+- In the drop-on-congestion case, please don’t try to “help” in the
+ enqueue node by freeing dropped packets, or by pushing them to
+ “error-drop.” Either of those actions would be a severe error.
+
+- It’s perfectly OK to enqueue packets to the current thread.
+
+Handoff Demo Plugin
+-------------------
+
+Check out the sample (plugin) example in …/src/examples/handoffdemo. If
+you want to build the handoff demo plugin:
+
+::
+
+ $ cd .../src/plugins
+ $ ln -s ../examples/handoffdemo
+
+This plugin provides a simple example of how to hand off packets between
+threads. We used it to debug packet-tracer handoff tracing support.
+
+Packet generator input script
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ packet-generator new {
+ name x
+ limit 5
+ size 128-128
+ interface local0
+ node handoffdemo-1
+ data {
+ incrementing 30
+ }
+ }
+
+Start vpp with 2 worker threads
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The demo plugin hands packets from worker 1 to worker 2.
+
+Enable tracing, and start the packet generator
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ trace add pg-input 100
+ packet-generator enable
+
+Sample Run
+~~~~~~~~~~
+
+::
+
+ DBGvpp# ex /tmp/pg_input_script
+ DBGvpp# pa en
+ DBGvpp# sh err
+ Count Node Reason
+ 5 handoffdemo-1 packets handed off processed
+ 5 handoffdemo-2 completed packets
+ DBGvpp# show run
+ Thread 1 vpp_wk_0 (lcore 0)
+ Time 133.9, average vectors/node 5.00, last 128 main loops 0.00 per node 0.00
+ vector rates in 3.7331e-2, out 0.0000e0, drop 0.0000e0, punt 0.0000e0
+ Name State Calls Vectors Suspends Clocks Vectors/Call
+ handoffdemo-1 active 1 5 0 4.76e3 5.00
+ pg-input disabled 2 5 0 5.58e4 2.50
+ unix-epoll-input polling 22760 0 0 2.14e7 0.00
+ ---------------
+ Thread 2 vpp_wk_1 (lcore 2)
+ Time 133.9, average vectors/node 5.00, last 128 main loops 0.00 per node 0.00
+ vector rates in 0.0000e0, out 0.0000e0, drop 3.7331e-2, punt 0.0000e0
+ Name State Calls Vectors Suspends Clocks Vectors/Call
+ drop active 1 5 0 1.35e4 5.00
+ error-drop active 1 5 0 2.52e4 5.00
+ handoffdemo-2 active 1 5 0 2.56e4 5.00
+ unix-epoll-input polling 22406 0 0 2.18e7 0.00
+
+Enable the packet tracer and run it again…
+
+::
+
+ DBGvpp# trace add pg-input 100
+ DBGvpp# pa en
+ DBGvpp# sh trace
+ sh trace
+ ------------------- Start of thread 0 vpp_main -------------------
+ No packets in trace buffer
+ ------------------- Start of thread 1 vpp_wk_0 -------------------
+ Packet 1
+
+ 00:06:50:520688: pg-input
+ stream x, 128 bytes, 0 sw_if_index
+ current data 0, length 128, buffer-pool 0, ref-count 1, trace handle 0x1000000
+ 00000000: 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d0000
+ 00000020: 0000000000000000000000000000000000000000000000000000000000000000
+ 00000040: 0000000000000000000000000000000000000000000000000000000000000000
+ 00000060: 0000000000000000000000000000000000000000000000000000000000000000
+ 00:06:50:520762: handoffdemo-1
+ HANDOFFDEMO: current thread 1
+
+ Packet 2
+
+ 00:06:50:520688: pg-input
+ stream x, 128 bytes, 0 sw_if_index
+ current data 0, length 128, buffer-pool 0, ref-count 1, trace handle 0x1000001
+ 00000000: 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d0000
+ 00000020: 0000000000000000000000000000000000000000000000000000000000000000
+ 00000040: 0000000000000000000000000000000000000000000000000000000000000000
+ 00000060: 0000000000000000000000000000000000000000000000000000000000000000
+ 00:06:50:520762: handoffdemo-1
+ HANDOFFDEMO: current thread 1
+
+ Packet 3
+
+ 00:06:50:520688: pg-input
+ stream x, 128 bytes, 0 sw_if_index
+ current data 0, length 128, buffer-pool 0, ref-count 1, trace handle 0x1000002
+ 00000000: 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d0000
+ 00000020: 0000000000000000000000000000000000000000000000000000000000000000
+ 00000040: 0000000000000000000000000000000000000000000000000000000000000000
+ 00000060: 0000000000000000000000000000000000000000000000000000000000000000
+ 00:06:50:520762: handoffdemo-1
+ HANDOFFDEMO: current thread 1
+
+ Packet 4
+
+ 00:06:50:520688: pg-input
+ stream x, 128 bytes, 0 sw_if_index
+ current data 0, length 128, buffer-pool 0, ref-count 1, trace handle 0x1000003
+ 00000000: 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d0000
+ 00000020: 0000000000000000000000000000000000000000000000000000000000000000
+ 00000040: 0000000000000000000000000000000000000000000000000000000000000000
+ 00000060: 0000000000000000000000000000000000000000000000000000000000000000
+ 00:06:50:520762: handoffdemo-1
+ HANDOFFDEMO: current thread 1
+
+ Packet 5
+
+ 00:06:50:520688: pg-input
+ stream x, 128 bytes, 0 sw_if_index
+ current data 0, length 128, buffer-pool 0, ref-count 1, trace handle 0x1000004
+ 00000000: 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d0000
+ 00000020: 0000000000000000000000000000000000000000000000000000000000000000
+ 00000040: 0000000000000000000000000000000000000000000000000000000000000000
+ 00000060: 0000000000000000000000000000000000000000000000000000000000000000
+ 00:06:50:520762: handoffdemo-1
+ HANDOFFDEMO: current thread 1
+
+ ------------------- Start of thread 2 vpp_wk_1 -------------------
+ Packet 1
+
+ 00:06:50:520796: handoff_trace
+ HANDED-OFF: from thread 1 trace index 0
+ 00:06:50:520796: handoffdemo-2
+ HANDOFFDEMO: current thread 2
+ 00:06:50:520867: error-drop
+ rx:local0
+ 00:06:50:520914: drop
+ handoffdemo-2: completed packets
+
+ Packet 2
+
+ 00:06:50:520796: handoff_trace
+ HANDED-OFF: from thread 1 trace index 1
+ 00:06:50:520796: handoffdemo-2
+ HANDOFFDEMO: current thread 2
+ 00:06:50:520867: error-drop
+ rx:local0
+ 00:06:50:520914: drop
+ handoffdemo-2: completed packets
+
+ Packet 3
+
+ 00:06:50:520796: handoff_trace
+ HANDED-OFF: from thread 1 trace index 2
+ 00:06:50:520796: handoffdemo-2
+ HANDOFFDEMO: current thread 2
+ 00:06:50:520867: error-drop
+ rx:local0
+ 00:06:50:520914: drop
+ handoffdemo-2: completed packets
+
+ Packet 4
+
+ 00:06:50:520796: handoff_trace
+ HANDED-OFF: from thread 1 trace index 3
+ 00:06:50:520796: handoffdemo-2
+ HANDOFFDEMO: current thread 2
+ 00:06:50:520867: error-drop
+ rx:local0
+ 00:06:50:520914: drop
+ handoffdemo-2: completed packets
+
+ Packet 5
+
+ 00:06:50:520796: handoff_trace
+ HANDED-OFF: from thread 1 trace index 4
+ 00:06:50:520796: handoffdemo-2
+ HANDOFFDEMO: current thread 2
+ 00:06:50:520867: error-drop
+ rx:local0
+ 00:06:50:520914: drop
+ handoffdemo-2: completed packets
+ DBGvpp#
diff --git a/docs/developer/corearchitecture/vnet.rst b/docs/developer/corearchitecture/vnet.rst
new file mode 100644
index 00000000000..812e2fb4f8a
--- /dev/null
+++ b/docs/developer/corearchitecture/vnet.rst
@@ -0,0 +1,807 @@
+VNET (VPP Network Stack)
+========================
+
+The files associated with the VPP network stack layer are located in the
+*./src/vnet* folder. The Network Stack Layer is basically an
+instantiation of the code in the other layers. This layer has a vnet
+library that provides vectorized layer-2 and 3 networking graph nodes, a
+packet generator, and a packet tracer.
+
+In terms of building a packet processing application, vnet provides a
+platform-independent subgraph to which one connects a couple of
+device-driver nodes.
+
+Typical RX connections include “ethernet-input” [full software
+classification, feeds ipv4-input, ipv6-input, arp-input etc.] and
+“ipv4-input-no-checksum” [if hardware can classify, perform ipv4 header
+checksum].
+
+Effective graph dispatch function coding
+----------------------------------------
+
+Over the last 15 years, multiple coding styles have emerged: a
+single/dual/quad loop coding model (with variations) and a
+fully-pipelined coding model.
+
+Single/dual loops
+-----------------
+
+The single/dual/quad loop model variations conveniently solve problems
+where the number of items to process is not known in advance: typical
+hardware RX-ring processing. This coding style is also very effective
+when a given node will not need to cover a complex set of dependent
+reads.
+
+Here is a quad/single loop which can leverage up-to-avx512 SIMD vector
+units to convert buffer indices to buffer pointers:
+
+.. code:: c
+
+ static uword
+ simulated_ethernet_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t *
+ node, vlib_frame_t * frame)
+ {
+ u32 n_left_from, *from;
+ u32 next_index = 0;
+ u32 n_bytes;
+ u32 thread_index = vm->thread_index;
+ vnet_main_t *vnm = vnet_get_main ();
+ vnet_interface_main_t *im = &vnm->interface_main;
+ vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
+ u16 nexts[VLIB_FRAME_SIZE], *next;
+
+ n_left_from = frame->n_vectors;
+ from = vlib_frame_vector_args (frame);
+
+ /*
+ * Convert up to VLIB_FRAME_SIZE indices in "from" to
+ * buffer pointers in bufs[]
+ */
+ vlib_get_buffers (vm, from, bufs, n_left_from);
+ b = bufs;
+ next = nexts;
+
+ /*
+ * While we have at least 4 vector elements (pkts) to process..
+ */
+ while (n_left_from >= 4)
+ {
+ /* Prefetch next quad-loop iteration. */
+ if (PREDICT_TRUE (n_left_from >= 8))
+ {
+ vlib_prefetch_buffer_header (b[4], STORE);
+ vlib_prefetch_buffer_header (b[5], STORE);
+ vlib_prefetch_buffer_header (b[6], STORE);
+ vlib_prefetch_buffer_header (b[7], STORE);
+ }
+
+ /*
+ * $$$ Process 4x packets right here...
+ * set next[0..3] to send the packets where they need to go
+ */
+
+ do_something_to (b[0]);
+ do_something_to (b[1]);
+ do_something_to (b[2]);
+ do_something_to (b[3]);
+
+ /* Process the next 0..4 packets */
+ b += 4;
+ next += 4;
+ n_left_from -= 4;
+ }
+ /*
+ * Clean up 0...3 remaining packets at the end of the incoming frame
+ */
+ while (n_left_from > 0)
+ {
+ /*
+ * $$$ Process one packet right here...
+ * set next[0] to send the packet where it needs to go
+ */
+ do_something_to (b[0]);
+
+ /* Process the next packet */
+ b += 1;
+ next += 1;
+ n_left_from -= 1;
+ }
+
+ /*
+ * Send the packets along their respective next-node graph arcs
+ * Considerable locality of reference is expected, most if not all
+ * packets in the inbound vector will traverse the same next-node
+ * arc
+ */
+ vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
+
+ return frame->n_vectors;
+ }
+
+Given a packet processing task to implement, it pays to scout around
+looking for similar tasks, and think about using the same coding
+pattern. It is not uncommon to recode a given graph node dispatch
+function several times during performance optimization.
+
+Creating Packets from Scratch
+-----------------------------
+
+At times, it’s necessary to create packets from scratch and send them.
+Tasks like sending keepalives or actively opening connections come to
+mind. It’s not difficult, but accurate buffer metadata setup is required.
+
+Allocating Buffers
+~~~~~~~~~~~~~~~~~~
+
+Use vlib_buffer_alloc, which allocates a set of buffer indices. For
+low-performance applications, it’s OK to allocate one buffer at a time.
+Note that vlib_buffer_alloc(…) does NOT initialize buffer metadata. See
+below.
+
+In high-performance cases, allocate a vector of buffer indices, and hand
+them out from the end of the vector; decrement \_vec_len(..) as buffer
+indices are allocated. See tcp_alloc_tx_buffers(…) and
+tcp_get_free_buffer_index(…) for an example.
+
+Buffer Initialization Example
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following example shows the **main points**, but is not to be
+blindly cut-’n-pasted.
+
+.. code:: c
+
+ u32 bi0;
+ vlib_buffer_t *b0;
+ ip4_header_t *ip;
+ udp_header_t *udp;
+
+ /* Allocate a buffer */
+ if (vlib_buffer_alloc (vm, &bi0, 1) != 1)
+ return -1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ /* At this point b0->current_data = 0, b0->current_length = 0 */
+
+ /*
+ * Copy data into the buffer. This example ASSUMES that data will fit
+ * in a single buffer, and is e.g. an ip4 packet.
+ */
+ if (have_packet_rewrite)
+ {
+ clib_memcpy (b0->data, data, vec_len (data));
+ b0->current_length = vec_len (data);
+ }
+ else
+ {
+ /* OR, build a udp-ip packet (for example) */
+ ip = vlib_buffer_get_current (b0);
+ udp = (udp_header_t *) (ip + 1);
+ data_dst = (u8 *) (udp + 1);
+
+ ip->ip_version_and_header_length = 0x45;
+ ip->ttl = 254;
+ ip->protocol = IP_PROTOCOL_UDP;
+ ip->length = clib_host_to_net_u16 (sizeof (*ip) + sizeof (*udp) +
+ vec_len(udp_data));
+ ip->src_address.as_u32 = src_address->as_u32;
+ ip->dst_address.as_u32 = dst_address->as_u32;
+ udp->src_port = clib_host_to_net_u16 (src_port);
+ udp->dst_port = clib_host_to_net_u16 (dst_port);
+ udp->length = clib_host_to_net_u16 (sizeof (*udp) + vec_len (udp_data));
+ clib_memcpy (data_dst, udp_data, vec_len(udp_data));
+
+ if (compute_udp_checksum)
+ {
+ /* RFC 7011 section 10.3.2. */
+ udp->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ip);
+ if (udp->checksum == 0)
+ udp->checksum = 0xffff;
+ }
+ b0->current_length = sizeof (*ip) + sizeof (*udp) +
+ vec_len (udp_data);
+
+ }
+ b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ /* sw_if_index 0 is the "local" interface, which always exists */
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0;
+
+ /* Use the default FIB index for tx lookup. Set non-zero to use another fib */
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = 0;
+
+If your use-case calls for large packet transmission, use
+vlib_buffer_chain_append_data_with_alloc(…) to create the requisite
+buffer chain.
+
+Enqueueing packets for lookup and transmission
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The simplest way to send a set of packets is to use
+vlib_get_frame_to_node(…) to allocate fresh frame(s) to ip4_lookup_node
+or ip6_lookup_node, add the constructed buffer indices, and dispatch the
+frame using vlib_put_frame_to_node(…).
+
+.. code:: c
+
+ vlib_frame_t *f;
+ f = vlib_get_frame_to_node (vm, ip4_lookup_node.index);
+ f->n_vectors = vec_len(buffer_indices_to_send);
+ to_next = vlib_frame_vector_args (f);
+
+ for (i = 0; i < vec_len (buffer_indices_to_send); i++)
+ to_next[i] = buffer_indices_to_send[i];
+
+ vlib_put_frame_to_node (vm, ip4_lookup_node.index, f);
+
+It is inefficient to allocate and schedule single packet frames. That’s
+typical in case you need to send one packet per second, but should
+**not** occur in a for-loop!
+
+Packet tracer
+-------------
+
+Vlib includes a frame element [packet] trace facility, with a simple
+debug CLI interface. The cli is straightforward: “trace add
+input-node-name count” to start capturing packet traces.
+
+To trace 100 packets on a typical x86_64 system running the dpdk plugin:
+“trace add dpdk-input 100”. When using the packet generator: “trace add
+pg-input 100”
+
+To display the packet trace: “show trace”
+
+Each graph node has the opportunity to capture its own trace data. It is
+almost always a good idea to do so. The trace capture APIs are simple.
+
+The packet capture APIs snapshot binary data, to minimize processing at
+capture time. Each participating graph node initialization provides a
+vppinfra format-style user function to pretty-print data when required
+by the VLIB “show trace” command.
+
+Set the VLIB node registration “.format_trace” member to the name of the
+per-graph node format function.
+
+Here’s a simple example:
+
+.. code:: c
+
+ u8 * my_node_format_trace (u8 * s, va_list * args)
+ {
+ vlib_main_t * vm = va_arg (*args, vlib_main_t *);
+ vlib_node_t * node = va_arg (*args, vlib_node_t *);
+ my_node_trace_t * t = va_arg (*args, my_node_trace_t *);
+
+ s = format (s, "My trace data was: %d", t-><whatever>);
+
+ return s;
+ }
+
+The trace framework hands the per-node format function the data it
+captured as the packet whizzed by. The format function pretty-prints the
+data as desired.
+
+Graph Dispatcher Pcap Tracing
+-----------------------------
+
+The vpp graph dispatcher knows how to capture vectors of packets in pcap
+format as they’re dispatched. The pcap captures are as follows:
+
+::
+
+ VPP graph dispatch trace record description:
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Major Version | Minor Version | NStrings | ProtoHint |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Buffer index (big endian) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ + VPP graph node name ... ... | NULL octet |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Buffer Metadata ... ... | NULL octet |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Buffer Opaque ... ... | NULL octet |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Buffer Opaque 2 ... ... | NULL octet |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | VPP ASCII packet trace (if NStrings > 4) | NULL octet |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Packet data (up to 16K) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+Graph dispatch records comprise a version stamp, an indication of how
+many NULL-terminated strings will follow the record header and precede
+packet data, and a protocol hint.
+
+The buffer index is an opaque 32-bit cookie which allows consumers of
+these data to easily filter/track single packets as they traverse the
+forwarding graph.
+
+Multiple records per packet are normal, and to be expected. Packets will
+appear multiple times as they traverse the vpp forwarding graph. In this
+way, vpp graph dispatch traces are significantly different from regular
+network packet captures from an end-station. This property complicates
+stateful packet analysis.
+
+Restricting stateful analysis to records from a single vpp graph node
+such as “ethernet-input” seems likely to improve the situation.
+
+As of this writing: major version = 1, minor version = 0. Nstrings
+SHOULD be 4 or 5. Consumers SHOULD be wary of values less than 4 or greater
+than 5. They MAY attempt to display the claimed number of strings, or
+they MAY treat the condition as an error.
+
+Here is the current set of protocol hints:
+
+.. code:: c
+
+ typedef enum
+ {
+ VLIB_NODE_PROTO_HINT_NONE = 0,
+ VLIB_NODE_PROTO_HINT_ETHERNET,
+ VLIB_NODE_PROTO_HINT_IP4,
+ VLIB_NODE_PROTO_HINT_IP6,
+ VLIB_NODE_PROTO_HINT_TCP,
+ VLIB_NODE_PROTO_HINT_UDP,
+ VLIB_NODE_N_PROTO_HINTS,
+ } vlib_node_proto_hint_t;
+
+Example: VLIB_NODE_PROTO_HINT_IP6 means that the first octet of packet
+data SHOULD be 0x60, and should begin an ipv6 packet header.
+
+Downstream consumers of these data SHOULD pay attention to the protocol
+hint. They MUST tolerate inaccurate hints, which MAY occur from time to
+time.
+
+Dispatch Pcap Trace Debug CLI
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To start a dispatch trace capture of up to 10,000 trace records:
+
+::
+
+ pcap dispatch trace on max 10000 file dispatch.pcap
+
+To start a dispatch trace which will also include standard vpp packet
+tracing for packets which originate in dpdk-input:
+
+::
+
+ pcap dispatch trace on max 10000 file dispatch.pcap buffer-trace dpdk-input 1000
+
+To save the pcap trace, e.g. in /tmp/dispatch.pcap:
+
+::
+
+ pcap dispatch trace off
+
+Wireshark dissection of dispatch pcap traces
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It almost goes without saying that we built a companion wireshark
+dissector to display these traces. As of this writing, we have
+upstreamed the wireshark dissector.
+
+Since it will be a while before wireshark/master/latest makes it into
+all of the popular Linux distros, please see the “How to build a vpp
+dispatch trace aware Wireshark” page for build info.
+
+Here is a sample packet dissection, with some fields omitted for
+clarity. The point is that the wireshark dissector accurately displays
+**all** of the vpp buffer metadata, and the name of the graph node in
+question.
+
+::
+
+ Frame 1: 2216 bytes on wire (17728 bits), 2216 bytes captured (17728 bits)
+ Encapsulation type: USER 13 (58)
+ [Protocols in frame: vpp:vpp-metadata:vpp-opaque:vpp-opaque2:eth:ethertype:ip:tcp:data]
+ VPP Dispatch Trace
+ BufferIndex: 0x00036663
+ NodeName: ethernet-input
+ VPP Buffer Metadata
+ Metadata: flags:
+ Metadata: current_data: 0, current_length: 102
+ Metadata: current_config_index: 0, flow_id: 0, next_buffer: 0
+ Metadata: error: 0, n_add_refs: 0, buffer_pool_index: 0
+ Metadata: trace_index: 0, recycle_count: 0, len_not_first_buf: 0
+ Metadata: free_list_index: 0
+ Metadata:
+ VPP Buffer Opaque
+ Opaque: raw: 00000007 ffffffff 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
+ Opaque: sw_if_index[VLIB_RX]: 7, sw_if_index[VLIB_TX]: -1
+ Opaque: L2 offset 0, L3 offset 0, L4 offset 0, feature arc index 0
+ Opaque: ip.adj_index[VLIB_RX]: 0, ip.adj_index[VLIB_TX]: 0
+ Opaque: ip.flow_hash: 0x0, ip.save_protocol: 0x0, ip.fib_index: 0
+ Opaque: ip.save_rewrite_length: 0, ip.rpf_id: 0
+ Opaque: ip.icmp.type: 0 ip.icmp.code: 0, ip.icmp.data: 0x0
+ Opaque: ip.reass.next_index: 0, ip.reass.estimated_mtu: 0
+ Opaque: ip.reass.fragment_first: 0 ip.reass.fragment_last: 0
+ Opaque: ip.reass.range_first: 0 ip.reass.range_last: 0
+ Opaque: ip.reass.next_range_bi: 0x0, ip.reass.ip6_frag_hdr_offset: 0
+ Opaque: mpls.ttl: 0, mpls.exp: 0, mpls.first: 0, mpls.save_rewrite_length: 0, mpls.bier.n_bytes: 0
+ Opaque: l2.feature_bitmap: 00000000, l2.bd_index: 0, l2.l2_len: 0, l2.shg: 0, l2.l2fib_sn: 0, l2.bd_age: 0
+ Opaque: l2.feature_bitmap_input: none configured, L2.feature_bitmap_output: none configured
+ Opaque: l2t.next_index: 0, l2t.session_index: 0
+ Opaque: l2_classify.table_index: 0, l2_classify.opaque_index: 0, l2_classify.hash: 0x0
+ Opaque: policer.index: 0
+ Opaque: ipsec.flags: 0x0, ipsec.sad_index: 0
+ Opaque: map.mtu: 0
+ Opaque: map_t.v6.saddr: 0x0, map_t.v6.daddr: 0x0, map_t.v6.frag_offset: 0, map_t.v6.l4_offset: 0
+ Opaque: map_t.v6.l4_protocol: 0, map_t.checksum_offset: 0, map_t.mtu: 0
+ Opaque: ip_frag.mtu: 0, ip_frag.next_index: 0, ip_frag.flags: 0x0
+ Opaque: cop.current_config_index: 0
+ Opaque: lisp.overlay_afi: 0
+ Opaque: tcp.connection_index: 0, tcp.seq_number: 0, tcp.seq_end: 0, tcp.ack_number: 0, tcp.hdr_offset: 0, tcp.data_offset: 0
+ Opaque: tcp.data_len: 0, tcp.flags: 0x0
+ Opaque: sctp.connection_index: 0, sctp.sid: 0, sctp.ssn: 0, sctp.tsn: 0, sctp.hdr_offset: 0
+ Opaque: sctp.data_offset: 0, sctp.data_len: 0, sctp.subconn_idx: 0, sctp.flags: 0x0
+ Opaque: snat.flags: 0x0
+ Opaque:
+ VPP Buffer Opaque2
+ Opaque2: raw: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
+ Opaque2: qos.bits: 0, qos.source: 0
+ Opaque2: loop_counter: 0
+ Opaque2: gbp.flags: 0, gbp.src_epg: 0
+ Opaque2: pg_replay_timestamp: 0
+ Opaque2:
+ Ethernet II, Src: 06:d6:01:41:3b:92 (06:d6:01:41:3b:92), Dst: IntelCor_3d:f6
+ Transmission Control Protocol, Src Port: 22432, Dst Port: 54084, Seq: 1, Ack: 1, Len: 36
+ Source Port: 22432
+ Destination Port: 54084
+ TCP payload (36 bytes)
+ Data (36 bytes)
+
+ 0000 cf aa 8b f5 53 14 d4 c7 29 75 3e 56 63 93 9d 11 ....S...)u>Vc...
+ 0010 e5 f2 92 27 86 56 4c 21 ce c5 23 46 d7 eb ec 0d ...'.VL!..#F....
+ 0020 a8 98 36 5a ..6Z
+ Data: cfaa8bf55314d4c729753e5663939d11e5f2922786564c21…
+ [Length: 36]
+
+It’s a matter of a couple of mouse-clicks in Wireshark to filter the
+trace to a specific buffer index. With that specific kind of filtration,
+one can watch a packet walk through the forwarding graph; noting any/all
+metadata changes, header checksum changes, and so forth.
+
+This should be of significant value when developing new vpp graph nodes.
+If new code mispositions b->current_data, it will be completely obvious
+from looking at the dispatch trace in wireshark.
+
+pcap rx, tx, and drop tracing
+-----------------------------
+
+vpp also supports rx, tx, and drop packet capture in pcap format,
+through the “pcap trace” debug CLI command.
+
+This command is used to start or stop a packet capture, or show the
+status of packet capture. Each of “pcap trace rx”, “pcap trace tx”, and
+“pcap trace drop” is implemented. Supply one or more of “rx”, “tx”, and
+“drop” to enable multiple simultaneous capture types.
+
+These commands have the following optional parameters:
+
+- rx - trace received packets.
+
+- tx - trace transmitted packets.
+
+- drop - trace dropped packets.
+
+- max *nnnn*\ - file size, number of packet captures. Once *nnnn*
+ packets have been received, the trace buffer is flushed to the
+ indicated file. Defaults to 1000. Can only be updated if packet
+ capture is off.
+
+- max-bytes-per-pkt *nnnn*\ - maximum number of bytes to trace on a
+ per-packet basis. Must be >32 and less than 9000. Default value:
+ 512.
+
+- filter - Use the pcap rx / tx / drop trace filter, which must be
+ configured. Use classify filter pcap… to configure the filter. The
+ filter will only be executed if the per-interface or any-interface
+ tests fail.
+
+- intfc *interface* \| *any*\ - Used to specify a given interface, or
+ use ‘any’ to run packet capture on all interfaces. ‘any’ is the
+ default if not provided. Settings from a previous packet capture are
+ preserved, so ‘any’ can be used to reset the interface setting.
+
+- file *filename*\ - Used to specify the output filename. The file
+ will be placed in the ‘/tmp’ directory. If *filename* already exists,
+ file will be overwritten. If no filename is provided, ‘/tmp/rx.pcap
+ or tx.pcap’ will be used, depending on capture direction. Can only be
+ updated when pcap capture is off.
+
+- status - Displays the current status and configured attributes
+ associated with a packet capture. If packet capture is in progress,
+ ‘status’ also will return the number of packets currently in the
+ buffer. Any additional attributes entered on command line with a
+ ‘status’ request will be ignored.
+
+- filter - Capture packets which match the current packet trace filter
+ set. See next section. Configure the capture filter first.
+
+packet trace capture filtering
+------------------------------
+
+The “classify filter pcap \| <interface-name> \| trace” debug CLI command
+constructs an arbitrary set of packet classifier tables for use with
+“pcap trace rx \| tx \| drop,” and with the vpp packet tracer on a
+per-interface or system-wide basis.
+
+Packets which match a rule in the classifier table chain will be traced.
+The tables are automatically ordered so that matches in the most
+specific table are tried first.
+
+It’s reasonably likely that folks will configure a single table with one
+or two matches. As a result, we configure 8 hash buckets and 128K of
+match rule space by default. One can override the defaults by specifying
+“buckets <nnn>” and “memory-size <nnn>” as desired.
+
+To build up complex filter chains, repeatedly issue the classify filter
+debug CLI command. Each command must specify the desired mask and match
+values. If a classifier table with a suitable mask already exists, the
+CLI command adds a match rule to the existing table. If not, the CLI
+command adds a new table and the indicated mask rule.
+
+Configure a simple pcap classify filter
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ classify filter pcap mask l3 ip4 src match l3 ip4 src 192.168.1.11
+ pcap trace rx max 100 filter
+
+Configure a simple per-interface capture filter
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ classify filter GigabitEthernet3/0/0 mask l3 ip4 src match l3 ip4 src 192.168.1.11
+ pcap trace rx max 100 intfc GigabitEthernet3/0/0
+
+Note that per-interface capture filters are *always* applied.
+
+Clear per-interface capture filters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ classify filter GigabitEthernet3/0/0 del
+
+Configure another fairly simple pcap classify filter
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ classify filter pcap mask l3 ip4 src dst match l3 ip4 src 192.168.1.10 dst 192.168.2.10
+ pcap trace tx max 100 filter
+
+Configure a vpp packet tracer filter
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ classify filter trace mask l3 ip4 src dst match l3 ip4 src 192.168.1.10 dst 192.168.2.10
+ trace add dpdk-input 100 filter
+
+Clear all current classifier filters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ classify filter [pcap | <interface> | trace] del
+
+To inspect the classifier tables
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ show classify table [verbose]
+
+The verbose form displays all of the match rules, with hit-counters.
+
+Terse description of the “mask <xxx>” syntax:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ l2 src dst proto tag1 tag2 ignore-tag1 ignore-tag2 cos1 cos2 dot1q dot1ad
+ l3 ip4 <ip4-mask> ip6 <ip6-mask>
+ <ip4-mask> version hdr_length src[/width] dst[/width]
+ tos length fragment_id ttl protocol checksum
+ <ip6-mask> version traffic-class flow-label src dst proto
+ payload_length hop_limit protocol
+ l4 tcp <tcp-mask> udp <udp-mask> src_port dst_port
+ <tcp-mask> src dst # ports
+ <udp-mask> src_port dst_port
+
+To construct **matches**, add the values to match after the indicated
+keywords in the mask syntax. For example: “… mask l3 ip4 src” -> “…
+match l3 ip4 src 192.168.1.11”
+
+VPP Packet Generator
+--------------------
+
+We use the VPP packet generator to inject packets into the forwarding
+graph. The packet generator can replay pcap traces, and generate packets
+out of whole cloth at respectably high performance.
+
+The VPP pg enables quite a variety of use-cases, ranging from functional
+testing of new data-plane nodes to regression testing to performance
+tuning.
+
+PG setup scripts
+----------------
+
+PG setup scripts describe traffic in detail, and leverage vpp debug CLI
+mechanisms. It’s reasonably unusual to construct a pg setup script which
+doesn’t include a certain amount of interface and FIB configuration.
+
+For example:
+
+::
+
+ loop create
+ set int ip address loop0 192.168.1.1/24
+ set int state loop0 up
+
+ packet-generator new {
+ name pg0
+ limit 100
+ rate 1e6
+ size 300-300
+ interface loop0
+ node ethernet-input
+ data { IP4: 1.2.3 -> 4.5.6
+ UDP: 192.168.1.10 - 192.168.1.254 -> 192.168.2.10
+ UDP: 1234 -> 2345
+ incrementing 286
+ }
+ }
+
+A packet generator stream definition includes two major sections:
+
+- Stream Parameter Setup
+- Packet Data
+
+Stream Parameter Setup
+~~~~~~~~~~~~~~~~~~~~~~
+
+Given the example above, let’s look at how to set up stream parameters:
+
+- **name pg0** - Name of the stream, in this case “pg0”
+
+- **limit 100** - Number of packets to send when the stream is
+ enabled. “limit 0” means send packets continuously.
+
+- **maxframe <nnn>** - Maximum frame size. Handy for injecting multiple
+ frames no larger than <nnn>. Useful for checking dual / quad loop
+ codes
+
+- **rate 1e6** - Packet injection rate, in this case 1 MPPS. When not
+ specified, the packet generator injects packets as fast as possible
+
+- **size 300-300** - Packet size range, in this case send 300-byte
+ packets
+
+- **interface loop0** - Packets appear as if they were received on the
+ specified interface. This datum is used in multiple ways: to select
+ graph arc feature configuration, to select IP FIBs. Configure
+ features e.g. on loop0 to exercise those features.
+
+- **tx-interface <name>** - Packets will be transmitted on the
+ indicated interface. Typically required only when injecting packets
+ into post-IP-rewrite graph nodes.
+
+- **pcap <filename>** - Replay packets from the indicated pcap capture
+ file. “make test” makes extensive use of this feature: generate
+ packets using scapy, save them in a .pcap file, then inject them into
+ the vpp graph via a vpp pg “pcap <filename>” stream definition
+
+- **worker <nn>** - Generate packets for the stream using the indicated
+ vpp worker thread. The vpp pg generates and injects O(10 MPPS /
+ core). Use multiple stream definitions and worker threads to generate
+ and inject enough traffic to easily fill a 40 gbit pipe with small
+ packets.
+
+Data definition
+~~~~~~~~~~~~~~~
+
+Packet generator data definitions make use of a layered implementation
+strategy. Networking layers are specified in order, and the notation can
+seem a bit counter-intuitive. In the example above, the data definition
+stanza constructs a set of L2-L4 header layers, and uses an
+incrementing fill pattern to round out the requested 300-byte packets.
+
+- **IP4: 1.2.3 -> 4.5.6** - Construct an L2 (MAC) header with the ip4
+ ethertype (0x800), src MAC address of 00:01:00:02:00:03 and dst MAC
+ address of 00:04:00:05:00:06. Mac addresses may be specified in
+ either *xxxx.xxxx.xxxx* format or *xx:xx:xx:xx:xx:xx* format.
+
+- **UDP: 192.168.1.10 - 192.168.1.254 -> 192.168.2.10** - Construct an
+ incrementing set of L3 (IPv4) headers for successive packets with
+ source addresses ranging from .10 to .254. All packets in the stream
+ have a constant dest address of 192.168.2.10. Set the protocol field
+ to 17, UDP.
+
+- **UDP: 1234 -> 2345** - Set the UDP source and destination ports to
+ 1234 and 2345, respectively
+
+- **incrementing 286** - Insert up to 286 incrementing data bytes.
+
+Obvious variations involve “s/IP4/IP6/” in the above, along with
+changing from IPv4 to IPv6 address notation.
+
+The vpp pg can set any / all IPv4 header fields, including tos, packet
+length, mf / df / fragment id and offset, ttl, protocol, checksum, and
+src/dst addresses. Take a look at ../src/vnet/ip/ip[46]_pg.c for
+details.
+
+If all else fails, specify the entire packet data in hex:
+
+- **hex 0xabcd…** - copy hex data verbatim into the packet
+
+When replaying pcap files (“**pcap <filename>**”), do not specify a data
+stanza.
+
+Diagnosing “packet-generator new” parse failures
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you want to inject packets into a brand-new graph node, remember to
+tell the packet generator debug CLI how to parse the packet data stanza.
+
+If the node expects L2 Ethernet MAC headers, specify “.unformat_buffer =
+unformat_ethernet_header”:
+
+.. code:: c
+
+ VLIB_REGISTER_NODE (ethernet_input_node) =
+ {
+ <snip>
+ .unformat_buffer = unformat_ethernet_header,
+ <snip>
+ };
+
+Beyond that, it may be necessary to set breakpoints in
+…/src/vnet/pg/cli.c. Debug image suggested.
+
+When debugging new nodes, it may be far simpler to directly inject
+ethernet frames - and add a corresponding vlib_buffer_advance in the new
+node - than to modify the packet generator.
+
+Debug CLI
+---------
+
+The descriptions above describe the “packet-generator new” debug CLI in
+detail.
+
+Additional debug CLI commands include:
+
+::
+
+ vpp# packet-generator enable [<stream-name>]
+
+which enables the named stream, or all streams.
+
+::
+
+ vpp# packet-generator disable [<stream-name>]
+
+disables the named stream, or all streams.
+
+::
+
+ vpp# packet-generator delete <stream-name>
+
+Deletes the named stream.
+
+::
+
+ vpp# packet-generator configure <stream-name> [limit <nnn>]
+ [rate <f64-pps>] [size <nn>-<nn>]
+
+Changes stream parameters without having to recreate the entire stream
+definition. Note that re-issuing a “packet-generator new” command will
+correctly recreate the named stream.
diff --git a/docs/developer/corefeatures/bfd_doc.rst b/docs/developer/corefeatures/bfd_doc.rst
new file mode 120000
index 00000000000..6e9fdd5e508
--- /dev/null
+++ b/docs/developer/corefeatures/bfd_doc.rst
@@ -0,0 +1 @@
+../../../src/vnet/bfd/bfd_doc.rst \ No newline at end of file
diff --git a/docs/developer/corefeatures/eventviewer.rst b/docs/developer/corefeatures/eventviewer.rst
new file mode 100644
index 00000000000..21d5fa95275
--- /dev/null
+++ b/docs/developer/corefeatures/eventviewer.rst
@@ -0,0 +1,286 @@
+.. _eventviewer:
+
+Event-logger
+============
+
+The vppinfra event logger provides very lightweight (sub-100ns)
+precisely time-stamped event-logging services. See
+./src/vppinfra/{elog.c, elog.h}
+
+Serialization support makes it easy to save and ultimately to combine a
+set of event logs. In a distributed system running NTP over a local LAN,
+we find that event logs collected from multiple system elements can be
+combined with a temporal uncertainty no worse than 50us.
+
+A typical event definition and logging call looks like this:
+
+.. code-block:: c
+
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "tx-msg: stream %d local seq %d attempt %d",
+ .format_args = "i4i4i4",
+ };
+ struct { u32 stream_id, local_sequence, retry_count; } * ed;
+ ed = ELOG_DATA (m->elog_main, e);
+ ed->stream_id = stream_id;
+ ed->local_sequence = local_sequence;
+ ed->retry_count = retry_count;
+
+The ELOG\_DATA macro returns a pointer to 20 bytes worth of arbitrary
+event data, to be formatted (offline, not at runtime) as described by
+format\_args. Aside from obvious integer formats, the CLIB event logger
+provides a couple of interesting additions. The "t4" format
+pretty-prints enumerated values:
+
+.. code-block:: c
+
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "get_or_create: %s",
+ .format_args = "t4",
+ .n_enum_strings = 2,
+ .enum_strings = { "old", "new", },
+ };
+
+The "t" format specifier indicates that the corresponding datum is an
+index in the event's set of enumerated strings, as shown in the previous
+event type definition.
+
+The “T” format specifier indicates that the corresponding datum is an
+index in the event log’s string heap. This allows the programmer to emit
+arbitrary formatted strings. One often combines this facility with a
+hash table to keep the event-log string heap from growing arbitrarily
+large.
+
+Noting the 20-octet limit per-log-entry data field, the event log
+formatter supports arbitrary combinations of these data types. As in:
+the ".format_args" field may contain one or more instances of the following:
+
+- i1 - 8-bit unsigned integer
+- i2 - 16-bit unsigned integer
+- i4 - 32-bit unsigned integer
+- i8 - 64-bit unsigned integer
+- f4 - float
+- f8 - double
+- s - NULL-terminated string - be careful
+- sN - N-byte character array
+- t1,2,4 - per-event enumeration ID
+- T4 - Event-log string table offset
+
+The vpp engine event log is thread-safe, and is shared by all threads.
+Take care not to serialize the computation. Although the event-logger is
+about as fast as practicable, it's not appropriate for per-packet use in
+hard-core data plane code. It's most appropriate for capturing rare
+events - link up-down events, specific control-plane events and so
+forth.
+
+The vpp engine has several debug CLI commands for manipulating its event
+log:
+
+.. code-block:: console
+
+ vpp# event-logger clear
+ vpp# event-logger save <filename> # for security, writes into /tmp/<filename>.
+ # <filename> must not contain '.' or '/' characters
+ vpp# show event-logger [all] [<nnn>] # display the event log
+ # by default, the last 250 entries
+
+The event log defaults to 128K entries. The command-line argument "...
+vlib { elog-events nnn } ..." configures the size of the event log.
+
+As described above, the vpp engine event log is thread-safe and shared.
+To avoid confusing non-appearance of events logged by worker threads,
+make sure to code vlib\_global\_main.elog\_main - instead of
+vm->elog\_main. The latter form is correct in the main thread, but
+will almost certainly produce bad results in worker threads.
+
+G2 graphical event viewer
+-------------------------
+
+The G2 graphical event viewer can display serialized vppinfra event logs
+directly, or via the c2cpel tool. G2 is a fine-grained event-log viewer. It's
+highly scalable, supporting O(1e7 events) and O(1e3 discrete display "tracks").
+G2 displays binary data generated by the vppinfra "elog.[ch]" logger component,
+and also supports the CPEL file format, as described in this section.
+
+Building G2
+~~~~~~~~~~~
+
+This link describes :ref:`how to build G2 <building-g2>`
+
+Setting the Display Preferences
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The file $<*HOMEDIR*>/.g2 contains display preferences, which can be overridden.
+Simply un-comment one of the stanzas shown below, or experiment as desired.
+
+.. code-block:: c
+
+ /*
+ * Property / parameter settings for G2
+ *
+ * Setting for a 1024x768 display:
+ * event_selector_lines=20
+ * drawbox_height=800
+ * drawbox_width=600
+ *
+ * new mac w/ no monitor:
+ * event_selector_lines=20
+ * drawbox_height=1200
+ * drawbox_width=700
+ *
+ * 1600x1200:
+ * drawbox_width=1200
+ * drawbox_height=1000
+ * event_selector_lines=25
+ *
+ * for making screenshots on a Macbook Pro
+ * drawbox_width=1200
+ * drawbox_height=600
+ * event_selector_lines=20
+ */
+
+Screen Taxonomy
+~~~~~~~~~~~~~~~
+
+Here is an annotated G2 viewer screenshot, corresponding to activity during BGP
+prefix download. This data was captured on a Cisco IOS-XR system:
+
+.. figure:: /_images/g21.jpg
+ :scale: 75%
+
+
+The viewer has two main scrollbars: the horizontal axis scrollbar shifts the main
+drawing area in time; the vertical axis changes the set of visible process traces.
+The zoomin / zoomout operators change the time scale.
+
+The event selector PolyCheckMenu changes the set of displayed events.
+Using these tools -- and some patience -- you can understand a given event log.
+
+Mouse Gestures
+~~~~~~~~~~~~~~~
+
+G2 has three fairly sophisticated mouse gesture interfaces, which are worth describing
+in detail. First, a left mouse click on a display event pops up a per-event detail box.
+
+.. figure:: /_images/g22.jpg
+ :scale: 75%
+
+A left mouse click on an event detail box closes it.
+To zoom to a region of the display, press and hold the left mouse button, then drag
+right or left until the zoom-fence pair appears:
+
+.. figure:: /_images/g23.jpg
+ :scale: 75%
+
+When the zoom operation completes, the display is as follows:
+
+.. figure:: /_images/g24.jpg
+
+A click on any of the figures will show them at full resolution; a right-click will open the figures in new tabs.
+
+Time Ruler
+~~~~~~~~~~
+
+To use a time ruler, press and hold the right mouse button; drag right or left
+until the ruler measures the region of interest. If the time axis scale is coarse,
+event boxes can have significant width in time, so use a "reference point" in
+each event box when using the time ruler.
+
+.. figure:: /_images/g25.jpg
+ :scale: 75%
+
+Event Selection
+~~~~~~~~~~~~~~~
+
+Changing the Event Selector setup controls the set of points displayed in an
+obvious way. Here, we suppress all events except "this thread is now running on the CPU":
+
+.. figure:: /_images/g26.jpg
+ :scale: 75%
+
+Same setup, with all events displayed:
+
+.. figure:: /_images/g27.jpg
+ :scale: 75%
+
+Note that event detail boxes previously shown, but suppressed due to deselection
+of the event code will reappear when one reselects the event code. In the example
+above, the "THREAD/THREADY pid:491720 tid:12" detail box appears in this fashion.
+
+Snapshot Ring
+~~~~~~~~~~~~~
+
+Three buttons in the lower left-hand corner of the g2 main window control the snapshot
+ring. Snapshots are simply saved views: maneuver the viewer into an "interesting"
+configuration, then press the "Snap" button to add a snapshot to the ring.
+
+Click **Next** to restore the next available snapshot. The **Del** button deletes the current snapshot.
+
+See the hotkey section below for access to a quick and easy method to save and
+restore the snapshot ring. Eventually we may add a safe/portable/supported mechanism
+to save/restore the snapshot ring from CPEL and vppinfra event log files.
+
+Chasing Events
+~~~~~~~~~~~~~~
+
+Event chasing sorts the trace axis by occurrence of the last selected event. For
+example, if one selects an event which means "thread running on the CPU" the first
+N displayed traces will be the first M threads to run (N <= M; a thread may run
+more than once). This feature addresses analytic problems caused by the finite size of the drawing area.
+
+In standard (NoChaseEvent) mode, it looks like only BGP threads 5 and 9 are active:
+
+.. figure:: /_images/g28.jpg
+ :scale: 75%
+
+After pressing the ChaseEvent button, we see a different picture:
+
+.. figure:: /_images/g29.jpg
+ :scale: 75%
+
+Burying Boring Tracks
+~~~~~~~~~~~~~~~~~~~~~
+
+The sequence <ctrl><left-mouse-click> moves the track under the mouse to the end
+of the set of tracks, effectively burying it. The sequence <shift><left-mouse-click>
+moves the track under the mouse to the beginning of the set of tracks. The latter
+function probably isn't precisely right--I think we may eventually provide an "undo"
+stack to provide precise thread exhumation.
+
+Summary Mode
+~~~~~~~~~~~~
+
+Summary mode declutters the screen by rendering events as short vertical line
+segments instead of numbered boxes. Event detail display is unaffected. G2 starts
+in summary mode, zoomed out sufficiently for all events in the trace to be displayed.
+Given a large number of events, summary mode reduces initial screen-paint time to a
+tolerable value. Once you've zoomed in sufficiently, type "e" - enter event mode,
+to enable boxed numeric event display.
+
+Hotkeys
+~~~~~~~
+
+G2 supports the following hotkey actions, supposedly (circa 1996) Quake-like
+according to the feature's original author:
+
++----------------------+--------------------------------------------------------+
+| Key | Function |
++======================+========================================================+
+| w | Zoom-in |
++----------------------+--------------------------------------------------------+
+| s | Zoom-out |
++----------------------+--------------------------------------------------------+
+| a | Scroll Left |
++----------------------+--------------------------------------------------------+
+| d | Scroll Right |
++----------------------+--------------------------------------------------------+
+| e | Toggle between event and summary-event mode |
++----------------------+--------------------------------------------------------+
+| p | Put (write) snapshot ring to snapshots.g2 |
++----------------------+--------------------------------------------------------+
+| l | Load (read) snapshot ring from snapshots.g2 |
++----------------------+--------------------------------------------------------+
+| <ctrl>-q | quit |
++----------------------+--------------------------------------------------------+
diff --git a/docs/developer/corefeatures/fib/attachedexport.rst b/docs/developer/corefeatures/fib/attachedexport.rst
new file mode 100644
index 00000000000..3bf933de679
--- /dev/null
+++ b/docs/developer/corefeatures/fib/attachedexport.rst
@@ -0,0 +1,50 @@
+.. _attachedexport:
+
+Attached Export
+^^^^^^^^^^^^^^^^
+
+Extranets make prefixes in table A also reachable from table B. Table A is the export table,
+B the import. Consider this route in the export table;
+
+.. code-block:: console
+
+ # ip route add table 2 1.1.1.0/24 via 10.10.10.0 GigabitEthernet0/8/0
+
+there are two ways one might consider representing this route in the import VRF:
+
+#. ip route add table 3 1.1.1.0/24 via 10.10.10.0 GigabitEthernet0/8/0
+#. ip route add table 3 1.1.1.0/24 via lookup-in-table 2
+
+where option 2) is an example of a de-aggregate route where a second lookup is
+performed in table 2, the export VRF. Option 2) is clearly less efficient, since
+the cost of the second lookup is high. Option 1) is therefore preferred. However,
+connected and attached prefixes, and specifically the adj-fibs that they cover,
+require special attention. The control plane is aware of the connected and
+attached prefixes that are required to be exported, but it is unaware of the
+adj-fibs. It is therefore the responsibility of FIB to ensure that whenever an
+attached prefix is exported, so are the adj-fibs and local prefixes that it
+covers, and only the adj-fibs and locals, not any covered more-specific prefixes
+(sourced e.g. by API). The imported FIB entries are sourced as *attached-export*;
+this is a low priority source, so if those prefixes already exist in the import
+table, sourced by the API, then they will continue to forward with that information.
+
+.. figure:: /_images/fib20fig6.png
+
+Figure 6: Attached Export Class diagram.
+
+Figure 6 shows the data structures used to perform attached export.
+
+- *fib_import_t*. A representation of the need to import covered prefixes. An instance is associated with the FIB entry in the import VRF. The need to import prefixes is recognised when an attached route is added to a table that is different to the table of the interface to which it is attached. The creation of a *fib_import_t* will trigger the creation of a *fib_export_t*.
+- *fib_export_t*. A representation of the need to export prefixes. An instance is associated with the attached entry in the export VRF. A *fib_export_t* can have many associated *fib_import_t* objects representing multiple VRFs into which the prefix is exported.
+
+.. figure:: /_images/fib20fig7.png
+
+Figure 7: Attached Export object diagram
+
+Figure 7 shows an object instance diagram for the export of a connected prefix from table
+1 to two other tables. The /32 adj-fib and local prefix in the export VRF are
+exported into the import VRFs, where they are sourced as *attached-export* and
+inherit the forwarding information from the exported entry. The attached prefix
+in the import VRF also performs cover tracking with the connected prefix in the
+export VRF so that it can react to updates to that prefix that will require the
+removal of the imported covered prefixes.
diff --git a/docs/developer/corefeatures/fib/barnacles.rst b/docs/developer/corefeatures/fib/barnacles.rst
new file mode 100644
index 00000000000..08e842ade28
--- /dev/null
+++ b/docs/developer/corefeatures/fib/barnacles.rst
@@ -0,0 +1,78 @@
+.. _barnacles:
+
+Barnacles
+---------
+
+Features that are stuck on the side of the FIB. Those that directly use
+the services that the FIB provides.
+
+In the section on FIB fundamentals it was mentioned that there is a
+separation between what to match and how to forward. In an IP FIB what
+to match is the packet's destination address against a table of IP
+prefixes, and how to forward is described by a list of paths (the
+**fib_path_list_t**).
+
+ACL Based Forwarding
+^^^^^^^^^^^^^^^^^^^^
+
+ACL Based Forwarding (ABF) is also known as policy based routing
+(PBR). In ABF what to match is described by an ACL.
+
+ABF uses two VPP services; ACL as a service, as provided by the ACL
+plugin and FIB path-lists. It just glues them together.
+
+An ABF policy is the combination of an ACL with the forwarding
+description of a FIB path-list. An ABF attachment is the association
+of [an ordered set of] ABF policies to an interface. The attachment is
+consulted on the ingress path of the IP DP (as an input
+feature). If the ACL matches then the associated forwarding is
+followed, if not, the packet continues along the DP. Simple.
+
+Layer 3 Cross Connect
+^^^^^^^^^^^^^^^^^^^^^
+
+An L3 cross-connect (L3XC) matches all packets
+that ingress the interface and then forwards using the supplied FIB
+path-list. Naturally it runs as an input feature in the IP
+path. Super simple.
+
+IP Punt
+^^^^^^^
+
+Matches all IP packets that VPP has punted. Why they are punted is not
+relevant. All IP punted packets are sent by VPP to the punt feature
+arc. This feature 'matches' all packets that it receives and forwards
+using the FIB path-list.
+
+
+Unicast Reverse Path Forwarding
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Unicast Reverse Path Forwarding (uRPF) is the process of ensuring that
+a packet has a conforming source address. It comes in two
+flavours:
+
+- loose: The source address must be reachable, i.e. FIB must have a
+ route that will forward to the source address. The default route
+ counts as long as it does not drop.
+- strict: The source address is reachable via the interface on which
+ the packet arrived, i.e. the FIB's route for the source address must
+ include the input interface as an output interface.
+
+The uRPF feature can run on either the input or output IP feature
+arc. In both cases it serves as an anti-spoofing check, though the
+semantics are slightly different. On the input arc it enforces that
+peers on that link are only using source addresses that they should -
+a network admin should employ at the access edge. On the output
+arc it enforces that a packet is sourced from a prefix that belongs to
+the network, i.e. that it has originated from within an SP's
+network, a network admin could use at its peering points.
+
+To perform a uRPF check, the DP performs an IP FIB lookup on the
+source address, this always results in a load-balance (LB) object. If
+the LB has only 1 bucket and that bucket stacks on a drop DPO, then
+both a loose and strict check will fail, otherwise a loose check
+will pass. Each LB object has an associated uRPF list object. This
+object holds the list of interfaces through which the prefix is
+reachable. To pass the strict check, the input/output interface must
+be in this list.
diff --git a/docs/developer/corefeatures/fib/controlplane.rst b/docs/developer/corefeatures/fib/controlplane.rst
new file mode 100644
index 00000000000..8b58d42f5b3
--- /dev/null
+++ b/docs/developer/corefeatures/fib/controlplane.rst
@@ -0,0 +1,23 @@
+.. _controlplane:
+
+The Control Plane
+-----------------
+
+The control plane follows a layered data representation. This document describes the
+model starting from the lowest layer. The description uses IPv4 addresses and
+protocols, but all concepts apply equally to the IPv6 equivalents. The diagrams
+all portray the CLI command to install the information in VPP and
+[an approximation of] a UML diagram [#f1]_ of the data structures used to represent that
+information.
+
+.. toctree::
+
+ neighbors
+ routes
+ attachedexport
+ graphwalks
+ marknsweep
+
+.. rubric:: Footnotes:
+
+.. [#f1] The arrow indicates a ‘has-a’ relationship. The object attached to the arrow head ‘has-a’ instance of the other. The numbers next to the arrows indicate the multiplicity, i.e. object A has n to m instances of object B. The difference between a UML association and aggregation is not conveyed in any diagrams. To UML aficionados, I apologize. Word is not the best drawing tool.
diff --git a/docs/developer/corefeatures/fib/dataplane.rst b/docs/developer/corefeatures/fib/dataplane.rst
new file mode 100644
index 00000000000..94e11d1428c
--- /dev/null
+++ b/docs/developer/corefeatures/fib/dataplane.rst
@@ -0,0 +1,100 @@
+.. _dataplane:
+
+The Data Plane
+---------------
+
+The data-plane data model is a directed, acyclic [#f16]_ graph of heterogeneous objects.
+A packet will forward walk the graph as it is switched. Each object describes
+the actions to perform on the packet. Each object type has an associated VLIB
+graph node. For a packet to forward walk the graph is therefore to move from one
+VLIB node to the next, with each performing the required actions. This is the
+heart of the VPP model.
+
+The data-plane graph is composed of generic data-path objects (DPOs). A parent
+DPO is identified by the tuple:{type,index,next_node}. The *next_node* parameter
+is the index of the VLIB node to which the packets should be sent next, this is
+present to maximise performance - it is important to ensure that the parent does
+not need to be read [#f17]_ whilst processing the child. Specialisations [#f18]_ of the DPO
+perform distinct actions. The most common DPOs and briefly what they represent are:
+
+- Load-balance: a choice in an ECMP set.
+- Adjacency: apply a rewrite and forward through an interface
+- MPLS-label: impose an MPLS label.
+- Lookup: perform another lookup in a different table.
+
+The data-plane graph is derived from the control-plane graph by the objects
+therein 'contributing' a DPO to the data-plane graph. Objects in the data-plane
+contain only the information needed to switch a packet, they are therefore
+simpler, and in memory terms smaller, with the aim to fit one DPO on a single
+cache-line. The derivation from the control plane means that the data-plane
+graph contains only objects whose current state can forward packets. For example,
+the difference between a *fib_path_list_t* and a *load_balance_t* is that the former
+expresses the control-plane's desired state, the latter the data-plane available
+state. If some paths in the path-list are unresolved or down, then the
+load-balance will not include them in the forwarding choice.
+
+.. figure:: /_images/fib20fig8.png
+
+Figure 8: DPO contributions for a non-recursive route
+
+Figure 8 shows a simplified view of the control-plane graph indicating those
+objects that contribute DPOs. Also shown are the VLIB node graphs at which the DPO is used.
+
+Each *fib_entry_t* contributes its own *load_balance_t*, for three reasons:
+
+- The result of a lookup in an IPv[46] table is a single 32 bit unsigned integer. This is an index into a memory pool. Consequently the object type must be the same for each result. Some routes will need a load-balance and some will not, but to insert another object in the graph to represent this choice is a waste of cycles, so the load-balance object is always the result. If the route does not have ECMP, then the load-balance has only one choice.
+
+- In order to collect per-route counters, the lookup result must in some way uniquely identify the *fib_entry_t*. A shared load-balance (contributed by the path-list) would not allow this.
+- In the case the *fib_entry_t* has MPLS out labels, and hence a *fib_path_ext_t*, then the load-balance must be per-prefix, since the MPLS labels that are its parents are themselves per-fib_entry_t.
+
+.. figure:: /_images/fib20fig9.png
+
+Figure 9: DPO contribution for a recursive route.
+
+Figure 9 shows the load-balance objects contributed for a recursive route.
+
+.. figure:: /_images/fib20fig10.png
+
+Figure 10: DPO Contributions from labelled recursive routes.
+
+Figure 10 shows the derived data-plane graph for a labelled recursive route.
+There can be as many MPLS-label DPO instances as there are routes multiplied by
+the number of paths per-route. For this reason the mpls-label DPO should be as
+small as possible [#f19]_.
+
+The data-plane graph is constructed by 'stacking' one
+instance of a DPO on another to form the child-parent relationship. When this
+stacking occurs, the necessary VLIB graph arcs are automatically constructed
+from the respective DPO type's registered graph nodes.
+
+The diagrams above show that for any given route the full data-plane graph is
+known before any packet arrives. If that graph is composed of n objects, then the
+packet will visit n nodes and thus incur a forwarding cost of approximately n
+times the graph node cost. This could be reduced if the graph were *collapsed*
+into fewer DPOs and nodes. There are two ways we might consider doing
+this:
+
+- write custom DPOs/nodes for combined functions, e.g. pop MPLS label
+ and lookup in v4 table. This has the disadvantage that the number of
+ such nodes would be, well, combinatorial, and resolving a path via
+ a combined DPO would be more difficult as it would involve a
+ forward walk of the graph to determine what the combination
+ is. However, VPP power users might consider this option for a
+ limited set of their use cases where performance is truly king.
+- collapse multiple levels of load-balancing into one. For example,
+ if there were two levels of load-balancing each with two choices,
+ this could equally be represented by one level with 4 choices.
+
+In either case a disadvantage to collapsing the graph is that it
+removes the indirection objects that provide fast convergence (see
+section Fast Convergence). To collapse is then a trade-off between
+faster forwarding and fast convergence; VPP favours the latter.
+
+
+.. rubric:: Footnotes:
+
+.. [#f16] Directed implies it cannot be back-walked. It is acyclic even in the presence of a recursion loop.
+.. [#f17] Loaded into cache, and hence potentially incurring a d-cache miss.
+.. [#f18] The engaged reader is directed to vnet/vnet/dpo/*
+.. [#f19] i.e. we should not re-use the adjacency structure.
+
diff --git a/docs/developer/corefeatures/fib/debugging.rst b/docs/developer/corefeatures/fib/debugging.rst
new file mode 100644
index 00000000000..750ad65c420
--- /dev/null
+++ b/docs/developer/corefeatures/fib/debugging.rst
@@ -0,0 +1,106 @@
+.. _debugging:
+
+Debugging
+---------
+
+the anatomy of a route:
+
+.. code-block:: console
+
+ BGvpp# sh ip fib 1.1.1.3/32
+ ipv4-VRF:0, fib_index:0, flow hash:[src dst sport dport proto ] epoch:0 flags:none locks:[adjacency:1, recursive-resolution:4, default-route:1, ]
+ 1.1.1.0/24 fib:0 index:9 locks:2
+ CLI refs:1 src-flags:added,contributing,active,
+ path-list:[24] locks:4 flags:shared, uPRF-list:11 len:1 itfs:[1, ]
+ path:[26] pl-index:24 ip4 weight=1 pref=0 attached-nexthop: oper-flags:resolved,
+ 10.0.0.1 loop0
+ [@0]: arp-ipv4: via 10.0.0.1 loop0
+
+ forwarding: unicast-ip4-chain
+ [@0]: dpo-load-balance: [proto:ip4 index:11 buckets:1 uRPF:11 to:[0:0]]
+ [0] [@3]: arp-ipv4: via 10.0.0.1 loop0
+
+let's go line by line.
+
+.. code-block:: console
+
+ ipv4-VRF:0, fib_index:0, flow hash:[src dst sport dport proto ] epoch:0 flags:none locks:[adjacency:1, recursive-resolution:4, default-route:1, ]
+
+Each field in turn:
+
+- ipv4-VRF:0: the name of the table (as given by the user, or
+ automatically generated by VPP).
+- fib_index:0; in the VPP pool of FIB objects, this is index 0
+- flow hash:[src dst sport dport proto ]: When calculating the flow
+ hash to use for load-balancing, these are the fields in the packet
+ that are used. There is an API to change this per-table.
+- epoch:0; Used during mark-n-sweep.
+- flags:none; use the force, to find the per-table flags.
+- locks: per-source reference counting, a table can only be deleted
+ when all sources no longer reference it.
+
+next line:
+
+.. code-block:: console
+
+ 1.1.1.0/24 fib:0 index:9 locks:2
+
+this shows the route that matched the show request. note that it is not
+an exact match, it's an LPM. The route is in FIB index 0, its index
+(in the VPP pool of fib_entry_t objects) is nine and there are two
+references to the entry.
+You'll get the same output if you type "sh fib entry 9"
+
+next line:
+
+.. code-block:: console
+
+ CLI refs:1 src-flags:added,contributing,active,
+
+the 'CLI' has sourced this route (it was added via CLI). This source
+has been added (well duh) it is 'active', meaning it is the best
+source, and it is contributing a forwarding object. There are some
+scenarios where sources other than the active source contribute,
+namely interpose sources.
+
+next line:
+
+.. code-block:: console
+
+ path-list:[24] locks:4 flags:shared, uPRF-list:11 len:1 itfs:[1, ]
+
+This is path-list index 24 (see "sh fib path-list 24", this will also
+show the children), it is 'shared',
+meaning that if other prefixes were to use the same set of paths,
+then they would also use this path-list object. It has uRPF list 11 of
+length 1 containing interface index 1 (which is loop0, see "sh int").
+
+next line:
+
+.. code-block:: console
+
+ path:[26] pl-index:24 ip4 weight=1 pref=0 attached-nexthop: oper-flags:resolved,
+ 10.0.0.1 loop0
+ [@0]: arp-ipv4: via 10.0.0.1 loop0
+
+This is path 26 (see "sh fib path 26"). It's a member of
+path-list 24. It's ip4, has a weight of 1 and a preference of 0. It's
+of type 'attached-nexthop' and currently resolved - woohoo.
+It is a path 'via 10.0.0.1 loop0'. It is contributing an incomplete adjacency.
+
+next line:
+
+.. code-block:: console
+
+ forwarding: unicast-ip4-chain
+ [@0]: dpo-load-balance: [proto:ip4 index:11 buckets:1 uRPF:11 to:[0:0]]
+ [0] [@3]: arp-ipv4: via 10.0.0.1 loop0
+
+This section describes how packets of type 'unicast-ip4' will be
+forwarded. It is the result of processing the path information from
+above.
+Here we see load-balance object 11, which has 1 bucket/choice. It is
+also linked to uRPF instance 11 (which it got from path-list 24).
+In bucket 0 there is the incomplete adjacency that was contributed by
+path 26.
+
diff --git a/docs/developer/corefeatures/fib/fastconvergence.rst b/docs/developer/corefeatures/fib/fastconvergence.rst
new file mode 100644
index 00000000000..e1c5d0cc095
--- /dev/null
+++ b/docs/developer/corefeatures/fib/fastconvergence.rst
@@ -0,0 +1,576 @@
+.. _fastconvergence:
+
+Fast Convergence
+------------------------------------
+
+This is an excellent description of the topic:
+
+`FIB <https://tools.ietf.org/html/draft-ietf-rtgwg-bgp-pic-12>`_
+
+but if you're interested in my take keep reading...
+
+First some definitions:
+
+- Convergence; When a FIB is forwarding all packets correctly based
+ on the network topology (i.e. doing what the routing control plane
+ has instructed it to do), then it is said to be 'converged'.
+ Not being in a converged state is [hopefully] a transient state,
+ when either the topology change (e.g. a link failure) has not been
+ observed or processed by the routing control plane, or that the FIB
+ is still processing routing updates. Convergence is the act of
+ getting to the converged state.
+- Fast: In the shortest time possible. There are no absolute limits
+ placed on how short this must be, although there is one number often
+ mentioned. Apparently the human ear can detect loss/delay/jitter in
+ VOIP of 50ms, therefore network failures should last no longer than
+ this, and some technologies (notably loop-free alternate fast
+ reroute) are designed to converge in this time. However, it is
+ generally accepted that it is not possible to converge a FIB with
+ tens of millions of routes in this time scale, the industry
+ 'standard' is sub-second.
+
+Converging the FIB quickly is thus a matter of:
+
+- discovering something is down
+- updating as few objects as possible
+- determining which objects to update as efficiently as possible
+- updating each object as quickly as possible
+
+we'll discuss each in turn.
+All output came from VPP version 21.01rc0. In what follows I use IPv4
+prefixes, addresses and IPv4 host length masks, however, exactly the
+same applies to IPv6.
+
+
+Failure Detection
+^^^^^^^^^^^^^^^^^
+
+The two common forms (we'll see others later on) of failure detection
+are:
+
+- link down
+- BFD
+
+The FIB needs to hook into these notifications to trigger
+convergence.
+
+Whenever an interface goes down, VPP issues a callback to all
+registered clients. The adjacency code is such a client. The adjacency
+is a leaf node in the FIB control-plane graph (containing fib_path_t,
+fib_entry_t etc). A back-walk from the adjacency will trigger a
+re-resolution of the paths.
+
+FIB is a client of BFD in order to receive BFD notifications. BFD
+comes in two flavours; single and multi hop. Single hop is to protect
+a specific peer on an interface, such peers are modelled by an
+adjacency. Multi hop is to protect a peer on an unspecified interface
+(i.e. a remote peer), this peer is represented by a host-prefix
+**fib_entry_t**. In both cases FIB will add a delegate to the
+**ip_adjacency_t** or **fib_entry_t** that represents the association
+to the BFD session. If the BFD session signals up/down then a backwalk
+can be triggered from the object to trigger re-resolution and hence
+convergence.
+
+
+Few Updates
+^^^^^^^^^^^
+
+In order to talk about what 'a few' is we have to leave the realm of
+the FIB as an abstract graph based object DB and move into the
+concrete representation of forwarding in a large network. Large
+networks are built in layers, it's how you scale them. We'll take
+here a hypothetical service provider (SP) network, but the concepts
+apply equally to data center leaf-spines. This is a rudimentary
+description, but it should serve our purpose.
+
+An SP manages a BGP autonomous system (AS). The SP's goal is both to
+attract traffic into its network to serve its customers, but also to
+serve transit traffic passing through it, we'll consider the latter here.
+The SP's network is all devices in that AS, these
+devices are split into those at the edge (provider edge (PE) routers)
+which peer with routers in other SP networks,
+and those in the core (termed provider (P) routers). Both the PE and P
+routers run the IGP (usually OSPF or ISIS). Only the reachability of the devices
+in the AS is advertised in the IGP - thus the scale (i.e. the number
+of routes) in the IGP is 'small' - only the number of
+devices that the SP has (typically not more than a few 10k).
+PE routers run BGP; they have external BGP sessions to devices in
+other ASs and internal BGP sessions to devices in the same AS. BGP is
+used to advertise the routes to *all* networks on the internet - at
+the time of writing this number is approaching 900k IPv4 routes, hopefully by
+the time you are reading this the number of IPv6 routes has caught up ...
+If we include the additional routes the SP carries to offer VPN services to its
+customers the number of BGP routes can grow to the tens of millions.
+
+BGP scale thus exceeds IGP scale by two orders of magnitude... pause for
+a moment and let that sink in...
+
+A comparison of BGP and an IGP is way way beyond the scope of this
+documentation (and frankly beyond me) so we'll note only the
+difference in the form of the routes they present to FIB. A routing
+protocol will produce routes that specify the prefixes that are
+reachable through its peers. A good IGP
+is link state based, it forms peerings to other devices over these
+links, hence its routes specify links/interfaces. In
+FIB nomenclature this means an IGP produces routes that are
+attached-nexthop, e.g.:
+
+.. code-block:: console
+
+ ip route add 1.1.1.1/32 via 10.0.0.1 GigEthernet0/0/0
+
+BGP on the other hand forms peerings only to neighbours, it does not
+know, nor care, what interface is used to reach the peer. In FIB
+nomenclature therefore BGP produces recursive routes, e.g.:
+
+.. code-block:: console
+
+ ip route 8.0.0.0/16 via 1.1.1.1
+
+where 1.1.1.1 is the BGP peer. It's no accident in this example that
+1.1.1.1/32 happens to be the route the IGP advertised... BGP installs
+routes for prefixes reachable via other BGP peers, and the IGP installs
+the routes to those BGP peers.
+
+This has been a very long winded way of describing why the scale of
+recursive routes is therefore 2 orders of magnitude greater than
+non-recursive/attached-nexthop routes.
+
+If we step back for a moment and recall why we've crawled down this
+rabbit hole, we're trying to determine what 'a few' updates means,
+does it include all those recursive routes, probably not ... let's
+keep crawling.
+
+We started this chapter with an abstract description of convergence,
+let's now make that more real. In the event of a network failure an SP
+is interested in moving to an alternate forwarding path as quickly as
+possible. If there is no alternate path, and a converged FIB will drop
+the packet, then who cares how fast it converges. In other words the
+interesting convergence scenarios are the scenarios where the network has
+alternate paths.
+
+PIC Core
+^^^^^^^^
+
+First let's consider alternate paths in the IGP, e.g.;
+
+.. code-block:: console
+
+ ip route add 1.1.1.1/32 via 10.0.0.2 GigEthernet0/0/0
+ ip route add 1.1.1.1/32 via 10.0.1.2 GigEthernet0/0/1
+
+this gives us in the FIB:
+
+.. code-block:: console
+
+ DBGvpp# sh ip fib 1.1.1.1/32
+ ipv4-VRF:0, fib_index:0, flow hash:[src dst sport dport proto ] epoch:0 flags:none locks:[adjacency:1, default-route:1, ]
+ 1.1.1.1/32 fib:0 index:15 locks:2
+ API refs:1 src-flags:added,contributing,active,
+ path-list:[23] locks:2 flags:shared, uPRF-list:22 len:2 itfs:[1, 2, ]
+ path:[27] pl-index:23 ip4 weight=1 pref=0 attached-nexthop: oper-flags:resolved,
+ 10.0.0.2 GigEthernet0/0/0
+ [@0]: ipv4 via 10.0.0.2 GigEthernet0/0/0: mtu:9000 next:3 001111111111dead000000000800
+ path:[28] pl-index:23 ip4 weight=1 pref=0 attached-nexthop: oper-flags:resolved,
+ 10.0.1.2 GigEthernet0/0/1
+ [@0]: ipv4 via 10.0.1.2 GigEthernet0/0/1: mtu:9000 next:4 001111111111dead000000010800
+
+ forwarding: unicast-ip4-chain
+ [@0]: dpo-load-balance: [proto:ip4 index:17 buckets:2 uRPF:22 to:[0:0]]
+ [0] [@5]: ipv4 via 10.0.0.2 GigEthernet0/0/0: mtu:9000 next:3 001111111111dead000000000800
+ [1] [@5]: ipv4 via 10.0.1.2 GigEthernet0/0/1: mtu:9000 next:4 001111111111dead000000010800
+
+There is ECMP across the two paths. Note that the instance/index of the
+load-balance present in the forwarding graph is 17.
+
+Let's add a BGP route via this peer;
+
+.. code-block:: console
+
+ ip route add 8.0.0.0/16 via 1.1.1.1
+
+in the FIB we see:
+
+
+.. code-block:: console
+
+ DBGvpp# sh ip fib 8.0.0.0/16
+ ipv4-VRF:0, fib_index:0, flow hash:[src dst sport dport proto ] epoch:0 flags:none locks:[adjacency:1, recursive-resolution:1, default-route:1, ]
+ 8.0.0.0/16 fib:0 index:18 locks:2
+ API refs:1 src-flags:added,contributing,active,
+ path-list:[24] locks:2 flags:shared, uPRF-list:21 len:2 itfs:[1, 2, ]
+ path:[29] pl-index:24 ip4 weight=1 pref=0 recursive: oper-flags:resolved,
+ via 1.1.1.1 in fib:0 via-fib:15 via-dpo:[dpo-load-balance:17]
+
+ forwarding: unicast-ip4-chain
+ [@0]: dpo-load-balance: [proto:ip4 index:20 buckets:1 uRPF:21 to:[0:0]]
+ [0] [@12]: dpo-load-balance: [proto:ip4 index:17 buckets:2 uRPF:22 to:[0:0]]
+ [0] [@5]: ipv4 via 10.0.0.2 GigEthernet0/0/0: mtu:9000 next:3 001111111111dead000000000800
+ [1] [@5]: ipv4 via 10.0.1.2 GigEthernet0/0/1: mtu:9000 next:4 001111111111dead000000010800
+
+the load-balance object used by this route is index 20, but note that
+the next load-balance in the chain is index 17, i.e. it is exactly
+the same instance that appears in the forwarding chain for the IGP
+route. So in the forwarding plane the packet first encounters
+load-balance object 20 (which it will use in ip4-lookup) and then
+number 17 (in ip4-load-balance).
+
+What's the significance? Let's shut down one of those IGP paths:
+
+.. code-block:: console
+
+ DBGvpp# set in state GigEthernet0/0/0 down
+
+the resulting update to the IGP route is:
+
+.. code-block:: console
+
+ DBGvpp# sh ip fib 1.1.1.1/32
+ ipv4-VRF:0, fib_index:0, flow hash:[src dst sport dport proto ] epoch:0 flags:none locks:[adjacency:1, recursive-resolution:1, default-route:1, ]
+ 1.1.1.1/32 fib:0 index:15 locks:4
+ API refs:1 src-flags:added,contributing,active,
+ path-list:[23] locks:2 flags:shared, uPRF-list:25 len:2 itfs:[1, 2, ]
+ path:[27] pl-index:23 ip4 weight=1 pref=0 attached-nexthop:
+ 10.0.0.2 GigEthernet0/0/0
+ [@0]: arp-ipv4: via 10.0.0.2 GigEthernet0/0/0
+ path:[28] pl-index:23 ip4 weight=1 pref=0 attached-nexthop: oper-flags:resolved,
+ 10.0.1.2 GigEthernet0/0/1
+ [@0]: ipv4 via 10.0.1.2 GigEthernet0/0/1: mtu:9000 next:4 001111111111dead000000010800
+
+ recursive-resolution refs:1 src-flags:added, cover:-1
+
+ forwarding: unicast-ip4-chain
+ [@0]: dpo-load-balance: [proto:ip4 index:17 buckets:1 uRPF:25 to:[0:0]]
+ [0] [@5]: ipv4 via 10.0.1.2 GigEthernet0/0/1: mtu:9000 next:4 001111111111dead000000010800
+
+
+notice that the path via 10.0.0.2 is no longer flagged as resolved,
+and the forwarding chain does not contain this path as a
+choice. However, the key thing to note is the load-balance
+instance is still index 17, i.e. it has been modified not
+exchanged. In the FIB vernacular we say it has been 'in-place
+modified', a somewhat linguistically redundant expression, but one that serves
+to emphasise that it was changed whilst still being part of the graph, it
+was never at any point removed from the graph and re-added, and it was
+modified without worker barrier lock held.
+
+Still don't see the significance? In order to converge around the
+failure of the IGP link it was not necessary to update load-balance
+object number 20! It was not necessary to update the recursive
+route. i.e. convergence is achieved without updating any recursive
+routes, it is only necessary to update the affected IGP routes, this is
+the definition of 'a few'. We call this 'prefix independent
+convergence' (PIC) which should really be called 'recursive prefix
+independent convergence' but it isn't...
+
+How was the trick done? As with all problems in computer science, it
+was solved by a layer of misdirection, I mean indirection. The
+indirection is the load-balance that belongs to the IGP route. By
+keeping this object in the forwarding graph and updating it in place,
+we get PIC. The alternative design would be to collapse the two layers of
+load-balancing into one, which would improve forwarding performance
+but would come at the cost of prefix dependent convergence. No doubt
+there are situations where the VPP deployment would favour forwarding
+performance over convergence, you know the drill, contributions welcome.
+
+This failure scenario is known as PIC core, since it's one of the IGP's
+core links that has failed.
+
+iBGP PIC Edge
+^^^^^^^^^^^^^
+
+Next, let's consider alternate paths in BGP, e.g:
+
+.. code-block:: console
+
+ ip route add 8.0.0.0/16 via 1.1.1.1
+ ip route add 8.0.0.0/16 via 1.1.1.2
+
+the 8.0.0.0/16 prefix is reachable via two BGP next-hops (two PEs).
+
+Our FIB now also contains:
+
+.. code-block:: console
+
+ DBGvpp# sh ip fib 8.0.0.0/16
+ ipv4-VRF:0, fib_index:0, flow hash:[src dst sport dport proto ] epoch:0 flags:none locks:[adjacency:1, recursive-resolution:2, default-route:1, ]
+ 8.0.0.0/16 fib:0 index:18 locks:2
+ API refs:1 src-flags:added,contributing,active,
+ path-list:[15] locks:2 flags:shared, uPRF-list:11 len:2 itfs:[1, 2, ]
+ path:[17] pl-index:15 ip4 weight=1 pref=0 recursive: oper-flags:resolved,
+ via 1.1.1.1 in fib:0 via-fib:15 via-dpo:[dpo-load-balance:17]
+ path:[15] pl-index:15 ip4 weight=1 pref=0 recursive: oper-flags:resolved,
+ via 1.1.1.2 in fib:0 via-fib:10 via-dpo:[dpo-load-balance:12]
+
+ forwarding: unicast-ip4-chain
+ [@0]: dpo-load-balance: [proto:ip4 index:20 buckets:2 uRPF:11 to:[0:0]]
+ [0] [@12]: dpo-load-balance: [proto:ip4 index:17 buckets:1 uRPF:25 to:[0:0]]
+ [0] [@5]: ipv4 via 10.0.0.2 GigEthernet0/0/0: mtu:9000 next:3 001122334455dead000000000800
+ [1] [@5]: ipv4 via 10.0.1.2 GigEthernet0/0/1: mtu:9000 next:4 001111111111dead000000010800
+ [1] [@12]: dpo-load-balance: [proto:ip4 index:12 buckets:1 uRPF:13 to:[0:0]]
+ [0] [@5]: ipv4 via 10.0.1.2 GigEthernet0/0/1: mtu:9000 next:4 001111111111dead000000010800
+
+The first load-balance (LB) in the forwarding graph is index 20 (the astute
+reader will note this is the same index as in the previous
+section, I am adding paths to the same route, the load-balance is
+in-place modified again). Each choice in LB 20 is another LB
+contributed by the IGP route through which the route's paths recurse.
+
+So what's the equivalent in BGP to a link down in the IGP? An IGP link
+down means it loses its peering out of that link, so the equivalent in
+BGP is the loss of the peering and thus the loss of reachability to
+the peer. This is signaled by the IGP withdrawing the route to the
+peer. But "Wait wait wait", I hear you say ... "just because the IGP
+withdraws 1.1.1.1/32 doesn't mean I can't reach 1.1.1.1, perhaps there
+is a less specific route that gives reachability to 1.1.1.1". Indeed
+there may be. So a little more on BGP network design. I know it's like
+a bad detective novel where the author drip feeds you the plot... When
+describing iBGP peerings one 'always' describes the peer using one of
+its loopback addresses. Why? A loopback interface
+never goes down (unless you admin down it yourself), some muppet can't
+accidentally cut through the loopback cable whilst digging up the
+street. And what subnet mask length does a prefix have on a loopback
+interface? It's 'always' a /32. Why? Because there's no cable to connect
+any other devices. This choice justifies there 'always' being a /32
+route for the BGP peer. But what prevents there also being a less
+specific route? Nothing.
+Now clearly if the BGP peer crashes then the /32 for its loopback is
+going to be removed from the IGP, but what will withdraw the less
+specific? Nothing.
+
+So in order to make use of this trick of relying on the withdrawal of
+the /32 for the peer to signal that the peer is down and thus the
+signal to converge the FIB, we need to force FIB to recurse only via
+the /32 and not via a less specific. This is called a 'recursion
+constraint'. In this case the constraint is 'recurse via host'
+i.e. for ipv4 use a /32.
+So we need to update our route additions from before:
+
+.. code-block:: console
+
+ ip route add 8.0.0.0/16 via 1.1.1.1 resolve-via-host
+ ip route add 8.0.0.0/16 via 1.1.1.2 resolve-via-host
+
+checking the FIB output is left as an exercise to the reader. I hope
+you're doing these configs as you read. There's little change in the
+output, you'll see some extra flags on the paths.
+
+Now let's add the less specific, just for fun:
+
+
+.. code-block:: console
+
+ ip route add 1.1.1.0/28 via 10.0.0.2 GigEthernet0/0/0
+
+nothing changes in resolution of 8.0.0.0/16.
+
+Now withdraw the route to 1.1.1.2/32:
+
+.. code-block:: console
+
+ ip route del 1.1.1.2/32 via 10.0.0.2 GigEthernet0/0/0
+
+In the FIB we see:
+
+.. code-block:: console
+
+ DBGvpp# sh ip fib 8.0.0.0/32
+ ipv4-VRF:0, fib_index:0, flow hash:[src dst sport dport proto ] epoch:0 flags:none locks:[adjacency:1, recursive-resolution:2, default-route:1, ]
+ 8.0.0.0/16 fib:0 index:18 locks:2
+ API refs:1 src-flags:added,contributing,active,
+ path-list:[15] locks:2 flags:shared, uPRF-list:13 len:2 itfs:[1, 2, ]
+ path:[15] pl-index:15 ip4 weight=1 pref=0 recursive: oper-flags:resolved, cfg-flags:resolve-host,
+ via 1.1.1.1 in fib:0 via-fib:15 via-dpo:[dpo-load-balance:17]
+ path:[17] pl-index:15 ip4 weight=1 pref=0 recursive: cfg-flags:resolve-host,
+ via 1.1.1.2 in fib:0 via-fib:10 via-dpo:[dpo-drop:0]
+
+ forwarding: unicast-ip4-chain
+ [@0]: dpo-load-balance: [proto:ip4 index:20 buckets:1 uRPF:13 to:[0:0]]
+ [0] [@12]: dpo-load-balance: [proto:ip4 index:17 buckets:2 uRPF:27 to:[0:0]]
+ [0] [@5]: ipv4 via 10.0.0.2 GigEthernet0/0/0: mtu:9000 next:3 001122334455dead000000000800
+ [1] [@5]: ipv4 via 10.0.1.2 GigEthernet0/0/1: mtu:9000 next:4 001111111111dead000000010800
+
+the path via 1.1.1.2 is unresolved, because the recursion constraints
+are preventing the path from resolving via 1.1.1.0/28. The LB index 20
+has been updated to remove the unresolved path.
+
+Job done? Not quite! Why not?
+
+Let's re-examine the goals of this chapter. We wanted to update 'a
+few' objects, which we have defined as not all the millions of
+recursive routes. Did we do that here? We sure did, when we
+modified LB index 20. So WTF?? Where's the indirection object that can
+be modified so that the LBs for the recursive routes are not
+modified - it's not there.... WTF?
+
+OK so the great detective has assembled all the suspects in the
+drawing room and only now does he drop the bomb; the FIB knows the
+scale, we talked above about what the scale **can** be, worst case
+scenario, but that's not necessarily what it is in this hypothetical
+(your) deployment. It knows how many recursive routes there are that
+depend on a /32, it can thus make its own determination of the
+definition of 'a few'. In other words, if there are only 'a few'
+recursive prefixes that depend on a /32 then it will update them
+synchronously (and we'll discuss what synchronously means a bit more later).
+
+So what does FIB consider to be 'a few'. Let's add more routes and
+find out.
+
+.. code-block:: console
+
+ DBGvpp# ip route add 8.1.0.0/16 via 1.1.1.2 resolve-via-host via 1.1.1.1 resolve-via-host
+ ...
+ DBGvpp# ip route add 8.63.0.0/16 via 1.1.1.2 resolve-via-host via 1.1.1.1 resolve-via-host
+
+and we see:
+
+.. code-block:: console
+
+ DBGvpp# sh ip fib 8.8.0.0
+ ipv4-VRF:0, fib_index:0, flow hash:[src dst sport dport proto ] epoch:0 flags:none locks:[adjacency:1, recursive-resolution:4, default-route:1, ]
+ 8.8.0.0/16 fib:0 index:77 locks:2
+ API refs:1 src-flags:added,contributing,active,
+ path-list:[15] locks:128 flags:shared,popular, uPRF-list:28 len:2 itfs:[1, 2, ]
+ path:[17] pl-index:15 ip4 weight=1 pref=0 recursive: oper-flags:resolved, cfg-flags:resolve-host,
+ via 1.1.1.1 in fib:0 via-fib:15 via-dpo:[dpo-load-balance:17]
+ path:[15] pl-index:15 ip4 weight=1 pref=0 recursive: oper-flags:resolved, cfg-flags:resolve-host,
+ via 1.1.1.2 in fib:0 via-fib:10 via-dpo:[dpo-load-balance:12]
+
+ forwarding: unicast-ip4-chain
+ [@0]: dpo-load-balance: [proto:ip4 index:79 buckets:2 uRPF:28 flags:[uses-map] to:[0:0]]
+ load-balance-map: index:0 buckets:2
+ index: 0 1
+ map: 0 1
+ [0] [@12]: dpo-load-balance: [proto:ip4 index:17 buckets:2 uRPF:27 to:[0:0]]
+ [0] [@5]: ipv4 via 10.0.0.2 GigEthernet0/0/0: mtu:9000 next:3 001122334455dead000000000800
+ [1] [@5]: ipv4 via 10.0.1.2 GigEthernet0/0/1: mtu:9000 next:4 001111111111dead000000010800
+ [1] [@12]: dpo-load-balance: [proto:ip4 index:12 buckets:1 uRPF:18 to:[0:0]]
+ [0] [@3]: arp-ipv4: via 10.0.1.2 GigEthernet0/0/0
+
+
+Two elements to note here; the path-list has the 'popular' flag and
+there is a load-balance map in the forwarding path.
+
+'popular' in this case means that the path-list has passed the limit
+of 'a few' in the number of children it has.
+
+here are the children:
+
+.. code-block:: console
+
+ DBGvpp# sh fib path-list 15
+ path-list:[15] locks:128 flags:shared,popular, uPRF-list:28 len:2 itfs:[1, 2, ]
+ path:[17] pl-index:15 ip4 weight=1 pref=0 recursive: oper-flags:resolved, cfg-flags:resolve-host,
+ via 1.1.1.1 in fib:0 via-fib:15 via-dpo:[dpo-load-balance:17]
+ path:[15] pl-index:15 ip4 weight=1 pref=0 recursive: oper-flags:resolved, cfg-flags:resolve-host,
+ via 1.1.1.2 in fib:0 via-fib:10 via-dpo:[dpo-load-balance:12]
+ children:{entry:18}{entry:21}{entry:22}{entry:23}{entry:25}{entry:26}{entry:27}{entry:28}{entry:29}{entry:30}{entry:31}{entry:32}{entry:33}{entry:34}{entry:35}{entry:36}{entry:37}{entry:38}{entry:39}{entry:40}{entry:41}{entry:42}{entry:43}{entry:44}{entry:45}{entry:46}{entry:47}{entry:48}{entry:49}{entry:50}{entry:51}{entry:52}{entry:53}{entry:54}{entry:55}{entry:56}{entry:57}{entry:58}{entry:59}{entry:60}{entry:61}{entry:62}{entry:63}{entry:64}{entry:65}{entry:66}{entry:67}{entry:68}{entry:69}{entry:70}{entry:71}{entry:72}{entry:73}{entry:74}{entry:75}{entry:76}{entry:77}{entry:78}{entry:79}{entry:80}{entry:81}{entry:82}{entry:83}{entry:84}
+
+64 children makes it popular. The number is fixed (there is no API to
+change it). Its choice is an attempt to balance the performance cost
+of the indirection against the convergence
+gain.
+
+Popular path-lists contribute the load-balance map, this is the
+missing indirection object. Its indirection happens when choosing the
+bucket in the LB. The packet's flow-hash is taken 'mod number of
+buckets' to give the 'candidate bucket' then the map will take this
+'index' and convert it into the 'map'. You can see in the example above
+that no change occurs, i.e. if the flow-hash mod n chooses bucket 1
+then it gets bucket 1.
+
+Why is this useful? The path-list is shared (you can convince
+yourself of this if you look at each of the 8.x.0.0/16 routes we
+added) and all of these routes use the same load-balance map, therefore, to
+converge all the recursive routes, we need only change the map and
+we're good; we again get PIC.
+
+OK who's still awake... if you're thinking there's more to this story,
+you're right. Keep reading.
+
+This failure scenario is called iBGP PIC edge. It's 'edge' because it
+refers to the loss of an edge device, and iBGP because the device was
+an iBGP peer (we learn iBGP peers in the IGP). There is a similar eBGP
+PIC edge scenario, but this is left for an exercise to the reader (hint
+there are other recursion constraints - see the RFC).
+
+Which Objects
+^^^^^^^^^^^^^
+
+The next topic on our list of how to converge quickly was to
+effectively find the objects that need to be updated when a convergence
+event happens. If you haven't realised by now that the FIB is an
+object graph, then can I politely suggest you go back and start from
+the beginning ...
+
+Finding the objects affected by a change is simply a matter of walking
+from the parent (the object affected) to its children. These
+dependencies are kept really for this reason.
+
+So is fast convergence just a matter of walking the graph? Yes and
+no. The question to ask yourself is this, "in the case of iBGP PIC edge,
+when the /32 is withdrawn, what is the list of objects that need to be
+updated and particularly what is the order they should be updated in
+order to obtain the best convergence time?" Think breadth v. depth first.
+
+... ponder for a while ...
+
+For iBGP PIC edge we said it's the path-list that provides the
+indirection through the load-balance map. Hence once all path-lists
+are updated we are converged, thereafter, at our leisure, we can
+update the child recursive prefixes. Is this breadth or depth first?
+
+It's breadth first.
+
+Breadth first walks are achieved by spawning an async walk of the
+branch of the graph that we don't want to traverse. Withdrawing the /32
+triggers a synchronous walk of the children of the /32 route, we want
+a synchronous walk because we want to converge ASAP. This synchronous
+walk will encounter path-lists in the /32 route's child dependent list.
+These path-lists (and their LB maps) will be updated. If a path-list is
+popular, then it will spawn an async walk of the path-list's child
+dependent routes, if not it will walk those routes. So the walk
+effectively proceeds breadth first across the path-lists, then returns
+to the start to do the affected routes.
+
+Now the story is complete. The murderer is revealed.
+
+Let's withdraw one of the IGP routes.
+
+.. code-block:: console
+
+ DBGvpp# ip route del 1.1.1.2/32 via 10.0.1.2 GigEthernet0/0/1
+
+ DBGvpp# sh ip fib 8.8.0.0
+ ipv4-VRF:0, fib_index:0, flow hash:[src dst sport dport proto ] epoch:0 flags:none locks:[adjacency:1, recursive-resolution:4, default-route:1, ]
+ 8.8.0.0/16 fib:0 index:77 locks:2
+ API refs:1 src-flags:added,contributing,active,
+ path-list:[15] locks:128 flags:shared,popular, uPRF-list:18 len:2 itfs:[1, 2, ]
+ path:[17] pl-index:15 ip4 weight=1 pref=0 recursive: oper-flags:resolved, cfg-flags:resolve-host,
+ via 1.1.1.1 in fib:0 via-fib:15 via-dpo:[dpo-load-balance:17]
+ path:[15] pl-index:15 ip4 weight=1 pref=0 recursive: cfg-flags:resolve-host,
+ via 1.1.1.2 in fib:0 via-fib:10 via-dpo:[dpo-drop:0]
+
+ forwarding: unicast-ip4-chain
+ [@0]: dpo-load-balance: [proto:ip4 index:79 buckets:1 uRPF:18 to:[0:0]]
+ [0] [@12]: dpo-load-balance: [proto:ip4 index:17 buckets:2 uRPF:27 to:[0:0]]
+ [0] [@5]: ipv4 via 10.0.0.2 GigEthernet0/0/0: mtu:9000 next:3 001122334455dead000000000800
+ [1] [@5]: ipv4 via 10.0.1.2 GigEthernet0/0/1: mtu:9000 next:4 001111111111dead000000010800
+
+the LB Map has gone, since the prefix now only has one path. You'll
+need to be a CLI ninja if you want to catch the output showing the LB
+map in its transient state of:
+
+.. code-block:: console
+
+ load-balance-map: index:0 buckets:2
+ index: 0 1
+ map: 0 0
+
+but it happens. Trust me. I've got tests and everything.
+
+On the final topic of how to converge quickly; 'make each update fast'
+there are no tricks.
+
+
+
diff --git a/docs/developer/corefeatures/fib/graphs.rst b/docs/developer/corefeatures/fib/graphs.rst
new file mode 100644
index 00000000000..aec0e4b0135
--- /dev/null
+++ b/docs/developer/corefeatures/fib/graphs.rst
@@ -0,0 +1,34 @@
+.. _graphs:
+
+Graphs
+^^^^^^
+
+The FIB is essentially a collection of related graphs. Terminology from graph theory
+is often used in the sections that follow. From Wikipedia:
+
+*... a graph is a representation of a set of objects where some pairs of objects are
+connected by links. The interconnected objects are represented by mathematical
+abstractions called vertices (also called nodes or points), and the links that
+connect some pairs of vertices are called edges (also called arcs or lines) ...
+edges may be directed or undirected.*
+
+In a directed graph the edges can only be traversed in one direction - from child to
+parent. The names are chosen to represent the many to one relationship. A child has
+one parent, but a parent many children. In undirected graphs the edge traversal
+can be in either direction, but in FIB the parent child nomenclature remains to
+represent the many to one relationship. Children of the same parent are termed
+siblings. When the traversal is from child to parent it is considered to be a
+forward traversal, or walk, and from parent to the many children a back walk.
+Forward walks are cheap since they start from the many and move toward the few.
+Back walks are expensive as they start from the few and visit the many.
+
+The many to one relationship between child and parent means that the lifetime of a
+parent object must extend to the lifetime of its children. If the control plane
+removes a parent object before its children, then the parent must remain, in an
+**incomplete** state, until the children are themselves removed. Likewise if a child
+is created before its parent, the parent is created in an *incomplete* state. These
+incomplete objects are needed to maintain the graph dependencies. Without them when
+the parent is added finding the affected children would require a search through many
+databases for those children. To extend the lifetime of parents all children thereof
+hold a **lock** on the parent. This is a simple reference count. Children then follow
+the add-or-lock/unlock semantics for finding a parent, as opposed to a malloc/free.
diff --git a/docs/developer/corefeatures/fib/graphwalks.rst b/docs/developer/corefeatures/fib/graphwalks.rst
new file mode 100644
index 00000000000..e740660a2ed
--- /dev/null
+++ b/docs/developer/corefeatures/fib/graphwalks.rst
@@ -0,0 +1,80 @@
+.. _graphwalks:
+
+Graph Walks
+^^^^^^^^^^^^
+
+All FIB object types are allocated from a VPP memory pool [#f13]_. The objects are thus
+susceptible to memory re-allocation, therefore the use of a bare "C" pointer to refer
+to a child or parent is not possible. Instead there is the concept of a *fib_node_ptr_t*
+which is a tuple of type,index. The type indicates what type of object it is
+(and hence which pool to use) and the index is the index in that pool. This allows
+for the safe retrieval of any object type.
+
+When a child resolves via a parent it does so knowing the type of that parent. The
+child to parent relationship is thus fully known to the child, and hence a forward
+walk of the graph (from child to parent) is trivial. However, a parent does not choose
+its children, it does not even choose the type. All object types that form part of the
+FIB control plane graph inherit from a single base class [#f14]_; *fib_node_t*. A *fib_node_t*
+identifies the object's index and its associated virtual function table provides the
+parent a mechanism to visit that object during the walk. The reason for a back-walk
+is to inform all children that the state of the parent has changed in some way, and
+that the child may itself need to update.
+
+To support the many to one, child to parent, relationship a parent must maintain a
+list of its children. The requirements of this list are;
+
+- O(1) insertion and delete time. Several child-parent relationships are made/broken during route addition/deletion.
+- Ordering. High priority children are at the front, low priority at the back (see section Fast Convergence)
+- Insertion at arbitrary locations.
+
+To realise these requirements the child-list is a doubly linked-list, where each element
+contains a *fib_node_ptr_t*. The VPP pool memory model applies to the list elements, so
+they are also identified by an index. When a child is added to a list it is returned the
+index of the element. Using this index the element can be removed in constant time.
+The list supports 'push-front' and 'push-back' semantics for ordering. To walk the children
+of a parent is then to iterate this list.
+
+A back-walk of the graph is a depth first search where all children in all levels of the
+hierarchy are visited. Such walks can therefore encounter all object instances in the
+FIB control plane graph, numbering in the millions. A FIB control-plane graph is cyclic
+in the presence of a recursion loop, so the walk implementation has mechanisms to detect
+this and exit early.
+
+A back-walk can be either synchronous or asynchronous. A synchronous walk will visit the
+entire section of the graph before control is returned to the caller, an asynchronous
+walk will queue the walk to a background process, to run at a later time, and immediately
+return to the caller. To implement asynchronous walks a *fib_walk_t* object is added to
+the front of the parent's child list. As children are visited the *fib_walk_t* object
+advances through the list. Since it is inserted in the list, when the walk suspends
+and resumes, it can continue at the correct location. It is also safe with respect to
+the deletion of children from the list. New children are added to the head of the list,
+and so will not encounter the walk, but since they are new, they already have the up to
+date state of the parent.
+
+A VLIB process 'fib-walk' runs to perform the asynchronous walks. VLIB has no priority
+scheduling between respective processes, so the fib-walk process does work in small
+increments so it does not block the main route download process. Since the main download
+process effectively has priority numerous asynchronous back-walks can be started on the
+same parent instance before the fib-walk process can run. FIB is a 'final state' application.
+If a parent changes n times, it is not necessary for the children to also update n
+times, instead it is only necessary that this child updates to the latest, or final,
+state. Consequently when multiple walks on a parent (and hence potential updates to a
+child) are queued, these walks can be merged into a single walk. This
+is the main reason the walks are designed this way, to eliminate (as
+much as possible) redundant work and thus converge the system as fast
+as possible.
+
+Choosing between a synchronous and an asynchronous walk is therefore a trade-off between
+time it takes to propagate a change in the parent to all of its children, versus the
+time it takes to act on a single route update. For example, if a route update were to
+affect millions of child recursive routes, then the rate at which such updates could be
+processed would be dependent on the number of child recursive routes, which would not be
+good. At the time of writing FIB2.0 uses synchronous walk in all locations except when
+walking the children of a path-list, and it has more than 32 [#f15]_ children. This avoids the
+case mentioned above.
+
+.. rubric:: Footnotes:
+
+.. [#f13] Fast memory allocation is crucial to fast route update times.
+.. [#f14] VPP may be written in C and not C++ but inheritance is still possible.
+.. [#f15] The value is arbitrary and yet to be tuned.
diff --git a/docs/developer/corefeatures/fib/hacking.rst b/docs/developer/corefeatures/fib/hacking.rst
new file mode 100644
index 00000000000..f64d3deb860
--- /dev/null
+++ b/docs/developer/corefeatures/fib/hacking.rst
@@ -0,0 +1,68 @@
+.. _hacking:
+
+Get Hacking
+-----------
+
+The code's directory structure is trivial, FIB, mFIB, adj have their
+own directories.
+
+For the most part, for all the FIB object types mentioned in this
+documentation there is a corresponding .h and .c file. As with any VPP
+component/sub-system a 'public' header file is any file that can be
+included by another sub-system and/or plugin. These must be specified
+in the build-system, so go look there. Public header files are always
+a good entry point to start reading.
+
+FIB
+^^^
+
+There is no direct [VPP's binary] API access to FIB, but FIB does
+expose types that can be used on the API by FIB and by other
+subsystems (e.g. :ref:`barnacles`). These types are specified in
+fib.api and the encoding and decoding thereof in fib_api.[ch].
+
+Most operations on a FIB entry happen as a result of an operation on a
+FIB table; an entry does not exist in isolation. The APIs in
+fib_table.h are well documented with doxygen, so you should be able to figure
+out what they do. Use this as a starting point to explore how entries
+are created and deleted and how the source priority scheme works.
+
+FIB sources are defined in fib_source.h. Each source behaviour has its
+own file, fib_entry_src_*.c. These define the virtual functions that
+determine how the source behaves when actions on the FIB occur. For
+example, what the entry must do when its covering prefix's forwarding
+is updated.
+
+When creating new paths/path-lists the main action required is to
+resolve them; see fib_path*_resolve, and once resolved to have them
+contribute a DPO for forwarding or for the uRPF list; see
+fib_*_contribute_forwarding and fib_*_contribute_urpf respectively.
+
+The data-structures that are used for entry lookup are protocol
+specific, they are implemented in separate files; ip4_fib.[ch],
+ip6_fib.[ch] and mpls_fib.[ch].
+
+FIB extranet support is implemented in fib_attached_export.[ch].
+FIB tracking is implemented in fib_entry_track.[ch].
+FIB [back]walk is implemented in fib_walk.[ch].
+
+Adjacency
+^^^^^^^^^
+
+Not much to say here, each adjacency type has its own file; use the
+force, read the source.
+
+
+Testing
+^^^^^^^
+
+The majority of FIB coverage comes from the C Unit tests in
+fib_test.c. I strongly encourage you to add code here. It's a much
+easier development cycle to fire up GDB, run VPP and iterate with
+'test fib', than it is to work in the python UT. You still need to write
+python UT, don't get me wrong, it's just easier to do the FIB dev
+using C UT.
+
+
+
+Enjoy!
diff --git a/docs/developer/corefeatures/fib/index.rst b/docs/developer/corefeatures/fib/index.rst
new file mode 100644
index 00000000000..37c548b3f59
--- /dev/null
+++ b/docs/developer/corefeatures/fib/index.rst
@@ -0,0 +1,21 @@
+.. _fib20:
+
+The FIB
+===========================================
+
+This section describes the FIB (Forwarding Information Base) implementation:
+Hierarchical, Protocol Independent
+
+.. toctree::
+
+ prerequisites
+ thedatamodel
+ tunnels
+ mplsfib
+ multicast
+ debugging
+ fastconvergence
+ scale
+ barnacles
+ hacking
+ missing
diff --git a/docs/developer/corefeatures/fib/marknsweep.rst b/docs/developer/corefeatures/fib/marknsweep.rst
new file mode 100644
index 00000000000..e9e38a33f3a
--- /dev/null
+++ b/docs/developer/corefeatures/fib/marknsweep.rst
@@ -0,0 +1,68 @@
+.. _marknsweep:
+
+Mark and Sweep
+--------------
+
+The mark and sweep procedures, in FIB and in other subsystems, are
+built for the purpose of recovering from a control plane crash.
+
+In routing if the control plane (CP) crashes, when it restarts, the network
+topology may have changed. This means that some of the routes that
+were programmed in the FIB may no longer be needed, and perhaps some
+new ones are. If the CP were simply to insert all the new routes it
+learned after it restarts, then FIB could be left with old routes that
+never get removed, this would be bigly bad.
+
+At a high level the requirement is to delete routes from the old set
+that are not present in the new set; 'delete the diff' as it might
+be colloquially known.
+
+How should the control plane determine the old set? It could
+conceivably read back the FIB from VPP. But this presents three
+problems. Firstly, it could be a large set of routes, numbering in the
+millions; this is not an efficient mechanism and not one that one wants to
+perform at a point when the router is trying to converge
+ASAP. Secondly it represents a 'source of truth' inversion. The
+routing plane is the source of truth, not forwarding. Routing should
+not receive its 'input' from the layers below. Thirdly, on a practical
+note, the reading of VPP data structures to glean this sort of
+accurate information, would only happen in this scenario, i.e. it's
+not well tested and therefore not particularly reliable (see point 2).
+
+Enter 'mark and sweep' or m-n-s (not to be confused with the retail
+giant) as it's affectionately known.
+
+The Mark and Sweep algorithm proceeds in three steps:
+
+- Step 1; the CP declares to VPP that it wants to begin the process
+ (i.e. it has just restarted). At this point VPP will iterate through
+ all the objects that the CP owns and 'mark' them as being
+ stale. This process effectively declares a new 'epoch', a barrier in
+ time that separates the old objects from the new.
+- Step 2; The CP downloads all of its new objects. If one of these new
+ CP objects matches (has the same key as) an existing object, then
+ the CP add is considered an update, and the object's stale state is
+ removed.
+- Step 3: The CP declares it has 'converged'; it has no more updates
+ to give (at this time). VPP will then again iterate through all the
+ CP's objects and remove those that do not belong to the new epoch,
+ i.e. those that are still marked stale.
+
+After step 3, the CP and VPP databases are in sync.
+
+The cost of the process was to download all the new routes again. This
+is a highly-tuned and well-tested scenario.
+
+In VPP we use the synonym 'replace' to describe the mark-n-sweep
+action in the API. We use this term because it refers to the goals of
+the algorithm at a high level - the CP wants to replace the old DB
+with a new one - but it does not specify the algorithm by which that
+is achieved. One could equally perform this task by constructing a
+brand new DB in VPP, and then swapping them when the CP
+converges. Other subsystems may employ that approach, but FIB does
+not. Updates are typically faster than adds, since the update is
+likely a no-op, whereas a separate add would require the memory
+allocator, which is the long pole in FIB additions. Additionally, building a
+second DB requires twice the memory for a moment in time, which could be prohibitive when
+the FIB is large.
+
diff --git a/docs/developer/corefeatures/fib/missing.rst b/docs/developer/corefeatures/fib/missing.rst
new file mode 100644
index 00000000000..0beccb17af1
--- /dev/null
+++ b/docs/developer/corefeatures/fib/missing.rst
@@ -0,0 +1,110 @@
+.. _missing:
+
+Missing Functionality
+---------------------
+
+A list of functionality that the FIB does not currently provide.
+
+
+PIC Edge Backup Paths
+^^^^^^^^^^^^^^^^^^^^^
+
+FIB supports the concept of path 'preference'. Only paths that have
+the best preference contribute to forwarding. Only once all the paths with
+the best preference go down do the paths with the next best preference
+contribute.
+
+In BGP PIC edge, BGP would install the primary paths and the backup
+paths, with the expectation that backups are only used once all primaries
+fail; this is the same behaviour that FIB's preference sets provide.
+
+However, in order to get prefix independent convergence, one must be
+able to only modify the path-list's load-balance map (LBM) to choose the
+paths to use. Hence the paths must already be in the map, and
+conversely must be in the fib_entry's load-balance (LB). In other
+words, to use backup paths with PIC, the fib_entry's LB must include
+the backup paths, and the path-list's LBM must map from the backups to
+the primaries.
+
+This is a change that is reasonably easy w.r.t. knowing what to
+change, but hard to get right and hard to test.
+
+
+Loop Free Alternate Paths
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Contrary to the BGP approach for path backups, an IGP could install a
+loop free alternate (LFA) path to achieve fast re-route (FRR).
+
+Because of the way the LFA paths are calculated by the IGP an LFA backup
+path is always paired with a primary. VPP FIB does not support this
+primary-backup pair relationship.
+
+The intent of LFA FRR is/was to get below the magic 50ms mark. To do
+this the expectation is/was that one would need in the forwarding
+graph an object that represents a path's state. This object would be
+checked for each packet being sent. If the path is up, the graph (an
+adjacency since it's the IGP) for the primary path is taken, if it's
+down the graph for the backup is taken. When a path goes down only
+this indirection object needs to be updated to affect all
+routes. Naturally, the indirection would incur a performance cost, but
+we know that there are many performance-convergence trade-offs in a
+FIB design.
+
+Should VPP's FIB support this feature? It all depends on the
+50ms. LFA FRR comes from the era when routers ran on lower performance
+CPUs and interface down was an interrupt. VPP typically has plenty of
+gas but runs as a user space process. So, can it update all routes in
+under 50ms on a meaty CPU and can the OS deliver the interface down
+within the time requirements? I don't have the answers to either
+question.
+
+
+Extranets for Multicast
+^^^^^^^^^^^^^^^^^^^^^^^
+
+When a unicast prefix is present in two different tables, then it
+refers to a different set of devices. When the prefix is imported it
+refers to the same set of devices. If the set of paths to reach the
+prefix is different in the import and export table, it doesn't matter,
+since they both refer to the same devices, so either set can be
+used. Therefore, FIB's usual source preference rules can apply. The
+'import' source is lower priority.
+
+When a multicast prefix is present in two different tables, then it's
+two different flows referring to two different sets of receivers. When
+the prefix is imported, then it refers to the same flow and two
+different sets of receivers. In other words, the receiver set in the
+import table needs to be the super set of receivers.
+
+There are two ways one might consider doing this; merging the
+path-lists or replicating the packet first into each table.
+
+
+Collapsing
+^^^^^^^^^^
+
+Read :ref:`fastconvergence`
+
+Collapsing the DPO graph for recursive routes doesn't have to be an
+all or nothing. Easy cases:
+
+
+- A recursive prefix with only one path and a path-list that is not
+ popular, could stack directly on the LB of the via entry.
+- A recursive prefix with multiple paths and a path-list that is not
+ popular, could construct a new load balance using the choices
+ present in each bucket of its via entries. The choices in the new LB
+ though would need to reflect the relative weighting.
+
+
+The condition of a non-popular path-list means that the LB doesn't
+have an LB map and hence it needs to be updated for convergence to
+occur.
+
+The more difficult cases come when the recursive prefix has labels
+which need to be stacked on the via entries' choices.
+
+You might also envision a global configuration that always collapses all
+chains, which could be used in deployments where convergence is not a
+priority.
diff --git a/docs/developer/corefeatures/fib/mplsfib.rst b/docs/developer/corefeatures/fib/mplsfib.rst
new file mode 100644
index 00000000000..50b17304850
--- /dev/null
+++ b/docs/developer/corefeatures/fib/mplsfib.rst
@@ -0,0 +1,220 @@
+.. _mplsfib:
+
+MPLS FIB
+--------
+
+Implementation
+^^^^^^^^^^^^^^^
+
+The MPLS FIB is implemented using exactly the same data structures as
+the IP FIB. The only difference is the implementation of the
+table. Whereas for IPv4 this is an mtrie and for IPv6 a hash table,
+for MPLS it is a flat array indexed by a 21 bit key (label & EOS
+bit). This implementation is chosen to favour packet forwarding speed.
+
+It can be the case in MPLS forwarding that packets received with the
+EOS bit set in the MPLS label need to be forwarded differently from
+those without. The most common example of this is if the path set
+contains a path that does not have an output label. In this case the
+non-EOS packets cannot take this path, because to do so would expose
+the neighbouring router to a label that it did not allocate.
+
+The design choice to make with an MPLS FIB table is therefore:
+
+- 20 bit key: label only. When the EOS and non-EOS actions differ the result is a 'EOS-choice' object.
+- 21 bit key: label and EOS-bit. The result is then the specific action based on EOS-bit.
+
+20 bit key
+ - Advantages: lower memory overhead, since there are fewer DB entries.
+ - Disadvantages: slower DP performance in the case the path-lists
+ differ, as more objects are encountered in the switch path
+
+21 bit key
+ - Advantages: faster DP performance
+ - Disadvantages: increased memory footprint.
+
+Switching between schemes based on observed/measured action similarity
+is not considered on the grounds of complexity and flip-flopping.
+
+VPP mantra - favour performance over memory. We choose a 21 bit key.
+
+Basics
+^^^^^^
+
+MPLS is not enabled by default. There are two steps to get
+started. First, create the default MPLS FIB:
+
+.. code-block:: console
+
+ $ mpls table add 0
+
+With '0' being the magic number for the 'default' table (just like it
+is for IPv[46]). One can create other MPLS tables, but, unlike IP
+tables, one cannot 'bind' non-default MPLS tables to interfaces, in
+other words all MPLS packets received on an interface will always
+result in a lookup in the default table. One has to be more inventive
+to use the non-default tables...
+
+Secondly, for *each* interface on which you wish to *receive* MPLS
+packets, that interface must be MPLS 'enabled'
+
+.. code-block:: console
+
+ $ set interface mpls GigEthernet0/0/0 enable
+
+there is no equivalent enable for transmit, all that is required is to
+use an interface as an egress path.
+
+Entries in the MPLS FIB can be displayed with:
+
+.. code-block:: console
+
+ $ sh mpls fib [table X] [label]
+
+There is a tight coupling between IP and MPLS forwarding. MPLS
+forwarding equivalence classes (FECs) are often an IP prefix – that is
+to say that traffic matching a given IP prefix is routed into a MPLS
+label switch path (LSP). It is thus necessary to be able to associate
+a given prefix/route with an [out-going] MPLS label that will be
+imposed when the packet is forwarded. This is configured as:
+
+.. code-block:: console
+
+ $ ip route add 1.1.1.1/32 via 10.10.10.10 GigEthernet0/0/0 out-labels 33
+
+packets matching 1.1.1.1/32 will be forwarded out GigEthernet0/0/0 and have
+MPLS label 33 imposed. More than one out-going label can be
+specified. Out-going MPLS labels can be applied to recursive and
+non-recursive routes, e.g;
+
+.. code-block:: console
+
+ $ ip route add 2.2.2.0/24 via 1.1.1.1 out-labels 34
+
+packets matching 2.2.2.0/24 will thus have two MPLS labels imposed; 34
+and 33. This is the realisation of, e.g., an MPLS BGP VPNv4.
+
+To associate/allocate a local-label for a prefix, and thus have
+packets to that local-label forwarded equivalently to the prefix do;
+
+.. code-block:: console
+
+ $ mpls local-label 99 2.2.2.0/24
+
+In the API this action is called a ‘bind’.
+The router receiving the MPLS encapsulated packets needs to be
+programmed with actions associated with each label value – this is
+the role of the MPLS FIB. The MPLS FIB is a table, whose key is the
+MPLS label value and end-of-stack (EOS) bit, which stores the action
+to perform on packets with matching encapsulation. Currently supported
+actions are:
+
+#. Pop the label and perform an IPv[46] lookup in a specified table
+#. Pop the label and forward via a specified next-hop (this is penultimate-hop-pop, PHP)
+#. Swap the label and forward via a specified next-hop.
+
+These can be programmed respectively by:
+
+.. code-block:: console
+
+ $ mpls local-label 33 eos ip4-lookup-in-table X
+ $ mpls local-label 33 [eos] via 10.10.10.10 GigEthernet0/0/0
+ $ mpls local-label 33 [eos] via 10.10.10.10 GigEthernet0/0/0 out-labels 66
+
+the latter is an example of an MPLS cross connect. Any description of
+a next-hop, recursive, non-recursive, labelled, non-labelled, etc,
+that is valid for an IP prefix, is also valid for an MPLS
+local-label. Note the use of the 'eos' keyword which indicates the
+programming is for the case when the label is end-of-stack. The last
+two operations can apply to both eos and non-eos packets, but the pop
+and IP lookup only to an eos packet.
+
+
+MPLS VPN
+^^^^^^^^
+
+To configure an MPLS VPN for a PE the following example can be used.
+
+Step 1; Configure routes to the iBGP peers - note these routes MUST
+have out-going labels;
+
+.. code-block:: console
+
+ $ ip route add 10.0.0.1/32 via 192.168.1.2 Eth0 out-labels 33
+ $ ip route add 10.0.0.2/32 via 192.168.2.2 Eth0 out-labels 34
+
+Step 2; Configure the customer 'VRF'
+
+.. code-block:: console
+
+ $ ip table add 2
+
+Step 3; add a route via the iBGP peer[s] with the MPLS label
+advertised by that peer
+
+.. code-block:: console
+
+ $ ip route add table 2 10.10.10.0/24 via 10.0.0.2 next-hop-table 0 out-labels 122
+ $ ip route add table 2 10.10.10.0/24 via 10.0.0.1 next-hop-table 0 out-labels 121
+
+Step 4; add a route via the eBGP peer
+
+.. code-block:: console
+
+ $ ip route add table 2 10.10.20.0/24 via 172.16.0.1 next-hop-table 2
+
+Step 5; depending on the label allocation scheme used, add routes to
+the MPLS FIB to accept incoming labelled packets:
+
+#. per-prefix label scheme - this command 'binds' the label to the same
+ forwarding as the IP route
+
+ .. code-block:: console
+
+ $ mpls local-label 99 10.10.20.0/24
+
+#. per-CE label scheme - this pops the incoming label and forwards via
+ the next-hop provided. Append config for 'out-labels' if so desired.
+
+ .. code-block:: console
+
+ $ mpls local-label 99 via 172.16.0.1 next-hop-table 2
+
+#. per-VRF label scheme
+
+ .. code-block:: console
+
+ $ mpls local-label 99 via ip4-lookup-in-table 2
+
+MPLS Tunnels
+^^^^^^^^^^^^
+
+MPLS tunnels are unidirectional and can impose a stack of labels. They
+are 'normal' interfaces and thus can be used, for example, as the
+target for IP routes and L2 cross-connects. To construct a tunnel:
+
+.. code-block:: console
+
+ $ mpls tunnel add via 10.10.10.10 GigEthernet0/0/0 out-labels 33 44 55
+
+and to then have that created tunnel perform ECMP:
+
+.. code-block:: console
+
+ $ mpls tunnel add mpls-tunnel0 via 10.10.10.11 GigEthernet0/0/0 out-labels 66 77 88
+
+use
+
+.. code-block:: console
+
+ $ sh mpls tunnel [X]
+
+to see the monster you have created.
+
+An MPLS tunnel interface is an interface like any other and now ready
+for use with the usual set of interface commands, e.g.:
+
+.. code-block:: console
+
+ $ set interface state mpls-tunnel0 up
+ $ set interface ip address mpls-tunnel0 192.168.1.1/30
+ $ ip route 1.1.1.1/32 via mpls-tunnel0
diff --git a/docs/developer/corefeatures/fib/multicast.rst b/docs/developer/corefeatures/fib/multicast.rst
new file mode 100644
index 00000000000..37c5673dcde
--- /dev/null
+++ b/docs/developer/corefeatures/fib/multicast.rst
@@ -0,0 +1,106 @@
+.. _mfib:
+
+IP Multicast FIB
+----------------
+
+The two principal differences between multicast and unicast forwarding
+are:
+
+* there is no load-balancing among paths, there is only replication
+ across paths.
+* multicast forwarding has an explicit reverse path forwarding (RPF)
+ check. It will only forward a packet if it arrives from a peer for
+ which it has been explicitly configured to accept.
+
+The other factor that influences the design of the mFIB is that the
+match criteria (the prefix) is different. For multicast it is
+necessary to be able to match on source and destination/group
+addresses (termed an (S,G)) or only on a destination prefix (a (\*,
+G/m)). This prefix is much bigger than a unicast prefix, and since
+unicast scale is almost always greater than multicast scale, it is not
+a good idea to have a single definition of a prefix. Therefore,
+there is a fib_prefix_t (and hence a fib_entry_t) and an
+mfib_prefix_t (and hence a mfib_entry_t).
+
+The fib_path_t and fib_path_list_t are reused. A path can represent
+either a peer from which to accept packets or a peer to which to send
+packets. A path-extension is added to the fib_path_t/mfib_entry_t to
+describe the role the path plays. Logically the path-list is split
+into two sets; an accepting set and a forwarding set. The forwarding set
+contributes a replicate DPO for forwarding and the accepting set
+contributes a list of interfaces (an mfib_itf_t) for the RPF check.
+
+An IP multicast FIB (mFIB) is a data-structure that holds entries that
+represent a (S,G) or a (\*,G/m) multicast group. There is one IPv4 and
+one IPv6 mFIB per IP table, i.e. each time the user calls 'ip[6] table
+add X' an mFIB is created.
+
+Usage
+^^^^^
+
+To add an entry to the default mFIB for the group (1.1.1.1, 239.1.1.1)
+that will replicate packets to GigEthernet0/0/0 and GigEthernet0/0/1, do:
+
+.. code-block:: console
+
+ $ ip mroute add 1.1.1.1 239.1.1.1 via GigEthernet0/0/0 Forward
+ $ ip mroute add 1.1.1.1 239.1.1.1 via GigEthernet0/0/1 Forward
+
+the flag 'Forward' passed with the path specifies this path to be part of the replication set.
+To add a path from GigEthernet0/0/2 to the accepting (RPF) set do:
+
+.. code-block:: console
+
+ $ ip mroute add 1.1.1.1 239.1.1.1 via GigEthernet0/0/2 Accept
+
+A (\*,G) entry is added by not specifying a source address:
+
+.. code-block:: console
+
+ $ ip mroute add 232.2.2.2 via GigEthernet0/0/2 Forward
+
+A (\*,G/m) entry is added by not specifying a source address and giving
+the group address a mask:
+
+.. code-block:: console
+
+ $ ip mroute add 232.2.2.0/24 via GigEthernet0/0/2 Forward
+
+Entries are deleted when all paths have been removed and all entry flags (see below) are also removed.
+
+Advanced
+^^^^^^^^
+
+There are a set of flags associated only with an entry, see:
+
+.. code-block:: console
+
+ $ show mfib route flags
+
+only some of these are relevant over the API/CLI:
+
+#. Signal - packets that match this entry will generate an event that
+ is sent to the control plane (which can be retrieved via the signal
+ dump API)
+#. Connected - indicates that the control plane should be informed of
+ connected sources (also retrieved via the signal dump API)
+#. Accept-all-itf - the entry shall accept packets from all
+ interfaces, thus eliminating the RPF check
+#. Drop - Drop all packets matching this entry.
+
+flags on an entry can be changed with:
+
+.. code-block:: console
+
+ $ ip mroute <PREFIX> <FLAG>
+
+An alternative approach to the RPF check, that does not check the
+accepting path set, is to give the entry an RPF-ID:
+
+.. code-block:: console
+
+ $ ip mroute <PREFIX> rpf-id X
+
+the RPF-ID is an attribute of a received packet's meta-data and is
+added to the packet when it ingresses on a given entity such as an
+MPLS-tunnel or a BIER table disposition entry.
diff --git a/docs/developer/corefeatures/fib/neighbors.rst b/docs/developer/corefeatures/fib/neighbors.rst
new file mode 100644
index 00000000000..13a3f079b4f
--- /dev/null
+++ b/docs/developer/corefeatures/fib/neighbors.rst
@@ -0,0 +1,88 @@
+.. _neighbors:
+
+Neighbours
+^^^^^^^^^^^
+
+.. figure:: /_images/ip-neighbor.png
+
+Figure 1: Neighbour data model
+
+Figure 1 shows the data model for IP neighbours. An IP neighbour contains the mapping
+between a peer, identified by an IPv4 or IPv6 address, and its MAC address on a given
+interface. An IP-table (VRF) is not part of the neighbour's
+data/identity. This is because the virtualization of a router into
+different tables (VRFs) is performed at the interface level, i.e. an
+IP-table is bound to a particular interface. A neighbour, which is
+attached to an interface, is thus implicitly in that table, and
+only in that table. It is also worth noting that IP neighbours
+contribute forwarding for the egress direction, whereas an IP-table
+is an ingress only function.
+
+The *ip_neighbor_t* represents the control-plane addition of the
+neighbour. The *ip_adjacency_t* contains the data derived from the *ip_neighbor_t* that is needed to
+forward packets to the peer. The additional data in the adjacency are the *rewrite*
+and the *link_type*. The *link_type* is a description of the protocol of the packets
+that will be forwarded with this adjacency; e.g. IPv4, IPv6 or MPLS. The *link_type*
+maps directly to the ether-type in an Ethernet header, or the protocol field in a
+GRE header. The rewrite is a byte string representation of the header that will be
+prepended to the packet when it is sent to that peer. For Ethernet interfaces this
+is the src,dst MAC and the ether-type. For LISP tunnels, the IP src,dst pair
+and the LISP header.
+
+The *ip_neighbor_t* for an IPv4 peer (learned e.g. over ARP) will
+install a *link_type=IPv4* when the entry is created and a
+link_type=MPLS on demand (i.e. when a route with output labels resolves via the peer).
+
+Adjacency
+---------
+
+There are three sub-types of adjacencies. Purists would argue that some
+of these sub-types are not really adjacencies but are instead other
+forms of DPOs, and it would be hard to argue against that, but
+historically (not just in VPP, but in the FIB implementations from
+which VPP draws on for some of its concepts), these have been modelled
+as adjacency types, the one thing they have in common is that they
+have an associated interface and are terminal. The [sub] sub-types are:
+
+* A Neighbour Adjacency (key={interface, next-hop, link-type}). A
+ representation of a peer on a link (as described above). A neighbour adjacency itself has
+ two sub-types; terminal and mid-chain. When one speaks of 'an
+ adjacency' one is usually referring to a terminal neighbour
+ sub-type. A mid-chain adjacency represents a neighbor on a virtual
+ interface which relies on the FIB to perform further forwarding. This
+ adjacency is thus not terminal for the FIB object graph but instead
+ appears in the 'middle' (the term chain is a synonym for graph in
+ some contexts).
+ A neighbour adjacency can be in one of two states; complete and
+ incomplete. A complete adjacency knows the rewrite string that
+ should be used to reach the peer, an incomplete adjacency does
+ not. If the adjacency was added as a result of the addition of an
+ *ip_neighbor_t* then the adjacency will be complete (because the
+ *ip_neighbor_t* knows the peer's MAC address). An incomplete
+ adjacency is created on demand by the FIB when a route's path
+ requires to resolve through such an adjacency. It is thus created in
+ order to resolve the missing dependency, it will become complete
+ once the *ip_neighbor_t* is discovered.
+ In the forwarding path a complete adjacency will prepend the rewrite
+ string and transmit on the egress interface, an incomplete adjacency
+ will construct an ARP/ND request to resolve the peer's IP address.
+
+* A Glean Adjacency (key={interface}). This is a representation of the need to discover
+ a peer on the given interface. It is used when it is known that the
+ packet is destined to an undiscovered peer on that interface. The
+ difference between the glean adjacency and an
+ incomplete neighbour adjacency is that in the forwarding path the
+ glean adjacency will construct an ARP/ND request for the peer as
+ determined from the packet's destination address. The glean
+ adjacency is used to resolve connected prefixes on multi-access
+ interfaces.
+
+* A Multicast Adjacency (key={interface}). This represents the need to send an IP
+ multicast packet out of the adjacency's associated interface. Since
+ IP multicast constructs the destination MAC address from the IP
+ packet's destination/group address, the rewrite is always known and
+ hence the adjacency is always complete.
+
+
+All adjacency types can be shared between routes, hence each type is
+stored in a DB whose key is appropriate for the type.
diff --git a/docs/developer/corefeatures/fib/prefixes.rst b/docs/developer/corefeatures/fib/prefixes.rst
new file mode 100644
index 00000000000..5e0437ae3b3
--- /dev/null
+++ b/docs/developer/corefeatures/fib/prefixes.rst
@@ -0,0 +1,17 @@
+.. _prefixes:
+
+Prefixes
+^^^^^^^^
+
+Some nomenclature used to describe prefixes:
+
+* 1.1.1.1 This is an address since it has no associated mask
+* 1.1.1.0/24 This is a prefix.
+* 1.1.1.1/32 This is a host prefix (the mask length is the size of the address).
+
+Prefix A is more specific than B if its mask length is longer, and less specific if
+the mask is shorter. For example, 1.1.1.0/28 is more specific than 1.1.1.0/24. A
+less specific prefix that overlaps with a more specific is the **covering** prefix.
+For example, 1.1.1.0/24 is the covering prefix for 1.1.1.0/28 and 1.1.1.0/28 is termed
+the **covered** prefix. A covering prefix is therefore always less specific than its
+covered prefixes.
diff --git a/docs/developer/corefeatures/fib/prerequisites.rst b/docs/developer/corefeatures/fib/prerequisites.rst
new file mode 100644
index 00000000000..9d2b5ca21f4
--- /dev/null
+++ b/docs/developer/corefeatures/fib/prerequisites.rst
@@ -0,0 +1,12 @@
+.. _prerequisites:
+
+Prerequisites
+-------------
+
+This section describes some prerequisite topics and nomenclature that are
+foundational to understanding the FIB architecture.
+
+.. toctree::
+
+ graphs
+ prefixes
diff --git a/docs/developer/corefeatures/fib/routes.rst b/docs/developer/corefeatures/fib/routes.rst
new file mode 100644
index 00000000000..a43cbd112d5
--- /dev/null
+++ b/docs/developer/corefeatures/fib/routes.rst
@@ -0,0 +1,353 @@
+.. _routes:
+
+Routes
+^^^^^^
+
+Basics
+------
+
+The anatomy of a route is crucial to understand:
+
+.. code-block:: console
+
+ 1.1.1.0/24 via 10.0.0.1 eth0
+
+A route is composed of two parts; **what** to match against and **how** to forward
+the matched packets. In the above example we want to match packets
+whose destination IP address is in the 1.1.1.0/24 subnet and then we
+want to forward those packets to 10.0.0.1 on interface eth0. We
+therefore want to match the **prefix** 1.1.1.0/24 and forward on the
+**path** to 10.0.0.1, eth0.
+
+Matching on a prefix is the particular task of the IP FIB, matching on
+other packet attributes is done by other subsystems, e.g. matching on
+MPLS labels in the MPLS-FIB, or matching on a tuple in ACL based
+forwarding (ABF), 'matching' on all packets that arrive on an L3
+interface (l3XC). Although these subsystems match on different
+properties, they share the infrastructure on **how** to forward
+matched packets, that is they share the **paths**. The FIB paths (or
+really the path-list) thus provide services to clients, this service
+is to **contribute** forwarding, this, in terms that will be made
+clear in later sections, is to provide the DPO to use.
+
+The prime function of the FIB is to *resolve* the paths for a
+route. To resolve a route is to construct an object graph that fully
+describes how to forward matching packets. This means that the graph
+must terminate with an object (the leaf node) that describes how
+to send a packet on an interface [#f1]_, i.e. what encap to add to the
+packet and what interface to send it to; this is the purpose of the IP
+adjacency object. In Figure 3 the route is resolved as the graph is
+complete from *fib_entry_t* to *ip_adjacency_t*.
+
+
+Thread Model
+^^^^^^^^^^^^
+
+The FIB is not thread safe. All actions on the FIB are expected to
+occur exclusively in the main thread. However, the data-structures
+that FIB updates to add routes are thread safe,
+w.r.t. addition/deletion and read, therefore routes can be added
+without holding the worker thread barrier lock.
+
+
+Tables
+------
+
+An IP FIB is a set of prefixes against which to match; it is
+sub-address family (SAFI) specific (i.e. there is one for ipv4 and ipv6, unicast
+and multicast). An IP Table is address family (AFI) specific (i.e. the
+'table' includes the unicast and multicast FIB).
+
+Each FIB is identified by the SAFI and instance number (the [pool]
+index), each table is identified by the AFI and ID. The table's ID is
+assigned by the user when the table is constructed. Table ID 0 is
+reserved for the global/default table.
+
+In most routing models a VRF is composed of an IPv4 and IPv6 table,
+however, VPP has no construct to model this association, it deals only
+with tables and FIBs.
+
+A unicast FIB is comprised of two route data-bases; forwarding and non-forwarding. The
+forwarding data-base contains routes against which a packet will perform a longest
+prefix match (LPM) in the data-plane. The non-forwarding DB contains all the routes
+with which VPP has been programmed. Some of these routes may be
+unresolved, preventing their insertion into the forwarding DB.
+(see section: Adjacency source FIB entries).
+
+Model
+-----
+
+The route data is decomposed into three parts; entry, path-list and paths;
+
+* The *fib_entry_t*, which contains the route's prefix, is the representation of that prefix's entry in the FIB table.
+* The *fib_path_t* is a description of where to send the packets destined to the route's prefix. There are several types of path, including:
+
+ * Attached next-hop: the path is described with an interface and a next-hop. The next-hop is in the same sub-net as the router's own address on that interface, hence the peer is considered to be *attached*
+
+ * Attached: the path is described only by an interface. An
+ attached path means that all addresses covered by the route's
+ prefix are on the same L2 segment to which that router's
+ interface is attached. This means it is possible to ARP for any
+ address covered by the route's prefix. If this is not the case
+ then another device in that L2 segment needs to run proxy
+ ARP. An attached path is really only appropriate for a point-to-point
+ (P2P) interface where ARP is not required, i.e. a GRE tunnel. On
+ a p2p interface, attached and attached-nexthop paths will
+ resolve via a special 'auto-adjacency'. This is an adjacency
+ whose next-hop is the all zeros address and describes the only
+ peer on the link.
+
+ * Recursive: The path is described only via the next-hop and table-id.
+
+ * De-aggregate: The path is described only via the special all
+ zeros address and a table-id. This implies a subsequent lookup
+ in the table should be performed.
+
+ * There are other path types, please consult the code.
+
+* The *fib_path_list_t* represents the list of paths from which to choose when forwarding. A path-list is a shared object, i.e. it is the parent to multiple fib_entry_t children. In order to share any object type it is necessary for a child to search for an existing object matching its requirements. For this there must be a database. The key to the path-list database is a combined description of all of the paths it contains [#f2]_. Searching the path-list database is required with each route addition, so it is populated only with path-lists for which sharing will bring convergence benefits (see Section: :ref:`fastconvergence`).
+
+.. figure:: /_images/fib20fig2.png
+
+Figure 2: Route data model class diagram
+
+Figure 2 shows an example of a route with two attached-next-hop paths. Each of these
+paths will *resolve* by finding the adjacency that matches the path's attributes, which
+are the same as the key for the adjacency database [#f3]_. The *forwarding information (FI)*
+is the set of adjacencies that are available for load-balancing the traffic in the
+data-plane. A path *contributes* an adjacency to the route's forwarding information, the
+path-list contributes the full forwarding information for IP packets.
+
+.. figure:: /_images/fib20fig3.png
+
+Figure 3: Route object diagram
+
+Figure 3 shows the object instances and their relationships created in order to resolve
+the routes also shown. The graph nature of these relationships is evident; children
+are displayed at the top of the diagram, their parents below them. Forward walks are
+thus from top to bottom, back walks bottom to top. The diagram shows the objects
+that are shared, the path-list and adjacency. Sharing objects is critical to fast
+convergence (see section :ref:`fastconvergence`).
+
+FIB sources
+"""""""""""
+There are various entities in the system that can add routes to the FIB tables.
+Each of these entities is termed a *source*. When the same prefix is added by different
+sources the FIB must arbitrate between them to determine which source will contribute
+the forwarding information. Since each source determines the forwarding information
+using different best path and loop prevention algorithms, it is not correct for the
+forwarding information of multiple sources to be combined. Instead the FIB must choose
+to use the forwarding information from only one source. This choice is based on a static
+priority assignment [#f4]_. The FIB must maintain the information each source has added
+so it can be restored should that source become the best source. VPP has two
+*control-plane* sources; the API and the CLI. The API has the higher priority.
+Each *source* data is represented by a *fib_entry_src_t* object of which a
+*fib_entry_t* maintains a sorted vector.
+
+The following configuration:
+
+.. code-block:: console
+
+ $ set interface ip address GigabitEthernet0/8/0 192.168.1.1/24
+
+results in the addition of two FIB entries; 192.168.1.0/24 which is connected and
+attached, and 192.168.1.1/32 which is connected and local (a.k.a.
+receive or for-us). A prefix is *connected* when it is applied to a router's interface.
+Both prefixes are *interface* sourced. The interface source has a high priority, so
+the accidental or nefarious addition of identical prefixes does not prevent the
+router from correctly forwarding. Packets matching a connected prefix will
+generate an ARP request for the packet's destination address, this process is known
+as a *glean*.
+
+An *attached* prefix also results in a glean, but the router does not have its own
+address in that sub-net. The following configuration will result in an attached
+route, which resolves via an attached path;
+
+.. code-block:: console
+
+ $ ip route add table X 10.10.10.0/24 via gre0
+
+as mentioned before, these are only appropriate for point-to-point
+links.
+
+If table X is not the table to which gre0 is bound,
+then this is the case of an attached export (see the section :ref:`attachedexport`).
+
+Adjacency source FIB entries
+""""""""""""""""""""""""""""
+
+Whenever an ARP entry is created it will source a *fib_entry_t*. In this case the
+route is of the form:
+
+.. code-block:: console
+
+ $ ip route add table X 10.0.0.1/32 via 10.0.0.1 GigabitEthernet0/8/0
+
+This is a host prefix with a path whose next-hop address is the same host. This route
+highlights the distinction between the route's prefix - a description of the traffic
+to match - and the path - a description of where to send the matched traffic.
+Table X is the same table to which the interface is bound. FIB entries that are
+sourced by adjacencies are termed *adj-fibs*. The priority of the adjacency source
+is lower than the API source, so the following configuration:
+
+.. code-block:: console
+
+ $ set interface ip address GigabitEthernet0/8/0 192.168.1.1/24
+ $ ip arp 192.168.1.2 GigabitEthernet0/8/0 dead.dead.dead
+ $ ip route add 192.168.1.2 via 10.10.10.10 GigabitEthernet1/8/0
+
+will forward traffic for 192.168.1.2 via GigabitEthernet1/8/0. That is the route added by the control
+plane is favoured over the adjacency discovered by ARP. The control plane, with its
+associated authentication, is considered the authoritative source. To counter the
+nefarious addition of adj-fibs, through the nefarious injection of adjacencies, the
+FIB is also required to ensure that only adj-fibs whose less specific covering prefix
+is attached are installed in forwarding. This requires the use of *cover tracking*,
+where a route maintains a dependency relationship with the route that is its less
+specific cover. When this cover changes (i.e. there is a new covering route) or the
+forwarding information of the cover is updated, then the covered route is notified.
+Adj-fibs that fail this cover check are not installed in the fib_table_t's forwarding
+table, they are only present in the non-forwarding table.
+
+Overlapping sub-nets are not supported, so no adj-fib has multiple paths. The control
+plane is expected to remove a prefix configured for an interface before the interface
+changes VRF.
+
+Recursive Routes
+""""""""""""""""
+
+Figure 4 shows the data structures used to describe a recursive route. The
+representation is almost identical to attached next-hop paths. The difference
+being that the *fib_path_t* has a parent that is another *fib_entry_t*, termed the
+*via-entry*
+
+.. figure:: /_images/fib20fig4.png
+
+Figure 4: Recursive route class diagram.
+
+In order to forward traffic to 64.10.128.0/20 the FIB must first determine how to forward
+traffic to 1.1.1.1/32. This is recursive resolution. Recursive resolution, which is
+essentially a cache of the data-plane result, emulates a longest prefix match for the
+*via-address* 1.1.1.1 in the *via-table* table 0 [#f5]_.
+
+Recursive resolution (RR) will source a host-prefix entry in the via-table for the
+via-address. The RR source is a low priority source. In the unlikely [#f6]_ event that the
+RR source is the best source, then it must derive forwarding information from its
+covering prefix.
+
+There are two cases to consider:
+
+* The cover is connected [#f7]_. The via-address is then an attached host and the RR source can resolve directly via the adjacency with the key {via-address, interface-of-connected-cover}
+* The cover is not connected [#f8]_. The RR source can directly inherit the forwarding information from its cover.
+
+This dependency on the covering prefix means the RR source will track its cover. The
+covering prefix will *change* when;
+
+* A more specific prefix is inserted. For this reason whenever an entry is inserted into a FIB table its cover must be found so that its covered dependents can be informed.
+* The existing cover is removed. The covered prefixes must form a new relationship with the next less specific.
+
+The cover will be *updated* when the route for the covering prefix is modified. The
+cover tracking mechanism will provide the RR sourced entry with a notification in the
+event of a change or update of the cover, and the source can take the necessary action.
+
+The RR sourced FIB entry becomes the parent of the *fib_path_t* and will contribute its
+forwarding information to that path, so that the child's FIB entry can construct its own
+forwarding information.
+
+Figure 5 shows the object instances created to represent the recursive route and
+its resolving route also shown.
+
+.. figure:: /_images/fib20fig5.png
+
+Figure 5: Recursive Routes object diagram
+
+If the source adding recursive routes does not itself perform recursive resolution [#f9]_
+then it is possible that the source may inadvertently programme a recursion loop.
+
+An example of a recursion loop is the following configuration:
+
+.. code-block:: console
+
+ $ ip route add 5.5.5.5/32 via 6.6.6.6
+ $ ip route add 6.6.6.6/32 via 7.7.7.7
+ $ ip route add 7.7.7.7/32 via 5.5.5.5
+
+This shows a loop over three levels, but any number is possible. FIB will detect
+recursion loops by forward walking the graph when a *fib_entry_t* forms a child-parent
+relationship with a *fib_path_list_t*. The walk checks to see if the same object instances
+are encountered. When a recursion loop is formed the control plane [#f10]_ graph becomes
+cyclic, thus allowing the child-parent dependencies to form. This is necessary so that
+when the loop breaks, the affected children can be updated.
+
+Output labels
+"""""""""""""
+
+A route may have associated output MPLS labels [#f11]_. These are labels that are expected
+to be imposed on a packet as it is forwarded. It is important to note that an MPLS
+label is per-route and per-path, therefore, even though routes share paths they do not
+necessarily have the same label for that path [#f12]_. A label is therefore uniquely associated
+to a *fib_entry_t* and associated with one of the *fib_path_t* to which it forwards.
+MPLS labels are modelled via the generic concept of a *path-extension*. A *fib_entry_t*
+therefore has a vector of zero to many *fib_path_ext_t* objects to represent the labels
+with which it is configured.
+
+
+Delegates
+^^^^^^^^^
+
+A common software development pattern, a delegate is a means to
+extend the functionality of one object through composition of
+another, these other objects are called delegates. Both
+**fib_entry_t** and **ip_adjacency_t** support extension via delegates.
+
+The FIB uses delegates to add functionality when those functions are
+required by only a few object instances rather than all of them, to
+save on memory. For example, building/contributing a load-balance
+object used to forward non-EOS MPLS traffic is only required for a
+fib_entry_t that corresponds to a BGP peer and that peer is
+advertising labeled routes - there are only a few of
+these. See **fib_entry_delegate.h** for a full list of delegate types.
+
+
+Tracking
+^^^^^^^^
+
+A prime service FIB provides for other sub-systems is the ability to
+'track' the forwarding for a given next-hop. For example, a tunnel
+will want to know how to forward to its destination address. It can
+therefore request of the FIB to track this host-prefix and inform it
+when the forwarding for that prefix changes.
+
+FIB tracking sources a host-prefix entry in the FIB using the 'recursive
+resolution (RR)' source, in exactly the same way that a recursive path
+does. If the entry did not previously exist, then the RR source will
+inherit (and track) forwarding from its covering prefix, therefore all
+packets that match this entry are forwarded in the same way as if the
+entry did not exist. The tunnel that is tracking this FIB entry will
+become a child dependent. The benefit to creating the entry, is that
+it now exists in the FIB node graph, so all actions that happen on its
+parents, are propagated to the host-prefix entry and consequently to
+the tunnel.
+
+FIB provides a wrapper to the sourcing of the host-prefix using a
+delegate attached to the entry, and the entry is RR sourced only once.
+The benefit of this approach is that each time a new client tracks
+the entry it doesn't RR source it. When an entry is sourced all its
+children are updated. Thus, new clients tracking an entry is
+O(n^2). With the tracker as indirection, the entry is sourced only once.
+
+
+.. rubric:: Footnotes:
+
+.. [#f1] Or terminate in an object that transitions the packet out of
+ the FIB domain, e.g. a drop.
+.. [#f2] Optimisations
+.. [#f3] Note it is valid for either interface to be bound to a different table than table 1
+.. [#f4] The engaged reader can see the full priority list in vnet/vnet/fib/fib_entry.h
+.. [#f5] Note it is only possible to add routes via an address (i.e. a /32 or /128) not via a shorter mask prefix. There is no use case for the latter
+.. [#f6] For iBGP the via-address is the loopback address of the peer PE, for eBGP it is the adj-fib for the CE
+.. [#f7] As is the case for eBGP
+.. [#f8] As is the case for iBGP
+.. [#f9] If that source is relying on FIB to perform recursive resolution, then there is no reason it should do so itself.
+.. [#f10] The derived data-plane graph MUST never be cyclic
+.. [#f11] Advertised, e.g. by LDP, SR or BGP
+.. [#f12] The only case where the labels will be the same is BGP VPNv4 label allocation per-VRF
diff --git a/docs/developer/corefeatures/fib/scale.rst b/docs/developer/corefeatures/fib/scale.rst
new file mode 100644
index 00000000000..2ec8c6a85ec
--- /dev/null
+++ b/docs/developer/corefeatures/fib/scale.rst
@@ -0,0 +1,247 @@
+.. _scale:
+
+Scale
+-----
+
+The only limiting factor on FIB scale is the amount of memory
+allocated to each heap the FIB uses, and there are 2:
+
+* The main heap
+* The stats heap
+
+
+Main Heap
+^^^^^^^^^
+
+The main heap is used to allocate all memory needed for the FIB
+data-structures. Each table, created by the user, i.e. with;
+
+.. code-block:: console
+
+ $ ip table add 1
+
+or the default table, comprises 2 *ip4_fib_t* objects.
+The 'non-forwarding' *ip4_fib_t* contains all the entries in the table
+and, the 'forwarding' contains the entries that are matched against in
+the data-plane. The difference between the two sets is the entries
+that should not be matched in the data-plane.
+Each *ip4_fib_t* comprises an mtrie (for fast lookup in the data-plane)
+and a hash table per-prefix length (for lookup in the control plane).
+
+To see the amount of memory consumed by the IPv4 tables use:
+
+.. code-block:: console
+
+ vpp# sh ip fib mem
+ ipv4-VRF:0 mtrie:335744 hash:4663
+ ipv4-VRF:1 mtrie:333056 hash:3499
+ totals: mtrie:668800 hash:8162 all:676962
+
+this output shows two 'empty' (i.e. no added routes) tables. Each
+mtrie uses about 150k of memory, so each table about 300k.
+
+
+Below the output having added 1M, 2M and 4M routes respectively:
+
+.. code-block:: console
+
+ vpp# sh ip fib mem
+ ipv4-VRF:0 mtrie:335744 hash:4695
+ totals: mtrie:335744 hash:4695 all:340439
+
+.. code-block:: console
+
+ vpp# sh ip fib mem
+ ipv4-VRF:0 mtrie:5414720 hash:41177579
+ totals: mtrie:5414720 hash:41177579 all:46592299
+
+.. code-block:: console
+
+ vpp# sh ip fib mem
+ ipv4-VRF:0 mtrie:22452608 hash:168544508
+ totals: mtrie:22452608 hash:168544508 all:190997116
+
+
+IPv6 also has the concept of forwarding and non-forwarding entries,
+however for IPv6 all the forwarding entries are stored in a single
+hash table (same goes for the non-forwarding). The key to the hash
+table includes the IPv6 table-id.
+
+To see the amount of memory consumed by the IPv6 tables use:
+
+.. code-block:: console
+
+ vpp# sh ip6 fib mem
+ IPv6 Non-Forwarding Hash Table:
+ Hash table ip6 FIB non-fwding table
+ 7 active elements 7 active buckets
+ 1 free lists
+ 0 linear search buckets
+ arena: base 7f2fe28bf000, next 803c0
+ used 525248 b (0 Mbytes) of 33554432 b (32 Mbytes)
+
+ IPv6 Forwarding Hash Table:
+ Hash table ip6 FIB fwding table
+ 7 active elements 7 active buckets
+ 1 free lists
+ 0 linear search buckets
+ arena: base 7f2fe48bf000, next 803c0
+ used 525248 b (0 Mbytes) of 33554432 b (32 Mbytes)
+
+as we scale to 128k IPv6 entries:
+
+.. code-block:: console
+
+ vpp# sh ip6 fib mem
+ IPv6 Non-Forwarding Hash Table:
+ Hash table ip6 FIB non-fwding table
+ 131079 active elements 32773 active buckets
+ 2 free lists
+ [len 1] 2 free elts
+ 0 linear search buckets
+ arena: base 7fed7a514000, next 4805c0
+ used 4720064 b (4 Mbytes) of 1073741824 b (1024 Mbytes)
+
+ IPv6 Forwarding Hash Table:
+ Hash table ip6 FIB fwding table
+ 131079 active elements 32773 active buckets
+ 2 free lists
+ [len 1] 2 free elts
+ 0 linear search buckets
+ arena: base 7fedba514000, next 4805c0
+ used 4720064 b (4 Mbytes) of 1073741824 b (1024 Mbytes)
+
+and 256k:
+
+.. code-block:: console
+
+ vpp# sh ip6 fib mem
+ IPv6 Non-Forwarding Hash Table:
+ Hash table ip6 FIB non-fwding table
+ 262151 active elements 65536 active buckets
+ 2 free lists
+ [len 1] 6 free elts
+ 0 linear search buckets
+ arena: base 7fed7a514000, next 880840
+ used 8915008 b (8 Mbytes) of 1073741824 b (1024 Mbytes)
+
+ IPv6 Forwarding Hash Table:
+ Hash table ip6 FIB fwding table
+ 262151 active elements 65536 active buckets
+ 2 free lists
+ [len 1] 6 free elts
+ 0 linear search buckets
+ arena: base 7fedba514000, next 880840
+ used 8915008 b (8 Mbytes) of 1073741824 b (1024 Mbytes)
+
+and 1M:
+
+.. code-block:: console
+
+ vpp# sh ip6 fib mem
+ IPv6 Non-Forwarding Hash Table:
+ Hash table ip6 FIB non-fwding table
+ 1048583 active elements 65536 active buckets
+ 4 free lists
+ [len 1] 65533 free elts
+ [len 2] 65531 free elts
+ [len 4] 9 free elts
+ 0 linear search buckets
+ arena: base 7fed7a514000, next 3882740
+ used 59254592 b (56 Mbytes) of 1073741824 b (1024 Mbytes)
+
+ IPv6 Forwarding Hash Table:
+ Hash table ip6 FIB fwding table
+ 1048583 active elements 65536 active buckets
+ 4 free lists
+ [len 1] 65533 free elts
+ [len 2] 65531 free elts
+ [len 4] 9 free elts
+ 0 linear search buckets
+ arena: base 7fedba514000, next 3882740
+ used 59254592 b (56 Mbytes) of 1073741824 b (1024 Mbytes)
+
+as can be seen from the output the IPv6 hash-table in this case was scaled
+to 1GB and 1 million prefixes have used 56MB of it.
+
+The main heap is also used to allocate objects that represent the FIB
+entries in the control and data plane (see :ref:`controlplane` and
+:ref:`dataplane`) such as *fib_entry_t* and *load_balance_t*. These come
+from the main heap because they are not protocol specific
+(i.e. they are used to represent either IPv4, IPv6 or MPLS
+entries).
+
+With 1M prefixes allocated the memory usage is:
+
+.. code-block:: console
+
+ vpp# sh fib mem
+ FIB memory
+ Tables:
+ SAFI Number Bytes
+ IPv4 unicast 1 33619968
+ IPv6 unicast 2 118502784
+ MPLS 0 0
+ IPv4 multicast 1 1175
+ IPv6 multicast 1 525312
+ Nodes:
+ Name Size in-use /allocated totals
+ Entry 72 1048589/ 1048589 75498408/75498408
+ Entry Source 40 1048589/ 1048589 41943560/41943560
+ Entry Path-Extensions 76 0 / 0 0/0
+ multicast-Entry 192 6 / 6 1152/1152
+ Path-list 40 18 / 18 720/720
+ uRPF-list 16 14 / 14 224/224
+ Path 72 22 / 22 1584/1584
+ Node-list elements 20 1048602/ 1048602 20972040/20972040
+ Node-list heads 8 24 / 24 192/192
+
+and with 2M
+
+.. code-block:: console
+
+ vpp# sh fib mem
+ FIB memory
+ Tables:
+ SAFI Number Bytes
+ IPv4 unicast 1 33619968
+ IPv6 unicast 2 252743040
+ MPLS 0 0
+ IPv4 multicast 1 1175
+ IPv6 multicast 1 525312
+ Nodes:
+ Name Size in-use /allocated totals
+ Entry 72 2097165/ 2097165 150995880/150995880
+ Entry Source 40 2097165/ 2097165 83886600/83886600
+ Entry Path-Extensions 76 0 / 0 0/0
+ multicast-Entry 192 6 / 6 1152/1152
+ Path-list 40 18 / 19 720/760
+ uRPF-list 16 18 / 18 288/288
+ Path 72 22 / 23 1584/1656
+ Node-list elements 20 2097178/ 2097178 41943560/41943560
+ Node-list heads 8 24 / 24 192/192
+
+However, the situation is not as simple as that. All of the 1M prefixes
+added above were reachable via the same next-hop, so the path-list
+(and path) they use is shared. As prefixes are added that use
+different (sets of) next-hops, the number of path-lists and paths
+required will increase.
+
+
+Stats Heap
+^^^^^^^^^^
+
+VPP collects statistics for each route. For each route VPP collects
+byte and packet counters for packets sent to the prefix (i.e. the
+route was matched in the data-plane) and packets sent via the prefix (i.e. the
+matching prefix is reachable through it - like a BGP peer). This
+requires 4 counters per route in the stats segment.
+
+Below shows the size of the stats segment with 1M, 2M and 4M routes.
+
+.. code-block:: console
+
+ total: 1023.99M, used: 127.89M, free: 896.10M, trimmable: 830.94M
+ total: 1023.99M, used: 234.14M, free: 789.85M, trimmable: 668.15M
+ total: 1023.99M, used: 456.83M, free: 567.17M, trimmable: 388.91M
+
diff --git a/docs/developer/corefeatures/fib/thedatamodel.rst b/docs/developer/corefeatures/fib/thedatamodel.rst
new file mode 100644
index 00000000000..cd3de179814
--- /dev/null
+++ b/docs/developer/corefeatures/fib/thedatamodel.rst
@@ -0,0 +1,15 @@
+.. _thedatamodel:
+
+The Data Model
+--------------
+
+The FIB data model comprises two parts; the control-plane (CP) and the data-plane
+(DP). The CP data model represents the data that is programmed into VPP by the
+upper layers. The DP model represents how VPP derives actions to be performed on
+packets as they are switched.
+
+.. toctree::
+
+ controlplane
+ dataplane
+
diff --git a/docs/developer/corefeatures/fib/tunnels.rst b/docs/developer/corefeatures/fib/tunnels.rst
new file mode 100644
index 00000000000..d948a5e2bda
--- /dev/null
+++ b/docs/developer/corefeatures/fib/tunnels.rst
@@ -0,0 +1,62 @@
+.. _tunnels:
+
+Tunnels
+-------
+
+Tunnels share a similar property to recursive routes in that after applying the
+tunnel encapsulation, a new packet must be forwarded, i.e. forwarding is
+recursive. However, as with recursive routes the tunnel's destination is known
+beforehand, so the second lookup can be avoided if the packet can follow the
+already constructed data-plane graph for the tunnel's destination. This process
+of joining two DP graphs together is termed *stacking*.
+
+.. figure:: /_images/fib20fig11.png
+
+Figure 11: Tunnel control plane object diagram
+
+Figure 11 shows the control plane object graph for a route via a tunnel. The two
+sub-graphs for the route via the tunnel and the route for the tunnel's
+destination are shown to the right and left respectively. The red line shows the
+relationship formed by stacking the two sub-graphs. The adjacency on the tunnel
+interface is termed a 'mid-chain' since it is now present in the middle of the
+graph/chain rather than its usual terminal location.
+
+The mid-chain adjacency is contributed by the gre_tunnel_t, which also becomes
+part of the FIB control-plane graph. Consequently it will be visited by a
+back-walk when the forwarding information for the tunnel's destination changes.
+This will trigger it to restack the mid-chain adjacency on the new
+*load_balance_t* contributed by the parent *fib_entry_t*.
+
+If the back-walk indicates that there is no route to the tunnel's
+destination, or that the resolving route does not meet resolution
+constraints, then the tunnel can be marked as down, and fast
+convergence can be triggered in the same way as for physical interfaces (see section ...).
+
+
+Multi-Point Tunnels
+^^^^^^^^^^^^^^^^^^^
+
+Multi-point tunnels are an example of a non-broadcast multi-access
+interface. In simple terms this means there are many peers on the link
+but it is not possible to broadcast a single message to all of them at
+once, and hence the usual peer discovery mechanism (as employed,
+e.g. by ARP) is not available. Although an *ip_neighbor_t* is a
+representation of an IP peer on a link, it is not valid in this
+context as it maps the peer's identity to its MAC address. For a
+tunnel peer it is required to map the peer's overlay address (the
+attached address, the one in the same subnet as the device) with the
+peer's underlay address (probably on the other side of the
+internet). In the P2P case where there is only one peer on the link,
+the peer's underlay address is the same as the tunnel's destination
+address.
+The data structure that represents the mapping of the peer's overlay
+with underlay address is an entry in the Tunnel Endpoint Information
+Base (TEIB); the *teib_entry_t*. TEIB entries are created by the
+control plane (e.g. NHRP (RFC2332)).
+
+Each mid-chain adjacency on a multi-point tunnel is stacked on the
+*fib_entry_t* object that resolves the peer's underlay address. The
+glean adjacency on the tunnel resolves via a drop, since broadcasts
+are not possible. A multicast adjacency on a multi-point tunnel is
+currently a work in progress.
+
diff --git a/docs/developer/corefeatures/index.rst b/docs/developer/corefeatures/index.rst
new file mode 100644
index 00000000000..adc086e3c98
--- /dev/null
+++ b/docs/developer/corefeatures/index.rst
@@ -0,0 +1,21 @@
+.. _corefeatures:
+
+=======================
+Core Features
+=======================
+
+.. toctree::
+ :maxdepth: 1
+
+ fib/index
+ sr/index
+ punt
+ ipsec
+ bfd_doc
+ ipfix_doc
+ span_doc
+ mtu
+ sylog_doc
+ eventviewer
+ stats
+ selinux_doc
diff --git a/docs/developer/corefeatures/ipfix_doc.rst b/docs/developer/corefeatures/ipfix_doc.rst
new file mode 120000
index 00000000000..a4242e19928
--- /dev/null
+++ b/docs/developer/corefeatures/ipfix_doc.rst
@@ -0,0 +1 @@
+../../../src/vnet/ipfix-export/ipfix_doc.rst \ No newline at end of file
diff --git a/docs/developer/corefeatures/ipsec.rst b/docs/developer/corefeatures/ipsec.rst
new file mode 120000
index 00000000000..46a12cdeacc
--- /dev/null
+++ b/docs/developer/corefeatures/ipsec.rst
@@ -0,0 +1 @@
+../../../src/vnet/ipsec/ipsec.rst \ No newline at end of file
diff --git a/docs/developer/corefeatures/mtu.rst b/docs/developer/corefeatures/mtu.rst
new file mode 120000
index 00000000000..08930ffa03a
--- /dev/null
+++ b/docs/developer/corefeatures/mtu.rst
@@ -0,0 +1 @@
+../../../src/vnet/mtu.rst \ No newline at end of file
diff --git a/docs/developer/corefeatures/punt.rst b/docs/developer/corefeatures/punt.rst
new file mode 120000
index 00000000000..0a56632f671
--- /dev/null
+++ b/docs/developer/corefeatures/punt.rst
@@ -0,0 +1 @@
+../../../src/vnet/ip/punt.rst \ No newline at end of file
diff --git a/docs/developer/corefeatures/selinux_doc.rst b/docs/developer/corefeatures/selinux_doc.rst
new file mode 120000
index 00000000000..f0b3f5cf3fc
--- /dev/null
+++ b/docs/developer/corefeatures/selinux_doc.rst
@@ -0,0 +1 @@
+../../../extras/selinux/selinux_doc.rst \ No newline at end of file
diff --git a/docs/developer/corefeatures/span_doc.rst b/docs/developer/corefeatures/span_doc.rst
new file mode 120000
index 00000000000..a8a1cb3f994
--- /dev/null
+++ b/docs/developer/corefeatures/span_doc.rst
@@ -0,0 +1 @@
+../../../src/vnet/span/span_doc.rst \ No newline at end of file
diff --git a/docs/developer/corefeatures/sr/index.rst b/docs/developer/corefeatures/sr/index.rst
new file mode 100644
index 00000000000..8c87e2c00a3
--- /dev/null
+++ b/docs/developer/corefeatures/sr/index.rst
@@ -0,0 +1,14 @@
+.. _corefeature_sr:
+
+===============
+Segment routing
+===============
+
+.. toctree::
+ :maxdepth: 1
+
+ sr_doc
+ sr_localsid
+ sr_mpls
+ sr_policy
+ sr_steering \ No newline at end of file
diff --git a/docs/developer/corefeatures/sr/sr_doc.rst b/docs/developer/corefeatures/sr/sr_doc.rst
new file mode 120000
index 00000000000..94bdfaa0de1
--- /dev/null
+++ b/docs/developer/corefeatures/sr/sr_doc.rst
@@ -0,0 +1 @@
+../../../../src/vnet/srv6/sr_doc.rst \ No newline at end of file
diff --git a/docs/developer/corefeatures/sr/sr_localsid.rst b/docs/developer/corefeatures/sr/sr_localsid.rst
new file mode 120000
index 00000000000..0492fc7532f
--- /dev/null
+++ b/docs/developer/corefeatures/sr/sr_localsid.rst
@@ -0,0 +1 @@
+../../../../src/vnet/srv6/sr_localsid.rst \ No newline at end of file
diff --git a/docs/developer/corefeatures/sr/sr_mpls.rst b/docs/developer/corefeatures/sr/sr_mpls.rst
new file mode 120000
index 00000000000..d2fe4025326
--- /dev/null
+++ b/docs/developer/corefeatures/sr/sr_mpls.rst
@@ -0,0 +1 @@
+../../../../src/vnet/srmpls/sr_doc.rst \ No newline at end of file
diff --git a/docs/developer/corefeatures/sr/sr_policy.rst b/docs/developer/corefeatures/sr/sr_policy.rst
new file mode 120000
index 00000000000..bbd87348c84
--- /dev/null
+++ b/docs/developer/corefeatures/sr/sr_policy.rst
@@ -0,0 +1 @@
+../../../../src/vnet/srv6/sr_policy.rst \ No newline at end of file
diff --git a/docs/developer/corefeatures/sr/sr_steering.rst b/docs/developer/corefeatures/sr/sr_steering.rst
new file mode 120000
index 00000000000..7edf737ab2b
--- /dev/null
+++ b/docs/developer/corefeatures/sr/sr_steering.rst
@@ -0,0 +1 @@
+../../../../src/vnet/srv6/sr_steering.rst \ No newline at end of file
diff --git a/docs/developer/corefeatures/stats.rst b/docs/developer/corefeatures/stats.rst
new file mode 120000
index 00000000000..ef1829611d9
--- /dev/null
+++ b/docs/developer/corefeatures/stats.rst
@@ -0,0 +1 @@
+../../../src/vpp/stats/stats.rst \ No newline at end of file
diff --git a/docs/developer/corefeatures/sylog_doc.rst b/docs/developer/corefeatures/sylog_doc.rst
new file mode 120000
index 00000000000..731267b3c25
--- /dev/null
+++ b/docs/developer/corefeatures/sylog_doc.rst
@@ -0,0 +1 @@
+../../../src/vnet/syslog/sylog_doc.rst \ No newline at end of file
diff --git a/docs/developer/devicedrivers/af_xdp.rst b/docs/developer/devicedrivers/af_xdp.rst
new file mode 120000
index 00000000000..ae41d1f5d72
--- /dev/null
+++ b/docs/developer/devicedrivers/af_xdp.rst
@@ -0,0 +1 @@
+../../../src/plugins/af_xdp/af_xdp_doc.rst \ No newline at end of file
diff --git a/docs/developer/devicedrivers/avf.rst b/docs/developer/devicedrivers/avf.rst
new file mode 120000
index 00000000000..06a84f56418
--- /dev/null
+++ b/docs/developer/devicedrivers/avf.rst
@@ -0,0 +1 @@
+../../../src/plugins/avf/README.rst \ No newline at end of file
diff --git a/docs/developer/devicedrivers/index.rst b/docs/developer/devicedrivers/index.rst
new file mode 100644
index 00000000000..e1194e7aa98
--- /dev/null
+++ b/docs/developer/devicedrivers/index.rst
@@ -0,0 +1,15 @@
+.. _devicedrivers:
+
+==============
+Device drivers
+==============
+
+
+
+.. toctree::
+ :maxdepth: 1
+
+ avf
+ rdma
+ vmxnet3
+ af_xdp
diff --git a/docs/developer/devicedrivers/rdma.rst b/docs/developer/devicedrivers/rdma.rst
new file mode 120000
index 00000000000..d16203c7c5a
--- /dev/null
+++ b/docs/developer/devicedrivers/rdma.rst
@@ -0,0 +1 @@
+../../../src/plugins/rdma/rdma_doc.rst \ No newline at end of file
diff --git a/docs/developer/devicedrivers/vmxnet3.rst b/docs/developer/devicedrivers/vmxnet3.rst
new file mode 120000
index 00000000000..361b899fcc0
--- /dev/null
+++ b/docs/developer/devicedrivers/vmxnet3.rst
@@ -0,0 +1 @@
+../../../src/plugins/vmxnet3/README.rst \ No newline at end of file
diff --git a/docs/developer/extras/index.rst b/docs/developer/extras/index.rst
new file mode 100644
index 00000000000..200caf56d3a
--- /dev/null
+++ b/docs/developer/extras/index.rst
@@ -0,0 +1,17 @@
+.. _vpp_extras:
+
+===============
+VPP extra tools
+===============
+
+.. toctree::
+ :maxdepth: 2
+
+ lcov
+ snap
+ strongswan
+ vpp_config
+ vpp_if_stats
+ vpp_stats_fs
+ vpptop
+ vcl_ldpreload
diff --git a/docs/developer/extras/lcov.rst b/docs/developer/extras/lcov.rst
new file mode 120000
index 00000000000..9f8de245ea6
--- /dev/null
+++ b/docs/developer/extras/lcov.rst
@@ -0,0 +1 @@
+../../../extras/lcov/README.rst \ No newline at end of file
diff --git a/docs/developer/extras/snap.rst b/docs/developer/extras/snap.rst
new file mode 120000
index 00000000000..773ec9be3eb
--- /dev/null
+++ b/docs/developer/extras/snap.rst
@@ -0,0 +1 @@
+../../../extras/snap/README.rst \ No newline at end of file
diff --git a/docs/developer/extras/strongswan.rst b/docs/developer/extras/strongswan.rst
new file mode 120000
index 00000000000..1fef5a0fdfb
--- /dev/null
+++ b/docs/developer/extras/strongswan.rst
@@ -0,0 +1 @@
+../../../extras/strongswan/README.rst \ No newline at end of file
diff --git a/docs/developer/extras/vcl_ldpreload.rst b/docs/developer/extras/vcl_ldpreload.rst
new file mode 120000
index 00000000000..3740db2ffd2
--- /dev/null
+++ b/docs/developer/extras/vcl_ldpreload.rst
@@ -0,0 +1 @@
+../../../extras/vcl-ldpreload/README.rst \ No newline at end of file
diff --git a/docs/developer/extras/vpp_config.rst b/docs/developer/extras/vpp_config.rst
new file mode 120000
index 00000000000..0db2e6f8ccc
--- /dev/null
+++ b/docs/developer/extras/vpp_config.rst
@@ -0,0 +1 @@
+../../../extras/vpp_config/README.rst \ No newline at end of file
diff --git a/docs/developer/extras/vpp_if_stats.rst b/docs/developer/extras/vpp_if_stats.rst
new file mode 120000
index 00000000000..857cdee8587
--- /dev/null
+++ b/docs/developer/extras/vpp_if_stats.rst
@@ -0,0 +1 @@
+../../../extras/vpp_if_stats/README.rst \ No newline at end of file
diff --git a/docs/developer/extras/vpp_stats_fs.rst b/docs/developer/extras/vpp_stats_fs.rst
new file mode 120000
index 00000000000..5e67c6a6818
--- /dev/null
+++ b/docs/developer/extras/vpp_stats_fs.rst
@@ -0,0 +1 @@
+../../../extras/vpp_stats_fs/README.rst \ No newline at end of file
diff --git a/docs/developer/extras/vpptop.rst b/docs/developer/extras/vpptop.rst
new file mode 120000
index 00000000000..cf7f78e0c39
--- /dev/null
+++ b/docs/developer/extras/vpptop.rst
@@ -0,0 +1 @@
+../../../extras/vpptop/README.rst \ No newline at end of file
diff --git a/docs/developer/plugindoc/add_plugin.rst b/docs/developer/plugindoc/add_plugin.rst
new file mode 100644
index 00000000000..cf771116095
--- /dev/null
+++ b/docs/developer/plugindoc/add_plugin.rst
@@ -0,0 +1,362 @@
+.. _add_plugin:
+
+Adding a plugin
+===============
+
+.. toctree::
+
+Strategic Choices
+_________________
+
+Plugins may implement lightly-used, experimental, or test
+functionality. In such cases, please disable the plugin by default:
+
+.. code-block:: console
+
+ /* *INDENT-OFF* */
+ VLIB_PLUGIN_REGISTER () =
+ {
+ .version = VPP_BUILD_VER,
+ .description = "Plugin Disabled by Default...",
+ .default_disabled = 1,
+ };
+ /* *INDENT-ON* */
+
+Please do not create processes, or other dynamic data structures
+unless the plugin is configured by API or debug CLI.
+
+Specifically, please don't initialize bihash tables from
+VLIB_INIT_FUNCTIONS, *especially* if the bihash template involved
+doesn't #define BIHASH_LAZY_INSTANTIATE 1.
+
+.. code-block:: console
+
+ static clib_error_t * sample_init (vlib_main_t * vm)
+ {
+ <snip>
+ /* DONT DO THIS! */
+ BV(clib_bihash_init (h, ...))
+ <snip>
+ }
+ VLIB_INIT_FUNCTION (sample_init);
+
+Instead, please add a feature_init function:
+
+.. code-block:: console
+
+ static void
+ feature_init (my_main_t * mm)
+ {
+ if (mm->feature_initialized == 0)
+ {
+ BV(clib_bihash_init)(mm->hash_table, ...)
+ /* Create Other Things, e.g a periodic process */
+ mm->feature_initialized = 1;
+ }
+ }
+
+And call it from debug CLI and API message handlers any time the feature
+is enabled.
+
+How to create a new plugin
+__________________________
+
+This section shows how a VPP developer can create a new plugin, and
+add it to VPP. We assume that we are starting from the VPP <top-of-workspace>.
+
+As an example, we will use the **make-plugin.sh** tool found in
+**./extras/emacs**. make-plugin.sh is a simple wrapper for a comprehensive
+plugin generator constructed from a set of emacs-lisp skeletons.
+
+Change directory to **./src/plugins**, and run the plugin generator:
+
+.. code-block:: console
+
+ $ cd ./src/plugins
+ $ ../../extras/emacs/make-plugin.sh
+ <snip>
+ Loading /scratch/vpp-docs/extras/emacs/tunnel-c-skel.el (source)...
+ Loading /scratch/vpp-docs/extras/emacs/tunnel-decap-skel.el (source)...
+ Loading /scratch/vpp-docs/extras/emacs/tunnel-encap-skel.el (source)...
+ Loading /scratch/vpp-docs/extras/emacs/tunnel-h-skel.el (source)...
+ Loading /scratch/vpp-docs/extras/emacs/elog-4-int-skel.el (source)...
+ Loading /scratch/vpp-docs/extras/emacs/elog-4-int-track-skel.el (source)...
+ Loading /scratch/vpp-docs/extras/emacs/elog-enum-skel.el (source)...
+ Loading /scratch/vpp-docs/extras/emacs/elog-one-datum-skel.el (source)...
+ Plugin name: myplugin
+ Dispatch type [dual or qs]: dual
+ (Shell command succeeded with no output)
+
+ OK...
+
+The plugin generator script asks two questions: the name of the
+plugin, and which of two dispatch types to use. Since the plugin name
+finds its way into quite a number of places - filenames, typedef
+names, graph arc names - it pays to think for a moment.
+
+The dispatch type refers to the coding pattern used to construct
+**node.c**, the *pro forma* data-plane node. The **dual** option
+constructs a dual-single loop pair with speculative enqueueing. This
+is the traditional coding pattern for load-store intensive graph
+nodes.
+
+The **qs** option generates a quad-single loop pair which uses
+vlib_get_buffers(...) and vlib_buffer_enqueue_to_next(...). These
+operators make excellent use of available SIMD vector unit
+operations. It's very simple to change a quad-single loop-pair to a
+dual-single loop pair if you decide to do so later.
+
+Generated Files
+---------------
+
+Here are the generated files. We'll go through them in a moment.
+
+.. code-block:: console
+
+ $ cd ./myplugin
+ $ ls
+ CMakeLists.txt myplugin.api myplugin.c myplugin.h
+ myplugin_periodic.c myplugin_test.c node.c setup.pg
+
+Due to recent build system improvements, you **don't** need to touch
+any other files to integrate your new plugin into the vpp build. Simply
+rebuild your workspace from scratch, and the new plugin will appear.
+
+Rebuild your workspace
+----------------------
+
+This is the straightforward way to reconfigure and rebuild your workspace:
+
+.. code-block:: console
+
+ $ cd <top-of-workspace>
+ $ make rebuild [or rebuild-release]
+
+Thanks to ccache, this operation doesn't take an annoying amount of time.
+
+Sanity check: run vpp
+---------------------
+
+As a quick sanity check, run vpp and make sure that
+"myplugin_plugin.so" and "myplugin_test_plugin.so" are loaded:
+
+.. code-block:: console
+
+ $ cd <top-of-workspace>
+ $ make run
+ <snip>
+ load_one_plugin:189: Loaded plugin: myplugin_plugin.so (myplugin description goes here)
+ <snip>
+ load_one_vat_plugin:67: Loaded plugin: myplugin_test_plugin.so
+ <snip>
+ DBGvpp#
+
+If this simple test fails, please seek assistance.
+
+Generated Files in Detail
+_________________________
+
+This section discusses the generated files in some detail. It's fine to
+skim this section, and return later for more detail.
+
+CMakeLists.txt
+--------------
+
+This is the build system recipe for building your plugin. Please fix
+the copyright notice:
+
+.. code-block:: console
+
+ # Copyright (c) <current-year> <your-organization>
+
+The rest of the build recipe is pretty simple:
+
+.. code-block:: CMake
+
+ add_vpp_plugin (myplugin
+ SOURCES
+ myplugin.c
+ node.c
+ myplugin_periodic.c
+ myplugin.h
+
+ MULTIARCH_SOURCES
+ node.c
+
+ API_FILES
+ myplugin.api
+
+ API_TEST_SOURCES
+ myplugin_test.c
+ )
+
+As you can see, the build recipe consists of several lists of
+files. **SOURCES** is a list of C source files. **API_FILES** is a
+list of the plugin's binary API definition files [one such file is
+usually plenty], and so forth.
+
+**MULTIARCH_SOURCES** lists data plane graph node dispatch function
+source files considered to be performance-critical. Specific functions
+in these files are compiled multiple times, so that they can leverage
+CPU-specific features. More on this in a moment.
+
+If you add source files, simply add them to the indicated list(s).
+
+myplugin.h
+----------
+
+This is the primary #include file for the new plugin. Among other
+things, it defines the plugin's *main_t* data structure. This is the
+right place to add problem-specific data structures. Please **resist
+the temptation** to create a set of static or [worse yet] global
+variables in your plugin. Refereeing name-collisions between plugins
+is not anyone's idea of a good time.
+
+myplugin.c
+----------
+
+For want of a better way to describe it, myplugin.c is the vpp plugin
+equivalent of "main.c". Its job is to hook the plugin into the vpp
+binary API message dispatcher, and to add its messages to vpp's global
+"message-name_crc" hash table. See "myplugin_init (...)".
+
+Vpp itself uses dlsym(...) to track down the vlib_plugin_registration_t
+generated by the VLIB_PLUGIN_REGISTER macro:
+
+.. code-block:: C
+
+ VLIB_PLUGIN_REGISTER () =
+ {
+ .version = VPP_BUILD_VER,
+ .description = "myplugin plugin description goes here",
+ };
+
+Vpp only loads .so files from the plugin directory which contain an
+instance of this data structure.
+
+You can enable or disable specific vpp plugins from the command
+line. By default, plugins are loaded. To change that behavior, set
+default_disabled in the macro VLIB_PLUGIN_REGISTER:
+
+.. code-block:: C
+
+ VLIB_PLUGIN_REGISTER () =
+ {
+ .version = VPP_BUILD_VER,
+ .default_disabled = 1,
+ .description = "myplugin plugin description goes here",
+ };
+
+The boilerplate generator places the graph node dispatch function
+onto the "device-input" feature arc. This may or may not be useful.
+
+.. code-block:: C
+
+ VNET_FEATURE_INIT (myplugin, static) =
+ {
+ .arc_name = "device-input",
+ .node_name = "myplugin",
+ .runs_before = VNET_FEATURES ("ethernet-input"),
+ };
+
+As given by the plugin generator, myplugin.c contains the binary API
+message handler for a generic "please enable my feature on such and
+such an interface" binary API message. As you'll see, setting up the
+vpp message API tables is simple. Big fat warning: the scheme is
+intolerant of minor mistakes. Example: forgetting to add
+mainp->msg_id_base can lead to very confusing failures.
+
+If you stick to modifying the generated boilerplate with care -
+instead of trying to build code from first principles - you'll save
+yourself a bunch of time and aggravation.
+
+myplugin_test.c
+---------------
+
+This file contains binary API message **generation** code, which is
+compiled into a separate .so file. The "vpp_api_test" program loads
+these plugins, yielding immediate access to your plugin APIs for
+external client binary API testing.
+
+vpp itself loads test plugins, and makes the code available via the
+"binary-api" debug CLI. This is a favorite way to unit-test binary
+APIs prior to integration testing.
+
+node.c
+------
+
+This is the generated graph node dispatch function. You'll need to
+rewrite it to solve the problem at hand. It will save considerable
+time and aggravation to retain the **structure** of the node dispatch
+function.
+
+Even for an expert, it's a waste of time to reinvent the *loop
+structure*, enqueue patterns, and so forth. Simply tear out and
+replace the specimen 1x, 2x, 4x packet processing code with code
+relevant to the problem you're trying to solve.
+
+myplugin.api
+------------
+
+This contains the API message definition. Here we only have defined
+a single one named ``myplugin_enable_disable`` and an implicit
+``myplugin_enable_disable_reply`` containing only a return value due
+to the ``autoreply`` keyword.
+
+The syntax reference for ``.api`` files can be found at VPP API Language.
+
+Addressing the binary API with this message will run the handler defined
+in ``myplugin.c`` as ``vl_api_myplugin_enable_disable_t_handler``.
+It will receive a message pointer ``*mp`` which is the struct defined
+in ``myplugin.api`` and should return another message pointer ``*rmp``,
+of the reply type. That's what ``REPLY_MACRO`` does.
+
+To be noted, all API messages are in net-endian and vpp is host-endian,
+so you will need to use :
+
+* ``u32 value = ntohl(mp->value);``
+* ``rmp->value = htonl(value);``
+
+You can now use this API with :ref:`GoLang bindings <add_plugin_goapi>`
+
+myplugin_periodic.c
+-------------------
+
+This defines a VPP process, a routine that will run indefinitely and
+be woken up intermittently, here to process plugin events.
+
+To be noted, vlib_processes aren't thread-safe, and data structures
+should be locked when shared between workers.
+
+Plugin "Friends with Benefits"
+------------------------------
+
+In vpp VLIB_INIT_FUNCTION functions, it's reasonably common to see a
+specific init function invoke other init functions:
+
+.. code-block:: C
+
+ if ((error = vlib_call_init_function (vm, some_other_init_function)))
+ return error;
+
+In the case where one plugin needs to call an init function in another
+plugin, use the vlib_call_plugin_init_function macro:
+
+.. code-block:: C
+
+ if ((error = vlib_call_plugin_init_function (vm, "otherpluginname", some_init_function)))
+ return error;
+
+This allows sequencing between plugin init functions.
+
+If you wish to obtain a pointer to a symbol in another plugin, use the
+vlib_get_plugin_symbol(...) API:
+
+.. code-block:: C
+
+ void *p = vlib_get_plugin_symbol ("plugin_name", "symbol");
+
+More Examples
+-------------
+
+For more information you can read many example plugins in the directory "./src/plugins".
diff --git a/docs/developer/plugindoc/handoffdemo.rst b/docs/developer/plugindoc/handoffdemo.rst
new file mode 120000
index 00000000000..d9eaebf3c02
--- /dev/null
+++ b/docs/developer/plugindoc/handoffdemo.rst
@@ -0,0 +1 @@
+../../../src/examples/handoffdemo/handoffdemo.rst \ No newline at end of file
diff --git a/docs/developer/plugindoc/index.rst b/docs/developer/plugindoc/index.rst
new file mode 100644
index 00000000000..2fcddb4cc5b
--- /dev/null
+++ b/docs/developer/plugindoc/index.rst
@@ -0,0 +1,13 @@
+.. _add_new_plugin:
+
+==============================
+Adding a new plugin or feature
+==============================
+
+
+.. toctree::
+ :maxdepth: 2
+
+ add_plugin
+ sample_plugin_doc
+ handoffdemo
diff --git a/docs/developer/plugindoc/sample_plugin_doc.rst b/docs/developer/plugindoc/sample_plugin_doc.rst
new file mode 120000
index 00000000000..5f4c67accfa
--- /dev/null
+++ b/docs/developer/plugindoc/sample_plugin_doc.rst
@@ -0,0 +1 @@
+../../../src/examples/sample-plugin/sample_plugin_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/acl_hash_lookup.rst b/docs/developer/plugins/acl_hash_lookup.rst
new file mode 120000
index 00000000000..934be2816ba
--- /dev/null
+++ b/docs/developer/plugins/acl_hash_lookup.rst
@@ -0,0 +1 @@
+../../../src/plugins/acl/acl_hash_lookup_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/acl_lookup_context.rst b/docs/developer/plugins/acl_lookup_context.rst
new file mode 120000
index 00000000000..e9e15ed3684
--- /dev/null
+++ b/docs/developer/plugins/acl_lookup_context.rst
@@ -0,0 +1 @@
+../../../src/plugins/acl/acl_lookup_context.rst \ No newline at end of file
diff --git a/docs/developer/plugins/acl_multicore.rst b/docs/developer/plugins/acl_multicore.rst
new file mode 120000
index 00000000000..564b6a10876
--- /dev/null
+++ b/docs/developer/plugins/acl_multicore.rst
@@ -0,0 +1 @@
+../../../src/plugins/acl/acl_multicore_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/bufmon_doc.rst b/docs/developer/plugins/bufmon_doc.rst
new file mode 120000
index 00000000000..3578898d323
--- /dev/null
+++ b/docs/developer/plugins/bufmon_doc.rst
@@ -0,0 +1 @@
+../../../src/plugins/bufmon/bufmon_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/cnat.rst b/docs/developer/plugins/cnat.rst
new file mode 120000
index 00000000000..c261d501228
--- /dev/null
+++ b/docs/developer/plugins/cnat.rst
@@ -0,0 +1 @@
+../../../src/plugins/cnat/cnat.rst \ No newline at end of file
diff --git a/docs/developer/plugins/dhcp6_pd.rst b/docs/developer/plugins/dhcp6_pd.rst
new file mode 120000
index 00000000000..80bd946c952
--- /dev/null
+++ b/docs/developer/plugins/dhcp6_pd.rst
@@ -0,0 +1 @@
+../../../src/plugins/dhcp/dhcp6_pd_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/flowprobe.rst b/docs/developer/plugins/flowprobe.rst
new file mode 120000
index 00000000000..7e2fbd2f079
--- /dev/null
+++ b/docs/developer/plugins/flowprobe.rst
@@ -0,0 +1 @@
+../../../src/plugins/flowprobe/flowprobe_plugin_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/index.rst b/docs/developer/plugins/index.rst
new file mode 100644
index 00000000000..9aedc89bdca
--- /dev/null
+++ b/docs/developer/plugins/index.rst
@@ -0,0 +1,41 @@
+.. _vpp_plugins:
+
+=======
+Plugins
+=======
+
+vlib implements a straightforward plug-in DLL mechanism. VLIB client
+applications specify a directory to search for plug-in .DLLs, and a name
+filter to apply (if desired). VLIB needs to load plug-ins very early.
+
+Once loaded, the plug-in DLL mechanism uses dlsym to find and verify a
+vlib\_plugin\_registration data structure in the newly-loaded plug-in.
+
+For more on plugins please refer to :ref:`add_plugin`.
+
+
+.. toctree::
+ :maxdepth: 2
+
+ quic
+ cnat
+ lcp
+ srv6/index
+ marvell
+ lldp
+ nat64
+ nat44_ei_ha
+ pnat
+ lb
+ lacp
+ flowprobe
+ map_lw4o6
+ mdata
+ dhcp6_pd
+ ioam
+ wireguard
+ srtp
+ acl_multicore
+ acl_hash_lookup
+ acl_lookup_context
+ bufmon_doc
diff --git a/docs/developer/plugins/ioam.rst b/docs/developer/plugins/ioam.rst
new file mode 120000
index 00000000000..7a8faf6fb7a
--- /dev/null
+++ b/docs/developer/plugins/ioam.rst
@@ -0,0 +1 @@
+../../../src/plugins/ioam/ioam_plugin_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/lacp.rst b/docs/developer/plugins/lacp.rst
new file mode 120000
index 00000000000..4d410243a8b
--- /dev/null
+++ b/docs/developer/plugins/lacp.rst
@@ -0,0 +1 @@
+../../../src/plugins/lacp/lacp_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/lb.rst b/docs/developer/plugins/lb.rst
new file mode 120000
index 00000000000..de3183a8b44
--- /dev/null
+++ b/docs/developer/plugins/lb.rst
@@ -0,0 +1 @@
+../../../src/plugins/lb/lb_plugin_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/lcp.rst b/docs/developer/plugins/lcp.rst
new file mode 120000
index 00000000000..2fbbe1f4db6
--- /dev/null
+++ b/docs/developer/plugins/lcp.rst
@@ -0,0 +1 @@
+../../../src/plugins/linux-cp/lcp.rst \ No newline at end of file
diff --git a/docs/developer/plugins/lldp.rst b/docs/developer/plugins/lldp.rst
new file mode 120000
index 00000000000..b913eca926c
--- /dev/null
+++ b/docs/developer/plugins/lldp.rst
@@ -0,0 +1 @@
+../../../src/plugins/lldp/lldp_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/map_lw4o6.rst b/docs/developer/plugins/map_lw4o6.rst
new file mode 120000
index 00000000000..427f189082e
--- /dev/null
+++ b/docs/developer/plugins/map_lw4o6.rst
@@ -0,0 +1 @@
+../../../src/plugins/map/map_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/marvell.rst b/docs/developer/plugins/marvell.rst
new file mode 120000
index 00000000000..28f0cd0f664
--- /dev/null
+++ b/docs/developer/plugins/marvell.rst
@@ -0,0 +1 @@
+../../../src/plugins/marvell/README.rst \ No newline at end of file
diff --git a/docs/developer/plugins/mdata.rst b/docs/developer/plugins/mdata.rst
new file mode 120000
index 00000000000..87e5c929bc4
--- /dev/null
+++ b/docs/developer/plugins/mdata.rst
@@ -0,0 +1 @@
+../../../src/plugins/mdata/mdata_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/nat44_ei_ha.rst b/docs/developer/plugins/nat44_ei_ha.rst
new file mode 120000
index 00000000000..a8a00f40218
--- /dev/null
+++ b/docs/developer/plugins/nat44_ei_ha.rst
@@ -0,0 +1 @@
+../../../src/plugins/nat/nat44-ei/nat44_ei_ha_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/nat64.rst b/docs/developer/plugins/nat64.rst
new file mode 120000
index 00000000000..22c6bbea80d
--- /dev/null
+++ b/docs/developer/plugins/nat64.rst
@@ -0,0 +1 @@
+../../../src/plugins/nat/nat64/nat64_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/pnat.rst b/docs/developer/plugins/pnat.rst
new file mode 120000
index 00000000000..4a8da6ef31a
--- /dev/null
+++ b/docs/developer/plugins/pnat.rst
@@ -0,0 +1 @@
+../../../src/plugins/nat/pnat/pnat.rst \ No newline at end of file
diff --git a/docs/developer/plugins/quic.rst b/docs/developer/plugins/quic.rst
new file mode 120000
index 00000000000..ffe53429571
--- /dev/null
+++ b/docs/developer/plugins/quic.rst
@@ -0,0 +1 @@
+../../../src/plugins/quic/quic_plugin.rst \ No newline at end of file
diff --git a/docs/developer/plugins/srtp.rst b/docs/developer/plugins/srtp.rst
new file mode 120000
index 00000000000..51013cce036
--- /dev/null
+++ b/docs/developer/plugins/srtp.rst
@@ -0,0 +1 @@
+../../../src/plugins/srtp/srtp_plugin.rst \ No newline at end of file
diff --git a/docs/developer/plugins/srv6/ad_flow_plugin_doc.rst b/docs/developer/plugins/srv6/ad_flow_plugin_doc.rst
new file mode 120000
index 00000000000..f944884a992
--- /dev/null
+++ b/docs/developer/plugins/srv6/ad_flow_plugin_doc.rst
@@ -0,0 +1 @@
+../../../../src/plugins/srv6-ad-flow/ad_flow_plugin_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/srv6/ad_plugin_doc.rst b/docs/developer/plugins/srv6/ad_plugin_doc.rst
new file mode 120000
index 00000000000..46ec8a8899e
--- /dev/null
+++ b/docs/developer/plugins/srv6/ad_plugin_doc.rst
@@ -0,0 +1 @@
+../../../../src/plugins/srv6-ad/ad_plugin_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/srv6/am_plugin_doc.rst b/docs/developer/plugins/srv6/am_plugin_doc.rst
new file mode 120000
index 00000000000..842426adc54
--- /dev/null
+++ b/docs/developer/plugins/srv6/am_plugin_doc.rst
@@ -0,0 +1 @@
+../../../../src/plugins/srv6-am/am_plugin_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/srv6/as_plugin_doc.rst b/docs/developer/plugins/srv6/as_plugin_doc.rst
new file mode 120000
index 00000000000..90dd28e6606
--- /dev/null
+++ b/docs/developer/plugins/srv6/as_plugin_doc.rst
@@ -0,0 +1 @@
+../../../../src/plugins/srv6-as/as_plugin_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/srv6/index.rst b/docs/developer/plugins/srv6/index.rst
new file mode 100644
index 00000000000..eb8f3e815a2
--- /dev/null
+++ b/docs/developer/plugins/srv6/index.rst
@@ -0,0 +1,16 @@
+.. _dplugins_srv6:
+
+============
+SRv6 Plugins
+============
+
+.. toctree::
+ :maxdepth: 2
+
+ ad_flow_plugin_doc
+ ad_plugin_doc
+ am_plugin_doc
+ as_plugin_doc
+ mobile_plugin_doc
+ runner_doc
+ srv6_sample_localsid_doc
diff --git a/docs/developer/plugins/srv6/mobile_plugin_doc.rst b/docs/developer/plugins/srv6/mobile_plugin_doc.rst
new file mode 120000
index 00000000000..546f5f2cc14
--- /dev/null
+++ b/docs/developer/plugins/srv6/mobile_plugin_doc.rst
@@ -0,0 +1 @@
+../../../../src/plugins/srv6-mobile/mobile_plugin_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/srv6/runner_doc.rst b/docs/developer/plugins/srv6/runner_doc.rst
new file mode 120000
index 00000000000..16217038309
--- /dev/null
+++ b/docs/developer/plugins/srv6/runner_doc.rst
@@ -0,0 +1 @@
+../../../../src/plugins/srv6-mobile/extra/runner_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/srv6/srv6_sample_localsid_doc.rst b/docs/developer/plugins/srv6/srv6_sample_localsid_doc.rst
new file mode 120000
index 00000000000..db924565db4
--- /dev/null
+++ b/docs/developer/plugins/srv6/srv6_sample_localsid_doc.rst
@@ -0,0 +1 @@
+../../../../src/examples/srv6-sample-localsid/srv6_sample_localsid_doc.rst \ No newline at end of file
diff --git a/docs/developer/plugins/wireguard.rst b/docs/developer/plugins/wireguard.rst
new file mode 120000
index 00000000000..3cba6b438c3
--- /dev/null
+++ b/docs/developer/plugins/wireguard.rst
@@ -0,0 +1 @@
+../../../src/plugins/wireguard/README.rst \ No newline at end of file
diff --git a/docs/developer/tests/overview.rst b/docs/developer/tests/overview.rst
new file mode 100644
index 00000000000..cc9836e3992
--- /dev/null
+++ b/docs/developer/tests/overview.rst
@@ -0,0 +1,450 @@
+.. _unittest: https://docs.python.org/2/library/unittest.html
+.. _TestCase: https://docs.python.org/2/library/unittest.html#unittest.TestCase
+.. _AssertionError: https://docs.python.org/2/library/exceptions.html#exceptions.AssertionError
+.. _SkipTest: https://docs.python.org/2/library/unittest.html#unittest.SkipTest
+.. _virtualenv: http://docs.python-guide.org/en/latest/dev/virtualenvs/
+.. _scapy: http://www.secdev.org/projects/scapy/
+.. _logging: https://docs.python.org/2/library/logging.html
+.. _process: https://docs.python.org/2/library/multiprocessing.html#the-process-class
+.. _pipes: https://docs.python.org/2/library/multiprocessing.html#multiprocessing.Pipe
+.. _managed: https://docs.python.org/2/library/multiprocessing.html#managers
+
+.. |vtf| replace:: VPP Test Framework
+
+|vtf|
+=====
+
+.. contents::
+ :local:
+ :depth: 1
+
+Overview
+########
+
+The goal of the |vtf| is to ease writing, running and debugging
+unit tests for the VPP. For this, python was chosen as a high level language
+allowing rapid development with scapy_ providing the necessary tool for creating
+and dissecting packets.
+
+Anatomy of a test case
+######################
+
+Python's unittest_ is used as the base framework upon which the VPP test
+framework is built. A test suite in the |vtf| consists of multiple classes
+derived from `VppTestCase`, which is itself derived from TestCase_.
+The test class defines one or more test functions, which act as test cases.
+
+Function flow when running a test case is:
+
+1. `setUpClass <VppTestCase.setUpClass>`:
+ This function is called once for each test class, allowing a one-time test
+ setup to be executed. If this function throws an exception,
+ none of the test functions are executed.
+2. `setUp <VppTestCase.setUp>`:
+ The setUp function runs before each of the test functions. If this function
+ throws an exception other than AssertionError_ or SkipTest_, then this is
+ considered an error, not a test failure.
+3. *test_<name>*:
+ This is the guts of the test case. It should execute the test scenario
+ and use the various assert functions from the unittest framework to perform
+ the necessary checks. Multiple test_<name> methods can exist in a test case.
+4. `tearDown <VppTestCase.tearDown>`:
+ The tearDown function is called after each test function with the purpose
+ of doing partial cleanup.
+5. `tearDownClass <VppTestCase.tearDownClass>`:
+ Method called once after running all of the test functions to perform
+ the final cleanup.
+
+Logging
+#######
+
+Each test case has a logger automatically created for it, stored in
+the 'logger' property, based on logging_. Use the logger's standard methods
+debug(), info(), error(), ... to emit log messages to the logger.
+
+All log messages always go into a log file in the temporary directory
+(see below).
+
+To control the messages printed to console, specify the V= parameter.
+
+.. code-block:: shell
+
+ make test # minimum verbosity
+ make test V=1 # moderate verbosity
+ make test V=2 # maximum verbosity
+
+Parallel test execution
+#######################
+
+|vtf| test suites can be run in parallel. Each test suite is executed
+in a separate process spawned by Python multiprocessing process_.
+
+The results from child test suites are sent to parent through pipes_, which are
+aggregated and summarized at the end of the run.
+
+Stdout, stderr and logs logged in child processes are redirected to individual
+parent managed_ queues. The data from these queues are then emitted to stdout
+of the parent process in the order the test suites have finished. In case there
+are no finished test suites (such as at the beginning of the run), the data
+from last started test suite are emitted in real time.
+
+To enable parallel test run, specify the number of parallel processes:
+
+.. code-block:: shell
+
+ make test TEST_JOBS=n # at most n processes will be spawned
+ make test TEST_JOBS=auto # chosen based on the number of cores
+ # and the size of shared memory
+
+Test temporary directory and VPP life cycle
+###########################################
+
+Test separation is achieved by separating the test files and vpp instances.
+Each test creates a temporary directory and its name is used to create
+a shared memory prefix which is used to run a VPP instance.
+The temporary directory name contains the testcase class name for easy
+reference, so for testcase named 'TestVxlan' the directory could be named
+e.g. vpp-unittest-TestVxlan-UNUP3j.
+This way, there is no conflict between any other VPP instances running
+on the box and the test VPP. Any temporary files created by the test case
+are stored in this temporary test directory.
+
+The test temporary directory holds the following interesting files:
+
+* log.txt - this contains the logger output on max verbosity
+* pg*_in.pcap - last injected packet stream into VPP, named after the interface,
+ so for pg0, the file will be named pg0_in.pcap
+* pg*_out.pcap - last capture file created by VPP for interface, similarly,
+ named after the interface, so for e.g. pg1, the file will be named
+ pg1_out.pcap
+* history files - whenever the capture is restarted or a new stream is added,
+ the existing files are rotated and renamed, so all the pcap files
+ are always saved for later debugging if needed
+* core - if vpp dumps a core, it'll be stored in the temporary directory
+* vpp_stdout.txt - file containing output which vpp printed to stdout
+* vpp_stderr.txt - file containing output which vpp printed to stderr
+
+*NOTE*: existing temporary directories named vpp-unittest-* are automatically
+removed when invoking 'make test*' or 'make retest*' to keep the temporary
+directory clean.
+
+Virtual environment
+###################
+
+Virtualenv_ is a python module which provides a means to create an environment
+containing the dependencies required by the |vtf|, allowing a separation
+from any existing system-wide packages. |vtf|'s Makefile automatically
+creates a virtualenv_ inside build-root and installs the required packages
+in that environment. The environment is entered whenever executing a test
+via one of the make test targets.
+
+Naming conventions
+##################
+
+Most unit tests do some kind of packet manipulation - sending and receiving
+packets between VPP and virtual hosts connected to the VPP. Referring
+to the sides, addresses, etc. is always done as if looking from the VPP side,
+thus:
+
+* *local_* prefix is used for the VPP side.
+ So e.g. `local_ip4 <VppInterface.local_ip4>` address is the IPv4 address
+ assigned to the VPP interface.
+* *remote_* prefix is used for the virtual host side.
+ So e.g. `remote_mac <VppInterface.remote_mac>` address is the MAC address
+ assigned to the virtual host connected to the VPP.
+
+Automatically generated addresses
+#################################
+
+To send packets, one needs to typically provide some addresses, otherwise
+the packets will be dropped. The interface objects in |vtf| automatically
+provide addresses based on (typically) their indexes, which ensures
+there are no conflicts and eases debugging by making the addressing scheme
+consistent.
+
+The developer of a test case typically doesn't need to work with the actual
+numbers, rather using the properties of the objects. The addresses typically
+come in two flavors: '<address>' and '<address>n' - note the 'n' suffix.
+The former address is a Python string, while the latter is translated using
+socket.inet_pton to raw format in network byte order - this format is suitable
+for passing as an argument to VPP APIs.
+
+e.g. for the IPv4 address assigned to the VPP interface:
+
+* local_ip4 - Local IPv4 address on VPP interface (string)
+* local_ip4n - Local IPv4 address - raw, suitable as API parameter.
+
+These addresses need to be configured in VPP to be usable using e.g.
+`VppInterface.config_ip4` API. Please see the documentation to
+`VppInterface` for more details.
+
+By default, there is one remote address of each kind created for L3:
+remote_ip4 and remote_ip6. If the test needs more addresses, because it's
+simulating more remote hosts, they can be generated using
+`generate_remote_hosts` API and the entries for them inserted into the ARP
+table using `configure_ipv4_neighbors` API.
+
+Packet flow in the |vtf|
+########################
+
+Test framework -> VPP
+~~~~~~~~~~~~~~~~~~~~~
+
+|vtf| doesn't send any packets to VPP directly. Traffic is instead injected
+using packet-generator interfaces, represented by the `VppPGInterface` class.
+Packets are written into a temporary .pcap file, which is then read by the VPP
+and the packets are injected into the VPP world.
+
+To add a list of packets to an interface, call the `VppPGInterface.add_stream`
+method on that interface. Once everything is prepared, call `pg_start` method to
+start the packet generator on the VPP side.
+
+VPP -> test framework
+~~~~~~~~~~~~~~~~~~~~~
+
+Similarly, VPP doesn't send any packets to |vtf| directly. Instead, packet
+capture feature is used to capture and write traffic to a temporary .pcap file,
+which is then read and analyzed by the |vtf|.
+
+The following APIs are available to the test case for reading pcap files.
+
+* `VppPGInterface.get_capture`: this API is suitable for bulk & batch
+ style of test, where a list of packets is prepared & sent, then the
+ received packets are read and verified. The API needs the number of
+ packets which are expected to be captured (ignoring filtered
+ packets - see below) to know when the pcap file is completely
+ written by the VPP. If using packet infos for verifying packets,
+ then the counts of the packet infos can be automatically used by
+ `VppPGInterface.get_capture` to get the proper count (in this case
+ the default value None can be supplied as expected_count or omitted
+ altogether).
+* `VppPGInterface.wait_for_packet`: this API is suitable for
+ interactive style of test, e.g. when doing session management,
+ three-way handshakes, etc. This API waits for and returns a single
+ packet, keeping the capture file in place and remembering
+ context. Repeated invocations return following packets (or raise
+ Exception if timeout is reached) from the same capture file (=
+ packets arriving on the same interface).
+
+*NOTE*: it is not recommended to mix these APIs unless you understand
+how they work internally. None of these APIs rotate the pcap capture
+file, so calling e.g. `VppPGInterface.get_capture` after
+`VppPGInterface.wait_for_packet` will return already read packets. It
+is safe to switch from one API to another after calling
+`VppPGInterface.enable_capture` as that API rotates the capture file.
+
+Automatic filtering of packets:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Both APIs (`VppPGInterface.get_capture` and
+`VppPGInterface.wait_for_packet`) by default filter the packet
+capture, removing known uninteresting packets from it - these are IPv6
+Router Advertisements and IPv6 Router Alerts. These packets are
+unsolicited and from the point of view of |vtf| are random. If a test wants
+to receive these packets, it should specify either None or a custom
+filtering function as the value to the 'filter_out_fn' argument.
+
+Common API flow for sending/receiving packets:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We will describe a simple scenario, where packets are sent from pg0 to pg1
+interface, assuming that the interfaces were created using
+`create_pg_interfaces` API.
+
+1. Create a list of packets for pg0::
+
+ packet_count = 10
+ packets = create_packets(src=self.pg0, dst=self.pg1,
+ count=packet_count)
+
+2. Add that list of packets to the source interface::
+
+ self.pg0.add_stream(packets)
+
+3. Enable capture on the destination interface::
+
+ self.pg1.enable_capture()
+
+4. Start the packet generator::
+
+ self.pg_start()
+
+5. Wait for capture file to appear and read it::
+
+ capture = self.pg1.get_capture(expected_count=packet_count)
+
+6. Verify packets match sent packets::
+
+ self.verify_capture(send=packets, captured=capture)
+
+Test framework objects
+######################
+
+The following objects provide VPP abstraction and provide a means to do
+common tasks easily in the test cases.
+
+* `VppInterface`: abstract class representing a generic VPP interface
+ and containing some common functionality, which is then used by derived classes
+* `VppPGInterface`: class representing VPP packet-generator interface.
+ The interface is created/destroyed when the object is created/destroyed.
+* `VppSubInterface`: VPP sub-interface abstract class, containing common
+ functionality for e.g. `VppDot1QSubint` and `VppDot1ADSubint` classes
+
+How VPP APIs/CLIs are called
+############################
+
+VPP provides Python bindings in a Python module called vpp-papi, which the test
+framework installs in the virtual environment. A shim layer represented by
+the `VppPapiProvider` class is built on top of the vpp-papi, serving these
+purposes:
+
+1. Automatic return value checks:
+ After each API is called, the return value is checked against the expected
+ return value (by default 0, but can be overridden) and an exception
+ is raised if the check fails.
+2. Automatic call of hooks:
+
+ a. `before_cli <Hook.before_cli>` and `before_api <Hook.before_api>` hooks
+ are used for debug logging and stepping through the test
+ b. `after_cli <Hook.after_cli>` and `after_api <Hook.after_api>` hooks
+ are used for monitoring the vpp process for crashes
+3. Simplification of API calls:
+ Many of the VPP APIs take a lot of parameters and by providing sane defaults
+ for these, the API is much easier to use in the common case and the code is
+ more readable. E.g. ip_add_del_route API takes ~25 parameters, of which
+ in the common case, only 3 are needed.
+
+Utility methods
+###############
+
+Some interesting utility methods are:
+
+* `ppp`: 'Pretty Print Packet' - returns a string containing the same output
+ as Scapy's packet.show() would print
+* `ppc`: 'Pretty Print Capture' - returns a string containing printout of
+ a capture (with configurable limit on the number of packets printed from it)
+ using `ppp`
+
+*NOTE*: Do not use Scapy's packet.show() in the tests, because it prints
+the output to stdout. All output should go to the logger associated with
+the test case.
+
+Example: how to add a new test
+##############################
+
+In this example, we will describe how to add a new test case which tests
+basic IPv4 forwarding.
+
+1. Add a new file called test_ip4_fwd.py in the test directory, starting
+ with a few imports::
+
+ from framework import VppTestCase
+ from scapy.layers.l2 import Ether
+ from scapy.packet import Raw
+ from scapy.layers.inet import IP, UDP
+ from random import randint
+
+2. Create a class inherited from the VppTestCase::
+
+ class IP4FwdTestCase(VppTestCase):
+ """ IPv4 simple forwarding test case """
+
+3. Add a setUpClass function containing the setup needed for our test to run::
+
+ @classmethod
+ def setUpClass(self):
+ super(IP4FwdTestCase, self).setUpClass()
+ self.create_pg_interfaces(range(2)) # create pg0 and pg1
+ for i in self.pg_interfaces:
+ i.admin_up() # put the interface up
+ i.config_ip4() # configure IPv4 address on the interface
+ i.resolve_arp() # resolve ARP, so that we know VPP MAC
+
+4. Create a helper method to create the packets to send::
+
+ def create_stream(self, src_if, dst_if, count):
+ packets = []
+ for i in range(count):
+ # create packet info stored in the test case instance
+ info = self.create_packet_info(src_if, dst_if)
+ # convert the info into packet payload
+ payload = self.info_to_payload(info)
+ # create the packet itself
+ p = (Ether(dst=src_if.local_mac, src=src_if.remote_mac) /
+ IP(src=src_if.remote_ip4, dst=dst_if.remote_ip4) /
+ UDP(sport=randint(1000, 2000), dport=5678) /
+ Raw(payload))
+ # store a copy of the packet in the packet info
+ info.data = p.copy()
+ # append the packet to the list
+ packets.append(p)
+
+ # return the created packet list
+ return packets
+
+5. Create a helper method to verify the capture::
+
+ def verify_capture(self, src_if, dst_if, capture):
+ packet_info = None
+ for packet in capture:
+ try:
+ ip = packet[IP]
+ udp = packet[UDP]
+ # convert the payload to packet info object
+ payload_info = self.payload_to_info(packet[Raw])
+ # make sure the indexes match
+ self.assert_equal(payload_info.src, src_if.sw_if_index,
+ "source sw_if_index")
+ self.assert_equal(payload_info.dst, dst_if.sw_if_index,
+ "destination sw_if_index")
+ packet_info = self.get_next_packet_info_for_interface2(
+ src_if.sw_if_index,
+ dst_if.sw_if_index,
+ packet_info)
+ # make sure we didn't run out of saved packets
+ self.assertIsNotNone(packet_info)
+ self.assert_equal(payload_info.index, packet_info.index,
+ "packet info index")
+ saved_packet = packet_info.data # fetch the saved packet
+ # assert the values match
+ self.assert_equal(ip.src, saved_packet[IP].src,
+ "IP source address")
+ # ... more assertions here
+ self.assert_equal(udp.sport, saved_packet[UDP].sport,
+ "UDP source port")
+ except:
+ self.logger.error(ppp("Unexpected or invalid packet:",
+ packet))
+ raise
+ remaining_packet = self.get_next_packet_info_for_interface2(
+ src_if.sw_if_index,
+ dst_if.sw_if_index,
+ packet_info)
+ self.assertIsNone(remaining_packet,
+ "Interface %s: Packet expected from interface "
+ "%s didn't arrive" % (dst_if.name, src_if.name))
+
+6. Add the test code to test_basic function::
+
+ def test_basic(self):
+ count = 10
+ # create the packet stream
+ packets = self.create_stream(self.pg0, self.pg1, count)
+ # add the stream to the source interface
+ self.pg0.add_stream(packets)
+ # enable capture on both interfaces
+ self.pg0.enable_capture()
+ self.pg1.enable_capture()
+ # start the packet generator
+ self.pg_start()
+ # get capture - the proper count of packets was saved by
+ # create_packet_info() based on dst_if parameter
+ capture = self.pg1.get_capture()
+ # assert nothing captured on pg0 (always do this last, so that
+ # some time has already passed since pg_start())
+ self.pg0.assert_nothing_captured()
+ # verify capture
+ self.verify_capture(self.pg0, self.pg1, capture)
+
+7. Run the test by issuing 'make test' or, to run only this specific
+ test, issue 'make test TEST=test_ip4_fwd'.