Diffstat (limited to 'src')
31 files changed, 2848 insertions, 2272 deletions
diff --git a/src/tools/vppapigen/VPPAPI.md b/src/tools/vppapigen/VPPAPI.md deleted file mode 100644 index df211d866a0..00000000000 --- a/src/tools/vppapigen/VPPAPI.md +++ /dev/null @@ -1,346 +0,0 @@ -# VPP API Language {#api_lang_doc} - -The VPP binary API is a message passing API. -The VPP API language is used to define a RPC interface between VPP and its -control plane. The API messages supports shared memory transport and -Unix domain sockets (SOCK_STREAM). - -The wire format is essentially that of a network formatted (big-endian) packed C struct. - -The VPP API compiler is located in *src/tools/vppapigen* and can currently -compile to JSON or C (used by the VPP binary itself). - -## Language definition - -### Defining a messages - -There are 3 types of message exchanges: - -* Request/Reply -The client sends a request message and the server replies with a -single reply message. The convention is that the reply message is -named as method_name + \_reply. - -* Dump/Detail -The client sends a "bulk" request message to the server, and the -server replies with a set of detail messages. These messages may be of -different type. A dump/detail call must be enclosed in a control ping -block (Otherwise the client will not know the end of the bulk -transmission). The method name must end with method + "\_dump", the -reply message should be named method + "\_details". The exception here -is for the methods that return multiple message types -(e.g. sw_interface_dump). The Dump/Detail methods are typically used -for acquiring bulk information, like the complete FIB table. - -* Events -The client can register for getting asynchronous notifications from -the server. This is useful for getting interface state changes, and so -on. The method name for requesting notifications is conventionally -prefixed with "want_". E.g. "want_interface_events". Which -notification types results from an event registration is defined in -the service definition. - -A message from a client must include the 'client_index', an opaque -cookie identifying the sender, and a 'context' field to let the client -match request with reply. - -An example of a message definition. The client sends the show_version request, -the server replies with the show_version_reply. - -The *client_index* and *context* fields are required in all requests. -The *context* is returned by the server and is used by the client to -match up request and reply messages. - -``` -define show_version -{ - u32 client_index; - u32 context; -}; -define show_version_reply -{ - u32 context; - i32 retval; - string program [32]; - string version [32]; - string build_date [32]; - /* The final field can be a variable length argument */ - string build_directory []; -}; - -``` - -The flags are not used by the clients, but have special meaning -for some of the tracing and debugging of the API. -The *autoreply* flag is a shorthand for a reply message with just a -*retval* field. 
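For illustration, a hypothetical request using the *autoreply* flag (the
compiler then generates the corresponding \_reply message with just a
*retval* field; the message and field names here are invented for the example):

```
/* Hypothetical message, shown only to illustrate the autoreply flag */
autoreply define my_feature_enable_disable
{
  u32 client_index;
  u32 context;
  u32 sw_if_index;
  bool enable;
};
```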
- -``` - define : DEFINE ID '{' block_statements_opt '}' ';' - define : flist DEFINE ID '{' block_statements_opt '}' ';' - flist : flag - | flist flag - flag : MANUAL_PRINT - | MANUAL_ENDIAN - | DONT_TRACE - | AUTOREPLY - - block_statements_opt : block_statements - block_statements : block_statement - | block_statements block_statement - block_statement : declaration - | option - declaration : type_specifier ID ';' - | type_specifier ID '[' ID '=' assignee ']' ';' - declaration : type_specifier ID '[' NUM ']' ';' - | type_specifier ID '[' ID ']' ';' - type_specifier : U8 - | U16 - | U32 - | U64 - | I8 - | I16 - | I32 - | I64 - | F64 - | BOOL - | STRING - type_specifier : ID -``` - - -### Options -The *option* word is used to specify meta information. -The only current use is to specify a semantic version of the .api file itself. - -Example: -``` -option version = "1.0.0"; -``` - -``` - - option : OPTION ID '=' assignee ';' - assignee : NUM - | TRUE - | FALSE - | STRING_LITERAL -``` - -### Defining new types - -New user defined types are defined just like messages. -A typedef has two forms. It can either define an alias for a -different type (or array). - -Example: - -``` -typedef u8 ip4_address[4]; -typedef u8 ip6_address[16]; -``` - -Where the above defines two new types *vl_api_ip4_address_t* and -*vl_api_ip6_address_t*. These are aliases for the underlying -u8 array. - -In the other form, it is used to specify an abstract data type. - -``` -enum address_family { - ADDRESS_IP4 = 0, - ADDRESS_IP6, -}; - -union address_union { - vl_api_ip4_address_t ip4; - vl_api_ip6_address_t ip6; -}; - -typedef address { - vl_api_address_family_t af; - vl_api_address_union_t un; -}; -``` - -Where the new type *vl_api_address_t* - -``` - typedef : TYPEDEF ID '{' block_statements_opt '}' ';' - typedef : TYPEDEF declaration -``` - - -### Importing Definitions -You can use definitions from other .api files by importing them. -To import another .api's definitions, you add an import statement -to the top of your file: - -import "vnet/ip/ip_types.api"; - -By default you can only use definitions from directly imported .api files. - -The API compiler searches for imported files in a set of directories -specified on the API compiler command line using the --includedir flag. -``` -import : IMPORT STRING_LITERAL ';' -``` - -### Comments - -The API language uses C style comments. -``` -/* */ -// -``` - -### Enumerations -Enums are similar to enums in C. - -Every enum definition must contain a constant that maps to zero -as its first element. This is because: - -There must be a zero value, so that we can use 0 as a numeric default value. -The zero value needs to be the first element. - -As in C, enums can be used as flags or just as numbers. -The on-wire, and in memory representation size of an enum can be specified. -Not all language bindings will support that. The default size is 4 (u32). - -Example -``` -enum ip_neighbor_flags -{ - IP_API_NEIGHBOR_FLAG_NONE = 0, - IP_API_NEIGHBOR_FLAG_STATIC = 0x1, - IP_API_NEIGHBOR_FLAG_NO_FIB_ENTRY = 0x2, -}; -``` - -Which generates the vl_api_ip_neighbor_flags_t in the C binding. -In Python that is represented as an IntFlag object -VppEnum.vl_api_ip_neighbor_flags_t. 
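As a sketch, an enum with an explicit 8-bit on-wire size (hypothetical name)
would be declared as:

```
/* Hypothetical example: stored and sent on the wire as a u8 */
enum my_tunnel_mode : u8
{
  MY_TUNNEL_API_MODE_P2P = 0,
  MY_TUNNEL_API_MODE_MP = 1,
};
```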
- -``` - enum : ENUM ID '{' enum_statements '}' ';' - enum : ENUM ID ':' enum_size '{' enum_statements '}' ';' - enum_size : U8 - | U16 - | U32 - enum_statements : enum_statement - | enum_statements enum_statement - enum_statement : ID '=' NUM ',' - | ID ',' -``` - -### Services -The service statement defines the relationship between messages. -For request/response and dump/details messages it ties the -request with the reply. For events, it specifies which events -that can be received for a given want_* call. - -Example: -``` -service { - rpc want_interface_events returns want_interface_events_reply - events sw_interface_event; -}; - -``` - -Which states that the request want_interface_events returns a -want_interface_events_reply and if enabled the client will -receive sw_interface_event messages whenever interface states changes. - -``` - service : SERVICE '{' service_statements '}' ';' - service_statements : service_statement - | service_statements service_statement - service_statement : RPC ID RETURNS NULL ';' - | RPC ID RETURNS ID ';' - | RPC ID RETURNS STREAM ID ';' - | RPC ID RETURNS ID EVENTS event_list ';' - event_list : events - | event_list events - events : ID - | ID ',' -``` - - -## Types -### Scalar Value Types - -.api type|size|C type|Python type ----------|----|------|----------- -i8 | 1|i8 |int -u8 | 1|u8 |int -i16 | 2|i16 |int -u16 | 2|u16 |int -i32 | 4|i32 |int -u32 | 4|u32 |int -i64 | 8|i64 |int -u64 | 8|u64 |int -f64 | 8|f64 |float -bool | 1|bool |boolean -string |variable|vl_api_string_t|str - -### User Defined Types -#### vnet/ip/ip_types.api - -.api type|size|C type|Python type ----------|----|------|----------- -vl_api_address_t|20|vl_api_address_t|`<class 'ipaddress.IPv4Address'> or <class 'ipaddress.IPv6Address'>` -vl_api_ip4_address_t|4|vl_api_ip4_address_t|`<class 'ipaddress.IPv4Address'>` -vl_api_ip6_address_t|16|vl_api_ip6_address_t|`<class 'ipaddress.IPv6Address'>` -vl_api_prefix_t|21|vl_api_prefix_t|`<class 'ipaddress.IPv4Network'> or <class 'ipaddress.IPv6Network'>` -vl_api_ip4_prefix_t|5|vl_api_ip4_prefix_t|`<class 'ipaddress.IPv4Network'>` -vl_api_ip6_prefix_t|17|vl_api_ip6_prefix_t|`<class 'ipaddress.IPv6Network'>` -vl_api_ip4_address_with_prefix_t|5|vl_api_ip4_address_with_prefix_t|`<class 'ipaddress.IPv4Interface'>` -vl_api_ip6_address_with_prefix_t|17|vl_api_ip6_address_with_prefix_t|`<class 'ipaddress.IPv6Interface'>` - -#### vnet/ethernet/ethernet_types.api -.api type|size|C type|Python type ----------|----|------|----------- -vl_api_mac_address_t|6|vl_api_mac_address_t|`class 'vpp_papi.MACAddress'>` - -#### vnet/interface_types.api -.api type|size|C type|Python type ----------|----|------|----------- -vl_api_interface_index_t|4|vl_api_interface_index_t|int - -### New explicit types - -#### String versus bytes -A byte string with a maximum length of 64: -``` -u8 name[64]; -``` -Before the "string" type was added, text string were defined like this. -The implications of that was the user would have to know if the field -represented a \0 ended C-string or a fixed length byte string. -The wire format of the 'string' type is a u32 length - -An IPv4 or IPv6 address was previously defined like: -``` -u8 is_ip6; -u8 address[16]; -``` - -Which made it hard for language bindings to represent the -address as anything but a byte string. -The new explicit address types are shown above. - -## Language generators - -The VPP API compiler currently has two output modules. 
One generating JSON -and one generating C header files that are directly used by the VPP -infrastructure and plugins. - -The C/C++, Python, Go Lua, and Java language bindings are generated based -on the JSON files. - -### Future considerations -- [ ] Generate C/C++ (vapi) client code directly from vppapigen -- [ ] Embed JSON definitions into the API server, so dynamic languages - can download them directly without going via the filesystem and JSON - files. diff --git a/src/tools/vppapigen/VPPAPI.rst b/src/tools/vppapigen/VPPAPI.rst new file mode 100644 index 00000000000..5b172a8c758 --- /dev/null +++ b/src/tools/vppapigen/VPPAPI.rst @@ -0,0 +1,404 @@ +VPP API Language +================ + +The VPP binary API is a message passing API. The VPP API language is +used to define a RPC interface between VPP and its control plane. The +API messages supports shared memory transport and Unix domain sockets +(SOCK_STREAM). + +The wire format is essentially that of a network formatted (big-endian) +packed C struct. + +The VPP API compiler is located in *src/tools/vppapigen* and can +currently compile to JSON or C (used by the VPP binary itself). + +Language definition +------------------- + +Defining a messages +~~~~~~~~~~~~~~~~~~~ + +There are 3 types of message exchanges: + +- Request/Reply The client sends a request message and the server + replies with a single reply message. The convention is that the reply + message is named as method_name + \_reply. + +- Dump/Detail The client sends a “bulk” request message to the server, + and the server replies with a set of detail messages. These messages + may be of different type. A dump/detail call must be enclosed in a + control ping block (Otherwise the client will not know the end of the + bulk transmission). The method name must end with method + “\_dump”, + the reply message should be named method + “\_details”. The exception + here is for the methods that return multiple message types + (e.g. sw_interface_dump). The Dump/Detail methods are typically used + for acquiring bulk information, like the complete FIB table. + +- Events The client can register for getting asynchronous notifications + from the server. This is useful for getting interface state changes, + and so on. The method name for requesting notifications is + conventionally prefixed with “want\_”. E.g. “want_interface_events”. + Which notification types results from an event registration is + defined in the service definition. + +A message from a client must include the ‘client_index’, an opaque +cookie identifying the sender, and a ‘context’ field to let the client +match request with reply. + +An example of a message definition. The client sends the show_version +request, the server replies with the show_version_reply. + +The *client_index* and *context* fields are required in all requests. +The *context* is returned by the server and is used by the client to +match up request and reply messages. + +.. code-block:: c + + define show_version + { + u32 client_index; + u32 context; + }; + define show_version_reply + { + u32 context; + i32 retval; + string program [32]; + string version [32]; + string build_date [32]; + /* The final field can be a variable length argument */ + string build_directory []; + }; + +The flags are not used by the clients, but have special meaning for some +of the tracing and debugging of the API. The *autoreply* flag is a +shorthand for a reply message with just a *retval* field. + +.. 
code-block:: c + + define : DEFINE ID '{' block_statements_opt '}' ';' + define : flist DEFINE ID '{' block_statements_opt '}' ';' + flist : flag + | flist flag + flag : MANUAL_PRINT + | MANUAL_ENDIAN + | DONT_TRACE + | AUTOREPLY + + block_statements_opt : block_statements + block_statements : block_statement + | block_statements block_statement + block_statement : declaration + | option + declaration : type_specifier ID ';' + | type_specifier ID '[' ID '=' assignee ']' ';' + declaration : type_specifier ID '[' NUM ']' ';' + | type_specifier ID '[' ID ']' ';' + type_specifier : U8 + | U16 + | U32 + | U64 + | I8 + | I16 + | I32 + | I64 + | F64 + | BOOL + | STRING + type_specifier : ID + +Options +~~~~~~~ + +The *option* word is used to specify meta information. The only current +use is to specify a semantic version of the .api file itself. + +Example: + +.. code-block:: c + + option version = "1.0.0"; + +.. code-block:: c + + + option : OPTION ID '=' assignee ';' + assignee : NUM + | TRUE + | FALSE + | STRING_LITERAL + +Defining new types +~~~~~~~~~~~~~~~~~~ + +New user defined types are defined just like messages. A typedef has two +forms. It can either define an alias for a different type (or array). + +Example: + +.. code-block:: c + + typedef u8 ip4_address[4]; + typedef u8 ip6_address[16]; + +Where the above defines two new types *vl_api_ip4_address_t* and +*vl_api_ip6_address_t*. These are aliases for the underlying u8 array. + +In the other form, it is used to specify an abstract data type. + +.. code-block:: c + + enum address_family { + ADDRESS_IP4 = 0, + ADDRESS_IP6, + }; + + union address_union { + vl_api_ip4_address_t ip4; + vl_api_ip6_address_t ip6; + }; + + typedef address { + vl_api_address_family_t af; + vl_api_address_union_t un; + }; + +Where the new type *vl_api_address_t* + +.. code-block:: c + + typedef : TYPEDEF ID '{' block_statements_opt '}' ';' + typedef : TYPEDEF declaration + +Importing Definitions +~~~~~~~~~~~~~~~~~~~~~ + +You can use definitions from other .api files by importing them. To +import another .api’s definitions, you add an import statement to the +top of your file: + +import “vnet/ip/ip_types.api”; + +By default you can only use definitions from directly imported .api +files. + +The API compiler searches for imported files in a set of directories +specified on the API compiler command line using the –includedir flag. + +.. code-block:: c + + import : IMPORT STRING_LITERAL ';' + +Comments +~~~~~~~~ + +The API language uses C style comments. + +.. code-block:: c + + /* */ + // + +Enumerations +~~~~~~~~~~~~ + +Enums are similar to enums in C. + +Every enum definition must contain a constant that maps to zero as its +first element. This is because: + +There must be a zero value, so that we can use 0 as a numeric default +value. The zero value needs to be the first element. + +As in C, enums can be used as flags or just as numbers. The on-wire, and +in memory representation size of an enum can be specified. Not all +language bindings will support that. The default size is 4 (u32). + +Example + +.. code-block:: c + + enum ip_neighbor_flags + { + IP_API_NEIGHBOR_FLAG_NONE = 0, + IP_API_NEIGHBOR_FLAG_STATIC = 0x1, + IP_API_NEIGHBOR_FLAG_NO_FIB_ENTRY = 0x2, + }; + +Which generates the vl_api_ip_neighbor_flags_t in the C binding. In +Python that is represented as an IntFlag object +VppEnum.vl_api_ip_neighbor_flags_t. + +.. 
code-block:: c + + enum : ENUM ID '{' enum_statements '}' ';' + enum : ENUM ID ':' enum_size '{' enum_statements '}' ';' + enum_size : U8 + | U16 + | U32 + enum_statements : enum_statement + | enum_statements enum_statement + enum_statement : ID '=' NUM ',' + | ID ',' + +Services +~~~~~~~~ + +The service statement defines the relationship between messages. For +request/response and dump/details messages it ties the request with the +reply. For events, it specifies which events that can be received for a +given ``want_*`` call. + +Example: + +.. code-block:: c + + service { + rpc want_interface_events returns want_interface_events_reply + events sw_interface_event; + }; + +Which states that the request want_interface_events returns a +want_interface_events_reply and if enabled the client will receive +sw_interface_event messages whenever interface states changes. + +.. code-block:: c + + service : SERVICE '{' service_statements '}' ';' + service_statements : service_statement + | service_statements service_statement + service_statement : RPC ID RETURNS NULL ';' + | RPC ID RETURNS ID ';' + | RPC ID RETURNS STREAM ID ';' + | RPC ID RETURNS ID EVENTS event_list ';' + event_list : events + | event_list events + events : ID + | ID ',' + +Types +----- + +Scalar Value Types +~~~~~~~~~~~~~~~~~~ + +========= ======== =============== =========== +.api type size C type Python type +========= ======== =============== =========== +i8 1 i8 int +u8 1 u8 int +i16 2 i16 int +u16 2 u16 int +i32 4 i32 int +u32 4 u32 int +i64 8 i64 int +u64 8 u64 int +f64 8 f64 float +bool 1 bool boolean +string variable vl_api_string_t str +========= ======== =============== =========== + +User Defined Types +~~~~~~~~~~~~~~~~~~ + +vnet/ip/ip_types.api +^^^^^^^^^^^^^^^^^^^^ + ++--------------------+--------+-------------+-------------------------+ +| .api type | size | C type | Python type | ++====================+========+=============+=========================+ +| vl_api_address_t | 20 | vl_ap | ` | +| | | i_address_t | `<class 'ipaddress.IPv4 | +| | | | Address'> or <class 'ip | +| | | | address.IPv6Address'>`` | ++--------------------+--------+-------------+-------------------------+ +| vl | 4 | vl_api_ip | ``<class 'ip | +| _api_ip4_address_t | | 4_address_t | address.IPv4Address'>`` | ++--------------------+--------+-------------+-------------------------+ +| vl | 16 | vl_api_ip | ``<class 'ip | +| _api_ip6_address_t | | 6_address_t | address.IPv6Address'>`` | ++--------------------+--------+-------------+-------------------------+ +| vl_api_prefix_t | 21 | vl_a | ` | +| | | pi_prefix_t | `<class 'ipaddress.IPv4 | +| | | | Network'> or <class 'ip | +| | | | address.IPv6Network'>`` | ++--------------------+--------+-------------+-------------------------+ +| v | 5 | vl_api_i | ``<class 'ip | +| l_api_ip4_prefix_t | | p4_prefix_t | address.IPv4Network'>`` | ++--------------------+--------+-------------+-------------------------+ +| v | 17 | vl_api_i | ``<class 'ip | +| l_api_ip6_prefix_t | | p6_prefix_t | address.IPv6Network'>`` | ++--------------------+--------+-------------+-------------------------+ +| vl_api_ip4_add | 5 | vl_api_ip4 | ``<class 'ipad | +| ress_with_prefix_t | | _address_wi | dress.IPv4Interface'>`` | +| | | th_prefix_t | | ++--------------------+--------+-------------+-------------------------+ +| vl_api_ip6_add | 17 | vl_api_ip6 | ``<class 'ipad | +| ress_with_prefix_t | | _address_wi | dress.IPv6Interface'>`` | +| | | th_prefix_t | | 
++--------------------+--------+-------------+-------------------------+ + +vnet/ethernet/ethernet_types.api +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ++---------------------+------+---------------------+-------------------+ +| .api type | size | C type | Python type | ++=====================+======+=====================+===================+ +| ``vl_ | 6 | ``vl_ | ``class 'vpp_pa | +| api_mac_address_t`` | | api_mac_address_t`` | pi.MACAddress'>`` | ++---------------------+------+---------------------+-------------------+ + +vnet/interface_types.api +^^^^^^^^^^^^^^^^^^^^^^^^ + +======================== ==== ======================== =========== +.api type size C type Python type +======================== ==== ======================== =========== +vl_api_interface_index_t 4 vl_api_interface_index_t int +======================== ==== ======================== =========== + +New explicit types +~~~~~~~~~~~~~~~~~~ + +String versus bytes +^^^^^^^^^^^^^^^^^^^ + +A byte string with a maximum length of 64: + +.. code-block:: c + + u8 name[64]; + +Before the “string” type was added, text string were defined like this. +The implications of that was the user would have to know if the field +represented a \\0 ended C-string or a fixed length byte string. The wire +format of the ‘string’ type is a u32 length + +An IPv4 or IPv6 address was previously defined like: + +.. code-block:: c + + u8 is_ip6; + u8 address[16]; + +Which made it hard for language bindings to represent the address as +anything but a byte string. The new explicit address types are shown +above. + +Language generators +------------------- + +The VPP API compiler currently has two output modules. One generating +JSON and one generating C header files that are directly used by the VPP +infrastructure and plugins. + +The C/C++, Python, Go Lua, and Java language bindings are generated +based on the JSON files. + +Future considerations +~~~~~~~~~~~~~~~~~~~~~ + +- Generate C/C++ (vapi) client code directly from vppapigen +- Embed JSON definitions into the API server, so dynamic languages + can download them directly without going via the filesystem and JSON + files. diff --git a/src/vlibapi/api_doc.md b/src/vlibapi/api_doc.md deleted file mode 100644 index 2e7ae09a722..00000000000 --- a/src/vlibapi/api_doc.md +++ /dev/null @@ -1,352 +0,0 @@ -# Binary API support {#api_doc} - -VPP provides a binary API scheme to allow a wide variety of client codes to -program data-plane tables. As of this writing, there are hundreds of binary -APIs. - -Messages are defined in `*.api` files. Today, there are about 50 api files, -with more arriving as folks add programmable features. The API file compiler -sources reside in @ref src/tools/vppapigen. 
- -From @ref src/vnet/interface.api, here's a typical request/response message -definition: - -```{.c} - autoreply define sw_interface_set_flags - { - u32 client_index; - u32 context; - u32 sw_if_index; - /* 1 = up, 0 = down */ - u8 admin_up_down; - }; -``` - -To a first approximation, the API compiler renders this definition into -`build-root/.../vpp/include/vnet/interface.api.h` as follows: - -```{.c} - /****** Message ID / handler enum ******/ - #ifdef vl_msg_id - vl_msg_id(VL_API_SW_INTERFACE_SET_FLAGS, vl_api_sw_interface_set_flags_t_handler) - vl_msg_id(VL_API_SW_INTERFACE_SET_FLAGS_REPLY, vl_api_sw_interface_set_flags_reply_t_handler) - #endif - - /****** Message names ******/ - #ifdef vl_msg_name - vl_msg_name(vl_api_sw_interface_set_flags_t, 1) - vl_msg_name(vl_api_sw_interface_set_flags_reply_t, 1) - #endif - - /****** Message name, crc list ******/ - #ifdef vl_msg_name_crc_list - #define foreach_vl_msg_name_crc_interface \ - _(VL_API_SW_INTERFACE_SET_FLAGS, sw_interface_set_flags, f890584a) \ - _(VL_API_SW_INTERFACE_SET_FLAGS_REPLY, sw_interface_set_flags_reply, dfbf3afa) \ - #endif - - /****** Typedefs *****/ - #ifdef vl_typedefs - typedef VL_API_PACKED(struct _vl_api_sw_interface_set_flags { - u16 _vl_msg_id; - u32 client_index; - u32 context; - u32 sw_if_index; - u8 admin_up_down; - }) vl_api_sw_interface_set_flags_t; - - typedef VL_API_PACKED(struct _vl_api_sw_interface_set_flags_reply { - u16 _vl_msg_id; - u32 context; - i32 retval; - }) vl_api_sw_interface_set_flags_reply_t; - - ... - #endif /* vl_typedefs */ -``` - -To change the admin state of an interface, a binary api client sends a -@ref vl_api_sw_interface_set_flags_t to VPP, which will respond with a -@ref vl_api_sw_interface_set_flags_reply_t message. - -Multiple layers of software, transport types, and shared libraries -implement a variety of features: - -* API message allocation, tracing, pretty-printing, and replay. -* Message transport via global shared memory, pairwise/private shared - memory, and sockets. -* Barrier synchronization of worker threads across thread-unsafe - message handlers. - -Correctly-coded message handlers know nothing about the transport used to -deliver messages to/from VPP. It's reasonably straighforward to use multiple -API message transport types simultaneously. - -For historical reasons, binary api messages are (putatively) sent in network -byte order. As of this writing, we're seriously considering whether that -choice makes sense. - - -## Message Allocation - -Since binary API messages are always processed in order, we allocate messages -using a ring allocator whenever possible. This scheme is extremely fast when -compared with a traditional memory allocator, and doesn't cause heap -fragmentation. See -@ref src/vlibmemory/memory_shared.c @ref vl_msg_api_alloc_internal(). - -Regardless of transport, binary api messages always follow a @ref msgbuf_t -header: - -```{.c} - typedef struct msgbuf_ - { - unix_shared_memory_queue_t *q; - u32 data_len; - u32 gc_mark_timestamp; - u8 data[0]; - } msgbuf_t; -``` - -This structure makes it easy to trace messages without having to -decode them - simply save data_len bytes - and allows -@ref vl_msg_api_free() to rapidly dispose of message buffers: - -```{.c} - void - vl_msg_api_free (void *a) - { - msgbuf_t *rv; - api_main_t *am = &api_main; - - rv = (msgbuf_t *) (((u8 *) a) - offsetof (msgbuf_t, data)); - - /* - * Here's the beauty of the scheme. Only one proc/thread has - * control of a given message buffer. 
To free a buffer, we just - * clear the queue field, and leave. No locks, no hits, no errors... - */ - if (rv->q) - { - rv->q = 0; - rv->gc_mark_timestamp = 0; - return; - } - <snip> - } -``` - -## Message Tracing and Replay - -It's extremely important that VPP can capture and replay sizeable binary API -traces. System-level issues involving hundreds of thousands of API -transactions can be re-run in a second or less. Partial replay allows one to -binary-search for the point where the wheels fall off. One can add scaffolding -to the data plane, to trigger when complex conditions obtain. - -With binary API trace, print, and replay, system-level bug reports of the form -"after 300,000 API transactions, the VPP data-plane stopped forwarding -traffic, FIX IT!" can be solved offline. - -More often than not, one discovers that a control-plane client -misprograms the data plane after a long time or under complex -circumstances. Without direct evidence, "it's a data-plane problem!" - -See @ref src/vlibmemory/memory_vlib.c @ref vl_msg_api_process_file(), -and @ref src/vlibapi/api_shared.c. See also the debug CLI command "api trace" - -## Client connection details - -Establishing a binary API connection to VPP from a C-language client -is easy: - -```{.c} - int - connect_to_vpe (char *client_name, int client_message_queue_length) - { - vat_main_t *vam = &vat_main; - api_main_t *am = &api_main; - - if (vl_client_connect_to_vlib ("/vpe-api", client_name, - client_message_queue_length) < 0) - return -1; - - /* Memorize vpp's binary API message input queue address */ - vam->vl_input_queue = am->shmem_hdr->vl_input_queue; - /* And our client index */ - vam->my_client_index = am->my_client_index; - return 0; - } -``` - -32 is a typical value for client_message_queue_length. VPP cannot -block when it needs to send an API message to a binary API client, and -the VPP-side binary API message handlers are very fast. When sending -asynchronous messages, make sure to scrape the binary API rx ring with -some enthusiasm. - -### binary API message RX pthread - -Calling @ref vl_client_connect_to_vlib spins up a binary API message RX -pthread: - -```{.c} - static void * - rx_thread_fn (void *arg) - { - unix_shared_memory_queue_t *q; - memory_client_main_t *mm = &memory_client_main; - api_main_t *am = &api_main; - - q = am->vl_input_queue; - - /* So we can make the rx thread terminate cleanly */ - if (setjmp (mm->rx_thread_jmpbuf) == 0) - { - mm->rx_thread_jmpbuf_valid = 1; - while (1) - { - vl_msg_api_queue_handler (q); - } - } - pthread_exit (0); - } -``` - -To handle the binary API message queue yourself, use -@ref vl_client_connect_to_vlib_no_rx_pthread. - -In turn, vl_msg_api_queue_handler(...) uses mutex/condvar signalling -to wake up, process VPP -> client traffic, then sleep. VPP supplies a -condvar broadcast when the VPP -> client API message queue transitions -from empty to nonempty. - -VPP checks its own binary API input queue at a very high rate. VPP -invokes message handlers in "process" context [aka cooperative -multitasking thread context] at a variable rate, depending on -data-plane packet processing requirements. - -## Client disconnection details - -To disconnect from VPP, call @ref vl_client_disconnect_from_vlib. -Please arrange to call this function if the client application -terminates abnormally. VPP makes every effort to hold a decent funeral -for dead clients, but VPP can't guarantee to free leaked memory in the -shared binary API segment. 
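A minimal sketch of a clean shutdown path, assuming the connection was
established as in connect_to_vpe() above (the wrapper name is hypothetical):

```{.c}
    static void
    my_client_exit (void)
    {
      /* Tell VPP we are going away so it can reclaim the per-client
         resources in the shared binary API segment */
      vl_client_disconnect_from_vlib ();
    }
```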
- -## Sending binary API messages to VPP - -The point of the exercise is to send binary API messages to VPP, and -to receive replies from VPP. Many VPP binary APIs comprise a client -request message, and a simple status reply. For example, to -set the admin status of an interface, one codes: - -```{.c} - vl_api_sw_interface_set_flags_t *mp; - - mp = vl_msg_api_alloc (sizeof (*mp)); - memset (mp, 0, sizeof (*mp)); - mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_SW_INTERFACE_SET_FLAGS); - mp->client_index = api_main.my_client_index; - mp->sw_if_index = clib_host_to_net_u32 (<interface-sw-if-index>); - vl_msg_api_send (api_main.shmem_hdr->vl_input_queue, (u8 *)mp); -``` - -Key points: - -* Use @ref vl_msg_api_alloc to allocate message buffers - -* Allocated message buffers are not initialized, and must be presumed - to contain trash. - -* Don't forget to set the _vl_msg_id field! - -* As of this writing, binary API message IDs and data are sent in - network byte order - -* The client-library global data structure @ref api_main keeps track - of sufficient pointers and handles used to communicate with VPP - -## Receiving binary API messages from VPP - -Unless you've made other arrangements (see @ref -vl_client_connect_to_vlib_no_rx_pthread), *messages are received on a -separate rx pthread*. Synchronization with the client application main -thread is the responsibility of the application! - -Set up message handlers about as follows: - -```{.c} - #define vl_typedefs /* define message structures */ - #include <vpp/api/vpe_all_api_h.h> - #undef vl_typedefs - - /* declare message handlers for each api */ - - #define vl_endianfun /* define message structures */ - #include <vpp/api/vpe_all_api_h.h> - #undef vl_endianfun - - /* instantiate all the print functions we know about */ - #define vl_print(handle, ...) - #define vl_printfun - #include <vpp/api/vpe_all_api_h.h> - #undef vl_printfun - - /* Define a list of all message that the client handles */ - #define foreach_vpe_api_reply_msg \ - _(SW_INTERFACE_SET_FLAGS_REPLY, sw_interface_set_flags_reply) - - static clib_error_t * - my_api_hookup (vlib_main_t * vm) - { - api_main_t *am = &api_main; - - #define _(N,n) \ - vl_msg_api_set_handlers(VL_API_##N, #n, \ - vl_api_##n##_t_handler, \ - vl_noop_handler, \ - vl_api_##n##_t_endian, \ - vl_api_##n##_t_print, \ - sizeof(vl_api_##n##_t), 1); - foreach_vpe_api_msg; - #undef _ - - return 0; - } -``` - -The key API used to establish message handlers is @ref -vl_msg_api_set_handlers , which sets values in multiple parallel -vectors in the @ref api_main_t structure. As of this writing: not all -vector element values can be set through the API. You'll see sporadic -API message registrations followed by minor adjustments of this form: - -```{.c} - /* - * Thread-safe API messages - */ - am->is_mp_safe[VL_API_IP_ADD_DEL_ROUTE] = 1; - am->is_mp_safe[VL_API_GET_NODE_GRAPH] = 1; -``` - - - - - - - - - - - - - - - - - - - - - diff --git a/src/vlibapi/api_doc.rst b/src/vlibapi/api_doc.rst new file mode 100644 index 00000000000..7d2b80a2e06 --- /dev/null +++ b/src/vlibapi/api_doc.rst @@ -0,0 +1,342 @@ +.. _api_doc: + +Writing API handlers +==================== + +VPP provides a binary API scheme to allow a wide variety of client codes +to program data-plane tables. As of this writing, there are hundreds of +binary APIs. + +Messages are defined in ``*.api`` files. Today, there are about 50 api +files, with more arriving as folks add programmable features. The API +file compiler sources reside in @ref src/tools/vppapigen. 
+ +From @ref src/vnet/interface.api, here’s a typical request/response +message definition: + +.. code:: c + + autoreply define sw_interface_set_flags + { + u32 client_index; + u32 context; + u32 sw_if_index; + /* 1 = up, 0 = down */ + u8 admin_up_down; + }; + +To a first approximation, the API compiler renders this definition into +``build-root/.../vpp/include/vnet/interface.api.h`` as follows: + +.. code:: c + + /****** Message ID / handler enum ******/ + #ifdef vl_msg_id + vl_msg_id(VL_API_SW_INTERFACE_SET_FLAGS, vl_api_sw_interface_set_flags_t_handler) + vl_msg_id(VL_API_SW_INTERFACE_SET_FLAGS_REPLY, vl_api_sw_interface_set_flags_reply_t_handler) + #endif + + /****** Message names ******/ + #ifdef vl_msg_name + vl_msg_name(vl_api_sw_interface_set_flags_t, 1) + vl_msg_name(vl_api_sw_interface_set_flags_reply_t, 1) + #endif + + /****** Message name, crc list ******/ + #ifdef vl_msg_name_crc_list + #define foreach_vl_msg_name_crc_interface \ + _(VL_API_SW_INTERFACE_SET_FLAGS, sw_interface_set_flags, f890584a) \ + _(VL_API_SW_INTERFACE_SET_FLAGS_REPLY, sw_interface_set_flags_reply, dfbf3afa) \ + #endif + + /****** Typedefs *****/ + #ifdef vl_typedefs + typedef VL_API_PACKED(struct _vl_api_sw_interface_set_flags { + u16 _vl_msg_id; + u32 client_index; + u32 context; + u32 sw_if_index; + u8 admin_up_down; + }) vl_api_sw_interface_set_flags_t; + + typedef VL_API_PACKED(struct _vl_api_sw_interface_set_flags_reply { + u16 _vl_msg_id; + u32 context; + i32 retval; + }) vl_api_sw_interface_set_flags_reply_t; + + ... + #endif /* vl_typedefs */ + +To change the admin state of an interface, a binary api client sends a +@ref vl_api_sw_interface_set_flags_t to VPP, which will respond with a +@ref vl_api_sw_interface_set_flags_reply_t message. + +Multiple layers of software, transport types, and shared libraries +implement a variety of features: + +- API message allocation, tracing, pretty-printing, and replay. +- Message transport via global shared memory, pairwise/private shared + memory, and sockets. +- Barrier synchronization of worker threads across thread-unsafe + message handlers. + +Correctly-coded message handlers know nothing about the transport used +to deliver messages to/from VPP. It’s reasonably straightforward to use +multiple API message transport types simultaneously. + +For historical reasons, binary api messages are (putatively) sent in +network byte order. As of this writing, we’re seriously considering +whether that choice makes sense. + +Message Allocation +------------------ + +Since binary API messages are always processed in order, we allocate +messages using a ring allocator whenever possible. This scheme is +extremely fast when compared with a traditional memory allocator, and +doesn’t cause heap fragmentation. See @ref +src/vlibmemory/memory_shared.c @ref vl_msg_api_alloc_internal(). + +Regardless of transport, binary api messages always follow a @ref +msgbuf_t header: + +.. code:: c + + typedef struct msgbuf_ + { + unix_shared_memory_queue_t *q; + u32 data_len; + u32 gc_mark_timestamp; + u8 data[0]; + } msgbuf_t; + +This structure makes it easy to trace messages without having to decode +them - simply save data_len bytes - and allows @ref vl_msg_api_free() to +rapidly dispose of message buffers: + +.. code:: c + + void + vl_msg_api_free (void *a) + { + msgbuf_t *rv; + api_main_t *am = &api_main; + + rv = (msgbuf_t *) (((u8 *) a) - offsetof (msgbuf_t, data)); + + /* + * Here's the beauty of the scheme. Only one proc/thread has + * control of a given message buffer. 
To free a buffer, we just + * clear the queue field, and leave. No locks, no hits, no errors... + */ + if (rv->q) + { + rv->q = 0; + rv->gc_mark_timestamp = 0; + return; + } + <snip> + } + +Message Tracing and Replay +-------------------------- + +It’s extremely important that VPP can capture and replay sizeable binary +API traces. System-level issues involving hundreds of thousands of API +transactions can be re-run in a second or less. Partial replay allows +one to binary-search for the point where the wheels fall off. One can +add scaffolding to the data plane, to trigger when complex conditions +obtain. + +With binary API trace, print, and replay, system-level bug reports of +the form “after 300,000 API transactions, the VPP data-plane stopped +forwarding traffic, FIX IT!” can be solved offline. + +More often than not, one discovers that a control-plane client +misprograms the data plane after a long time or under complex +circumstances. Without direct evidence, “it’s a data-plane problem!” + +See @ref src/vlibmemory/memory_vlib.c @ref vl_msg_api_process_file(), +and @ref src/vlibapi/api_shared.c. See also the debug CLI command “api +trace” + +Client connection details +------------------------- + +Establishing a binary API connection to VPP from a C-language client is +easy: + +.. code:: c + + int + connect_to_vpe (char *client_name, int client_message_queue_length) + { + vat_main_t *vam = &vat_main; + api_main_t *am = &api_main; + + if (vl_client_connect_to_vlib ("/vpe-api", client_name, + client_message_queue_length) < 0) + return -1; + + /* Memorize vpp's binary API message input queue address */ + vam->vl_input_queue = am->shmem_hdr->vl_input_queue; + /* And our client index */ + vam->my_client_index = am->my_client_index; + return 0; + } + +32 is a typical value for client_message_queue_length. VPP cannot block +when it needs to send an API message to a binary API client, and the +VPP-side binary API message handlers are very fast. When sending +asynchronous messages, make sure to scrape the binary API rx ring with +some enthusiasm. + +binary API message RX pthread +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Calling @ref vl_client_connect_to_vlib spins up a binary API message RX +pthread: + +.. code:: c + + static void * + rx_thread_fn (void *arg) + { + unix_shared_memory_queue_t *q; + memory_client_main_t *mm = &memory_client_main; + api_main_t *am = &api_main; + + q = am->vl_input_queue; + + /* So we can make the rx thread terminate cleanly */ + if (setjmp (mm->rx_thread_jmpbuf) == 0) + { + mm->rx_thread_jmpbuf_valid = 1; + while (1) + { + vl_msg_api_queue_handler (q); + } + } + pthread_exit (0); + } + +To handle the binary API message queue yourself, use @ref +vl_client_connect_to_vlib_no_rx_pthread. + +In turn, vl_msg_api_queue_handler(…) uses mutex/condvar signalling to +wake up, process VPP -> client traffic, then sleep. VPP supplies a +condvar broadcast when the VPP -> client API message queue transitions +from empty to nonempty. + +VPP checks its own binary API input queue at a very high rate. VPP +invokes message handlers in “process” context [aka cooperative +multitasking thread context] at a variable rate, depending on data-plane +packet processing requirements. + +Client disconnection details +---------------------------- + +To disconnect from VPP, call @ref vl_client_disconnect_from_vlib. Please +arrange to call this function if the client application terminates +abnormally. 
VPP makes every effort to hold a decent funeral for dead +clients, but VPP can’t guarantee to free leaked memory in the shared +binary API segment. + +Sending binary API messages to VPP +---------------------------------- + +The point of the exercise is to send binary API messages to VPP, and to +receive replies from VPP. Many VPP binary APIs comprise a client request +message, and a simple status reply. For example, to set the admin status +of an interface, one codes: + +.. code:: c + + vl_api_sw_interface_set_flags_t *mp; + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_SW_INTERFACE_SET_FLAGS); + mp->client_index = api_main.my_client_index; + mp->sw_if_index = clib_host_to_net_u32 (<interface-sw-if-index>); + vl_msg_api_send (api_main.shmem_hdr->vl_input_queue, (u8 *)mp); + +Key points: + +- Use @ref vl_msg_api_alloc to allocate message buffers + +- Allocated message buffers are not initialized, and must be presumed + to contain trash. + +- Don’t forget to set the \_vl_msg_id field! + +- As of this writing, binary API message IDs and data are sent in + network byte order + +- The client-library global data structure @ref api_main keeps track of + sufficient pointers and handles used to communicate with VPP + +Receiving binary API messages from VPP +-------------------------------------- + +Unless you’ve made other arrangements (see @ref +vl_client_connect_to_vlib_no_rx_pthread), *messages are received on a +separate rx pthread*. Synchronization with the client application main +thread is the responsibility of the application! + +Set up message handlers about as follows: + +.. code:: c + + #define vl_typedefs /* define message structures */ + #include <vpp/api/vpe_all_api_h.h> + #undef vl_typedefs + + /* declare message handlers for each api */ + + #define vl_endianfun /* define message structures */ + #include <vpp/api/vpe_all_api_h.h> + #undef vl_endianfun + + /* instantiate all the print functions we know about */ + #define vl_print(handle, ...) + #define vl_printfun + #include <vpp/api/vpe_all_api_h.h> + #undef vl_printfun + + /* Define a list of all message that the client handles */ + #define foreach_vpe_api_reply_msg \ + _(SW_INTERFACE_SET_FLAGS_REPLY, sw_interface_set_flags_reply) + + static clib_error_t * + my_api_hookup (vlib_main_t * vm) + { + api_main_t *am = &api_main; + + #define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_msg; + #undef _ + + return 0; + } + +The key API used to establish message handlers is @ref +vl_msg_api_set_handlers , which sets values in multiple parallel vectors +in the @ref api_main_t structure. As of this writing: not all vector +element values can be set through the API. You’ll see sporadic API +message registrations followed by minor adjustments of this form: + +.. code:: c + + /* + * Thread-safe API messages + */ + am->is_mp_safe[VL_API_IP_ADD_DEL_ROUTE] = 1; + am->is_mp_safe[VL_API_GET_NODE_GRAPH] = 1; diff --git a/src/vnet/MTU.md b/src/vnet/MTU.md deleted file mode 100644 index a0a8ba87490..00000000000 --- a/src/vnet/MTU.md +++ /dev/null @@ -1,72 +0,0 @@ -# MTU Introduction {#mtu_doc} -Maximum Transmission Unit is a term used to describe the maximum sized "thingy" that can be sent out an interface. It can refer to the maximum frame size that a NIC can send. 
On Ethernet that would include the Ethernet header but typically not the IGF. It can refer to the maximum packet size, that is, on Ethernet an MTU of 1500, would allow an IPv4 packet of 1500 bytes, that would result in an Ethernet frame of 1518 bytes. - -# MTU in VPP -VPP allows setting of the physical payload MTU. I.e. not including L2 overhead. Setting the hardware MTU will program the NIC. -This MTU will be inherited by all software interfaces. - -VPP also allows setting of the payload MTU for software interfaces. Independently of the MTU set on the hardware. If the software payload MTU is set higher than the capability of the NIC, the packet will be dropped. - -In addition VPP supports setting the MTU of individual network layer protocols. IPv4, IPv6 or MPLS. For example an IPv4 MTU of 1500 (includes the IPv4 header) will fit in a hardware payload MTU of 1500. - -_Note we might consider changing the hardware payload MTU to hardware MTU_. That is, the MTU includes all L2 framing. Then the payload MTU can be calculated based on the interface's configuration. E.g. 802.1q tags etc. - -There are currently no checks or warnings if e.g. the user configures a per-protocol MTU larger than the underlying payload MTU. If that happens packets will be fragmented or dropped. - -## Data structures -The hardware payload MTU is stored in the max_packet_bytes variable in the vnet_hw_interface_t structure. - -The software MTU (previously max_l3_packet_bytes) is in vnet_sw_interface_t->in mtu[VNET_N_MTU]. - -# API - -## Set physical MTU - -This API message is used to set the physical MTU. It is currently limited to Ethernet interfaces. Note, this programs the NIC. - -``` -autoreply define hw_interface_set_mtu -{ - u32 client_index; - u32 context; - u32 sw_if_index; - u16 mtu; -}; -``` - -## Set the L2 payload MTU (not including the L2 header) and per-protocol MTUs - -This API message sets the L3 payload MTU. E.g. on Ethernet it is the maximum size of the Ethernet payload. If a value is left as 0, then the default is picked from VNET_MTU_L3. - -``` -autoreply define sw_interface_set_mtu -{ - u32 client_index; - u32 context; - u32 sw_if_index; - /* $$$$ Replace with enum */ - u32 mtu[4]; /* 0 - L3, 1 - IP4, 2 - IP6, 3 - MPLS */ -}; - -``` - -## Get interface MTU - -The various MTUs on an interface can be queried with the sw_interface_dump/sw_interface_details calls. - -``` -define sw_interface_details -{ - /* MTU */ - u16 link_mtu; - - /* Per protocol MTUs */ - u32 mtu[4]; /* 0 - L3, 1 - IP4, 2 - IP6, 3 - MPLS */ -}; -``` - -# CLI - -``` -set interface mtu [packet|ip4|ip6|mpls] <value> <interface> -``` diff --git a/src/vnet/bfd/bfd_doc.md b/src/vnet/bfd/bfd_doc.md deleted file mode 100644 index 7d7606e4dd1..00000000000 --- a/src/vnet/bfd/bfd_doc.md +++ /dev/null @@ -1,374 +0,0 @@ -# BFD module {#bfd_doc} - -## Overview - -Bidirectional Forwarding Detection in VPP currently supports single-hop UDP -transport based on RFC 5880 and RFC 5881. - -## Usage - -### General usage - -BFD sessions are created using APIs only. The following CLIs are implemented, -which call the APIs to manipulate the BFD: - -#### Show commands: - -> show bfd [keys|sessions|echo-source] - -Show the existing keys, sessions or echo-source. 
- -#### Key manipulation - -##### Create a new key or modify an existing key - -> bfd key set conf-key-id <id> type <keyed-sha1|meticulous-keyed-sha1> secret <secret> - -Parameters: - -* conf-key-id - local configuration key ID, used to uniquely identify this key -* type - type of the key -* secret - shared secret (hex data) - -Example: - -> bfd key set conf-key-id 2368880803 type meticulous-keyed-sha1 secret 69d685b0d990cdba46872706dc - -Notes: - -* in-use key cannot be modified - -##### Delete an existing key - -> bfd key del conf-key-id <id> - -Parameters: - -* conf-key-id - local configuration key ID, used to uniquely identify this key - -Example: - -> bfd key del conf-key-id 2368880803 - -Notes: - -* in-use key cannot be deleted - -##### Create a new (plain or authenticated) BFD session - -> bfd udp session add interface <interface> local-addr <address> peer-addr <address> desired-min-tx <interval> required-min-rx <interval> detect-mult <multiplier> [ conf-key-id <ID> bfd-key-id <ID> ] - -Parameters: - -* interface - interface to which this session is tied to -* local-addr - local address (ipv4 or ipv6) -* peer-addr - peer address (ipv4 or ipv6, must match local-addr family) -* desired-min-tx - desired minimum tx interval (microseconds) -* required-min-rx - required minimum rx interval (microseconds) -* detect-mult - detect multiplier (must be non-zero) -* conf-key-id - local configuration key ID -* bfd-key-id - BFD key ID, as carried in BFD control frames - -Example: - -> bfd udp session add interface pg0 local-addr fd01:1::1 peer-addr fd01:1::2 desired-min-tx 100000 required-min-rx 100000 detect-mult 3 conf-key-id 1029559112 bfd-key-id 13 - -Notes: - -* if conf-key-id and bfd-key-id are not specified, session is non-authenticated -* desired-min-tx controls desired transmission rate of both control frames and echo packets - -##### Modify BFD session - -> bfd udp session mod interface <interface> local-addr <address> peer-addr <address> desired-min-tx <interval> required-min-rx <interval> detect-mult <multiplier> - -Parameters: - -* interface - interface to which this session is tied to -* local-addr - local address (ipv4 or ipv6) -* peer-addr - peer address (ipv4 or ipv6, must match local-addr family) -* desired-min-tx - desired minimum tx interval (microseconds) -* required-min-rx - required minimum rx interval (microseconds) -* detect-mult - detect multiplier (must be non-zero) - -Example: - -> bfd udp session mod interface pg0 local-addr 172.16.1.1 peer-addr 172.16.1.2 desired-min-tx 300000 required-min-rx 200000 detect-mult 12 - -Notes: - -* desired-min-tx controls desired transmission rate of both control frames and echo packets - -##### Delete an existing BFD session - -> bfd udp session del interface <interface> local-addr <address> peer-addr<address> - -Parameters: - -* interface - interface to which this session is tied to -* local-addr - local address (ipv4 or ipv6) -* peer-addr - peer address (ipv4 or ipv6, must match local-addr family) - -Example: - -> bfd udp session del interface pg0 local-addr 172.16.1.1 peer-addr 172.16.1.2 - -##### Set session admin-up or admin-down - -> bfd udp session set-flags interface <interface> local-addr <address> peer-addr <address> admin <up|down> - -Parameters: - -* interface - interface to which this session is tied to -* local-addr - local address (ipv4 or ipv6) -* peer-addr - peer address (ipv4 or ipv6, must match local-addr family) -* admin - up/down based on desired action - -Example: - -> bfd udp session set-flags admin down 
interface pg0 local-addr 172.16.1.1 peer-addr 172.16.1.2 - -##### Activate/change authentication for existing session - -> bfd udp session auth activate interface <interface> local-addr <address> peer-addr <address> conf-key-id <ID> bfd-key-id <ID> [ delayed <yes|no> ] - -Parameters: - -* interface - interface to which this session is tied to -* local-addr - local address (ipv4 or ipv6) -* peer-addr - peer address (ipv4 or ipv6, must match local-addr family) -* conf-key-id - local configuration key ID -* bfd-key-id - BFD key ID, as carried in BFD control frames -* delayed - is yes then this action is delayed until the peer performs the same action - -Example: - -> bfd udp session auth activate interface pg0 local-addr 172.16.1.1 peer-addr 172.16.1.2 conf-key-id 540928695 bfd-key-id 239 delayed yes - -Notes: - -* see [Delayed option] for more information - -##### Deactivate authentication for existing session - -> bfd udp session auth deactivate interface <interface> local-addr <address> peer-addr <address> [ delayed <yes|no> ] - -Parameters: - -* interface - interface to which this session is tied to -* local-addr - local address (ipv4 or ipv6) -* peer-addr - peer address (ipv4 or ipv6, must match local-addr family) -* delayed - is yes then this action is delayed until the peer performs the same action - -Example: - -> bfd udp session auth deactivate interface pg0 local-addr 172.16.1.1 peer-addr 172.16.1.2 - -Notes: - -* see [Delayed option] for more information - -##### Set echo-source interface - -> bfd udp echo-source set interface <interface> - -Parameters: - -* interface - interface used for getting source address for echo packets - -Example: - -> bfd udp echo-source set interface loop0 - -##### Delete echo-source interface - -> bfd udp echo-source del - -Example: - -> bfd udp echo-source del - -### Authentication - -BFD sessions should be authenticated for security purposes. SHA1 and meticulous -SHA1 authentication is supported by VPP. First, authentication keys are -configured in VPP and afterwards they can be used by sessions. - -There are two key IDs in the scope of BFD session: - -* configuration key ID is the internal unique key ID inside VPP and is never - communicated to any peer, it serves only the purpose of identifying the key -* BFD key ID is the key ID carried in BFD control frames and is used for - verifying authentication - -#### Turning auth on/off - -Authentication can be turned on or off at any time. Care must be taken however, -to either synchronize the authentication manipulation with peer's actions -to avoid the session going down. - -##### Delayed option - -Delayed option is useful for synchronizing authentication changes with a peer. -If it's specified, then authentication change is not performed immediately. -In this case, VPP continues to transmit packets using the old authentication -method (unauthenticated or using old sha1 key). If a packet is received, which -does not pass the current authentication, then VPP tries to authenticate it -using the new method (which might be none, if deactivating authentication) -and if it passes, then the new authentication method is put in use. - -The recommended procedure for enabling/changing/disabling session -authentication is: - -1. perform authentication change on vpp's side with delayed option set to yes -2. 
perform authentication change on peer's side (without delayed option) - -Notes: - -* if both peers use delayed option at the same time, the change will never - be carried out, since none of the peers will see any packet with the new - authentication which could trigger the change -* remote peer does not need to support or even be aware of this mechanism - for it to work properly - - -### Echo function - -Echo function is used by VPP whenever a peer declares the willingness -to support it, echo-source is set and it contains a usable subnet (see below). -When echo function is switched on, the required min rx interval advertised -to peer is set to 1 second (or the configured value, if its higher). - -#### Echo source address - -Because echo packets are only looped back (and not processed in any way) -by a peer, it's necessary to set the source address in a way which avoids -packet drop due to spoofing protection by VPP. Per RFC, the source address -should not be in the subnet set on the interface over which the echo packets -are sent. Also, it must not be any VPP-local address, otherwise the packet -gets dropped on receipt by VPP. The solution is to create a loopback interface -with a (private) IPv4/IPv6 subnet assigned as echo-source. The BFD then picks -an unused address from the subnet by flipping the last bit and uses that as -source address in the echo packets, thus meeting RFC recommendation while -avoiding spoofing protection. - -Example: if 10.10.10.3/31 is the subnet, then 10.10.10.2 will be used as - source address in (IPv4) echo packets - -### Demand mode - -Demand mode is respected by VPP, but not used locally. The only scenario when -demand mode could make sense currently is when echo is active. Because echo -packets are inherently insecure against an adversary looping them back a poll -sequence would be required for slow periodic connectivity verification anyway. -It's more efficient to just ask the remote peer to send slow periodic control -frames without VPP initiating periodic poll sequences. - -### Admin-down - -Session may be put admin-down at any time. This immediately causes the state -to be changed to AdminDown and remain so unless the session is put admin-up. - -## BFD implementation notes - -Because BFD can work over different transport layers, the BFD code is separated -into core BFD functionality - main module implemented in bfd_main.c -and transport-specific code implemented in bfd_udp.c. - -### Main module - -Main module is responsible for handling all the BFD functionality defined -in RFC 5880. - -#### Internal API - -Internal APIs defined in bfd_main.h are called from transport-specific code -to create/modify/delete - -#### Packet receipt - -When a packet is received by the transport layer, it is forwarded to main -module (to main thread) via an RPC call. At this point, the authentication has -been verified, so the packet is consumed, session parameters are updated -accordingly and state change (if applicable). Based on these, the timeouts -are adjusted if required and an event is sent to the process node to wake up -and recalculate sleep time. - -#### Packet transmit - -Main module allocates a vlib_buffer_t, creates the required BFD frame (control -or echo in it), then calls the transport layer to add the transport layer. -Then a frame containing the buffer to the aprropriate node is created -and enqueued. - -#### Process node - -Main module implements one process node which is a simple loop. 
The process -node gets next timeout from the timer wheel, sleeps until the timeout expires -and then calls a timeout routine which drives the state machine for each -session which timed out. The sleep is interrupted externally via vlib event, -when a session is added or modified in a way which might require timer wheel -manipulation. In this case the caller inserts the necessary timeout to timer -wheel and then signals the process node to wake up early, handle possible -timeouts and recalculate the sleep time again. - -#### State machine - -Default state of BFD session when created is Down, per RFC 5880. State changes -to Init, Up or Down based on events like received state from peer and timeouts. -The session state can be set AdminDown using a binary API, which prevents it -from going to any other state, until this limitation is removed. This state -is advertised to peers in slow periodic control frames. - -For each session, the following timeouts are maintained: - -1. tx timeout - used for sending out control frames -2. rx timeout - used for detecting session timeout -3. echo tx timeout - used for sending out echo frames -3. echo rx timeout - used for detecting session timeout based on echo - -These timeouts are maintained in cpu clocks and recalculated when appropriate -(e.g. rx timeout is bumped when a packet is received, keeping the session -alive). Only the earliest timeout is inserted into the timer wheel at a time -and timer wheel events are never deleted, rather spurious events are ignored. -This allows efficient operation, like not inserting events into timing wheel -for each packet received or ignoring left-over events in case a bfd session -gets removed and a new one is recreated with the same session index. - -#### Authentication keys management - -Authentication keys are managed internally in a pool, with each key tracking -it's use count. The removal/modification is only allowed if the key is not in -use. - -### UDP module - -UDP module is responsible for: - -1. public APIs/CLIs to configure BFD over UDP. -2. support code called by main module to encapsulate/decapsulate BFD packets - -This module implements two graph nodes - for consuming ipv4 and ipv6 packets -target at BFD ports 3874 and 3875. - -#### Packet receipt - -BFD packet receipt receipt starts in the bfd udp graph nodes. Since the code -needs to verify IP/UDP header data, it relies on ip4-local (and ip6-local) -nodes to store pointers to the appropriate headers. First, your discriminator -is extracted from BFD packet and used to lookup the existing session. In case -it's zero, the pair of IP addresses and sw_if_index is used to lookup session. -Then, main module is called to verify the authentication, if present. -Afterwards a check is made if the IP/UDP headers are correct. If yes, then -an RPC call is made to the main thread to consume the packet and take action -upon it. - -#### Packet transmission - -When process node decides that there is a need to transmit the packet, it -creates a buffer, fills the BFD frame data in and calls the UDP module to -add the transport layer. This is a simple operation for the control frames -consisting of just adding UDP/IP headers based on session data. For echo -frames, an additional step, looking at the echo-source interface and picking -and address is performed and if this fails, then the packet cannot be -transmitted and an error is returned to main thread. 
diff --git a/src/vnet/bfd/bfd_doc.rst b/src/vnet/bfd/bfd_doc.rst new file mode 100644 index 00000000000..54a53c6fe92 --- /dev/null +++ b/src/vnet/bfd/bfd_doc.rst @@ -0,0 +1,512 @@ +.. _bfd_doc: + +BFD module +========== + +Overview +-------- + +Bidirectional Forwarding Detection in VPP currently supports single-hop +UDP transport based on RFC 5880 and RFC 5881. + +Usage +----- + +General usage +~~~~~~~~~~~~~ + +BFD sessions are created using APIs only. The following CLIs are +implemented, which call the APIs to manipulate the BFD: + +Show commands: +^^^^^^^^^^^^^^ + + show bfd [keys|sessions|echo-source] + +Show the existing keys, sessions or echo-source. + +Key manipulation +^^^^^^^^^^^^^^^^ + +Create a new key or modify an existing key +'''''''''''''''''''''''''''''''''''''''''' + + bfd key set conf-key-id type <keyed-sha1|meticulous-keyed-sha1> + secret + +Parameters: + +- conf-key-id - local configuration key ID, used to uniquely identify + this key +- type - type of the key +- secret - shared secret (hex data) + +Example: + + bfd key set conf-key-id 2368880803 type meticulous-keyed-sha1 secret + 69d685b0d990cdba46872706dc + +Notes: + +- in-use key cannot be modified + +Delete an existing key +'''''''''''''''''''''' + + bfd key del conf-key-id + +Parameters: + +- conf-key-id - local configuration key ID, used to uniquely identify + this key + +Example: + + bfd key del conf-key-id 2368880803 + +Notes: + +- in-use key cannot be deleted + +Create a new (plain or authenticated) BFD session +''''''''''''''''''''''''''''''''''''''''''''''''' + + bfd udp session add interface local-addr + + .. raw:: html + + <address> + + peer-addr + + .. raw:: html + + <address> + + desired-min-tx required-min-rx detect-mult [ conf-key-id bfd-key-id ] + +Parameters: + +- interface - interface to which this session is tied to +- local-addr - local address (ipv4 or ipv6) +- peer-addr - peer address (ipv4 or ipv6, must match local-addr family) +- desired-min-tx - desired minimum tx interval (microseconds) +- required-min-rx - required minimum rx interval (microseconds) +- detect-mult - detect multiplier (must be non-zero) +- conf-key-id - local configuration key ID +- bfd-key-id - BFD key ID, as carried in BFD control frames + +Example: + + bfd udp session add interface pg0 local-addr fd01:1::1 peer-addr + fd01:1::2 desired-min-tx 100000 required-min-rx 100000 detect-mult 3 + conf-key-id 1029559112 bfd-key-id 13 + +Notes: + +- if conf-key-id and bfd-key-id are not specified, session is + non-authenticated +- desired-min-tx controls desired transmission rate of both control + frames and echo packets + +Modify BFD session +'''''''''''''''''' + + bfd udp session mod interface local-addr + + .. raw:: html + + <address> + + peer-addr + + .. 
raw:: html + + <address> + + desired-min-tx required-min-rx detect-mult + +Parameters: + +- interface - interface to which this session is tied to +- local-addr - local address (ipv4 or ipv6) +- peer-addr - peer address (ipv4 or ipv6, must match local-addr family) +- desired-min-tx - desired minimum tx interval (microseconds) +- required-min-rx - required minimum rx interval (microseconds) +- detect-mult - detect multiplier (must be non-zero) + +Example: + + bfd udp session mod interface pg0 local-addr 172.16.1.1 peer-addr + 172.16.1.2 desired-min-tx 300000 required-min-rx 200000 detect-mult + 12 + +Notes: + +- desired-min-tx controls desired transmission rate of both control + frames and echo packets + +Delete an existing BFD session +'''''''''''''''''''''''''''''' + + bfd udp session del interface local-addr + + .. raw:: html + + <address> + + peer-addr + + .. raw:: html + + <address> + +Parameters: + +- interface - interface to which this session is tied to +- local-addr - local address (ipv4 or ipv6) +- peer-addr - peer address (ipv4 or ipv6, must match local-addr family) + +Example: + + bfd udp session del interface pg0 local-addr 172.16.1.1 peer-addr + 172.16.1.2 + +Set session admin-up or admin-down +'''''''''''''''''''''''''''''''''' + + bfd udp session set-flags interface local-addr + + .. raw:: html + + <address> + + peer-addr + + .. raw:: html + + <address> + + admin <up|down> + +Parameters: + +- interface - interface to which this session is tied to +- local-addr - local address (ipv4 or ipv6) +- peer-addr - peer address (ipv4 or ipv6, must match local-addr family) +- admin - up/down based on desired action + +Example: + + bfd udp session set-flags admin down interface pg0 local-addr + 172.16.1.1 peer-addr 172.16.1.2 + +Activate/change authentication for existing session +''''''''''''''''''''''''''''''''''''''''''''''''''' + + bfd udp session auth activate interface local-addr + + .. raw:: html + + <address> + + peer-addr + + .. raw:: html + + <address> + + conf-key-id bfd-key-id [ delayed <yes|no> ] + +Parameters: + +- interface - interface to which this session is tied to +- local-addr - local address (ipv4 or ipv6) +- peer-addr - peer address (ipv4 or ipv6, must match local-addr family) +- conf-key-id - local configuration key ID +- bfd-key-id - BFD key ID, as carried in BFD control frames +- delayed - is yes then this action is delayed until the peer performs + the same action + +Example: + + bfd udp session auth activate interface pg0 local-addr 172.16.1.1 + peer-addr 172.16.1.2 conf-key-id 540928695 bfd-key-id 239 delayed yes + +Notes: + +- see `Delayed option <#delayed-option>`__ for more information + +Deactivate authentication for existing session +'''''''''''''''''''''''''''''''''''''''''''''' + + bfd udp session auth deactivate interface local-addr + + .. raw:: html + + <address> + + peer-addr + + .. 
raw:: html + + <address> + + [ delayed <yes|no> ] + +Parameters: + +- interface - interface to which this session is tied to +- local-addr - local address (ipv4 or ipv6) +- peer-addr - peer address (ipv4 or ipv6, must match local-addr family) +- delayed - is yes then this action is delayed until the peer performs + the same action + +Example: + + bfd udp session auth deactivate interface pg0 local-addr 172.16.1.1 + peer-addr 172.16.1.2 + +Notes: + +- see `Delayed option <#delayed-option>`__ for more information + +Set echo-source interface +''''''''''''''''''''''''' + + bfd udp echo-source set interface + +Parameters: + +- interface - interface used for getting source address for echo + packets + +Example: + + bfd udp echo-source set interface loop0 + +Delete echo-source interface +'''''''''''''''''''''''''''' + + bfd udp echo-source del + +Example: + + bfd udp echo-source del + +Authentication +~~~~~~~~~~~~~~ + +BFD sessions should be authenticated for security purposes. SHA1 and +meticulous SHA1 authentication is supported by VPP. First, +authentication keys are configured in VPP and afterwards they can be +used by sessions. + +There are two key IDs in the scope of BFD session: + +- configuration key ID is the internal unique key ID inside VPP and is + never communicated to any peer, it serves only the purpose of + identifying the key +- BFD key ID is the key ID carried in BFD control frames and is used + for verifying authentication + +Turning auth on/off +^^^^^^^^^^^^^^^^^^^ + +Authentication can be turned on or off at any time. Care must be taken +however, to either synchronize the authentication manipulation with +peer’s actions to avoid the session going down. + +Delayed option +'''''''''''''' + +Delayed option is useful for synchronizing authentication changes with a +peer. If it’s specified, then authentication change is not performed +immediately. In this case, VPP continues to transmit packets using the +old authentication method (unauthenticated or using old sha1 key). If a +packet is received, which does not pass the current authentication, then +VPP tries to authenticate it using the new method (which might be none, +if deactivating authentication) and if it passes, then the new +authentication method is put in use. + +The recommended procedure for enabling/changing/disabling session +authentication is: + +1. perform authentication change on vpp’s side with delayed option set + to yes +2. perform authentication change on peer’s side (without delayed option) + +Notes: + +- if both peers use delayed option at the same time, the change will + never be carried out, since none of the peers will see any packet + with the new authentication which could trigger the change +- remote peer does not need to support or even be aware of this + mechanism for it to work properly + +Echo function +~~~~~~~~~~~~~ + +Echo function is used by VPP whenever a peer declares the willingness to +support it, echo-source is set and it contains a usable subnet (see +below). When echo function is switched on, the required min rx interval +advertised to peer is set to 1 second (or the configured value, if its +higher). + +Echo source address +^^^^^^^^^^^^^^^^^^^ + +Because echo packets are only looped back (and not processed in any way) +by a peer, it’s necessary to set the source address in a way which +avoids packet drop due to spoofing protection by VPP. Per RFC, the +source address should not be in the subnet set on the interface over +which the echo packets are sent. 
Also, it must not be any VPP-local +address, otherwise the packet gets dropped on receipt by VPP. The +solution is to create a loopback interface with a (private) IPv4/IPv6 +subnet assigned as echo-source. The BFD then picks an unused address +from the subnet by flipping the last bit and uses that as source address +in the echo packets, thus meeting RFC recommendation while avoiding +spoofing protection. + +Example: if 10.10.10.3/31 is the subnet, then 10.10.10.2 will be used as +source address in (IPv4) echo packets + +Demand mode +~~~~~~~~~~~ + +Demand mode is respected by VPP, but not used locally. The only scenario +when demand mode could make sense currently is when echo is active. +Because echo packets are inherently insecure against an adversary +looping them back a poll sequence would be required for slow periodic +connectivity verification anyway. It’s more efficient to just ask the +remote peer to send slow periodic control frames without VPP initiating +periodic poll sequences. + +Admin-down +~~~~~~~~~~ + +Session may be put admin-down at any time. This immediately causes the +state to be changed to AdminDown and remain so unless the session is put +admin-up. + +BFD implementation notes +------------------------ + +Because BFD can work over different transport layers, the BFD code is +separated into core BFD functionality - main module implemented in +bfd_main.c and transport-specific code implemented in bfd_udp.c. + +Main module +~~~~~~~~~~~ + +Main module is responsible for handling all the BFD functionality +defined in RFC 5880. + +Internal API +^^^^^^^^^^^^ + +Internal APIs defined in bfd_main.h are called from transport-specific +code to create/modify/delete + +Packet receipt +^^^^^^^^^^^^^^ + +When a packet is received by the transport layer, it is forwarded to +main module (to main thread) via an RPC call. At this point, the +authentication has been verified, so the packet is consumed, session +parameters are updated accordingly and state change (if applicable). +Based on these, the timeouts are adjusted if required and an event is +sent to the process node to wake up and recalculate sleep time. + +Packet transmit +^^^^^^^^^^^^^^^ + +Main module allocates a vlib_buffer_t, creates the required BFD frame +(control or echo in it), then calls the transport layer to add the +transport layer. Then a frame containing the buffer to the appropriate +node is created and enqueued. + +Process node +^^^^^^^^^^^^ + +Main module implements one process node which is a simple loop. The +process node gets next timeout from the timer wheel, sleeps until the +timeout expires and then calls a timeout routine which drives the state +machine for each session which timed out. The sleep is interrupted +externally via vlib event, when a session is added or modified in a way +which might require timer wheel manipulation. In this case the caller +inserts the necessary timeout to timer wheel and then signals the +process node to wake up early, handle possible timeouts and recalculate +the sleep time again. + +State machine +^^^^^^^^^^^^^ + +Default state of BFD session when created is Down, per RFC 5880. State +changes to Init, Up or Down based on events like received state from +peer and timeouts. The session state can be set AdminDown using a binary +API, which prevents it from going to any other state, until this +limitation is removed. This state is advertised to peers in slow +periodic control frames. + +For each session, the following timeouts are maintained: + +1. 
tx timeout - used for sending out control frames
+2. rx timeout - used for detecting session timeout
+3. echo tx timeout - used for sending out echo frames
+4. echo rx timeout - used for detecting session timeout based on echo
+
+These timeouts are maintained in CPU clocks and recalculated when
+appropriate (e.g. the rx timeout is bumped when a packet is received,
+keeping the session alive). Only the earliest timeout is inserted into
+the timer wheel at a time and timer wheel events are never deleted;
+spurious events are simply ignored. This allows efficient operation,
+like not inserting events into the timer wheel for each packet received,
+or ignoring left-over events in case a BFD session gets removed and a
+new one is recreated with the same session index.
+
+Authentication keys management
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Authentication keys are managed internally in a pool, with each key
+tracking its use count. Removal/modification is only allowed if the
+key is not in use.
+
+UDP module
+~~~~~~~~~~
+
+UDP module is responsible for:
+
+1. public APIs/CLIs to configure BFD over UDP.
+2. support code called by the main module to encapsulate/decapsulate
+   BFD packets
+
+This module implements two graph nodes, one each for consuming IPv4 and
+IPv6 packets targeted at BFD ports 3784 and 3785.
+
+.. _packet-receipt-1:
+
+Packet receipt
+^^^^^^^^^^^^^^
+
+BFD packet receipt starts in the bfd udp graph nodes. Since the code
+needs to verify IP/UDP header data, it relies on the ip4-local (and
+ip6-local) nodes to store pointers to the appropriate headers. First,
+the your-discriminator value is extracted from the BFD packet and used
+to look up the existing session. If it is zero, the pair of IP addresses
+and the sw_if_index is used to look up the session. Then, the main
+module is called to verify the authentication, if present. Afterwards, a
+check is made whether the IP/UDP headers are correct. If yes, an RPC
+call is made to the main thread to consume the packet and take action
+upon it.
+
+Packet transmission
+^^^^^^^^^^^^^^^^^^^
+
+When the process node decides that there is a need to transmit a packet,
+it creates a buffer, fills in the BFD frame data and calls the UDP
+module to add the transport layer. This is a simple operation for
+control frames, consisting of just adding UDP/IP headers based on
+session data. For echo frames, an additional step, looking at the
+echo-source interface and picking an address, is performed; if this
+fails, the packet cannot be transmitted and an error is returned to the
+main thread.
diff --git a/src/vnet/ipfix-export/ipfix_doc.md b/src/vnet/ipfix-export/ipfix_doc.md
deleted file mode 100644
index edae3f73660..00000000000
--- a/src/vnet/ipfix-export/ipfix_doc.md
+++ /dev/null
@@ -1,355 +0,0 @@
-# IPFIX support {#ipfix_doc}
-
-VPP includes a high-performance IPFIX record exporter. This note
-explains how to use the internal APIs to export IPFIX data, and how to
-configure and send the required IPFIX templates.
-
-As you'll see, a bit of typing is required.
-
-## First: create an ipfix "report"
-
-Include the flow report header file, fill out a @ref
-vnet_flow_report_add_del_args_t structure, and call vnet_flow_report_add_del.
- -```{.c} - #include <vnet/ipfix-export/flow_report.h> - /* Defined in flow_report.h, of interest when constructing reports */ - - /* ipfix field definitions for a particular report */ - typedef struct - { - u32 info_element; - u32 size; - } ipfix_report_element_t; - - /* Report add/del argument structure */ - typedef struct - { - /* Callback to flush current ipfix packet / frame */ - vnet_flow_data_callback_t *flow_data_callback; - - /* Callback to build the template packet rewrite string */ - vnet_flow_rewrite_callback_t *rewrite_callback; - - /* List of ipfix elements in the report */ - ipfix_report_element_t *report_elements; - u32 n_report_elements; - /* Kept in flow report, used e.g. by flow classifier */ - opaque_t opaque; - /* Add / delete a report */ - int is_add; - /* Ipfix "domain-ID", see RFC, set as desired */ - u32 domain_id; - /* ipfix packet source port, often set to UDP_DST_PORT_ipfix */ - u16 src_port; - /* Set by ipfix infra, needed to send data packets */ - u32 *stream_indexp; - } vnet_flow_report_add_del_args_t; - - /* Private header file contents */ - - /* Report ipfix element definition */ - #define foreach_simple_report_ipfix_element \ - _(sourceIPv4Address, 4) \ - _(destinationIPv4Address, 4) \ - _(sourceTransportPort, 2) \ - _(destinationTransportPort, 2) \ - _(protocolIdentifier, 1) \ - _(flowStartMicroseconds, 8) \ - _(flowEndMicroseconds, 8) - - static ipfix_report_element_t simple_report_elements[] = { - #define _(a,b) {a,b}, - foreach_simple_report_ipfix_element - #undef _ - }; - - typedef struct - { - /** Buffers and frames, per thread */ - vlib_buffer_t **buffers_by_thread; - vlib_frame_t **frames_by_thread; - u32 *next_record_offset_by_thread; - - /** Template ID's */ - u16 *template_ids; - - /** Time reference pair */ - u64 usec_time_0; - f64 vlib_time_0; - - /** Stream index */ - u32 stream_index; - - /* Convenience */ - flow_report_main_t *flow_report_main; - vlib_main_t *vlib_main; - vnet_main_t *vnet_main; - } my_logging_main_t; - - extern my_logging_main_t my_logging_main; - - ... - - /* Recitations */ - flow_report_main_t *frm = &flow_report_main; - my_logging_main_t *mlm = &my_logging_main; - vnet_flow_report_add_del_args_t a; - int rv; - u16 template_id; - - ... - - /* Init function: set up time reference pair */ - mlm->vlib_time_0 = vlib_time_now (vm); - mlm->milisecond_time_0 = unix_time_now_nsec () * 1e-6; - - ... - - /* Create a report */ - memset (&a, 0, sizeof (a)); - a.is_add = 1 /* to enable the report */; - a.domain_id = 1 /* pick a domain ID */; - a.src_port = UDP_DST_PORT_ipfix /* src port for reports */; - - /* Use the generic template packet rewrite string generator */ - a.rewrite_callback = vnet_flow_rewrite_generic_callback; - - /* Supply a list of ipfix report elements */ - a.report_elements = simple_report_elements; - a.n_report_elements = ARRAY_LEN (simple_report_elements); - - /* Pointer to the ipfix stream index, set by the report infra */ - a.stream_indexp = &mlm->stream_index; - a.flow_data_callback = my_flow_data_callback; - - /* Create the report */ - rv = vnet_flow_report_add_del (frm, &a, &template_id); - if (rv) - oops... - - /* Save the template-ID for later use */ - mlm->template_id = template_id; - -``` - -Several things are worth describing in more detail. - -### vnet_flow_rewrite_generic_callback programming - -This generic callback helps build ipfix template packets. When -registering an ipfix report, pass an (array, count) -of ipfix elements as shown above. 
- -### my_flow_data_callback - -The ipfix flow export infrastructure calls this callback to flush the -current ipfix packet; to make sure that ipfix data is not retained for -an unreasonably long period of time. - -We typically code it as shown below, to call an application-specific -function with (uninteresting arguments), and "do_flush = 1": - - -```{.c} - - vlib_frame_t *my_flow_data_callback - (flow_report_main_t * frm, - flow_report_t * fr, - vlib_frame_t * f, - u32 * to_next, u32 node_index) - { - - my_buffer_flow_record (0, ... , 0, 1 /* do_flush */); - return f; - } -``` - -### my_flow_data_header - -This function creates the packet header for an ipfix data packet - -```{.c} - - static inline void - my_flow_report_header (flow_report_main_t * frm, - vlib_buffer_t * b0, u32 * offset) - { - my_logging_main_t *mlm = &my_logging_main; - flow_report_stream_t *stream; - ip4_ipfix_template_packet_t *tp; - ipfix_message_header_t *h = 0; - - - ipfix_set_header_t *s = 0; - ip4_header_t *ip; - udp_header_t *udp; - - stream = &frm->streams[mlm->stream_index]; - - b0->current_data = 0; - b0->current_length = sizeof (*ip) + sizeof (*udp) + sizeof (*h) + - sizeof (*s); - b0->flags |= (VLIB_BUFFER_TOTAL_LENGTH_VALID | VNET_BUFFER_F_FLOW_REPORT); - vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; - vnet_buffer (b0)->sw_if_index[VLIB_TX] = frm->fib_index; - tp = vlib_buffer_get_current (b0); - ip = (ip4_header_t *) & tp->ip4; - udp = (udp_header_t *) (ip + 1); - h = (ipfix_message_header_t *) (udp + 1); - s = (ipfix_set_header_t *) (h + 1); - - ip->ip_version_and_header_length = 0x45; - ip->ttl = 254; - ip->protocol = IP_PROTOCOL_UDP; - ip->flags_and_fragment_offset = 0; - ip->src_address.as_u32 = frm->src_address.as_u32; - ip->dst_address.as_u32 = frm->ipfix_collector.as_u32; - udp->src_port = clib_host_to_net_u16 (stream->src_port); - udp->dst_port = clib_host_to_net_u16 (frm->collector_port); - udp->checksum = 0; - - h->export_time = clib_host_to_net_u32 ((u32) - (((f64) frm->unix_time_0) + - (vlib_time_now (frm->vlib_main) - - frm->vlib_time_0))); - h->sequence_number = clib_host_to_net_u32 (stream->sequence_number++); - h->domain_id = clib_host_to_net_u32 (stream->domain_id); - - *offset = (u32) (((u8 *) (s + 1)) - (u8 *) tp); - } - ``` - - ### fixup and transmit a flow record - - ```{.c} - - static inline void - my_send_ipfix_pkt (flow_report_main_t * frm, - vlib_frame_t * f, vlib_buffer_t * b0, u16 template_id) - { - ip4_ipfix_template_packet_t *tp; - ipfix_message_header_t *h = 0; - ipfix_set_header_t *s = 0; - ip4_header_t *ip; - udp_header_t *udp; - vlib_main_t *vm = frm->vlib_main; - - tp = vlib_buffer_get_current (b0); - ip = (ip4_header_t *) & tp->ip4; - udp = (udp_header_t *) (ip + 1); - h = (ipfix_message_header_t *) (udp + 1); - s = (ipfix_set_header_t *) (h + 1); - - s->set_id_length = ipfix_set_id_length (template_id, - b0->current_length - - (sizeof (*ip) + sizeof (*udp) + - sizeof (*h))); - h->version_length = version_length (b0->current_length - - (sizeof (*ip) + sizeof (*udp))); - - ip->length = clib_host_to_net_u16 (b0->current_length); - ip->checksum = ip4_header_checksum (ip); - udp->length = clib_host_to_net_u16 (b0->current_length - sizeof (*ip)); - - if (frm->udp_checksum) - { - udp->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ip); - if (udp->checksum == 0) - udp->checksum = 0xffff; - } - - ASSERT (ip4_header_checksum_is_valid (ip)); - - vlib_put_frame_to_node (vm, ip4_lookup_node.index, f); - } - ``` - - ### my_buffer_flow_record - - This is the key routine which paints 
individual flow records into - an ipfix packet under construction. It's pretty straightforward - (albeit stateful) vpp data-plane code. The code shown below is - thread-safe by construction. - - ```{.c} - static inline void - my_buffer_flow_record_internal (my_flow_record_t * rp, int do_flush, - u32 thread_index) - { - vlib_main_t *vm = vlib_mains[thread_index]; - my_logging_main_t *mlm = &jvp_ipfix_main; - flow_report_main_t *frm = &flow_report_main; - vlib_frame_t *f; - vlib_buffer_t *b0 = 0; - u32 bi0 = ~0; - u32 offset; - - b0 = mlm->buffers_by_thread[thread_index]; - - if (PREDICT_FALSE (b0 == 0)) - { - if (do_flush) - return; - - if (vlib_buffer_alloc (vm, &bi0, 1) != 1) - { - clib_warning ("can't allocate ipfix data buffer"); - return; - } - - b0 = vlib_get_buffer (vm, bi0); - offset = 0; - mlm->buffers_by_thread[thread_index] = b0; - } - else - { - bi0 = vlib_get_buffer_index (vm, b0); - offset = mlm->next_record_offset_by_thread[thread_index]; - } - - f = mlm->frames_by_thread[thread_index]; - if (PREDICT_FALSE (f == 0)) - { - u32 *to_next; - f = vlib_get_frame_to_node (vm, ip4_lookup_node.index); - mlm->frames_by_thread[thread_index] = f; - to_next = vlib_frame_vector_args (f); - to_next[0] = bi0; - f->n_vectors = 1; - mlm->frames_by_thread[thread_index] = f; - } - - if (PREDICT_FALSE (offset == 0)) - my_flow_report_header (frm, b0, &offset); - - if (PREDICT_TRUE (do_flush == 0)) - { - /* Paint the new ipfix data record into the buffer */ - clib_memcpy (b0->data + offset, rp, sizeof (*rp)); - offset += sizeof (*rp); - b0->current_length += sizeof (*rp); - } - - if (PREDICT_FALSE (do_flush || (offset + sizeof (*rp)) > frm->path_mtu)) - { - /* Nothing to send? */ - if (offset == 0) - return; - - send_ipfix_pkt (frm, f, b0, mlm->template_ids[0]); - mlm->buffers_by_thread[thread_index] = 0; - mlm->frames_by_thread[thread_index] = 0; - offset = 0; - } - mlm->next_record_offset_by_thread[thread_index] = offset; - } - - static void - my_buffer_flow_record (my_flow_record_t * rp, int do_flush) - { - u32 thread_index = vlib_get_thread_index(); - my_buffer_flow_record_internal (rp, do_flush, thread_index); - } - -``` diff --git a/src/vnet/ipfix-export/ipfix_doc.rst b/src/vnet/ipfix-export/ipfix_doc.rst new file mode 100644 index 00000000000..ac660b4bc93 --- /dev/null +++ b/src/vnet/ipfix-export/ipfix_doc.rst @@ -0,0 +1,360 @@ +.. _ipfix_doc: + +IPFIX support +============= + +VPP includes a high-performance IPFIX record exporter. This note +explains how to use the internal APIs to export IPFIX data, and how to +configure and send the required IPFIX templates. + +As you’ll see, a bit of typing is required. + +First: create an ipfix “report” +------------------------------- + +Include the flow report header file, fill out a @ref +vnet_flow_report_add_del_args_t structure, and call +vnet_flow_report_add_del. + +.. 
code:: c + + #include <vnet/ipfix-export/flow_report.h> + /* Defined in flow_report.h, of interest when constructing reports */ + + /* ipfix field definitions for a particular report */ + typedef struct + { + u32 info_element; + u32 size; + } ipfix_report_element_t; + + /* Report add/del argument structure */ + typedef struct + { + /* Callback to flush current ipfix packet / frame */ + vnet_flow_data_callback_t *flow_data_callback; + + /* Callback to build the template packet rewrite string */ + vnet_flow_rewrite_callback_t *rewrite_callback; + + /* List of ipfix elements in the report */ + ipfix_report_element_t *report_elements; + u32 n_report_elements; + /* Kept in flow report, used e.g. by flow classifier */ + opaque_t opaque; + /* Add / delete a report */ + int is_add; + /* Ipfix "domain-ID", see RFC, set as desired */ + u32 domain_id; + /* ipfix packet source port, often set to UDP_DST_PORT_ipfix */ + u16 src_port; + /* Set by ipfix infra, needed to send data packets */ + u32 *stream_indexp; + } vnet_flow_report_add_del_args_t; + + /* Private header file contents */ + + /* Report ipfix element definition */ + #define foreach_simple_report_ipfix_element \ + _(sourceIPv4Address, 4) \ + _(destinationIPv4Address, 4) \ + _(sourceTransportPort, 2) \ + _(destinationTransportPort, 2) \ + _(protocolIdentifier, 1) \ + _(flowStartMicroseconds, 8) \ + _(flowEndMicroseconds, 8) + + static ipfix_report_element_t simple_report_elements[] = { + #define _(a,b) {a,b}, + foreach_simple_report_ipfix_element + #undef _ + }; + + typedef struct + { + /** Buffers and frames, per thread */ + vlib_buffer_t **buffers_by_thread; + vlib_frame_t **frames_by_thread; + u32 *next_record_offset_by_thread; + + /** Template ID's */ + u16 *template_ids; + + /** Time reference pair */ + u64 usec_time_0; + f64 vlib_time_0; + + /** Stream index */ + u32 stream_index; + + /* Convenience */ + flow_report_main_t *flow_report_main; + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + } my_logging_main_t; + + extern my_logging_main_t my_logging_main; + + ... + + /* Recitations */ + flow_report_main_t *frm = &flow_report_main; + my_logging_main_t *mlm = &my_logging_main; + vnet_flow_report_add_del_args_t a; + int rv; + u16 template_id; + + ... + + /* Init function: set up time reference pair */ + mlm->vlib_time_0 = vlib_time_now (vm); + mlm->milisecond_time_0 = unix_time_now_nsec () * 1e-6; + + ... + + /* Create a report */ + memset (&a, 0, sizeof (a)); + a.is_add = 1 /* to enable the report */; + a.domain_id = 1 /* pick a domain ID */; + a.src_port = UDP_DST_PORT_ipfix /* src port for reports */; + + /* Use the generic template packet rewrite string generator */ + a.rewrite_callback = vnet_flow_rewrite_generic_callback; + + /* Supply a list of ipfix report elements */ + a.report_elements = simple_report_elements; + a.n_report_elements = ARRAY_LEN (simple_report_elements); + + /* Pointer to the ipfix stream index, set by the report infra */ + a.stream_indexp = &mlm->stream_index; + a.flow_data_callback = my_flow_data_callback; + + /* Create the report */ + rv = vnet_flow_report_add_del (frm, &a, &template_id); + if (rv) + oops... + + /* Save the template-ID for later use */ + mlm->template_id = template_id; + +Several things are worth describing in more detail. + +vnet_flow_rewrite_generic_callback programming +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This generic callback helps build ipfix template packets. When +registering an ipfix report, pass an (array, count) of ipfix elements as +shown above. 
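+
+For reference, a condensed sketch of the pieces involved, based on the
+registration example above; writing the element list directly instead of
+via the foreach\_ helper macro is a hypothetical but equivalent variant
+(the name ``my_report_elements`` is illustrative only):
+
+.. code:: c
+
+   /* Element list: {IPFIX information element, size in octets} */
+   static ipfix_report_element_t my_report_elements[] = {
+     { sourceIPv4Address, 4 },
+     { destinationIPv4Address, 4 },
+     { protocolIdentifier, 1 },
+   };
+
+   /* Hand the list to the generic template rewrite generator */
+   a.rewrite_callback = vnet_flow_rewrite_generic_callback;
+   a.report_elements = my_report_elements;
+   a.n_report_elements = ARRAY_LEN (my_report_elements);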
+ +my_flow_data_callback +~~~~~~~~~~~~~~~~~~~~~ + +The ipfix flow export infrastructure calls this callback to flush the +current ipfix packet; to make sure that ipfix data is not retained for +an unreasonably long period of time. + +We typically code it as shown below, to call an application-specific +function with (uninteresting arguments), and “do_flush = 1”: + +.. code:: c + + + vlib_frame_t *my_flow_data_callback + (flow_report_main_t * frm, + flow_report_t * fr, + vlib_frame_t * f, + u32 * to_next, u32 node_index) + { + + my_buffer_flow_record (0, ... , 0, 1 /* do_flush */); + return f; + } + +my_flow_data_header +~~~~~~~~~~~~~~~~~~~ + +This function creates the packet header for an ipfix data packet + +.. code:: c + + + static inline void + my_flow_report_header (flow_report_main_t * frm, + vlib_buffer_t * b0, u32 * offset) + { + my_logging_main_t *mlm = &my_logging_main; + flow_report_stream_t *stream; + ip4_ipfix_template_packet_t *tp; + ipfix_message_header_t *h = 0; + + + ipfix_set_header_t *s = 0; + ip4_header_t *ip; + udp_header_t *udp; + + stream = &frm->streams[mlm->stream_index]; + + b0->current_data = 0; + b0->current_length = sizeof (*ip) + sizeof (*udp) + sizeof (*h) + + sizeof (*s); + b0->flags |= (VLIB_BUFFER_TOTAL_LENGTH_VALID | VNET_BUFFER_F_FLOW_REPORT); + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = frm->fib_index; + tp = vlib_buffer_get_current (b0); + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + h = (ipfix_message_header_t *) (udp + 1); + s = (ipfix_set_header_t *) (h + 1); + + ip->ip_version_and_header_length = 0x45; + ip->ttl = 254; + ip->protocol = IP_PROTOCOL_UDP; + ip->flags_and_fragment_offset = 0; + ip->src_address.as_u32 = frm->src_address.as_u32; + ip->dst_address.as_u32 = frm->ipfix_collector.as_u32; + udp->src_port = clib_host_to_net_u16 (stream->src_port); + udp->dst_port = clib_host_to_net_u16 (frm->collector_port); + udp->checksum = 0; + + h->export_time = clib_host_to_net_u32 ((u32) + (((f64) frm->unix_time_0) + + (vlib_time_now (frm->vlib_main) - + frm->vlib_time_0))); + h->sequence_number = clib_host_to_net_u32 (stream->sequence_number++); + h->domain_id = clib_host_to_net_u32 (stream->domain_id); + + *offset = (u32) (((u8 *) (s + 1)) - (u8 *) tp); + } + +### fixup and transmit a flow record + +.. 
code:: c + + + static inline void + my_send_ipfix_pkt (flow_report_main_t * frm, + vlib_frame_t * f, vlib_buffer_t * b0, u16 template_id) + { + ip4_ipfix_template_packet_t *tp; + ipfix_message_header_t *h = 0; + ipfix_set_header_t *s = 0; + ip4_header_t *ip; + udp_header_t *udp; + vlib_main_t *vm = frm->vlib_main; + + tp = vlib_buffer_get_current (b0); + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + h = (ipfix_message_header_t *) (udp + 1); + s = (ipfix_set_header_t *) (h + 1); + + s->set_id_length = ipfix_set_id_length (template_id, + b0->current_length - + (sizeof (*ip) + sizeof (*udp) + + sizeof (*h))); + h->version_length = version_length (b0->current_length - + (sizeof (*ip) + sizeof (*udp))); + + ip->length = clib_host_to_net_u16 (b0->current_length); + ip->checksum = ip4_header_checksum (ip); + udp->length = clib_host_to_net_u16 (b0->current_length - sizeof (*ip)); + + if (frm->udp_checksum) + { + udp->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ip); + if (udp->checksum == 0) + udp->checksum = 0xffff; + } + + ASSERT (ip4_header_checksum_is_valid (ip)); + + vlib_put_frame_to_node (vm, ip4_lookup_node.index, f); + } + +### my_buffer_flow_record + +This is the key routine which paints individual flow records into an +ipfix packet under construction. It’s pretty straightforward (albeit +stateful) vpp data-plane code. The code shown below is thread-safe by +construction. + +.. code:: c + + static inline void + my_buffer_flow_record_internal (my_flow_record_t * rp, int do_flush, + u32 thread_index) + { + vlib_main_t *vm = vlib_mains[thread_index]; + my_logging_main_t *mlm = &jvp_ipfix_main; + flow_report_main_t *frm = &flow_report_main; + vlib_frame_t *f; + vlib_buffer_t *b0 = 0; + u32 bi0 = ~0; + u32 offset; + + b0 = mlm->buffers_by_thread[thread_index]; + + if (PREDICT_FALSE (b0 == 0)) + { + if (do_flush) + return; + + if (vlib_buffer_alloc (vm, &bi0, 1) != 1) + { + clib_warning ("can't allocate ipfix data buffer"); + return; + } + + b0 = vlib_get_buffer (vm, bi0); + offset = 0; + mlm->buffers_by_thread[thread_index] = b0; + } + else + { + bi0 = vlib_get_buffer_index (vm, b0); + offset = mlm->next_record_offset_by_thread[thread_index]; + } + + f = mlm->frames_by_thread[thread_index]; + if (PREDICT_FALSE (f == 0)) + { + u32 *to_next; + f = vlib_get_frame_to_node (vm, ip4_lookup_node.index); + mlm->frames_by_thread[thread_index] = f; + to_next = vlib_frame_vector_args (f); + to_next[0] = bi0; + f->n_vectors = 1; + mlm->frames_by_thread[thread_index] = f; + } + + if (PREDICT_FALSE (offset == 0)) + my_flow_report_header (frm, b0, &offset); + + if (PREDICT_TRUE (do_flush == 0)) + { + /* Paint the new ipfix data record into the buffer */ + clib_memcpy (b0->data + offset, rp, sizeof (*rp)); + offset += sizeof (*rp); + b0->current_length += sizeof (*rp); + } + + if (PREDICT_FALSE (do_flush || (offset + sizeof (*rp)) > frm->path_mtu)) + { + /* Nothing to send? */ + if (offset == 0) + return; + + send_ipfix_pkt (frm, f, b0, mlm->template_ids[0]); + mlm->buffers_by_thread[thread_index] = 0; + mlm->frames_by_thread[thread_index] = 0; + offset = 0; + } + mlm->next_record_offset_by_thread[thread_index] = offset; + } + + static void + my_buffer_flow_record (my_flow_record_t * rp, int do_flush) + { + u32 thread_index = vlib_get_thread_index(); + my_buffer_flow_record_internal (rp, do_flush, thread_index); + } diff --git a/src/vnet/mtu.rst b/src/vnet/mtu.rst new file mode 100644 index 00000000000..c7e92523c7f --- /dev/null +++ b/src/vnet/mtu.rst @@ -0,0 +1,108 @@ +.. 
_mtu_doc:
+
+MTU in VPP
+==========
+
+Maximum Transmission Unit is a term used to describe the maximum sized
+"thingy" that can be sent out an interface. It can refer to the maximum
+frame size that a NIC can send. On Ethernet that would include the
+Ethernet header but typically not the inter-frame gap. It can also refer
+to the maximum packet size; that is, on Ethernet an MTU of 1500 would
+allow an IPv4 packet of 1500 bytes, which would result in an Ethernet
+frame of 1518 bytes.
+
+VPP allows setting the physical payload MTU, i.e. not including L2
+overhead. Setting the hardware MTU will program the NIC. This MTU will
+be inherited by all software interfaces.
+
+VPP also allows setting the payload MTU for software interfaces,
+independently of the MTU set on the hardware. If the software payload
+MTU is set higher than the capability of the NIC, the packet will be
+dropped.
+
+In addition, VPP supports setting the MTU of individual network layer
+protocols: IPv4, IPv6 or MPLS. For example, an IPv4 MTU of 1500 (which
+includes the IPv4 header) will fit in a hardware payload MTU of 1500.
+
+*Note: we might consider changing the hardware payload MTU to hardware
+MTU*. That is, the MTU would include all L2 framing, and the payload MTU
+could then be calculated based on the interface's configuration, e.g.
+802.1q tags etc.
+
+There are currently no checks or warnings if e.g. the user configures a
+per-protocol MTU larger than the underlying payload MTU. If that happens
+packets will be fragmented or dropped.
+
+Data structures
+^^^^^^^^^^^^^^^
+
+The hardware payload MTU is stored in the max_packet_bytes variable in
+the vnet_hw_interface_t structure.
+
+The software MTU (previously max_l3_packet_bytes) is in
+vnet_sw_interface_t->mtu[VNET_N_MTU].
+
+MTU API
+-------
+
+Set physical MTU
+^^^^^^^^^^^^^^^^
+
+This API message is used to set the physical MTU. It is currently
+limited to Ethernet interfaces. Note, this programs the NIC.
+
+::
+
+   autoreply define hw_interface_set_mtu
+   {
+     u32 client_index;
+     u32 context;
+     u32 sw_if_index;
+     u16 mtu;
+   };
+
+Set the L3 payload MTU and per-protocol MTUs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This API message sets the L3 payload MTU (not including the L2 header)
+and the per-protocol MTUs. E.g. on Ethernet it is the maximum size of
+the Ethernet payload. If a value is left as 0, then the default is
+picked from VNET_MTU_L3.
+
+::
+
+   autoreply define sw_interface_set_mtu
+   {
+     u32 client_index;
+     u32 context;
+     u32 sw_if_index;
+     /* $$$$ Replace with enum */
+     u32 mtu[4]; /* 0 - L3, 1 - IP4, 2 - IP6, 3 - MPLS */
+   };
+
+Get interface MTU
+^^^^^^^^^^^^^^^^^
+
+The various MTUs on an interface can be queried with the
+sw_interface_dump/sw_interface_details calls.
+
+::
+
+   define sw_interface_details
+   {
+     /* MTU */
+     u16 link_mtu;
+
+     /* Per protocol MTUs */
+     u32 mtu[4]; /* 0 - L3, 1 - IP4, 2 - IP6, 3 - MPLS */
+   };
+
+MTU CLI
+-------
+
+::
+
+   set interface mtu [packet|ip4|ip6|mpls] <value> <interface>
diff --git a/src/vnet/span/span_doc.md b/src/vnet/span/span_doc.md
deleted file mode 100644
index 9f1db0a6c90..00000000000
--- a/src/vnet/span/span_doc.md
+++ /dev/null
@@ -1,65 +0,0 @@
-# VPP SPAN implementation {#span_doc}
-
-This is a memo intended to contain documentation of the VPP SPAN implementation.
-Everything that is not directly obvious should come here.
-
-
-## Switched Port Analyzer (SPAN)
-Port mirroring is used on a network switch to send a copy of network packets seen on one switch port to a network monitoring connection on another switch port.
-Can be used by network engineers or administrators to measure performance, analyze and debug data or diagnose errors on a network. - -### RX traffic node -There is one static node to mirror incoming packets. -* span-input: Creates a copy of incoming buffer due to incoming buffers can be reused internally. - -Chaining: dpdk-input -> span-input -> -* original buffer is sent to ethernet-input for processing -* buffer copy is sent to interface-output - -### Configuration -SPAN supports the following CLI configuration commands: - -#### Enable/Disable SPAN (CLI) - set interface span <if-name> [disable | destination <if-name>] - -<if-name>: mirrored interface name -destination <if-name>: monitoring interface name -disable: delete mirroring - -#### Enable/Disable SPAN (API) -SPAN supports the following API configuration command: - sw_interface_span_enable_disable src GigabitEthernet0/8/0 dst GigabitEthernet0/9/0 - sw_interface_span_enable_disable src_sw_if_index 1 dst_sw_if_index 2 - -src/src_sw_if_index: mirrored interface name -dst/dst_sw_if_index: monitoring interface name - -#### Remove SPAN entry (API) -SPAN supports the following API configuration command: - sw_interface_span_enable_disable src_sw_if_index 1 dst_sw_if_index 2 disable - -src_sw_if_index: mirrored interface name -dst_sw_if_index: monitoring interface name - -### Configuration example - -Mirror all packets on interface GigabitEthernet0/10/0 to interface GigabitEthernet0/11/0. - -Configure IPv4 addresses on mirrored interface: -set interface ip address GigabitEthernet0/10/0 192.168.1.13/24 -set interface state GigabitEthernet0/10/0 up - -Configure IPv4 addresses on monitoring interface: -set interface ip address GigabitEthernet0/11/0 192.168.2.13/24 -set interface state GigabitEthernet0/11/0 up - -Configure SPAN -set span src GigabitEthernet0/10/0 dst GigabitEthernet0/11/0 - -### Operational data - -Active SPAN mirroring CLI show command: - show interfaces span - -Active SPAN mirroring API dump command: - sw_interface_span_dump diff --git a/src/vnet/span/span_doc.rst b/src/vnet/span/span_doc.rst new file mode 100644 index 00000000000..f529fb36eb4 --- /dev/null +++ b/src/vnet/span/span_doc.rst @@ -0,0 +1,84 @@ +.. _span_doc: + +Switched Port Analyzer +====================== + +This is a memo intended to contain documentation of the VPP SPAN +implementation. Everything that is not directly obvious should come +here. + +Port mirroring is used on a network switch to send a copy of network +packets seen on one switch port to a network monitoring connection on +another switch port. Can be used by network engineers or administrators +to measure performance, analyze and debug data or diagnose errors on a +network. + +RX traffic node +~~~~~~~~~~~~~~~ + +There is one static node to mirror incoming packets. \* span-input: +Creates a copy of incoming buffer due to incoming buffers can be reused +internally. 
+ +Chaining: dpdk-input -> span-input -> \* original buffer is sent to +ethernet-input for processing \* buffer copy is sent to interface-output + +Configuration +~~~~~~~~~~~~~ + +SPAN supports the following CLI configuration commands: + +Enable/Disable SPAN (CLI) +^^^^^^^^^^^^^^^^^^^^^^^^^ + +:: + + set interface span <if-name> [disable | destination <if-name>] + +: mirrored interface name destination : monitoring interface name +disable: delete mirroring + +Enable/Disable SPAN (API) +^^^^^^^^^^^^^^^^^^^^^^^^^ + +SPAN supports the following API configuration command: +sw_interface_span_enable_disable src GigabitEthernet0/8/0 dst +GigabitEthernet0/9/0 sw_interface_span_enable_disable src_sw_if_index 1 +dst_sw_if_index 2 + +src/src_sw_if_index: mirrored interface name dst/dst_sw_if_index: +monitoring interface name + +Remove SPAN entry (API) +^^^^^^^^^^^^^^^^^^^^^^^ + +SPAN supports the following API configuration command: +sw_interface_span_enable_disable src_sw_if_index 1 dst_sw_if_index 2 +disable + +src_sw_if_index: mirrored interface name dst_sw_if_index: monitoring +interface name + +Configuration example +~~~~~~~~~~~~~~~~~~~~~ + +Mirror all packets on interface GigabitEthernet0/10/0 to interface +GigabitEthernet0/11/0. + +Configure IPv4 addresses on mirrored interface: set interface ip address +GigabitEthernet0/10/0 192.168.1.13/24 set interface state +GigabitEthernet0/10/0 up + +Configure IPv4 addresses on monitoring interface: set interface ip +address GigabitEthernet0/11/0 192.168.2.13/24 set interface state +GigabitEthernet0/11/0 up + +Configure SPAN set span src GigabitEthernet0/10/0 dst +GigabitEthernet0/11/0 + +Operational data +~~~~~~~~~~~~~~~~ + +Active SPAN mirroring CLI show command: show interfaces span + +Active SPAN mirroring API dump command: sw_interface_span_dump diff --git a/src/vnet/srmpls/sr_doc.md b/src/vnet/srmpls/sr_doc.md deleted file mode 100644 index 29110ec8c41..00000000000 --- a/src/vnet/srmpls/sr_doc.md +++ /dev/null @@ -1,121 +0,0 @@ -# SR-MPLS: Segment Routing for MPLS {#srmpls_doc} - -This is a memo intended to contain documentation of the VPP SR-MPLS implementation. -Everything that is not directly obvious should come here. -For any feedback on content that should be explained please mailto:pcamaril@cisco.com - -## Segment Routing - -Segment routing is a network technology focused on addressing the limitations of existing IP and Multiprotocol Label Switching (MPLS) networks in terms of simplicity, scale, and ease of operation. It is a foundation for application engineered routing as it prepares the networks for new business models where applications can control the network behavior. - -Segment routing seeks the right balance between distributed intelligence and centralized optimization and programming. It was built for the software-defined networking (SDN) era. - -Segment routing enhances packet forwarding behavior by enabling a network to transport unicast packets through a specific forwarding path, different from the normal path that a packet usually takes (IGP shortest path or BGP best path). This capability benefits many use cases, and one can build those specific paths based on application requirements. - -Segment routing uses the source routing paradigm. A node, usually a router but also a switch, a trusted server, or a virtual forwarder running on a hypervisor, steers a packet through an ordered list of instructions, called segments. A segment can represent any instruction, topological or service-based. 
A segment can have a local semantic to a segment-routing node or global within a segment-routing network. Segment routing allows an operator to enforce a flow through any topological path and service chain while maintaining per-flow state only at the ingress node to the segment-routing network. Segment routing also supports equal-cost multipath (ECMP) by design. - -Segment routing can operate with either an MPLS or an IPv6 data plane. All the currently available MPLS services, such as Layer 3 VPN (L3VPN), L2VPN (Virtual Private Wire Service [VPWS], Virtual Private LAN Services [VPLS], Ethernet VPN [E-VPN], and Provider Backbone Bridging Ethernet VPN [PBB-EVPN]), can run on top of a segment-routing transport network. - -**The implementation of Segment Routing in VPP covers both the IPv6 data plane (SRv6) as well as the MPLS data plane (SR-MPLS). This page contains the SR-MPLS documentation.** - -## Segment Routing terminology - -* SegmentID (SID): is an MPLS label. -* Segment List (SL) (SID List): is the sequence of SIDs that the packet will traverse. -* SR Policy: is a set of candidate paths (SID list+weight). An SR policy is uniquely identified by its Binding SID and associated with a weighted set of Segment Lists. In case several SID lists are defined, traffic steered into the policy is unevenly load-balanced among them according to their respective weights. -* BindingSID: a BindingSID is a SID (only one) associated one-one with an SR Policy. If a packet arrives with MPLS label corresponding to a BindingSID, then the SR policy will be applied to such packet. (BindingSID is popped first.) - -## SR-MPLS features in VPP - -The SR-MPLS implementation is focused on the SR policies, as well on its steering. Others SR-MPLS features, such as for example AdjSIDs, can be achieved using the regular VPP MPLS implementation. - -The <a href="https://datatracker.ietf.org/doc/draft-filsfils-spring-segment-routing-policy/">Segment Routing Policy (*draft-filsfils-spring-segment-routing-policy*)</a> defines SR Policies. - -## Creating a SR Policy - -An SR Policy is defined by a Binding SID and a weighted set of Segment Lists. - -A new SR policy is created with a first SID list using: - - sr mpls policy add bsid 40001 next 16001 next 16002 next 16003 (weight 5) - -* The weight parameter is only used if more than one SID list is associated with the policy. - -An SR policy is deleted with: - - sr mpls policy del bsid 40001 - -The existing SR policies are listed with: - - show sr mpls policies - -### Adding/Removing SID Lists from an SR policy - -An additional SID list is associated with an existing SR policy with: - - sr mpls policy mod bsid 40001 add sl next 16001 next 16002 next 16003 (weight 3) - -Conversely, a SID list can be removed from an SR policy with: - - sr mpls policy mod bsid 4001 del sl index 1 - -Note that this CLI cannot be used to remove the last SID list of a policy. Instead the SR policy delete CLI must be used. - -The weight of a SID list can also be modified with: - - sr mpls policy mod bsid 40001 mod sl index 1 weight 4 - -### SR Policies: Spray policies - -Spray policies are a specific type of SR policies where the packet is replicated on all the SID lists, rather than load-balanced among them. - -SID list weights are ignored with this type of policies. 
- -A Spray policy is instantiated by appending the keyword **spray** to a regular SR-MPLS policy command, as in: - - sr mpls policy add bsid 40002 next 16001 next 16002 next 16003 spray - -Spray policies are used for removing multicast state from a network core domain, and instead send a linear unicast copy to every access node. The last SID in each list accesses the multicast tree within the access node. - -## Steering packets into a SR Policy - -Segment Routing supports three methos of steering traffic into an SR policy. - -### Local steering - -In this variant incoming packets match a routing policy which directs them on a local SR policy. - -In order to achieve this behavior the user needs to create an 'sr steering policy via sr policy bsid'. - - sr mpls steer l3 2001::/64 via sr policy bsid 40001 - sr mpls steer l3 2001::/64 via sr policy bsid 40001 fib-table 3 - sr mpls steer l3 10.0.0.0/16 via sr policy bsid 40001 - sr mpls steer l3 10.0.0.0/16 via sr policy bsid 40001 vpn-label 500 - -### Remote steering - -In this variant incoming packets have an active SID matching a local BSID at the head-end. - -In order to achieve this behavior the packets should simply arrive with an active SID equal to the Binding SID of a locally instantiated SR policy. - -### Automated steering - -In this variant incoming packets match a BGP/Service route which recurses on the BSID of a local policy. - -In order to achieve this behavior the user first needs to color the SR policies. He can do so by using the CLI: - - sr mpls policy te bsid xxxxx endpoint x.x.x.x color 12341234 - -Notice that an SR policy can have a single endpoint and a single color. Notice that the *endpoint* value is an IP46 address and the color a u32. - - -Then, for any BGP/Service route the user has to use the API to steer prefixes: - - sr steer l3 2001::/64 via next-hop 2001::1 color 1234 co 2 - sr steer l3 2001::/64 via next-hop 2001::1 color 1234 co 2 vpn-label 500 - -Notice that *co* refers to the CO-bits (values [0|1|2|3]). - -Notice also that a given prefix might be steered over several colors (same next-hop and same co-bit value). In order to add new colors just execute the API several times (or with the del parameter to delete the color). - -This variant is meant to be used in conjunction with a control plane agent that uses the underlying binary API bindings of *sr_mpls_steering_policy_add*/*sr_mpls_steering_policy_del* for any BGP service route received.
\ No newline at end of file diff --git a/src/vnet/srmpls/sr_doc.rst b/src/vnet/srmpls/sr_doc.rst new file mode 100644 index 00000000000..ed847fa0d42 --- /dev/null +++ b/src/vnet/srmpls/sr_doc.rst @@ -0,0 +1,215 @@ +.. _srmpls_doc: + +SR-MPLS: Segment Routing for MPLS +================================= + +This is a memo intended to contain documentation of the VPP SR-MPLS +implementation. Everything that is not directly obvious should come +here. For any feedback on content that should be explained please +mailto:pcamaril@cisco.com + +Segment Routing +--------------- + +Segment routing is a network technology focused on addressing the +limitations of existing IP and Multiprotocol Label Switching (MPLS) +networks in terms of simplicity, scale, and ease of operation. It is a +foundation for application engineered routing as it prepares the +networks for new business models where applications can control the +network behavior. + +Segment routing seeks the right balance between distributed intelligence +and centralized optimization and programming. It was built for the +software-defined networking (SDN) era. + +Segment routing enhances packet forwarding behavior by enabling a +network to transport unicast packets through a specific forwarding path, +different from the normal path that a packet usually takes (IGP shortest +path or BGP best path). This capability benefits many use cases, and one +can build those specific paths based on application requirements. + +Segment routing uses the source routing paradigm. A node, usually a +router but also a switch, a trusted server, or a virtual forwarder +running on a hypervisor, steers a packet through an ordered list of +instructions, called segments. A segment can represent any instruction, +topological or service-based. A segment can have a local semantic to a +segment-routing node or global within a segment-routing network. Segment +routing allows an operator to enforce a flow through any topological +path and service chain while maintaining per-flow state only at the +ingress node to the segment-routing network. Segment routing also +supports equal-cost multipath (ECMP) by design. + +Segment routing can operate with either an MPLS or an IPv6 data plane. +All the currently available MPLS services, such as Layer 3 VPN (L3VPN), +L2VPN (Virtual Private Wire Service [VPWS], Virtual Private LAN Services +[VPLS], Ethernet VPN [E-VPN], and Provider Backbone Bridging Ethernet +VPN [PBB-EVPN]), can run on top of a segment-routing transport network. + +**The implementation of Segment Routing in VPP covers both the IPv6 data +plane (SRv6) as well as the MPLS data plane (SR-MPLS). This page +contains the SR-MPLS documentation.** + +Segment Routing terminology +--------------------------- + +- SegmentID (SID): is an MPLS label. +- Segment List (SL) (SID List): is the sequence of SIDs that the packet + will traverse. +- SR Policy: is a set of candidate paths (SID list+weight). An SR + policy is uniquely identified by its Binding SID and associated with + a weighted set of Segment Lists. In case several SID lists are + defined, traffic steered into the policy is unevenly load-balanced + among them according to their respective weights. +- BindingSID: a BindingSID is a SID (only one) associated one-one with + an SR Policy. If a packet arrives with MPLS label corresponding to a + BindingSID, then the SR policy will be applied to such packet. + (BindingSID is popped first.) 
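+
+As a quick, minimal illustration of these terms using the CLI described
+later in this document: the command below creates an SR policy whose
+Binding SID is MPLS label 40001 and whose single Segment List is
+<16001, 16002, 16003>; a packet arriving with top label 40001 has the
+BindingSID popped and is then steered over that Segment List.
+
+::
+
+   sr mpls policy add bsid 40001 next 16001 next 16002 next 16003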
+ +SR-MPLS features in VPP +----------------------- + +The SR-MPLS implementation is focused on the SR policies, as well on its +steering. Others SR-MPLS features, such as for example AdjSIDs, can be +achieved using the regular VPP MPLS implementation. + +The Segment Routing Policy +(*draft-filsfils-spring-segment-routing-policy*) defines SR Policies. + +Creating a SR Policy +-------------------- + +An SR Policy is defined by a Binding SID and a weighted set of Segment +Lists. + +A new SR policy is created with a first SID list using: + +:: + + sr mpls policy add bsid 40001 next 16001 next 16002 next 16003 (weight 5) + +- The weight parameter is only used if more than one SID list is + associated with the policy. + +An SR policy is deleted with: + +:: + + sr mpls policy del bsid 40001 + +The existing SR policies are listed with: + +:: + + show sr mpls policies + +Adding/Removing SID Lists from an SR policy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +An additional SID list is associated with an existing SR policy with: + +:: + + sr mpls policy mod bsid 40001 add sl next 16001 next 16002 next 16003 (weight 3) + +Conversely, a SID list can be removed from an SR policy with: + +:: + + sr mpls policy mod bsid 4001 del sl index 1 + +Note that this CLI cannot be used to remove the last SID list of a +policy. Instead the SR policy delete CLI must be used. + +The weight of a SID list can also be modified with: + +:: + + sr mpls policy mod bsid 40001 mod sl index 1 weight 4 + +SR Policies: Spray policies +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Spray policies are a specific type of SR policies where the packet is +replicated on all the SID lists, rather than load-balanced among them. + +SID list weights are ignored with this type of policies. + +A Spray policy is instantiated by appending the keyword **spray** to a +regular SR-MPLS policy command, as in: + +:: + + sr mpls policy add bsid 40002 next 16001 next 16002 next 16003 spray + +Spray policies are used for removing multicast state from a network core +domain, and instead send a linear unicast copy to every access node. The +last SID in each list accesses the multicast tree within the access +node. + +Steering packets into a SR Policy +--------------------------------- + +Segment Routing supports three methods of steering traffic into an SR +policy. + +Local steering +~~~~~~~~~~~~~~ + +In this variant incoming packets match a routing policy which directs +them on a local SR policy. + +In order to achieve this behavior the user needs to create an ‘sr +steering policy via sr policy bsid’. + +:: + + sr mpls steer l3 2001::/64 via sr policy bsid 40001 + sr mpls steer l3 2001::/64 via sr policy bsid 40001 fib-table 3 + sr mpls steer l3 10.0.0.0/16 via sr policy bsid 40001 + sr mpls steer l3 10.0.0.0/16 via sr policy bsid 40001 vpn-label 500 + +Remote steering +~~~~~~~~~~~~~~~ + +In this variant incoming packets have an active SID matching a local +BSID at the head-end. + +In order to achieve this behavior the packets should simply arrive with +an active SID equal to the Binding SID of a locally instantiated SR +policy. + +Automated steering +~~~~~~~~~~~~~~~~~~ + +In this variant incoming packets match a BGP/Service route which +recurses on the BSID of a local policy. + +In order to achieve this behavior the user first needs to color the SR +policies. He can do so by using the CLI: + +:: + + sr mpls policy te bsid xxxxx endpoint x.x.x.x color 12341234 + +Notice that an SR policy can have a single endpoint and a single color. 
+Notice that the *endpoint* value is an IP46 address and the color a u32. + +Then, for any BGP/Service route the user has to use the API to steer +prefixes: + +:: + + sr steer l3 2001::/64 via next-hop 2001::1 color 1234 co 2 + sr steer l3 2001::/64 via next-hop 2001::1 color 1234 co 2 vpn-label 500 + +Notice that *co* refers to the CO-bits (values [0|1|2|3]). + +Notice also that a given prefix might be steered over several colors +(same next-hop and same co-bit value). In order to add new colors just +execute the API several times (or with the del parameter to delete the +color). + +This variant is meant to be used in conjunction with a control plane +agent that uses the underlying binary API bindings of +*sr_mpls_steering_policy_add*/*sr_mpls_steering_policy_del* for any BGP +service route received. diff --git a/src/vnet/srv6/sr_doc.md b/src/vnet/srv6/sr_doc.md deleted file mode 100644 index c80a0fc18f7..00000000000 --- a/src/vnet/srv6/sr_doc.md +++ /dev/null @@ -1,63 +0,0 @@ -# SRv6: Segment Routing for IPv6 {#srv6_doc} - -This is a memo intended to contain documentation of the VPP SRv6 implementation. -Everything that is not directly obvious should come here. -For any feedback on content that should be explained please mailto:pcamaril@cisco.com - -## Segment Routing - -Segment routing is a network technology focused on addressing the limitations of existing IP and Multiprotocol Label Switching (MPLS) networks in terms of simplicity, scale, and ease of operation. It is a foundation for application engineered routing as it prepares the networks for new business models where applications can control the network behavior. - -Segment routing seeks the right balance between distributed intelligence and centralized optimization and programming. It was built for the software-defined networking (SDN) era. - -Segment routing enhances packet forwarding behavior by enabling a network to transport unicast packets through a specific forwarding path, different from the normal path that a packet usually takes (IGP shortest path or BGP best path). This capability benefits many use cases, and one can build those specific paths based on application requirements. - -Segment routing uses the source routing paradigm. A node, usually a router but also a switch, a trusted server, or a virtual forwarder running on a hypervisor, steers a packet through an ordered list of instructions, called segments. A segment can represent any instruction, topological or service-based. A segment can have a local semantic to a segment-routing node or global within a segment-routing network. Segment routing allows an operator to enforce a flow through any topological path and service chain while maintaining per-flow state only at the ingress node to the segment-routing network. Segment routing also supports equal-cost multipath (ECMP) by design. - -Segment routing can operate with either an MPLS or an IPv6 data plane. All the currently available MPLS services, such as Layer 3 VPN (L3VPN), L2VPN (Virtual Private Wire Service [VPWS], Virtual Private LAN Services [VPLS], Ethernet VPN [E-VPN], and Provider Backbone Bridging Ethernet VPN [PBB-EVPN]), can run on top of a segment-routing transport network. - -**The implementation of Segment Routing in VPP covers both the IPv6 data plane (SRv6) as well as the MPLS data plane (SR-MPLS). This page contains the SRv6 documentation.** - -## Segment Routing terminology - -* Segment Routing Header (SRH): IPv6 routing extension header of type 'Segment Routing'. 
(draft-ietf-6man-segment-routing-header-05) -* SegmentID (SID): is an IPv6 address. -* Segment List (SL) (SID List): is the sequence of SIDs that the packet will traverse. -* SR Policy: defines the SRH that will be applied to a packet. A packet steered into an SR policy may either receive the SRH by IPv6 header encapsulation (as recommended in draft-ietf-6man-rfc2460bis) or it could be inserted within an existing IPv6 header. An SR policy is uniquely identified by its Binding SID and associated with a weighted set of Segment Lists. In case several SID lists are defined, traffic steered into the policy is unevenly load-balanced among them according to their respective weights. -* Local SID: is a SID associated with a processing function on the local node, which may go from advancing to the next SID in the SRH, to complex user-defined behaviors. When a FIB lookup, either in the main FIB or in a specific VRF, returns a match on a local SID, the associated function is performed. -* BindingSID: a BindingSID is a SID (only one) associated one-one with an SR Policy. If a packet arrives with an IPv6 DA corresponding to a BindingSID, then the SR policy will be applied to such packet. - -## SRv6 Features in VPP - -The <a href="https://datatracker.ietf.org/doc/draft-filsfils-spring-srv6-network-programming/">SRv6 Network Programming (*draft-filsfils-spring-srv6-network-programming*)</a> defines the SRv6 architecture. - -VPP supports the following SRv6 LocalSID functions: End, End.X, End.DX6, End.DT6, End.DX4, End.DT4, End.DX2, End.B6, End.B6.Encaps. - -For further information and how to configure each specific function: @subpage srv6_localsid_doc - - -The <a href="https://datatracker.ietf.org/doc/draft-filsfils-spring-segment-routing-policy/">Segment Routing Policy (*draft-filsfils-spring-segment-routing-policy*)</a> defines SR Policies. - -VPP supports SRv6 Policies with T.Insert and T.Encaps behaviors. - -For further information on how to create SR Policies: @subpage srv6_policy_doc - -For further information on how to steer traffic into SR Policies: @subpage srv6_steering_doc - -## SRv6 LocalSID development framework - -One of the *'key'* concepts about SRv6 is network programmability. This is why an SRv6 LocalSID is associated with an specific function. - -However, the trully way to enable network programmability is allowing any developer **easily** create his own SRv6 LocalSID function. That is the reason why we have added some API calls such that any developer can code his own SRv6 LocalSID behaviors as plugins an add them to the running SRv6 code. - -The principle is that the developer only codes the behavior -the graph node-. However all the FIB handling, SR LocalSID instantiation and so on are done by the VPP SRv6 code. - -For more information please refer to: @subpage srv6_plugin_doc - -Available SRv6 plugins include: - -- @subpage srv6_as_plugin_doc -- @subpage srv6_ad_plugin_doc -- @subpage srv6_am_plugin_doc -- @subpage srv6_mobile_plugin_doc - diff --git a/src/vnet/srv6/sr_doc.rst b/src/vnet/srv6/sr_doc.rst new file mode 100644 index 00000000000..24501832b85 --- /dev/null +++ b/src/vnet/srv6/sr_doc.rst @@ -0,0 +1,123 @@ +.. _srv6_doc: + +SRv6: Segment Routing for IPv6 +============================== + +This is a memo intended to contain documentation of the VPP SRv6 +implementation. Everything that is not directly obvious should come +here. 
For any feedback on content that should be explained please +mailto:pcamaril@cisco.com + +Segment Routing +--------------- + +Segment routing is a network technology focused on addressing the +limitations of existing IP and Multiprotocol Label Switching (MPLS) +networks in terms of simplicity, scale, and ease of operation. It is a +foundation for application engineered routing as it prepares the +networks for new business models where applications can control the +network behavior. + +Segment routing seeks the right balance between distributed intelligence +and centralized optimization and programming. It was built for the +software-defined networking (SDN) era. + +Segment routing enhances packet forwarding behavior by enabling a +network to transport unicast packets through a specific forwarding path, +different from the normal path that a packet usually takes (IGP shortest +path or BGP best path). This capability benefits many use cases, and one +can build those specific paths based on application requirements. + +Segment routing uses the source routing paradigm. A node, usually a +router but also a switch, a trusted server, or a virtual forwarder +running on a hypervisor, steers a packet through an ordered list of +instructions, called segments. A segment can represent any instruction, +topological or service-based. A segment can have a local semantic to a +segment-routing node or global within a segment-routing network. Segment +routing allows an operator to enforce a flow through any topological +path and service chain while maintaining per-flow state only at the +ingress node to the segment-routing network. Segment routing also +supports equal-cost multipath (ECMP) by design. + +Segment routing can operate with either an MPLS or an IPv6 data plane. +All the currently available MPLS services, such as Layer 3 VPN (L3VPN), +L2VPN (Virtual Private Wire Service [VPWS], Virtual Private LAN Services +[VPLS], Ethernet VPN [E-VPN], and Provider Backbone Bridging Ethernet +VPN [PBB-EVPN]), can run on top of a segment-routing transport network. + +**The implementation of Segment Routing in VPP covers both the IPv6 data +plane (SRv6) as well as the MPLS data plane (SR-MPLS). This page +contains the SRv6 documentation.** + +Segment Routing terminology +--------------------------- + +- Segment Routing Header (SRH): IPv6 routing extension header of type + ‘Segment Routing’. (draft-ietf-6man-segment-routing-header-05) +- SegmentID (SID): is an IPv6 address. +- Segment List (SL) (SID List): is the sequence of SIDs that the packet + will traverse. +- SR Policy: defines the SRH that will be applied to a packet. A packet + steered into an SR policy may either receive the SRH by IPv6 header + encapsulation (as recommended in draft-ietf-6man-rfc2460bis) or it + could be inserted within an existing IPv6 header. An SR policy is + uniquely identified by its Binding SID and associated with a weighted + set of Segment Lists. In case several SID lists are defined, traffic + steered into the policy is unevenly load-balanced among them + according to their respective weights. +- Local SID: is a SID associated with a processing function on the + local node, which may go from advancing to the next SID in the SRH, + to complex user-defined behaviors. When a FIB lookup, either in the + main FIB or in a specific VRF, returns a match on a local SID, the + associated function is performed. +- BindingSID: a BindingSID is a SID (only one) associated one-one with + an SR Policy. 
If a packet arrives with an IPv6 DA corresponding to a + BindingSID, then the SR policy will be applied to such packet. + +SRv6 Features in VPP +-------------------- + +The SRv6 Network Programming +(*draft-filsfils-spring-srv6-network-programming*) defines the SRv6 +architecture. + +VPP supports the following SRv6 LocalSID functions: End, End.X, End.DX6, +End.DT6, End.DX4, End.DT4, End.DX2, End.B6, End.B6.Encaps. + +For further information and how to configure each specific function: +:ref:`srv6_localsid_doc` + +The Segment Routing Policy +(*draft-filsfils-spring-segment-routing-policy*) defines SR Policies. + +VPP supports SRv6 Policies with T.Insert and T.Encaps behaviors. + +For further information on how to create SR Policies: :ref:`srv6_policy_doc` + +For further information on how to steer traffic into SR Policies: +:ref:`srv6_steering_doc` + +SRv6 LocalSID development framework +----------------------------------- + +One of the *‘key’* concepts about SRv6 is network programmability. This +is why an SRv6 LocalSID is associated with an specific function. + +However, the true way to enable network programmability is allowing +any developer **easily** create his own SRv6 LocalSID function. That is +the reason why we have added some API calls such that any developer can +code his own SRv6 LocalSID behaviors as plugins an add them to the +running SRv6 code. + +The principle is that the developer only codes the behavior -the graph +node-. However all the FIB handling, SR LocalSID instantiation and so on +are done by the VPP SRv6 code. + +For more information please refer to: :ref:`srv6_plugin_doc` + +Available SRv6 plugins include: + +- :ref:`srv6_as_plugin_doc` +- :ref:`srv6_ad_plugin_doc` +- :ref:`srv6_am_plugin_doc` +- :ref:`srv6_mobile_plugin_doc` diff --git a/src/vnet/srv6/sr_localsid.md b/src/vnet/srv6/sr_localsid.md deleted file mode 100644 index fbc7ef827e6..00000000000 --- a/src/vnet/srv6/sr_localsid.md +++ /dev/null @@ -1,58 +0,0 @@ -# SR LocalSIDs {#srv6_localsid_doc} - -A local SID is associated to a Segment Routing behavior -or function- on the current node. - -The most basic behavior is called END. It simply activates the next SID in the current packet, by decrementing the Segments Left value and updating the IPv6 DA. - -A local END SID is instantiated using the following CLI: - - sr localsid (del) address XX::YY behavior end - -This creates a new entry in the main FIB for IPv6 address XX::YY. All packets whose IPv6 DA matches this FIB entry are redirected to the sr-localsid node, where they are processed as described above. - -Other examples of local SIDs are the following: - - sr localsid (del) address XX::YY behavior end - sr localsid (del) address XX::YY behavior end.x GE0/1/0 2001::a - sr localsid (del) address XX::YY behavior end.dx6 GE0/1/0 2001::a - sr localsid (del) address XX::YY behavior end.dx4 GE0/1/0 10.0.0.1 - sr localsid (del) address XX::YY behavior end.dx2 GigabitE0/11/0 - sr localsid (del) address XX::YY behavior end.dt6 5 - sr localsid (del) address XX::YY behavior end.dt6 5 - -Note that all of these behaviors match the definitions of the SRv6 architecture (*draft-filsfils-spring-srv6-network-programming*). Please refer to this document for a detailed description of each behavior. 
- -Note also that you can configure the PSP flavor of the End and End.X behaviors by typing: - - sr localsid (del) address XX::YY behavior end psp - sr localsid (del) address XX::YY behavior end.x GE0/1/0 2001::a psp - -Help on the available local SID behaviors and their usage can be obtained with: - - help sr localsid - -Alternatively they can be obtained using. - - show sr localsids behavior - -The difference in between those two commands is that the first one will only display the SR LocalSID behaviors that are built-in VPP, while the latter will display those behaviors plus the ones added with the SR LocalSID Development Framework. - - -VPP keeps a 'My LocalSID Table' where it stores all the SR local SIDs instantiated as well as their parameters. Every time a new local SID is instantiated, a new entry is added to this table. In addition, counters for correctly and incorrectly processed traffic are maintained for each local SID. The counters store both the number of packets and bytes. - -The contents of the 'My LocalSID Table' is shown with: - - vpp# show sr localsid - SRv6 - My LocalSID Table: - ========================= - Address: c3::1 - Behavior: DX6 (Endpoint with decapsulation and IPv6 cross-connect) - Iface: GigabitEthernet0/5/0 - Next hop: b:c3::b - Good traffic: [51277 packets : 5332808 bytes] - Bad traffic: [0 packets : 0 bytes] - -------------------- - -The traffic counters can be reset with: - - vpp# clear sr localsid-counters diff --git a/src/vnet/srv6/sr_localsid.rst b/src/vnet/srv6/sr_localsid.rst new file mode 100644 index 00000000000..cf042a847b4 --- /dev/null +++ b/src/vnet/srv6/sr_localsid.rst @@ -0,0 +1,90 @@ +.. _srv6_localsid_doc: + +SR LocalSIDs +============ + +A local SID is associated to a Segment Routing behavior -or function- on +the current node. + +The most basic behavior is called END. It simply activates the next SID +in the current packet, by decrementing the Segments Left value and +updating the IPv6 DA. + +A local END SID is instantiated using the following CLI: + +:: + + sr localsid (del) address XX::YY behavior end + +This creates a new entry in the main FIB for IPv6 address XX::YY. All +packets whose IPv6 DA matches this FIB entry are redirected to the +sr-localsid node, where they are processed as described above. + +Other examples of local SIDs are the following: + +:: + + sr localsid (del) address XX::YY behavior end + sr localsid (del) address XX::YY behavior end.x GE0/1/0 2001::a + sr localsid (del) address XX::YY behavior end.dx6 GE0/1/0 2001::a + sr localsid (del) address XX::YY behavior end.dx4 GE0/1/0 10.0.0.1 + sr localsid (del) address XX::YY behavior end.dx2 GigabitE0/11/0 + sr localsid (del) address XX::YY behavior end.dt6 5 + sr localsid (del) address XX::YY behavior end.dt6 5 + +Note that all of these behaviors match the definitions of the SRv6 +architecture (*draft-filsfils-spring-srv6-network-programming*). Please +refer to this document for a detailed description of each behavior. + +Note also that you can configure the PSP flavor of the End and End.X +behaviors by typing: + +:: + + sr localsid (del) address XX::YY behavior end psp + sr localsid (del) address XX::YY behavior end.x GE0/1/0 2001::a psp + +Help on the available local SID behaviors and their usage can be +obtained with: + +:: + + help sr localsid + +Alternatively they can be obtained using. 
+ +:: + + show sr localsids behavior + +The difference in between those two commands is that the first one will +only display the SR LocalSID behaviors that are built-in VPP, while the +latter will display those behaviors plus the ones added with the SR +LocalSID Development Framework. + +VPP keeps a ‘My LocalSID Table’ where it stores all the SR local SIDs +instantiated as well as their parameters. Every time a new local SID is +instantiated, a new entry is added to this table. In addition, counters +for correctly and incorrectly processed traffic are maintained for each +local SID. The counters store both the number of packets and bytes. + +The contents of the ‘My LocalSID Table’ is shown with: + +:: + + vpp# show sr localsid + SRv6 - My LocalSID Table: + ========================= + Address: c3::1 + Behavior: DX6 (Endpoint with decapsulation and IPv6 cross-connect) + Iface: GigabitEthernet0/5/0 + Next hop: b:c3::b + Good traffic: [51277 packets : 5332808 bytes] + Bad traffic: [0 packets : 0 bytes] + -------------------- + +The traffic counters can be reset with: + +:: + + vpp# clear sr localsid-counters diff --git a/src/vnet/srv6/sr_policy.md b/src/vnet/srv6/sr_policy.md deleted file mode 100644 index 2a7eb4c9870..00000000000 --- a/src/vnet/srv6/sr_policy.md +++ /dev/null @@ -1,60 +0,0 @@ -# Creating a SR Policy {#srv6_policy_doc} - -An SR Policy is defined by a Binding SID and a weighted set of Segment Lists. - -A new SR policy is created with a first SID list using: - - sr policy add bsid 2001::1 next A1:: next B1:: next C1:: (weight 5) (fib-table 3) - -* The weight parameter is only used if more than one SID list is associated with the policy. -* The fib-table parameter specifies in which table (VRF) the Binding SID is to be installed. - -An SR policy is deleted with: - - sr policy del bsid 2001::1 - sr policy del index 1 - -The existing SR policies are listed with: - - show sr policies - -## Adding/Removing SID Lists from an SR policy - -An additional SID list is associated with an existing SR policy with: - - sr policy mod bsid 2001::1 add sl next A2:: next B2:: next C2:: (weight 3) - sr policy mod index 3 add sl next A2:: next B2:: next C2:: (weight 3) - -Conversely, a SID list can be removed from an SR policy with: - - sr policy mod bsid 2001::1 del sl index 1 - sr policy mod index 3 del sl index 1 - -Note that this cannot be used to remove the last SID list of a policy. - -The weight of a SID list can also be modified with: - - sr policy mod bsid 2001::1 mod sl index 1 weight 4 - sr policy mod index 3 mod sl index 1 weight 4 - -## SR Policies: Spray policies - -Spray policies are a specific type of SR policies where the packet is replicated on all the SID lists, rather than load-balanced among them. - -SID list weights are ignored with this type of policies. - -A Spray policy is instantiated by appending the keyword **spray** to a regular SR policy command, as in: - - sr policy add bsid 2001::1 next A1:: next B1:: next C1:: spray - -Spray policies are used for removing multicast state from a network core domain, and instead send a linear unicast copy to every access node. The last SID in each list accesses the multicast tree within the access node. - -## Encapsulation SR policies - -In case the user decides to create an SR policy an IPv6 Source Address must be specified for the encapsulated traffic. In order to do so the user might use the following command: - - set sr encaps source addr XXXX::YYYY - -Default hop-limit for the encapsulating IPv6 header is 64. 
It is possible to specify custom hop-limit value from 1 to 255 using this command: - - set sr encaps hop-limit N diff --git a/src/vnet/srv6/sr_policy.rst b/src/vnet/srv6/sr_policy.rst new file mode 100644 index 00000000000..50cc19bfb14 --- /dev/null +++ b/src/vnet/srv6/sr_policy.rst @@ -0,0 +1,96 @@ +.. _srv6_policy_doc: + +Creating a SR Policy +==================== + +An SR Policy is defined by a Binding SID and a weighted set of Segment +Lists. + +A new SR policy is created with a first SID list using: + +:: + + sr policy add bsid 2001::1 next A1:: next B1:: next C1:: (weight 5) (fib-table 3) + +- The weight parameter is only used if more than one SID list is + associated with the policy. +- The fib-table parameter specifies in which table (VRF) the Binding + SID is to be installed. + +An SR policy is deleted with: + +:: + + sr policy del bsid 2001::1 + sr policy del index 1 + +The existing SR policies are listed with: + +:: + + show sr policies + +Adding/Removing SID Lists from an SR policy +------------------------------------------- + +An additional SID list is associated with an existing SR policy with: + +:: + + sr policy mod bsid 2001::1 add sl next A2:: next B2:: next C2:: (weight 3) + sr policy mod index 3 add sl next A2:: next B2:: next C2:: (weight 3) + +Conversely, a SID list can be removed from an SR policy with: + +:: + + sr policy mod bsid 2001::1 del sl index 1 + sr policy mod index 3 del sl index 1 + +Note that this cannot be used to remove the last SID list of a policy. + +The weight of a SID list can also be modified with: + +:: + + sr policy mod bsid 2001::1 mod sl index 1 weight 4 + sr policy mod index 3 mod sl index 1 weight 4 + +SR Policies: Spray policies +--------------------------- + +Spray policies are a specific type of SR policies where the packet is +replicated on all the SID lists, rather than load-balanced among them. + +SID list weights are ignored with this type of policies. + +A Spray policy is instantiated by appending the keyword **spray** to a +regular SR policy command, as in: + +:: + + sr policy add bsid 2001::1 next A1:: next B1:: next C1:: spray + +Spray policies are used for removing multicast state from a network core +domain, and instead send a linear unicast copy to every access node. The +last SID in each list accesses the multicast tree within the access +node. + +Encapsulation SR policies +------------------------- + +In case the user decides to create an SR policy an IPv6 Source Address +must be specified for the encapsulated traffic. In order to do so the +user might use the following command: + +:: + + set sr encaps source addr XXXX::YYYY + +Default hop-limit for the encapsulating IPv6 header is 64. It is +possible to specify custom hop-limit value from 1 to 255 using this +command: + +:: + + set sr encaps hop-limit N diff --git a/src/vnet/srv6/sr_steering.md b/src/vnet/srv6/sr_steering.md deleted file mode 100644 index ca5cc7b6c7a..00000000000 --- a/src/vnet/srv6/sr_steering.md +++ /dev/null @@ -1,35 +0,0 @@ -# Steering packets into a SR Policy {#srv6_steering_doc} - -## steer packets uging the sr steering policy - -To steer packets in Transit into an SR policy (T.Insert, T.Encaps and T.Encaps.L2 behaviors), the user needs to create an 'sr steering policy'. 
- - sr steer l3 2001::/64 via index 1 - sr steer l3 2001::/64 via bsid cafe::1 - sr steer l3 2001::/64 via bsid cafe::1 fib-table 3 - sr steer l3 10.0.0.0/16 via bsid cafe::1 - sr steer l2 TenGE0/1/0 via bsid cafe::1 - -Disclaimer: The T.Encaps.L2 will steer L2 frames into an SR Policy. Notice that creating an SR steering policy for L2 frames will actually automatically *put the interface into promiscous mode*. - -## steer packets using the classifier - -Another way to steer packet is to use the classifier. - -First the user need to manually add the source routing node to the list of the -ip6-inacl next nodes. -Using the python api this can be donne with: - - # jsonfiles = get list of json api files - vpp = VPP(jsonfiles) - vpp.add_node_next(node_name='ip6-inacl', next_name='sr-pl-rewrite-insert') - -Below is a classifier mask filtering all the packets from the interface -TenGigabitEthernet5/0/0 on ip version and moving all ipv6 packets to the -sr-pl-rewrite-insert node (dropping the others) and applying the source routing -index 2. -In essence, this means "apply this sr policy to all the packets from this interface) - - vpp# classify table miss-next 0 current-data-flag 1 mask hex f000000000000000 skip 0 - vpp# classify session acl-hit-next 1 table-index 0 match hex 6000000000000000 action set-sr-policy-index 2 - vpp# set interface input acl intfc TenGigabitEthernet5/0/0 ip6-table 0 diff --git a/src/vnet/srv6/sr_steering.rst b/src/vnet/srv6/sr_steering.rst new file mode 100644 index 00000000000..b8a82e57550 --- /dev/null +++ b/src/vnet/srv6/sr_steering.rst @@ -0,0 +1,50 @@ +.. _srv6_steering_doc: + +Steering packets into a SR Policy +================================= + +steer packets using the sr steering policy +------------------------------------------ + +To steer packets in Transit into an SR policy (T.Insert, T.Encaps and +T.Encaps.L2 behaviors), the user needs to create an ‘sr steering +policy’. + +:: + + sr steer l3 2001::/64 via index 1 + sr steer l3 2001::/64 via bsid cafe::1 + sr steer l3 2001::/64 via bsid cafe::1 fib-table 3 + sr steer l3 10.0.0.0/16 via bsid cafe::1 + sr steer l2 TenGE0/1/0 via bsid cafe::1 + +Disclaimer: The T.Encaps.L2 will steer L2 frames into an SR Policy. +Notice that creating an SR steering policy for L2 frames will actually +automatically *put the interface into promiscous mode*. + +steer packets using the classifier +---------------------------------- + +Another way to steer packet is to use the classifier. + +First the user need to manually add the source routing node to the list +of the ip6-inacl next nodes. Using the python api this can be done +with: + +:: + + # jsonfiles = get list of json api files + vpp = VPP(jsonfiles) + vpp.add_node_next(node_name='ip6-inacl', next_name='sr-pl-rewrite-insert') + +Below is a classifier mask filtering all the packets from the interface +TenGigabitEthernet5/0/0 on ip version and moving all ipv6 packets to the +sr-pl-rewrite-insert node (dropping the others) and applying the source +routing index 2. 
In essence, this means “apply this sr policy to all the +packets from this interface) + +:: + + vpp# classify table miss-next 0 current-data-flag 1 mask hex f000000000000000 skip 0 + vpp# classify session acl-hit-next 1 table-index 0 match hex 6000000000000000 action set-sr-policy-index 2 + vpp# set interface input acl intfc TenGigabitEthernet5/0/0 ip6-table 0 diff --git a/src/vnet/syslog/sylog_doc.md b/src/vnet/syslog/sylog_doc.md deleted file mode 100644 index 0b48d4db573..00000000000 --- a/src/vnet/syslog/sylog_doc.md +++ /dev/null @@ -1,65 +0,0 @@ -# Syslog protocol support {#syslog_doc} - -VPP provides [RFC5424](https://tools.ietf.org/html/rfc5424) syslog protocol -logging, which is used to transport event messages across network. VPP -currently suports UDP transport based on -[RFC5426](https://tools.ietf.org/html/rfc5426). - -The syslog message has the following format: -* header -* structured data -* free-form message - -The header contains, priority, version, timestamp, hostname, application, -process id and message id. It is followed by structured data, which provides -a mechanism to express event data in easily parsable format. Structured data -can contain zero, one or multiple structured data elements. Structured data -element contains name-value pairs. Structured data can by followed by free-form -message. - -Following example explains how to use the internal APIs to genrate syslog -message: -```{.c} - #include <vnet/syslog/syslog.h> - - ... - - syslog_msg_t syslog_msg; - - /* Check if syslog logging is enabled */ - if (!syslog_is_enabled ()) - return; - - /* Severity filer test */ - if (syslog_severity_filter_block (severity)) - return; - - /* Initialize syslog message header */ - syslog_msg_init (&syslog_msg, facility, severity, "NAT", "SADD"); - - /* Create structured data element */ - syslog_msg_sd_init (&syslog_msg, "nsess"); - /* Add structured data element parameters (name-value pairs) */ - syslog_msg_add_sd_param (&syslog_msg, "SSUBIX", "%d", ssubix); - syslog_msg_add_sd_param (&syslog_msg, "SVLAN", "%d", svlan); - syslog_msg_add_sd_param (&syslog_msg, "IATYP", "IPv4"); - syslog_msg_add_sd_param (&syslog_msg, "ISADDR", "%U", - format_ip4_address, isaddr); - syslog_msg_add_sd_param (&syslog_msg, "ISPORT", "%d", isport); - syslog_msg_add_sd_param (&syslog_msg, "XATYP", "IPv4"); - syslog_msg_add_sd_param (&syslog_msg, "XSADDR", "%U", - format_ip4_address, xsaddr); - syslog_msg_add_sd_param (&syslog_msg, "XSPORT", "%d", xsport); - syslog_msg_add_sd_param (&syslog_msg, "PROTO", "%d", proto); - - /* Send syslog message */ - syslog_msg_send (&syslog_msg); -``` - -Example above produces following syslog message: - <134>1 2018-11-12T11:25:30.252715Z 172.16.4.1 NAT 5901 SADD [nsess SSUBIX="0" SVLAN="0" IATYP="IPv4" ISADDR="172.16.1.2" ISPORT="6303" XATYP="IPv4" XSADDR="10.0.0.3" XSPORT="16253" PROTO="6"] - -To add free-form message use: -```{.c} - syslog_msg_add_msg (&syslog_msg, "event log entry"); -``` diff --git a/src/vnet/syslog/sylog_doc.rst b/src/vnet/syslog/sylog_doc.rst new file mode 100644 index 00000000000..f39c9c490dc --- /dev/null +++ b/src/vnet/syslog/sylog_doc.rst @@ -0,0 +1,70 @@ +.. _syslog_doc: + +Syslog protocol support +======================= + +VPP provides `RFC5424 <https://tools.ietf.org/html/rfc5424>`__ syslog +protocol logging, which is used to transport event messages across +network. VPP currently supports UDP transport based on +`RFC5426 <https://tools.ietf.org/html/rfc5426>`__. 
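Messages are sent to a configured remote collector. A minimal
configuration sketch using the syslog CLI is shown below; the exact
option names are an assumption here and may differ between VPP
versions, so verify against the ``set syslog sender`` CLI help:

::

   vpp# set syslog sender collector 192.0.2.10 port 514 src 192.0.2.1
   vpp# show syslog sender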
+ +The syslog message has the following format: \* header \* structured +data \* free-form message + +The header contains, priority, version, timestamp, hostname, +application, process id and message id. It is followed by structured +data, which provides a mechanism to express event data in easily +parsable format. Structured data can contain zero, one or multiple +structured data elements. Structured data element contains name-value +pairs. Structured data can by followed by free-form message. + +Following example explains how to use the internal APIs to generate +syslog message: + +.. code:: c + + #include <vnet/syslog/syslog.h> + + ... + + syslog_msg_t syslog_msg; + + /* Check if syslog logging is enabled */ + if (!syslog_is_enabled ()) + return; + + /* Severity filer test */ + if (syslog_severity_filter_block (severity)) + return; + + /* Initialize syslog message header */ + syslog_msg_init (&syslog_msg, facility, severity, "NAT", "SADD"); + + /* Create structured data element */ + syslog_msg_sd_init (&syslog_msg, "nsess"); + /* Add structured data element parameters (name-value pairs) */ + syslog_msg_add_sd_param (&syslog_msg, "SSUBIX", "%d", ssubix); + syslog_msg_add_sd_param (&syslog_msg, "SVLAN", "%d", svlan); + syslog_msg_add_sd_param (&syslog_msg, "IATYP", "IPv4"); + syslog_msg_add_sd_param (&syslog_msg, "ISADDR", "%U", + format_ip4_address, isaddr); + syslog_msg_add_sd_param (&syslog_msg, "ISPORT", "%d", isport); + syslog_msg_add_sd_param (&syslog_msg, "XATYP", "IPv4"); + syslog_msg_add_sd_param (&syslog_msg, "XSADDR", "%U", + format_ip4_address, xsaddr); + syslog_msg_add_sd_param (&syslog_msg, "XSPORT", "%d", xsport); + syslog_msg_add_sd_param (&syslog_msg, "PROTO", "%d", proto); + + /* Send syslog message */ + syslog_msg_send (&syslog_msg); + +Example above produces following syslog message: <134>1 +2018-11-12T11:25:30.252715Z 172.16.4.1 NAT 5901 SADD [nsess SSUBIX=“0” +SVLAN=“0” IATYP=“IPv4” ISADDR=“172.16.1.2” ISPORT=“6303” XATYP=“IPv4” +XSADDR=“10.0.0.3” XSPORT=“16253” PROTO=“6”] + +To add free-form message use: + +.. code:: c + + syslog_msg_add_msg (&syslog_msg, "event log entry"); diff --git a/src/vpp-api/python/README.rst b/src/vpp-api/python/README.rst deleted file mode 100644 index e69de29bb2d..00000000000 --- a/src/vpp-api/python/README.rst +++ /dev/null diff --git a/src/vpp-api/vapi/vapi_doc.md b/src/vpp-api/vapi/vapi_doc.md deleted file mode 100644 index 0e7e29dde01..00000000000 --- a/src/vpp-api/vapi/vapi_doc.md +++ /dev/null @@ -1,155 +0,0 @@ -# VPP API module {#vapi_doc} - -## Overview - -VPP API module allows communicating with VPP over shared memory interface. -The API consists of 3 parts: - -* common code - low-level API -* generated code - high-level API -* code generator - to generate your own high-level API e.g. for custom plugins - -### Common code - -#### C common code - -C common code represents the basic, low-level API, providing functions to -connect/disconnect, perform message discovery and send/receive messages. -The C variant is in vapi.h. - -#### C++ common code - -C++ is provided by vapi.hpp and contains high-level API templates, -which are specialized by generated code. - -### Generated code - -Each API file present in the source tree is automatically translated to JSON -file, which the code generator parses and generates either C (`vapi_c_gen.py`) -or C++ (`vapi_cpp_gen.py`) code. - -This can then be included in the client application and provides convenient way -to interact with VPP. 
This includes: - -* automatic byte-swapping -* automatic request-response matching based on context -* automatic casts to appropriate types (type-safety) when calling callbacks -* automatic sending of control-pings for dump messages - -The API supports two modes of operation: - -* blocking -* non-blocking - -In blocking mode, whenever an operation is initiated, the code waits until it -can finish. This means that when sending a message, the call blocks until -the message can be written to shared memory. Similarly, receiving a message -blocks until a message becomes available. On higher level, this also means that -when doing a request (e.g. `show_version`), the call blocks until a response -comes back (e.g. `show_version_reply`). - -In non-blocking mode, these are decoupled, the API returns VAPI_EAGAIN whenever -an operation cannot be performed and after sending a request, it's up to -the client to wait for and process a response. - -### Code generator - -Python code generator comes in two flavors - C and C++ and generates high-level -API headers. All the code is stored in the headers. - -## Usage - -### Low-level API - -Refer to inline API documentation in doxygen format in `vapi.h` header -for description of functions. It's recommened to use the safer, high-level -API provided by specialized headers (e.g. `vpe.api.vapi.h` -or `vpe.api.vapi.hpp`). - -#### C high-level API - -##### Callbacks - -The C high-level API is strictly callback-based for maximum efficiency. -Whenever an operation is initiated a callback with a callback context is part -of that operation. The callback is then invoked when the response (or multiple -responses) arrive which are tied to the request. Also, callbacks are invoked -whenever an event arrives, if such callback is registered. All the pointers -to responses/events point to shared memory and are immediately freed after -callback finishes so the client needs to extract/copy any data in which it -is interested in. - -#### Blocking mode - -In simple blocking mode, the whole operation (being a simple request or a dump) -is finished and it's callback is called (potentially multiple times for dumps) -during function call. - -Example pseudo-code for a simple request in this mode: - -` -vapi_show_version(message, callback, callback_context) - -1. generate unique internal context and assign it to message.header.context -2. byteswap the message to network byte order -3. send message to vpp (message is now consumed and vpp will free it) -4. create internal "outstanding request context" which stores the callback, - callback context and the internal context value -5. call dispatch, which in this mode receives and processes responses until - the internal "outstanding requests" queue is empty. In blocking mode, this - queue always contains at most one item. -` - -**Note**: it's possible for different - unrelated callbacks to be called before -the response callbacks is called in cases where e.g. events are stored -in shared memory queue. - -#### Non-blocking mode - -In non-blocking mode, all the requests are only byte-swapped and the context -information along with callbacks is stored locally (so in the above example, -only steps 1-4 are executed and step 5 is skipped). Calling dispatch is up to -the client application. This allows to alternate between sending/receiving -messages or have a dedicated thread which calls dispatch. 
- -### C++ high level API - -#### Callbacks - -In C++ API, the response is automatically tied to the corresponding `Request`, -`Dump` or `Event_registration` object. Optionally a callback might be specified, -which then gets called when the response is received. - -**Note**: responses take up shared memory space and should be freed either -manually (in case of result sets) or automatically (by destroying the object -owning them) when no longer needed. Once a Request or Dump object was executed, -it cannot be re-sent, since the request itself (stores in shared memory) -is consumed by vpp and inaccessible (set to nullptr) anymore. - -#### Usage - -#### Requests & dumps - -0. Create on object of `Connection` type and call `connect()` to connect to vpp. -1. Create an object of `Request` or `Dump` type using it's typedef (e.g. - `Show_version`) -2. Use `get_request()` to obtain and manipulate the underlying request if - required. -3. Issue `execute()` to send the request. -4. Use either `wait_for_response()` or `dispatch()` to wait for the response. -5. Use `get_response_state()` to get the state and `get_response()` to read - the response. - -#### Events - -0. Create a `Connection` and execute the appropriate `Request` to subscribe to - events (e.g. `Want_stats`) -1. Create an `Event_registration` with a template argument being the type of - event you are insterested in. -2. Call `dispatch()` or `wait_for_response()` to wait for the event. A callback - will be called when an event occurs (if passed to `Event_registration()` - constructor). Alternatively, read the result set. - -**Note**: events stored in the result set take up space in shared memory -and should be freed regularly (e.g. in the callback, once the event is -processed). diff --git a/src/vpp-api/vapi/vapi_doc.rst b/src/vpp-api/vapi/vapi_doc.rst new file mode 100644 index 00000000000..4efbf2d9988 --- /dev/null +++ b/src/vpp-api/vapi/vapi_doc.rst @@ -0,0 +1,191 @@ +.. _vapi_doc: + +VPP API module +============== + +Overview +-------- + +VPP API module allows communicating with VPP over shared memory +interface. The API consists of 3 parts: + +- common code - low-level API +- generated code - high-level API +- code generator - to generate your own high-level API e.g. for custom + plugins + +Common code +~~~~~~~~~~~ + +C common code +^^^^^^^^^^^^^ + +C common code represents the basic, low-level API, providing functions +to connect/disconnect, perform message discovery and send/receive +messages. The C variant is in vapi.h. + +.. _c-common-code-1: + +C++ common code +^^^^^^^^^^^^^^^ + +C++ is provided by vapi.hpp and contains high-level API templates, which +are specialized by generated code. + +Generated code +~~~~~~~~~~~~~~ + +Each API file present in the source tree is automatically translated to +JSON file, which the code generator parses and generates either C +(``vapi_c_gen.py``) or C++ (``vapi_cpp_gen.py``) code. + +This can then be included in the client application and provides +convenient way to interact with VPP. This includes: + +- automatic byte-swapping +- automatic request-response matching based on context +- automatic casts to appropriate types (type-safety) when calling + callbacks +- automatic sending of control-pings for dump messages + +The API supports two modes of operation: + +- blocking +- non-blocking + +In blocking mode, whenever an operation is initiated, the code waits +until it can finish. This means that when sending a message, the call +blocks until the message can be written to shared memory. 
Similarly, +receiving a message blocks until a message becomes available. On higher +level, this also means that when doing a request +(e.g. ``show_version``), the call blocks until a response comes back +(e.g. ``show_version_reply``). + +In non-blocking mode, these are decoupled, the API returns VAPI_EAGAIN +whenever an operation cannot be performed and after sending a request, +it’s up to the client to wait for and process a response. + +Code generator +~~~~~~~~~~~~~~ + +Python code generator comes in two flavors - C and C++ and generates +high-level API headers. All the code is stored in the headers. + +Usage +----- + +Low-level API +~~~~~~~~~~~~~ + +Refer to inline API documentation in doxygen format in ``vapi.h`` header +for description of functions. It’s recommended to use the safer, +high-level API provided by specialized headers (e.g. ``vpe.api.vapi.h`` +or ``vpe.api.vapi.hpp``). + +C high-level API +^^^^^^^^^^^^^^^^ + +Callbacks +''''''''' + +The C high-level API is strictly callback-based for maximum efficiency. +Whenever an operation is initiated a callback with a callback context is +part of that operation. The callback is then invoked when the response +(or multiple responses) arrive which are tied to the request. Also, +callbacks are invoked whenever an event arrives, if such callback is +registered. All the pointers to responses/events point to shared memory +and are immediately freed after callback finishes so the client needs to +extract/copy any data in which it is interested in. + +Blocking mode +^^^^^^^^^^^^^ + +In simple blocking mode, the whole operation (being a simple request or +a dump) is finished and it’s callback is called (potentially multiple +times for dumps) during function call. + +Example pseudo-code for a simple request in this mode: + +\` vapi_show_version(message, callback, callback_context) + +1. generate unique internal context and assign it to + message.header.context +2. byteswap the message to network byte order +3. send message to vpp (message is now consumed and vpp will free it) +4. create internal “outstanding request context” which stores the + callback, callback context and the internal context value +5. call dispatch, which in this mode receives and processes responses + until the internal “outstanding requests” queue is empty. In blocking + mode, this queue always contains at most one item. \` + +**Note**: it’s possible for different - unrelated callbacks to be called +before the response callbacks is called in cases where e.g. events are +stored in shared memory queue. + +Non-blocking mode +^^^^^^^^^^^^^^^^^ + +In non-blocking mode, all the requests are only byte-swapped and the +context information along with callbacks is stored locally (so in the +above example, only steps 1-4 are executed and step 5 is skipped). +Calling dispatch is up to the client application. This allows to +alternate between sending/receiving messages or have a dedicated thread +which calls dispatch. + +.. _c-high-level-api-1: + +C++ high level API +~~~~~~~~~~~~~~~~~~ + +.. _callbacks-1: + +Callbacks +^^^^^^^^^ + +In C++ API, the response is automatically tied to the corresponding +``Request``, ``Dump`` or ``Event_registration`` object. Optionally a +callback might be specified, which then gets called when the response is +received. + +**Note**: responses take up shared memory space and should be freed +either manually (in case of result sets) or automatically (by destroying +the object owning them) when no longer needed. 
Once a Request or Dump +object was executed, it cannot be re-sent, since the request itself +(stores in shared memory) is consumed by vpp and inaccessible (set to +nullptr) anymore. + +.. _usage-1: + +Usage +^^^^^ + +Requests & dumps +^^^^^^^^^^^^^^^^ + +0. Create on object of ``Connection`` type and call ``connect()`` to + connect to vpp. +1. Create an object of ``Request`` or ``Dump`` type using it’s typedef + (e.g. ``Show_version``) +2. Use ``get_request()`` to obtain and manipulate the underlying request + if required. +3. Issue ``execute()`` to send the request. +4. Use either ``wait_for_response()`` or ``dispatch()`` to wait for the + response. +5. Use ``get_response_state()`` to get the state and ``get_response()`` + to read the response. + +Events +^^^^^^ + +0. Create a ``Connection`` and execute the appropriate ``Request`` to + subscribe to events (e.g. ``Want_stats``) +1. Create an ``Event_registration`` with a template argument being the + type of event you are interested in. +2. Call ``dispatch()`` or ``wait_for_response()`` to wait for the event. + A callback will be called when an event occurs (if passed to + ``Event_registration()`` constructor). Alternatively, read the result + set. + +**Note**: events stored in the result set take up space in shared memory +and should be freed regularly (e.g. in the callback, once the event is +processed). diff --git a/src/vpp/mem/mem.md b/src/vpp/mem/mem.md deleted file mode 100644 index 84ab820e5e5..00000000000 --- a/src/vpp/mem/mem.md +++ /dev/null @@ -1,21 +0,0 @@ -# VPP mem preload {#mempreload_doc} - -Internal VPP memory allocations rely on VPP main-heap, however when using -external libraries, esp. in plugins (eg. OpenSSL library used by the IKEv2 -plugin), those external libraries usually manages memory using the standard -libc `malloc()`/`free()`/... calls. This, in turn, makes use of the default -libc heap. - -VPP has no knowledge of this heap and tools such as memory traces cannot be -used. - -In order to enable the use of standard VPP debugging tools, this library -replaces standard libc memory management calls with version using VPP -main-heap. - -To use it, you need to use the `LD_PRELOAD` mechanism, eg. -``` -~# LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libvppmem_preload.so /usr/bin/vpp -c /etc/vpp/startup.conf -``` - -You can then use tools such as memory traces as usual. diff --git a/src/vpp/mem/mem.rst b/src/vpp/mem/mem.rst new file mode 100644 index 00000000000..82ae2ff35df --- /dev/null +++ b/src/vpp/mem/mem.rst @@ -0,0 +1,25 @@ +.. _mempreload_doc: + +VPP mem preload +=============== + +Internal VPP memory allocations rely on VPP main-heap, however when +using external libraries, esp. in plugins (e.g. OpenSSL library used by +the IKEv2 plugin), those external libraries usually manages memory using +the standard libc ``malloc()``/``free()``/… calls. This, in turn, makes +use of the default libc heap. + +VPP has no knowledge of this heap and tools such as memory traces cannot +be used. + +In order to enable the use of standard VPP debugging tools, this library +replaces standard libc memory management calls with version using VPP +main-heap. + +To use it, you need to use the ``LD_PRELOAD`` mechanism, e.g. + +:: + + ~# LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libvppmem_preload.so /usr/bin/vpp -c /etc/vpp/startup.conf + +You can then use tools such as memory traces as usual. 
diff --git a/src/vpp/stats/stats.md b/src/vpp/stats/stats.md deleted file mode 100644 index 8671f56e4a5..00000000000 --- a/src/vpp/stats/stats.md +++ /dev/null @@ -1,130 +0,0 @@ -# Statistics {#stats_doc} - -In VPP most things are measured and counted. There are counters for interface statistics, like RX, TX counters, packet drops, and so on. Every node has a set of per-node counters, one set of error counters, like TTL exceeded, or packet to big or out-of-buffers. And a set of performance counters, like number of clocks, vectors, calls and suspends. - -There is also a set of system counters and performance counters, e.g. memory utilization per heap, buffer utilisation and so on. - -## VPP Counter Architecture - -Counters are exposed directly via shared memory. These are the actual counters in VPP, no sampling or aggregation is done by the statistics infrastructure. With the exception of per node performance data under /sys/node and a few system counters. - - -Clients mount the shared memory segment read-only, using a optimistic concurrency algorithm. - -Directory structure as an index. - -### Memory layout - -The memory segment consists of a shared header, containing atomics for the optimistic concurrency mechanism, and offsets into memory for the directory vectors. The only data structure used is the VPP vectors. All pointers are converted to offsets so that client applications can map the shared memory wherever it pleases. - -### Directory layout - -### Optimistic concurrency - -``` -/* - * Shared header first in the shared memory segment. - */ -typedef struct { - atomic_int_fast64_t epoch; - atomic_int_fast64_t in_progress; - atomic_int_fast64_t directory_offset; - atomic_int_fast64_t error_offset; - atomic_int_fast64_t stats_offset; -} stat_segment_shared_header_t; - -``` - -#### Writer -On the VPP side there is a single writer (controlled by a spinlock). When the writer starts it sets in_progress=1, continues with the update of the data-structures, and when done, bumps epoch++ and sets in_progress=0. - -#### Readers -If in_progress=1, there is no point continuing, so reader sits spinning on the in_progress flag until it is 0. Then it sets start_epoch = epoch and continues copying out the counter data it is interested in, while doing strict boundary checks on all offsets / pointers. When the reader is done, it checks if in_progress=1 or if epoch != start_epoch. If either of those are true is discards the data read. - -## How are counters exposed out of VPP? - -## Types of Counters - -All counters under /err and /if are the directly exposed VPP counters. - -* Gauges -* u64 / float -* Interface Counters - * Simple counters, counter_t array of threads of an array of interfaces - * Combined counters, vlib_counter_t array of threads of an array of interfaces. - - -## Client libraries -### Writing a new client library -A new client library can either wrap the C library (libvppapiclient.so) or it can integrate directly with the shared memory. That involves exchanging a file descriptor over the VPP stats Unix domain socket, and opening the memory mapped segment. 
- -### Python - -``` -#!/usr/bin/env python3 -from vpp_papi.vpp_stats import VPPStats -stats = VPPStats('/run/vpp/stats.sock') -dir = stats.ls(['^/if', '/err/ip4-input', '/sys/node/ip4-input']) -counters = stats.dump(dir) - -# Print the RX counters for the first interface on the first worker core -print ('RX interface core 0, sw_if_index 0', counters['/if/rx'][0][0]) - -``` -### C -``` -#include <vpp-api/client/stat_client.h> -#include <vppinfra/vec.h> - -int main (int argc, char **argv) { - uint8_t *patterns = 0; - - vec_add1(patterns, "^/if"); - vec_add1(patterns, "ip4-input"); - - int rv = stat_segment_connect(STAT_SEGMENT_SOCKET_FILE); - uint32_t *dir = stat_segment_ls(patterns); - stat_segment_data_t *res = stat_segment_dump(dir); - - for (int i = 0; i < vec_len(res); i++) { - switch (res[i].type) { - case STAT_DIR_TYPE_COUNTER_VECTOR_SIMPLE: - for (k = 0; k < vec_len (res[i].simple_counter_vec) - 1; k++) - for (j = 0; j < vec_len (res[i].simple_counter_vec[k]); j++) - fformat (stdout, "[%d @ %d]: %llu packets %s\n", - j, k, res[i].simple_counter_vec[k][j], - res[i].name); - break; - - case STAT_DIR_TYPE_COUNTER_VECTOR_COMBINED: - for (k = 0; k < vec_len (res[i].combined_counter_vec); k++) - for (j = 0; j < vec_len (res[i].combined_counter_vec[k]); j++) - fformat (stdout, "[%d @ %d]: %llu packets, %llu bytes %s\n", - j, k, res[i].combined_counter_vec[k][j].packets, - res[i].combined_counter_vec[k][j].bytes, - res[i].name); - break; - - case STAT_DIR_TYPE_ERROR_INDEX: - for (j = 0; j < vec_len (res[i].error_vector); j++) - fformat (stdout, "[@%d] %llu %s\n", j, res[i].error_vector[j], res[i].name); - break; - - case STAT_DIR_TYPE_SCALAR_INDEX: - fformat (stdout, "%.2f %s\n", res[i].scalar_value, res[i].name); - break; - - default: - ; - } - } - stat_segment_data_free (res); -} -``` - -## Integrations -* CLI command. vpp_get_stats [ls | dump | poll] -* Prometheus - -## Future evolution -* Deprecate the stats over binary API calls that are based on want_stats diff --git a/src/vpp/stats/stats.rst b/src/vpp/stats/stats.rst new file mode 100644 index 00000000000..26e4db8c0db --- /dev/null +++ b/src/vpp/stats/stats.rst @@ -0,0 +1,178 @@ +.. _stats_doc: + +Statistics +========== + +In VPP most things are measured and counted. There are counters for +interface statistics, like RX, TX counters, packet drops, and so on. +Every node has a set of per-node counters, one set of error counters, +like TTL exceeded, or packet to big or out-of-buffers. And a set of +performance counters, like number of clocks, vectors, calls and +suspends. + +There is also a set of system counters and performance counters, +e.g. memory utilization per heap, buffer utilisation and so on. + +VPP Counter Architecture +------------------------ + +Counters are exposed directly via shared memory. These are the actual +counters in VPP, no sampling or aggregation is done by the statistics +infrastructure. With the exception of per node performance data under +/sys/node and a few system counters. + +Clients mount the shared memory segment read-only, using a optimistic +concurrency algorithm. + +Directory structure as an index. + +Memory layout +~~~~~~~~~~~~~ + +The memory segment consists of a shared header, containing atomics for +the optimistic concurrency mechanism, and offsets into memory for the +directory vectors. The only data structure used is the VPP vectors. All +pointers are converted to offsets so that client applications can map +the shared memory wherever it pleases. 
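As an illustration, a reader that has mapped the segment at some base
address resolves those offsets against the start of its own mapping.
This is only a sketch of the idea (the helper name is made up; the stat
client library contains the real implementation):

::

   #include <stdint.h>

   /* Sketch: offsets stored in the shared header are relative to the
    * start of the mapped segment, so readers add them to their own
    * mapping base instead of following raw pointers. */
   static inline void *
   stat_offset_to_pointer (void *map_base, int64_t offset)
   {
     return (char *) map_base + offset;
   }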
Directory layout
~~~~~~~~~~~~~~~~

Optimistic concurrency
~~~~~~~~~~~~~~~~~~~~~~

::

   /*
    * Shared header first in the shared memory segment.
    */
   typedef struct {
     atomic_int_fast64_t epoch;
     atomic_int_fast64_t in_progress;
     atomic_int_fast64_t directory_offset;
     atomic_int_fast64_t error_offset;
     atomic_int_fast64_t stats_offset;
   } stat_segment_shared_header_t;

Writer
^^^^^^

On the VPP side there is a single writer (controlled by a spinlock).
When the writer starts it sets in_progress=1, continues with the update
of the data structures, and when done, bumps the epoch and sets
in_progress=0.

Readers
^^^^^^^

If in_progress=1 there is no point in continuing, so the reader spins
on the in_progress flag until it is 0. It then sets start_epoch = epoch
and copies out the counter data it is interested in, while doing strict
boundary checks on all offsets / pointers. When the reader is done, it
checks whether in_progress=1 or epoch != start_epoch. If either of
those is true, it discards the data read and retries.

How are counters exposed out of VPP?
------------------------------------

Types of Counters
-----------------

All counters under /err and /if are the directly exposed VPP counters.

- Gauges
- u64 / float
- Interface Counters

  - Simple counters: per-thread arrays of counter_t, one entry per
    interface
  - Combined counters: per-thread arrays of vlib_counter_t (packets and
    bytes), one entry per interface

Client libraries
----------------

Writing a new client library
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

A new client library can either wrap the C library (libvppapiclient.so)
or integrate directly with the shared memory. The latter involves
exchanging a file descriptor over the VPP stats Unix domain socket and
opening the memory mapped segment.
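Beyond mapping the segment, a hand-rolled client also has to implement
the reader side of the optimistic concurrency scheme described above.
The following is a minimal sketch rather than actual library code:
read_counters and copy_counters are made-up names, and copy_counters
stands in for the bounds-checked copying the client performs::

   #include <stdatomic.h>
   #include <stdint.h>

   /* Mirrors stat_segment_shared_header_t at the start of the mapping. */
   typedef struct {
     atomic_int_fast64_t epoch;
     atomic_int_fast64_t in_progress;
     atomic_int_fast64_t directory_offset;
     atomic_int_fast64_t error_offset;
     atomic_int_fast64_t stats_offset;
   } shared_header_t;

   /* Placeholder: copy out the counters of interest, doing strict
    * boundary checks on every offset before following it. */
   extern void copy_counters (shared_header_t *hdr);

   void
   read_counters (shared_header_t *hdr)
   {
     for (;;)
       {
         /* Wait for any in-progress write to finish. */
         while (atomic_load (&hdr->in_progress))
           ;
         int_fast64_t start_epoch = atomic_load (&hdr->epoch);

         copy_counters (hdr);

         /* Keep the data only if no writer was active during the copy. */
         if (!atomic_load (&hdr->in_progress)
             && atomic_load (&hdr->epoch) == start_epoch)
           return;
       }
   }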
Python
~~~~~~

::

   #!/usr/bin/env python3
   from vpp_papi.vpp_stats import VPPStats

   stats = VPPStats('/run/vpp/stats.sock')
   dir = stats.ls(['^/if', '/err/ip4-input', '/sys/node/ip4-input'])
   counters = stats.dump(dir)

   # Print the RX counters for the first interface on the first worker core
   print('RX interface core 0, sw_if_index 0', counters['/if/rx'][0][0])

C
~

::

   #include <stdio.h>
   #include <vpp-api/client/stat_client.h>
   #include <vppinfra/vec.h>
   #include <vppinfra/format.h>

   int main (int argc, char **argv) {
     uint8_t **patterns = 0;
     int j, k;

     vec_add1 (patterns, (uint8_t *) "^/if");
     vec_add1 (patterns, (uint8_t *) "ip4-input");

     if (stat_segment_connect (STAT_SEGMENT_SOCKET_FILE) < 0)
       return 1;

     uint32_t *dir = stat_segment_ls (patterns);
     stat_segment_data_t *res = stat_segment_dump (dir);

     for (int i = 0; i < vec_len (res); i++) {
       switch (res[i].type) {
       case STAT_DIR_TYPE_COUNTER_VECTOR_SIMPLE:
         for (k = 0; k < vec_len (res[i].simple_counter_vec); k++)
           for (j = 0; j < vec_len (res[i].simple_counter_vec[k]); j++)
             fformat (stdout, "[%d @ %d]: %llu packets %s\n",
                      j, k, res[i].simple_counter_vec[k][j],
                      res[i].name);
         break;

       case STAT_DIR_TYPE_COUNTER_VECTOR_COMBINED:
         for (k = 0; k < vec_len (res[i].combined_counter_vec); k++)
           for (j = 0; j < vec_len (res[i].combined_counter_vec[k]); j++)
             fformat (stdout, "[%d @ %d]: %llu packets, %llu bytes %s\n",
                      j, k, res[i].combined_counter_vec[k][j].packets,
                      res[i].combined_counter_vec[k][j].bytes,
                      res[i].name);
         break;

       case STAT_DIR_TYPE_ERROR_INDEX:
         for (j = 0; j < vec_len (res[i].error_vector); j++)
           fformat (stdout, "[@%d] %llu %s\n",
                    j, res[i].error_vector[j], res[i].name);
         break;

       case STAT_DIR_TYPE_SCALAR_INDEX:
         fformat (stdout, "%.2f %s\n", res[i].scalar_value, res[i].name);
         break;

       default:
         ;
       }
     }
     stat_segment_data_free (res);
     stat_segment_disconnect ();
     return 0;
   }

Integrations
------------

- CLI command: vpp_get_stats [ls | dump | poll]
- Prometheus

Future evolution
----------------

- Deprecate the statistics delivered over binary API calls that are
  based on want_stats