From b63264c8342e6a1b6971c79550d2af2024b6a4de Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Tue, 14 Aug 2018 18:52:30 +0100 Subject: New upstream version 18.08 Change-Id: I32fdf5e5016556d9c0a6d88ddaf1fc468961790a Signed-off-by: Luca Boccassi --- doc/guides/prog_guide/bbdev.rst | 259 ++- doc/guides/prog_guide/bpf_lib.rst | 38 + doc/guides/prog_guide/compressdev.rst | 623 +++++++ doc/guides/prog_guide/cryptodev_lib.rst | 304 ++- doc/guides/prog_guide/env_abstraction_layer.rst | 239 ++- doc/guides/prog_guide/event_crypto_adapter.rst | 296 +++ .../prog_guide/event_ethernet_rx_adapter.rst | 47 +- doc/guides/prog_guide/event_timer_adapter.rst | 296 +++ doc/guides/prog_guide/eventdev.rst | 57 +- .../generic_segmentation_offload_lib.rst | 10 + doc/guides/prog_guide/glossary.rst | 3 - doc/guides/prog_guide/hash_lib.rst | 29 +- .../img/event_crypto_adapter_op_forward.svg | 1078 +++++++++++ .../prog_guide/img/event_crypto_adapter_op_new.svg | 1061 +++++++++++ doc/guides/prog_guide/img/eventdev_usage.svg | 1519 ++++++--------- doc/guides/prog_guide/img/malloc_heap.svg | 1348 ++++---------- doc/guides/prog_guide/img/stateful-op.svg | 116 ++ doc/guides/prog_guide/img/stateless-op-shared.svg | 124 ++ doc/guides/prog_guide/img/stateless-op.svg | 140 ++ doc/guides/prog_guide/img/turbo_tb_decode.svg | 1471 +++++++++++++++ doc/guides/prog_guide/img/turbo_tb_encode.svg | 1948 ++++++++++++++++++++ doc/guides/prog_guide/index.rst | 5 + .../prog_guide/link_bonding_poll_mode_drv_lib.rst | 43 +- doc/guides/prog_guide/mbuf_lib.rst | 11 +- doc/guides/prog_guide/multi_proc_support.rst | 172 +- doc/guides/prog_guide/overview.rst | 4 +- doc/guides/prog_guide/poll_mode_drv.rst | 60 +- doc/guides/prog_guide/rawdev.rst | 2 +- doc/guides/prog_guide/rte_flow.rst | 1226 +++++++----- doc/guides/prog_guide/source_org.rst | 4 +- doc/guides/prog_guide/switch_representation.rst | 837 +++++++++ .../prog_guide/traffic_metering_and_policing.rst | 2 +- doc/guides/prog_guide/vhost_lib.rst | 128 +- 33 files changed, 10888 insertions(+), 2612 deletions(-) create mode 100644 doc/guides/prog_guide/bpf_lib.rst create mode 100644 doc/guides/prog_guide/compressdev.rst create mode 100644 doc/guides/prog_guide/event_crypto_adapter.rst create mode 100644 doc/guides/prog_guide/event_timer_adapter.rst create mode 100644 doc/guides/prog_guide/img/event_crypto_adapter_op_forward.svg create mode 100644 doc/guides/prog_guide/img/event_crypto_adapter_op_new.svg create mode 100644 doc/guides/prog_guide/img/stateful-op.svg create mode 100644 doc/guides/prog_guide/img/stateless-op-shared.svg create mode 100644 doc/guides/prog_guide/img/stateless-op.svg create mode 100644 doc/guides/prog_guide/img/turbo_tb_decode.svg create mode 100644 doc/guides/prog_guide/img/turbo_tb_encode.svg create mode 100644 doc/guides/prog_guide/switch_representation.rst (limited to 'doc/guides/prog_guide') diff --git a/doc/guides/prog_guide/bbdev.rst b/doc/guides/prog_guide/bbdev.rst index d40c7f4c..9de14443 100644 --- a/doc/guides/prog_guide/bbdev.rst +++ b/doc/guides/prog_guide/bbdev.rst @@ -6,12 +6,12 @@ Wireless Baseband Device Library The Wireless Baseband library provides a common programming framework that abstracts HW accelerators based on FPGA and/or Fixed Function Accelerators that -assist with 3gpp Physical Layer processing. Furthermore, it decouples the +assist with 3GPP Physical Layer processing. Furthermore, it decouples the application from the compute-intensive wireless functions by abstracting their optimized libraries to appear as virtual bbdev devices. 
The functional scope of the BBDEV library are those functions in relation to -the 3gpp Layer 1 signal processing (channel coding, modulation, ...). +the 3GPP Layer 1 signal processing (channel coding, modulation, ...). The framework currently only supports Turbo Code FEC function. @@ -42,13 +42,13 @@ From the command line using the --vdev EAL option .. code-block:: console - --vdev 'turbo_sw,max_nb_queues=8,socket_id=0' + --vdev 'baseband_turbo_sw,max_nb_queues=8,socket_id=0' Our using the rte_vdev_init API within the application code. .. code-block:: c - rte_vdev_init("turbo_sw", "max_nb_queues=2,socket_id=0") + rte_vdev_init("baseband_turbo_sw", "max_nb_queues=2,socket_id=0") All virtual bbdev devices support the following initialization parameters: @@ -167,7 +167,7 @@ Logical Cores, Memory and Queues Relationships ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The bbdev device Library as the Poll Mode Driver library support NUMA for when -a processor’s logical cores and interfaces utilize its local memory. Therefore +a processor's logical cores and interfaces utilize its local memory. Therefore baseband operations, the mbuf being operated on should be allocated from memory pools created in the local memory. The buffers should, if possible, remain on the local processor to obtain the best performance results and buffer @@ -202,7 +202,7 @@ structure in the *DPDK API Reference*. A device reports its capabilities when registering itself in the bbdev framework. With the aid of this capabilities mechanism, an application can query devices to -discover which operations within the 3gpp physical layer they are capable of +discover which operations within the 3GPP physical layer they are capable of performing. Below is an example of the capabilities for a PMD it supports in relation to Turbo Encoding and Decoding operations. @@ -216,7 +216,10 @@ relation to Turbo Encoding and Decoding operations. RTE_BBDEV_TURBO_SUBBLOCK_DEINTERLEAVE | RTE_BBDEV_TURBO_POS_LLR_1_BIT_IN | RTE_BBDEV_TURBO_NEG_LLR_1_BIT_IN | - RTE_BBDEV_TURBO_CRC_TYPE_24B, + RTE_BBDEV_TURBO_CRC_TYPE_24B | + RTE_BBDEV_TURBO_DEC_TB_CRC_24B_KEEP | + RTE_BBDEV_TURBO_EARLY_TERMINATION, + .max_llr_modulus = 16, .num_buffers_src = RTE_BBDEV_MAX_CODE_BLOCKS, .num_buffers_hard_out = RTE_BBDEV_MAX_CODE_BLOCKS, @@ -228,6 +231,7 @@ relation to Turbo Encoding and Decoding operations. .cap.turbo_enc = { .capability_flags = RTE_BBDEV_TURBO_CRC_24B_ATTACH | + RTE_BBDEV_TURBO_CRC_24A_ATTACH | RTE_BBDEV_TURBO_RATE_MATCH | RTE_BBDEV_TURBO_RV_INDEX_BYPASS, .num_buffers_src = RTE_BBDEV_MAX_CODE_BLOCKS, @@ -391,27 +395,101 @@ operation to its allocating pool. void rte_bbdev_enc_op_free_bulk(struct rte_bbdev_enc_op **ops, unsigned int num_ops) -BBDEV Operations -~~~~~~~~~~~~~~~~ +BBDEV Inbound/Outbound Memory +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The bbdev operation structure contains all the mutable data relating to -performing Turbo code processing on a referenced mbuf data buffer. It is used -for either encode or decode operations. +performing Turbo coding on a referenced mbuf data buffer. It is used for either +encode or decode operations. Turbo Encode operation accepts one input and one output. - Turbo Decode operation accepts one input and two outputs, called *hard-decision* and *soft-decision* outputs. *Soft-decision* output is optional. -It is expected that the application provides input and output ``mbuf`` pointers +It is expected that the application provides input and output mbuf pointers allocated and ready to use. 
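As an illustrative sketch (not a complete application), preparing a single Turbo encode
operation from already-allocated input/output buffers could look as follows; the operation
pool ``op_pool``, the mbufs ``in_mbuf``/``out_mbuf``, the length ``in_len`` and the
device/queue identifiers are assumed placeholders, and the operation structure itself is
described in detail further below.

.. code-block:: c

    /* Sketch only: op_pool, in_mbuf, out_mbuf and in_len are assumed to be
     * prepared by the application; device 0, queue 0 are placeholder ids.
     */
    struct rte_bbdev_enc_op *op;

    if (rte_bbdev_enc_op_alloc_bulk(op_pool, &op, 1) != 0)
        rte_exit(EXIT_FAILURE, "Failed to allocate encode operation\n");

    /* Input: one Code Block held in in_mbuf, starting at offset 0. */
    op->turbo_enc.input.data = in_mbuf;
    op->turbo_enc.input.offset = 0;
    op->turbo_enc.input.length = in_len;

    /* Output: an allocated, writable mbuf for bbdev to place the encoded CB in. */
    op->turbo_enc.output.data = out_mbuf;
    op->turbo_enc.output.offset = 0;

    /* Enqueue for processing; the result is later retrieved with
     * rte_bbdev_dequeue_enc_ops().
     */
    uint16_t num_enq = rte_bbdev_enqueue_enc_ops(0, 0, &op, 1);
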
The baseband framework supports turbo coding on Code Blocks (CB) and Transport Blocks (TB). -For the output buffer(s), the application needs only to provide an allocated and -free mbuf (containing only one mbuf segment), so that bbdev can write the -operation outcome. +For the output buffer(s), the application is required to provide an allocated +and free mbuf, so that bbdev write back the resulting output. + +The support of split "scattered" buffers is a driver-specific feature, so it is +reported individually by the supporting driver as a capability. + +Input and output data buffers are identified by ``rte_bbdev_op_data`` structure, +as follows: + +.. code-block:: c + + struct rte_bbdev_op_data { + struct rte_mbuf *data; + uint32_t offset; + uint32_t length; + }; + + +This structure has three elements: + +- ``data``: This is the mbuf data structure representing the data for BBDEV + operation. + + This mbuf pointer can point to one Code Block (CB) data buffer or multiple CBs + contiguously located next to each other. A Transport Block (TB) represents a + whole piece of data that is divided into one or more CBs. Maximum number of + CBs can be contained in one TB is defined by ``RTE_BBDEV_MAX_CODE_BLOCKS``. + + An mbuf data structure cannot represent more than one TB. The smallest piece + of data that can be contained in one mbuf is one CB. + An mbuf can include one contiguous CB, subset of contiguous CBs that are + belonging to one TB, or all contiguous CBs that are belonging to one TB. + + If a BBDEV PMD supports the extended capability "Scatter-Gather", then it is + capable of collecting (gathering) non-contiguous (scattered) data from + multiple locations in the memory. + This capability is reported by the capability flags: + + - ``RTE_BBDEV_TURBO_ENC_SCATTER_GATHER``, and -**Turbo Encode Op structure** + - ``RTE_BBDEV_TURBO_DEC_SCATTER_GATHER``. + + Only if a BBDEV PMD supports this feature, chained mbuf data structures are + accepted. A chained mbuf can represent one non-contiguous CB or multiple + non-contiguous CBs. + The first mbuf segment in the given chained mbuf represents the first piece + of the CB. Offset is only applicable to the first segment. ``length`` is the + total length of the CB. + + BBDEV driver is responsible for identifying where the split is and enqueue + the split data to its internal queues. + + If BBDEV PMD does not support this feature, it will assume inbound mbuf data + contains one segment. + + The output mbuf data though is always one segment, even if the input was a + chained mbuf. + + +- ``offset``: This is the starting point of the BBDEV (encode/decode) operation, + in bytes. + + BBDEV starts to read data past this offset. + In case of chained mbuf, this offset applies only to the first mbuf segment. + + +- ``length``: This is the total data length to be processed in one operation, + in bytes. + + In case the mbuf data is representing one CB, this is the length of the CB + undergoing the operation. + If it is for multiple CBs, this is the total length of those CBs undergoing + the operation. + If it is for one TB, this is the total length of the TB under operation. + In case of chained mbuf, this data length includes the lengths of the + "scattered" data segments undergoing the operation. + + +BBDEV Turbo Encode Operation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: c @@ -428,8 +506,78 @@ operation outcome. }; }; +The Turbo encode structure is composed of the ``input`` and ``output`` mbuf +data pointers. 
The provided mbuf pointer of ``input`` needs to be big enough to +stretch for extra CRC trailers. + +``op_flags`` parameter holds all operation related flags, like whether CRC24A is +included by the application or not. + +``code_block_mode`` flag identifies the mode in which bbdev is operating in. + +The encode interface works on both the code block (CB) and the transport block +(TB). An operation executes in "CB-mode" when the CB is standalone. While +"TB-mode" executes when an operation performs on one or multiple CBs that +belong to a TB. Therefore, a given data can be standalone CB, full-size TB or +partial TB. Partial TB means that only a subset of CBs belonging to a bigger TB +are being enqueued. -**Turbo Decode Op structure** + **NOTE:** It is assumed that all enqueued ops in one ``rte_bbdev_enqueue_enc_ops()`` + call belong to one mode, either CB-mode or TB-mode. + +In case that the CB is smaller than Z (6144 bits), then effectively the TB = CB. +CRC24A is appended to the tail of the CB. The application is responsible for +calculating and appending CRC24A before calling BBDEV in case that the +underlying driver does not support CRC24A generation. + +In CB-mode, CRC24A/B is an optional operation. +The input ``k`` is the size of the CB (this maps to K as described in 3GPP TS +36.212 section 5.1.2), this size is inclusive of CRC24A/B. +The ``length`` is inclusive of CRC24A/B and equals to ``k`` in this case. + +Not all BBDEV PMDs are capable of CRC24A/B calculation. Flags +``RTE_BBDEV_TURBO_CRC_24A_ATTACH`` and ``RTE_BBDEV_TURBO_CRC_24B_ATTACH`` +informs the application with relevant capability. These flags can be set in the +``op_flags`` parameter to indicate BBDEV to calculate and append CRC24A to CB +before going forward with Turbo encoding. + +Output format of the CB encode will have the encoded CB in ``e`` size output +(this maps to E described in 3GPP TS 36.212 section 5.1.4.1.2). The output mbuf +buffer size needs to be big enough to hold the encoded buffer of size ``e``. + +In TB-mode, CRC24A is assumed to be pre-calculated and appended to the inbound +TB mbuf data buffer. +The output mbuf data structure is expected to be allocated by the application +with enough room for the output data. + +The difference between the partial and full-size TB is that we need to know the +index of the first CB in this group and the number of CBs contained within. +The first CB index is given by ``r`` but the number of the remaining CBs is +calculated automatically by BBDEV before passing down to the driver. + +The number of remaining CBs should not be confused with ``c``. ``c`` is the +total number of CBs that composes the whole TB (this maps to C as +described in 3GPP TS 36.212 section 5.1.2). + +The ``length`` is total size of the CBs inclusive of any CRC24A and CRC24B in +case they were appended by the application. + +The case when one CB belongs to TB and is being enqueued individually to BBDEV, +this case is considered as a special case of partial TB where its number of CBs +is 1. Therefore, it requires to get processed in TB-mode. + +The figure below visualizes the encoding of CBs using BBDEV interface in +TB-mode. CB-mode is a reduced version, where only one CB exists: + +.. _figure_turbo_tb_encode: + +.. figure:: img/turbo_tb_encode.* + + Turbo encoding of Code Blocks in mbuf structure + + +BBDEV Turbo Decode Operation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: c @@ -452,18 +600,75 @@ operation outcome. 
}; }; -Input and output data buffers are identified by ``rte_bbdev_op_data`` structure. -This structure has three elements: +The Turbo decode structure is composed of the ``input`` and ``output`` mbuf +data pointers. + +``op_flags`` parameter holds all operation related flags, like whether CRC24B is +retained or not. + +``code_block_mode`` flag identifies the mode in which bbdev is operating in. + +Similarly, the decode interface works on both the code block (CB) and the +transport block (TB). An operation executes in "CB-mode" when the CB is +standalone. While "TB-mode" executes when an operation performs on one or +multiple CBs that belong to a TB. Therefore, a given data can be standalone CB, +full-size TB or partial TB. Partial TB means that only a subset of CBs belonging +to a bigger TB are being enqueued. + + **NOTE:** It is assumed that all enqueued ops in one ``rte_bbdev_enqueue_dec_ops()`` + call belong to one mode, either CB-mode or TB-mode. + +The input ``k`` is the size of the decoded CB (this maps to K as described in +3GPP TS 36.212 section 5.1.2), this size is inclusive of CRC24A/B. +The ``length`` is inclusive of CRC24A/B and equals to ``k`` in this case. + +The input encoded CB data is the Virtual Circular Buffer data stream, wk, with +the null padding included as described in 3GPP TS 36.212 section 5.1.4.1.2 and +shown in 3GPP TS 36.212 section 5.1.4.1 Figure 5.1.4-1. +The size of the virtual circular buffer is 3*Kpi, where Kpi is the 32 byte +aligned value of K, as specified in 3GPP TS 36.212 section 5.1.4.1.1. + +Each byte in the input circular buffer is the LLR value of each bit of the +original CB. + +``hard_output`` is a mandatory capability that all BBDEV PMDs support. This is +the decoded CBs of K sizes (CRC24A/B is the last 24-bit in each decoded CB). +Soft output is an optional capability for BBDEV PMDs. Setting flag +``RTE_BBDEV_TURBO_DEC_TB_CRC_24B_KEEP`` in ``op_flags`` directs BBDEV to retain +CRC24B at the end of each CB. This might be useful for the application in debug +mode. +An LLR rate matched output is computed in the ``soft_output`` buffer structure +for the given ``e`` size (this maps to E described in 3GPP TS 36.212 section +5.1.4.1.2). The output mbuf buffer size needs to be big enough to hold the +encoded buffer of size ``e``. + +The first CB Virtual Circular Buffer (VCB) index is given by ``r`` but the +number of the remaining CB VCBs is calculated automatically by BBDEV before +passing down to the driver. + +The number of remaining CB VCBs should not be confused with ``c``. ``c`` is the +total number of CBs that composes the whole TB (this maps to C as +described in 3GPP TS 36.212 section 5.1.2). + +The ``length`` is total size of the CBs inclusive of any CRC24A and CRC24B in +case they were appended by the application. + +The case when one CB belongs to TB and is being enqueued individually to BBDEV, +this case is considered as a special case of partial TB where its number of CBs +is 1. Therefore, it requires to get processed in TB-mode. + +The output mbuf data structure is expected to be allocated by the application +with enough room for the output data. + +The figure below visualizes the decoding of CBs using BBDEV interface in +TB-mode. CB-mode is a reduced version, where only one CB exists: + +.. _figure_turbo_tb_decode: -- ``data`` - This is the mbuf reference +.. figure:: img/turbo_tb_decode.* -- ``offset`` - The starting point for the Turbo input/output, in bytes, from the - start of the data in the data buffer. 
It must be smaller than data_len of the - mbuf's first segment + Turbo decoding of Code Blocks in mbuf structure -- ``length`` - The length, in bytes, of the buffer on which the Turbo operation - will or has been computed. For the input, the length is set by the application. - For the output(s), the length is computed by the bbdev PMD driver. Sample code ----------- diff --git a/doc/guides/prog_guide/bpf_lib.rst b/doc/guides/prog_guide/bpf_lib.rst new file mode 100644 index 00000000..7c08e6b2 --- /dev/null +++ b/doc/guides/prog_guide/bpf_lib.rst @@ -0,0 +1,38 @@ +.. SPDX-License-Identifier: BSD-3-Clause + Copyright(c) 2018 Intel Corporation. + +Berkeley Packet Filter Library +============================== + +The DPDK provides an BPF library that gives the ability +to load and execute Enhanced Berkeley Packet Filter (eBPF) bytecode within +user-space dpdk application. + +It supports basic set of features from eBPF spec. +Please refer to the +`eBPF spec ` +for more information. +Also it introduces basic framework to load/unload BPF-based filters +on eth devices (right now only via SW RX/TX callbacks). + +The library API provides the following basic operations: + +* Create a new BPF execution context and load user provided eBPF code into it. + +* Destroy an BPF execution context and its runtime structures and free the associated memory. + +* Execute eBPF bytecode associated with provided input parameter. + +* Provide information about natively compiled code for given BPF context. + +* Load BPF program from the ELF file and install callback to execute it on given ethdev port/queue. + +Not currently supported eBPF features +------------------------------------- + + - JIT for non X86_64 platforms + - cBPF + - tail-pointer call + - eBPF MAP + - skb + - external function calls for 32-bit platforms diff --git a/doc/guides/prog_guide/compressdev.rst b/doc/guides/prog_guide/compressdev.rst new file mode 100644 index 00000000..87e26490 --- /dev/null +++ b/doc/guides/prog_guide/compressdev.rst @@ -0,0 +1,623 @@ +.. SPDX-License-Identifier: BSD-3-Clause + Copyright(c) 2017-2018 Cavium Networks. + +Compression Device Library +=========================== + +The compression framework provides a generic set of APIs to perform compression services +as well as to query and configure compression devices both physical(hardware) and virtual(software) +to perform those services. The framework currently only supports lossless compression schemes: +Deflate and LZS. + +Device Management +----------------- + +Device Creation +~~~~~~~~~~~~~~~ + +Physical compression devices are discovered during the bus probe of the EAL function +which is executed at DPDK initialization, based on their unique device identifier. +For eg. PCI devices can be identified using PCI BDF (bus/bridge, device, function). +Specific physical compression devices, like other physical devices in DPDK can be +white-listed or black-listed using the EAL command line options. + +Virtual devices can be created by two mechanisms, either using the EAL command +line options or from within the application using an EAL API directly. + +From the command line using the --vdev EAL option + +.. code-block:: console + + --vdev ',socket_id=0' + +.. Note:: + + * If DPDK application requires multiple software compression PMD devices then required + number of ``--vdev`` with appropriate libraries are to be added. + + * An Application with multiple compression device instances exposed by the same PMD must + specify a unique name for each device. 
+ + Example: ``--vdev 'pmd0' --vdev 'pmd1'`` + +Or, by using the rte_vdev_init API within the application code. + +.. code-block:: c + + rte_vdev_init("","socket_id=0") + +All virtual compression devices support the following initialization parameters: + +* ``socket_id`` - socket on which to allocate the device resources on. + +Device Identification +~~~~~~~~~~~~~~~~~~~~~ + +Each device, whether virtual or physical is uniquely designated by two +identifiers: + +- A unique device index used to designate the compression device in all functions + exported by the compressdev API. + +- A device name used to designate the compression device in console messages, for + administration or debugging purposes. + +Device Configuration +~~~~~~~~~~~~~~~~~~~~ + +The configuration of each compression device includes the following operations: + +- Allocation of resources, including hardware resources if a physical device. +- Resetting the device into a well-known default state. +- Initialization of statistics counters. + +The ``rte_compressdev_configure`` API is used to configure a compression device. + +The ``rte_compressdev_config`` structure is used to pass the configuration +parameters. + +See *DPDK API Reference* for details. + +Configuration of Queue Pairs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Each compression device queue pair is individually configured through the +``rte_compressdev_queue_pair_setup`` API. + +The ``max_inflight_ops`` is used to pass maximum number of +rte_comp_op that could be present in a queue at-a-time. +PMD then can allocate resources accordingly on a specified socket. + +See *DPDK API Reference* for details. + +Logical Cores, Memory and Queues Pair Relationships +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Library supports NUMA similarly as described in Cryptodev library section. + +A queue pair cannot be shared and should be exclusively used by a single processing +context for enqueuing operations or dequeuing operations on the same compression device +since sharing would require global locks and hinder performance. It is however possible +to use a different logical core to dequeue an operation on a queue pair from the logical +core on which it was enqueued. This means that a compression burst enqueue/dequeue +APIs are a logical place to transition from one logical core to another in a +data processing pipeline. + +Device Features and Capabilities +--------------------------------- + +Compression devices define their functionality through two mechanisms, global device +features and algorithm features. Global devices features identify device +wide level features which are applicable to the whole device such as supported hardware +acceleration and CPU features. List of compression device features can be seen in the +RTE_COMPDEV_FF_XXX macros. + +The algorithm features lists individual algo feature which device supports per-algorithm, +such as a stateful compression/decompression, checksums operation etc. List of algorithm +features can be seen in the RTE_COMP_FF_XXX macros. + +Capabilities +~~~~~~~~~~~~ +Each PMD has a list of capabilities, including algorithms listed in +enum ``rte_comp_algorithm`` and its associated feature flag and +sliding window range in log base 2 value. Sliding window tells +the minimum and maximum size of lookup window that algorithm uses +to find duplicates. + +See *DPDK API Reference* for details. + +Each Compression poll mode driver defines its array of capabilities +for each algorithm it supports. 
See PMD implementation for capability +initialization. + +Capabilities Discovery +~~~~~~~~~~~~~~~~~~~~~~ + +PMD capability and features are discovered via ``rte_compressdev_info_get`` function. + +The ``rte_compressdev_info`` structure contains all the relevant information for the device. + +See *DPDK API Reference* for details. + +Compression Operation +---------------------- + +DPDK compression supports two types of compression methodologies: + +- Stateless, data associated to a compression operation is compressed without any reference + to another compression operation. + +- Stateful, data in each compression operation is compressed with reference to previous compression + operations in the same data stream i.e. history of data is maintained between the operations. + +For more explanation, please refer RFC https://www.ietf.org/rfc/rfc1951.txt + +Operation Representation +~~~~~~~~~~~~~~~~~~~~~~~~ + +Compression operation is described via ``struct rte_comp_op``, which contains both input and +output data. The operation structure includes the operation type (stateless or stateful), +the operation status and the priv_xform/stream handle, source, destination and checksum buffer +pointers. It also contains the source mempool from which the operation is allocated. +PMD updates consumed field with amount of data read from source buffer and produced +field with amount of data of written into destination buffer along with status of +operation. See section *Produced, Consumed And Operation Status* for more details. + +Compression operations mempool also has an ability to allocate private memory with the +operation for application's purposes. Application software is responsible for specifying +all the operation specific fields in the ``rte_comp_op`` structure which are then used +by the compression PMD to process the requested operation. + + +Operation Management and Allocation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The compressdev library provides an API set for managing compression operations which +utilize the Mempool Library to allocate operation buffers. Therefore, it ensures +that the compression operation is interleaved optimally across the channels and +ranks for optimal processing. + +A ``rte_comp_op`` contains a field indicating the pool it originated from. + +``rte_comp_op_alloc()`` and ``rte_comp_op_bulk_alloc()`` are used to allocate +compression operations from a given compression operation mempool. +The operation gets reset before being returned to a user so that operation +is always in a good known state before use by the application. + +``rte_comp_op_free()`` is called by the application to return an operation to +its allocating pool. + +See *DPDK API Reference* for details. + +Passing source data as mbuf-chain +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +If input data is scattered across several different buffers, then +Application can either parse through all such buffers and make one +mbuf-chain and enqueue it for processing or, alternatively, it can +make multiple sequential enqueue_burst() calls for each of them +processing them statefully. See *Compression API Stateful Operation* +for stateful processing of ops. + +Operation Status +~~~~~~~~~~~~~~~~ +Each operation carries a status information updated by PMD after it is processed. 
+following are currently supported status: + +- RTE_COMP_OP_STATUS_SUCCESS, + Operation is successfully completed + +- RTE_COMP_OP_STATUS_NOT_PROCESSED, + Operation has not yet been processed by the device + +- RTE_COMP_OP_STATUS_INVALID_ARGS, + Operation failed due to invalid arguments in request + +- RTE_COMP_OP_STATUS_ERROR, + Operation failed because of internal error + +- RTE_COMP_OP_STATUS_INVALID_STATE, + Operation is invoked in invalid state + +- RTE_COMP_OP_STATUS_OUT_OF_SPACE_TERMINATED, + Output buffer ran out of space during processing. Error case, + PMD cannot continue from here. + +- RTE_COMP_OP_STATUS_OUT_OF_SPACE_RECOVERABLE, + Output buffer ran out of space before operation completed, but this + is not an error case. Output data up to op.produced can be used and + next op in the stream should continue on from op.consumed+1. + +Produced, Consumed And Operation Status +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- If status is RTE_COMP_OP_STATUS_SUCCESS, + consumed = amount of data read from input buffer, and + produced = amount of data written in destination buffer +- If status is RTE_COMP_OP_STATUS_FAILURE, + consumed = produced = 0 or undefined +- If status is RTE_COMP_OP_STATUS_OUT_OF_SPACE_TERMINATED, + consumed = 0 and + produced = usually 0, but in decompression cases a PMD may return > 0 + i.e. amount of data successfully produced until out of space condition + hit. Application can consume output data in this case, if required. +- If status is RTE_COMP_OP_STATUS_OUT_OF_SPACE_RECOVERABLE, + consumed = amount of data read, and + produced = amount of data successfully produced until + out of space condition hit. PMD has ability to recover + from here, so application can submit next op from + consumed+1 and a destination buffer with available space. + +Transforms +---------- + +Compression transforms (``rte_comp_xform``) are the mechanism +to specify the details of the compression operation such as algorithm, +window size and checksum. + +Compression API Hash support +---------------------------- + +Compression API allows application to enable digest calculation +alongside compression and decompression of data. A PMD reflects its +support for hash algorithms via capability algo feature flags. +If supported, PMD calculates digest always on plaintext i.e. +before compression and after decompression. + +Currently supported list of hash algos are SHA-1 and SHA2 family +SHA256. + +See *DPDK API Reference* for details. + +If required, application should set valid hash algo in compress +or decompress xforms during ``rte_compressdev_stream_create()`` +or ``rte_compressdev_private_xform_create()`` and pass a valid +output buffer in ``rte_comp_op`` hash field struct to store the +resulting digest. Buffer passed should be contiguous and large +enough to store digest which is 20 bytes for SHA-1 and +32 bytes for SHA2-256. + +Compression API Stateless operation +------------------------------------ + +An op is processed stateless if it has +- op_type set to RTE_COMP_OP_STATELESS +- flush value set to RTE_FLUSH_FULL or RTE_FLUSH_FINAL +(required only on compression side), +- All required input in source buffer + +When all of the above conditions are met, PMD initiates stateless processing +and releases acquired resources after processing of current operation is +complete. Application can enqueue multiple stateless ops in a single burst +and must attach priv_xform handle to such ops. 
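As a brief illustration of the conditions above, the minimal per-operation settings could
be sketched as follows, assuming ``op`` has already been allocated from an operation
mempool and ``priv_xform`` has been created as described in the next sub-section:

.. code-block:: c

    /* Sketch only: op is assumed to be allocated from an op mempool and
     * priv_xform created as described in the next sub-section.
     */
    op->op_type = RTE_COMP_OP_STATELESS;
    op->flush_flag = RTE_COMP_FLUSH_FINAL;   /* or RTE_COMP_FLUSH_FULL */
    op->private_xform = priv_xform;
    /* op->m_src must already contain the complete input for this operation. */
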
+ +priv_xform in Stateless operation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +priv_xform is PMD internally managed private data that it maintains to do stateless processing. +priv_xforms are initialized provided a generic xform structure by an application via making call +to ``rte_comp_private_xform_create``, at an output PMD returns an opaque priv_xform reference. +If PMD support SHAREABLE priv_xform indicated via algorithm feature flag, then application can +attach same priv_xform with many stateless ops at-a-time. If not, then application needs to +create as many priv_xforms as it expects to have stateless operations in-flight. + +.. figure:: img/stateless-op.* + + Stateless Ops using Non-Shareable priv_xform + + +.. figure:: img/stateless-op-shared.* + + Stateless Ops using Shareable priv_xform + + +Application should call ``rte_compressdev_private_xform_create()`` and attach to stateless op before +enqueuing them for processing and free via ``rte_compressdev_private_xform_free()`` during termination. + +An example pseudocode to setup and process NUM_OPS stateless ops with each of length OP_LEN +using priv_xform would look like: + +.. code-block:: c + + /* + * pseudocode for stateless compression + */ + + uint8_t cdev_id = rte_compdev_get_dev_id(); + + /* configure the device. */ + if (rte_compressdev_configure(cdev_id, &conf) < 0) + rte_exit(EXIT_FAILURE, "Failed to configure compressdev %u", cdev_id); + + if (rte_compressdev_queue_pair_setup(cdev_id, 0, NUM_MAX_INFLIGHT_OPS, + socket_id()) < 0) + rte_exit(EXIT_FAILURE, "Failed to setup queue pair\n"); + + if (rte_compressdev_start(cdev_id) < 0) + rte_exit(EXIT_FAILURE, "Failed to start device\n"); + + /* setup compress transform */ + struct rte_compress_compress_xform compress_xform = { + .type = RTE_COMP_COMPRESS, + .compress = { + .algo = RTE_COMP_ALGO_DEFLATE, + .deflate = { + .huffman = RTE_COMP_HUFFMAN_DEFAULT + }, + .level = RTE_COMP_LEVEL_PMD_DEFAULT, + .chksum = RTE_COMP_CHECKSUM_NONE, + .window_size = DEFAULT_WINDOW_SIZE, + .hash_algo = RTE_COMP_HASH_ALGO_NONE + } + }; + + /* create priv_xform and initialize it for the compression device. */ + void *priv_xform = NULL; + rte_compressdev_info_get(cdev_id, &dev_info); + if(dev_info.capability->comps_feature_flag & RTE_COMP_FF_SHAREABLE_PRIV_XFORM) { + rte_comp_priv_xform_create(cdev_id, &compress_xform, &priv_xform); + } else { + shareable = 0; + } + + /* create operation pool via call to rte_comp_op_pool_create and alloc ops */ + rte_comp_op_bulk_alloc(op_pool, comp_ops, NUM_OPS); + + /* prepare ops for compression operations */ + for (i = 0; i < NUM_OPS; i++) { + struct rte_comp_op *op = comp_ops[i]; + if (!shareable) + rte_priv_xform_create(cdev_id, &compress_xform, &op->priv_xform) + else + op->priv_xform = priv_xform; + op->type = RTE_COMP_OP_STATELESS; + op->flush = RTE_COMP_FLUSH_FINAL; + + op->src.offset = 0; + op->dst.offset = 0; + op->src.length = OP_LEN; + op->input_chksum = 0; + setup op->m_src and op->m_dst; + } + num_enqd = rte_compressdev_enqueue_burst(cdev_id, 0, comp_ops, NUM_OPS); + /* wait for this to complete before enqueing next*/ + do { + num_deque = rte_compressdev_dequeue_burst(cdev_id, 0 , &processed_ops, NUM_OPS); + } while (num_dqud < num_enqd); + + +Stateless and OUT_OF_SPACE +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +OUT_OF_SPACE is a condition when output buffer runs out of space and where PMD +still has more data to produce. If PMD runs into such condition, then PMD returns +RTE_COMP_OP_OUT_OF_SPACE_TERMINATED error. 
In such case, PMD resets itself and can set +consumed=0 and produced=amount of output it could produce before hitting out_of_space. +Application would need to resubmit the whole input with a larger output buffer, if it +wants the operation to be completed. + +Hash in Stateless +~~~~~~~~~~~~~~~~~ +If hash is enabled, digest buffer will contain valid data after op is successfully +processed i.e. dequeued with status = RTE_COMP_OP_STATUS_SUCCESS. + +Checksum in Stateless +~~~~~~~~~~~~~~~~~~~~~ +If checksum is enabled, checksum will only be available after op is successfully +processed i.e. dequeued with status = RTE_COMP_OP_STATUS_SUCCESS. + +Compression API Stateful operation +----------------------------------- + +Compression API provide RTE_COMP_FF_STATEFUL_COMPRESSION and +RTE_COMP_FF_STATEFUL_DECOMPRESSION feature flag for PMD to reflect +its support for Stateful operations. + +A Stateful operation in DPDK compression means application invokes enqueue +burst() multiple times to process related chunk of data because +application broke data into several ops. + +In such case +- ops are setup with op_type RTE_COMP_OP_STATEFUL, +- all ops except last set to flush value = RTE_COMP_NO/SYNC_FLUSH +and last set to flush value RTE_COMP_FULL/FINAL_FLUSH. + +In case of either one or all of the above conditions, PMD initiates +stateful processing and releases acquired resources after processing +operation with flush value = RTE_COMP_FLUSH_FULL/FINAL is complete. +Unlike stateless, application can enqueue only one stateful op from +a particular stream at a time and must attach stream handle +to each op. + +Stream in Stateful operation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +`stream` in DPDK compression is a logical entity which identifies related set of ops, say, a one large +file broken into multiple chunks then file is represented by a stream and each chunk of that file is +represented by compression op `rte_comp_op`. Whenever application wants a stateful processing of such +data, then it must get a stream handle via making call to ``rte_comp_stream_create()`` +with xform, at an output the target PMD will return an opaque stream handle to application which +it must attach to all of the ops carrying data of that stream. In stateful processing, every op +requires previous op data for compression/decompression. A PMD allocates and set up resources such +as history, states, etc. within a stream, which are maintained during the processing of the related ops. + +Unlike priv_xforms, stream is always a NON_SHAREABLE entity. One stream handle must be attached to only +one set of related ops and cannot be reused until all of them are processed with status Success or failure. + +.. figure:: img/stateful-op.* + + Stateful Ops + + +Application should call ``rte_comp_stream_create()`` and attach to op before +enqueuing them for processing and free via ``rte_comp_stream_free()`` during +termination. All ops that are to be processed statefully should carry *same* stream. + +See *DPDK API Reference* document for details. + +An example pseudocode to set up and process a stream having NUM_CHUNKS with each chunk size of CHUNK_LEN would look like: + +.. code-block:: c + + /* + * pseudocode for stateful compression + */ + + uint8_t cdev_id = rte_compdev_get_dev_id(); + + /* configure the device. 
*/ + if (rte_compressdev_configure(cdev_id, &conf) < 0) + rte_exit(EXIT_FAILURE, "Failed to configure compressdev %u", cdev_id); + + if (rte_compressdev_queue_pair_setup(cdev_id, 0, NUM_MAX_INFLIGHT_OPS, + socket_id()) < 0) + rte_exit(EXIT_FAILURE, "Failed to setup queue pair\n"); + + if (rte_compressdev_start(cdev_id) < 0) + rte_exit(EXIT_FAILURE, "Failed to start device\n"); + + /* setup compress transform. */ + struct rte_compress_compress_xform compress_xform = { + .type = RTE_COMP_COMPRESS, + .compress = { + .algo = RTE_COMP_ALGO_DEFLATE, + .deflate = { + .huffman = RTE_COMP_HUFFMAN_DEFAULT + }, + .level = RTE_COMP_LEVEL_PMD_DEFAULT, + .chksum = RTE_COMP_CHECKSUM_NONE, + .window_size = DEFAULT_WINDOW_SIZE, + .hash_algo = RTE_COMP_HASH_ALGO_NONE + } + }; + + /* create stream */ + rte_comp_stream_create(cdev_id, &compress_xform, &stream); + + /* create an op pool and allocate ops */ + rte_comp_op_bulk_alloc(op_pool, comp_ops, NUM_CHUNKS); + + /* Prepare source and destination mbufs for compression operations */ + unsigned int i; + for (i = 0; i < NUM_CHUNKS; i++) { + if (rte_pktmbuf_append(mbufs[i], CHUNK_LEN) == NULL) + rte_exit(EXIT_FAILURE, "Not enough room in the mbuf\n"); + comp_ops[i]->m_src = mbufs[i]; + if (rte_pktmbuf_append(dst_mbufs[i], CHUNK_LEN) == NULL) + rte_exit(EXIT_FAILURE, "Not enough room in the mbuf\n"); + comp_ops[i]->m_dst = dst_mbufs[i]; + } + + /* Set up the compress operations. */ + for (i = 0; i < NUM_CHUNKS; i++) { + struct rte_comp_op *op = comp_ops[i]; + op->stream = stream; + op->m_src = src_buf[i]; + op->m_dst = dst_buf[i]; + op->type = RTE_COMP_OP_STATEFUL; + if(i == NUM_CHUNKS-1) { + /* set to final, if last chunk*/ + op->flush = RTE_COMP_FLUSH_FINAL; + } else { + /* set to NONE, for all intermediary ops */ + op->flush = RTE_COMP_FLUSH_NONE; + } + op->src.offset = 0; + op->dst.offset = 0; + op->src.length = CHUNK_LEN; + op->input_chksum = 0; + num_enqd = rte_compressdev_enqueue_burst(cdev_id, 0, &op[i], 1); + /* wait for this to complete before enqueing next*/ + do { + num_deqd = rte_compressdev_dequeue_burst(cdev_id, 0 , &processed_ops, 1); + } while (num_deqd < num_enqd); + /* push next op*/ + } + + +Stateful and OUT_OF_SPACE +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If PMD supports stateful operation, then OUT_OF_SPACE status is not an actual +error for the PMD. In such case, PMD returns with status +RTE_COMP_OP_STATUS_OUT_OF_SPACE_RECOVERABLE with consumed = number of input bytes +read and produced = length of complete output buffer. +Application should enqueue next op with source starting at consumed+1 and an +output buffer with available space. + +Hash in Stateful +~~~~~~~~~~~~~~~~ +If enabled, digest buffer will contain valid digest after last op in stream +(having flush = RTE_COMP_OP_FLUSH_FINAL) is successfully processed i.e. dequeued +with status = RTE_COMP_OP_STATUS_SUCCESS. + +Checksum in Stateful +~~~~~~~~~~~~~~~~~~~~ +If enabled, checksum will only be available after last op in stream +(having flush = RTE_COMP_OP_FLUSH_FINAL) is successfully processed i.e. dequeued +with status = RTE_COMP_OP_STATUS_SUCCESS. + +Burst in compression API +------------------------- + +Scheduling of compression operations on DPDK's application data path is +performed using a burst oriented asynchronous API set. A queue pair on a compression +device accepts a burst of compression operations using enqueue burst API. 
On physical +devices the enqueue burst API will place the operations to be processed +on the device's hardware input queue, for virtual devices the processing of the +operations is usually completed during the enqueue call to the compression +device. The dequeue burst API will retrieve any processed operations available +from the queue pair on the compression device, from physical devices this is usually +directly from the devices processed queue, and for virtual device's from a +``rte_ring`` where processed operations are place after being processed on the +enqueue call. + +A burst in DPDK compression can be a combination of stateless and stateful operations with a condition +that for stateful ops only one op at-a-time should be enqueued from a particular stream i.e. no-two ops +should belong to same stream in a single burst. However a burst may contain multiple stateful ops as long +as each op is attached to a different stream i.e. a burst can look like: + ++---------------+--------------+--------------+-----------------+--------------+--------------+ +| enqueue_burst | op1.no_flush | op2.no_flush | op3.flush_final | op4.no_flush | op5.no_flush | ++---------------+--------------+--------------+-----------------+--------------+--------------+ + +Where, op1 .. op5 all belong to different independent data units. op1, op2, op4, op5 must be stateful +as stateless ops can only use flush full or final and op3 can be of type stateless or stateful. +Every op with type set to RTE_COMP_OP_TYPE_STATELESS must be attached to priv_xform and +Every op with type set to RTE_COMP_OP_TYPE_STATEFUL *must* be attached to stream. + +Since each operation in a burst is independent and thus can be completed +out-of-order, applications which need ordering, should setup per-op user data +area with reordering information so that it can determine enqueue order at +dequeue. + +Also if multiple threads calls enqueue_burst() on same queue pair then it’s +application onus to use proper locking mechanism to ensure exclusive enqueuing +of operations. + +Enqueue / Dequeue Burst APIs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The burst enqueue API uses a compression device identifier and a queue pair +identifier to specify the compression device queue pair to schedule the processing on. +The ``nb_ops`` parameter is the number of operations to process which are +supplied in the ``ops`` array of ``rte_comp_op`` structures. +The enqueue function returns the number of operations it actually enqueued for +processing, a return value equal to ``nb_ops`` means that all packets have been +enqueued. + +The dequeue API uses the same format as the enqueue API but +the ``nb_ops`` and ``ops`` parameters are now used to specify the max processed +operations the user wishes to retrieve and the location in which to store them. +The API call returns the actual number of processed operations returned, this +can never be larger than ``nb_ops``. + +Sample code +----------- + +There are unit test applications that show how to use the compressdev library inside +test/test/test_compressdev.c + +Compression Device API +~~~~~~~~~~~~~~~~~~~~~~ + +The compressdev Library API is described in the *DPDK API Reference* document. 
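As an illustrative complement to the burst API description above, a minimal
enqueue/dequeue loop could be sketched as follows; ``cdev_id``, the ``ops`` array,
``NB_OPS`` and the error handler are assumed to be provided by the application as
described in the preceding sections.

.. code-block:: c

    /* Sketch only: cdev_id, the ops[] array and NB_OPS are assumed to be
     * prepared as described in the sections above; queue pair 0 is used.
     */
    uint16_t nb_enq, nb_deq = 0;
    uint16_t i, n;

    nb_enq = rte_compressdev_enqueue_burst(cdev_id, 0, ops, NB_OPS);

    /* Ops not accepted by the PMD (nb_enq < NB_OPS) would have to be
     * resubmitted by the application; here only the accepted ones are drained.
     */
    while (nb_deq < nb_enq) {
        struct rte_comp_op *done[NB_OPS];

        n = rte_compressdev_dequeue_burst(cdev_id, 0, done, NB_OPS);
        for (i = 0; i < n; i++) {
            if (done[i]->status != RTE_COMP_OP_STATUS_SUCCESS)
                handle_failed_op(done[i]);   /* application-defined */
        }
        nb_deq += n;
    }
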
diff --git a/doc/guides/prog_guide/cryptodev_lib.rst b/doc/guides/prog_guide/cryptodev_lib.rst index 066fe2d2..90d01e93 100644 --- a/doc/guides/prog_guide/cryptodev_lib.rst +++ b/doc/guides/prog_guide/cryptodev_lib.rst @@ -8,7 +8,7 @@ The cryptodev library provides a Crypto device framework for management and provisioning of hardware and software Crypto poll mode drivers, defining generic APIs which support a number of different Crypto operations. The framework currently only supports cipher, authentication, chained cipher/authentication -and AEAD symmetric Crypto operations. +and AEAD symmetric and asymmetric Crypto operations. Design Principles @@ -41,7 +41,7 @@ From the command line using the --vdev EAL option .. code-block:: console - --vdev 'crypto_aesni_mb0,max_nb_queue_pairs=2,max_nb_sessions=1024,socket_id=0' + --vdev 'crypto_aesni_mb0,max_nb_queue_pairs=2,socket_id=0' .. Note:: @@ -57,12 +57,11 @@ Our using the rte_vdev_init API within the application code. .. code-block:: c rte_vdev_init("crypto_aesni_mb", - "max_nb_queue_pairs=2,max_nb_sessions=1024,socket_id=0") + "max_nb_queue_pairs=2,socket_id=0") All virtual Crypto devices support the following initialization parameters: * ``max_nb_queue_pairs`` - maximum number of queue pairs supported by the device. -* ``max_nb_sessions`` - maximum number of sessions supported by the device * ``socket_id`` - socket on which to allocate the device resources on. @@ -159,8 +158,8 @@ Device Features and Capabilities Crypto devices define their functionality through two mechanisms, global device features and algorithm capabilities. Global devices features identify device wide level features which are applicable to the whole device such as -the device having hardware acceleration or supporting symmetric Crypto -operations, +the device having hardware acceleration or supporting symmetric and/or asymmetric +Crypto operations. The capabilities mechanism defines the individual algorithms/functions which the device supports, such as a specific symmetric Crypto cipher, @@ -269,7 +268,7 @@ relevant information for the device. struct rte_cryptodev_info { const char *driver_name; uint8_t driver_id; - struct rte_pci_device *pci_dev; + struct rte_device *device; uint64_t feature_flags; @@ -299,6 +298,33 @@ directly from the devices processed queue, and for virtual device's from a enqueue call. +Private data +~~~~~~~~~~~~ +For session-based operations, the set and get API provides a mechanism for an +application to store and retrieve the private user data information stored along +with the crypto session. + +For example, suppose an application is submitting a crypto operation with a session +associated and wants to indicate private user data information which is required to be +used after completion of the crypto operation. In this case, the application can use +the set API to set the user data and retrieve it using get API. + +.. code-block:: c + + int rte_cryptodev_sym_session_set_user_data( + struct rte_cryptodev_sym_session *sess, void *data, uint16_t size); + + void * rte_cryptodev_sym_session_get_user_data( + struct rte_cryptodev_sym_session *sess); + + +For session-less mode, the private user data information can be placed along with the +``struct rte_crypto_op``. The ``rte_crypto_op::private_data_offset`` indicates the +start of private data information. The offset is counted from the start of the +rte_crypto_op including other crypto information such as the IVs (since there can +be an IV also for authentication). 
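A minimal usage sketch of the session user data API described above is shown below;
the ``app_ctx`` type and the ``sess`` session pointer are hypothetical and assumed to be
created by the application.

.. code-block:: c

    /* Sketch only: the app_ctx type and the sess session pointer are
     * hypothetical and assumed to be defined/created by the application.
     */
    struct app_ctx {
        uint32_t flow_id;
    } ctx = { .flow_id = 7 };

    /* Store application data alongside the session before submitting ops. */
    if (rte_cryptodev_sym_session_set_user_data(sess, &ctx, sizeof(ctx)) < 0)
        rte_exit(EXIT_FAILURE, "Could not set session user data\n");

    /* ... later, e.g. after dequeuing an op that used this session ... */
    struct app_ctx *p = rte_cryptodev_sym_session_get_user_data(sess);
    if (p != NULL)
        printf("completed op for flow %u\n", p->flow_id);
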
+ + Enqueue / Dequeue Burst APIs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -419,7 +445,7 @@ Crypto workloads. .. figure:: img/cryptodev_sym_sess.* -The Crypto device framework provides APIs to allocate and initizalize sessions +The Crypto device framework provides APIs to allocate and initialize sessions for crypto devices, where sessions are mempool objects. It is the application's responsibility to create and manage the session mempools. This approach allows for different scenarios such as having a single session @@ -427,12 +453,12 @@ mempool for all crypto devices (where the mempool object size is big enough to hold the private session of any crypto device), as well as having multiple session mempools of different sizes for better memory usage. -An application can use ``rte_cryptodev_get_private_session_size()`` to +An application can use ``rte_cryptodev_sym_get_private_session_size()`` to get the private session size of given crypto device. This function would allow an application to calculate the max device session size of all crypto devices to create a single session mempool. If instead an application creates multiple session mempools, the Crypto device -framework also provides ``rte_cryptodev_get_header_session_size`` to get +framework also provides ``rte_cryptodev_sym_get_header_session_size`` to get the size of an uninitialized session. Once the session mempools have been created, ``rte_cryptodev_sym_session_create()`` @@ -635,7 +661,7 @@ using one of the crypto PMDs available in DPDK. uint8_t cdev_id = rte_cryptodev_get_dev_id(crypto_name); /* Get private session data size. */ - session_size = rte_cryptodev_get_private_session_size(cdev_id); + session_size = rte_cryptodev_sym_get_private_session_size(cdev_id); /* * Create session mempool, with two objects per session, @@ -761,14 +787,260 @@ using one of the crypto PMDs available in DPDK. num_dequeued_ops); } while (total_num_dequeued_ops < num_enqueued_ops); - Asymmetric Cryptography ----------------------- -Asymmetric functionality is currently not supported by the cryptodev API. +The cryptodev library currently provides support for the following asymmetric +Crypto operations; RSA, Modular exponentiation and inversion, Diffie-Hellman +public and/or private key generation and shared secret compute, DSA Signature +generation and verification. + +Session and Session Management +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Sessions are used in asymmetric cryptographic processing to store the immutable +data defined in asymmetric cryptographic transform which is further used in the +operation processing. Sessions typically stores information, such as, public +and private key information or domain params or prime modulus data i.e. immutable +across data sets. Crypto sessions cache this immutable data in a optimal way for the +underlying PMD and this allows further acceleration of the offload of Crypto workloads. + +Like symmetric, the Crypto device framework provides APIs to allocate and initialize +asymmetric sessions for crypto devices, where sessions are mempool objects. +It is the application's responsibility to create and manage the session mempools. +Application using both symmetric and asymmetric sessions should allocate and maintain +different sessions pools for each type. + +An application can use ``rte_cryptodev_get_asym_session_private_size()`` to +get the private size of asymmetric session on a given crypto device. 
This +function would allow an application to calculate the max device asymmetric +session size of all crypto devices to create a single session mempool. +If instead an application creates multiple asymmetric session mempools, +the Crypto device framework also provides ``rte_cryptodev_asym_get_header_session_size()`` to get +the size of an uninitialized session. + +Once the session mempools have been created, ``rte_cryptodev_asym_session_create()`` +is used to allocate an uninitialized asymmetric session from the given mempool. +The session then must be initialized using ``rte_cryptodev_asym_session_init()`` +for each of the required crypto devices. An asymmetric transform chain +is used to specify the operation and its parameters. See the section below for +details on transforms. + +When a session is no longer used, user must call ``rte_cryptodev_asym_session_clear()`` +for each of the crypto devices that are using the session, to free all driver +private asymmetric session data. Once this is done, session should be freed using +``rte_cryptodev_asym_session_free()`` which returns them to their mempool. + +Asymmetric Sessionless Support +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Currently asymmetric crypto framework does not support sessionless. + +Transforms and Transform Chaining +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Asymmetric Crypto transforms (``rte_crypto_asym_xform``) are the mechanism used +to specify the details of the asymmetric Crypto operation. Next pointer within +xform allows transform to be chained together. Also it is important to note that +the order in which the transforms are passed indicates the order of the chaining. + +Not all asymmetric crypto xforms are supported for chaining. Currently supported +asymmetric crypto chaining is Diffie-Hellman private key generation followed by +public generation. Also, currently API does not support chaining of symmetric and +asymmetric crypto xfroms. + +Each xform defines specific asymmetric crypto algo. Currently supported are: +* RSA +* Modular operations (Exponentiation and Inverse) +* Diffie-Hellman +* DSA +* None - special case where PMD may support a passthrough mode. More for diagnostic purpose + +See *DPDK API Reference* for details on each rte_crypto_xxx_xform struct + +Asymmetric Operations +~~~~~~~~~~~~~~~~~~~~~ + +The asymmetric Crypto operation structure contains all the mutable data relating +to asymmetric cryptographic processing on an input data buffer. It uses either +RSA, Modular, Diffie-Hellman or DSA operations depending upon session it is attached +to. + +Every operation must carry a valid session handle which further carries information +on xform or xform-chain to be performed on op. Every xform type defines its own set +of operational params in their respective rte_crypto_xxx_op_param struct. Depending +on xform information within session, PMD picks up and process respective op_param +struct. +Unlike symmetric, asymmetric operations do not use mbufs for input/output. +They operate on data buffer of type ``rte_crypto_param``. + +See *DPDK API Reference* for details on each rte_crypto_xxx_op_param struct + +Asymmetric crypto Sample code +----------------------------- + +There's a unit test application test_cryptodev_asym.c inside unit test framework that +show how to setup and process asymmetric operations using cryptodev library. 
+ +The following sample code shows the basic steps to compute modular exponentiation +using 1024-bit modulus length using openssl PMD available in DPDK (performing other +crypto operations is similar except change to respective op and xform setup). + +.. code-block:: c + + /* + * Simple example to compute modular exponentiation with 1024-bit key + * + */ + #define MAX_ASYM_SESSIONS 10 + #define NUM_ASYM_BUFS 10 + + struct rte_mempool *crypto_op_pool, *asym_session_pool; + unsigned int asym_session_size; + int ret; + + /* Initialize EAL. */ + ret = rte_eal_init(argc, argv); + if (ret < 0) + rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n"); + + uint8_t socket_id = rte_socket_id(); + + /* Create crypto operation pool. */ + crypto_op_pool = rte_crypto_op_pool_create( + "crypto_op_pool", + RTE_CRYPTO_OP_TYPE_ASYMMETRIC, + NUM_ASYM_BUFS, 0, 0, + socket_id); + if (crypto_op_pool == NULL) + rte_exit(EXIT_FAILURE, "Cannot create crypto op pool\n"); + /* Create the virtual crypto device. */ + char args[128]; + const char *crypto_name = "crypto_openssl"; + snprintf(args, sizeof(args), "socket_id=%d", socket_id); + ret = rte_vdev_init(crypto_name, args); + if (ret != 0) + rte_exit(EXIT_FAILURE, "Cannot create virtual device"); -Crypto Device API -~~~~~~~~~~~~~~~~~ + uint8_t cdev_id = rte_cryptodev_get_dev_id(crypto_name); + + /* Get private asym session data size. */ + asym_session_size = rte_cryptodev_get_asym_private_session_size(cdev_id); + + /* + * Create session mempool, with two objects per session, + * one for the session header and another one for the + * private asym session data for the crypto device. + */ + asym_session_pool = rte_mempool_create("asym_session_pool", + MAX_ASYM_SESSIONS * 2, + asym_session_size, + 0, + 0, NULL, NULL, NULL, + NULL, socket_id, + 0); + + /* Configure the crypto device. */ + struct rte_cryptodev_config conf = { + .nb_queue_pairs = 1, + .socket_id = socket_id + }; + struct rte_cryptodev_qp_conf qp_conf = { + .nb_descriptors = 2048 + }; + + if (rte_cryptodev_configure(cdev_id, &conf) < 0) + rte_exit(EXIT_FAILURE, "Failed to configure cryptodev %u", cdev_id); + + if (rte_cryptodev_queue_pair_setup(cdev_id, 0, &qp_conf, + socket_id, asym_session_pool) < 0) + rte_exit(EXIT_FAILURE, "Failed to setup queue pair\n"); + + if (rte_cryptodev_start(cdev_id) < 0) + rte_exit(EXIT_FAILURE, "Failed to start device\n"); + + /* Setup crypto xform to do modular exponentiation with 1024 bit + * length modulus + */ + struct rte_crypto_asym_xform modex_xform = { + .next = NULL, + .xform_type = RTE_CRYPTO_ASYM_XFORM_MODEX, + .modex = { + .modulus = { + .data = + (uint8_t *) + ("\xb3\xa1\xaf\xb7\x13\x08\x00\x0a\x35\xdc\x2b\x20\x8d" + "\xa1\xb5\xce\x47\x8a\xc3\x80\xf4\x7d\x4a\xa2\x62\xfd\x61\x7f" + "\xb5\xa8\xde\x0a\x17\x97\xa0\xbf\xdf\x56\x5a\x3d\x51\x56\x4f" + "\x70\x70\x3f\x63\x6a\x44\x5b\xad\x84\x0d\x3f\x27\x6e\x3b\x34" + "\x91\x60\x14\xb9\xaa\x72\xfd\xa3\x64\xd2\x03\xa7\x53\x87\x9e" + "\x88\x0b\xc1\x14\x93\x1a\x62\xff\xb1\x5d\x74\xcd\x59\x63\x18" + "\x11\x3d\x4f\xba\x75\xd4\x33\x4e\x23\x6b\x7b\x57\x44\xe1\xd3" + "\x03\x13\xa6\xf0\x8b\x60\xb0\x9e\xee\x75\x08\x9d\x71\x63\x13" + "\xcb\xa6\x81\x92\x14\x03\x22\x2d\xde\x55"), + .length = 128 + }, + .exponent = { + .data = (uint8_t *)("\x01\x00\x01"), + .length = 3 + } + } + }; + /* Create asym crypto session and initialize it for the crypto device. 
*/ + struct rte_cryptodev_asym_session *asym_session; + asym_session = rte_cryptodev_asym_session_create(asym_session_pool); + if (asym_session == NULL) + rte_exit(EXIT_FAILURE, "Session could not be created\n"); + + if (rte_cryptodev_asym_session_init(cdev_id, asym_session, + &modex_xform, asym_session_pool) < 0) + rte_exit(EXIT_FAILURE, "Session could not be initialized " + "for the crypto device\n"); + + /* Get a burst of crypto operations. */ + struct rte_crypto_op *crypto_ops[1]; + if (rte_crypto_op_bulk_alloc(crypto_op_pool, + RTE_CRYPTO_OP_TYPE_ASYMMETRIC, + crypto_ops, 1) == 0) + rte_exit(EXIT_FAILURE, "Not enough crypto operations available\n"); + + /* Set up the crypto operations. */ + struct rte_crypto_asym_op *asym_op = crypto_ops[0]->asym; + + /* calculate mod exp of value 0xf8 */ + static unsigned char base[] = {0xF8}; + asym_op->modex.base.data = base; + asym_op->modex.base.length = sizeof(base); + asym_op->modex.base.iova = base; + + /* Attach the asym crypto session to the operation */ + rte_crypto_op_attach_asym_session(op, asym_session); + + /* Enqueue the crypto operations in the crypto device. */ + uint16_t num_enqueued_ops = rte_cryptodev_enqueue_burst(cdev_id, 0, + crypto_ops, 1); + + /* + * Dequeue the crypto operations until all the operations + * are processed in the crypto device. + */ + uint16_t num_dequeued_ops, total_num_dequeued_ops = 0; + do { + struct rte_crypto_op *dequeued_ops[1]; + num_dequeued_ops = rte_cryptodev_dequeue_burst(cdev_id, 0, + dequeued_ops, 1); + total_num_dequeued_ops += num_dequeued_ops; + + /* Check if operation was processed successfully */ + if (dequeued_ops[0]->status != RTE_CRYPTO_OP_STATUS_SUCCESS) + rte_exit(EXIT_FAILURE, + "Some operations were not processed correctly"); + + } while (total_num_dequeued_ops < num_enqueued_ops); + + +Asymmetric Crypto Device API +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The cryptodev Library API is described in the *DPDK API Reference* document. +The cryptodev Library API is described in the +`DPDK API Reference `_ diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst index 9a2fab1e..d362c920 100644 --- a/doc/guides/prog_guide/env_abstraction_layer.rst +++ b/doc/guides/prog_guide/env_abstraction_layer.rst @@ -94,10 +94,125 @@ The allocation of large contiguous physical memory is done using the hugetlbfs k The EAL provides an API to reserve named memory zones in this contiguous memory. The physical address of the reserved memory for that memory zone is also returned to the user by the memory zone reservation API. +There are two modes in which DPDK memory subsystem can operate: dynamic mode, +and legacy mode. Both modes are explained below. + .. note:: Memory reservations done using the APIs provided by rte_malloc are also backed by pages from the hugetlbfs filesystem. ++ Dynamic memory mode + +Currently, this mode is only supported on Linux. + +In this mode, usage of hugepages by DPDK application will grow and shrink based +on application's requests. Any memory allocation through ``rte_malloc()``, +``rte_memzone_reserve()`` or other methods, can potentially result in more +hugepages being reserved from the system. Similarly, any memory deallocation can +potentially result in hugepages being released back to the system. + +Memory allocated in this mode is not guaranteed to be IOVA-contiguous. 
If large +chunks of IOVA-contiguous are required (with "large" defined as "more than one +page"), it is recommended to either use VFIO driver for all physical devices (so +that IOVA and VA addresses can be the same, thereby bypassing physical addresses +entirely), or use legacy memory mode. + +For chunks of memory which must be IOVA-contiguous, it is recommended to use +``rte_memzone_reserve()`` function with ``RTE_MEMZONE_IOVA_CONTIG`` flag +specified. This way, memory allocator will ensure that, whatever memory mode is +in use, either reserved memory will satisfy the requirements, or the allocation +will fail. + +There is no need to preallocate any memory at startup using ``-m`` or +``--socket-mem`` command-line parameters, however it is still possible to do so, +in which case preallocate memory will be "pinned" (i.e. will never be released +by the application back to the system). It will be possible to allocate more +hugepages, and deallocate those, but any preallocated pages will not be freed. +If neither ``-m`` nor ``--socket-mem`` were specified, no memory will be +preallocated, and all memory will be allocated at runtime, as needed. + +Another available option to use in dynamic memory mode is +``--single-file-segments`` command-line option. This option will put pages in +single files (per memseg list), as opposed to creating a file per page. This is +normally not needed, but can be useful for use cases like userspace vhost, where +there is limited number of page file descriptors that can be passed to VirtIO. + +If the application (or DPDK-internal code, such as device drivers) wishes to +receive notifications about newly allocated memory, it is possible to register +for memory event callbacks via ``rte_mem_event_callback_register()`` function. +This will call a callback function any time DPDK's memory map has changed. + +If the application (or DPDK-internal code, such as device drivers) wishes to be +notified about memory allocations above specified threshold (and have a chance +to deny them), allocation validator callbacks are also available via +``rte_mem_alloc_validator_callback_register()`` function. + +A default validator callback is provided by EAL, which can be enabled with a +``--socket-limit`` command-line option, for a simple way to limit maximum amount +of memory that can be used by DPDK application. + +.. note:: + + In multiprocess scenario, all related processes (i.e. primary process, and + secondary processes running with the same prefix) must be in the same memory + modes. That is, if primary process is run in dynamic memory mode, all of its + secondary processes must be run in the same mode. The same is applicable to + ``--single-file-segments`` command-line option - both primary and secondary + processes must shared this mode. + ++ Legacy memory mode + +This mode is enabled by specifying ``--legacy-mem`` command-line switch to the +EAL. This switch will have no effect on FreeBSD as FreeBSD only supports +legacy mode anyway. + +This mode mimics historical behavior of EAL. That is, EAL will reserve all +memory at startup, sort all memory into large IOVA-contiguous chunks, and will +not allow acquiring or releasing hugepages from the system at runtime. + +If neither ``-m`` nor ``--socket-mem`` were specified, the entire available +hugepage memory will be preallocated. + ++ 32-bit support + +Additional restrictions are present when running in 32-bit mode. 
In dynamic +memory mode, by default maximum of 2 gigabytes of VA space will be preallocated, +and all of it will be on master lcore NUMA node unless ``--socket-mem`` flag is +used. + +In legacy mode, VA space will only be preallocated for segments that were +requested (plus padding, to keep IOVA-contiguousness). + ++ Maximum amount of memory + +All possible virtual memory space that can ever be used for hugepage mapping in +a DPDK process is preallocated at startup, thereby placing an upper limit on how +much memory a DPDK application can have. DPDK memory is stored in segment lists, +each segment is strictly one physical page. It is possible to change the amount +of virtual memory being preallocated at startup by editing the following config +variables: + +* ``CONFIG_RTE_MAX_MEMSEG_LISTS`` controls how many segment lists can DPDK have +* ``CONFIG_RTE_MAX_MEM_MB_PER_LIST`` controls how much megabytes of memory each + segment list can address +* ``CONFIG_RTE_MAX_MEMSEG_PER_LIST`` controls how many segments each segment can + have +* ``CONFIG_RTE_MAX_MEMSEG_PER_TYPE`` controls how many segments each memory type + can have (where "type" is defined as "page size + NUMA node" combination) +* ``CONFIG_RTE_MAX_MEM_MB_PER_TYPE`` controls how much megabytes of memory each + memory type can address +* ``CONFIG_RTE_MAX_MEM_MB`` places a global maximum on the amount of memory + DPDK can reserve + +Normally, these options do not need to be changed. + +.. note:: + + Preallocated virtual memory is not to be confused with preallocated hugepage + memory! All DPDK processes preallocate virtual memory at startup. Hugepages + can later be mapped into that preallocated VA space (if dynamic memory mode + is enabled), and can optionally be mapped into it at startup. + PCI Access ~~~~~~~~~~ @@ -211,7 +326,7 @@ Memory Segments and Memory Zones (memzone) The mapping of physical memory is provided by this feature in the EAL. As physical memory can have gaps, the memory is described in a table of descriptors, -and each descriptor (called rte_memseg ) describes a contiguous portion of memory. +and each descriptor (called rte_memseg ) describes a physical page. On top of this, the memzone allocator's role is to reserve contiguous portions of physical memory. These zones are identified by a unique name when the memory is reserved. @@ -225,6 +340,9 @@ Memory zones can be reserved with specific start address alignment by supplying The alignment value should be a power of two and not less than the cache line size (64 bytes). Memory zones can also be reserved from either 2 MB or 1 GB hugepages, provided that both are available on the system. +Both memsegs and memzones are stored using ``rte_fbarray`` structures. Please +refer to *DPDK API Reference* for more information. + Multiple pthread ---------------- @@ -331,17 +449,21 @@ Known Issues Bypassing this constraint may cause the 2nd pthread to spin until the 1st one is scheduled again. Moreover, if the 1st pthread is preempted by a context that has an higher priority, it may even cause a dead lock. - This does not mean it cannot be used, simply, there is a need to narrow down the situation when it is used by multi-pthread on the same core. + This means, use cases involving preemptible pthreads should consider using rte_ring carefully. + + 1. It CAN be used for preemptible single-producer and single-consumer use case. - 1. It CAN be used for any single-producer or single-consumer situation. + 2. 
It CAN be used for non-preemptible multi-producer and preemptible single-consumer use case. - 2. It MAY be used by multi-producer/consumer pthread whose scheduling policy are all SCHED_OTHER(cfs). User SHOULD be aware of the performance penalty before using it. + 3. It CAN be used for preemptible single-producer and non-preemptible multi-consumer use case. - 3. It MUST not be used by multi-producer/consumer pthreads, whose scheduling policies are SCHED_FIFO or SCHED_RR. + 4. It MAY be used by preemptible multi-producer and/or preemptible multi-consumer pthreads whose scheduling policy are all SCHED_OTHER(cfs), SCHED_IDLE or SCHED_BATCH. User SHOULD be aware of the performance penalty before using it. + + 5. It MUST not be used by multi-producer/consumer pthreads, whose scheduling policies are SCHED_FIFO or SCHED_RR. + rte_timer - Running ``rte_timer_manager()`` on a non-EAL pthread is not allowed. However, resetting/stopping the timer from a non-EAL pthread is allowed. + Running ``rte_timer_manage()`` on a non-EAL pthread is not allowed. However, resetting/stopping the timer from a non-EAL pthread is allowed. + rte_log @@ -453,11 +575,9 @@ The key fields of the heap structure and their function are described below * free_head - this points to the first element in the list of free nodes for this malloc heap. -.. note:: +* first - this points to the first element in the heap. - The malloc_heap structure does not keep track of in-use blocks of memory, - since these are never touched except when they are to be freed again - - at which point the pointer to the block is an input to the free() function. +* last - this points to the last element in the heap. .. _figure_malloc_heap: @@ -473,16 +593,21 @@ Structure: malloc_elem The malloc_elem structure is used as a generic header structure for various blocks of memory. -It is used in three different ways - all shown in the diagram above: +It is used in two different ways - all shown in the diagram above: #. As a header on a block of free or allocated memory - normal case #. As a padding header inside a block of memory -#. As an end-of-memseg marker - The most important fields in the structure and how they are used are described below. +Malloc heap is a doubly-linked list, where each element keeps track of its +previous and next elements. Due to the fact that hugepage memory can come and +go, neighbouring malloc elements may not necessarily be adjacent in memory. +Also, since a malloc element may span multiple pages, its contents may not +necessarily be IOVA-contiguous either - each malloc element is only guaranteed +to be virtually contiguous. + .. note:: If the usage of a particular field in one of the above three usages is not @@ -495,13 +620,20 @@ The most important fields in the structure and how they are used are described b It is used for normal memory blocks when they are being freed, to add the newly-freed block to the heap's free-list. -* prev - this pointer points to the header element/block in the memseg - immediately behind the current one. When freeing a block, this pointer is - used to reference the previous block to check if that block is also free. - If so, then the two free blocks are merged to form a single larger block. +* prev - this pointer points to previous header element/block in memory. When + freeing a block, this pointer is used to reference the previous block to + check if that block is also free. 
If so, and the two blocks are immediately + adjacent to each other, then the two free blocks are merged to form a single + larger block. -* next_free - this pointer is used to chain the free-list of unallocated - memory blocks together. +* next - this pointer points to next header element/block in memory. When + freeing a block, this pointer is used to reference the next block to check + if that block is also free. If so, and the two blocks are immediately + adjacent to each other, then the two free blocks are merged to form a single + larger block. + +* free_list - this is a structure pointing to previous and next elements in + this heap's free list. It is only used in normal memory blocks; on ``malloc()`` to find a suitable free block to allocate and on ``free()`` to add the newly freed element to the free-list. @@ -515,9 +647,6 @@ The most important fields in the structure and how they are used are described b constraints. In that case, the pad header is used to locate the actual malloc element header for the block. - For the end-of-memseg structure, this is always a ``BUSY`` value, which - ensures that no element, on being freed, searches beyond the end of the - memseg for other blocks to merge with into a larger free area. * pad - this holds the length of the padding present at the start of the block. In the case of a normal block header, it is added to the address of the end @@ -528,22 +657,19 @@ The most important fields in the structure and how they are used are described b actual block header. * size - the size of the data block, including the header itself. - For end-of-memseg structures, this size is given as zero, though it is never - actually checked. - For normal blocks which are being freed, this size value is used in place of - a "next" pointer to identify the location of the next block of memory that - in the case of being ``FREE``, the two free blocks can be merged into one. Memory Allocation ^^^^^^^^^^^^^^^^^ -On EAL initialization, all memsegs are setup as part of the malloc heap. -This setup involves placing a dummy structure at the end with ``BUSY`` state, -which may contain a sentinel value if ``CONFIG_RTE_MALLOC_DEBUG`` is enabled, -and a proper :ref:`element header` with ``FREE`` at the start -for each memseg. +On EAL initialization, all preallocated memory segments are setup as part of the +malloc heap. This setup involves placing an :ref:`element header` +with ``FREE`` at the start of each virtually contiguous segment of memory. The ``FREE`` element is then added to the ``free_list`` for the malloc heap. +This setup also happens whenever memory is allocated at runtime (if supported), +in which case newly allocated pages are also added to the heap, merging with any +adjacent free segments if there are any. + When an application makes a call to a malloc-like function, the malloc function will first index the ``lcore_config`` structure for the calling thread, and determine the NUMA node of that thread. @@ -574,8 +700,34 @@ the start and/or end of the element, resulting in the following behavior: The advantage of allocating the memory from the end of the existing element is that no adjustment of the free list needs to take place - the existing element -on the free list just has its size pointer adjusted, and the following element -has its "prev" pointer redirected to the newly created element. +on the free list just has its size value adjusted, and the next/previous elements +have their "prev"/"next" pointers redirected to the newly created element. 
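To make the end-of-element allocation concrete, below is a simplified, self-contained sketch of carving a block off the end of a free element. The ``struct elem`` type, its fields and the helper name are hypothetical stand-ins for the real ``malloc_elem`` code in ``librte_eal``; the sketch only illustrates why the free list itself does not need to be modified.

.. code-block:: c

    /* Simplified illustration; not the actual librte_eal implementation. */
    #include <stddef.h>
    #include <stdint.h>

    #define ELEM_FREE 0
    #define ELEM_BUSY 1

    struct elem {
        struct elem *prev;   /* previous element in memory */
        struct elem *next;   /* next element in memory */
        size_t size;         /* size of this element, header included */
        int state;           /* ELEM_FREE or ELEM_BUSY */
    };

    /* Carve 'alloc_size' bytes (header included) off the end of a free element. */
    static struct elem *
    split_from_end(struct elem *free_elem, size_t alloc_size)
    {
        /* The new element starts 'alloc_size' bytes before the end of
         * the free element's span.
         */
        struct elem *new_elem = (struct elem *)
            ((uintptr_t)free_elem + free_elem->size - alloc_size);

        new_elem->size = alloc_size;
        new_elem->state = ELEM_BUSY;

        /* The existing free element only shrinks; its position on the
         * free list is untouched.
         */
        free_elem->size -= alloc_size;

        /* Re-link the in-memory doubly-linked list around the new element. */
        new_elem->prev = free_elem;
        new_elem->next = free_elem->next;
        if (new_elem->next != NULL)
            new_elem->next->prev = new_elem;
        free_elem->next = new_elem;

        return new_elem;
    }

Note how only the size of the existing free element and the in-memory prev/next links change; no free-list manipulation is required.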
+ +In case when there is not enough memory in the heap to satisfy allocation +request, EAL will attempt to allocate more memory from the system (if supported) +and, following successful allocation, will retry reserving the memory again. In +a multiprocessing scenario, all primary and secondary processes will synchronize +their memory maps to ensure that any valid pointer to DPDK memory is guaranteed +to be valid at all times in all currently running processes. + +Failure to synchronize memory maps in one of the processes will cause allocation +to fail, even though some of the processes may have allocated the memory +successfully. The memory is not added to the malloc heap unless primary process +has ensured that all other processes have mapped this memory successfully. + +Any successful allocation event will trigger a callback, for which user +applications and other DPDK subsystems can register. Additionally, validation +callbacks will be triggered before allocation if the newly allocated memory will +exceed threshold set by the user, giving a chance to allow or deny allocation. + +.. note:: + + Any allocation of new pages has to go through primary process. If the + primary process is not active, no memory will be allocated even if it was + theoretically possible to do so. This is because primary's process map acts + as an authority on what should or should not be mapped, while each secondary + process has its own, local memory map. Secondary processes do not update the + shared memory map, they only copy its contents to their local memory map. Freeing Memory ^^^^^^^^^^^^^^ @@ -589,8 +741,17 @@ the pointer to get the proper element header for the entire block. From this element header, we get pointers to the heap from which the block was allocated and to where it must be freed, as well as the pointer to the previous -element, and via the size field, we can calculate the pointer to the next element. -These next and previous elements are then checked to see if they are also -``FREE``, and if so, they are merged with the current element. -This means that we can never have two ``FREE`` memory blocks adjacent to one -another, as they are always merged into a single block. +and next elements. These next and previous elements are then checked to see if +they are also ``FREE`` and are immediately adjacent to the current one, and if +so, they are merged with the current element. This means that we can never have +two ``FREE`` memory blocks adjacent to one another, as they are always merged +into a single block. + +If deallocating pages at runtime is supported, and the free element encloses +one or more pages, those pages can be deallocated and be removed from the heap. +If DPDK was started with command-line parameters for preallocating memory +(``-m`` or ``--socket-mem``), then those pages that were allocated at startup +will not be deallocated. + +Any successful deallocation event will trigger a callback, for which user +applications and other DPDK subsystems can register. diff --git a/doc/guides/prog_guide/event_crypto_adapter.rst b/doc/guides/prog_guide/event_crypto_adapter.rst new file mode 100644 index 00000000..9fe09c80 --- /dev/null +++ b/doc/guides/prog_guide/event_crypto_adapter.rst @@ -0,0 +1,296 @@ +.. SPDX-License-Identifier: BSD-3-Clause + Copyright(c) 2018 Intel Corporation. All rights reserved. + +Event Crypto Adapter Library +============================ + +The DPDK :doc:`Eventdev library ` provides event driven +programming model with features to schedule events. 
+The :doc:`Cryptodev library ` provides an interface to +the crypto poll mode drivers which supports different crypto operations. +The Event Crypto Adapter is one of the adapter which is intended to +bridge between the event device and the crypto device. + +The packet flow from crypto device to the event device can be accomplished +using SW and HW based transfer mechanism. +The Adapter queries an eventdev PMD to determine which mechanism to be used. +The adapter uses an EAL service core function for SW based packet transfer +and uses the eventdev PMD functions to configure HW based packet transfer +between the crypto device and the event device. The crypto adapter uses a new +event type called ``RTE_EVENT_TYPE_CRYPTODEV`` to indicate the event source. + +The application can choose to submit a crypto operation directly to +crypto device or send it to the crypto adapter via eventdev based on +RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_FWD capability. +The first mode is known as the event new(RTE_EVENT_CRYPTO_ADAPTER_OP_NEW) +mode and the second as the event forward(RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD) +mode. The choice of mode can be specified while creating the adapter. +In the former mode, it is an application responsibility to enable ingress +packet ordering. In the latter mode, it is the adapter responsibility to +enable the ingress packet ordering. + + +Adapter Mode +------------ + +RTE_EVENT_CRYPTO_ADAPTER_OP_NEW mode +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In the RTE_EVENT_CRYPTO_ADAPTER_OP_NEW mode, application submits crypto +operations directly to crypto device. The adapter then dequeues crypto +completions from crypto device and enqueues them as events to the event device. +This mode does not ensure ingress ordering, if the application directly +enqueues to the cryptodev without going through crypto/atomic stage. +In this mode, events dequeued from the adapter will be treated as new events. +The application needs to specify event information (response information) +which is needed to enqueue an event after the crypto operation is completed. + +.. _figure_event_crypto_adapter_op_new: + +.. figure:: img/event_crypto_adapter_op_new.* + + Working model of ``RTE_EVENT_CRYPTO_ADAPTER_OP_NEW`` mode + + +RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD mode +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In the RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD mode, if HW supports +RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_FWD capability the application +can directly submit the crypto operations to the cryptodev. +If not, application retrieves crypto adapter's event port using +rte_event_crypto_adapter_event_port_get() API. Then, links its event +queue to this port and starts enqueuing crypto operations as events +to the eventdev. The adapter then dequeues the events and submits the +crypto operations to the cryptodev. After the crypto completions, the +adapter enqueues events to the event device. +Application can use this mode, when ingress packet ordering is needed. +In this mode, events dequeued from the adapter will be treated as +forwarded events. The application needs to specify the cryptodev ID +and queue pair ID (request information) needed to enqueue a crypto +operation in addition to the event information (response information) +needed to enqueue an event after the crypto operation has completed. + +.. _figure_event_crypto_adapter_op_forward: + +.. 
figure:: img/event_crypto_adapter_op_forward.* + + Working model of ``RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD`` mode + + +API Overview +------------ + +This section has a brief introduction to the event crypto adapter APIs. +The application is expected to create an adapter which is associated with +a single eventdev, then add cryptodev and queue pair to the adapter instance. + +Create an adapter instance +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +An adapter instance is created using ``rte_event_crypto_adapter_create()``. This +function is called with event device to be associated with the adapter and port +configuration for the adapter to setup an event port(if the adapter needs to use +a service function). + +Adapter can be started in ``RTE_EVENT_CRYPTO_ADAPTER_OP_NEW`` or +``RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD`` mode. + +.. code-block:: c + + int err; + uint8_t dev_id, id; + struct rte_event_dev_info dev_info; + struct rte_event_port_conf conf; + enum rte_event_crypto_adapter_mode mode; + + err = rte_event_dev_info_get(id, &dev_info); + + conf.new_event_threshold = dev_info.max_num_events; + conf.dequeue_depth = dev_info.max_event_port_dequeue_depth; + conf.enqueue_depth = dev_info.max_event_port_enqueue_depth; + mode = RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD; + err = rte_event_crypto_adapter_create(id, dev_id, &conf, mode); + +If the application desires to have finer control of eventdev port allocation +and setup, it can use the ``rte_event_crypto_adapter_create_ext()`` function. +The ``rte_event_crypto_adapter_create_ext()`` function is passed as a callback +function. The callback function is invoked if the adapter needs to use a +service function and needs to create an event port for it. The callback is +expected to fill the ``struct rte_event_crypto_adapter_conf`` structure +passed to it. + +For RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD mode, the event port created by adapter +can be retrieved using ``rte_event_crypto_adapter_event_port_get()`` API. +Application can use this event port to link with event queue on which it +enqueues events towards the crypto adapter. + +.. code-block:: c + + uint8_t id, evdev, crypto_ev_port_id, app_qid; + struct rte_event ev; + int ret; + + ret = rte_event_crypto_adapter_event_port_get(id, &crypto_ev_port_id); + ret = rte_event_queue_setup(evdev, app_qid, NULL); + ret = rte_event_port_link(evdev, crypto_ev_port_id, &app_qid, NULL, 1); + + // Fill in event info and update event_ptr with rte_crypto_op + memset(&ev, 0, sizeof(ev)); + ev.queue_id = app_qid; + . + . + ev.event_ptr = op; + ret = rte_event_enqueue_burst(evdev, app_ev_port_id, ev, nb_events); + +Querying adapter capabilities +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``rte_event_crypto_adapter_caps_get()`` function allows +the application to query the adapter capabilities for an eventdev and cryptodev +combination. This API provides whether cryptodev and eventdev are connected using +internal HW port or not. + +.. code-block:: c + + rte_event_crypto_adapter_caps_get(dev_id, cdev_id, &cap); + +Adding queue pair to the adapter instance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Cryptodev device id and queue pair are created using cryptodev APIs. +For more information see :doc:`here `. + +.. 
code-block:: c + + struct rte_cryptodev_config conf; + struct rte_cryptodev_qp_conf qp_conf; + uint8_t cdev_id = 0; + uint16_t qp_id = 0; + + rte_cryptodev_configure(cdev_id, &conf); + rte_cryptodev_queue_pair_setup(cdev_id, qp_id, &qp_conf); + +These cryptodev id and queue pair are added to the instance using the +``rte_event_crypto_adapter_queue_pair_add()`` API. +The same is removed using ``rte_event_crypto_adapter_queue_pair_del()`` API. +If HW supports RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_QP_EV_BIND +capability, event information must be passed to the add API. + +.. code-block:: c + + uint32_t cap; + int ret; + + ret = rte_event_crypto_adapter_caps_get(id, evdev, &cap); + if (cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_QP_EV_BIND) { + struct rte_event event; + + // Fill in event information & pass it to add API + rte_event_crypto_adapter_queue_pair_add(id, cdev_id, qp_id, &event); + } else + rte_event_crypto_adapter_queue_pair_add(id, cdev_id, qp_id, NULL); + +Configure the service function +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If the adapter uses a service function, the application is required to assign +a service core to the service function as show below. + +.. code-block:: c + + uint32_t service_id; + + if (rte_event_crypto_adapter_service_id_get(id, &service_id) == 0) + rte_service_map_lcore_set(service_id, CORE_ID); + +Set event request/response information +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In the RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD mode, the application needs +to specify the cryptodev ID and queue pair ID (request information) in +addition to the event information (response information) needed to enqueue +an event after the crypto operation has completed. The request and response +information are specified in the ``struct rte_crypto_op`` private data or +session's private data. + +In the RTE_EVENT_CRYPTO_ADAPTER_OP_NEW mode, the application is required +to provide only the response information. + +The SW adapter or HW PMD uses ``rte_crypto_op::sess_type`` to +decide whether request/response data is located in the crypto session/ +crypto security session or at an offset in the ``struct rte_crypto_op``. +The ``rte_crypto_op::private_data_offset`` is used to locate the request/ +response in the ``rte_crypto_op``. + +For crypto session, ``rte_cryptodev_sym_session_set_user_data()`` API +will be used to set request/response data. The same data will be obtained +by ``rte_cryptodev_sym_session_get_user_data()`` API. The +RTE_EVENT_CRYPTO_ADAPTER_CAP_SESSION_PRIVATE_DATA capability indicates +whether HW or SW supports this feature. + +For security session, ``rte_security_session_set_private_data()`` API +will be used to set request/response data. The same data will be obtained +by ``rte_security_session_get_private_data()`` API. + +For session-less it is mandatory to place the request/response data with +the ``rte_crypto_op``. + +.. 
code-block:: c + + union rte_event_crypto_metadata m_data; + struct rte_event ev; + struct rte_crypto_op *op; + + /* Allocate & fill op structure */ + op = rte_crypto_op_alloc(); + + memset(&m_data, 0, sizeof(m_data)); + memset(&ev, 0, sizeof(ev)); + /* Fill event information and update event_ptr to rte_crypto_op */ + ev.event_ptr = op; + + if (op->sess_type == RTE_CRYPTO_OP_WITH_SESSION) { + /* Copy response information */ + rte_memcpy(&m_data.response_info, &ev, sizeof(ev)); + /* Copy request information */ + m_data.request_info.cdev_id = cdev_id; + m_data.request_info.queue_pair_id = qp_id; + /* Call set API to store private data information */ + rte_cryptodev_sym_session_set_user_data( + op->sym->session, + &m_data, + sizeof(m_data)); + } if (op->sess_type == RTE_CRYPTO_OP_SESSIONLESS) { + uint32_t len = IV_OFFSET + MAXIMUM_IV_LENGTH + + (sizeof(struct rte_crypto_sym_xform) * 2); + op->private_data_offset = len; + /* Copy response information */ + rte_memcpy(&m_data.response_info, &ev, sizeof(ev)); + /* Copy request information */ + m_data.request_info.cdev_id = cdev_id; + m_data.request_info.queue_pair_id = qp_id; + /* Store private data information along with rte_crypto_op */ + rte_memcpy(op + len, &m_data, sizeof(m_data)); + } + +Start the adapter instance +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The application calls ``rte_event_crypto_adapter_start()`` to start the adapter. +This function calls the start callbacks of the eventdev PMDs for hardware based +eventdev-cryptodev connections and ``rte_service_run_state_set()`` to enable the +service function if one exists. + +.. code-block:: c + + rte_event_crypto_adapter_start(id, mode); + +Get adapter statistics +~~~~~~~~~~~~~~~~~~~~~~ + +The ``rte_event_crypto_adapter_stats_get()`` function reports counters defined +in struct ``rte_event_crypto_adapter_stats``. The received packet and +enqueued event counts are a sum of the counts from the eventdev PMD callbacks +if the callback is supported, and the counts maintained by the service function, +if one exists. diff --git a/doc/guides/prog_guide/event_ethernet_rx_adapter.rst b/doc/guides/prog_guide/event_ethernet_rx_adapter.rst index 4ab87a37..0166bb45 100644 --- a/doc/guides/prog_guide/event_ethernet_rx_adapter.rst +++ b/doc/guides/prog_guide/event_ethernet_rx_adapter.rst @@ -12,7 +12,11 @@ be supported in hardware or require a software thread to receive packets from the ethdev port using ethdev poll mode APIs and enqueue these as events to the event device using the eventdev API. Both transfer mechanisms may be present on the same platform depending on the particular combination of the ethdev and -the event device. +the event device. For SW based packet transfer, if the mbuf does not have a +timestamp set, the adapter adds a timestamp to the mbuf using +rte_get_tsc_cycles(), this provides a more accurate timestamp as compared to +if the application were to set the timestamp since it avoids event device +schedule latency. The Event Ethernet Rx Adapter library is intended for the application code to configure both transfer mechanisms using a common API. A capability API allows @@ -140,3 +144,44 @@ enqueued event counts are a sum of the counts from the eventdev PMD callbacks if the callback is supported, and the counts maintained by the service function, if one exists. The service function also maintains a count of cycles for which it was not able to enqueue to the event device. 
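As a small illustration of the statistics described above, the sketch below reads and resets the adapter counters. The exact set of fields in ``struct rte_event_eth_rx_adapter_stats`` should be checked in the *DPDK API Reference*; ``rx_packets`` and ``rx_enq_count`` are used here on that assumption.

.. code-block:: c

    struct rte_event_eth_rx_adapter_stats stats;

    /* 'id' is the adapter identifier used when the adapter was created. */
    if (rte_event_eth_rx_adapter_stats_get(id, &stats) == 0)
        printf("rx_packets=%" PRIu64 " rx_enq_count=%" PRIu64 "\n",
               stats.rx_packets, stats.rx_enq_count);

    /* Counters can be cleared once they have been read. */
    rte_event_eth_rx_adapter_stats_reset(id);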
+ +Interrupt Based Rx Queues +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The service core function is typically set up to poll ethernet Rx queues for +packets. Certain queues may have low packet rates and it would be more +efficient to enable the Rx queue interrupt and read packets after receiving +the interrupt. + +The servicing_weight member of struct rte_event_eth_rx_adapter_queue_conf +is applicable when the adapter uses a service core function. The application +has to enable Rx queue interrupts when configuring the ethernet device +using the ``rte_eth_dev_configure()`` function and then use a servicing_weight +of zero when addding the Rx queue to the adapter. + +The adapter creates a thread blocked on the interrupt, on an interrupt this +thread enqueues the port id and the queue id to a ring buffer. The adapter +service function dequeues the port id and queue id from the ring buffer, +invokes the ``rte_eth_rx_burst()`` to receive packets on the queue and +converts the received packets to events in the same manner as packets +received on a polled Rx queue. The interrupt thread is affinitized to the same +CPUs as the lcores of the Rx adapter service function, if the Rx adapter +service function has not been mapped to any lcores, the interrupt thread +is mapped to the master lcore. + +Rx Callback for SW Rx Adapter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For SW based packet transfers, i.e., when the +``RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT`` is not set in the adapter's +capabilities flags for a particular ethernet device, the service function +temporarily enqueues mbufs to an event buffer before batch enqueueing these +to the event device. If the buffer fills up, the service function stops +dequeueing packets from the ethernet device. The application may want to +monitor the buffer fill level and instruct the service function to selectively +enqueue packets to the event device. The application may also use some other +criteria to decide which packets should enter the event device even when +the event buffer fill level is low. The +``rte_event_eth_rx_adapter_cb_register()`` function allow the application +to register a callback that selects which packets to enqueue to the event +device. diff --git a/doc/guides/prog_guide/event_timer_adapter.rst b/doc/guides/prog_guide/event_timer_adapter.rst new file mode 100644 index 00000000..7bbbdfe9 --- /dev/null +++ b/doc/guides/prog_guide/event_timer_adapter.rst @@ -0,0 +1,296 @@ +.. SPDX-License-Identifier: BSD-3-Clause + Copyright(c) 2017 Intel Corporation. All rights reserved. + +Event Timer Adapter Library +=========================== + +The DPDK +`Event Device library `_ +introduces an event driven programming model which presents applications with +an alternative to the polling model traditionally used in DPDK +applications. Event devices can be coupled with arbitrary components to provide +new event sources by using **event adapters**. The Event Timer Adapter is one +such adapter; it bridges event devices and timer mechanisms. + +The Event Timer Adapter library extends the event driven model +by introducing a :ref:`new type of event ` that represents +a timer expiration, and providing an API with which adapters can be created or +destroyed, and :ref:`event timers ` can be armed and canceled. + +The Event Timer Adapter library is designed to interface with hardware or +software implementations of the timer mechanism; it will query an eventdev PMD +to determine which implementation should be used. 
The default software +implementation manages timers using the DPDK +`Timer library `_. + +Examples of using the API are presented in the `API Overview`_ and +`Processing Timer Expiry Events`_ sections. Code samples are abstracted and +are based on the example of handling a TCP retransmission. + +.. _event_timer: + +Event Timer struct +------------------ +Event timers are timers that enqueue a timer expiration event to an event +device upon timer expiration. + +The Event Timer Adapter API represents each event timer with a generic struct, +which contains an event and user metadata. The ``rte_event_timer`` struct is +defined in ``lib/librte_event/librte_event_timer_adapter.h``. + +.. _timer_expiry_event: + +Timer Expiry Event +~~~~~~~~~~~~~~~~~~ + +The event contained by an event timer is enqueued in the event device when the +timer expires, and the event device uses the attributes below when scheduling +it: + +* ``event_queue_id`` - Application should set this to specify an event queue to + which the timer expiry event should be enqueued +* ``event_priority`` - Application can set this to indicate the priority of the + timer expiry event in the event queue relative to other events +* ``sched_type`` - Application can set this to specify the scheduling type of + the timer expiry event +* ``flow_id`` - Application can set this to indicate which flow this timer + expiry event corresponds to +* ``op`` - Will be set to ``RTE_EVENT_OP_NEW`` by the event timer adapter +* ``event_type`` - Will be set to ``RTE_EVENT_TYPE_TIMER`` by the event timer + adapter + +Timeout Ticks +~~~~~~~~~~~~~ + +The number of ticks from now in which the timer will expire. The ticks value +has a resolution (``timer_tick_ns``) that is specified in the event timer +adapter configuration. + +State +~~~~~ + +Before arming an event timer, the application should initialize its state to +RTE_EVENT_TIMER_NOT_ARMED. The event timer's state will be updated when a +request to arm or cancel it takes effect. + +If the application wishes to rearm the timer after it has expired, it should +reset the state back to RTE_EVENT_TIMER_NOT_ARMED before doing so. + +User Metadata +~~~~~~~~~~~~~ + +Memory to store user specific metadata. The event timer adapter implementation +will not modify this area. + +API Overview +------------ + +This section will introduce the reader to the event timer adapter API, showing +how to create and configure an event timer adapter and use it to manage event +timers. + +From a high level, the setup steps are: + +* rte_event_timer_adapter_create() +* rte_event_timer_adapter_start() + +And to start and stop timers: + +* rte_event_timer_arm_burst() +* rte_event_timer_cancel_burst() + +Create and Configure an Adapter Instance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To create an event timer adapter instance, initialize an +``rte_event_timer_adapter_conf`` struct with the desired values, and pass it +to ``rte_event_timer_adapter_create()``. + +.. code-block:: c + + #define NSECPERSEC 1E9 // No of ns in 1 sec + const struct rte_event_timer_adapter_conf adapter_config = { + .event_dev_id = event_dev_id, + .timer_adapter_id = 0, + .clk_src = RTE_EVENT_TIMER_ADAPTER_CPU_CLK, + .timer_tick_ns = NSECPERSEC / 10, // 100 milliseconds + .max_tmo_nsec = 180 * NSECPERSEC // 2 minutes + .nb_timers = 40000, + .timer_adapter_flags = 0, + }; + + struct rte_event_timer_adapter *adapter = NULL; + adapter = rte_event_timer_adapter_create(&adapter_config); + + if (adapter == NULL) { ... 
}; + +Before creating an instance of a timer adapter, the application should create +and configure an event device along with its event ports. Based on the event +device capability, it might require creating an additional event port to be +used by the timer adapter. If required, the +``rte_event_timer_adapter_create()`` function will use a default method to +configure an event port; it will examine the current event device +configuration, determine the next available port identifier number, and create +a new event port with a default port configuration. + +If the application desires to have finer control of event port allocation +and setup, it can use the ``rte_event_timer_adapter_create_ext()`` function. +This function is passed a callback function that will be invoked if the +adapter needs to create an event port, giving the application the opportunity +to control how it is done. + +Retrieve Event Timer Adapter Contextual Information +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The event timer adapter implementation may have constraints on tick resolution +or maximum timer expiry timeout based on the given event timer adapter or +system. In this case, the implementation may adjust the tick resolution or +maximum timeout to the best possible configuration. + +Upon successful event timer adapter creation, the application can get the +configured resolution and max timeout with +``rte_event_timer_adapter_get_info()``. This function will return an +``rte_event_timer_adapter_info`` struct, which contains the following members: + +* ``min_resolution_ns`` - Minimum timer adapter tick resolution in ns. +* ``max_tmo_ns`` - Maximum timer timeout(expiry) in ns. +* ``adapter_conf`` - Configured event timer adapter attributes + +Configuring the Service Component +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If the adapter uses a service component, the application is required to map +the service to a service core before starting the adapter: + +.. code-block:: c + + uint32_t service_id; + + if (rte_event_timer_adapter_service_id_get(adapter, &service_id) == 0) + rte_service_map_lcore_set(service_id, EVTIM_CORE_ID); + +An event timer adapter uses a service component if the event device PMD +indicates that the adapter should use a software implementation. + +Starting the Adapter Instance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The application should call ``rte_event_timer_adapter_start()`` to start +running the event timer adapter. This function calls the start entry points +defined by eventdev PMDs for hardware implementations or puts a service +component into the running state in the software implementation. + +Arming Event Timers +~~~~~~~~~~~~~~~~~~~ + +Once an event timer adapter has been started, an application can begin to +manage event timers with it. + +The application should allocate ``struct rte_event_timer`` objects from a +mempool or huge-page backed application buffers of required size. Upon +successful allocation, the application should initialize the event timer, and +then set any of the necessary event attributes described in the +`Timer Expiry Event`_ section. In the following example, assume ``conn`` +represents a TCP connection and that ``event_timer_pool`` is a mempool that +was created previously: + +.. code-block:: c + + rte_mempool_get(event_timer_pool, (void **)&conn->evtim); + if (conn->evtim == NULL) { ... } + + /* Set up the event timer. 
*/ + conn->evtim->ev.op = RTE_EVENT_OP_NEW; + conn->evtim->ev.queue_id = event_queue_id; + conn->evtim->ev.sched_type = RTE_SCHED_TYPE_ATOMIC; + conn->evtim->ev.priority = RTE_EVENT_DEV_PRIORITY_NORMAL; + conn->evtim->ev.event_type = RTE_EVENT_TYPE_TIMER; + conn->evtim->ev.event_ptr = conn; + conn->evtim->state = RTE_EVENT_TIMER_NOT_ARMED; + conn->evtim->timeout_ticks = 30; //3 sec Per RFC1122(TCP returns) + +Note that it is necessary to initialize the event timer state to +RTE_EVENT_TIMER_NOT_ARMED. Also note that we have saved a pointer to the +``conn`` object in the timer's event payload. This will allow us to locate +the connection object again once we dequeue the timer expiry event from the +event device later. As a convenience, the application may specify no value for +ev.event_ptr, and the adapter will by default set it to point at the event +timer itself. + +Now we can arm the event timer with ``rte_event_timer_arm_burst()``: + +.. code-block:: c + + ret = rte_event_timer_arm_burst(adapter, &conn->evtim, 1); + if (ret != 1) { ... } + +Once an event timer expires, the application may free it or rearm it as +necessary. If the application will rearm the timer, the state should be reset +to RTE_EVENT_TIMER_NOT_ARMED by the application before rearming it. + +Multiple Event Timers with Same Expiry Value +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In the special case that there is a set of event timers that should all expire +at the same time, the application may call +``rte_event_timer_arm_tmo_tick_burst()``, which allows the implementation to +optimize the operation if possible. + +Canceling Event Timers +~~~~~~~~~~~~~~~~~~~~~~ + +An event timer that has been armed as described in `Arming Event Timers`_ can +be canceled by calling ``rte_event_timer_cancel_burst()``: + +.. code-block:: c + + /* Ack for the previous tcp data packet has been received; + * cancel the retransmission timer + */ + rte_event_timer_cancel_burst(adapter, &conn->timer, 1); + +Processing Timer Expiry Events +------------------------------ + +Once an event timer has successfully enqueued a timer expiry event in the event +device, the application will subsequently dequeue it from the event device. +The application can use the event payload to retrieve a pointer to the object +associated with the event timer. It can then re-arm the event timer or free the +event timer object as desired: + +.. code-block:: c + + void + event_processing_loop(...) + { + while (...) { + /* Receive events from the configured event port. */ + rte_event_dequeue_burst(event_dev_id, event_port, &ev, 1, 0); + ... + switch(ev.event_type) { + ... + case RTE_EVENT_TYPE_TIMER: + process_timer_event(ev); + ... + break; + } + } + } + + uint8_t + process_timer_event(...) + { + /* A retransmission timeout for the connection has been received. */ + conn = ev.event_ptr; + /* Retransmit last packet (e.g. TCP segment). */ + ... + /* Re-arm timer using original values. */ + rte_event_timer_arm_burst(adapter_id, &conn->timer, 1); + } + +Summary +------- + +The Event Timer Adapter library extends the DPDK event-based programming model +by representing timer expirations as events in the system and allowing +applications to use existing event processing loops to arm and cancel event +timers or handle timer expiry events. diff --git a/doc/guides/prog_guide/eventdev.rst b/doc/guides/prog_guide/eventdev.rst index ce19997d..8fcae546 100644 --- a/doc/guides/prog_guide/eventdev.rst +++ b/doc/guides/prog_guide/eventdev.rst @@ -1,5 +1,6 @@ .. 
SPDX-License-Identifier: BSD-3-Clause Copyright(c) 2017 Intel Corporation. + Copyright(c) 2018 Arm Limited. Event Device Library ==================== @@ -129,8 +130,10 @@ API Walk-through This section will introduce the reader to the eventdev API, showing how to create and configure an eventdev and use it for a two-stage atomic pipeline -with a single core for TX. The diagram below shows the final state of the -application after this walk-through: +with one core each for RX and TX. RX and TX cores are shown here for +illustration, refer to Eventdev Adapter documentation for further details. +The diagram below shows the final state of the application after this +walk-through: .. _figure_eventdev-usage1: @@ -196,23 +199,29 @@ calling the setup function. Repeat this step for each queue, starting from .nb_atomic_flows = 1024, .nb_atomic_order_sequences = 1024, }; + struct rte_event_queue_conf single_link_conf = { + .event_queue_cfg = RTE_EVENT_QUEUE_CFG_SINGLE_LINK, + }; int dev_id = 0; - int queue_id = 0; - int err = rte_event_queue_setup(dev_id, queue_id, &atomic_conf); + int atomic_q_1 = 0; + int atomic_q_2 = 1; + int single_link_q = 2; + int err = rte_event_queue_setup(dev_id, atomic_q_1, &atomic_conf); + int err = rte_event_queue_setup(dev_id, atomic_q_2, &atomic_conf); + int err = rte_event_queue_setup(dev_id, single_link_q, &single_link_conf); -The remainder of this walk-through assumes that the queues are configured as -follows: +As shown above, queue IDs are as follows: * id 0, atomic queue #1 * id 1, atomic queue #2 * id 2, single-link queue +These queues are used for the remainder of this walk-through. + Setting up Ports ~~~~~~~~~~~~~~~~ -Once queues are set up successfully, create the ports as required. Each port -should be set up with its corresponding port_conf type, worker for worker cores, -rx and tx for the RX and TX cores: +Once queues are set up successfully, create the ports as required. .. code-block:: c @@ -232,15 +241,24 @@ rx and tx for the RX and TX cores: .new_event_threshold = 4096, }; int dev_id = 0; - int port_id = 0; - int err = rte_event_port_setup(dev_id, port_id, &CORE_FUNCTION_conf); + int rx_port_id = 0; + int err = rte_event_port_setup(dev_id, rx_port_id, &rx_conf); + + for(int worker_port_id = 1; worker_port_id <= 4; worker_port_id++) { + int err = rte_event_port_setup(dev_id, worker_port_id, &worker_conf); + } -It is now assumed that: + int tx_port_id = 5; + int err = rte_event_port_setup(dev_id, tx_port_id, &tx_conf); + +As shown above: * port 0: RX core * ports 1,2,3,4: Workers * port 5: TX core +These ports are used for the remainder of this walk-through. + Linking Queues and Ports ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -254,15 +272,14 @@ can be achieved like this: .. 
code-block:: c - uint8_t port_id = 0; + uint8_t rx_port_id = 0; + uint8_t tx_port_id = 5; uint8_t atomic_qs[] = {0, 1}; uint8_t single_link_q = 2; - uint8_t tx_port_id = 5; uin8t_t priority = RTE_EVENT_DEV_PRIORITY_NORMAL; - for(int i = 0; i < 4; i++) { - int worker_port = i + 1; - int links_made = rte_event_port_link(dev_id, worker_port, atomic_qs, NULL, 2); + for(int worker_port_id = 1; worker_port_id <= 4; worker_port_id++) { + int links_made = rte_event_port_link(dev_id, worker_port_id, atomic_qs, NULL, 2); } int links_made = rte_event_port_link(dev_id, tx_port_id, &single_link_q, &priority, 1); @@ -295,14 +312,14 @@ The following code shows how those packets can be enqueued into the eventdev: ev[i].flow_id = mbufs[i]->hash.rss; ev[i].op = RTE_EVENT_OP_NEW; ev[i].sched_type = RTE_SCHED_TYPE_ATOMIC; - ev[i].queue_id = 0; + ev[i].queue_id = atomic_q_1; ev[i].event_type = RTE_EVENT_TYPE_ETHDEV; ev[i].sub_event_type = 0; ev[i].priority = RTE_EVENT_DEV_PRIORITY_NORMAL; ev[i].mbuf = mbufs[i]; } - const int nb_tx = rte_event_enqueue_burst(dev_id, port_id, ev, nb_rx); + const int nb_tx = rte_event_enqueue_burst(dev_id, rx_port_id, ev, nb_rx); if (nb_tx != nb_rx) { for(i = nb_tx; i < nb_rx; i++) rte_pktmbuf_free(mbufs[i]); @@ -334,7 +351,7 @@ the event to the next stage in the pipeline. events[i].queue_id++; } - uint16_t nb_tx = rte_event_enqueue_burst(dev_id, port_id, events, nb_rx); + uint16_t nb_tx = rte_event_enqueue_burst(dev_id, worker_port_id, events, nb_rx); Egress of Events diff --git a/doc/guides/prog_guide/generic_segmentation_offload_lib.rst b/doc/guides/prog_guide/generic_segmentation_offload_lib.rst index 9959f0d2..0cfc1198 100644 --- a/doc/guides/prog_guide/generic_segmentation_offload_lib.rst +++ b/doc/guides/prog_guide/generic_segmentation_offload_lib.rst @@ -43,6 +43,7 @@ Limitations #. Currently, the GSO library supports the following IPv4 packet types: - TCP + - UDP - VxLAN - GRE @@ -146,6 +147,15 @@ TCP/IPv4 GSO TCP/IPv4 GSO supports segmentation of suitably large TCP/IPv4 packets, which may also contain an optional VLAN tag. +UDP/IPv4 GSO +~~~~~~~~~~~~ +UDP/IPv4 GSO supports segmentation of suitably large UDP/IPv4 packets, which +may also contain an optional VLAN tag. UDP GSO is the same as IP fragmentation. +Specifically, UDP GSO treats the UDP header as a part of the payload and +does not modify it during segmentation. Therefore, after UDP GSO, only the +first output packet has the original UDP header, and others just have l2 +and l3 headers. + VxLAN GSO ~~~~~~~~~ VxLAN packets GSO supports segmentation of suitably large VxLAN packets, diff --git a/doc/guides/prog_guide/glossary.rst b/doc/guides/prog_guide/glossary.rst index e101bc02..dda45bd1 100644 --- a/doc/guides/prog_guide/glossary.rst +++ b/doc/guides/prog_guide/glossary.rst @@ -41,9 +41,6 @@ CPU CRC Cyclic Redundancy Check -ctrlmbuf - An *mbuf* carrying control data. - Data Plane In contrast to the control plane, the data plane in a network architecture are the layers involved when forwarding packets. 
These layers must be diff --git a/doc/guides/prog_guide/hash_lib.rst b/doc/guides/prog_guide/hash_lib.rst index 180c776e..76a1f323 100644 --- a/doc/guides/prog_guide/hash_lib.rst +++ b/doc/guides/prog_guide/hash_lib.rst @@ -19,6 +19,8 @@ The main configuration parameters for the hash are: * Size of the key in bytes +* An extra flag used to describe additional settings, for example the multithreading mode of operation (as will be described later) + The hash also allows the configuration of some low-level implementation related parameters such as: * Hash function to translate the key into a bucket index @@ -49,8 +51,7 @@ Apart from these method explained above, the API allows the user three more opti Also, the API contains a method to allow the user to look up entries in bursts, achieving higher performance than looking up individual entries, as the function prefetches next entries at the time it is operating with the first ones, which reduces significantly the impact of the necessary memory accesses. -Notice that this method uses a pipeline of 8 entries (4 stages of 2 entries), so it is highly recommended -to use at least 8 entries per burst. + The actual data associated with each key can be either managed by the user using a separate table that mirrors the hash in terms of number of entries and position of each entry, @@ -63,11 +64,33 @@ However, this table could also be used for more sophisticated features and provi Multi-process support --------------------- -The hash library can be used in a multi-process environment, minding that only lookups are thread-safe. +The hash library can be used in a multi-process environment. The only function that can only be used in single-process mode is rte_hash_set_cmp_func(), which sets up a custom compare function, which is assigned to a function pointer (therefore, it is not supported in multi-process mode). + +Multi-thread support +--------------------- + +The hash library supports multithreading, and the user specifies the needed mode of operation at the creation time of the hash table +by appropriately setting the flag. In all modes of operation lookups are thread-safe meaning lookups can be called from multiple +threads concurrently. + +For concurrent writes, and concurrent reads and writes the following flag values define the corresponding modes of operation: + +* If the multi-writer flag (RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD) is set, multiple threads writing to the table is allowed. + Key add, delete, and table reset are protected from other writer threads. With only this flag set, readers are not protected from ongoing writes. + +* If the read/write concurrency (RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY) is set, multithread read/write operation is safe + (i.e., no need to stop the readers from accessing the hash table until writers finish their updates. Reads and writes can operate table concurrently). + +* In addition to these two flag values, if the transactional memory flag (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT) is also set, + hardware transactional memory will be used to guarantee the thread safety as long as it is supported by the hardware (for example the Intel® TSX support). + +If the platform supports Intel® TSX, it is advised to set the transactional memory flag, as this will speed up concurrent table operations. +Otherwise concurrent operations will be slower because of the overhead associated with the software locking mechanisms. 
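As an illustration of the flags described above, the following sketch creates a hash table that can be used concurrently by multiple writer and reader threads. The table name, size and the choice of ``rte_hash_crc`` as hash function are arbitrary example values.

.. code-block:: c

    #include <rte_hash.h>
    #include <rte_hash_crc.h>

    struct rte_hash_parameters params = {
        .name = "flow_table",                /* example name */
        .entries = 1024,                     /* example table size */
        .key_len = sizeof(uint32_t),         /* example key: 32-bit flow id */
        .hash_func = rte_hash_crc,
        .hash_func_init_val = 0,
        .socket_id = rte_socket_id(),
        /* Allow concurrent writers, and concurrent readers with writers. */
        .extra_flag = RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD |
                      RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY,
    };

    struct rte_hash *handle = rte_hash_create(&params);
    if (handle == NULL)
        rte_exit(EXIT_FAILURE, "Failed to create hash table\n");

Adding ``RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT`` to ``extra_flag`` would additionally let the table use hardware transactional memory on platforms that support it.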
+
 Implementation Details
 ----------------------

diff --git a/doc/guides/prog_guide/img/event_crypto_adapter_op_forward.svg b/doc/guides/prog_guide/img/event_crypto_adapter_op_forward.svg
new file mode 100644
index 00000000..54466f2e
--- /dev/null
+++ b/doc/guides/prog_guide/img/event_crypto_adapter_op_forward.svg
@@ -0,0 +1,1078 @@
[New SVG figure: "Working model of RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD mode" -- diagram showing the Eventdev, the Crypto Adapter, the Cryptodev and the application's ordered stage, annotated with steps 1-8 of the forward-mode flow (events dequeued by the application, enqueued as crypto operations to the eventdev, dequeued by the adapter, submitted to the cryptodev, completions enqueued back to the eventdev for the next stage). XML markup omitted; only extraction residue was removed.]

diff --git a/doc/guides/prog_guide/img/event_crypto_adapter_op_new.svg b/doc/guides/prog_guide/img/event_crypto_adapter_op_new.svg
new file mode 100644
index 00000000..256862ed
--- /dev/null
+++ b/doc/guides/prog_guide/img/event_crypto_adapter_op_new.svg
@@ -0,0 +1,1061 @@
[New SVG figure: "Working model of RTE_EVENT_CRYPTO_ADAPTER_OP_NEW mode" -- diagram showing the application submitting crypto operations directly to the cryptodev, with the crypto adapter dequeuing completions and enqueuing events to the eventdev (steps 1-6). XML markup omitted; only extraction residue was removed.]

diff --git a/doc/guides/prog_guide/img/eventdev_usage.svg b/doc/guides/prog_guide/img/eventdev_usage.svg
index 7765649b..c19818b9 100644
--- a/doc/guides/prog_guide/img/eventdev_usage.svg
+++ b/doc/guides/prog_guide/img/eventdev_usage.svg
@@ -1,994 +1,549 @@
[Reworked SVG figure: eventdev usage diagram referenced by the eventdev API walk-through. XML markup omitted; only extraction residue was removed.]